From fb0907ef7c07a53d2260286a549ca821038539fb Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 16 May 2026 19:40:18 +0800 Subject: [PATCH] feat: add module backend assignment support --- README.md | 2 + docs/backend.md | 122 ++++++++ examples/cli/main.cpp | 4 +- examples/common/common.cpp | 12 + examples/common/common.h | 18 +- include/stable-diffusion.h | 6 +- src/anima.hpp | 4 +- src/auto_encoder_kl.hpp | 4 +- src/clip.hpp | 4 +- src/conditioner.hpp | 40 +-- src/control.hpp | 4 +- src/diffusion_model.hpp | 36 +-- src/ernie_image.hpp | 4 +- src/esrgan.hpp | 4 +- src/flux.hpp | 6 +- src/ggml_extend.hpp | 94 +----- src/ggml_extend_backend.cpp | 600 ++++++++++++++++++++++++++++++++++++ src/ggml_extend_backend.h | 77 +++++ src/ggml_extend_backend.hpp | 298 ------------------ src/hidream_o1.hpp | 12 +- src/llm.hpp | 10 +- src/lora.hpp | 3 +- src/mmdit.hpp | 6 +- src/model.cpp | 2 +- src/pmid.hpp | 8 +- src/qwen_image.hpp | 6 +- src/stable-diffusion.cpp | 416 ++++++++++++++----------- src/t5.hpp | 10 +- src/tae.hpp | 8 +- src/unet.hpp | 4 +- src/upscaler.cpp | 60 +++- src/upscaler.h | 11 +- src/util.cpp | 72 +---- src/util.h | 1 - src/vae.hpp | 8 +- src/wan.hpp | 12 +- src/z_image.hpp | 6 +- 37 files changed, 1234 insertions(+), 760 deletions(-) create mode 100644 docs/backend.md create mode 100644 src/ggml_extend_backend.cpp create mode 100644 src/ggml_extend_backend.h delete mode 100644 src/ggml_extend_backend.hpp diff --git a/README.md b/README.md index 33c272e9e..d889f28fd 100644 --- a/README.md +++ b/README.md @@ -133,9 +133,11 @@ API and command-line option may change frequently.*** ## Performance If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md). +For runtime and parameter backend placement, see the [backend selection guide](./docs/backend.md). 
## More Guides +- [Backend selection](./docs/backend.md) - [SD1.x/SD2.x/SDXL](./docs/sd.md) - [SD3/SD3.5](./docs/sd3.md) - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md) diff --git a/docs/backend.md b/docs/backend.md new file mode 100644 index 000000000..53088b0e7 --- /dev/null +++ b/docs/backend.md @@ -0,0 +1,122 @@ +# Backend selection + +`stable-diffusion.cpp` has two backend assignments: + +- `--backend` selects the runtime backend used to execute model graphs. +- `--params-backend` selects the backend used to allocate model parameters. + +If `--params-backend` is not set, parameters use the same backend as their module runtime backend. + +## Syntax + +A backend assignment can be a single backend name: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend cpu +``` + +This applies to every module that does not have a more specific assignment. + +Assignments can also target individual modules: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend te=cpu,vae=cuda0,diffusion=vulkan0 +``` + +The same syntax is used for parameter placement: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend te=cpu,vae=cpu +``` + +Module names are case-insensitive. Hyphens and underscores in module names are ignored, so `clip_vision`, `clip-vision`, and `clipvision` are equivalent. 
+ +`all=`, `default=`, and `*=` can be used to set the default backend inside a mixed assignment: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend all=cuda0,te=cpu +``` + +## Modules + +| Module | Purpose | Accepted names | +| --- | --- | --- | +| `diffusion` | UNet, DiT, MMDiT, Flux, Wan, Qwen Image, and other diffusion models | `diffusion`, `model`, `unet`, `dit` | +| `te` | Text encoders and conditioners | `te`, `clip`, `text`, `textencoder`, `textencoders`, `conditioner`, `cond`, `llm`, `t5`, `t5xxl` | +| `clip_vision` | CLIP vision encoder | `clip_vision`, `clipvision`, `clip-vision`, `vision` | +| `vae` | VAE and TAE | `vae`, `firststage`, `autoencoder`, `tae` | +| `controlnet` | ControlNet | `controlnet`, `control` | +| `photomaker` | PhotoMaker ID encoder and PhotoMaker LoRA | `photomaker`, `photomakerid`, `pmid`, `photo` | +| `upscaler` | ESRGAN upscaler | `upscaler`, `esrgan`, `hires` | + +`te` is the preferred module name for text encoders. `clip` is kept as an accepted alias because many existing commands and model names use CLIP terminology. + +## Backend names + +Backend names are resolved against the GGML backend device list. Matching is case-insensitive and accepts exact names or unique prefixes, so common values include names such as: + +- `cpu` +- `cuda0` +- `vulkan0` +- `metal` + +The special values `auto`, `default`, and an empty backend name select the default backend. The default preference is GPU, then integrated GPU, then CPU. + +The special value `gpu` selects the first GPU backend, falling back to the first integrated GPU backend. + +## Runtime backend vs. parameter backend + +The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated. + +For example: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend cpu +``` + +This runs all modules on `cuda0`, but stores parameters in CPU RAM. 
During execution, parameters are moved to the runtime backend as needed. + +Per-module assignments can be mixed: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend diffusion=cuda0,te=cpu,vae=cpu --params-backend diffusion=cuda0,te=cpu,vae=cpu +``` + +This keeps text encoding and VAE execution on CPU while the diffusion model runs on GPU. + +## Backend sharing and lifetime + +Backends are managed by `SDBackendManager`. + +Within one manager, backend instances are cached by resolved backend device name. If multiple modules request the same backend, they share the same `ggml_backend_t`. + +For example: + +```shell +--backend te=cpu,vae=cpu +``` + +uses one shared CPU backend for both `te` and `vae` runtime execution. + +Runtime and parameter assignments also share the same backend cache. If `--backend diffusion=cuda0` and `--params-backend diffusion=cuda0` resolve to the same device, both use the same backend instance. + +`SDBackendManager` owns the backend instances and frees them when the context or upscaler is destroyed. Model runners receive non-owning runtime and parameter backend pointers and do not free them. + +## Compatibility flags + +The older CPU placement flags are still supported: + +- `--clip-on-cpu` +- `--vae-on-cpu` +- `--control-net-cpu` +- `--offload-to-cpu` + +`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` affect runtime backend assignment only when `--backend` is not set. They map to `te=cpu`, `vae=cpu`, and `controlnet=cpu`. + +`--offload-to-cpu` affects parameter backend assignment only when `--params-backend` is not set. It is equivalent to: + +```shell +--params-backend cpu +``` + +Explicit `--backend` and `--params-backend` assignments are preferred for new commands. 
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 27513f475..bfe044fd3 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -749,7 +749,9 @@ int main(int argc, const char* argv[]) { ctx_params.offload_params_to_cpu, ctx_params.diffusion_conv_direct, ctx_params.n_threads, - gen_params.upscale_tile_size)); + gen_params.upscale_tile_size, + ctx_params.backend.c_str(), + ctx_params.params_backend.c_str())); if (upscaler_ctx == nullptr) { LOG_ERROR("new_upscaler_ctx failed"); diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 981834761..de7991ac5 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -380,6 +380,14 @@ ArgOptions SDContextParams::get_options() { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--backend", + "runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0", + &backend}, + {"", + "--params-backend", + "parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu", + &params_backend}, }; options.int_options = { @@ -676,6 +684,8 @@ std::string SDContextParams::to_string() const { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " max_vram: " << max_vram << ",\n" + << " backend: \"" << backend << "\",\n" + << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" << " clip_on_cpu: " << (clip_on_cpu ?
"true" : "false") << ",\n" @@ -751,6 +761,8 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f chroma_t5_mask_pad, qwen_image_zero_cond_t, max_vram, + backend.c_str(), + params_backend.c_str(), }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index badaa875d..d7ed9594e 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -110,14 +110,16 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; float max_vram = 0.f; - bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool flash_attn = false; - bool diffusion_flash_attn = false; - bool diffusion_conv_direct = false; - bool vae_conv_direct = false; + std::string backend; + std::string params_backend; + bool enable_mmap = false; + bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + bool flash_attn = false; + bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; bool circular = false; bool circular_x = false; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index d1f3dc608..80913bd10 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -206,6 +206,8 @@ typedef struct { int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB) + const char* backend; + const char* params_backend; } sd_ctx_params_t; typedef struct { @@ -427,7 +429,9 @@ SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, bool offload_params_to_cpu, bool direct, int n_threads, - int tile_size); + int tile_size, + const char* backend, + const char* params_backend); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, diff --git 
a/src/anima.hpp b/src/anima.hpp index 4bfc04749..486aec3ad 100644 --- a/src/anima.hpp +++ b/src/anima.hpp @@ -526,10 +526,10 @@ namespace Anima { AnimaNet net; AnimaRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model") - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { int64_t num_layers = 0; std::string layer_tag = prefix + ".net.blocks."; for (const auto& kv : tensor_storage_map) { diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp index 4fb28a16f..489f8fd30 100644 --- a/src/auto_encoder_kl.hpp +++ b/src/auto_encoder_kl.hpp @@ -664,13 +664,13 @@ struct AutoEncoderKL : public VAE { AutoEncoderKLModel ae; AutoEncoderKL(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool decode_only = false, bool use_video_decoder = false, SDVersion version = VERSION_SD1) - : decode_only(decode_only), VAE(version, backend, offload_params_to_cpu) { + : decode_only(decode_only), VAE(version, backend, params_backend) { if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { scale_factor = 0.18215f; shift_factor = 0.f; diff --git a/src/clip.hpp b/src/clip.hpp index 8b2084c49..a3567324c 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -469,13 +469,13 @@ struct CLIPTextModelRunner : public GGMLRunner { std::vector attention_mask_vec; CLIPTextModelRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool with_final_ln = true, bool force_clip_f32 = false) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { bool proj_in = false; for (const auto& [name, tensor_storage] : 
tensor_storage_map) { if (!starts_with(name, prefix)) { diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 5050eeffe..8e631e477 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -134,7 +134,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::map> embedding_pos_map; FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::map& orig_embedding_map, SDVersion version = VERSION_SD1, @@ -148,12 +148,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } bool force_clip_f32 = !embedding_map.empty(); if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); + text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); + text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); - text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); + text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); + text_model2 
= std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); } } @@ -670,9 +670,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { CLIPVisionModelProjection vision_model; FrozenCLIPVisionEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { std::string prefix = "cond_stage_model.transformer"; bool proj_in = false; for (const auto& [name, tensor_storage] : tensor_storage_map) { @@ -729,7 +729,7 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr t5; SD3CLIPEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) : clip_g_tokenizer(0) { bool use_clip_l = false; @@ -749,13 +749,13 @@ struct SD3CLIPEmbedder : public Conditioner { return; } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + clip_l = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); } if (use_clip_g) { - clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + clip_g = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); } if (use_t5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); + t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer"); } } @@ -1097,7 +1097,7 @@ struct FluxCLIPEmbedder : public Conditioner { 
size_t chunk_len = 256; FluxCLIPEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) { bool use_clip_l = false; bool use_t5 = false; @@ -1115,12 +1115,12 @@ struct FluxCLIPEmbedder : public Conditioner { } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + clip_l = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); } else { LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded."); } if (use_t5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); + t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer"); } else { LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded."); } @@ -1351,7 +1351,7 @@ struct T5CLIPEmbedder : public Conditioner { bool is_umt5 = false; T5CLIPEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, bool use_mask = false, int mask_pad = 0, @@ -1368,7 +1368,7 @@ struct T5CLIPEmbedder : public Conditioner { LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!"); return; } else { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5); + t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5); } } @@ -1553,12 +1553,12 @@ struct AnimaConditioner : public Conditioner { std::shared_ptr llm; AnimaConditioner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) { qwen_tokenizer = 
std::make_shared(); llm = std::make_shared(LLM::LLMArch::QWEN3, backend, - offload_params_to_cpu, + params_backend, tensor_storage_map, "text_encoders.llm", false); @@ -1671,7 +1671,7 @@ struct LLMEmbedder : public Conditioner { std::shared_ptr llm; LLMEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, SDVersion version = VERSION_QWEN_IMAGE, const std::string prefix = "", @@ -1692,7 +1692,7 @@ struct LLMEmbedder : public Conditioner { } llm = std::make_shared(arch, backend, - offload_params_to_cpu, + params_backend, tensor_storage_map, "text_encoders.llm", enable_vision); diff --git a/src/control.hpp b/src/control.hpp index d227ec948..fd1f6d869 100644 --- a/src/control.hpp +++ b/src/control.hpp @@ -319,10 +319,10 @@ struct ControlNet : public GGMLRunner { bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, SDVersion version = VERSION_SD1) - : GGMLRunner(backend, offload_params_to_cpu), control_net(version) { + : GGMLRunner(backend, params_backend), control_net(version) { control_net.init(params_ctx, tensor_storage_map, ""); } diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 26021ef24..70fc81da9 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -64,10 +64,10 @@ struct UNetModel : public DiffusionModel { UNetModelRunner unet; UNetModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, SDVersion version = VERSION_SD1) - : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) { + : unet(backend, params_backend, tensor_storage_map, "model.diffusion_model", version) { } std::string get_desc() override { @@ -135,9 +135,9 @@ struct MMDiTModel : public DiffusionModel { MMDiTRunner 
mmdit; MMDiTModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) - : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") { + : mmdit(backend, params_backend, tensor_storage_map, "model.diffusion_model") { } std::string get_desc() override { @@ -202,11 +202,11 @@ struct FluxModel : public DiffusionModel { Flux::FluxRunner flux; FluxModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, SDVersion version = VERSION_FLUX, bool use_mask = false) - : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) { + : flux(backend, params_backend, tensor_storage_map, "model.diffusion_model", version, use_mask) { } std::string get_desc() override { @@ -277,10 +277,10 @@ struct AnimaModel : public DiffusionModel { Anima::AnimaRunner anima; AnimaModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model") - : prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + : prefix(prefix), anima(backend, params_backend, tensor_storage_map, prefix) { } std::string get_desc() override { @@ -345,11 +345,11 @@ struct WanModel : public DiffusionModel { WAN::WanRunner wan; WanModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model", SDVersion version = VERSION_WAN2) - : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) { + : prefix(prefix), wan(backend, params_backend, tensor_storage_map, prefix, version) { } std::string get_desc() override { @@ -417,12 +417,12 @@ struct QwenImageModel : public 
DiffusionModel { Qwen::QwenImageRunner qwen_image; QwenImageModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model", SDVersion version = VERSION_QWEN_IMAGE, bool zero_cond_t = false) - : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version, zero_cond_t) { + : prefix(prefix), qwen_image(backend, params_backend, tensor_storage_map, prefix, version, zero_cond_t) { } std::string get_desc() override { @@ -488,10 +488,10 @@ struct HiDreamO1Model : public DiffusionModel { HiDreamO1::HiDreamO1Runner hidream_o1; HiDreamO1Model(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& prefix = "model") - : prefix(prefix), hidream_o1(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + : prefix(prefix), hidream_o1(backend, params_backend, tensor_storage_map, prefix) { } std::string get_desc() override { @@ -564,11 +564,11 @@ struct ZImageModel : public DiffusionModel { ZImage::ZImageRunner z_image; ZImageModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model", SDVersion version = VERSION_Z_IMAGE) - : prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) { + : prefix(prefix), z_image(backend, params_backend, tensor_storage_map, prefix, version) { } std::string get_desc() override { @@ -634,10 +634,10 @@ struct ErnieImageModel : public DiffusionModel { ErnieImage::ErnieImageRunner ernie_image; ErnieImageModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model") - : 
prefix(prefix), ernie_image(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + : prefix(prefix), ernie_image(backend, params_backend, tensor_storage_map, prefix) { } std::string get_desc() override { diff --git a/src/ernie_image.hpp b/src/ernie_image.hpp index 931794f1a..bea84cdfb 100644 --- a/src/ernie_image.hpp +++ b/src/ernie_image.hpp @@ -331,10 +331,10 @@ namespace ErnieImage { std::vector pe_vec; ErnieImageRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { ernie_params.num_layers = 0; for (const auto& [name, tensor_storage] : tensor_storage_map) { if (!starts_with(name, prefix)) { diff --git a/src/esrgan.hpp b/src/esrgan.hpp index f84b77a29..f54baca3c 100644 --- a/src/esrgan.hpp +++ b/src/esrgan.hpp @@ -161,10 +161,10 @@ struct ESRGAN : public GGMLRunner { int tile_size = 128; // avoid cuda OOM for 4gb VRAM ESRGAN(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, int tile_size = 128, const String2TensorStorage& tensor_storage_map = {}) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { this->tile_size = tile_size; } diff --git a/src/flux.hpp b/src/flux.hpp index 732a37197..2aac3be0c 100644 --- a/src/flux.hpp +++ b/src/flux.hpp @@ -1189,12 +1189,12 @@ namespace Flux { bool use_mask = false; FluxRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", SDVersion version = VERSION_FLUX, bool use_mask = false) - : GGMLRunner(backend, offload_params_to_cpu), version(version), use_mask(use_mask) { + : GGMLRunner(backend, params_backend), version(version), use_mask(use_mask) { flux_params.version = version; flux_params.guidance_embed = false; 
flux_params.depth = 0; @@ -1564,7 +1564,7 @@ namespace Flux { } std::shared_ptr flux = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model", VERSION_FLUX2, diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index d2e655cef..8e2ed6940 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -26,7 +26,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" -#include "ggml_extend_backend.hpp" +#include "ggml_extend_backend.h" #include "ggml_graph_cut.h" #include "model.h" @@ -73,48 +73,6 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha } } -__STATIC_INLINE__ bool backend_name_exists(std::string name) { - ggml_backend_load_all_once(); - const size_t device_count = ggml_backend_dev_count(); - for (size_t i = 0; i < device_count; ++i) { - if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) { - return true; - } - } - return false; -} - -__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) { - if (name == "" || backend_name_exists(name)) { - return name; - } else { - LOG_WARN("Backend %s not found, using default backend", name.c_str()); - return ""; - } -} - -__STATIC_INLINE__ std::string get_default_backend_name() { - ggml_backend_load_all_once(); - // should pick the same backend as ggml_backend_init_best - ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); - dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); - dev = dev ? 
dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (dev == nullptr) { - return ""; - } - return ggml_backend_dev_name(dev); -} - -__STATIC_INLINE__ ggml_backend_t init_named_backend(std::string name = "") { - ggml_backend_load_all_once(); - LOG_DEBUG("Initializing backend: %s", name.c_str()); - if (name.empty()) { - return ggml_backend_init_best(); - } else { - return ggml_backend_init_by_name(name.c_str(), nullptr); - } -} - static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128"); // n-mode tensor-matrix product @@ -190,7 +148,7 @@ __STATIC_INLINE__ void ggml_ext_im_set_randn_f32(ggml_tensor* tensor, std::share uint32_t n = (uint32_t)ggml_nelements(tensor); std::vector random_numbers = rng->randn(n); for (uint32_t i = 0; i < n; i++) { - ggml_set_f32_1d(tensor, i, random_numbers[i]); + ggml_ext_im_set_f32_1d(tensor, i, random_numbers[i]); } } @@ -422,39 +380,6 @@ __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const st // file.close(); // } -__STATIC_INLINE__ void copy_ggml_tensor(ggml_tensor* dst, ggml_tensor* src) { - if (dst->type == src->type) { - dst->nb[0] = src->nb[0]; - dst->nb[1] = src->nb[1]; - dst->nb[2] = src->nb[2]; - dst->nb[3] = src->nb[3]; - - memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst)); - return; - } - ggml_init_params params; - params.mem_size = 10 * 1024 * 1024; // for padding - params.mem_buffer = nullptr; - params.no_alloc = false; - ggml_context* ctx = ggml_init(params); - if (!ctx) { - LOG_ERROR("ggml_init() failed"); - return; - } - ggml_tensor* final = ggml_cpy(ctx, src, dst); - - ggml_cgraph* graph = ggml_new_graph(ctx); - ggml_build_forward_expand(graph, final); - ggml_graph_compute_with_ctx(ctx, graph, 1); - ggml_free(ctx); -} - -__STATIC_INLINE__ ggml_tensor* ggml_ext_dup_and_cpy_tensor(ggml_context* ctx, ggml_tensor* src) { - ggml_tensor* dup = ggml_dup_tensor(ctx, src); - copy_ggml_tensor(dup, src); - return dup; -} - __STATIC_INLINE__ float sigmoid(float 
x) { return 1 / (1.0f + expf(-x)); } @@ -2669,13 +2594,11 @@ struct GGMLRunner { public: virtual std::string get_desc() = 0; - GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false) - : runtime_backend(backend) { - if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) { - params_backend = ggml_backend_cpu_init(); - } else { - params_backend = runtime_backend; - } + GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend) + : params_backend(params_backend), + runtime_backend(backend) { + GGML_ASSERT(runtime_backend != nullptr); + GGML_ASSERT(params_backend != nullptr); alloc_params_ctx(); } @@ -2684,9 +2607,6 @@ struct GGMLRunner { free_compute_buffer(); free_params_ctx(); free_compute_ctx(); - if (params_backend != runtime_backend) { - ggml_backend_free(params_backend); - } free_cache_ctx_and_buffer(); } diff --git a/src/ggml_extend_backend.cpp b/src/ggml_extend_backend.cpp new file mode 100644 index 000000000..4bf8268e5 --- /dev/null +++ b/src/ggml_extend_backend.cpp @@ -0,0 +1,600 @@ +#include "ggml_extend_backend.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" + +static std::string trim_copy(const std::string& value) { + size_t begin = 0; + while (begin < value.size() && std::isspace(static_cast(value[begin]))) { + ++begin; + } + size_t end = value.size(); + while (end > begin && std::isspace(static_cast(value[end - 1]))) { + --end; + } + return value.substr(begin, end - begin); +} + +static std::string lower_copy(std::string value) { + std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + return value; +} + +static std::vector split_copy(const std::string& value, char delimiter) { + std::vector parts; + std::string part; + std::istringstream stream(value); + while (std::getline(stream, part, delimiter)) { + parts.push_back(part); + } + return parts; +} + +static bool is_default_backend_token(const 
std::string& name) { + const std::string lower = lower_copy(trim_copy(name)); + return lower.empty() || lower == "default" || lower == "auto"; +} + +static bool parse_backend_module(const std::string& raw_name, SDBackendModule* module) { + std::string name = lower_copy(trim_copy(raw_name)); + name.erase(std::remove(name.begin(), name.end(), '-'), name.end()); + name.erase(std::remove(name.begin(), name.end(), '_'), name.end()); + + if (name == "diffusion" || name == "model" || name == "unet" || name == "dit") { + *module = SDBackendModule::DIFFUSION; + return true; + } + if (name == "te" || name == "clip" || name == "text" || name == "textencoder" || name == "textencoders" || name == "conditioner" || name == "cond" || name == "llm" || name == "t5" || name == "t5xxl") { + *module = SDBackendModule::TE; + return true; + } + if (name == "clipvision" || name == "vision") { + *module = SDBackendModule::CLIP_VISION; + return true; + } + if (name == "vae" || name == "firststage" || name == "autoencoder" || name == "tae") { + *module = SDBackendModule::VAE; + return true; + } + if (name == "controlnet" || name == "control") { + *module = SDBackendModule::CONTROL_NET; + return true; + } + if (name == "photomaker" || name == "photomakerid" || name == "pmid" || name == "photo") { + *module = SDBackendModule::PHOTOMAKER; + return true; + } + if (name == "upscaler" || name == "esrgan" || name == "hires") { + *module = SDBackendModule::UPSCALER; + return true; + } + return false; +} + +static std::string module_assignment_name(const SDBackendAssignment& assignment, SDBackendModule module) { + auto it = assignment.module_names.find(module); + if (it != assignment.module_names.end()) { + return it->second; + } + return assignment.default_name; +} + +static std::string backend_cache_key(ggml_backend_t backend) { + if (backend == nullptr) { + return ""; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (dev != nullptr) { + return 
lower_copy(ggml_backend_dev_name(dev)); + } + const char* backend_name = ggml_backend_name(backend); + return backend_name != nullptr ? lower_copy(backend_name) : ""; +} + +static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type) { + ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); + if (dev == nullptr) { + return ""; + } + return ggml_backend_dev_name(dev); +} + +static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { + if (tensor == nullptr) { + return nullptr; + } + + return tensor->view_src ? tensor->view_src->buffer : tensor->buffer; +} + +static bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) { + if (tensor == nullptr || tensor->data == nullptr) { + return false; + } + + ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor); + return buffer == nullptr || ggml_backend_buffer_is_host(buffer); +} + +static size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + return static_cast(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]); +} + +template +static void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) { + const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3); + + if (ggml_backend_tensor_is_host_accessible(tensor)) { + auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset); + *dst = value; + return; + } + + ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T)); +} + +static void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) { + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; 
+ case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = {0, 0, 0, 0}; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +static void ggml_backend_load_all_once() { + // If the registry already has devices and the CPU backend is present, + // assume either static registration or explicit host-side preloading has + // completed and avoid rescanning the default paths. 
+ if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { + return; + } + // In dynamic-backend mode the backend modules are discovered at runtime, + // so we must load them before asking for the CPU backend or its proc table. + // If the host preloaded only a subset of backends, allow one default-path + // scan so missing modules can still be discovered. + static std::once_flag once; + std::call_once(once, []() { + if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { + return; + } + ggml_backend_load_all(); + }); +} + +bool sd_backend_is(ggml_backend_t backend, const std::string& name) { + if (!backend) { + return false; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (!dev) { + return false; + } + std::string dev_name = ggml_backend_dev_name(dev); + return lower_copy(dev_name).find(lower_copy(name)) != std::string::npos; +} + +static std::string get_default_backend_name() { + ggml_backend_load_all_once(); + // should pick the same backend preference as ggml_backend_init_best + std::string name = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + if (!name.empty()) { + return name; + } + name = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + if (!name.empty()) { + return name; + } + return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); +} + +static std::string sd_resolve_backend_name(const std::string& name) { + ggml_backend_load_all_once(); + std::string requested = trim_copy(name); + std::string lower = lower_copy(requested); + + if (is_default_backend_token(lower)) { + return get_default_backend_name(); + } + if (lower == "gpu") { + std::string result = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + if (!result.empty()) { + return result; + } + return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + } + + const size_t device_count = ggml_backend_dev_count(); + for (size_t i = 0; i < device_count; ++i) { + 
ggml_backend_dev_t dev = ggml_backend_dev_get(i); + std::string dev_name = ggml_backend_dev_name(dev); + if (lower_copy(dev_name) == lower) { + return dev_name; + } + } + + for (size_t i = 0; i < device_count; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + std::string dev_name = ggml_backend_dev_name(dev); + std::string dev_lower = lower_copy(dev_name); + if (dev_lower.rfind(lower, 0) == 0) { + return dev_name; + } + } + + return ""; +} + +static bool backend_name_exists(const std::string& name) { + return !sd_resolve_backend_name(name).empty(); +} + +static ggml_backend_t init_named_backend(const std::string& name) { + ggml_backend_load_all_once(); + LOG_DEBUG("Initializing backend: %s", name.c_str()); + if (trim_copy(name).empty()) { + return ggml_backend_init_best(); + } + + std::string resolved = sd_resolve_backend_name(name); + if (resolved.empty()) { + return nullptr; + } + return ggml_backend_init_by_name(resolved.c_str(), nullptr); +} + +static ggml_backend_t sd_get_default_backend() { + ggml_backend_load_all_once(); + static std::once_flag once; + std::call_once(once, []() { + size_t dev_count = ggml_backend_dev_count(); + if (dev_count == 0) { + LOG_ERROR("No devices found!"); + } else { + LOG_DEBUG("Found %zu backend devices:", dev_count); + for (size_t i = 0; i < dev_count; ++i) { + auto dev = ggml_backend_dev_get(i); + LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev)); + } + } + }); + + ggml_backend_t backend = nullptr; + const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); + if (SD_VK_DEVICE != nullptr) { + std::string sd_vk_device_str = SD_VK_DEVICE; + try { + unsigned long long device = std::stoull(sd_vk_device_str); + std::string vk_device_name = "Vulkan" + std::to_string(device); + if (backend_name_exists(vk_device_name)) { + LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str()); + backend = init_named_backend(vk_device_name); + if (!backend) { + LOG_WARN("Device %s requested by SD_VK_DEVICE failed 
to init. Falling back to the default device.", vk_device_name.c_str()); + } + } else { + LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str()); + } + } catch (const std::invalid_argument&) { + LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE); + } catch (const std::out_of_range&) { + LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE); + } + } + + if (!backend) { + std::string dev_name = get_default_backend_name(); + backend = init_named_backend(dev_name); + if (!backend && !dev_name.empty()) { + LOG_WARN("device %s failed to init", dev_name.c_str()); + } + } + + if (!backend) { + LOG_WARN("loading CPU backend"); + backend = ggml_backend_cpu_init(); + } + + if (ggml_backend_is_cpu(backend)) { + LOG_DEBUG("Using CPU backend"); + } + + return backend; +} + +static bool sd_parse_backend_assignment(const std::string& spec, SDBackendAssignment* assignment, std::string* error) { + if (assignment == nullptr) { + return false; + } + + *assignment = {}; + const std::string in = trim_copy(spec); + if (in.empty()) { + return true; + } + + for (const std::string& raw_part : split_copy(in, ',')) { + const std::string part = trim_copy(raw_part); + if (part.empty()) { + continue; + } + + const size_t eq = part.find('='); + if (eq == std::string::npos) { + assignment->set_default(part); + continue; + } + + const std::string key = trim_copy(part.substr(0, eq)); + const std::string value = trim_copy(part.substr(eq + 1)); + if (key.empty() || value.empty()) { + if (error != nullptr) { + *error = "invalid backend assignment '" + part + "'"; + } + return false; + } + + const std::string key_lower = lower_copy(key); + if (key_lower == "all" || key_lower == "default" || key_lower == "*") { + assignment->set_default(value); + continue; + } + + 
SDBackendModule module = SDBackendModule::DIFFUSION; + if (!parse_backend_module(key, &module)) { + if (error != nullptr) { + *error = "unknown backend module '" + key + "'"; + } + return false; + } + assignment->set_module(module, value); + } + return true; +} + +bool SDBackendAssignment::empty() const { + return default_name.empty() && module_names.empty(); +} + +std::string SDBackendAssignment::get(SDBackendModule module) const { + return module_assignment_name(*this, module); +} + +void SDBackendAssignment::set_default(const std::string& name) { + default_name = trim_copy(name); +} + +void SDBackendAssignment::set_module(SDBackendModule module, const std::string& name) { + module_names[module] = trim_copy(name); +} + +void SDBackendHandleDeleter::operator()(ggml_backend_t backend) const { + ggml_backend_free(backend); +} + +SDBackendManager::~SDBackendManager() { + reset(); +} + +void SDBackendManager::reset() { + backends_.clear(); + runtime_assignment_ = {}; + params_assignment_ = {}; +} + +ggml_backend_t SDBackendManager::runtime_backend(SDBackendModule module) { + return init_cached_backend(runtime_assignment_.get(module)); +} + +ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) { + std::string name = params_assignment_.get(module); + if (name.empty()) { + return runtime_backend(module); + } + return init_cached_backend(name); +} + +bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) { + return ggml_backend_is_cpu(runtime_backend(module)); +} + +bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) { + return ggml_backend_is_cpu(params_backend(module)); +} + +bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) { + ggml_backend_t backend = runtime_backend(module); + if (backend == nullptr) { + return false; + } + if (ggml_backend_is_cpu(backend)) { + return true; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (dev == nullptr) { + return false; + } + 
ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + return props.caps.buffer_from_host_ptr; +} + +bool SDBackendManager::init(const char* backend_spec, + const char* params_backend_spec, + bool offload_params_to_cpu, + bool keep_clip_on_cpu, + bool keep_vae_on_cpu, + bool keep_control_net_on_cpu, + std::string* error) { + reset(); + + if (!sd_parse_backend_assignment(SAFE_STR(backend_spec), &runtime_assignment_, error)) { + return false; + } + if (!sd_parse_backend_assignment(SAFE_STR(params_backend_spec), &params_assignment_, error)) { + return false; + } + + if (runtime_assignment_.empty()) { + if (keep_clip_on_cpu) { + runtime_assignment_.set_module(SDBackendModule::TE, "cpu"); + } + if (keep_vae_on_cpu) { + runtime_assignment_.set_module(SDBackendModule::VAE, "cpu"); + } + if (keep_control_net_on_cpu) { + runtime_assignment_.set_module(SDBackendModule::CONTROL_NET, "cpu"); + } + } + + if (params_assignment_.empty() && offload_params_to_cpu) { + params_assignment_.set_default("cpu"); + } + + return validate(error); +} + +bool SDBackendManager::validate(std::string* error) const { + auto validate_name = [&](const std::string& name) -> bool { + if (is_default_backend_token(name)) { + return true; + } + if (!sd_resolve_backend_name(name).empty()) { + return true; + } + if (error != nullptr) { + *error = "backend '" + name + "' was not found"; + } + return false; + }; + + if (!validate_name(runtime_assignment_.default_name) || + !validate_name(params_assignment_.default_name)) { + return false; + } + for (const auto& kv : runtime_assignment_.module_names) { + if (!validate_name(kv.second)) { + return false; + } + } + for (const auto& kv : params_assignment_.module_names) { + if (!validate_name(kv.second)) { + return false; + } + } + return true; +} + +ggml_backend_t SDBackendManager::init_cached_backend(const std::string& name) { + std::string resolved = sd_resolve_backend_name(name); + std::string key = lower_copy(resolved); + ggml_backend_t 
backend = nullptr; + + if (!key.empty()) { + auto it = backends_.find(key); + if (it != backends_.end()) { + return it->second.get(); + } + } else if (!is_default_backend_token(name)) { + LOG_ERROR("backend '%s' was not found", name.c_str()); + return nullptr; + } + + backend = is_default_backend_token(name) ? sd_get_default_backend() : init_named_backend(resolved); + if (backend == nullptr) { + LOG_ERROR("failed to initialize backend '%s'", name.c_str()); + return nullptr; + } + + std::string actual_key = backend_cache_key(backend); + if (actual_key.empty()) { + actual_key = !key.empty() ? key : lower_copy(trim_copy(name)); + } + + auto it = backends_.find(actual_key); + if (it != backends_.end()) { + ggml_backend_free(backend); + return it->second.get(); + } + + SDBackendHandle handle(backend); + backends_.emplace(actual_key, std::move(handle)); + return backend; +} + +const char* sd_backend_module_name(SDBackendModule module) { + switch (module) { + case SDBackendModule::DIFFUSION: + return "diffusion"; + case SDBackendModule::TE: + return "te"; + case SDBackendModule::CLIP_VISION: + return "clip_vision"; + case SDBackendModule::VAE: + return "vae"; + case SDBackendModule::CONTROL_NET: + return "controlnet"; + case SDBackendModule::PHOTOMAKER: + return "photomaker"; + case SDBackendModule::UPSCALER: + return "upscaler"; + } + return "unknown"; +} diff --git a/src/ggml_extend_backend.h b/src/ggml_extend_backend.h new file mode 100644 index 000000000..b98efc10d --- /dev/null +++ b/src/ggml_extend_backend.h @@ -0,0 +1,77 @@ +#ifndef __SD_GGML_EXTEND_BACKEND_H__ +#define __SD_GGML_EXTEND_BACKEND_H__ + +#include +#include +#include +#include +#include + +#include "ggml-backend.h" +#include "ggml-cpu.h" +#include "ggml.h" + +enum class SDBackendModule { + DIFFUSION, + TE, + CLIP_VISION, + VAE, + CONTROL_NET, + PHOTOMAKER, + UPSCALER, +}; + +struct SDBackendAssignment { + std::string default_name; + std::unordered_map module_names; + + bool empty() const; + std::string 
get(SDBackendModule module) const; + void set_default(const std::string& name); + void set_module(SDBackendModule module, const std::string& name); +}; + +struct SDBackendHandleDeleter { + void operator()(ggml_backend_t backend) const; +}; + +using SDBackendHandle = std::unique_ptr; + +class SDBackendManager { +private: + SDBackendAssignment runtime_assignment_; + SDBackendAssignment params_assignment_; + std::unordered_map backends_; + +public: + SDBackendManager() = default; + ~SDBackendManager(); + + SDBackendManager(const SDBackendManager&) = delete; + SDBackendManager& operator=(const SDBackendManager&) = delete; + + bool init(const char* backend_spec, + const char* params_backend_spec, + bool offload_params_to_cpu, + bool keep_clip_on_cpu, + bool keep_vae_on_cpu, + bool keep_control_net_on_cpu, + std::string* error); + void reset(); + + ggml_backend_t runtime_backend(SDBackendModule module); + ggml_backend_t params_backend(SDBackendModule module); + + bool runtime_backend_is_cpu(SDBackendModule module); + bool params_backend_is_cpu(SDBackendModule module); + bool runtime_backend_supports_host_buffer(SDBackendModule module); + +private: + bool validate(std::string* error) const; + ggml_backend_t init_cached_backend(const std::string& name); +}; + +bool sd_backend_is(ggml_backend_t backend, const std::string& name); +const char* sd_backend_module_name(SDBackendModule module); +void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value); +#endif diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp deleted file mode 100644 index 50158c883..000000000 --- a/src/ggml_extend_backend.hpp +++ /dev/null @@ -1,298 +0,0 @@ -#ifndef __GGML_EXTEND_BACKEND_HPP__ -#define __GGML_EXTEND_BACKEND_HPP__ - -#include -#include - -#include "ggml-backend.h" -#include "ggml.h" - -#ifndef __STATIC_INLINE__ -#define __STATIC_INLINE__ static inline -#endif - -inline void ggml_backend_load_all_once() { - // If the registry already has devices and 
the CPU backend is present, - // assume either static registration or explicit host-side preloading has - // completed and avoid rescanning the default paths. - if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { - return; - } - // In dynamic-backend mode the backend modules are discovered at runtime, - // so we must load them before asking for the CPU backend or its proc table. - // If the host preloaded only a subset of backends, allow one default-path - // scan so missing modules can still be discovered. - static std::once_flag once; - std::call_once(once, []() { - if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { - return; - } - ggml_backend_load_all(); - }); -} - -// Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS: -// those are CMake options used to configure ggml itself, but they are not -// exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode. -// In practice, this target can reliably see GGML_BACKEND_DL, but not whether -// the CPU backend was compiled as a loadable module. We therefore use runtime -// backend discovery instead of compile-time assumptions. 
- -__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_cpu_reg() { - ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU"); - if (reg != nullptr) { - return reg; - } - - ggml_backend_load_all_once(); - return ggml_backend_reg_by_name("CPU"); -} - -__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_reg_from_backend(ggml_backend_t backend) { - if (backend != nullptr) { - ggml_backend_dev_t device = ggml_backend_get_device(backend); - if (device != nullptr) { - return ggml_backend_dev_backend_reg(device); - } - } - - return ggml_backend_cpu_reg(); -} - -__STATIC_INLINE__ ggml_backend_t ggml_backend_cpu_init() { - ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (backend != nullptr) { - return backend; - } - - ggml_backend_load_all_once(); - return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); -} - -__STATIC_INLINE__ bool ggml_backend_is_cpu(ggml_backend_t backend) { - if (backend == nullptr) { - return false; - } - - ggml_backend_dev_t device = ggml_backend_get_device(backend); - if (device != nullptr) { - return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU; - } - - const char* backend_name = ggml_backend_name(backend); - return backend_name != nullptr && std::strcmp(backend_name, "CPU") == 0; -} - -__STATIC_INLINE__ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { - ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); - if (reg == nullptr) { - return; - } - - auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads")); - if (fn != nullptr) { - fn(backend_cpu, n_threads); - } -} - -using __ggml_backend_cpu_set_threadpool_t = void (*)(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); - -__STATIC_INLINE__ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { - ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); - if (reg == nullptr) { - 
return; - } - - auto fn = reinterpret_cast<__ggml_backend_cpu_set_threadpool_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool")); - if (fn != nullptr) { - fn(backend_cpu, threadpool); - } -} - -__STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void* abort_callback_data) { - ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); - if (reg == nullptr) { - return; - } - - auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback")); - if (fn != nullptr) { - fn(backend_cpu, abort_callback, abort_callback_data); - } -} - -__STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { - if (tensor == nullptr) { - return nullptr; - } - - return tensor->view_src ? tensor->view_src->buffer : tensor->buffer; -} - -__STATIC_INLINE__ bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) { - if (tensor == nullptr || tensor->data == nullptr) { - return false; - } - - ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor); - return buffer == nullptr || ggml_backend_buffer_is_host(buffer); -} - -__STATIC_INLINE__ size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - return (size_t)(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]); -} - -template -__STATIC_INLINE__ void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) { - const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3); - - if (ggml_backend_tensor_is_host_accessible(tensor)) { - auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset); - *dst = value; - return; - } - - ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T)); -} - -__STATIC_INLINE__ void ggml_set_f32_nd(const struct 
ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) { - switch (tensor->type) { - case GGML_TYPE_I8: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); - break; - case GGML_TYPE_I16: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); - break; - case GGML_TYPE_I32: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); - break; - case GGML_TYPE_F16: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value)); - break; - case GGML_TYPE_BF16: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value)); - break; - case GGML_TYPE_F32: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value); - break; - default: - GGML_ABORT("fatal error"); - } -} - -__STATIC_INLINE__ void ggml_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) { - if (!ggml_is_contiguous(tensor)) { - int64_t id[4] = {0, 0, 0, 0}; - ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); - ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); - return; - } - - switch (tensor->type) { - case GGML_TYPE_I8: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); - break; - case GGML_TYPE_I16: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); - break; - case GGML_TYPE_I32: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); - break; - case GGML_TYPE_F16: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value)); - break; - case GGML_TYPE_BF16: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value)); - break; - case GGML_TYPE_F32: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value); - break; - default: - GGML_ABORT("fatal error"); - } -} - -__STATIC_INLINE__ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context* ctx, struct ggml_cgraph* cgraph, int n_threads) { - (void)ctx; - - 
// The legacy ggml_graph_compute_with_ctx() symbol lives in ggml-cpu, but - // the backend proc table does not expose it in GGML_BACKEND_DL mode. - // Recreate the old behavior by initializing the CPU backend explicitly and - // executing the graph through the generic backend API. - ggml_backend_t backend = ggml_backend_cpu_init(); - if (backend == nullptr) { - return GGML_STATUS_ALLOC_FAILED; - } - - ggml_backend_cpu_set_n_threads(backend, n_threads); - - const enum ggml_status status = ggml_backend_graph_compute(backend, cgraph); - ggml_backend_free(backend); - - return status; -} - -__STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float value) { - GGML_ASSERT(tensor != nullptr); - - if (ggml_backend_tensor_is_host_accessible(tensor) && ggml_is_contiguous(tensor)) { - const int64_t nelements = ggml_nelements(tensor); - - switch (tensor->type) { - case GGML_TYPE_I8: { - auto* data = reinterpret_cast(tensor->data); - const int8_t v = static_cast(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_I16: { - auto* data = reinterpret_cast(tensor->data); - const int16_t v = static_cast(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_I32: { - auto* data = reinterpret_cast(tensor->data); - const int32_t v = static_cast(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_F16: { - auto* data = reinterpret_cast(tensor->data); - const ggml_fp16_t v = ggml_fp32_to_fp16(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_BF16: { - auto* data = reinterpret_cast(tensor->data); - const ggml_bf16_t v = ggml_fp32_to_bf16(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_F32: { - auto* data = reinterpret_cast(tensor->data); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = value; - } - } break; - default: - 
GGML_ABORT("fatal error"); - } - - return tensor; - } - - const int64_t nelements = ggml_nelements(tensor); - for (int64_t i = 0; i < nelements; ++i) { - ggml_set_f32_1d(tensor, static_cast(i), value); - } - - return tensor; -} - -#endif diff --git a/src/hidream_o1.hpp b/src/hidream_o1.hpp index 908f2de30..d72739d56 100644 --- a/src/hidream_o1.hpp +++ b/src/hidream_o1.hpp @@ -279,10 +279,10 @@ namespace HiDreamO1 { std::array, 4> pos_embed_weight_data_; HiDreamO1VisionRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& prefix = "model.visual") - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), params(make_hidream_o1_params()), model(std::make_shared(false, params.llm.vision)) { model->init(params_ctx, tensor_storage_map, prefix); @@ -336,10 +336,10 @@ namespace HiDreamO1 { std::vector attention_mask_vec; HiDreamO1Runner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& prefix = "model") - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), params(make_hidream_o1_params()) { model = HiDreamO1Model(params); model.init(params_ctx, tensor_storage_map, prefix); @@ -461,9 +461,9 @@ namespace HiDreamO1 { std::shared_ptr vision_runner; HiDreamO1Conditioner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) - : vision_runner(std::make_shared(backend, offload_params_to_cpu, tensor_storage_map)) {} + : vision_runner(std::make_shared(backend, params_backend, tensor_storage_map)) {} void get_param_tensors(std::map& tensors) override { vision_runner->get_param_tensors(tensors); diff --git a/src/llm.hpp b/src/llm.hpp index 8509b6b77..877bb82a3 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -979,11 
+979,11 @@ namespace LLM { public: LLMRunner(LLMArch arch, ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool enable_vision_ = false) - : GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) { + : GGMLRunner(backend, params_backend), enable_vision(enable_vision_) { params.arch = arch; if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) { params.head_dim = 128; @@ -1227,11 +1227,11 @@ namespace LLM { LLMEmbedder(LLMArch arch, ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", bool enable_vision = false) - : model(arch, backend, offload_params_to_cpu, tensor_storage_map, prefix, enable_vision) { + : model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) { if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) { tokenizer = std::make_shared(); } else { @@ -1481,7 +1481,7 @@ namespace LLM { std::shared_ptr llm = std::make_shared(arch, backend, - true, + backend, tensor_storage_map, "text_encoders.llm", true); diff --git a/src/lora.hpp b/src/lora.hpp index b57bc4226..3d2b76992 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -22,10 +22,11 @@ struct LoraModel : public GGMLRunner { LoraModel(const std::string& lora_id, ggml_backend_t backend, + ggml_backend_t params_backend, const std::string& file_path = "", std::string prefix = "", SDVersion version = VERSION_COUNT) - : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, false) { + : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, params_backend) { prefix = "lora." 
+ prefix; if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) { load_failed = true; diff --git a/src/mmdit.hpp b/src/mmdit.hpp index e57041dc9..6fcd732e5 100644 --- a/src/mmdit.hpp +++ b/src/mmdit.hpp @@ -828,10 +828,10 @@ struct MMDiTRunner : public GGMLRunner { MMDiT mmdit; MMDiTRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") - : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) { + : GGMLRunner(backend, params_backend), mmdit(tensor_storage_map) { mmdit.init(params_ctx, tensor_storage_map, prefix); } @@ -934,7 +934,7 @@ struct MMDiTRunner : public GGMLRunner { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr mmdit = std::make_shared(backend, false); + std::shared_ptr mmdit = std::make_shared(backend, backend); { LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/src/model.cpp b/src/model.cpp index 9d7a9233d..df63c669b 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -24,7 +24,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" -#include "ggml_extend_backend.hpp" +#include "ggml_extend_backend.h" #include "zip.h" #include "name_conversion.h" diff --git a/src/pmid.hpp b/src/pmid.hpp index f19a8c3cc..2a9d2da7c 100644 --- a/src/pmid.hpp +++ b/src/pmid.hpp @@ -411,13 +411,13 @@ struct PhotoMakerIDEncoder : public GGMLRunner { public: PhotoMakerIDEncoder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f) - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), version(version), pm_version(pm_v), style_strength(sty) { @@ 
-568,11 +568,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner { bool applied = false; PhotoMakerIDEmbed(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, ModelLoader* ml, const std::string& file_path = "", const std::string& prefix = "") - : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) { + : file_path(file_path), GGMLRunner(backend, params_backend), model_loader(ml) { if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) { load_failed = true; } diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp index 35d32109e..73c1f9aec 100644 --- a/src/qwen_image.hpp +++ b/src/qwen_image.hpp @@ -488,12 +488,12 @@ namespace Qwen { SDVersion version; QwenImageRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", SDVersion version = VERSION_QWEN_IMAGE, bool zero_cond_t = false) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { qwen_image_params.num_layers = 0; qwen_image_params.zero_cond_t = zero_cond_t; for (auto pair : tensor_storage_map) { @@ -686,7 +686,7 @@ namespace Qwen { } std::shared_ptr qwen_image = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model", VERSION_QWEN_IMAGE); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index f8dbc68bb..e46d217c9 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -113,10 +113,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { class StableDiffusionGGML { public: std::vector mmap_tensor_store; - ggml_backend_t backend = nullptr; // general backend - ggml_backend_t clip_backend = nullptr; - ggml_backend_t control_net_backend = nullptr; - ggml_backend_t vae_backend = nullptr; + SDBackendManager backend_manager; SDVersion version; bool vae_decode_only = false; @@ -151,6 +148,8 @@ class 
StableDiffusionGGML { bool offload_params_to_cpu = false; float max_vram = 0.f; bool use_pmid = false; + std::string backend_spec; + std::string params_backend_spec; bool is_using_v_parameterization = false; bool is_using_edm_v_parameterization = false; @@ -164,21 +163,44 @@ class StableDiffusionGGML { StableDiffusionGGML() = default; - ~StableDiffusionGGML() { - if (clip_backend != backend) { - ggml_backend_free(clip_backend); + ~StableDiffusionGGML() = default; + + ggml_backend_t backend_for(SDBackendModule module) { + ggml_backend_t module_backend = backend_manager.runtime_backend(module); + if (module_backend == nullptr) { + LOG_ERROR("failed to initialize %s backend", sd_backend_module_name(module)); } - if (control_net_backend != backend) { - ggml_backend_free(control_net_backend); + return module_backend; + } + + ggml_backend_t params_backend_for(SDBackendModule module) { + ggml_backend_t module_backend = backend_manager.params_backend(module); + if (module_backend == nullptr) { + LOG_ERROR("failed to initialize %s params backend", sd_backend_module_name(module)); } - if (vae_backend != backend) { - ggml_backend_free(vae_backend); + return module_backend; + } + + bool ensure_backend_pair(SDBackendModule module) { + if (backend_for(module) == nullptr) { + return false; } - ggml_backend_free(backend); + return params_backend_for(module) != nullptr; } - void init_backend() { - backend = sd_get_default_backend(); + bool init_backend(const sd_ctx_params_t* sd_ctx_params) { + std::string error; + if (!backend_manager.init(sd_ctx_params->backend, + sd_ctx_params->params_backend, + sd_ctx_params->offload_params_to_cpu, + sd_ctx_params->keep_clip_on_cpu, + sd_ctx_params->keep_vae_on_cpu, + sd_ctx_params->keep_control_net_on_cpu, + &error)) { + LOG_ERROR("backend config failed: %s", error.c_str()); + return false; + } + return ensure_backend_pair(SDBackendModule::DIFFUSION); } std::shared_ptr get_rng(rng_type_t rng_type) { @@ -197,6 +219,8 @@ class StableDiffusionGGML 
{ free_params_immediately = sd_ctx_params->free_params_immediately; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; max_vram = sd_ctx_params->max_vram; + backend_spec = SAFE_STR(sd_ctx_params->backend); + params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); bool use_tae = false; @@ -209,8 +233,10 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); - max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend); + if (!init_backend(sd_ctx_params)) { + return false; + } + max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION)); ModelLoader model_loader; @@ -368,7 +394,6 @@ class StableDiffusionGGML { std::map mmap_able_tensors; bool enable_mmap_tensors = false; - bool main_backend_mmap = false; bool needs_writable_mmap = false; if (sd_ctx_params->enable_mmap) { if (apply_lora_immediately) { @@ -376,21 +401,19 @@ class StableDiffusionGGML { LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap"); } enable_mmap_tensors = true; - if (offload_params_to_cpu) { - main_backend_mmap = true; - } else { - ggml_backend_dev_t dev = ggml_backend_get_device(backend); - struct ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - main_backend_mmap = props.caps.buffer_from_host_ptr; - } } // split definition to avoid msvc choking on the extra parameter handling - auto get_param_tensors_p = [&](auto&& model, bool force_cpu, const char* prefix) { + auto module_can_mmap = [&](SDBackendModule module) { + return enable_mmap_tensors && + (backend_manager.runtime_backend_is_cpu(module) || + backend_manager.params_backend_is_cpu(module) || + backend_manager.runtime_backend_supports_host_buffer(module)); + }; + + auto get_param_tensors_p = [&](auto&& model, bool do_mmap, const char* prefix) { std::map temp; model->get_param_tensors(temp, prefix); - bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu); 
for (const auto& [key, tensor] : temp) { tensors[key] = tensor; if (do_mmap) { @@ -399,10 +422,9 @@ class StableDiffusionGGML { } }; - auto get_param_tensors = [&](auto&& model, bool force_cpu = false) { + auto get_param_tensors = [&](auto&& model, bool do_mmap) { std::map temp; model->get_param_tensors(temp); - bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu); for (const auto& [key, tensor] : temp) { tensors[key] = tensor; if (do_mmap) { @@ -426,22 +448,20 @@ class StableDiffusionGGML { LOG_INFO("Using circular padding for convolutions"); } - bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; - const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(max_vram); { - clip_backend = backend; - if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("CLIP: Using CPU backend"); - clip_backend = ggml_backend_cpu_init(); + if (!ensure_backend_pair(SDBackendModule::TE) || + !ensure_backend_pair(SDBackendModule::DIFFUSION)) { + return false; } + if (sd_version_is_sd3(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map); } else if (sd_version_is_flux(version)) { bool is_chroma = false; @@ -461,54 +481,54 @@ class StableDiffusionGGML { "--chroma-disable-dit-mask as a workaround."); } - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else if (version == VERSION_OVIS_IMAGE) { - cond_stage_model = 
std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version, "", false); } else { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); } - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, true, 0, true); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { - high_noise_diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + high_noise_diffusion_model = 
std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.high_noise_diffusion_model", version); @@ -516,62 +536,65 @@ class StableDiffusionGGML { if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || diffusion_model->get_desc() == "Wan2.1-I2V-1.3B") { - clip_vision = std::make_shared(backend, - offload_params_to_cpu, + if (!ensure_backend_pair(SDBackendModule::CLIP_VISION)) { + return false; + } + clip_vision = std::make_shared(backend_for(SDBackendModule::CLIP_VISION), + params_backend_for(SDBackendModule::CLIP_VISION), tensor_storage_map); clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(clip_vision); + get_param_tensors(clip_vision, module_can_mmap(SDBackendModule::CLIP_VISION)); } } else if (sd_version_is_qwen_image(version)) { bool enable_vision = false; if (!vae_decode_only) { enable_vision = true; } - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version, "", enable_vision); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version, sd_ctx_params->qwen_image_zero_cond_t); } else if (version == VERSION_HIDREAM_O1) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, 
"model"); } else if (sd_version_is_anima(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version); } else if (sd_version_is_ernie_image(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model"); } else { // SD1.x SD2.x SDXL @@ -580,21 +603,21 @@ class StableDiffusionGGML { embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), 
tensor_storage_map, embbeding_map, version, PM_VERSION_2); } else { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, embbeding_map, version); } - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, version); if (sd_ctx_params->diffusion_conv_direct) { @@ -604,10 +627,10 @@ class StableDiffusionGGML { } cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(cond_stage_model, clip_on_cpu); + get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE)); diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(diffusion_model); + get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -615,30 +638,27 @@ class StableDiffusionGGML { if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(high_noise_diffusion_model); + get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); } - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { - vae_backend = backend; + if (!ensure_backend_pair(SDBackendModule::VAE)) { + return false; } auto create_tae = [&]() -> std::shared_ptr { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - return std::make_shared(vae_backend, - offload_params_to_cpu, + return std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, 
"decoder", vae_decode_only, version); } else { - auto model = std::make_shared(vae_backend, - offload_params_to_cpu, + auto model = std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, "decoder.layers", vae_decode_only, @@ -651,15 +671,15 @@ class StableDiffusionGGML { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - return std::make_shared(vae_backend, - offload_params_to_cpu, + return std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", vae_decode_only, version); } else { - auto model = std::make_shared(vae_backend, - offload_params_to_cpu, + auto model = std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", vae_decode_only, @@ -678,28 +698,28 @@ class StableDiffusionGGML { } }; - bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu; + bool vae_mmap = module_can_mmap(SDBackendModule::VAE); if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, - vae_backend, - offload_params_to_cpu); + backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE)); } else if (use_tae && !tae_preview_only) { LOG_INFO("using TAE for encoding / decoding"); first_stage_model = create_tae(); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors_p(first_stage_model, force_vae_cpu, "tae"); + get_param_tensors_p(first_stage_model, vae_mmap, "tae"); } else { LOG_INFO("using VAE for encoding / decoding"); first_stage_model = create_vae(); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors_p(first_stage_model, force_vae_cpu, "first_stage_model"); + get_param_tensors_p(first_stage_model, vae_mmap, "first_stage_model"); if (use_tae && 
tae_preview_only) { LOG_INFO("using TAE for preview"); preview_vae = create_tae(); preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors_p(first_stage_model, force_vae_cpu, "vae"); + get_param_tensors_p(first_stage_model, vae_mmap, "vae"); } } @@ -712,15 +732,11 @@ class StableDiffusionGGML { } if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { - ggml_backend_t controlnet_backend = nullptr; - if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_DEBUG("ControlNet: Using CPU backend"); - controlnet_backend = ggml_backend_cpu_init(); - } else { - controlnet_backend = backend; + if (!ensure_backend_pair(SDBackendModule::CONTROL_NET)) { + return false; } - control_net = std::make_shared(controlnet_backend, - offload_params_to_cpu, + control_net = std::make_shared(backend_for(SDBackendModule::CONTROL_NET), + params_backend_for(SDBackendModule::CONTROL_NET), tensor_storage_map, version); if (sd_ctx_params->diffusion_conv_direct) { @@ -729,23 +745,31 @@ class StableDiffusionGGML { } } - if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { - pmid_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "pmid", - version, - PM_VERSION_2); - LOG_INFO("using PhotoMaker Version 2"); - } else { - pmid_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "pmid", - version); - } if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) { - pmid_lora = std::make_shared("pmid", backend, sd_ctx_params->photo_maker_path, "", version); + if (!ensure_backend_pair(SDBackendModule::PHOTOMAKER)) { + return false; + } + if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { + pmid_model = std::make_shared(backend_for(SDBackendModule::PHOTOMAKER), + params_backend_for(SDBackendModule::PHOTOMAKER), + tensor_storage_map, + "pmid", + version, + PM_VERSION_2); + LOG_INFO("using PhotoMaker Version 2"); + } else { + pmid_model = 
std::make_shared(backend_for(SDBackendModule::PHOTOMAKER), + params_backend_for(SDBackendModule::PHOTOMAKER), + tensor_storage_map, + "pmid", + version); + } + pmid_lora = std::make_shared("pmid", + backend_for(SDBackendModule::PHOTOMAKER), + params_backend_for(SDBackendModule::PHOTOMAKER), + sd_ctx_params->photo_maker_path, + "", + version); auto lora_tensor_filter = [&](const std::string& tensor_name) { if (starts_with(tensor_name, "lora.model")) { return true; @@ -764,7 +788,7 @@ class StableDiffusionGGML { } } if (use_pmid) { - get_param_tensors_p(pmid_model, false, "pmid"); + get_param_tensors_p(pmid_model, module_can_mmap(SDBackendModule::PHOTOMAKER), "pmid"); } if (sd_ctx_params->flash_attn) { @@ -857,8 +881,10 @@ class StableDiffusionGGML { } } - if (clip_vision) { - clip_vision->alloc_params_buffer(); + if (clip_vision && !clip_vision->alloc_params_buffer()) { + LOG_ERROR("CLIP vision params buffer allocation failed"); + ggml_free(ctx); + return false; } if (cond_stage_model) { cond_stage_model->alloc_params_buffer(); @@ -869,18 +895,20 @@ class StableDiffusionGGML { if (high_noise_diffusion_model) { high_noise_diffusion_model->alloc_params_buffer(); } - if (first_stage_model) { - first_stage_model->alloc_params_buffer(); + if (first_stage_model && !first_stage_model->alloc_params_buffer()) { + LOG_ERROR("VAE params buffer allocation failed"); + ggml_free(ctx); + return false; } - if (preview_vae) { - preview_vae->alloc_params_buffer(); + if (preview_vae && !preview_vae->alloc_params_buffer()) { + LOG_ERROR("preview VAE params buffer allocation failed"); + ggml_free(ctx); + return false; } - if (use_pmid && pmid_model) { - if (!pmid_model->alloc_params_buffer()) { - LOG_ERROR(" pmid model params buffer allocation failed"); - ggml_free(ctx); - return false; - } + if (use_pmid && pmid_model && !pmid_model->alloc_params_buffer()) { + LOG_ERROR("PhotoMaker params buffer allocation failed"); + ggml_free(ctx); + return false; } bool success = 
model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); @@ -906,6 +934,7 @@ class StableDiffusionGGML { size_t control_net_params_mem_size = 0; if (control_net) { if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) { + ggml_free(ctx); return false; } control_net_params_mem_size = control_net->get_params_buffer_size(); @@ -917,28 +946,39 @@ class StableDiffusionGGML { size_t total_params_ram_size = 0; size_t total_params_vram_size = 0; - if (ggml_backend_is_cpu(clip_backend)) { - total_params_ram_size += clip_params_mem_size + pmid_params_mem_size; - } else { - total_params_vram_size += clip_params_mem_size + pmid_params_mem_size; - } - - if (ggml_backend_is_cpu(backend)) { - total_params_ram_size += unet_params_mem_size; - } else { - total_params_vram_size += unet_params_mem_size; - } - - if (ggml_backend_is_cpu(vae_backend)) { - total_params_ram_size += vae_params_mem_size; - } else { - total_params_vram_size += vae_params_mem_size; - } + auto add_params_memory = [&](size_t size, SDBackendModule module) { + if (size == 0) { + return true; + } + ggml_backend_t module_backend = params_backend_for(module); + if (module_backend == nullptr) { + return false; + } + if (ggml_backend_is_cpu(module_backend)) { + total_params_ram_size += size; + } else { + total_params_vram_size += size; + } + return true; + }; + auto params_memory_location = [&](size_t size, SDBackendModule module) { + if (size == 0) { + return "N/A"; + } + ggml_backend_t module_backend = params_backend_for(module); + if (module_backend == nullptr) { + return "N/A"; + } + return ggml_backend_is_cpu(module_backend) ? 
"RAM" : "VRAM"; + }; - if (ggml_backend_is_cpu(control_net_backend)) { - total_params_ram_size += control_net_params_mem_size; - } else { - total_params_vram_size += control_net_params_mem_size; + if (!add_params_memory(clip_params_mem_size, SDBackendModule::TE) || + !add_params_memory(pmid_params_mem_size, SDBackendModule::PHOTOMAKER) || + !add_params_memory(unet_params_mem_size, SDBackendModule::DIFFUSION) || + !add_params_memory(vae_params_mem_size, SDBackendModule::VAE) || + !add_params_memory(control_net_params_mem_size, SDBackendModule::CONTROL_NET)) { + ggml_free(ctx); + return false; } size_t total_params_size = total_params_ram_size + total_params_vram_size; @@ -949,15 +989,15 @@ class StableDiffusionGGML { total_params_vram_size / 1024.0 / 1024.0, total_params_ram_size / 1024.0 / 1024.0, clip_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM", + params_memory_location(clip_params_mem_size, SDBackendModule::TE), unet_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(backend) ? "RAM" : "VRAM", + params_memory_location(unet_params_mem_size, SDBackendModule::DIFFUSION), vae_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM", + params_memory_location(vae_params_mem_size, SDBackendModule::VAE), control_net_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM", + params_memory_location(control_net_params_mem_size, SDBackendModule::CONTROL_NET), pmid_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(clip_backend) ? 
"RAM" : "VRAM"); + params_memory_location(pmid_params_mem_size, SDBackendModule::PHOTOMAKER)); } // init denoiser @@ -1092,7 +1132,7 @@ class StableDiffusionGGML { std::shared_ptr load_lora_model_from_file(const std::string& lora_id, float multiplier, - ggml_backend_t backend, + SDBackendModule module, LoraModel::filter_t lora_tensor_filter = nullptr) { std::string lora_path = lora_id; static std::string high_noise_tag = "|high_noise|"; @@ -1102,7 +1142,15 @@ class StableDiffusionGGML { is_high_noise = true; LOG_DEBUG("high noise lora: %s", lora_path.c_str()); } - auto lora = std::make_shared(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version); + if (!ensure_backend_pair(module)) { + return nullptr; + } + auto lora = std::make_shared(lora_id, + backend_for(module), + params_backend_for(module), + lora_path, + is_high_noise ? "model.high_noise_" : "", + version); if (!lora->load_from_file(n_threads, lora_tensor_filter)) { LOG_WARN("load lora tensors from %s failed", lora_path.c_str()); return nullptr; @@ -1141,7 +1189,7 @@ class StableDiffusionGGML { for (auto& kv : lora_state_diff) { int64_t t0 = ggml_time_ms(); - auto lora = load_lora_model_from_file(kv.first, kv.second, backend); + auto lora = load_lora_model_from_file(kv.first, kv.second, SDBackendModule::DIFFUSION); if (!lora || lora->lora_tensors.empty()) { continue; } @@ -1199,7 +1247,7 @@ class StableDiffusionGGML { const std::string& lora_id = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_id, multiplier, SDBackendModule::TE, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); cond_stage_lora_models.push_back(lora); @@ -1236,7 +1284,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, 
backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, SDBackendModule::DIFFUSION, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); diffusion_lora_models.push_back(lora); @@ -1274,7 +1322,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, SDBackendModule::VAE, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); first_stage_lora_models.push_back(lora); @@ -2296,15 +2344,17 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->chroma_use_dit_mask = true; sd_ctx_params->chroma_use_t5_mask = false; sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->backend = nullptr; + sd_ctx_params->params_backend = nullptr; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { - char* buf = (char*)malloc(4096); + char* buf = (char*)malloc(8192); if (!buf) return nullptr; buf[0] = '\0'; - snprintf(buf + strlen(buf), 4096 - strlen(buf), + snprintf(buf + strlen(buf), 8192 - strlen(buf), "model_path: %s\n" "clip_l_path: %s\n" "clip_g_path: %s\n" @@ -2328,6 +2378,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "prediction: %s\n" "offload_params_to_cpu: %s\n" "max_vram: %.3f\n" + "backend: %s\n" + "params_backend: %s\n" "keep_clip_on_cpu: %s\n" "keep_control_net_on_cpu: %s\n" "keep_vae_on_cpu: %s\n" @@ -2361,6 +2413,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), sd_ctx_params->max_vram, + SAFE_STR(sd_ctx_params->backend), + SAFE_STR(sd_ctx_params->params_backend), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), 
BOOL_STR(sd_ctx_params->keep_vae_on_cpu), @@ -3596,7 +3650,9 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s LOG_INFO("hires fix: loading model upscaler from '%s'", request.hires.model_path); hires_upscaler = std::make_unique(sd_ctx->sd->n_threads, false, - request.hires.upscale_tile_size); + request.hires.upscale_tile_size, + sd_ctx->sd->backend_spec, + sd_ctx->sd->params_backend_spec); const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram); hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes); if (!hires_upscaler->load_from_file(request.hires.model_path, diff --git a/src/t5.hpp b/src/t5.hpp index 71545e522..01c35d7de 100644 --- a/src/t5.hpp +++ b/src/t5.hpp @@ -321,11 +321,11 @@ struct T5Runner : public GGMLRunner { std::vector relative_position_bucket_vec; T5Runner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool is_umt5 = false) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { if (is_umt5) { params.vocab_size = 256384; params.relative_attention = false; @@ -464,11 +464,11 @@ struct T5Embedder { T5Runner model; T5Embedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", bool is_umt5 = false) - : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { + : model(backend, params_backend, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { } void get_param_tensors(std::map& tensors, const std::string prefix) { @@ -576,7 +576,7 @@ struct T5Embedder { } } - std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true); + std::shared_ptr t5 = std::make_shared(backend, backend, tensor_storage_map, "", true); t5->alloc_params_buffer(); 
std::map tensors; diff --git a/src/tae.hpp b/src/tae.hpp index 0a0ca6827..44cffd07b 100644 --- a/src/tae.hpp +++ b/src/tae.hpp @@ -542,14 +542,14 @@ struct TinyImageAutoEncoder : public VAE { bool decode_only = false; TinyImageAutoEncoder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_SD1) : decode_only(decoder_only), taesd(decoder_only, version), - VAE(version, backend, offload_params_to_cpu) { + VAE(version, backend, params_backend) { scale_input = false; taesd.init(params_ctx, tensor_storage_map, prefix); } @@ -604,14 +604,14 @@ struct TinyVideoAutoEncoder : public VAE { bool decode_only = false; TinyVideoAutoEncoder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_WAN2) : decode_only(decoder_only), taehv(decoder_only, version), - VAE(version, backend, offload_params_to_cpu) { + VAE(version, backend, params_backend) { scale_input = false; taehv.init(params_ctx, tensor_storage_map, prefix); } diff --git a/src/unet.hpp b/src/unet.hpp index d7ea8c3fa..a67fe0764 100644 --- a/src/unet.hpp +++ b/src/unet.hpp @@ -603,11 +603,11 @@ struct UNetModelRunner : public GGMLRunner { UnetModelBlock unet; UNetModelRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, SDVersion version = VERSION_SD1) - : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_storage_map) { + : GGMLRunner(backend, params_backend), unet(version, tensor_storage_map) { unet.init(params_ctx, tensor_storage_map, prefix); } diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 25fc0c5df..1197ce35e 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ 
-4,12 +4,18 @@ #include "stable-diffusion.h" #include "util.h" +#include + UpscalerGGML::UpscalerGGML(int n_threads, bool direct, - int tile_size) + int tile_size, + std::string backend_spec, + std::string params_backend_spec) : n_threads(n_threads), direct(direct), - tile_size(tile_size) { + tile_size(tile_size), + backend_spec(std::move(backend_spec)), + params_backend_spec(std::move(params_backend_spec)) { } void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) { @@ -24,19 +30,51 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, int n_threads) { ggml_log_set(ggml_log_callback_default, nullptr); - backend = sd_get_default_backend(); + std::string error; + if (!backend_manager.init(backend_spec.c_str(), + params_backend_spec.c_str(), + offload_params_to_cpu, + false, + false, + false, + &error)) { + LOG_ERROR("upscaler backend config failed: %s", error.c_str()); + return false; + } + auto backend_for = [&](SDBackendModule module) { + ggml_backend_t module_backend = backend_manager.runtime_backend(module); + if (module_backend == nullptr) { + LOG_ERROR("failed to initialize %s backend", sd_backend_module_name(module)); + } + return module_backend; + }; + auto params_backend_for = [&](SDBackendModule module) { + ggml_backend_t module_backend = backend_manager.params_backend(module); + if (module_backend == nullptr) { + LOG_ERROR("failed to initialize %s params backend", sd_backend_module_name(module)); + } + return module_backend; + }; + auto ensure_backend_pair = [&](SDBackendModule module) { + if (backend_for(module) == nullptr) { + return false; + } + return params_backend_for(module) != nullptr; + }; + if (!ensure_backend_pair(SDBackendModule::UPSCALER)) { + return false; + } ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); } model_loader.set_wtype_override(model_data_type); - if (!backend) { - LOG_DEBUG("Using 
CPU backend"); - backend = ggml_backend_cpu_init(); - } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map()); + esrgan_upscaler = std::make_shared(backend_for(SDBackendModule::UPSCALER), + params_backend_for(SDBackendModule::UPSCALER), + tile_size, + model_loader.get_tensor_storage_map()); esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes); if (direct) { esrgan_upscaler->set_conv2d_direct_enabled(true); @@ -110,14 +148,16 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, bool offload_params_to_cpu, bool direct, int n_threads, - int tile_size) { + int tile_size, + const char* backend, + const char* params_backend) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == nullptr) { return nullptr; } std::string esrgan_path(esrgan_path_c_str); - upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size); + upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size, SAFE_STR(backend), SAFE_STR(params_backend)); if (upscaler_ctx->upscaler == nullptr) { return nullptr; } diff --git a/src/upscaler.h b/src/upscaler.h index d667a6f15..e3967865c 100644 --- a/src/upscaler.h +++ b/src/upscaler.h @@ -2,6 +2,7 @@ #define __SD_UPSCALER_H__ #include "esrgan.hpp" +#include "ggml_extend_backend.h" #include "stable-diffusion.h" #include "tensor.hpp" @@ -9,7 +10,7 @@ #include struct UpscalerGGML { - ggml_backend_t backend = nullptr; // general backend + SDBackendManager backend_manager; ggml_type model_data_type = GGML_TYPE_F16; std::shared_ptr esrgan_upscaler; std::string esrgan_path; @@ -17,10 +18,14 @@ struct UpscalerGGML { bool direct = false; int tile_size = 128; size_t max_graph_vram_bytes = 0; + std::string backend_spec; + std::string params_backend_spec; UpscalerGGML(int n_threads, - bool direct = false, - int tile_size = 128); + bool direct = 
false, + int tile_size = 128, + std::string backend_spec = "", + std::string params_backend_spec = ""); bool load_from_file(const std::string& esrgan_path, bool offload_params_to_cpu, diff --git a/src/util.cpp b/src/util.cpp index 586284c84..1c2e5e899 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -25,7 +25,7 @@ #include "ggml-backend.h" #include "ggml.h" -#include "ggml_extend_backend.hpp" +#include "ggml_extend_backend.h" #include "stable-diffusion.h" bool ends_with(const std::string& str, const std::string& ending) { @@ -758,76 +758,6 @@ std::vector> parse_prompt_attention(const std::str return res; } -// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. -bool sd_backend_is(ggml_backend_t backend, const std::string& name) { - if (!backend) { - return false; - } - ggml_backend_dev_t dev = ggml_backend_get_device(backend); - if (!dev) - return false; - std::string dev_name = ggml_backend_dev_name(dev); - return dev_name.find(name) != std::string::npos; -} - -ggml_backend_t sd_get_default_backend() { - ggml_backend_load_all_once(); - static std::once_flag once; - std::call_once(once, []() { - size_t dev_count = ggml_backend_dev_count(); - if (dev_count == 0) { - LOG_ERROR("No devices found!"); - } else { - LOG_DEBUG("Found %zu backend devices:", dev_count); - for (size_t i = 0; i < dev_count; ++i) { - auto dev = ggml_backend_dev_get(i); - LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev)); - } - } - }); - ggml_backend_t backend = nullptr; - const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); - if (SD_VK_DEVICE != nullptr) { - std::string sd_vk_device_str = SD_VK_DEVICE; - try { - unsigned long long device = std::stoull(sd_vk_device_str); - std::string vk_device_name = "Vulkan" + std::to_string(device); - if (backend_name_exists(vk_device_name)) { - LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str()); - backend = init_named_backend(vk_device_name); - if (!backend) { - LOG_WARN("Device %s requested by 
SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str()); - } - } else { - LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str()); - } - } catch (const std::invalid_argument&) { - LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE); - } catch (const std::out_of_range&) { - LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE); - } - } - - if (!backend) { - std::string dev_name = get_default_backend_name(); - backend = init_named_backend(dev_name); - if (!backend && !dev_name.empty()) { - LOG_WARN("device %s failed to init", dev_name.c_str()); - } - } - - if (!backend) { - LOG_WARN("loading CPU backend"); - backend = ggml_backend_cpu_init(); - } - - if (ggml_backend_is_cpu(backend)) { - LOG_DEBUG("Using CPU backend"); - } - - return backend; -} - // namespace is needed to avoid conflicts with ggml_backend_extend.hpp namespace ggml_cpu { #include "ggml-cpu.h" diff --git a/src/util.h b/src/util.h index 628a1f9d7..9843ae18f 100644 --- a/src/util.h +++ b/src/util.h @@ -86,7 +86,6 @@ bool sd_should_preview_noisy(); // test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. bool sd_backend_is(ggml_backend_t backend, const std::string& name); -ggml_backend_t sd_get_default_backend(); #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) 
log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) diff --git a/src/vae.hpp b/src/vae.hpp index 35e73e41f..278ed94b2 100644 --- a/src/vae.hpp +++ b/src/vae.hpp @@ -62,8 +62,8 @@ struct VAE : public GGMLRunner { } public: - VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) - : version(version), GGMLRunner(backend, offload_params_to_cpu) {} + VAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend) + : version(version), GGMLRunner(backend, params_backend) {} int get_scale_factor() { int scale_factor = 8; @@ -216,8 +216,8 @@ struct VAE : public GGMLRunner { }; struct FakeVAE : public VAE { - FakeVAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) - : VAE(version, backend, offload_params_to_cpu) {} + FakeVAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend) + : VAE(version, backend, params_backend) {} int get_encoder_output_channels(int input_channels) { return input_channels; diff --git a/src/wan.hpp b/src/wan.hpp index 261453301..ec8c184a3 100644 --- a/src/wan.hpp +++ b/src/wan.hpp @@ -1126,12 +1126,12 @@ namespace WAN { WanVAE ae; WanVAERunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", bool decode_only = false, SDVersion version = VERSION_WAN2) - : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, offload_params_to_cpu) { + : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, params_backend) { ae.init(params_ctx, tensor_storage_map, prefix); } @@ -1329,7 +1329,7 @@ namespace WAN { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr vae = std::make_shared(backend, false, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V); + 
std::shared_ptr vae = std::make_shared(backend, backend, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -2094,11 +2094,11 @@ namespace WAN { SDVersion version; WanRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", SDVersion version = VERSION_WAN2) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { wan_params.num_layers = 0; for (auto pair : tensor_storage_map) { std::string tensor_name = pair.first; @@ -2346,7 +2346,7 @@ namespace WAN { } std::shared_ptr wan = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model", VERSION_WAN2_2_TI2V); diff --git a/src/z_image.hpp b/src/z_image.hpp index 00b69c264..c0546931f 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -473,11 +473,11 @@ namespace ZImage { SDVersion version; ZImageRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", SDVersion version = VERSION_Z_IMAGE) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { z_image = ZImageModel(z_image_params); z_image.init(params_ctx, tensor_storage_map, prefix); } @@ -620,7 +620,7 @@ namespace ZImage { } std::shared_ptr z_image = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model", VERSION_QWEN_IMAGE);