Luce-Org · weicj · May 28, 2026 · May 28, 2026 · May 29, 2026 · May 29, 2026
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -237,6 +237,7 @@ add_library(dflash_common STATIC
     src/laguna/laguna_target_graph.cpp
     src/laguna/laguna_daemon.cpp
     src/laguna/laguna_backend.cpp
+    src/laguna/laguna_layer_split_adapter.cpp
     src/common/backend_ipc.cpp
     src/common/dflash_feature_ring.cpp
     src/common/dflash_capture.cpp
@@ -246,6 +247,7 @@ add_library(dflash_common STATIC
     src/common/dflash_draft_graph.cpp
     src/common/dflash_spec_decode.cpp
     src/common/layer_split_backend.cpp
+    src/common/layer_split_runtime.cpp
     src/qwen35/graph_builders.cpp
     src/qwen35moe/qwen35moe_ffn.cpp
     src/qwen35moe/qwen35moe_backend.cpp

diff --git a/server/src/common/backend_factory.cpp b/server/src/common/backend_factory.cpp
@@ -6,6 +6,7 @@
 #include "qwen35_backend.h"
 #include "qwen35moe_backend.h"
 #include "laguna_backend.h"
+#include "laguna_layer_split_adapter.h"
 #include "qwen3_backend.h"
 #include "gemma4_backend.h"
 #include "gemma4_layer_split_adapter.h"
@@ -124,8 +125,24 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args) {
         return backend;
 
     } else if (arch == "laguna") {
+        if (args.device.is_layer_split()) {
+            LagunaLayerSplitAdapterConfig cfg;
+            cfg.target_path = args.model_path;
+            cfg.device      = args.device;
+            cfg.chunk       = args.chunk;
+
+            auto adapter = std::make_unique<LagunaLayerSplitAdapter>(cfg);
+            auto backend = std::make_unique<LayerSplitBackend>(std::move(adapter));
+            if (!backend->init()) {
+                std::fprintf(stderr, "[backend_factory] LayerSplitBackend(laguna) init failed\n");
+                return nullptr;
+            }
+            return backend;
+        }
+
         LagunaBackendArgs lcfg;
         lcfg.target_path = args.model_path;
+        lcfg.device      = args.device;
         lcfg.max_ctx     = args.device.max_ctx;
         lcfg.chunk       = args.chunk;
         // kv_type defaults to Q8_0 in LagunaBackendArgs

diff --git a/server/src/common/backend_precision.cpp b/server/src/common/backend_precision.cpp
@@ -7,7 +7,9 @@
 #include <cctype>
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 #include <string>
+#include <vector>
 
 namespace dflash::common {
 namespace {
@@ -106,36 +108,103 @@ int device_props_for(int device,
 #endif
 }
 
+bool arch_starts_with(const std::string & arch, const char * prefix) {
+    return arch.compare(0, std::strlen(prefix), prefix) == 0;
+}
+
+bool parse_precision_override(const char * env_name, ggml_type & out) {
+    if (!env_name || !env_name[0]) return false;
+    const char * s = std::getenv(env_name);
+    if (!s || !s[0]) return false;
+    if (std::strcmp(s, "f32") == 0 || std::strcmp(s, "F32") == 0) {
+        out = GGML_TYPE_F32;
+        return true;
+    }
+    if (std::strcmp(s, "f16") == 0 || std::strcmp(s, "F16") == 0) {
+        out = GGML_TYPE_F16;
+        return true;
+    }
+    if (std::strcmp(s, "bf16") == 0 || std::strcmp(s, "BF16") == 0) {
+        out = GGML_TYPE_BF16;
+        return true;
+    }
+    std::fprintf(stderr, "[precision] ignoring unsupported %s=%s\n", env_name, s);
+    return false;
+}
+
+void fill_policy_device_info(ggml_backend_t backend,
+                             std::string & backend_name_out,
+                             std::string & device_name_out,
+                             std::string & runtime_arch_out,
+                             int & device_id_out,
+                             int & cuda_sm_out) {
+    backend_name_out = backend_name(backend);
+    const std::string logical_name = backend_device_logical_name(backend);
+    device_name_out = backend_device_description(backend);
+    device_id_out = parse_backend_device_id(logical_name);
+    if (device_id_out < 0) {
+        device_id_out = current_device_id();
+    }
+    cuda_sm_out = device_props_for(
+        device_id_out,
+        device_name_out.empty() ? &device_name_out : nullptr,
+        &runtime_arch_out);
+    if (backend_name_out.empty()) backend_name_out = "unknown";
+    if (device_name_out.empty()) device_name_out = "unknown";
+}
+
 } // namespace
 
 const char * backend_precision_type_name(ggml_type type) {
     return ggml_type_name(type);
 }
 
+ggml_type select_cuda_backend_precision_type_for_sm(int sm) {
+    if (sm >= 80) return GGML_TYPE_BF16;
+    if (sm >= 70 || sm == 60) return GGML_TYPE_F16;
+    return GGML_TYPE_F32;
+}
+
+ggml_type select_hip_activation_precision_type_for_arch(const std::string & arch) {
+    if (arch.empty()) return GGML_TYPE_F32;
+    if (arch_starts_with(arch, "gfx90a") ||
+        arch_starts_with(arch, "gfx94")  ||
+        arch_starts_with(arch, "gfx95")  ||
+        arch_starts_with(arch, "gfx11")  ||
+        arch_starts_with(arch, "gfx12")) {
+        return GGML_TYPE_BF16;
+    }
+    if (arch_starts_with(arch, "gfx9") ||
+        arch_starts_with(arch, "gfx10")) {
+        return GGML_TYPE_F16;
+    }
+    return GGML_TYPE_F32;
+}
+
+ggml_type combine_activation_precision_types(ggml_type a, ggml_type b) {
+    if (a == GGML_TYPE_F32 || b == GGML_TYPE_F32) return GGML_TYPE_F32;
+    if (a == GGML_TYPE_F16 || b == GGML_TYPE_F16) return GGML_TYPE_F16;
+    if (a == GGML_TYPE_BF16 && b == GGML_TYPE_BF16) return GGML_TYPE_BF16;
+    return GGML_TYPE_F32;
+}
+
 BackendPrecisionPolicy select_drafter_precision_policy(ggml_backend_t backend) {
     BackendPrecisionPolicy policy;
-    policy.backend_name = backend_name(backend);
-    const std::string logical_name = backend_device_logical_name(backend);
-    policy.device_name = backend_device_description(backend);
-    policy.device_id = parse_backend_device_id(logical_name);
-    if (policy.device_id < 0) {
-        policy.device_id = current_device_id();
-    }
+    fill_policy_device_info(backend, policy.backend_name, policy.device_name,
+                            policy.runtime_arch, policy.device_id,
+                            policy.cuda_sm);
 
 #if defined(DFLASH27B_BACKEND_CUDA)
-    policy.cuda_sm = device_props_for(
-        policy.device_id,
-        policy.device_name.empty() ? &policy.device_name : nullptr,
-        nullptr);
-    if (policy.cuda_sm >= 80) {
+    const ggml_type type = select_cuda_backend_precision_type_for_sm(policy.cuda_sm);
+    if (type == GGML_TYPE_BF16) {
         policy.weight_type  = GGML_TYPE_BF16;
         policy.compute_type = GGML_TYPE_BF16;
         policy.reason       = "CUDA sm80+ BF16 tensor-core path";
-    } else if (policy.cuda_sm >= 70) {
+    } else if (type == GGML_TYPE_F16 && policy.cuda_sm >= 70) {
         policy.weight_type  = GGML_TYPE_F16;
         policy.compute_type = GGML_TYPE_F16;
         policy.reason       = "CUDA sm70-sm79 F16 tensor-core path";
-    } else if (policy.cuda_sm == 60) {
+    } else if (type == GGML_TYPE_F16) {
         policy.weight_type  = GGML_TYPE_F16;
         policy.compute_type = GGML_TYPE_F16;
         policy.reason       = "CUDA sm60 GP100 F16 path";
@@ -145,10 +214,6 @@ BackendPrecisionPolicy select_drafter_precision_policy(ggml_backend_t backend) {
         policy.reason       = "CUDA legacy compatibility fallback without useful F16/BF16 acceleration";
     }
 #elif defined(DFLASH27B_BACKEND_HIP) || defined(GGML_USE_HIP)
-    policy.cuda_sm = device_props_for(
-        policy.device_id,
-        policy.device_name.empty() ? &policy.device_name : nullptr,
-        &policy.runtime_arch);
     policy.weight_type  = GGML_TYPE_BF16;
     policy.compute_type = GGML_TYPE_BF16;
     policy.reason       = "HIP ROCm/ggml BF16-compatible path";
@@ -158,12 +223,92 @@ BackendPrecisionPolicy select_drafter_precision_policy(ggml_backend_t backend) {
     policy.reason       = "portable non-GPU fallback";
 #endif
 
-    if (policy.backend_name.empty()) {
-        policy.backend_name = "unknown";
+    return policy;
+}
+
+BackendActivationPolicy select_common_activation_precision_policy(
+        const std::vector<ggml_backend_t> & backends,
+        bool force_f32,
+        const char * override_env) {
+    if (backends.empty()) {
+        return select_activation_precision_policy(nullptr, force_f32, override_env);
+    }
+
+    BackendActivationPolicy policy =
+        select_activation_precision_policy(backends.front(), force_f32, override_env);
+    ggml_type common_type = policy.activation_type;
+    bool mixed = false;
+    for (size_t i = 1; i < backends.size(); ++i) {
+        const BackendActivationPolicy shard_policy =
+            select_activation_precision_policy(backends[i], force_f32, override_env);
+        if (shard_policy.activation_type != common_type) {
+            mixed = true;
+        }
+        const ggml_type combined =
+            combine_activation_precision_types(common_type, shard_policy.activation_type);
+        if (combined != common_type) {
+            mixed = true;
+        }
+        common_type = combined;
+    }
+    if (mixed) {
+        policy.activation_type = common_type;
+        policy.backend_name = "mixed";
+        policy.device_name = "mixed";
+        policy.runtime_arch = "mixed";
+        policy.device_id = -1;
+        policy.cuda_sm = 0;
+        policy.reason = "common shard-compatible activation path";
+    }
+    return policy;
+}
+
+BackendActivationPolicy select_activation_precision_policy(
+        ggml_backend_t backend,
+        bool force_f32,
+        const char * override_env) {
+    BackendActivationPolicy policy;
+    fill_policy_device_info(backend, policy.backend_name, policy.device_name,
+                            policy.runtime_arch, policy.device_id,
+                            policy.cuda_sm);
+
+    ggml_type override_type = GGML_TYPE_F32;
+    if (!force_f32 && parse_precision_override(override_env, override_type)) {
+        policy.activation_type = override_type;
+        policy.reason = std::string(override_env) + " override";
+        return policy;
+    }
+    if (force_f32) {
+        policy.activation_type = GGML_TYPE_F32;
+        policy.reason = "F32 required by capture/IPC feature boundary";
+        return policy;
+    }
+
+#if defined(DFLASH27B_BACKEND_CUDA)
+    policy.activation_type = select_cuda_backend_precision_type_for_sm(policy.cuda_sm);
+    if (policy.activation_type == GGML_TYPE_BF16) {
+        policy.reason = "CUDA sm80+ BF16 activation path";
+    } else if (policy.activation_type == GGML_TYPE_F16 && policy.cuda_sm >= 70) {
+        policy.reason = "CUDA sm70-sm79 F16 activation path";
+    } else if (policy.activation_type == GGML_TYPE_F16) {
+        policy.reason = "CUDA sm60 GP100 F16 activation path";
+    } else {
+        policy.reason = "CUDA legacy F32 activation fallback";
     }
-    if (policy.device_name.empty()) {
-        policy.device_name = "unknown";
+#elif defined(DFLASH27B_BACKEND_HIP) || defined(GGML_USE_HIP)
+    policy.activation_type =
+        select_hip_activation_precision_type_for_arch(policy.runtime_arch);
+    if (policy.activation_type == GGML_TYPE_BF16) {
+        policy.reason = "HIP native BF16 activation path";
+    } else if (policy.activation_type == GGML_TYPE_F16) {
+        policy.reason = "HIP FP16 activation path";
+    } else {
+        policy.reason = "HIP legacy F32 activation fallback";
     }
+#else
+    policy.activation_type = GGML_TYPE_F32;
+    policy.reason = "portable F32 activation fallback";
+#endif
     return policy;
 }
 

diff --git a/server/src/common/backend_precision.h b/server/src/common/backend_precision.h
@@ -4,6 +4,7 @@
 #include "ggml-backend.h"
 
 #include <string>
+#include <vector>
 
 namespace dflash::common {
 
@@ -18,8 +19,29 @@ struct BackendPrecisionPolicy {
     std::string reason;
 };
 
+struct BackendActivationPolicy {
+    ggml_type   activation_type = GGML_TYPE_F32;
+    std::string backend_name;
+    std::string device_name;
+    std::string runtime_arch;
+    int         device_id       = -1;
+    int         cuda_sm         = 0;
+    std::string reason;
+};
+
 BackendPrecisionPolicy select_drafter_precision_policy(ggml_backend_t backend);
+BackendActivationPolicy select_activation_precision_policy(
+    ggml_backend_t backend,
+    bool force_f32 = false,
+    const char * override_env = nullptr);
+BackendActivationPolicy select_common_activation_precision_policy(
+    const std::vector<ggml_backend_t> & backends,
+    bool force_f32 = false,
+    const char * override_env = nullptr);
 
 const char * backend_precision_type_name(ggml_type type);
+ggml_type select_cuda_backend_precision_type_for_sm(int sm);
+ggml_type select_hip_activation_precision_type_for_arch(const std::string & arch);
+ggml_type combine_activation_precision_types(ggml_type a, ggml_type b);
 
 } // namespace dflash::common