Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ add_library(dflash_common STATIC
src/laguna/laguna_target_graph.cpp
src/laguna/laguna_daemon.cpp
src/laguna/laguna_backend.cpp
src/laguna/laguna_layer_split_adapter.cpp
src/common/backend_ipc.cpp
src/common/dflash_feature_ring.cpp
src/common/dflash_capture.cpp
Expand All @@ -246,6 +247,7 @@ add_library(dflash_common STATIC
src/common/dflash_draft_graph.cpp
src/common/dflash_spec_decode.cpp
src/common/layer_split_backend.cpp
src/common/layer_split_runtime.cpp
src/qwen35/graph_builders.cpp
src/qwen35moe/qwen35moe_ffn.cpp
src/qwen35moe/qwen35moe_backend.cpp
Expand Down
17 changes: 17 additions & 0 deletions server/src/common/backend_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "qwen35_backend.h"
#include "qwen35moe_backend.h"
#include "laguna_backend.h"
#include "laguna_layer_split_adapter.h"
#include "qwen3_backend.h"
#include "gemma4_backend.h"
#include "gemma4_layer_split_adapter.h"
Expand Down Expand Up @@ -124,8 +125,24 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args) {
return backend;

} else if (arch == "laguna") {
if (args.device.is_layer_split()) {
LagunaLayerSplitAdapterConfig cfg;
cfg.target_path = args.model_path;
cfg.device = args.device;
cfg.chunk = args.chunk;

auto adapter = std::make_unique<LagunaLayerSplitAdapter>(cfg);
auto backend = std::make_unique<LayerSplitBackend>(std::move(adapter));
if (!backend->init()) {
std::fprintf(stderr, "[backend_factory] LayerSplitBackend(laguna) init failed\n");
return nullptr;
}
return backend;
}

LagunaBackendArgs lcfg;
lcfg.target_path = args.model_path;
lcfg.device = args.device;
lcfg.max_ctx = args.device.max_ctx;
lcfg.chunk = args.chunk;
// kv_type defaults to Q8_0 in LagunaBackendArgs
Expand Down
189 changes: 167 additions & 22 deletions server/src/common/backend_precision.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>

namespace dflash::common {
namespace {
Expand Down Expand Up @@ -106,36 +108,103 @@ int device_props_for(int device,
#endif
}

// Returns true when `arch` begins with the NUL-terminated string `prefix`.
// An empty prefix matches every arch string, including the empty one.
bool arch_starts_with(const std::string & arch, const char * prefix) {
    // Searching backwards from index 0 can only ever match at index 0, so the
    // result is 0 exactly when `prefix` is a leading substring of `arch`.
    const std::string::size_type match_pos = arch.rfind(prefix, 0);
    return match_pos == 0;
}

bool parse_precision_override(const char * env_name, ggml_type & out) {
if (!env_name || !env_name[0]) return false;
const char * s = std::getenv(env_name);
if (!s || !s[0]) return false;
if (std::strcmp(s, "f32") == 0 || std::strcmp(s, "F32") == 0) {
out = GGML_TYPE_F32;
return true;
}
if (std::strcmp(s, "f16") == 0 || std::strcmp(s, "F16") == 0) {
out = GGML_TYPE_F16;
return true;
}
if (std::strcmp(s, "bf16") == 0 || std::strcmp(s, "BF16") == 0) {
out = GGML_TYPE_BF16;
return true;
}
std::fprintf(stderr, "[precision] ignoring unsupported %s=%s\n", env_name, s);
return false;
}

// Fills the identification fields shared by the precision policies for
// `backend`: backend name, device description, runtime arch string, device id
// and the value returned by device_props_for().
//
// The device id is parsed from the backend's logical device name; when that
// yields a negative id, current_device_id() is used instead. Empty backend or
// device names are replaced with "unknown" before returning.
void fill_policy_device_info(ggml_backend_t backend,
                             std::string & backend_name_out,
                             std::string & device_name_out,
                             std::string & runtime_arch_out,
                             int & device_id_out,
                             int & cuda_sm_out) {
    backend_name_out = backend_name(backend);
    const std::string device_logical_name = backend_device_logical_name(backend);
    device_name_out = backend_device_description(backend);

    const int parsed_id = parse_backend_device_id(device_logical_name);
    device_id_out = parsed_id < 0 ? current_device_id() : parsed_id;

    // Only hand device_props_for() a name slot when the backend did not
    // already provide a description.
    std::string * description_sink = nullptr;
    if (device_name_out.empty()) {
        description_sink = &device_name_out;
    }
    cuda_sm_out = device_props_for(device_id_out, description_sink, &runtime_arch_out);

    if (backend_name_out.empty()) {
        backend_name_out = "unknown";
    }
    if (device_name_out.empty()) {
        device_name_out = "unknown";
    }
}

} // namespace

// Returns the name of `type` exactly as reported by ggml_type_name().
const char * backend_precision_type_name(ggml_type type) {
return ggml_type_name(type);
}

// Maps a CUDA `sm` number to a precision type:
//   sm >= 80            -> BF16
//   sm >= 70 or sm == 60 -> F16
//   anything else       -> F32
ggml_type select_cuda_backend_precision_type_for_sm(int sm) {
    const bool wants_bf16 = sm >= 80;
    if (wants_bf16) {
        return GGML_TYPE_BF16;
    }
    // sm 60 is singled out; other values below 70 fall back to F32.
    const bool wants_f16 = sm == 60 || sm >= 70;
    return wants_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
}

// Maps a HIP runtime arch string (matched by prefix) to an activation type.
// BF16 prefixes are checked before F16 prefixes, so "gfx90a"/"gfx94"/"gfx95"
// win over the broader "gfx9" entry. An empty or unrecognised arch yields F32.
ggml_type select_hip_activation_precision_type_for_arch(const std::string & arch) {
    if (arch.empty()) {
        return GGML_TYPE_F32;
    }

    static constexpr const char * kBf16Prefixes[] = {
        "gfx90a", "gfx94", "gfx95", "gfx11", "gfx12",
    };
    static constexpr const char * kF16Prefixes[] = {
        "gfx9", "gfx10",
    };

    for (const char * prefix : kBf16Prefixes) {
        if (arch_starts_with(arch, prefix)) {
            return GGML_TYPE_BF16;
        }
    }
    for (const char * prefix : kF16Prefixes) {
        if (arch_starts_with(arch, prefix)) {
            return GGML_TYPE_F16;
        }
    }
    return GGML_TYPE_F32;
}

// Picks a single activation type for two inputs:
//   - F32 if either input is F32,
//   - otherwise F16 if either input is F16,
//   - otherwise BF16 only when both inputs are BF16,
//   - F32 for every other combination.
ggml_type combine_activation_precision_types(ggml_type a, ggml_type b) {
    const auto either_is = [a, b](ggml_type wanted) {
        return a == wanted || b == wanted;
    };

    if (either_is(GGML_TYPE_F32)) {
        return GGML_TYPE_F32;
    }
    if (either_is(GGML_TYPE_F16)) {
        return GGML_TYPE_F16;
    }
    const bool both_bf16 = (a == GGML_TYPE_BF16) && (b == GGML_TYPE_BF16);
    return both_bf16 ? GGML_TYPE_BF16 : GGML_TYPE_F32;
}

BackendPrecisionPolicy select_drafter_precision_policy(ggml_backend_t backend) {
BackendPrecisionPolicy policy;
policy.backend_name = backend_name(backend);
const std::string logical_name = backend_device_logical_name(backend);
policy.device_name = backend_device_description(backend);
policy.device_id = parse_backend_device_id(logical_name);
if (policy.device_id < 0) {
policy.device_id = current_device_id();
}
fill_policy_device_info(backend, policy.backend_name, policy.device_name,
policy.runtime_arch, policy.device_id,
policy.cuda_sm);

#if defined(DFLASH27B_BACKEND_CUDA)
policy.cuda_sm = device_props_for(
policy.device_id,
policy.device_name.empty() ? &policy.device_name : nullptr,
nullptr);
if (policy.cuda_sm >= 80) {
const ggml_type type = select_cuda_backend_precision_type_for_sm(policy.cuda_sm);
if (type == GGML_TYPE_BF16) {
policy.weight_type = GGML_TYPE_BF16;
policy.compute_type = GGML_TYPE_BF16;
policy.reason = "CUDA sm80+ BF16 tensor-core path";
} else if (policy.cuda_sm >= 70) {
} else if (type == GGML_TYPE_F16 && policy.cuda_sm >= 70) {
policy.weight_type = GGML_TYPE_F16;
policy.compute_type = GGML_TYPE_F16;
policy.reason = "CUDA sm70-sm79 F16 tensor-core path";
} else if (policy.cuda_sm == 60) {
} else if (type == GGML_TYPE_F16) {
policy.weight_type = GGML_TYPE_F16;
policy.compute_type = GGML_TYPE_F16;
policy.reason = "CUDA sm60 GP100 F16 path";
Expand All @@ -145,10 +214,6 @@ BackendPrecisionPolicy select_drafter_precision_policy(ggml_backend_t backend) {
policy.reason = "CUDA legacy compatibility fallback without useful F16/BF16 acceleration";
}
#elif defined(DFLASH27B_BACKEND_HIP) || defined(GGML_USE_HIP)
policy.cuda_sm = device_props_for(
policy.device_id,
policy.device_name.empty() ? &policy.device_name : nullptr,
&policy.runtime_arch);
policy.weight_type = GGML_TYPE_BF16;
policy.compute_type = GGML_TYPE_BF16;
policy.reason = "HIP ROCm/ggml BF16-compatible path";
Expand All @@ -158,12 +223,92 @@ BackendPrecisionPolicy select_drafter_precision_policy(ggml_backend_t backend) {
policy.reason = "portable non-GPU fallback";
#endif

if (policy.backend_name.empty()) {
policy.backend_name = "unknown";
return policy;
}

// Selects one activation policy for a set of backends.
//
// Each backend is evaluated with select_activation_precision_policy() and the
// resulting types are folded together with
// combine_activation_precision_types(). When every backend agrees, the first
// backend's policy is returned unchanged. Otherwise the combined type is
// returned and the identification fields are replaced with "mixed" / -1 / 0.
// An empty list is evaluated as a single null backend.
BackendActivationPolicy select_common_activation_precision_policy(
        const std::vector<ggml_backend_t> & backends,
        bool force_f32,
        const char * override_env) {
    if (backends.empty()) {
        return select_activation_precision_policy(nullptr, force_f32, override_env);
    }

    BackendActivationPolicy result =
        select_activation_precision_policy(backends.front(), force_f32, override_env);

    ggml_type agreed_type = result.activation_type;
    bool shards_differ = false;
    for (auto it = backends.begin() + 1; it != backends.end(); ++it) {
        const ggml_type shard_type =
            select_activation_precision_policy(*it, force_f32, override_env).activation_type;
        const ggml_type merged_type =
            combine_activation_precision_types(agreed_type, shard_type);
        // Flag a mix when the shard disagrees with the running type, or when
        // combining changes the running type.
        if (shard_type != agreed_type || merged_type != agreed_type) {
            shards_differ = true;
        }
        agreed_type = merged_type;
    }

    if (!shards_differ) {
        return result;
    }

    result.activation_type = agreed_type;
    result.backend_name    = "mixed";
    result.device_name     = "mixed";
    result.runtime_arch    = "mixed";
    result.device_id       = -1;
    result.cuda_sm         = 0;
    result.reason          = "common shard-compatible activation path";
    return result;
}

// Selects the activation precision for a single backend.
//
// Parameters:
//   backend      - backend whose device information is recorded in the policy.
//   force_f32    - when true the result is always F32 and `override_env` is
//                  not consulted.
//   override_env - optional name of an environment variable; when it holds a
//                  value accepted by parse_precision_override(), that type is
//                  used instead of the per-device choice.
//
// Returns a policy whose `reason` describes which path was taken. Without an
// override the type comes from the CUDA sm table, the HIP arch table, or F32
// on other builds, depending on the compile-time backend macros.
BackendActivationPolicy select_activation_precision_policy(
        ggml_backend_t backend,
        bool force_f32,
        const char * override_env) {
    BackendActivationPolicy policy;
    // Also replaces empty backend/device names with "unknown", so no further
    // defaulting of those fields is needed below.
    fill_policy_device_info(backend, policy.backend_name, policy.device_name,
                            policy.runtime_arch, policy.device_id,
                            policy.cuda_sm);

    ggml_type override_type = GGML_TYPE_F32;
    if (!force_f32 && parse_precision_override(override_env, override_type)) {
        policy.activation_type = override_type;
        policy.reason = std::string(override_env) + " override";
        return policy;
    }
    if (force_f32) {
        policy.activation_type = GGML_TYPE_F32;
        policy.reason = "F32 required by capture/IPC feature boundary";
        return policy;
    }

#if defined(DFLASH27B_BACKEND_CUDA)
    policy.activation_type = select_cuda_backend_precision_type_for_sm(policy.cuda_sm);
    if (policy.activation_type == GGML_TYPE_BF16) {
        policy.reason = "CUDA sm80+ BF16 activation path";
    } else if (policy.activation_type == GGML_TYPE_F16 && policy.cuda_sm >= 70) {
        policy.reason = "CUDA sm70-sm79 F16 activation path";
    } else if (policy.activation_type == GGML_TYPE_F16) {
        // F16 below sm70 is only selected for sm60.
        policy.reason = "CUDA sm60 GP100 F16 activation path";
    } else {
        policy.reason = "CUDA legacy F32 activation fallback";
    }
#elif defined(DFLASH27B_BACKEND_HIP) || defined(GGML_USE_HIP)
    policy.activation_type =
        select_hip_activation_precision_type_for_arch(policy.runtime_arch);
    if (policy.activation_type == GGML_TYPE_BF16) {
        policy.reason = "HIP native BF16 activation path";
    } else if (policy.activation_type == GGML_TYPE_F16) {
        policy.reason = "HIP FP16 activation path";
    } else {
        policy.reason = "HIP legacy F32 activation fallback";
    }
#else
    policy.activation_type = GGML_TYPE_F32;
    policy.reason = "portable F32 activation fallback";
#endif
    return policy;
}

Expand Down
22 changes: 22 additions & 0 deletions server/src/common/backend_precision.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "ggml-backend.h"

#include <string>
#include <vector>

namespace dflash::common {

Expand All @@ -18,8 +19,29 @@ struct BackendPrecisionPolicy {
std::string reason;
};

// Result of activation-precision selection: the chosen tensor type plus the
// device details and a human-readable reason recorded alongside it.
struct BackendActivationPolicy {
// Tensor type to use for activations; defaults to F32.
ggml_type activation_type = GGML_TYPE_F32;
// Backend name. The selectors store "unknown" when it would be empty and
// "mixed" when several backends produced differing types.
std::string backend_name;
// Device description, with the same "unknown" / "mixed" conventions.
std::string device_name;
// Runtime arch string; the HIP path matches "gfx..." prefixes against it.
// Set to "mixed" for a multi-backend result with differing types.
std::string runtime_arch;
// Device id, or -1 when unset or when the policy spans mixed backends.
int device_id = -1;
// Value the CUDA path compares against sm thresholds (60/70/80); 0 when unset
// or mixed. NOTE(review): presumably major*10+minor compute capability - confirm.
int cuda_sm = 0;
// Short explanation of which selection path produced this policy.
std::string reason;
};

// Selects the drafter's weight and compute precision for `backend`.
BackendPrecisionPolicy select_drafter_precision_policy(ggml_backend_t backend);
// Selects the activation precision for one backend. `force_f32` pins the
// result to F32 and skips the override; `override_env`, when non-null, names
// an environment variable that may hold "f32", "f16" or "bf16" (or the
// upper-case forms) to override the per-device choice.
BackendActivationPolicy select_activation_precision_policy(
ggml_backend_t backend,
bool force_f32 = false,
const char * override_env = nullptr);
// Selects one activation precision usable across all `backends` by combining
// the per-backend choices. An empty list is evaluated as a single null backend.
BackendActivationPolicy select_common_activation_precision_policy(
const std::vector<ggml_backend_t> & backends,
bool force_f32 = false,
const char * override_env = nullptr);

// Returns the ggml name of `type`.
const char * backend_precision_type_name(ggml_type type);
// CUDA mapping: sm >= 80 -> BF16; sm >= 70 or sm == 60 -> F16; otherwise F32.
ggml_type select_cuda_backend_precision_type_for_sm(int sm);
// HIP mapping by arch-string prefix; empty or unrecognised arch -> F32.
ggml_type select_hip_activation_precision_type_for_arch(const std::string & arch);
// Combines two activation types: F32 if either is F32, else F16 if either is
// F16, else BF16 when both are BF16, else F32.
ggml_type combine_activation_precision_types(ggml_type a, ggml_type b);

} // namespace dflash::common
Loading
Loading