diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index dc8899b46ef..a6334a0f3a9 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -218,6 +218,7 @@ option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Lib option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON) +option(GGML_HIP_FORCE_MMQ "ggml: use MMQ kernels instead of hipBLAS" OFF) option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF) option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF) option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f6725265504..e4e5ef1c7dd 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -429,7 +429,9 @@ extern "C" { GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) GGML_TYPE_Q1_0 = 41, - GGML_TYPE_COUNT = 42, + GGML_TYPE_Q4_0_ROCMFP4 = 42, // ROCmFP4 experimental UE4M3 scales + packed AMD FP4 blocks + GGML_TYPE_Q4_0_ROCMFP4_FAST = 43, // ROCmFP4 single-scale speed layout + GGML_TYPE_COUNT = 44, }; // precision @@ -473,6 +475,8 @@ extern "C" { GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_ROCMFP4 = 100, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_ROCMFP4_FAST = 101, // ROCmFP4 single-scale speed layout }; // available tensor operations: diff --git a/ggml/rocmfp4/README.md b/ggml/rocmfp4/README.md new file mode 100644 index 00000000000..64affe7eab3 --- /dev/null +++ b/ggml/rocmfp4/README.md @@ -0,0 +1,48 @@ +# ROCmFP4 + +ROCmFP4 adds two experimental 4-bit GGUF tensor layouts intended for compact +AMD-oriented inference formats: + +- `Q4_0_ROCMFP4`: 32 weights per block, packed 4-bit values, and two finite + 
unsigned E4M3 scale bytes, one scale per 16 weights. The block size is + 18 bytes, or 4.50 bits per weight. +- `Q4_0_ROCMFP4_FAST`: 32 weights per block, packed 4-bit values, and one + finite unsigned E4M3 scale byte for the full block. The block size is + 17 bytes, or 4.25 bits per weight. + +The 4-bit values use a small signed codebook with magnitudes up to `5.0` +times the decoded scale. Quantization searches all nonzero finite E4M3 scale +candidates and keeps the lowest-error assignment. Invalid scale bytes (`0x7f` and above) are +rejected during row validation so malformed tensors fail early. + +This directory contains the format-specific CPU reference helpers. This initial +change adds the GGUF tensor types, row validation, quantization, and +dequantization needed to create and inspect ROCmFP4 files from the normal +`llama-quantize` workflow. + +In short, this change provides CPU reference quantization, dequantization, row validation, and + `llama-quantize` support. + +The feature is additive. Existing tensor types, file types, and backend +dispatch paths are unchanged unless a tensor is explicitly stored as +`Q4_0_ROCMFP4` or `Q4_0_ROCMFP4_FAST`. + +Accelerated ROCm/HIP and Vulkan execution paths can be added in follow-up +changes after the tensor formats and CPU reference behavior are reviewed. + +Example quantization: + +```sh +./llama-quantize model-f16.gguf model-rocmfp4.gguf Q4_0_ROCMFP4 +./llama-quantize model-f16.gguf model-rocmfp4-fast.gguf Q4_0_ROCMFP4_FAST +``` + +Importance matrices use the existing quantize interface: + +```sh +./llama-quantize --imatrix imatrix.dat model-f16.gguf model-rocmfp4.gguf Q4_0_ROCMFP4 +``` + +Advanced mixed recipes can be expressed with the existing +`--tensor-type` and `--tensor-type-file` options instead of adding extra +public file-type presets. Note that the `Q4_0_ROCMFP4` file type is itself a mixed preset: by default it keeps attention V, attention output, FFN down, and selected FFN gate tensors at `Q5_K`/`Q6_K`, so the resulting files are larger than a pure 4.50 bits-per-weight encoding. 
diff --git a/ggml/rocmfp4/rocmfp4.c b/ggml/rocmfp4/rocmfp4.c new file mode 100644 index 00000000000..8258adc9282 --- /dev/null +++ b/ggml/rocmfp4/rocmfp4.c @@ -0,0 +1,463 @@ +#define GGML_COMMON_DECL_C +#include "../src/ggml-common.h" + +#include "rocmfp4.h" + +#include +#include +#include +#include + +// ROCmFP4 stores a signed integer FP4-like codebook at half-scale. It is +// E2M1-derived, but the largest magnitude is retuned from 12 to 10 after +// sampling Qwen3 dense tensors; this reduces outlier pull without changing the +// packed 4-bit layout or integer dot-product path. +static const int8_t rocmfp4_codebook[16] = { + 0, 1, 2, 3, 4, 6, 8, 10, + 0, -1, -2, -3, -4, -6, -8,-10, +}; + +static inline int8_t rocmfp4_decode(uint8_t q) { + return rocmfp4_codebook[q & 0x0f]; +} + +static inline float rocmfp4_ue4m3_to_fp32_half(uint8_t e) { + // Unsigned E4M3 scale. Return half the raw value because the codebook + // stores half-scale integer levels (e.g. 10 represents 5.0 raw-scale units). + if (e == 0 || e == 0x7f || e == 0xff) { + return 0.0f; + } + + const int exp = (e >> 3) & 0x0f; + const int man = e & 0x07; + if (exp == 0) { + return (float) man * (1.0f / 1024.0f); + } + + const uint32_t bits = ((uint32_t) exp + 119u) << 23 | ((uint32_t) man << 20); + float result; + memcpy(&result, &bits, sizeof(float)); + return result; +} + +static inline uint8_t rocmfp4_best_index_scaled(float x, float inv_scale_half) { + if (!isfinite(x)) { + return 0; + } + + // Exact nearest-neighbor thresholds for Codebook10: + // 0, +/-1, +/-2, +/-3, +/-4, +/-6, +/-8, +/-10 + // Ties intentionally choose the lower-magnitude code, matching the former + // linear scan because the positive codes and zero appear first. + const float a = fabsf(x * inv_scale_half); + if (a <= 0.5f) { + return 0; + } + + const bool neg = x < 0.0f; + if (a <= 1.5f) { + return neg ? 9 : 1; + } + if (a <= 2.5f) { + return neg ? 10 : 2; + } + if (a <= 3.5f) { + return neg ? 
11 : 3; + } + if (a <= 5.0f) { + return neg ? 12 : 4; + } + if (a <= 7.0f) { + return neg ? 13 : 5; + } + if (a <= 9.0f) { + return neg ? 14 : 6; + } + + return neg ? 15 : 7; +} + +static uint8_t rocmfp4_best_index(float x, float scale_half) { + if (!(scale_half > 0.0f) || !isfinite(scale_half)) { + return 0; + } + + return rocmfp4_best_index_scaled(x, 1.0f / scale_half); +} + +static inline bool rocmfp4_scale_is_valid(uint8_t e) { + // ROCmFP4 scale bytes are unsigned finite E4M3 values. 0x7f is NaN in the + // unsigned encoding and values with the sign bit set are not valid scales. + return e <= 0x7e; +} + +static float rocmfp4_block_mse_for_scale( + const float * x, int n, const float * quant_weights, float sigma2, int e, float best_err) { + const float scale_half = rocmfp4_ue4m3_to_fp32_half((uint8_t) e); + const float inv_scale_half = 1.0f / scale_half; + float err = 0.0f; + + for (int i = 0; i < n; ++i) { + const uint8_t q = rocmfp4_best_index_scaled(x[i], inv_scale_half); + const float y = (float) rocmfp4_decode(q) * scale_half; + const float d = x[i] - y; + + float w = 1.0f; + if (quant_weights) { + // Match llama.cpp's imatrix weighting style for Q4_0: calibration + // importance is scaled by row energy so large activations remain protected. + const float qw = quant_weights[i]; + w = isfinite(qw) && qw > 0.0f ? 
qw * sqrtf(sigma2 + x[i]*x[i]) : 0.0f; + } + + err += w*d*d; + if (err > best_err) { + return err; + } + } + + return err; +} + +static int rocmfp4_nearest_scale_ue4m3(float target_scale_half) { + if (!(target_scale_half > 0.0f) || !isfinite(target_scale_half)) { + return 1; + } + + int lo = 1; + int hi = 126; + while (lo < hi) { + const int mid = lo + (hi - lo) / 2; + if (rocmfp4_ue4m3_to_fp32_half((uint8_t) mid) < target_scale_half) { + lo = mid + 1; + } else { + hi = mid; + } + } + + if (lo == 1) { + return 1; + } + + const float hi_scale = rocmfp4_ue4m3_to_fp32_half((uint8_t) lo); + const float lo_scale = rocmfp4_ue4m3_to_fp32_half((uint8_t) (lo - 1)); + + // Match the former ascending nearest scan: exact midpoint ties keep the + // lower scale byte. + return (target_scale_half - lo_scale <= hi_scale - target_scale_half) ? lo - 1 : lo; +} + +static uint8_t rocmfp4_choose_scale_ue4m3_exhaustive( + const float * x, int n, const float * quant_weights, float sigma2, float max_abs) { + const int start_e = rocmfp4_nearest_scale_ue4m3(max_abs / 10.0f); + + int best_e = 0; + float best_err = FLT_MAX; + + for (int delta = 0; delta <= 125; ++delta) { + const int candidates[2] = { start_e - delta, start_e + delta }; + for (int ci = 0; ci < 2; ++ci) { + const int e = candidates[ci]; + if (e < 1 || e > 126 || (delta == 0 && ci == 1)) { + continue; + } + + const float err = rocmfp4_block_mse_for_scale(x, n, quant_weights, sigma2, e, best_err); + if (err < best_err || (err == best_err && e < best_e)) { + best_err = err; + best_e = e; + } + } + } + + return (uint8_t) best_e; +} + +static uint8_t rocmfp4_choose_scale_ue4m3(const float * x, int n, const float * quant_weights, float sigma2) { + float max_abs = 0.0f; + for (int i = 0; i < n; ++i) { + const float xi = x[i]; + max_abs = fmaxf(max_abs, fabsf(xi)); + } + + if (!(max_abs > 0.0f) || !isfinite(max_abs)) { + return 0; + } + + return rocmfp4_choose_scale_ue4m3_exhaustive(x, n, quant_weights, sigma2, max_abs); +} + +static 
void rocmfp4_quantize_row_q4_0_weighted( + const float * GGML_RESTRICT x, block_rocmfp4 * GGML_RESTRICT y, int64_t k, const float * GGML_RESTRICT quant_weights) { + assert(k % QK_ROCMFP4 == 0); + + float sum_x2 = 0.0f; + for (int64_t i = 0; i < k; ++i) { + sum_x2 += x[i]*x[i]; + } + const float sigma2 = sum_x2 / (float) k; + + const int64_t nb = k / QK_ROCMFP4; + for (int64_t ib = 0; ib < nb; ++ib) { + const float * xb = x + ib*QK_ROCMFP4; + const float * qw = quant_weights ? quant_weights + ib*QK_ROCMFP4 : NULL; + const uint8_t e0 = rocmfp4_choose_scale_ue4m3(xb, QK_ROCMFP4/2, qw, sigma2); + const uint8_t e1 = rocmfp4_choose_scale_ue4m3(xb + QK_ROCMFP4/2, QK_ROCMFP4/2, qw ? qw + QK_ROCMFP4/2 : NULL, sigma2); + const float scale_half0 = rocmfp4_ue4m3_to_fp32_half(e0); + const float scale_half1 = rocmfp4_ue4m3_to_fp32_half(e1); + const float inv_scale_half0 = scale_half0 > 0.0f ? 1.0f / scale_half0 : 0.0f; + const float inv_scale_half1 = scale_half1 > 0.0f ? 1.0f / scale_half1 : 0.0f; + + y[ib].e[0] = e0; + y[ib].e[1] = e1; + memset(y[ib].qs, 0, sizeof(y[ib].qs)); + + for (int j = 0; j < QK_ROCMFP4/2; ++j) { + const uint8_t q0 = rocmfp4_best_index_scaled(xb[j], inv_scale_half0); + const uint8_t q1 = rocmfp4_best_index_scaled(xb[j + QK_ROCMFP4/2], inv_scale_half1); + y[ib].qs[j] = q0 | (q1 << 4); + } + } +} + +static void rocmfp4_quantize_row_q4_0_fast_weighted( + const float * GGML_RESTRICT x, block_rocmfp4_fast * GGML_RESTRICT y, int64_t k, const float * GGML_RESTRICT quant_weights) { + assert(k % QK_ROCMFP4 == 0); + + float sum_x2 = 0.0f; + for (int64_t i = 0; i < k; ++i) { + sum_x2 += x[i]*x[i]; + } + const float sigma2 = sum_x2 / (float) k; + + const int64_t nb = k / QK_ROCMFP4; + for (int64_t ib = 0; ib < nb; ++ib) { + const float * xb = x + ib*QK_ROCMFP4; + const float * qw = quant_weights ? 
quant_weights + ib*QK_ROCMFP4 : NULL; + const uint8_t e = rocmfp4_choose_scale_ue4m3(xb, QK_ROCMFP4, qw, sigma2); + const float scale_half = rocmfp4_ue4m3_to_fp32_half(e); + const float inv_scale_half = scale_half > 0.0f ? 1.0f / scale_half : 0.0f; + + y[ib].e = e; + memset(y[ib].qs, 0, sizeof(y[ib].qs)); + + for (int j = 0; j < QK_ROCMFP4/2; ++j) { + const uint8_t q0 = rocmfp4_best_index_scaled(xb[j], inv_scale_half); + const uint8_t q1 = rocmfp4_best_index_scaled(xb[j + QK_ROCMFP4/2], inv_scale_half); + y[ib].qs[j] = q0 | (q1 << 4); + } + } +} + +void rocmfp4_quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_rocmfp4 * GGML_RESTRICT y, int64_t k) { + assert(k % QK_ROCMFP4 == 0); + + const int64_t nb = k / QK_ROCMFP4; + for (int64_t ib = 0; ib < nb; ++ib) { + const float * xb = x + ib*QK_ROCMFP4; + const uint8_t e0 = rocmfp4_choose_scale_ue4m3(xb, QK_ROCMFP4/2, NULL, 0.0f); + const uint8_t e1 = rocmfp4_choose_scale_ue4m3(xb + QK_ROCMFP4/2, QK_ROCMFP4/2, NULL, 0.0f); + const float scale_half0 = rocmfp4_ue4m3_to_fp32_half(e0); + const float scale_half1 = rocmfp4_ue4m3_to_fp32_half(e1); + const float inv_scale_half0 = scale_half0 > 0.0f ? 1.0f / scale_half0 : 0.0f; + const float inv_scale_half1 = scale_half1 > 0.0f ? 
1.0f / scale_half1 : 0.0f; + + y[ib].e[0] = e0; + y[ib].e[1] = e1; + memset(y[ib].qs, 0, sizeof(y[ib].qs)); + + for (int j = 0; j < QK_ROCMFP4/2; ++j) { + const uint8_t q0 = rocmfp4_best_index_scaled(xb[j], inv_scale_half0); + const uint8_t q1 = rocmfp4_best_index_scaled(xb[j + QK_ROCMFP4/2], inv_scale_half1); + y[ib].qs[j] = q0 | (q1 << 4); + } + } +} + +void rocmfp4_quantize_row_q4_0_fast_ref(const float * GGML_RESTRICT x, block_rocmfp4_fast * GGML_RESTRICT y, int64_t k) { + assert(k % QK_ROCMFP4 == 0); + + const int64_t nb = k / QK_ROCMFP4; + for (int64_t ib = 0; ib < nb; ++ib) { + const float * xb = x + ib*QK_ROCMFP4; + const uint8_t e = rocmfp4_choose_scale_ue4m3(xb, QK_ROCMFP4, NULL, 0.0f); + const float scale_half = rocmfp4_ue4m3_to_fp32_half(e); + const float inv_scale_half = scale_half > 0.0f ? 1.0f / scale_half : 0.0f; + + y[ib].e = e; + memset(y[ib].qs, 0, sizeof(y[ib].qs)); + + for (int j = 0; j < QK_ROCMFP4/2; ++j) { + const uint8_t q0 = rocmfp4_best_index_scaled(xb[j], inv_scale_half); + const uint8_t q1 = rocmfp4_best_index_scaled(xb[j + QK_ROCMFP4/2], inv_scale_half); + y[ib].qs[j] = q0 | (q1 << 4); + } + } +} + +void rocmfp4_dequantize_row_q4_0(const block_rocmfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_ROCMFP4 == 0); + + const int64_t nb = k / QK_ROCMFP4; + for (int64_t ib = 0; ib < nb; ++ib) { + const float d0 = rocmfp4_ue4m3_to_fp32_half(x[ib].e[0]); + const float d1 = rocmfp4_ue4m3_to_fp32_half(x[ib].e[1]); + + for (int j = 0; j < QK_ROCMFP4/2; ++j) { + y[ib*QK_ROCMFP4 + j] = (float) rocmfp4_decode(x[ib].qs[j] & 0x0f) * d0; + y[ib*QK_ROCMFP4 + j + QK_ROCMFP4/2] = (float) rocmfp4_decode(x[ib].qs[j] >> 4) * d1; + } + } +} + +void rocmfp4_dequantize_row_q4_0_fast(const block_rocmfp4_fast * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_ROCMFP4 == 0); + + const int64_t nb = k / QK_ROCMFP4; + for (int64_t ib = 0; ib < nb; ++ib) { + const float d = rocmfp4_ue4m3_to_fp32_half(x[ib].e); + + for 
(int j = 0; j < QK_ROCMFP4/2; ++j) { + y[ib*QK_ROCMFP4 + j] = (float) rocmfp4_decode(x[ib].qs[j] & 0x0f) * d; + y[ib*QK_ROCMFP4 + j + QK_ROCMFP4/2] = (float) rocmfp4_decode(x[ib].qs[j] >> 4) * d; + } + } +} + +void rocmfp4_quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + rocmfp4_quantize_row_q4_0_ref(x, (block_rocmfp4 *) y, k); +} + +void rocmfp4_quantize_row_q4_0_fast(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + rocmfp4_quantize_row_q4_0_fast_ref(x, (block_rocmfp4_fast *) y, k); +} + +size_t rocmfp4_quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + const size_t row_size = ggml_row_size(GGML_TYPE_Q4_0_ROCMFP4, n_per_row); + + if (!imatrix) { + rocmfp4_quantize_row_q4_0_ref(src, (block_rocmfp4 *) dst, nrows*n_per_row); + return nrows * row_size; + } + + char * qrow = (char *) dst; + for (int64_t row = 0; row < nrows; ++row) { + rocmfp4_quantize_row_q4_0_weighted(src, (block_rocmfp4 *) qrow, n_per_row, imatrix); + src += n_per_row; + qrow += row_size; + } + + return nrows * row_size; +} + +size_t rocmfp4_quantize_q4_0_fast(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { + const size_t row_size = ggml_row_size(GGML_TYPE_Q4_0_ROCMFP4_FAST, n_per_row); + + if (!imatrix) { + rocmfp4_quantize_row_q4_0_fast_ref(src, (block_rocmfp4_fast *) dst, nrows*n_per_row); + return nrows * row_size; + } + + char * qrow = (char *) dst; + for (int64_t row = 0; row < nrows; ++row) { + rocmfp4_quantize_row_q4_0_fast_weighted(src, (block_rocmfp4_fast *) qrow, n_per_row, imatrix); + src += n_per_row; + qrow += row_size; + } + + return nrows * row_size; +} + +bool rocmfp4_validate_row_data(const void * data, size_t nbytes) { + if (nbytes % sizeof(block_rocmfp4) != 0) { + return false; + } + + const block_rocmfp4 * blocks = (const block_rocmfp4 *) data; + const size_t nblocks = 
nbytes / sizeof(block_rocmfp4); + for (size_t i = 0; i < nblocks; ++i) { + if (!rocmfp4_scale_is_valid(blocks[i].e[0]) || !rocmfp4_scale_is_valid(blocks[i].e[1])) { + return false; + } + } + + return true; +} + +bool rocmfp4_validate_row_data_fast(const void * data, size_t nbytes) { + if (nbytes % sizeof(block_rocmfp4_fast) != 0) { + return false; + } + + const block_rocmfp4_fast * blocks = (const block_rocmfp4_fast *) data; + const size_t nblocks = nbytes / sizeof(block_rocmfp4_fast); + for (size_t i = 0; i < nblocks; ++i) { + if (!rocmfp4_scale_is_valid(blocks[i].e)) { + return false; + } + } + + return true; +} + +void rocmfp4_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + GGML_UNUSED(bs); + GGML_UNUSED(bx); + GGML_UNUSED(by); + assert(nrc == 1); + GGML_UNUSED(nrc); + assert(n % QK_ROCMFP4 == 0); + assert(QK_ROCMFP4 == QK8_0); + + const block_rocmfp4 * GGML_RESTRICT x = (const block_rocmfp4 *) vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *) vy; + + const int nb = n / QK_ROCMFP4; + float sumf = 0.0f; + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = rocmfp4_ue4m3_to_fp32_half(x[ib].e[0]) * ggml_fp16_to_fp32(y[ib].d); + const float d1 = rocmfp4_ue4m3_to_fp32_half(x[ib].e[1]) * ggml_fp16_to_fp32(y[ib].d); + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < QK_ROCMFP4/2; ++j) { + sumi0 += rocmfp4_decode(x[ib].qs[j] & 0x0f) * y[ib].qs[j]; + sumi1 += rocmfp4_decode(x[ib].qs[j] >> 4) * y[ib].qs[j + QK_ROCMFP4/2]; + } + + sumf += d0 * (float) sumi0 + d1 * (float) sumi1; + } + + *s = sumf; +} + +void rocmfp4_vec_dot_q4_0_fast_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + GGML_UNUSED(bs); + GGML_UNUSED(bx); + GGML_UNUSED(by); + assert(nrc == 1); + GGML_UNUSED(nrc); + assert(n % QK_ROCMFP4 == 0); + assert(QK_ROCMFP4 == QK8_0); + + const 
block_rocmfp4_fast * GGML_RESTRICT x = (const block_rocmfp4_fast *) vx; + const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *) vy; + + const int nb = n / QK_ROCMFP4; + float sumf = 0.0f; + + for (int ib = 0; ib < nb; ++ib) { + const float d = rocmfp4_ue4m3_to_fp32_half(x[ib].e) * ggml_fp16_to_fp32(y[ib].d); + int sumi = 0; + + for (int j = 0; j < QK_ROCMFP4/2; ++j) { + sumi += rocmfp4_decode(x[ib].qs[j] & 0x0f) * y[ib].qs[j]; + sumi += rocmfp4_decode(x[ib].qs[j] >> 4) * y[ib].qs[j + QK_ROCMFP4/2]; + } + + sumf += d * (float) sumi; + } + + *s = sumf; +} diff --git a/ggml/rocmfp4/rocmfp4.h b/ggml/rocmfp4/rocmfp4.h new file mode 100644 index 00000000000..9756f6ad4c3 --- /dev/null +++ b/ggml/rocmfp4/rocmfp4.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define QK_ROCMFP4 32 +#define QR_ROCMFP4 2 +#define QI_ROCMFP4 (QK_ROCMFP4 / (4 * QR_ROCMFP4)) +#define QS_ROCMFP4 32 + +// AMD-tuned compact layout: 16 bytes of packed E2M1-derived 4-bit codes, then +// one unsigned E4M3 scale byte per 16-weight half block. +typedef struct { + uint8_t qs[QK_ROCMFP4/2]; + uint8_t e[2]; +} block_rocmfp4; + +// Speed-focused layout: same 32 packed ROCmFP4 nibbles, but one UE4M3 scale +// for the whole block. This is a separate GGUF type so fast 4.25 BPW artifacts +// never alias the safer dual-scale format above. 
+typedef struct { + uint8_t qs[QK_ROCMFP4/2]; + uint8_t e; +} block_rocmfp4_fast; + +#if defined(__cplusplus) +static_assert(sizeof(block_rocmfp4) == QK_ROCMFP4/2 + 2*sizeof(uint8_t), "wrong rocmfp4 block size/padding"); +static_assert(sizeof(block_rocmfp4_fast) == QK_ROCMFP4/2 + sizeof(uint8_t), "wrong rocmfp4 fast block size/padding"); +#else +_Static_assert(sizeof(block_rocmfp4) == QK_ROCMFP4/2 + 2*sizeof(uint8_t), "wrong rocmfp4 block size/padding"); +_Static_assert(sizeof(block_rocmfp4_fast) == QK_ROCMFP4/2 + sizeof(uint8_t), "wrong rocmfp4 fast block size/padding"); +#endif + +GGML_API void rocmfp4_quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_rocmfp4 * GGML_RESTRICT y, int64_t k); +GGML_API void rocmfp4_dequantize_row_q4_0(const block_rocmfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void rocmfp4_quantize_row_q4_0_fast_ref(const float * GGML_RESTRICT x, block_rocmfp4_fast * GGML_RESTRICT y, int64_t k); +GGML_API void rocmfp4_dequantize_row_q4_0_fast(const block_rocmfp4_fast * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + +GGML_API void rocmfp4_quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +GGML_API size_t rocmfp4_quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API void rocmfp4_quantize_row_q4_0_fast(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +GGML_API size_t rocmfp4_quantize_q4_0_fast(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API bool rocmfp4_validate_row_data(const void * data, size_t nbytes); +GGML_API bool rocmfp4_validate_row_data_fast(const void * data, size_t nbytes); + +GGML_API void rocmfp4_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +GGML_API void 
rocmfp4_vec_dot_q4_0_fast_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index c26c3f1470d..3732d9bcaef 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -206,6 +206,8 @@ add_library(ggml-base ggml-threading.h ggml-quants.c ggml-quants.h + ../rocmfp4/rocmfp4.c + ../rocmfp4/rocmfp4.h gguf.cpp) set_target_properties(ggml-base PROPERTIES diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index f05683b44cd..0e38f8aba15 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -1117,6 +1117,13 @@ GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16) 0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12, GGML_TABLE_END() +// ROCmFP4 uses an E2M1-derived value set with the largest level retuned from +// 12 to 10. Keeping this table separate from MXFP4 lets ROCmFP4 evolve +// without changing stock MXFP4/NVFP4 behavior. 
+GGML_TABLE_BEGIN(int8_t, kvalues_rocmfp4, 16) + 0, 1, 2, 3, 4, 6, 8, 10, 0, -1, -2, -3, -4, -6, -8, -10, +GGML_TABLE_END() + #define NGRID_IQ1S 2048 #define IQ1S_DELTA 0.125f #define IQ1M_DELTA 0.125f diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index cd5c61a8187..daa0fac4c67 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -14,6 +14,7 @@ #include "ops.h" #include "ggml.h" #include "common.h" +#include "../../rocmfp4/rocmfp4.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -237,6 +238,18 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, #endif }, + [GGML_TYPE_Q4_0_ROCMFP4] = { + .from_float = rocmfp4_quantize_row_q4_0, + .vec_dot = rocmfp4_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_Q4_0_ROCMFP4_FAST] = { + .from_float = rocmfp4_quantize_row_q4_0_fast, + .vec_dot = rocmfp4_vec_dot_q4_0_fast_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_Q4_1] = { .from_float = quantize_row_q4_1, .vec_dot = ggml_vec_dot_q4_1_q8_1, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index dc73696ad9f..bef0301fd61 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -1253,6 +1253,8 @@ void ggml_compute_forward_acc( case GGML_TYPE_Q8_1: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: + case GGML_TYPE_Q4_0_ROCMFP4: + case GGML_TYPE_Q4_0_ROCMFP4_FAST: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: @@ -4699,6 +4701,8 @@ void ggml_compute_forward_set( case GGML_TYPE_Q8_1: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: + case GGML_TYPE_Q4_0_ROCMFP4: + case GGML_TYPE_Q4_0_ROCMFP4_FAST: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: @@ -4909,7 +4913,7 @@ static void ggml_compute_forward_get_rows_f32( void ggml_compute_forward_get_rows( const ggml_compute_params * params, - ggml_tensor * dst) { + ggml_tensor * dst) { const 
ggml_tensor * src0 = dst->src[0]; @@ -4923,6 +4927,8 @@ void ggml_compute_forward_get_rows( case GGML_TYPE_Q8_1: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: + case GGML_TYPE_Q4_0_ROCMFP4: + case GGML_TYPE_Q4_0_ROCMFP4_FAST: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: @@ -5649,6 +5655,8 @@ void ggml_compute_forward_clamp( case GGML_TYPE_Q8_1: case GGML_TYPE_MXFP4: case GGML_TYPE_NVFP4: + case GGML_TYPE_Q4_0_ROCMFP4: + case GGML_TYPE_Q4_0_ROCMFP4_FAST: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 15d231f70c0..79b3e718883 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5,6 +5,7 @@ #include "ggml-impl.h" #include "ggml-cpu/ggml-cpu-impl.h" #include "ggml-cpu.h" +#include "../rocmfp4/rocmfp4.h" #include #include @@ -5465,6 +5466,20 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb); } break; + case GGML_TYPE_Q4_0_ROCMFP4: + { + if (!rocmfp4_validate_row_data(data, nbytes)) { + fprintf(stderr, "%s: invalid ROCmFP4 row data\n", __func__); + return false; + } + } break; + case GGML_TYPE_Q4_0_ROCMFP4_FAST: + { + if (!rocmfp4_validate_row_data_fast(data, nbytes)) { + fprintf(stderr, "%s: invalid ROCmFP4 fast row data\n", __func__); + return false; + } + } break; case GGML_TYPE_Q4_1: { VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8815c67d8bc..d728fb69c34 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -9,6 +9,7 @@ // FIXME: required here for quantization functions #include "ggml-quants.h" +#include "../rocmfp4/rocmfp4.h" #ifdef GGML_USE_CPU_HBM #include @@ -682,6 +683,22 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q4_0, .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref, }, + [GGML_TYPE_Q4_0_ROCMFP4] = { + 
.type_name = "q4_0_rocmfp4", + .blck_size = QK_ROCMFP4, + .type_size = sizeof(block_rocmfp4), + .is_quantized = true, + .to_float = (ggml_to_float_t) rocmfp4_dequantize_row_q4_0, + .from_float_ref = (ggml_from_float_t) rocmfp4_quantize_row_q4_0_ref, + }, + [GGML_TYPE_Q4_0_ROCMFP4_FAST] = { + .type_name = "q4_0_rocmfp4_fast", + .blck_size = QK_ROCMFP4, + .type_size = sizeof(block_rocmfp4_fast), + .is_quantized = true, + .to_float = (ggml_to_float_t) rocmfp4_dequantize_row_q4_0_fast, + .from_float_ref = (ggml_from_float_t) rocmfp4_quantize_row_q4_0_fast_ref, + }, [GGML_TYPE_Q4_1] = { .type_name = "q4_1", .blck_size = QK4_1, @@ -1406,6 +1423,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break; case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_0_ROCMFP4: wtype = GGML_TYPE_Q4_0_ROCMFP4; break; + case GGML_FTYPE_MOSTLY_Q4_0_ROCMFP4_FAST: wtype = GGML_TYPE_Q4_0_ROCMFP4_FAST; break; case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; case GGML_FTYPE_MOSTLY_Q1_0: wtype = GGML_TYPE_Q1_0; break; case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; @@ -7692,6 +7711,12 @@ size_t ggml_quantize_chunk( switch (type) { case GGML_TYPE_Q1_0: result = quantize_q1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0: result = quantize_q4_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_ROCMFP4: + result = rocmfp4_quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); + break; + case GGML_TYPE_Q4_0_ROCMFP4_FAST: + result = rocmfp4_quantize_q4_0_fast(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); + break; case GGML_TYPE_Q4_1: result = quantize_q4_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, 
imatrix); break; case GGML_TYPE_Q5_0: result = quantize_q5_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q5_1: result = quantize_q5_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/include/llama.h b/include/llama.h index a79a491c592..8672c38eda6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -155,6 +155,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4 = 41, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4_FAST = 42, // ROCmFP4 single-scale speed layout LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index c645d0785ab..d13d3e880eb 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -38,6 +38,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; case LLAMA_FTYPE_MOSTLY_Q1_0: return "Q1_0"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4: return "Q4_0_ROCMFP4"; + case LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4_FAST: return "Q4_0_ROCMFP4_FAST"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; @@ -738,6 +740,8 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; + case GGML_TYPE_Q4_0_ROCMFP4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4; break; + case GGML_TYPE_Q4_0_ROCMFP4_FAST: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4_FAST; break; case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; 
case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 43e05c3d56f..99010342dd2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -384,6 +384,8 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso case GGML_TYPE_Q3_K: case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break; + case GGML_TYPE_Q4_0_ROCMFP4: + case GGML_TYPE_Q4_0_ROCMFP4_FAST: return_type = GGML_TYPE_Q4_0; break; case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break; case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break; case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break; @@ -443,7 +445,10 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type const int64_t nx = tensor->ne[0]; const int64_t qk_k = ggml_blck_size(new_type); - if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4_FAST) { + new_type = GGML_TYPE_Q4_0_ROCMFP4_FAST; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) { new_type = GGML_TYPE_Q8_0; } else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) { @@ -509,6 +514,12 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type } } } else if (category_is_attn_v(category)) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4) { + // Keep attention V above 4-bit FP4 in the default mixed preset, + // matching the existing policy of spending extra bits on selected + // attention tensors for low-bit quantization modes. + new_type = GGML_TYPE_Q5_K; + } if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = qs.model.hparams.n_gqa() >= 4 ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } @@ -568,7 +579,12 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type } else if (category == tensor_category::FFN_DOWN) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4) { + // Down projections carry residual-stream detail. Spend extra bits + // here so the experimental FP4 tensors do not dominate perplexity. + new_type = (n_layer > 0 && i_layer >= (2 * n_layer) / 3) ? GGML_TYPE_Q5_K : GGML_TYPE_Q6_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } @@ -611,6 +627,11 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type } ++qs.i_ffn_down; } else if (category == tensor_category::ATTENTION_OUTPUT) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4) { + // Output projections are another high-leverage location; Q5_K is + // a modest size increase that helps preserve chat coherence. + new_type = GGML_TYPE_Q5_K; + } if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || @@ -640,7 +661,13 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type else if (category == tensor_category::FFN_GATE) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4 && use_more_bits(i_layer, n_layer)) { + // Gate projections are the cheapest place to buy back coherence in + // this preset. 
Keep selected layers at Q5_K, but leave FFN-up in + // ROCmFP4 to avoid sliding back to a bulky/slower mostly-Q5 mix. + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_gate; @@ -648,7 +675,11 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type else if (category == tensor_category::FFN_UP) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4) { + GGML_UNUSED(i_layer); + GGML_UNUSED(n_layer); + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_up; @@ -792,6 +823,8 @@ static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type ds ggml_type llama_ftype_get_default_type(llama_ftype ftype) { switch (ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0; + case LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4: return GGML_TYPE_Q4_0_ROCMFP4; + case LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4_FAST: return GGML_TYPE_Q4_0_ROCMFP4_FAST; case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1; case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0; case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 7292bda6f4e..8da9920159b 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -34,6 +34,8 @@ struct quant_option { static const std::vector<quant_option> QUANT_OPTIONS = { { "Q1_0", LLAMA_FTYPE_MOSTLY_Q1_0, " 1.125 bpw quantization", }, { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_0_ROCMFP4", LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4, " 4.50 bpw ROCmFP4 dual-scale layout", }, + { "Q4_0_ROCMFP4_FAST",
LLAMA_FTYPE_MOSTLY_Q4_0_ROCMFP4_FAST, " 4.25 bpw ROCmFP4 single-scale layout", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, { "MXFP4_MOE",LLAMA_FTYPE_MOSTLY_MXFP4_MOE," MXFP4 MoE", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },