ggml-org · am17an · Jun 1, 2026 · Jun 1, 2026 · Jun 2, 2026 · Jun 3, 2026
@@ -47,6 +47,52 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
+#
+# inline chat templates
+#
+# Built-in Jinja chat templates for model architectures whose GGUF files ship
+# without an embedded chat_template. The header is generated from the manifest
+# common/chat-inline-templates.h.in into the build directory and included by
+# common/chat.cpp (a quoted include resolves to the generated copy because no
+# chat-inline-templates.h exists next to chat.cpp).
+#
+# Two paths produce the header:
+#   - Python3 available: a build-time rule runs the generator script.
+#   - Python3 missing:   an empty fallback header is written at configure time.
+
+set(CHAT_INLINE_TEMPLATES_MANIFEST ${CMAKE_CURRENT_SOURCE_DIR}/chat-inline-templates.h.in)
+set(CHAT_INLINE_TEMPLATES_GEN_DIR  ${CMAKE_CURRENT_BINARY_DIR}/generated)
+set(CHAT_INLINE_TEMPLATES_HEADER   ${CHAT_INLINE_TEMPLATES_GEN_DIR}/chat-inline-templates.h)
+set(CHAT_INLINE_TEMPLATES_SCRIPT   ${PROJECT_SOURCE_DIR}/scripts/gen-chat-inline-templates.py)
+
+# every template the manifest may reference, so the header regenerates on edits;
+# CONFIGURE_DEPENDS re-checks the glob at build time so that added or removed
+# .jinja files update the dependency list without a manual re-configure
+file(GLOB CHAT_INLINE_TEMPLATE_FILES CONFIGURE_DEPENDS ${PROJECT_SOURCE_DIR}/models/templates/*.jinja)
+
+find_package(Python3 COMPONENTS Interpreter QUIET)
+
+if (Python3_FOUND)
+    # An earlier configure without Python3 may have left the fallback header in
+    # place. It is newer than every dependency, so a timestamp-based build tool
+    # may not consider the rule below out of date. Drop it so the real header
+    # gets generated.
+    if (EXISTS "${CHAT_INLINE_TEMPLATES_HEADER}")
+        file(STRINGS "${CHAT_INLINE_TEMPLATES_HEADER}" CHAT_INLINE_TEMPLATES_FALLBACK_MARKER
+             REGEX "AUTO-GENERATED fallback" LIMIT_COUNT 1)
+        if (NOT "${CHAT_INLINE_TEMPLATES_FALLBACK_MARKER}" STREQUAL "")
+            file(REMOVE "${CHAT_INLINE_TEMPLATES_HEADER}")
+        endif()
+    endif()
+
+    add_custom_command(
+        OUTPUT ${CHAT_INLINE_TEMPLATES_HEADER}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${CHAT_INLINE_TEMPLATES_GEN_DIR}
+        COMMAND ${Python3_EXECUTABLE} ${CHAT_INLINE_TEMPLATES_SCRIPT}
+                --manifest  ${CHAT_INLINE_TEMPLATES_MANIFEST}
+                --repo-root ${PROJECT_SOURCE_DIR}
+                --output    ${CHAT_INLINE_TEMPLATES_HEADER}
+        DEPENDS ${CHAT_INLINE_TEMPLATES_MANIFEST}
+                ${CHAT_INLINE_TEMPLATES_SCRIPT}
+                ${CHAT_INLINE_TEMPLATE_FILES}
+        COMMENT "Generating chat-inline-templates.h"
+        VERBATIM
+    )
+else()
+    # No interpreter: write a header that declares the same symbols with a
+    # count of zero, so chat.cpp still compiles and simply finds no template.
+    message(WARNING "Python3 not found: inline chat templates disabled (generating empty chat-inline-templates.h)")
+    file(MAKE_DIRECTORY ${CHAT_INLINE_TEMPLATES_GEN_DIR})
+    file(WRITE ${CHAT_INLINE_TEMPLATES_HEADER}
+"// AUTO-GENERATED fallback (Python3 not found) - no inline templates available.
+#pragma once
+#include <cstddef>
+struct common_chat_inline_template_entry { const char * arch; const char * tmpl; };
+static const common_chat_inline_template_entry COMMON_CHAT_INLINE_TEMPLATES[] = { { nullptr, nullptr } };
+static const size_t COMMON_CHAT_INLINE_TEMPLATES_COUNT = 0;
+")
+endif()
+
 #
 # llama-common
 #
@@ -65,6 +111,7 @@ add_library(${TARGET}
     chat-peg-parser.h
     chat.cpp
     chat.h
+    ${CHAT_INLINE_TEMPLATES_HEADER}
     common.cpp
     common.h
     console.cpp
@@ -125,6 +172,7 @@ set_target_properties(${TARGET} PROPERTIES
 )
 
 target_include_directories(${TARGET} PUBLIC . ../vendor)
+target_include_directories(${TARGET} PRIVATE ${CHAT_INLINE_TEMPLATES_GEN_DIR})
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 
 if (BUILD_SHARED_LIBS)

@@ -0,0 +1,22 @@
+// Manifest of built-in ("inline") Jinja chat templates.
+//
+// Some models ship GGUF files without an embedded chat_template. For those, we
+// fall back to a built-in template selected by the model architecture name.
+//
+// This file is the MANIFEST: it is empty by default and only lists, as
+// placeholder comments, the architectures that need an inline template. At
+// build time scripts/gen-chat-inline-templates.py reads the placeholders below,
+// embeds each referenced template file as a C string, and writes the filled-in
+// chat-inline-templates.h into the build directory (which common/chat.cpp then
+// includes). This file itself is never compiled.
+//
+// To register a new inline template, add a line of the form:
+//
+//   /* <arch-name>: <path/to/template.jinja> */
+//
+// where <arch-name> exactly matches llm_arch_name() in src/llama-arch.cpp and
+// the path is relative to the repository root.
+//
+// Registered inline templates:
+
+/* deepseek-v4-flash: models/templates/deepseek-ai-DeepSeek-V4.jinja */
@@ -1,5 +1,6 @@
 #include "chat.h"
 
+#include "chat-inline-templates.h"
 #include "chat-auto-parser-helpers.h"
 #include "chat-auto-parser.h"
 #include "chat-peg-parser.h"
@@ -652,6 +653,15 @@ std::string common_chat_templates_source(const struct common_chat_templates * tm
     return tmpls->template_default->source();
 }
 
+std::string common_chat_template_inline(const std::string & arch) {
+    // Linear scan over the table from chat-inline-templates.h; the first entry
+    // whose architecture name equals `arch` supplies the template text.
+    for (size_t idx = 0; idx != COMMON_CHAT_INLINE_TEMPLATES_COUNT; idx++) {
+        const auto & entry = COMMON_CHAT_INLINE_TEMPLATES[idx];
+        if (arch != entry.arch) {
+            continue;
+        }
+        return entry.tmpl;
+    }
+    // nothing registered for this architecture
+    return {};
+}
+
 common_chat_templates_ptr common_chat_templates_init(const struct llama_model * model,
                                                      const std::string &        chat_template_override,
                                                      const std::string &        bos_token_override,
@@ -675,6 +685,22 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
     } else {
         default_template_src = chat_template_override;
     }
+
+    // No template override and the model has no embedded chat template: fall back
+    // to a built-in inline template selected by the model architecture, if any.
+    if (chat_template_override.empty() && default_template_src.empty() && template_tool_use_src.empty() && model != nullptr) {
+        char arch_buf[128] = { 0 };
+        if (llama_model_meta_val_str(model, "general.architecture", arch_buf, sizeof(arch_buf)) > 0) {
+            std::string inline_template = common_chat_template_inline(arch_buf);
+            if (!inline_template.empty()) {
+                LOG_INF("%s: no chat template found in model, using built-in inline template for arch '%s'\n",
+                        __func__, arch_buf);
+                default_template_src  = inline_template;
+                has_explicit_template = true;
+            }
+        }
+    }
+
     if (default_template_src.empty() || default_template_src == "chatml") {
         if (!template_tool_use_src.empty()) {
             default_template_src = template_tool_use_src;

@@ -261,6 +261,12 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
 bool        common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
 std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
 
+// Looks up the built-in ("inline") chat template registered for the given model
+// architecture name (see common/chat-inline-templates.h.in). Returns an empty
+// string if none is registered. Serves as the fallback for models that ship
+// without an embedded chat_template when no template override is supplied.
+std::string common_chat_template_inline(const std::string & arch);
+
 struct common_chat_params common_chat_templates_apply(const struct common_chat_templates *        tmpls,
                                                       const struct common_chat_templates_inputs & inputs);
 

@@ -48,6 +48,7 @@
     "DeepseekV2ForCausalLM": "deepseek",
     "DeepseekV3ForCausalLM": "deepseek",
     "DeepseekV32ForCausalLM": "deepseek",
+    "DeepseekV4ForCausalLM": "deepseek",
     "DistilBertForMaskedLM": "bert",
     "DistilBertForSequenceClassification": "bert",
     "DistilBertModel": "bert",

@@ -1332,15 +1332,15 @@

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))  # ty: ignore[unresolved-attribute]
        assert max(tokenizer.vocab.values()) < vocab_size  # ty: ignore[unresolved-attribute]

        tokpre = self.get_vocab_base_pre(tokenizer)

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}  # ty: ignore[unresolved-attribute]
        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]

        added_tokens_decoder = tokenizer.added_tokens_decoder  # ty: ignore[unresolved-attribute]

        for i in range(vocab_size):
            if i not in reverse_vocab:
@@ -1353,7 +1353,7 @@
                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
                    if not added_tokens_decoder[i].normalized:
                        previous_token = token
                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))  # ty: ignore[unresolved-attribute, invalid-assignment]
                        if previous_token != token:
                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

@@ -1714,14 +1714,14 @@
    def _set_vocab_hybriddna(self):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))  # ty: ignore[unresolved-attribute]
        assert max(tokenizer.vocab.values()) < vocab_size  # ty: ignore[unresolved-attribute]

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}  # ty: ignore[unresolved-attribute]
        # k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
        # dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
        # k-mer's own id (llama.cpp strips it on detokenization)
        for kmer in tokenizer.kmers:  # ty: ignore[unresolved-attribute]
            reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000"  # ty: ignore[unresolved-attribute]
        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
        added_tokens_decoder = tokenizer.added_tokens_decoder  # ty: ignore[unresolved-attribute]
@@ -2587,6 +2587,17 @@
         return cls._wrap_fn(func)(*args, **kwargs)
 
 
+if hasattr(torch, "float8_e8m0fnu"):
+    _torch_float8_e8m0 = torch.float8_e8m0fnu
+    LazyTorchTensor._dtype_map[_torch_float8_e8m0] = np.uint8
+    LazyTorchTensor._dtype_byteswap_map[_torch_float8_e8m0] = np.uint8
+    LazyTorchTensor._dtype_str_map["F8_E8M0"] = _torch_float8_e8m0
+else:
+    # Older torch builds do not expose F8_E8M0. Keep the raw bytes so callers
+    # that know the format can decode them explicitly.
+    LazyTorchTensor._dtype_str_map["F8_E8M0"] = torch.uint8
+
+
 def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
     # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
     # maybe we should fallback to text model's arch in that case, since not many models have both