Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,52 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

#
# inline chat templates
#
# Built-in Jinja chat templates for model architectures whose GGUF files ship
# without an embedded chat_template. The header is generated from the manifest
# common/chat-inline-templates.h.in into the build directory and included by
# common/chat.cpp (a quoted include resolves to the generated copy because no
# chat-inline-templates.h exists next to chat.cpp).

# Paths used by the generation step. They are quoted everywhere below so that
# source/build directories containing spaces or semicolons stay one argument.
set(CHAT_INLINE_TEMPLATES_MANIFEST "${CMAKE_CURRENT_SOURCE_DIR}/chat-inline-templates.h.in")
set(CHAT_INLINE_TEMPLATES_GEN_DIR  "${CMAKE_CURRENT_BINARY_DIR}/generated")
set(CHAT_INLINE_TEMPLATES_HEADER   "${CHAT_INLINE_TEMPLATES_GEN_DIR}/chat-inline-templates.h")
set(CHAT_INLINE_TEMPLATES_SCRIPT   "${PROJECT_SOURCE_DIR}/scripts/gen-chat-inline-templates.py")

# every template the manifest may reference, so the header regenerates on edits.
# CONFIGURE_DEPENDS (CMake >= 3.12) re-checks the glob at build time, so adding
# or removing a .jinja file is noticed without a manual re-configure.
file(GLOB CHAT_INLINE_TEMPLATE_FILES CONFIGURE_DEPENDS
    "${PROJECT_SOURCE_DIR}/models/templates/*.jinja")

find_package(Python3 COMPONENTS Interpreter QUIET)

if (Python3_FOUND)
    # A previous configure that ran without Python3 leaves the empty fallback
    # header at the OUTPUT path. Timestamp-based generators may consider that
    # file up to date and never run the rule below, so drop it here.
    if (EXISTS "${CHAT_INLINE_TEMPLATES_HEADER}")
        file(STRINGS "${CHAT_INLINE_TEMPLATES_HEADER}" CHAT_INLINE_TEMPLATES_FALLBACK_MARKER
            REGEX "^// AUTO-GENERATED fallback"
            LIMIT_COUNT 1)
        if (CHAT_INLINE_TEMPLATES_FALLBACK_MARKER)
            file(REMOVE "${CHAT_INLINE_TEMPLATES_HEADER}")
        endif()
    endif()

    # Build rule: run the generator script with the manifest, the repository
    # root and the output path; re-run when the manifest, the script or any
    # globbed template changes.
    add_custom_command(
        OUTPUT  "${CHAT_INLINE_TEMPLATES_HEADER}"
        COMMAND "${CMAKE_COMMAND}" -E make_directory "${CHAT_INLINE_TEMPLATES_GEN_DIR}"
        COMMAND "${Python3_EXECUTABLE}" "${CHAT_INLINE_TEMPLATES_SCRIPT}"
                --manifest  "${CHAT_INLINE_TEMPLATES_MANIFEST}"
                --repo-root "${PROJECT_SOURCE_DIR}"
                --output    "${CHAT_INLINE_TEMPLATES_HEADER}"
        DEPENDS "${CHAT_INLINE_TEMPLATES_MANIFEST}"
                "${CHAT_INLINE_TEMPLATES_SCRIPT}"
                ${CHAT_INLINE_TEMPLATE_FILES}
        COMMENT "Generating chat-inline-templates.h"
        VERBATIM
    )
else()
    # Without an interpreter the header cannot be generated: write an empty
    # table at configure time so code including the header still compiles.
    message(WARNING "Python3 not found: inline chat templates disabled (generating empty chat-inline-templates.h)")
    file(MAKE_DIRECTORY "${CHAT_INLINE_TEMPLATES_GEN_DIR}")
    file(WRITE "${CHAT_INLINE_TEMPLATES_HEADER}"
"// AUTO-GENERATED fallback (Python3 not found) - no inline templates available.
#pragma once
#include <cstddef>
struct common_chat_inline_template_entry { const char * arch; const char * tmpl; };
static const common_chat_inline_template_entry COMMON_CHAT_INLINE_TEMPLATES[] = { { nullptr, nullptr } };
static const size_t COMMON_CHAT_INLINE_TEMPLATES_COUNT = 0;
")
endif()

#
# llama-common
#
Expand All @@ -65,6 +111,7 @@ add_library(${TARGET}
chat-peg-parser.h
chat.cpp
chat.h
${CHAT_INLINE_TEMPLATES_HEADER}
common.cpp
common.h
console.cpp
Expand Down Expand Up @@ -125,6 +172,7 @@ set_target_properties(${TARGET} PROPERTIES
)

target_include_directories(${TARGET} PUBLIC . ../vendor)
target_include_directories(${TARGET} PRIVATE ${CHAT_INLINE_TEMPLATES_GEN_DIR})
target_compile_features (${TARGET} PUBLIC cxx_std_17)

if (BUILD_SHARED_LIBS)
Expand Down
22 changes: 22 additions & 0 deletions common/chat-inline-templates.h.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Manifest of built-in ("inline") Jinja chat templates.
//
// Some models ship GGUF files without an embedded chat_template. For those, we
// fall back to a built-in template selected by the model architecture name.
//
// This file is the MANIFEST: it is empty by default and only lists, as
// placeholder comments, the architectures that need an inline template. At
// build time scripts/gen-chat-inline-templates.py reads the placeholders below,
// embeds each referenced template file as a C string, and writes the filled-in
// chat-inline-templates.h into the build directory (which common/chat.cpp then
// includes). This file itself is never compiled.
//
// To register a new inline template, add a line of the form:
//
// /* <arch-name>: <path/to/template.jinja> */
//
// where <arch-name> exactly matches llm_arch_name() in src/llama-arch.cpp and
// the path is relative to the repository root.
//
// Registered inline templates:

/* deepseek-v4-flash: models/templates/deepseek-ai-DeepSeek-V4.jinja */
26 changes: 26 additions & 0 deletions common/chat.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "chat.h"

#include "chat-inline-templates.h"
#include "chat-auto-parser-helpers.h"
#include "chat-auto-parser.h"
#include "chat-peg-parser.h"
Expand Down Expand Up @@ -652,6 +653,15 @@ std::string common_chat_templates_source(const struct common_chat_templates * tm
return tmpls->template_default->source();
}

// Returns the built-in ("inline") chat template whose registered architecture
// name equals `arch`, or an empty string when no entry matches.
//
// Scans the first COMMON_CHAT_INLINE_TEMPLATES_COUNT entries of
// COMMON_CHAT_INLINE_TEMPLATES and returns the first match.
std::string common_chat_template_inline(const std::string & arch) {
    for (size_t i = 0; i < COMMON_CHAT_INLINE_TEMPLATES_COUNT; ++i) {
        const auto & entry = COMMON_CHAT_INLINE_TEMPLATES[i];
        // The table may hold null placeholders; comparing a std::string with,
        // or constructing one from, a null pointer is undefined behaviour.
        if (entry.arch == nullptr || entry.tmpl == nullptr) {
            continue;
        }
        if (arch == entry.arch) {
            return entry.tmpl;
        }
    }
    return "";
}

common_chat_templates_ptr common_chat_templates_init(const struct llama_model * model,
const std::string & chat_template_override,
const std::string & bos_token_override,
Expand All @@ -675,6 +685,22 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
} else {
default_template_src = chat_template_override;
}

// No template override and the model has no embedded chat template: fall back
// to a built-in inline template selected by the model architecture, if any.
if (chat_template_override.empty() && default_template_src.empty() && template_tool_use_src.empty() && model != nullptr) {
char arch_buf[128] = { 0 };
if (llama_model_meta_val_str(model, "general.architecture", arch_buf, sizeof(arch_buf)) > 0) {
std::string inline_template = common_chat_template_inline(arch_buf);
if (!inline_template.empty()) {
LOG_INF("%s: no chat template found in model, using built-in inline template for arch '%s'\n",
__func__, arch_buf);
default_template_src = inline_template;
has_explicit_template = true;
}
}
}

if (default_template_src.empty() || default_template_src == "chatml") {
if (!template_tool_use_src.empty()) {
default_template_src = template_tool_use_src;
Expand Down
6 changes: 6 additions & 0 deletions common/chat.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,12 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");

// Returns the built-in ("inline") chat template registered for the given model
// architecture name (see common/chat-inline-templates.h.in), or an empty string
// if none is registered. Used as a fallback for models that ship without an
// embedded chat_template and when no template override is supplied.
std::string common_chat_template_inline(const std::string & arch);

struct common_chat_params common_chat_templates_apply(const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs);

Expand Down
1 change: 1 addition & 0 deletions conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"DeepseekV2ForCausalLM": "deepseek",
"DeepseekV3ForCausalLM": "deepseek",
"DeepseekV32ForCausalLM": "deepseek",
"DeepseekV4ForCausalLM": "deepseek",
"DistilBertForMaskedLM": "bert",
"DistilBertForSequenceClassification": "bert",
"DistilBertModel": "bert",
Expand Down
11 changes: 11 additions & 0 deletions conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1332,15 +1332,15 @@

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]

Check warning on line 1335 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1335:76: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]

Check warning on line 1336 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1336:60: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

tokpre = self.get_vocab_base_pre(tokenizer)

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]

Check warning on line 1340 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1340:93: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]

Check warning on line 1341 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1341:52: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]

Check warning on line 1343 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1343:64: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

for i in range(vocab_size):
if i not in reverse_vocab:
Expand All @@ -1353,7 +1353,7 @@
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]

Check warning on line 1356 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1356:102: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

Expand Down Expand Up @@ -1714,14 +1714,14 @@
def _set_vocab_hybriddna(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]

Check warning on line 1717 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1717:76: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]

Check warning on line 1718 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1718:60: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]

Check warning on line 1720 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1720:93: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
# k-mer's own id (llama.cpp strips it on detokenization)
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]

Check warning on line 1724 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1724:39: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
Expand Down Expand Up @@ -2587,6 +2587,17 @@
return cls._wrap_fn(func)(*args, **kwargs)


# Register the "F8_E8M0" dtype string with LazyTorchTensor, depending on
# whether this torch build exposes torch.float8_e8m0fnu.
if not hasattr(torch, "float8_e8m0fnu"):
    # Older torch builds do not expose F8_E8M0. Keep the raw bytes so callers
    # that know the format can decode them explicitly.
    LazyTorchTensor._dtype_str_map["F8_E8M0"] = torch.uint8
else:
    _torch_float8_e8m0 = torch.float8_e8m0fnu
    LazyTorchTensor._dtype_str_map["F8_E8M0"] = _torch_float8_e8m0
    # Both the regular and the byteswap dtype maps use np.uint8 for this dtype.
    LazyTorchTensor._dtype_map[_torch_float8_e8m0] = np.uint8
    LazyTorchTensor._dtype_byteswap_map[_torch_float8_e8m0] = np.uint8


def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
# TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
# maybe we should fallback to text model's arch in that case, since not many models have both
Expand Down
Loading
Loading