diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4b06d98b32..d67ad2a81e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,18 @@ endfunction()
if (LLAMA_BUILD)
set(BUILD_SHARED_LIBS "On")
+ # Newer embedded llama.cpp snapshots expect build metadata even when they
+ # are added as a subdirectory instead of built standalone.
+ if (NOT DEFINED LLAMA_BUILD_NUMBER)
+ set(LLAMA_BUILD_NUMBER 0 CACHE STRING "Embedded llama.cpp build number" FORCE)
+ endif()
+ if (NOT DEFINED LLAMA_BUILD_COMMIT)
+ set(LLAMA_BUILD_COMMIT unknown CACHE STRING "Embedded llama.cpp build commit" FORCE)
+ endif()
+ if (NOT DEFINED LLAMA_INSTALL_VERSION)
+ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER} CACHE STRING "Embedded llama.cpp install version" FORCE)
+ endif()
+
set(CMAKE_SKIP_BUILD_RPATH FALSE)
# When building, don't use the install RPATH already
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index b5175a7f2e..41a2e09a5e 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -43,6 +43,8 @@ def __init__(
self.params = params
self.verbose = verbose
self._exit_stack = ExitStack()
+ self.sampler = None
+ self.custom_samplers: List[Tuple[int, Any]] = []
model = None
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 71d94ebd82..44005bb6b8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -341,7 +341,11 @@ def __init__(
self._logits_all = logits_all if draft_model is None else True
self.context_params.embeddings = embedding # TODO: Rename to embeddings
self.context_params.offload_kqv = offload_kqv
- self.context_params.flash_attn = flash_attn
+ self.context_params.flash_attn_type = (
+ llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+ if flash_attn
+ else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+ )
if op_offload is not None:
self.context_params.op_offload = op_offload
@@ -2096,7 +2100,10 @@ def __getstate__(self):
logits_all=self._logits_all,
embedding=self.context_params.embeddings,
offload_kqv=self.context_params.offload_kqv,
- flash_attn=self.context_params.flash_attn,
+ flash_attn=(
+ self.context_params.flash_attn_type
+ == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+ ),
op_offload=self.context_params.op_offload,
swa_full=self.context_params.swa_full,
# Sampling Params
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index f738ab9bb4..a77bcdb399 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -243,6 +243,7 @@ def raise_exception(message: str):
tools=tools,
tool_choice=tool_choice,
strftime_now=self.strftime_now,
+ **kwargs,
)
stopping_criteria = None
@@ -599,6 +600,7 @@ def chat_completion_handler(
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
+ **kwargs,
)
prompt = llama.tokenize(
result.prompt.encode("utf-8"),
@@ -744,12 +746,16 @@ def hf_tokenizer_config_to_chat_formatter(
chat_template = tokenizer_config["chat_template"]
assert "bos_token" in tokenizer_config
- assert isinstance(tokenizer_config["bos_token"], str)
- bos_token = tokenizer_config["bos_token"]
+ assert tokenizer_config["bos_token"] is None or isinstance(
+ tokenizer_config["bos_token"], str
+ )
+ bos_token = tokenizer_config["bos_token"] or ""
assert "eos_token" in tokenizer_config
- assert isinstance(tokenizer_config["eos_token"], str)
- eos_token = tokenizer_config["eos_token"]
+ assert tokenizer_config["eos_token"] is None or isinstance(
+ tokenizer_config["eos_token"], str
+ )
+ eos_token = tokenizer_config["eos_token"] or ""
env = ImmutableSandboxedEnvironment(
trim_blocks=True,
@@ -760,22 +766,17 @@ def format_tokenizer_config(
messages: List[llama_types.ChatCompletionRequestMessage],
**kwargs: Any,
) -> ChatFormatterResponse:
- # TODO: veryify this is correct
- # Add a blank assistant message to the end of the messages to prompt the model to generate a response
- if add_generation_prompt:
- messages = [
- *messages,
- llama_types.ChatCompletionRequestAssistantMessage(
- role="assistant", content=""
- ),
- ]
prompt = env.render(
messages=messages,
bos_token=bos_token,
eos_token=eos_token,
+ add_generation_prompt=add_generation_prompt,
+ **kwargs,
)
return ChatFormatterResponse(
- prompt=prompt, stop=[eos_token, bos_token], added_special=True
+ prompt=prompt,
+ stop=[token for token in (eos_token, bos_token) if token],
+ added_special=True,
)
return format_tokenizer_config
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 711d42a6ae..345aec9a05 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -462,6 +462,15 @@
LLAMA_ATTENTION_TYPE_CAUSAL = 0
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
+# enum llama_flash_attn_type {
+# LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
+# LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+# LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
+# };
+LLAMA_FLASH_ATTN_TYPE_AUTO = -1
+LLAMA_FLASH_ATTN_TYPE_DISABLED = 0
+LLAMA_FLASH_ATTN_TYPE_ENABLED = 1
+
# enum llama_split_mode {
# LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -710,9 +719,12 @@ class llama_model_params(ctypes.Structure):
kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
vocab_only (bool): only load the vocabulary, no weights
use_mmap (bool): use mmap if possible
+ use_direct_io (bool): use direct IO if supported
use_mlock (bool): force system to keep model in RAM
check_tensors (bool): validate model tensor data
- use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""
+ use_extra_bufts (bool): use extra buffer types (used for weight repacking)
+ no_host (bool): bypass host buffer allowing extra buffers to be used
+ no_alloc (bool): only load metadata and simulate memory allocations"""
if TYPE_CHECKING:
devices: CtypesArray[ctypes.c_void_p] # NOTE: unused
@@ -726,9 +738,12 @@ class llama_model_params(ctypes.Structure):
kv_overrides: CtypesArray[llama_model_kv_override]
vocab_only: bool
use_mmap: bool
+ use_direct_io: bool
use_mlock: bool
check_tensors: bool
use_extra_bufts: bool
+ no_host: bool
+ no_alloc: bool
_fields_ = [
("devices", ctypes.c_void_p), # NOTE: unnused
@@ -742,9 +757,12 @@ class llama_model_params(ctypes.Structure):
("kv_overrides", ctypes.POINTER(llama_model_kv_override)),
("vocab_only", ctypes.c_bool),
("use_mmap", ctypes.c_bool),
+ ("use_direct_io", ctypes.c_bool),
("use_mlock", ctypes.c_bool),
("check_tensors", ctypes.c_bool),
("use_extra_bufts", ctypes.c_bool),
+ ("no_host", ctypes.c_bool),
+ ("no_alloc", ctypes.c_bool),
]
@@ -810,6 +828,7 @@ class llama_context_params(ctypes.Structure):
rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
attention_type (int): attention type to use for embeddings
+ flash_attn_type (int): when to enable Flash Attention
rope_freq_base (float): RoPE base frequency, 0 = from model
rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -826,11 +845,12 @@ class llama_context_params(ctypes.Structure):
abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
embeddings (bool): if true, extract embeddings (together with logits)
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
- flash_attn (bool): whether to use flash attention
no_perf (bool): whether to measure performance timings
op_offload (bool): offload host tensor operations to device
swa_full (bool): use full-size SWA cache
kv_unified (bool): use a unified buffer across the input sequences when computing the attention
+ samplers (ctypes.c_void_p): backend sampler chain configuration
+ n_samplers (int): number of backend sampler chain entries
"""
if TYPE_CHECKING:
@@ -843,6 +863,7 @@ class llama_context_params(ctypes.Structure):
rope_scaling_type: int
pooling_type: int
attention_type: int
+ flash_attn_type: int
rope_freq_base: float
rope_freq_scale: float
yarn_ext_factor: float
@@ -859,11 +880,12 @@ class llama_context_params(ctypes.Structure):
abort_callback_data: ctypes.c_void_p
embeddings: bool
offload_kqv: bool
- flash_attn: bool
no_perf: bool
op_offload: bool
swa_full: bool
kv_unified: bool
+ samplers: ctypes.c_void_p
+ n_samplers: int
_fields_ = [
("n_ctx", ctypes.c_uint32),
@@ -875,6 +897,7 @@ class llama_context_params(ctypes.Structure):
("rope_scaling_type", ctypes.c_int),
("pooling_type", ctypes.c_int),
("attention_type", ctypes.c_int),
+ ("flash_attn_type", ctypes.c_int),
("rope_freq_base", ctypes.c_float),
("rope_freq_scale", ctypes.c_float),
("yarn_ext_factor", ctypes.c_float),
@@ -891,11 +914,12 @@ class llama_context_params(ctypes.Structure):
("abort_callback_data", ctypes.c_void_p),
("embeddings", ctypes.c_bool),
("offload_kqv", ctypes.c_bool),
- ("flash_attn", ctypes.c_bool),
("no_perf", ctypes.c_bool),
("op_offload", ctypes.c_bool),
("swa_full", ctypes.c_bool),
("kv_unified", ctypes.c_bool),
+ ("samplers", ctypes.c_void_p),
+ ("n_samplers", ctypes.c_size_t),
]
@@ -1409,6 +1433,7 @@ def llama_pooling_type(ctx: llama_context_p, /) -> int:
"llama_get_kv_self",
[llama_context_p_ctypes],
llama_kv_cache_p_ctypes,
+ enabled=False,
)
def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
"""Get the KV cache for self-attention (DEPRECATED)"""
@@ -1739,6 +1764,7 @@ def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /):
"llama_set_adapter_lora",
[llama_context_p_ctypes, llama_adapter_lora_p_ctypes, ctypes.c_float],
ctypes.c_int32,
+ enabled=False,
)
def llama_set_adapter_lora(
ctx: llama_context_p, adapter: llama_adapter_lora_p, scale: float, /
@@ -1757,6 +1783,7 @@ def llama_set_adapter_lora(
"llama_rm_adapter_lora",
[llama_context_p_ctypes, llama_adapter_lora_p_ctypes],
ctypes.c_int32,
+ enabled=False,
)
def llama_rm_adapter_lora(
ctx: llama_context_p, adapter: llama_adapter_lora_p, /
@@ -1772,6 +1799,7 @@ def llama_rm_adapter_lora(
"llama_clear_adapter_lora",
[llama_context_p_ctypes],
None,
+ enabled=False,
)
def llama_clear_adapter_lora(ctx: llama_context_p, /):
"""Remove all LoRA adapters from given context"""
@@ -1802,6 +1830,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /):
ctypes.c_int32,
],
ctypes.c_int32,
+ enabled=False,
)
def llama_apply_adapter_cvec(
ctx: llama_context_p,
@@ -2045,7 +2074,7 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool:
# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
@ctypes_function(
- "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
+ "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32, enabled=False
)
def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
"""Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)"""
@@ -2056,7 +2085,7 @@ def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
@ctypes_function(
- "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32
+ "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32, enabled=False
)
def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
"""Returns the number of used KV cells (DEPRECATED)"""
@@ -2068,7 +2097,7 @@ def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
# struct llama_context * ctx),
# "Use llama_memory_clear() instead");
@ctypes_function(
- "llama_kv_self_clear", [llama_context_p_ctypes], None
+ "llama_kv_self_clear", [llama_context_p_ctypes], None, enabled=False
)
def llama_kv_self_clear(ctx: llama_context_p, /):
"""Clear the KV cache (DEPRECATED)"""
@@ -2095,6 +2124,7 @@ def llama_kv_self_clear(ctx: llama_context_p, /):
llama_pos,
],
ctypes.c_bool,
+ enabled=False,
)
def llama_kv_self_seq_rm(
ctx: llama_context_p,
@@ -2128,6 +2158,7 @@ def llama_kv_self_seq_rm(
llama_pos,
],
None,
+ enabled=False,
)
def llama_kv_self_seq_cp(
ctx: llama_context_p,
@@ -2147,7 +2178,7 @@ def llama_kv_self_seq_cp(
# llama_seq_id seq_id),
# "Use llama_memory_seq_keep() instead");
@ctypes_function(
- "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
+ "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None, enabled=False
)
def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
"""Keep only specified sequence in KV cache (DEPRECATED)"""
@@ -2176,6 +2207,7 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int
llama_pos,
],
None,
+ enabled=False,
)
def llama_kv_self_seq_add(
ctx: llama_context_p,
@@ -2211,6 +2243,7 @@ def llama_kv_self_seq_add(
ctypes.c_int,
],
None,
+ enabled=False,
)
def llama_kv_self_seq_div(
ctx: llama_context_p,
@@ -2233,7 +2266,7 @@ def llama_kv_self_seq_div(
# llama_seq_id seq_id),
# "Use llama_memory_seq_pos_min() instead");
@ctypes_function(
- "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos
+ "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos, enabled=False
)
def llama_kv_self_seq_pos_min(
ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /
@@ -2250,7 +2283,7 @@ def llama_kv_self_seq_pos_min(
# llama_seq_id seq_id),
# "Use llama_memory_seq_pos_max() instead");
@ctypes_function(
- "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos
+ "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos, enabled=False
)
def llama_kv_self_seq_pos_max(
ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /
@@ -2264,7 +2297,7 @@ def llama_kv_self_seq_pos_max(
# // - lazily on next llama_decode()
# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None)
+@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None, enabled=False)
def llama_kv_self_defrag(ctx: llama_context_p, /):
"""Defragment the KV cache (DEPRECATED)"""
...
@@ -2273,7 +2306,7 @@ def llama_kv_self_defrag(ctx: llama_context_p, /):
# // Check if the context supports KV cache shifting
# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
# "use llama_memory_can_shift() instead");
-@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
+@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool, enabled=False)
def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
"""Check if the context supports KV cache shifting (DEPRECATED)"""
...
@@ -2282,7 +2315,7 @@ def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
# "simply remove this call, updates are applied lazily on the next llama_decode()");
-@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
+@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None, enabled=False)
def llama_kv_self_update(ctx: llama_context_p, /):
"""Apply the KV cache updates (DEPRECATED)"""
...
@@ -3806,7 +3839,7 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
# "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
+@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes, enabled=False)
def llama_sampler_init_softmax() -> llama_sampler_p:
...
diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py
index f031bf72b7..3266447ae8 100644
--- a/tests/test_llama_chat_format.py
+++ b/tests/test_llama_chat_format.py
@@ -86,4 +86,40 @@ def test_hf_tokenizer_config_str_to_chat_formatter():
]
)
- assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]" "")
+ assert chat_formatter_respoonse.prompt == "[INST] Hello, world! [/INST]"
+
+
+def test_jinja2_chat_formatter_passes_template_kwargs():
+ chat_formatter = llama_chat_format.Jinja2ChatFormatter(
+ template="{{ '\n\n\n\n' if enable_thinking is defined and enable_thinking is false else '\n' }}",
+ eos_token="<|im_end|>",
+ bos_token="",
+ )
+
+ response = chat_formatter(
+ messages=[
+ ChatCompletionRequestUserMessage(role="user", content="Hello, world!"),
+ ],
+ enable_thinking=False,
+ )
+
+ assert response.prompt == "\n\n\n\n"
+
+
+def test_hf_tokenizer_config_supports_null_bos_and_template_generation_prompt():
+ tokenizer_config = {
+ "chat_template": "{{ bos_token }}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}{% if enable_thinking is defined and enable_thinking is false %}\n\n\n\n{% endif %}",
+ "bos_token": None,
+ "eos_token": "<|im_end|>",
+ }
+ chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config)
+
+ response = chat_formatter(
+ messages=[
+ ChatCompletionRequestUserMessage(role="user", content="Hello, world!"),
+ ],
+ enable_thinking=False,
+ )
+
+ assert response.prompt == "<|im_start|>assistant\n\n\n\n\n"
+ assert response.stop == ["<|im_end|>"]
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 4227c9be42..182acfe5c5 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 4227c9be4268ac844921b90f31595f81236bd317
+Subproject commit 182acfe5c5eb17a4f82d9181fa7bd91510e3b93d