diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b06d98b32..d67ad2a81e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,18 @@ endfunction() if (LLAMA_BUILD) set(BUILD_SHARED_LIBS "On") + # Newer embedded llama.cpp snapshots expect build metadata even when they + # are added as a subdirectory instead of built standalone. + if (NOT DEFINED LLAMA_BUILD_NUMBER) + set(LLAMA_BUILD_NUMBER 0 CACHE STRING "Embedded llama.cpp build number" FORCE) + endif() + if (NOT DEFINED LLAMA_BUILD_COMMIT) + set(LLAMA_BUILD_COMMIT unknown CACHE STRING "Embedded llama.cpp build commit" FORCE) + endif() + if (NOT DEFINED LLAMA_INSTALL_VERSION) + set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER} CACHE STRING "Embedded llama.cpp install version" FORCE) + endif() + set(CMAKE_SKIP_BUILD_RPATH FALSE) # When building, don't use the install RPATH already diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b5175a7f2e..41a2e09a5e 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -43,6 +43,8 @@ def __init__( self.params = params self.verbose = verbose self._exit_stack = ExitStack() + self.sampler = None + self.custom_samplers: List[Tuple[int, Any]] = [] model = None diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 71d94ebd82..44005bb6b8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -341,7 +341,11 @@ def __init__( self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv - self.context_params.flash_attn = flash_attn + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + if flash_attn + else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED + ) if op_offload is not None: self.context_params.op_offload = op_offload @@ -2096,7 +2100,10 @@ def __getstate__(self): logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, - flash_attn=self.context_params.flash_attn, + flash_attn=( + self.context_params.flash_attn_type + == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + ), op_offload=self.context_params.op_offload, swa_full=self.context_params.swa_full, # Sampling Params diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f738ab9bb4..a77bcdb399 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -243,6 +243,7 @@ def raise_exception(message: str): tools=tools, tool_choice=tool_choice, strftime_now=self.strftime_now, + **kwargs, ) stopping_criteria = None @@ -599,6 +600,7 @@ def chat_completion_handler( function_call=function_call, tools=tools, tool_choice=tool_choice, + **kwargs, ) prompt = llama.tokenize( result.prompt.encode("utf-8"), @@ -744,12 +746,16 @@ def hf_tokenizer_config_to_chat_formatter( chat_template = tokenizer_config["chat_template"] assert "bos_token" in tokenizer_config - assert isinstance(tokenizer_config["bos_token"], str) - bos_token = tokenizer_config["bos_token"] + assert tokenizer_config["bos_token"] is None or isinstance( + tokenizer_config["bos_token"], str + ) + bos_token = tokenizer_config["bos_token"] or "" assert "eos_token" in tokenizer_config - assert isinstance(tokenizer_config["eos_token"], str) - eos_token = tokenizer_config["eos_token"] + assert tokenizer_config["eos_token"] is None or isinstance( + tokenizer_config["eos_token"], str + ) + eos_token = tokenizer_config["eos_token"] or "" env = ImmutableSandboxedEnvironment( trim_blocks=True, @@ -760,22 +766,17 @@ def format_tokenizer_config( messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: - # TODO: veryify this is correct - # Add a blank assistant message to the end of the messages to prompt the model to generate a response - if add_generation_prompt: - messages = [ - *messages, - llama_types.ChatCompletionRequestAssistantMessage( - role="assistant", content="" - ), - ] prompt = env.render( messages=messages, bos_token=bos_token, eos_token=eos_token, + add_generation_prompt=add_generation_prompt, + **kwargs, ) return ChatFormatterResponse( - prompt=prompt, stop=[eos_token, bos_token], added_special=True + prompt=prompt, + stop=[token for token in (eos_token, bos_token) if token], + added_special=True, ) return format_tokenizer_config diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 711d42a6ae..345aec9a05 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -462,6 +462,15 @@ LLAMA_ATTENTION_TYPE_CAUSAL = 0 LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1 +# enum llama_flash_attn_type { +# LLAMA_FLASH_ATTN_TYPE_AUTO = -1, +# LLAMA_FLASH_ATTN_TYPE_DISABLED = 0, +# LLAMA_FLASH_ATTN_TYPE_ENABLED = 1, +# }; +LLAMA_FLASH_ATTN_TYPE_AUTO = -1 +LLAMA_FLASH_ATTN_TYPE_DISABLED = 0 +LLAMA_FLASH_ATTN_TYPE_ENABLED = 1 + # enum llama_split_mode { # LLAMA_SPLIT_MODE_NONE = 0, // single GPU @@ -710,9 +719,12 @@ class llama_model_params(ctypes.Structure): kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data vocab_only (bool): only load the vocabulary, no weights use_mmap (bool): use mmap if possible + use_direct_io (bool): use direct IO if supported use_mlock (bool): force system to keep model in RAM check_tensors (bool): validate model tensor data - use_extra_bufts (bool): use extra buffer types (used for weight repacking)""" + use_extra_bufts (bool): use extra buffer types (used for weight repacking) + no_host (bool): bypass host buffer allowing extra buffers to be used + no_alloc (bool): only load metadata and simulate memory allocations""" if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused @@ -726,9 +738,12 @@ class llama_model_params(ctypes.Structure): kv_overrides: CtypesArray[llama_model_kv_override] vocab_only: bool use_mmap: bool + use_direct_io: bool use_mlock: bool check_tensors: bool use_extra_bufts: bool + no_host: bool + no_alloc: bool _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused @@ -742,9 +757,12 @@ class llama_model_params(ctypes.Structure): ("kv_overrides", ctypes.POINTER(llama_model_kv_override)), ("vocab_only", ctypes.c_bool), ("use_mmap", ctypes.c_bool), + ("use_direct_io", ctypes.c_bool), ("use_mlock", ctypes.c_bool), ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), + ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), ] @@ -810,6 +828,7 @@ class llama_context_params(ctypes.Structure): rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) attention_type (int): attention type to use for embeddings + flash_attn_type (int): when to enable Flash Attention rope_freq_base (float): RoPE base frequency, 0 = from model rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model @@ -826,11 +845,12 @@ class llama_context_params(ctypes.Structure): abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU - flash_attn (bool): whether to use flash attention no_perf (bool): whether to measure performance timings op_offload (bool): offload host tensor operations to device swa_full (bool): use full-size SWA cache kv_unified (bool): use a unified buffer across the input sequences when computing the attention + samplers (ctypes.c_void_p): backend sampler chain configuration + n_samplers (int): number of backend sampler chain entries """ if TYPE_CHECKING: @@ -843,6 +863,7 @@ class llama_context_params(ctypes.Structure): rope_scaling_type: int pooling_type: int attention_type: int + flash_attn_type: int rope_freq_base: float rope_freq_scale: float yarn_ext_factor: float @@ -859,11 +880,12 @@ class llama_context_params(ctypes.Structure): abort_callback_data: ctypes.c_void_p embeddings: bool offload_kqv: bool - flash_attn: bool no_perf: bool op_offload: bool swa_full: bool kv_unified: bool + samplers: ctypes.c_void_p + n_samplers: int _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -875,6 +897,7 @@ class llama_context_params(ctypes.Structure): ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), + ("flash_attn_type", ctypes.c_int), ("rope_freq_base", ctypes.c_float), ("rope_freq_scale", ctypes.c_float), ("yarn_ext_factor", ctypes.c_float), @@ -891,11 +914,12 @@ class llama_context_params(ctypes.Structure): ("abort_callback_data", ctypes.c_void_p), ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), - ("flash_attn", ctypes.c_bool), ("no_perf", ctypes.c_bool), ("op_offload", ctypes.c_bool), ("swa_full", ctypes.c_bool), ("kv_unified", ctypes.c_bool), + ("samplers", ctypes.c_void_p), + ("n_samplers", ctypes.c_size_t), ] @@ -1409,6 +1433,7 @@ def llama_pooling_type(ctx: llama_context_p, /) -> int: "llama_get_kv_self", [llama_context_p_ctypes], llama_kv_cache_p_ctypes, + enabled=False, ) def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: """Get the KV cache for self-attention (DEPRECATED)""" @@ -1739,6 +1764,7 @@ def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): "llama_set_adapter_lora", [llama_context_p_ctypes, llama_adapter_lora_p_ctypes, ctypes.c_float], ctypes.c_int32, + enabled=False, ) def llama_set_adapter_lora( ctx: llama_context_p, adapter: llama_adapter_lora_p, scale: float, / @@ -1757,6 +1783,7 @@ def llama_set_adapter_lora( "llama_rm_adapter_lora", [llama_context_p_ctypes, llama_adapter_lora_p_ctypes], ctypes.c_int32, + enabled=False, ) def llama_rm_adapter_lora( ctx: llama_context_p, adapter: llama_adapter_lora_p, / @@ -1772,6 +1799,7 @@ def llama_rm_adapter_lora( "llama_clear_adapter_lora", [llama_context_p_ctypes], None, + enabled=False, ) def llama_clear_adapter_lora(ctx: llama_context_p, /): """Remove all LoRA adapters from given context""" @@ -1802,6 +1830,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /): ctypes.c_int32, ], ctypes.c_int32, + enabled=False, ) def llama_apply_adapter_cvec( ctx: llama_context_p, @@ -2045,7 +2074,7 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: # DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), # "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); @ctypes_function( - "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 + "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32, enabled=False ) def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" @@ -2056,7 +2085,7 @@ def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: # DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), # "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); @ctypes_function( - "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 + "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32, enabled=False ) def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: """Returns the number of used KV cells (DEPRECATED)""" @@ -2068,7 +2097,7 @@ def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: # struct llama_context * ctx), # "Use llama_memory_clear() instead"); @ctypes_function( - "llama_kv_self_clear", [llama_context_p_ctypes], None + "llama_kv_self_clear", [llama_context_p_ctypes], None, enabled=False ) def llama_kv_self_clear(ctx: llama_context_p, /): """Clear the KV cache (DEPRECATED)""" @@ -2095,6 +2124,7 @@ def llama_kv_self_clear(ctx: llama_context_p, /): llama_pos, ], ctypes.c_bool, + enabled=False, ) def llama_kv_self_seq_rm( ctx: llama_context_p, @@ -2128,6 +2158,7 @@ def llama_kv_self_seq_rm( llama_pos, ], None, + enabled=False, ) def llama_kv_self_seq_cp( ctx: llama_context_p, @@ -2147,7 +2178,7 @@ def llama_kv_self_seq_cp( # llama_seq_id seq_id), # "Use llama_memory_seq_keep() instead"); @ctypes_function( - "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None + "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None, enabled=False ) def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): """Keep only specified sequence in KV cache (DEPRECATED)""" @@ -2176,6 +2207,7 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int llama_pos, ], None, + enabled=False, ) def llama_kv_self_seq_add( ctx: llama_context_p, @@ -2211,6 +2243,7 @@ def llama_kv_self_seq_add( ctypes.c_int, ], None, + enabled=False, ) def llama_kv_self_seq_div( ctx: llama_context_p, @@ -2233,7 +2266,7 @@ def llama_kv_self_seq_div( # llama_seq_id seq_id), # "Use llama_memory_seq_pos_min() instead"); @ctypes_function( - "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos + "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos, enabled=False ) def llama_kv_self_seq_pos_min( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / @@ -2250,7 +2283,7 @@ def llama_kv_self_seq_pos_min( # llama_seq_id seq_id), # "Use llama_memory_seq_pos_max() instead"); @ctypes_function( - "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos + "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos, enabled=False ) def llama_kv_self_seq_pos_max( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / @@ -2264,7 +2297,7 @@ def llama_kv_self_seq_pos_max( # // - lazily on next llama_decode() # DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), # "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); -@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None, enabled=False) def llama_kv_self_defrag(ctx: llama_context_p, /): """Defragment the KV cache (DEPRECATED)""" ... @@ -2273,7 +2306,7 @@ def llama_kv_self_defrag(ctx: llama_context_p, /): # // Check if the context supports KV cache shifting # DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), # "use llama_memory_can_shift() instead"); -@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool, enabled=False) def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: """Check if the context supports KV cache shifting (DEPRECATED)""" ... @@ -2282,7 +2315,7 @@ def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) # DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), # "simply remove this call, updates are applied lazily on the next llama_decode()"); -@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None, enabled=False) def llama_kv_self_update(ctx: llama_context_p, /): """Apply the KV cache updates (DEPRECATED)""" ... @@ -3806,7 +3839,7 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), # "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); -@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) +@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes, enabled=False) def llama_sampler_init_softmax() -> llama_sampler_p: ... diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72b7..3266447ae8 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -86,4 +86,40 @@ def test_hf_tokenizer_config_str_to_chat_formatter(): ] ) - assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]" "") + assert chat_formatter_respoonse.prompt == "[INST] Hello, world! [/INST]" + + +def test_jinja2_chat_formatter_passes_template_kwargs(): + chat_formatter = llama_chat_format.Jinja2ChatFormatter( + template="{{ '\n\n\n\n' if enable_thinking is defined and enable_thinking is false else '\n' }}", + eos_token="<|im_end|>", + bos_token="", + ) + + response = chat_formatter( + messages=[ + ChatCompletionRequestUserMessage(role="user", content="Hello, world!"), + ], + enable_thinking=False, + ) + + assert response.prompt == "\n\n\n\n" + + +def test_hf_tokenizer_config_supports_null_bos_and_template_generation_prompt(): + tokenizer_config = { + "chat_template": "{{ bos_token }}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}{% if enable_thinking is defined and enable_thinking is false %}\n\n\n\n{% endif %}", + "bos_token": None, + "eos_token": "<|im_end|>", + } + chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config) + + response = chat_formatter( + messages=[ + ChatCompletionRequestUserMessage(role="user", content="Hello, world!"), + ], + enable_thinking=False, + ) + + assert response.prompt == "<|im_start|>assistant\n\n\n\n\n" + assert response.stop == ["<|im_end|>"] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be42..182acfe5c5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit 182acfe5c5eb17a4f82d9181fa7bd91510e3b93d