From ca3b00a204d5a48b1c54eb609b20203dcdaa87be Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 22 Mar 2026 15:20:48 -0700 Subject: [PATCH 01/19] fix(ci): Rename `huggingface-cli` to `hf` (#2149) * Fix model download in test workflow * Use hf CLI in test workflow * Use hf CLI name in CI and docs * Reference PR in changelog --- .github/workflows/test.yaml | 2 +- CHANGELOG.md | 2 ++ README.md | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 95f6e5a27..1d2b1983c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -22,7 +22,7 @@ jobs: - name: Install huggingface-hub run: pip install huggingface-hub - name: Download model - run: huggingface-cli download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }} + run: hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }} - name: Cache model uses: actions/cache@v4 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 16954eb88..1f577c1a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149 + ## [0.3.16] - feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317 diff --git a/README.md b/README.md index 382f7cbed..d2ba297ca 100644 --- a/README.md +++ b/README.md @@ -328,7 +328,7 @@ llm = Llama.from_pretrained( ) ``` -By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory, you can then manage installed model files with the [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool. 
+By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory, you can then manage installed model files with the [`hf`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool. ### Chat Completion From 9f661ff2cf63e72aea328daab15e521230dd20b0 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 22 Mar 2026 16:10:47 -0700 Subject: [PATCH 02/19] fix(ci): Fix macos tests, support both Intel and Apple Silicon testing (#2150) * fix(ci): use supported macos runner label * fix(ci): add apple silicon macos test coverage * fix(ci): run standard macos tests on apple silicon * fix(ci): simplify apple silicon macos install * fix(ci): disable ggml native on apple silicon runner * docs: update changelog for macos ci runner fix --- .github/workflows/test.yaml | 11 +++++------ CHANGELOG.md | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1d2b1983c..af4cacac4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -96,7 +96,7 @@ jobs: build-macos: needs: download-model - runs-on: macos-13 + runs-on: macos-15 strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12"] @@ -127,17 +127,16 @@ jobs: run: | python3 -m pip install --upgrade pip python3 -m pip install uv - python3 -m uv pip install -e .[all] --verbose - CMAKE_ARGS="-DLLAMA_METAL=off" python3 -m uv pip install .[all] --verbose + CMAKE_ARGS="-DGGML_NATIVE=off" python3 -m uv pip install -e .[all] --verbose shell: bash - name: Test with pytest run: | python3 -m pytest - build-macos-metal: + build-macos-intel: needs: download-model - runs-on: macos-13 + runs-on: macos-15-intel steps: - uses: actions/checkout@v4 with: @@ -163,7 +162,7 @@ jobs: - name: Install dependencies run: | python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose + 
python3 -m pip install .[all] --verbose shell: bash - name: Test with pytest diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f577c1a4..9bc1c9a0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149 ## [0.3.16] From a9b4a067300c89857334195518e0bb9430d1c059 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 22 Mar 2026 16:12:02 -0700 Subject: [PATCH 03/19] misc: Add Ruff formatting (#2148) * Add Ruff formatting and safe lint baseline * Update changelog for Ruff setup --- .github/workflows/lint.yaml | 29 ++ .gitignore | 1 + CHANGELOG.md | 1 + Makefile | 10 + README.md | 14 + llama_cpp/_ggml.py | 2 +- llama_cpp/_internals.py | 98 ++++--- llama_cpp/_logger.py | 5 +- llama_cpp/llama.py | 22 +- llama_cpp/llama_cache.py | 6 +- llama_cpp/llama_chat_format.py | 192 ++++++++----- llama_cpp/llama_cpp.py | 470 ++++++++++++++------------------ llama_cpp/llama_grammar.py | 60 ++-- llama_cpp/llava_cpp.py | 28 +- llama_cpp/mtmd_cpp.py | 94 ++++--- llama_cpp/server/app.py | 6 +- llama_cpp/server/cli.py | 4 +- llama_cpp/server/model.py | 12 +- pyproject.toml | 13 +- tests/test_llama.py | 22 +- tests/test_llama_chat_format.py | 19 +- tests/test_llama_speculative.py | 9 +- 22 files changed, 607 insertions(+), 510 deletions(-) create mode 100644 .github/workflows/lint.yaml diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 000000000..8b3e6322d --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,29 @@ +name: Lint + +on: + pull_request: + branches: + - main + push: + branches: + - main + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 
+ with: + python-version: "3.12" + + - name: Install Ruff + run: python -m pip install "ruff>=0.15.7" + + - name: Lint with Ruff + run: python -m ruff check llama_cpp tests + + - name: Check formatting with Ruff + run: python -m ruff format --check llama_cpp tests diff --git a/.gitignore b/.gitignore index 9d68dbcd9..ff773c668 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,7 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +.ruff_cache/ cover/ # Translations diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bc1c9a0a..7044f44d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main` - fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149 diff --git a/Makefile b/Makefile index 26ddf2c7a..8e6cae2c1 100644 --- a/Makefile +++ b/Makefile @@ -67,6 +67,14 @@ deploy.gh-docs: test: python3 -m pytest --full-trace -v +lint: + python3 -m ruff check llama_cpp tests + python3 -m ruff format --check llama_cpp tests + +format: + python3 -m ruff check --fix llama_cpp tests + python3 -m ruff format llama_cpp tests + docker: docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile . @@ -93,5 +101,7 @@ clean: build.sdist \ deploy.pypi \ deploy.gh-docs \ + lint \ + format \ docker \ clean diff --git a/README.md b/README.md index d2ba297ca..b57c95807 100644 --- a/README.md +++ b/README.md @@ -752,6 +752,9 @@ pip install --upgrade pip # Install with pip pip install -e . 
+# install development tooling (tests, docs, ruff) +pip install -e '.[dev]' + # if you want to use the fastapi / openapi server pip install -e '.[server]' @@ -768,6 +771,17 @@ Now try running the tests pytest ``` +And check formatting / linting before opening a PR: + +```bash +python -m ruff check llama_cpp tests +python -m ruff format --check llama_cpp tests + +# or use the Makefile targets +make lint +make format +``` + There's a `Makefile` available with useful targets. A typical workflow would look like this: diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py index 5bee8a93b..5ece01e03 100644 --- a/llama_cpp/_ggml.py +++ b/llama_cpp/_ggml.py @@ -2,6 +2,7 @@ This module provides a minimal interface for working with ggml tensors from llama-cpp-python """ + import os import pathlib @@ -9,4 +10,3 @@ libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" libggml = ctypes_ext.load_shared_library("ggml", libggml_base_path) - diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b5175a7f2..b520b7ea5 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -355,7 +355,9 @@ def get_embeddings_seq(self, seq_id: int): # Sampling functions - deprecated, use LlamaSampler instead def set_rng_seed(self, seed: int): - raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "set_rng_seed is deprecated, use LlamaSampler instead" + ) def sample_repetition_penalties( self, @@ -366,30 +368,44 @@ def sample_repetition_penalties( penalty_freq: float, penalty_present: float, ): - raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_repetition_penalties is deprecated, use LlamaSampler instead" + ) def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_softmax 
is deprecated, use LlamaSampler instead" + ) def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_top_k is deprecated, use LlamaSampler instead" + ) def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_top_p is deprecated, use LlamaSampler instead" + ) def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_min_p is deprecated, use LlamaSampler instead" + ) def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): - raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_typical is deprecated, use LlamaSampler instead" + ) def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead") def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_grammar is deprecated, use LlamaSampler instead" + ) def sample_token_mirostat( self, @@ -399,7 +415,9 @@ def sample_token_mirostat( m: int, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_mirostat is deprecated, use LlamaSampler instead" + ) def sample_token_mirostat_v2( self, @@ -408,17 +426,25 @@ def sample_token_mirostat_v2( eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> 
int: - raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_mirostat_v2 is deprecated, use LlamaSampler instead" + ) def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_greedy is deprecated, use LlamaSampler instead" + ) def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token is deprecated, use LlamaSampler instead" + ) # Grammar def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "grammar_accept_token is deprecated, use LlamaSampler instead" + ) def reset_timings(self): llama_cpp.llama_perf_context_reset(self.ctx) @@ -602,16 +628,16 @@ def sample( logits_array: Optional[npt.NDArray[np.single]] = None, ): # This method is deprecated in favor of using LlamaSampler directly - raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "LlamaSamplingContext.sample is deprecated, use LlamaSampler instead" + ) def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): self.prev.append(id) class CustomSampler: - def __init__( - self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] - ): + def __init__(self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]): self.apply_func = apply_func def apply_wrapper( @@ -723,20 +749,20 @@ def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_grammar_lazy_patterns( - self, - model: LlamaModel, + self, + model: LlamaModel, grammar: 
LlamaGrammar, trigger_patterns: List[str], - trigger_tokens: List[int] + trigger_tokens: List[int], ): # Convert patterns to C array pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))() for i, pattern in enumerate(trigger_patterns): pattern_ptrs[i] = pattern.encode("utf-8") - + # Convert tokens to C array token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) - + sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns( model.vocab, grammar._grammar.encode("utf-8"), @@ -744,7 +770,7 @@ def add_grammar_lazy_patterns( pattern_ptrs, len(trigger_patterns), token_array, - len(trigger_tokens) + len(trigger_tokens), ) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) @@ -771,13 +797,13 @@ def add_dry( dry_base: float, dry_allowed_length: int, dry_penalty_last_n: int, - seq_breakers: List[str] + seq_breakers: List[str], ): # Convert seq_breakers to C array breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))() for i, breaker in enumerate(seq_breakers): breaker_ptrs[i] = breaker.encode("utf-8") - + sampler = llama_cpp.llama_sampler_init_dry( model.vocab, n_ctx_train, @@ -786,25 +812,19 @@ def add_dry( dry_allowed_length, dry_penalty_last_n, breaker_ptrs, - len(seq_breakers) + len(seq_breakers), ) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - def add_logit_bias( - self, - n_vocab: int, - logit_bias: Dict[int, float] - ): + def add_logit_bias(self, n_vocab: int, logit_bias: Dict[int, float]): # Convert logit_bias dict to C array bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))() for i, (token, bias) in enumerate(logit_bias.items()): bias_array[i].token = token bias_array[i].bias = bias - + sampler = llama_cpp.llama_sampler_init_logit_bias( - n_vocab, - len(logit_bias), - bias_array + n_vocab, len(logit_bias), bias_array ) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) @@ -838,15 +858,17 @@ def reset(self): def clone(self): # NOTE: Custom samplers cannot be cloned due to Python callback limitations if 
self.custom_samplers: - raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers") - + raise NotImplementedError( + "Cannot clone LlamaSampler that contains custom samplers" + ) + cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler) # Create a new wrapper around the cloned sampler new_sampler = LlamaSampler.__new__(LlamaSampler) new_sampler.sampler = cloned_sampler new_sampler.custom_samplers = [] new_sampler._exit_stack = ExitStack() - + def free_sampler(): if new_sampler.sampler is not None: llama_cpp.llama_sampler_free(new_sampler.sampler) diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 787b3f108..31d89d099 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -25,6 +25,7 @@ _last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0] + # typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); @llama_cpp.llama_log_callback def llama_log_callback( @@ -34,7 +35,9 @@ def llama_log_callback( ): # TODO: Correctly implement continue previous log global _last_log_level - log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level + log_level = ( + GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level + ) if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) _last_log_level = log_level diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 71d94ebd8..21a7430a0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -934,7 +934,8 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] + self._input_ids[:sample_idx], + self._scores[sample_idx - self.n_tokens, :], ): return tokens_or_none = yield token @@ -1157,9 +1158,9 @@ def _create_completion( bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() sep_token_id: int = 
self._model.token_sep() - prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix - middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix - suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix + prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix + middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix + suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix add_space_prefix: bool = ( self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" ) @@ -1315,7 +1316,7 @@ def logit_bias_processor( if seed is not None: self.set_seed(seed) else: - self.set_seed(random.Random(self._seed).randint(0, 2 ** 32)) + self.set_seed(random.Random(self._seed).randint(0, 2**32)) finish_reason = "length" multibyte_fix = 0 @@ -2056,7 +2057,10 @@ def create_chat_completion_openai_v1( stream = kwargs.get("stream", False) # type: ignore assert isinstance(stream, bool) if stream: - return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + return ( + ChatCompletionChunk(**chunk) + for chunk in self.create_chat_completion(*args, **kwargs) + ) # type: ignore else: return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore except ImportError: @@ -2318,7 +2322,11 @@ def from_pretrained( if additional_files: for additonal_file_name in additional_files: # find the additional shard file: - matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] + matching_additional_files = [ + file + for file in file_list + if fnmatch.fnmatch(file, additonal_file_name) + ] if len(matching_additional_files) == 0: raise ValueError( diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index e059e98e1..5220c7933 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -52,9 +52,9 @@ class LlamaRAMCache(BaseLlamaCache): def __init__(self, capacity_bytes: int = (2 << 30)): 
super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[ - Tuple[int, ...], "llama_cpp.llama.LlamaState" - ] = OrderedDict() + self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp.llama.LlamaState"] = ( + OrderedDict() + ) @property def cache_size(self): diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f738ab9bb..8e8ac7bb3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -286,11 +286,15 @@ def _convert_text_completion_logprobs_to_chat( } for top_token, top_logprob in top_logprobs.items() ], - } for (token, logprob, top_logprobs) in zip(logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"]) + } + for (token, logprob, top_logprobs) in zip( + logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"] + ) ], "refusal": None, } + def _convert_text_completion_to_chat( completion: llama_types.Completion, ) -> llama_types.ChatCompletion: @@ -307,7 +311,9 @@ def _convert_text_completion_to_chat( "role": "assistant", "content": completion["choices"][0]["text"], }, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "finish_reason": completion["choices"][0]["finish_reason"], } ], @@ -351,7 +357,9 @@ def _convert_text_completion_chunks_to_chat( if chunk["choices"][0]["finish_reason"] is None else {} ), - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "finish_reason": chunk["choices"][0]["finish_reason"], } ], @@ -368,7 +376,9 @@ def _convert_completion_to_chat( llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk] ]: if stream: - chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: 
ignore + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = ( + completion_or_chunks # type: ignore + ) return _convert_text_completion_chunks_to_chat(chunks) else: completion: llama_types.Completion = completion_or_chunks # type: ignore @@ -414,7 +424,9 @@ def _convert_completion_to_chat_function( } ], }, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "finish_reason": "tool_calls", } ], @@ -422,7 +434,9 @@ def _convert_completion_to_chat_function( } return chat_completion else: - chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = ( + completion_or_chunks # type: ignore + ) def _stream_response_to_function_stream( chunks: Iterator[llama_types.CreateCompletionStreamResponse], @@ -467,7 +481,9 @@ def _stream_response_to_function_stream( { "index": 0, "finish_reason": None, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": None, "content": None, @@ -504,7 +520,9 @@ def _stream_response_to_function_stream( { "index": 0, "finish_reason": None, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": None, "content": None, @@ -702,7 +720,7 @@ def chat_completion_handler( def hf_autotokenizer_to_chat_formatter( - pretrained_model_name_or_path: Union[str, os.PathLike[str]] + pretrained_model_name_or_path: Union[str, os.PathLike[str]], ) -> ChatFormatter: # https://huggingface.co/docs/transformers/main/chat_templating # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format @@ -727,7 
+745,7 @@ def format_autotokenizer( def hf_autotokenizer_to_chat_completion_handler( - pretrained_model_name_or_path: Union[str, os.PathLike[str]] + pretrained_model_name_or_path: Union[str, os.PathLike[str]], ) -> LlamaChatCompletionHandler: chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path) return chat_formatter_to_chat_completion_handler(chat_formatter) @@ -1552,9 +1570,9 @@ def prepare_messages_for_inference( message["name"] = f"functions.{message['name']}" # Function call requests by assistant if "function_call" in message: - message["function_call"][ - "name" - ] = f"functions.{message['function_call']['name']}" + message["function_call"]["name"] = ( + f"functions.{message['function_call']['name']}" + ) all_messages.append(message) all_messages.append( @@ -1632,7 +1650,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): logits_processor=logits_processor, grammar=grammar, ) - return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore + return _convert_completion_to_chat( + completion_or_completion_chunks, stream=stream + ) # type: ignore if function_call is None or ( isinstance(function_call, str) and function_call == "auto" @@ -1748,7 +1768,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): } ], }, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "finish_reason": "tool_calls", } ], @@ -1789,9 +1811,9 @@ def functionary_v1_v2_chat_handler( SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" tokenizer = llama.tokenizer_ - assert hasattr( - tokenizer, "hf_tokenizer" - ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + assert hasattr(tokenizer, "hf_tokenizer"), ( + "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + ) from transformers import AutoTokenizer if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: @@ -1941,9 +1963,9 @@ def prepare_messages_for_inference( message["name"] = f"functions.{message['name']}" # Function call requests by assistant if "function_call" in message: - message["function_call"][ - "name" - ] = f"functions.{message['function_call']['name']}" + message["function_call"]["name"] = ( + f"functions.{message['function_call']['name']}" + ) all_messages.append(message) if version == "v1": @@ -2005,7 +2027,9 @@ def prepare_messages_for_inference( completion_or_completion_chunks["choices"][0]["text"] = ( completion_or_completion_chunks["choices"][0]["text"].lstrip() ) - return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore + return _convert_completion_to_chat( + completion_or_completion_chunks, stream=stream + ) # type: ignore def get_grammar(function_call): function_body = None @@ -2160,7 +2184,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": None, "content": None, @@ -2262,7 +2288,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( 
+ chunk["choices"][0]["logprobs"] + ), "delta": { "role": "assistant", "content": None, @@ -2300,7 +2328,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": "assistant", "content": buffer.pop(0), @@ -2323,7 +2353,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": "assistant", "content": ( @@ -2409,7 +2441,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + chunk["choices"][0]["logprobs"] + ), "delta": { "role": None, "content": None, @@ -2643,7 +2677,9 @@ def generate_streaming(tools, functions, function_call, prompt): choices=[ { "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "message": { "role": "assistant", "content": None if content == "" else content, @@ -2716,20 +2752,20 @@ def _init_mtmd_context(self, llama_model: llama.Llama): with suppress_stdout_stderr(disable=self.verbose): # Get default parameters ctx_params = self._mtmd_cpp.mtmd_context_params_default() - ctx_params.use_gpu = True # TODO: Make this configurable + ctx_params.use_gpu = True # TODO: Make this configurable ctx_params.print_timings = self.verbose ctx_params.n_threads = llama_model.n_threads ctx_params.verbosity = 2 if self.verbose else 0 # GGML_LOG_LEVEL_INFO 
= 2 # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( - self.clip_model_path.encode(), - llama_model.model, - ctx_params + self.clip_model_path.encode(), llama_model.model, ctx_params ) if self.mtmd_ctx is None: - raise ValueError(f"Failed to load mtmd context from: {self.clip_model_path}") + raise ValueError( + f"Failed to load mtmd context from: {self.clip_model_path}" + ) # Check if vision is supported if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx): @@ -2756,12 +2792,12 @@ def _create_bitmap_from_bytes(self, image_bytes: bytes): bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf( self.mtmd_ctx, (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)), - len(image_bytes) + len(image_bytes), ) - + if bitmap is None: raise ValueError("Failed to create bitmap from image bytes") - + return bitmap def __call__( @@ -2820,10 +2856,10 @@ def __call__( trim_blocks=True, lstrip_blocks=True, ).from_string(self.CHAT_FORMAT) - + # Get the default media marker - media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8') - + media_marker = self._mtmd_cpp.mtmd_default_marker().decode("utf-8") + # Replace image URLs with media markers in the template text = template.render( messages=messages, @@ -2831,7 +2867,7 @@ def __call__( eos_token=llama.detokenize([llama.token_eos()]), bos_token=llama.detokenize([llama.token_bos()]), ) - + # Replace image URLs in text with media markers for image_url in image_urls: text = text.replace(image_url, media_marker) @@ -2851,7 +2887,7 @@ def __call__( # Create input text structure input_text = self._mtmd_cpp.mtmd_input_text() - input_text.text = text.encode('utf-8') + input_text.text = text.encode("utf-8") input_text.add_special = True input_text.parse_special = True @@ -2862,13 +2898,15 @@ def __call__( try: # Tokenize text and images together - bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps) + bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * 
len(bitmaps))( + *bitmaps + ) result = self._mtmd_cpp.mtmd_tokenize( self.mtmd_ctx, chunks, ctypes.byref(input_text), bitmap_array, - len(bitmaps) + len(bitmaps), ) if result != 0: @@ -2881,40 +2919,45 @@ def __call__( # Process each chunk n_past = llama_cpp.llama_pos(0) n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks) - + for i in range(n_chunks): chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i) if chunk is None: continue chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk) - + if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT: # Handle text chunk n_tokens_out = ctypes.c_size_t() tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text( chunk, ctypes.byref(n_tokens_out) ) - + if tokens_ptr and n_tokens_out.value > 0: # Convert ctypes array to Python list tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)] - + if llama.n_tokens + len(tokens) > llama.n_ctx(): raise ValueError( f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" ) llama.eval(tokens) - - elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]: + + elif chunk_type in [ + self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, + self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO, + ]: # Handle image/audio chunk using helper - chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk) - + chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens( + chunk + ) + if llama.n_tokens + chunk_n_tokens > llama.n_ctx(): raise ValueError( f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}" ) - + new_n_past = llama_cpp.llama_pos(0) result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( self.mtmd_ctx, @@ -2924,12 +2967,14 @@ def __call__( llama_cpp.llama_seq_id(0), llama.n_batch, False, # logits_last - ctypes.byref(new_n_past) + ctypes.byref(new_n_past), ) - + if result != 0: - raise ValueError(f"Failed to evaluate chunk: error code {result}") - + raise ValueError( + f"Failed to 
evaluate chunk: error code {result}" + ) + # Update llama's token count llama.n_tokens = new_n_past.value @@ -3019,7 +3064,7 @@ def __call__( grammar=grammar, logit_bias=logit_bias, ) - + if tool is not None: tool_name = tool["function"]["name"] return _convert_completion_to_chat_function( @@ -3032,10 +3077,12 @@ def _load_image(image_url: str) -> bytes: # TODO: Add Pillow support for other image formats beyond (jpg, png) if image_url.startswith("data:"): import base64 + image_bytes = base64.b64decode(image_url.split(",")[1]) return image_bytes else: import urllib.request + with urllib.request.urlopen(image_url) as f: image_bytes = f.read() return image_bytes @@ -3062,6 +3109,7 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @staticmethod def split_text_on_image_urls(text: str, image_urls: List[str]): """This method is no longer used in the new implementation.""" + def find_first(s: str, substrs: List[str]): for i, substr in enumerate(substrs): pos = s.find(substr) @@ -3443,7 +3491,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): "{% endif %}" "{% endif %}" "{% endfor %}" - "{% for content in message['content'] %}" "{% if content.type == 'text' %}" "{{ content.text }}" @@ -3465,8 +3512,8 @@ class Qwen25VLChatHandler(Llava15ChatHandler): DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." 
CHAT_FORMAT = ( - #"{% set image_count = namespace(value=0) %}" - #"{% set video_count = namespace(value=0) %}" + # "{% set image_count = namespace(value=0) %}" + # "{% set video_count = namespace(value=0) %}" "{% for message in messages %}" "{% if loop.first and message['role'] != 'system' %}" "<|im_start|>system\n" @@ -3483,7 +3530,7 @@ class Qwen25VLChatHandler(Llava15ChatHandler): "{% else %}" "{{ content.image_url.url }}" "{% endif %}" - #"{% set image_count.value = image_count.value + 1 %}" + # "{% set image_count.value = image_count.value + 1 %}" "{% elif content['type'] == 'text' %}" "{{ content['text'] }}" "{% endif %}" @@ -3495,25 +3542,28 @@ class Qwen25VLChatHandler(Llava15ChatHandler): ) def __call__(self, **kwargs): - llama = kwargs['llama'] + llama = kwargs["llama"] # Clear state for multiple runs llama.reset() llama._ctx.kv_cache_clear() llama.n_tokens = 0 - if hasattr(llama, 'input_ids'): + if hasattr(llama, "input_ids"): llama.input_ids.fill(0) # Clear any handler state - if hasattr(self, '_last_image_embed'): + if hasattr(self, "_last_image_embed"): self._last_image_embed = None self._last_image_hash = None if self.verbose: - messages = kwargs.get('messages', []) + messages = kwargs.get("messages", []) image_count = len(self.get_image_urls(messages)) - print(f"Minimal - Cleared state, processing {image_count} images", file=sys.stderr) + print( + f"Minimal - Cleared state, processing {image_count} images", + file=sys.stderr, + ) # Use parent implementation return super().__call__(**kwargs) @@ -3636,7 +3686,9 @@ def chatml_function_calling( stop = ( [stop, "<|im_end|>"] if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + else stop + ["<|im_end|>"] + if stop + else ["<|im_end|>"] ) # Case 1: No tool choice by user @@ -3738,7 +3790,7 @@ def chatml_function_calling( # Case 3: Automatic tool choice assert isinstance(tool_choice, str) and tool_choice == "auto" function_names = " | ".join( - 
[f'''"functions.{tool['function']['name']}:"''' for tool in tools] + [f'''"functions.{tool["function"]["name"]}:"''' for tool in tools] ) initial_gbnf_tool_grammar = ( """root ::= functions | "message:"\n""" @@ -3914,7 +3966,9 @@ def chatml_function_calling( { "finish_reason": "tool_calls", "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), "message": { "role": "assistant", "content": None, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 711d42a6a..f13af67f3 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -33,7 +33,11 @@ # Specify the base name of the shared library to load _lib_base_name = "llama" _override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH") -_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path) +_base_path = ( + pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + if _override_base_path is None + else pathlib.Path(_override_base_path) +) # Load the library _lib = load_shared_library(_lib_base_name, _base_path) @@ -559,6 +563,7 @@ class llama_token_data_array(ctypes.Structure): # typedef struct llama_batch { # int32_t n_tokens; + # llama_token * token; # float * embd; # llama_pos * pos; @@ -688,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure): # // override key-value pairs of the model meta data # const struct llama_model_kv_override * kv_overrides; + # // Keep the booleans together to avoid misalignment during copy-by-value. 
# bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible @@ -716,7 +722,9 @@ class llama_model_params(ctypes.Structure): if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused - tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused + tensor_buft_overrides: CtypesArray[ + llama_model_tensor_buft_override + ] # NOTE: unused n_gpu_layers: int split_mode: int main_gpu: int @@ -731,8 +739,8 @@ class llama_model_params(ctypes.Structure): use_extra_bufts: bool _fields_ = [ - ("devices", ctypes.c_void_p), # NOTE: unnused - ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused + ("devices", ctypes.c_void_p), # NOTE: unnused + ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -784,6 +792,7 @@ class llama_model_params(ctypes.Structure): # ggml_abort_callback abort_callback; # void * abort_callback_data; + # // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. # bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU @@ -1137,8 +1146,7 @@ def llama_backend_free(): [ctypes.c_int], None, ) -def llama_numa_init(numa: int, /): - ... +def llama_numa_init(numa: int, /): ... # // Optional: an auto threadpool gets created in ggml if not passed explicitly @@ -1164,8 +1172,7 @@ def llama_numa_init(numa: int, /): ) def llama_load_model_from_file( path_model: bytes, params: llama_model_params, / -) -> Optional[llama_model_p]: - ... +) -> Optional[llama_model_p]: ... # // Load the model from a file @@ -1230,8 +1237,7 @@ def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /): [llama_model_p_ctypes], None, ) -def llama_free_model(model: llama_model_p, /): - ... +def llama_free_model(model: llama_model_p, /): ... 
# LLAMA_API void llama_model_free(struct llama_model * model); @@ -1240,8 +1246,7 @@ def llama_free_model(model: llama_model_p, /): [llama_model_p_ctypes], None, ) -def llama_model_free(model: llama_model_p, /): - ... +def llama_model_free(model: llama_model_p, /): ... # LLAMA_API struct llama_context * llama_init_from_model( @@ -1254,8 +1259,7 @@ def llama_model_free(model: llama_model_p, /): ) def llama_init_from_model( model: llama_model_p, params: llama_context_params, / -) -> Optional[llama_context_p]: - ... +) -> Optional[llama_context_p]: ... # DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model( @@ -1269,8 +1273,7 @@ def llama_init_from_model( ) def llama_new_context_with_model( model: llama_model_p, params: llama_context_params, / -) -> Optional[llama_context_p]: - ... +) -> Optional[llama_context_p]: ... # // Frees all allocated memory @@ -1291,104 +1294,87 @@ def llama_free(ctx: llama_context_p, /): [], ctypes.c_int64, ) -def llama_time_us() -> int: - ... +def llama_time_us() -> int: ... # LLAMA_API size_t llama_max_devices(void); @ctypes_function("llama_max_devices", [], ctypes.c_size_t) -def llama_max_devices() -> int: - ... +def llama_max_devices() -> int: ... # LLAMA_API size_t llama_max_parallel_sequences(void); @ctypes_function("llama_max_parallel_sequences", [], ctypes.c_size_t) -def llama_max_parallel_sequences() -> int: - ... +def llama_max_parallel_sequences() -> int: ... # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) -def llama_supports_mmap() -> bool: - ... +def llama_supports_mmap() -> bool: ... # LLAMA_API bool llama_supports_mlock (void); @ctypes_function("llama_supports_mlock", [], ctypes.c_bool) -def llama_supports_mlock() -> bool: - ... +def llama_supports_mlock() -> bool: ... # LLAMA_API bool llama_supports_gpu_offload(void); @ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool) -def llama_supports_gpu_offload() -> bool: - ... 
+def llama_supports_gpu_offload() -> bool: ... # LLAMA_API bool llama_supports_rpc (void); @ctypes_function("llama_supports_rpc", [], ctypes.c_bool) -def llama_supports_rpc() -> bool: - ... +def llama_supports_rpc() -> bool: ... # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_ctx(ctx: llama_context_p, /) -> int: - ... +def llama_n_ctx(ctx: llama_context_p, /) -> int: ... # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_batch(ctx: llama_context_p, /) -> int: - ... +def llama_n_batch(ctx: llama_context_p, /) -> int: ... # LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); @ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_ubatch(ctx: llama_context_p, /) -> int: - ... +def llama_n_ubatch(ctx: llama_context_p, /) -> int: ... # LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); @ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32) -def llama_n_seq_max(ctx: llama_context_p, /) -> int: - ... +def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_ctx_train(model: llama_model_p, /) -> int: - ... +def llama_n_ctx_train(model: llama_model_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead"); @ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_embd(model: llama_model_p, /) -> int: - ... +def llama_n_embd(model: llama_model_p, /) -> int: ... 
# DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead"); @ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_layer(model: llama_model_p, /) -> int: - ... +def llama_n_layer(model: llama_model_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead"); @ctypes_function("llama_n_head", [llama_model_p_ctypes], ctypes.c_int32) -def llama_n_head(model: llama_model_p, /) -> int: - ... +def llama_n_head(model: llama_model_p, /) -> int: ... # DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); @ctypes_function("llama_n_vocab", [llama_vocab_p_ctypes], ctypes.c_int32) -def llama_n_vocab(model: llama_vocab_p, /) -> int: - ... +def llama_n_vocab(model: llama_vocab_p, /) -> int: ... # LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) -def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: - ... +def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... # LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); @@ -1400,8 +1386,7 @@ def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]: # LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) -def llama_pooling_type(ctx: llama_context_p, /) -> int: - ... +def llama_pooling_type(ctx: llama_context_p, /) -> int: ... 
# DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); @@ -1417,57 +1402,50 @@ def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) -def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: - ... +def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: ... # LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); @ctypes_function("llama_model_rope_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_model_rope_type(model: llama_model_p, /) -> int: - ... +def llama_model_rope_type(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); @ctypes_function("llama_model_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_ctx_train(model: llama_model_p, /) -> int: - ... +def llama_model_n_ctx_train(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); @ctypes_function("llama_model_n_embd", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_embd(model: llama_model_p, /) -> int: - ... +def llama_model_n_embd(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_layer(model: llama_model_p, /) -> int: - ... +def llama_model_n_layer(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_head(model: llama_model_p, /) -> int: - ... 
+def llama_model_n_head(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); @ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_head_kv(model: llama_model_p, /) -> int: - ... +def llama_model_n_head_kv(model: llama_model_p, /) -> int: ... # LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); @ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32) -def llama_model_n_swa(model: llama_model_p, /) -> int: - ... +def llama_model_n_swa(model: llama_model_p, /) -> int: ... # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); -@ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) -def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: - ... +@ctypes_function( + "llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float +) +def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: ... # // Returns the number of classifier outputs (only valid for classifier models) @@ -1481,7 +1459,9 @@ def llama_model_n_cls_out(model: llama_model_p, /) -> int: # // Returns label of classifier output by index ( Optional[bytes]: """Returns label of classifier output by index. Returns None if no label provided""" ... @@ -1489,14 +1469,12 @@ def llama_model_cls_label(model: llama_model_p, i: int, /) -> Optional[bytes]: # LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); @ctypes_function("llama_vocab_type", [llama_vocab_p_ctypes], ctypes.c_int) -def llama_vocab_type(vocab: llama_vocab_p, /) -> int: - ... +def llama_vocab_type(vocab: llama_vocab_p, /) -> int: ... 
# LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab); @ctypes_function("llama_vocab_n_tokens", [llama_vocab_p_ctypes], ctypes.c_int32) -def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int: - ... +def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int: ... # // Functions to access the model's GGUF metadata scalar values @@ -1611,8 +1589,14 @@ def llama_model_size(model: llama_model_p, /) -> int: # // Get the default chat template. Returns nullptr if not available # // If name is NULL, returns the default chat template # LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); -@ctypes_function("llama_model_chat_template", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_char_p) -def llama_model_chat_template(model: llama_model_p, name: Optional[bytes], /) -> Optional[bytes]: +@ctypes_function( + "llama_model_chat_template", + [llama_model_p_ctypes, ctypes.c_char_p], + ctypes.c_char_p, +) +def llama_model_chat_template( + model: llama_model_p, name: Optional[bytes], / +) -> Optional[bytes]: """Get the default chat template. Returns None if not available If name is None, returns the default chat template""" ... @@ -1699,6 +1683,7 @@ def llama_model_quantize( # // Adapters # // + # // Load a LoRA adapter from file # LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( # struct llama_model * model, @@ -1710,8 +1695,7 @@ def llama_model_quantize( ) def llama_adapter_lora_init( model: llama_model_p, path_lora: bytes, / -) -> Optional[llama_adapter_lora_p]: - ... +) -> Optional[llama_adapter_lora_p]: ... # // Manually free a LoRA adapter @@ -1722,8 +1706,7 @@ def llama_adapter_lora_init( [llama_adapter_lora_p_ctypes], None, ) -def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): - ... +def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ... # // The following functions operate on a llama_context, hence the naming: llama_verb_... 
@@ -1825,6 +1808,7 @@ def llama_apply_adapter_cvec( # // Memory # // + # // Clear the memory contents # // If data == true, the data buffers will also be cleared together with the metadata # LLAMA_API void llama_memory_clear( @@ -1916,9 +1900,7 @@ def llama_memory_seq_cp( # LLAMA_API void llama_memory_seq_keep( # llama_memory_t mem, # llama_seq_id seq_id); -@ctypes_function( - "llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None -) +@ctypes_function("llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None) def llama_memory_seq_keep(mem: llama_memory_t, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" ... @@ -2040,13 +2022,12 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: # // KV cache for self-attention (TODO: deprecate in favor of llama_memory) # // + # // Returns the number of tokens in the KV cache (slow, use only for debug) # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times # DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), # "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 -) +@ctypes_function("llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32) def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" ... @@ -2055,9 +2036,7 @@ def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: # // Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them) # DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), # "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 -) +@ctypes_function("llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32) def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: """Returns the number of used KV cells (DEPRECATED)""" ... @@ -2067,9 +2046,7 @@ def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: # DEPRECATED(LLAMA_API void llama_kv_self_clear( # struct llama_context * ctx), # "Use llama_memory_clear() instead"); -@ctypes_function( - "llama_kv_self_clear", [llama_context_p_ctypes], None -) +@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None) def llama_kv_self_clear(ctx: llama_context_p, /): """Clear the KV cache (DEPRECATED)""" ... @@ -2146,9 +2123,7 @@ def llama_kv_self_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id), # "Use llama_memory_seq_keep() instead"); -@ctypes_function( - "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None -) +@ctypes_function("llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None) def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): """Keep only specified sequence in KV cache (DEPRECATED)""" ... @@ -2292,6 +2267,7 @@ def llama_kv_self_update(ctx: llama_context_p, /): # // State / sessions # // + # // Returns the *actual* size in bytes of the state # // (logits, embedding and memory) # // Only use when saving the state, not when restoring it, otherwise the size may be too small. @@ -2420,8 +2396,7 @@ def llama_state_load_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> bool: - ... +) -> bool: ... 
# LLAMA_API DEPRECATED(bool llama_load_session_file( @@ -2449,8 +2424,7 @@ def llama_load_session_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> bool: - ... +) -> bool: ... # LLAMA_API bool llama_state_save_file( @@ -2474,8 +2448,7 @@ def llama_state_save_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> bool: - ... +) -> bool: ... # LLAMA_API DEPRECATED(bool llama_save_session_file( @@ -2500,8 +2473,7 @@ def llama_save_session_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> bool: - ... +) -> bool: ... # // Get the exact size needed to copy the state of a single sequence @@ -2599,8 +2571,7 @@ def llama_state_seq_save_file( tokens: CtypesArray[llama_token], n_token_count: Union[ctypes.c_size_t, int], /, -) -> int: - ... +) -> int: ... # LLAMA_API size_t llama_state_seq_load_file( @@ -2630,14 +2601,14 @@ def llama_state_seq_load_file( n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, -) -> int: - ... +) -> int: ... # // # // Decoding # // + # // Return batch for single sequence of tokens # // The sequence ID will be fixed to 0 # // The position of the tokens will be tracked automatically by llama_decode @@ -2947,14 +2918,14 @@ def llama_get_embeddings_seq( # // Vocab # // + # LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token); @ctypes_function( "llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p ) def llama_vocab_get_text( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> bytes: - ... +) -> bytes: ... # LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token); @@ -2963,8 +2934,7 @@ def llama_vocab_get_text( ) def llama_vocab_get_score( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> float: - ... +) -> float: ... 
# LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token); @@ -2973,8 +2943,7 @@ def llama_vocab_get_score( ) def llama_vocab_get_attr( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> int: - ... +) -> int: ... # // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) @@ -3055,8 +3024,7 @@ def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: - ... +def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: ... # LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab); @@ -3065,8 +3033,7 @@ def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: - ... +def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: ... # LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab); @@ -3075,8 +3042,7 @@ def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: - ... +def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: ... # LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab); @@ -3085,8 +3051,7 @@ def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ... # LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab); @@ -3095,8 +3060,7 @@ def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ... 
# LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab); @@ -3105,8 +3069,7 @@ def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ... # LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab); @@ -3115,8 +3078,7 @@ def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ... # LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab); @@ -3125,8 +3087,7 @@ def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ... # LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab); @@ -3135,8 +3096,7 @@ def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED functions @@ -3148,8 +3108,7 @@ def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ) def llama_token_get_text( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> bytes: - ... +) -> bytes: ... # DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead"); @@ -3160,8 +3119,8 @@ def llama_token_get_text( ) def llama_token_get_score( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> float: - ... +) -> float: ... 
+ # DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead"); @ctypes_function( @@ -3171,8 +3130,8 @@ def llama_token_get_score( ) def llama_token_get_attr( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> int: - ... +) -> int: ... + # DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead"); @ctypes_function( @@ -3182,8 +3141,8 @@ def llama_token_get_attr( ) def llama_token_is_eog( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> bool: - ... +) -> bool: ... + # DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead"); @ctypes_function( @@ -3193,8 +3152,8 @@ def llama_token_is_eog( ) def llama_token_is_control( vocab: llama_vocab_p, token: Union[llama_token, int], / -) -> bool: - ... +) -> bool: ... + # DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead"); @ctypes_function( @@ -3202,8 +3161,8 @@ def llama_token_is_control( [llama_vocab_p_ctypes], llama_token, ) -def llama_token_bos(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_bos(vocab: llama_vocab_p, /) -> int: ... + # DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead"); @ctypes_function( @@ -3211,8 +3170,8 @@ def llama_token_bos(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_eos(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_eos(vocab: llama_vocab_p, /) -> int: ... 
+ # DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead"); @ctypes_function( @@ -3220,8 +3179,8 @@ def llama_token_eos(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_eot(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_eot(vocab: llama_vocab_p, /) -> int: ... + # DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead"); @ctypes_function( @@ -3229,8 +3188,8 @@ def llama_token_eot(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_cls(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_cls(vocab: llama_vocab_p, /) -> int: ... + # DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead"); @ctypes_function( @@ -3238,8 +3197,7 @@ def llama_token_cls(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_sep(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_sep(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead"); @@ -3248,8 +3206,7 @@ def llama_token_sep(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_nl(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_nl(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead"); @@ -3258,8 +3215,7 @@ def llama_token_nl(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_pad(vocab: llama_vocab_p, /) -> int: - ... +def llama_token_pad(vocab: llama_vocab_p, /) -> int: ... 
# DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead"); @@ -3268,8 +3224,8 @@ def llama_token_pad(vocab: llama_vocab_p, /) -> int: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: - ... +def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: ... + # DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead"); @ctypes_function( @@ -3277,8 +3233,7 @@ def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], ctypes.c_bool, ) -def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: - ... +def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead"); @@ -3287,8 +3242,8 @@ def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ... + # DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead"); @ctypes_function( @@ -3296,8 +3251,8 @@ def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ... + # DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead"); @ctypes_function( @@ -3305,8 +3260,8 @@ def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ... 
+ # DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead"); @ctypes_function( @@ -3314,8 +3269,8 @@ def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ... + # DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead"); @ctypes_function( @@ -3323,8 +3278,8 @@ def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ... + # DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead"); @ctypes_function( @@ -3332,8 +3287,8 @@ def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ... + # // CLS is equivalent to BOS # DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification @@ -3343,8 +3298,7 @@ def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: [llama_vocab_p_ctypes], llama_token, ) -def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: - ... +def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: ... # // @@ -3353,6 +3307,7 @@ def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: # // The API is thread-safe. # // + # /// @details Convert the provided text into tokens. # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. 
# /// @return Returns the number of tokens on success, no more than n_tokens_max @@ -3512,6 +3467,7 @@ def llama_detokenize( # // Chat templates # // + # /// Apply chat template. Inspired by hf apply_chat_template() on python. # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" # /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template @@ -3535,9 +3491,9 @@ def llama_detokenize( ctypes.c_char_p, # tmpl ctypes.POINTER(llama_chat_message), # chat ctypes.c_size_t, # n_msg - ctypes.c_bool, # add_ass (added) + ctypes.c_bool, # add_ass (added) ctypes.c_char_p, # buf - ctypes.c_int32, # length + ctypes.c_int32, # length ], ctypes.c_int32, ) @@ -3611,11 +3567,11 @@ def llama_chat_builtin_templates( # struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL # void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL + # // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph # //void (*apply_ggml) (struct llama_sampler * smpl, ...); # }; -class llama_sampler_i(ctypes.Structure): - ... +class llama_sampler_i(ctypes.Structure): ... # struct llama_sampler { @@ -3662,8 +3618,7 @@ class llama_sampler(ctypes.Structure): ) def llama_sampler_init( iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); @@ -3672,8 +3627,7 @@ def llama_sampler_init( [llama_sampler_p_ctypes], ctypes.c_char_p, ) -def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: - ... +def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: ... 
# LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token); @@ -3682,8 +3636,7 @@ def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: [llama_sampler_p_ctypes, llama_token], None, ) -def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /): - ... +def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /): ... # LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p); @@ -3694,8 +3647,7 @@ def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], ) def llama_sampler_apply( smpl: llama_sampler_p, cur_p: CtypesArray[llama_token_data_array], / -): - ... +): ... # LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl); @@ -3704,8 +3656,7 @@ def llama_sampler_apply( [llama_sampler_p_ctypes], None, ) -def llama_sampler_reset(smpl: llama_sampler_p, /): - ... +def llama_sampler_reset(smpl: llama_sampler_p, /): ... # LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl); @@ -3714,8 +3665,7 @@ def llama_sampler_reset(smpl: llama_sampler_p, /): [llama_sampler_p_ctypes], llama_sampler_p_ctypes, ) -def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: - ... +def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: ... # // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add) @@ -3725,21 +3675,22 @@ def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: [llama_sampler_p_ctypes], None, ) -def llama_sampler_free(smpl: llama_sampler_p, /): - ... +def llama_sampler_free(smpl: llama_sampler_p, /): ... 
# // llama_sampler_chain # // a type of llama_sampler that can chain multiple samplers one after another + # LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params); @ctypes_function( "llama_sampler_chain_init", [llama_sampler_chain_params], llama_sampler_p_ctypes, ) -def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sampler_p: - ... +def llama_sampler_chain_init( + params: llama_sampler_chain_params, / +) -> llama_sampler_p: ... # // important: takes ownership of the sampler object and will free it when llama_sampler_free is called @@ -3749,8 +3700,7 @@ def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sam [llama_sampler_p_ctypes, llama_sampler_p_ctypes], None, ) -def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): - ... +def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): ... # LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); @@ -3761,8 +3711,7 @@ def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): ) def llama_sampler_chain_get( chain: llama_sampler_p, i: Union[ctypes.c_int32, int], / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); @@ -3771,8 +3720,7 @@ def llama_sampler_chain_get( [llama_sampler_p_ctypes], ctypes.c_int, ) -def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: - ... +def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: ... # // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed @@ -3784,22 +3732,20 @@ def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: ) def llama_sampler_chain_remove( chain: llama_sampler_p, i: Union[ctypes.c_int32, int], / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... 
# // available samplers: + # LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes) -def llama_sampler_init_greedy() -> llama_sampler_p: - ... +def llama_sampler_init_greedy() -> llama_sampler_p: ... # LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes) -def llama_sampler_init_dist(seed: int) -> llama_sampler_p: - ... +def llama_sampler_init_dist(seed: int) -> llama_sampler_p: ... # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. @@ -3807,16 +3753,14 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), # "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) -def llama_sampler_init_softmax() -> llama_sampler_p: - ... +def llama_sampler_init_softmax() -> llama_sampler_p: ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 # /// Setting k <= 0 makes this a noop # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); @ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes) -def llama_sampler_init_top_k(k: int) -> llama_sampler_p: - ... +def llama_sampler_init_top_k(k: int) -> llama_sampler_p: ... # /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -3826,8 +3770,7 @@ def llama_sampler_init_top_k(k: int) -> llama_sampler_p: [ctypes.c_float, ctypes.c_size_t], llama_sampler_p_ctypes, ) -def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: - ... 
+def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: ... # /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841 @@ -3837,8 +3780,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: [ctypes.c_float, ctypes.c_size_t], llama_sampler_p_ctypes, ) -def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: - ... +def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: ... # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. @@ -3848,15 +3790,13 @@ def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: [ctypes.c_float, ctypes.c_size_t], llama_sampler_p_ctypes, ) -def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: - ... +def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: ... # /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf # LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); @ctypes_function("llama_sampler_init_temp", [ctypes.c_float], llama_sampler_p_ctypes) -def llama_sampler_init_temp(t: float) -> llama_sampler_p: - ... +def llama_sampler_init_temp(t: float) -> llama_sampler_p: ... # /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. @@ -3868,8 +3808,7 @@ def llama_sampler_init_temp(t: float) -> llama_sampler_p: ) def llama_sampler_init_temp_ext( t: float, delta: float, exponent: float -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 @@ -3881,8 +3820,7 @@ def llama_sampler_init_temp_ext( ) def llama_sampler_init_xtc( p: float, t: float, min_keep: int, seed: int, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... 
# /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641 @@ -3892,8 +3830,7 @@ def llama_sampler_init_xtc( [ctypes.c_float], llama_sampler_p_ctypes, ) -def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: - ... +def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: ... # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -3910,8 +3847,7 @@ def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: ) def llama_sampler_init_mirostat( n_vocab: int, seed: int, tau: float, eta: float, m: int, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -3926,8 +3862,7 @@ def llama_sampler_init_mirostat( ) def llama_sampler_init_mirostat_v2( seed: int, tau: float, eta: float, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details Intializes a GBNF grammar, see grammars/README.md for details. @@ -3942,8 +3877,7 @@ def llama_sampler_init_mirostat_v2( ) def llama_sampler_init_grammar( vocab: llama_vocab_p, grammar_str: bytes, grammar_root: bytes, / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy( @@ -3977,8 +3911,7 @@ def llama_sampler_init_grammar_lazy( trigger_tokens: CtypesArray[llama_token], num_trigger_tokens: int, /, -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 @@ -4012,8 +3945,7 @@ def llama_sampler_init_grammar_lazy_patterns( trigger_tokens: CtypesArray[llama_token], num_trigger_tokens: int, /, -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... 
# /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. @@ -4033,8 +3965,7 @@ def llama_sampler_init_penalties( penalty_freq: float, penalty_present: float, /, -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 @@ -4071,8 +4002,7 @@ def llama_sampler_init_dry( seq_breakers, num_breakers: int, /, -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( @@ -4086,8 +4016,7 @@ def llama_sampler_init_dry( ) def llama_sampler_init_logit_bias( n_vocab: int, n_logit_bias: int, logit_bias: CtypesArray[llama_logit_bias], / -) -> llama_sampler_p: - ... +) -> llama_sampler_p: ... # // this sampler is meant to be used for fill-in-the-middle infilling @@ -4097,8 +4026,7 @@ def llama_sampler_init_logit_bias( [llama_vocab_p_ctypes], llama_sampler_p_ctypes, ) -def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: - ... +def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: ... # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise @@ -4108,8 +4036,7 @@ def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: [llama_sampler_p_ctypes], ctypes.c_uint32, ) -def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: - ... +def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: ... # /// @details Sample and accept a token from the idx-th output of the last evaluation @@ -4121,14 +4048,14 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: ) def llama_sampler_sample( smpl: llama_sampler_p, ctx: llama_context_p, idx: int, / -) -> int: - ... +) -> int: ... 
# // # // Model split # // + # /// @details Build a split GGUF final path for this chunk. # LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); @ctypes_function( @@ -4170,8 +4097,7 @@ def llama_split_prefix( # // Print system information # LLAMA_API const char * llama_print_system_info(void); @ctypes_function("llama_print_system_info", [], ctypes.c_char_p) -def llama_print_system_info() -> bytes: - ... +def llama_print_system_info() -> bytes: ... # // Set callback for all future logging events. @@ -4203,6 +4129,7 @@ def llama_log_set( # double t_p_eval_ms; # double t_eval_ms; + # int32_t n_p_eval; # int32_t n_eval; # int32_t n_reused; // number of times a ggml compute graph had been reused @@ -4222,6 +4149,7 @@ class llama_perf_context_data(ctypes.Structure): # struct llama_perf_sampler_data { # double t_sample_ms; + # int32_t n_sample; # }; class llama_perf_sampler_data(ctypes.Structure): @@ -4237,8 +4165,7 @@ class llama_perf_sampler_data(ctypes.Structure): [llama_context_p_ctypes], llama_perf_context_data, ) -def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: - ... +def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: ... # LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); @@ -4247,8 +4174,7 @@ def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: [llama_context_p_ctypes], None, ) -def llama_perf_context_print(ctx: llama_context_p, /): - ... +def llama_perf_context_print(ctx: llama_context_p, /): ... # LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); @@ -4257,8 +4183,7 @@ def llama_perf_context_print(ctx: llama_context_p, /): [llama_context_p_ctypes], None, ) -def llama_perf_context_reset(ctx: llama_context_p, /): - ... +def llama_perf_context_reset(ctx: llama_context_p, /): ... 
# // NOTE: the following work only with samplers constructed via llama_sampler_chain_init @@ -4268,8 +4193,7 @@ def llama_perf_context_reset(ctx: llama_context_p, /): [llama_sampler_p_ctypes], llama_perf_sampler_data, ) -def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: - ... +def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: ... # LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); @@ -4278,8 +4202,7 @@ def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: [llama_sampler_p_ctypes], None, ) -def llama_perf_sampler_print(chain: llama_sampler_p, /): - ... +def llama_perf_sampler_print(chain: llama_sampler_p, /): ... # LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); @@ -4288,8 +4211,7 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /): [llama_sampler_p_ctypes], None, ) -def llama_perf_sampler_reset(chain: llama_sampler_p, /): - ... +def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... # // @@ -4298,7 +4220,10 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): # // function that returns whether or not a given tensor contains trainable parameters # typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); -llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) +llama_opt_param_filter = ctypes.CFUNCTYPE( + ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p +) + # // always returns true # LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); @@ -4307,8 +4232,9 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): [ctypes.c_void_p, ctypes.c_void_p], ctypes.c_bool, ) -def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /) -> bool: - ... +def llama_opt_param_filter_all( + tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, / +) -> bool: ... 
# struct llama_opt_params { @@ -4317,6 +4243,7 @@ def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_ # llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters # void * param_filter_ud; // userdata for determining which tensors contain trainable parameters + # ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters # void * get_opt_pars_ud; // userdata for calculating optimizer parameters # }; @@ -4325,7 +4252,10 @@ class llama_opt_params(ctypes.Structure): ("n_ctx_train", ctypes.c_uint32), ("param_filter", llama_opt_param_filter), ("param_filter_ud", ctypes.c_void_p), - ("get_opt_pars", ctypes.c_void_p), # ggml_opt_get_optimizer_params - not implemented here + ( + "get_opt_pars", + ctypes.c_void_p, + ), # ggml_opt_get_optimizer_params - not implemented here ("get_opt_pars_ud", ctypes.c_void_p), ] @@ -4336,8 +4266,9 @@ class llama_opt_params(ctypes.Structure): [llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params], None, ) -def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /): - ... +def llama_opt_init( + lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, / +): ... # LLAMA_API void llama_opt_epoch( @@ -4353,7 +4284,7 @@ def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: lla [ llama_context_p_ctypes, ctypes.c_void_p, # ggml_opt_dataset_t - ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_void_p, # ggml_opt_result_t ctypes.c_void_p, # ggml_opt_result_t ctypes.c_int64, ctypes.c_void_p, # ggml_opt_epoch_callback @@ -4370,5 +4301,4 @@ def llama_opt_epoch( callback_train: ctypes.c_void_p, callback_eval: ctypes.c_void_p, /, -): - ... +): ... 
diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index b95c77ab5..ba34dda83 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -297,7 +297,7 @@ def opt_repetitions(up_to_n, prefix_with_sep=False): if max_items is not None: result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0) else: - item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})' + item_operator = f"({separator_rule + ' ' if separator_rule else ''}{item_rule})" if min_items == 0 and separator_rule: result = f"({item_rule} {item_operator}*)?" @@ -450,9 +450,9 @@ def visit(n: dict): ref = n.get("$ref") if ref is not None and ref not in self._refs: if ref.startswith("https://"): - assert ( - self._allow_fetch - ), "Fetching remote schemas is not allowed (use --allow-fetch for force)" + assert self._allow_fetch, ( + "Fetching remote schemas is not allowed (use --allow-fetch for force)" + ) import requests frag_split = ref.split("#") @@ -475,9 +475,9 @@ def visit(n: dict): raise ValueError(f"Unsupported ref {ref}") for sel in ref.split("#")[-1].split("/")[1:]: - assert ( - target is not None and sel in target - ), f"Error resolving ref {ref}: {sel} not in {target}" + assert target is not None and sel in target, ( + f"Error resolving ref {ref}: {sel} not in {target}" + ) target = target[sel] self._refs[ref] = target @@ -492,7 +492,7 @@ def visit(n: dict): def _generate_union_rule(self, name, alt_schemas): return " | ".join( ( - self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}') + self.visit(alt_schema, f"{name}{'-' if name else 'alternative-'}{i}") for i, alt_schema in enumerate(alt_schemas) ) ) @@ -510,9 +510,9 @@ def _visit_pattern(self, pattern, name): we define sub-rules to keep the output lean. 
""" - assert pattern.startswith("^") and pattern.endswith( - "$" - ), 'Pattern must start with "^" and end with "$"' + assert pattern.startswith("^") and pattern.endswith("$"), ( + 'Pattern must start with "^" and end with "$"' + ) pattern = pattern[1:-1] sub_rule_ids = {} @@ -566,15 +566,15 @@ def join_seq(): elif c == "(": i += 1 if i < length: - assert ( - pattern[i] != "?" - ), f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' + assert pattern[i] != "?", ( + f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' + ) seq.append((f"({to_rule(transform())})", False)) elif c == ")": i += 1 - assert ( - start > 0 and pattern[start - 1] == "(" - ), f"Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}" + assert start > 0 and pattern[start - 1] == "(", ( + f"Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}" + ) return join_seq() elif c == "[": square_brackets = c @@ -586,9 +586,9 @@ def join_seq(): else: square_brackets += pattern[i] i += 1 - assert ( - i < length - ), f"Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}" + assert i < length, ( + f"Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}" + ) square_brackets += "]" i += 1 seq.append((square_brackets, False)) @@ -604,9 +604,9 @@ def join_seq(): while i < length and pattern[i] != "}": curly_brackets += pattern[i] i += 1 - assert ( - i < length - ), f"Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}" + assert i < length, ( + f"Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}" + ) curly_brackets += "}" i += 1 nums = [s.strip() for s in curly_brackets[1:-1].split(",")] @@ -777,13 +777,13 @@ def add_component(comp_schema, is_required): rule_name, '"[" space ' + ' "," space '.join( - self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') + self.visit(item, f"{name}{'-' if name else ''}tuple-{i}") for i, item in enumerate(items) ) + ' "]" 
space', ) else: - item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') + item_rule_name = self.visit(items, f"{name}{'-' if name else ''}item") min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") return self._add_rule( @@ -873,17 +873,17 @@ def _build_object_rule( prop_kv_rule_names = {} for prop_name, prop_schema in properties: prop_rule_name = self.visit( - prop_schema, f'{name}{"-" if name else ""}{prop_name}' + prop_schema, f"{name}{'-' if name else ''}{prop_name}" ) prop_kv_rule_names[prop_name] = self._add_rule( - f'{name}{"-" if name else ""}{prop_name}-kv', + f"{name}{'-' if name else ''}{prop_name}-kv", rf'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}', ) required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] if additional_properties == True or isinstance(additional_properties, dict): - sub_name = f'{name}{"-" if name else ""}additional' + sub_name = f"{name}{'-' if name else ''}additional" value_rule = self.visit( {} if additional_properties == True else additional_properties, f"{sub_name}-value", @@ -908,7 +908,7 @@ def get_recursive_refs(ks, first_is_optional): kv_rule_name = prop_kv_rule_names[k] if k == "*": res = self._add_rule( - f'{name}{"-" if name else ""}additional-kvs', + f"{name}{'-' if name else ''}additional-kvs", f'{kv_rule_name} ( "," space ' + kv_rule_name + " )*", ) elif first_is_optional: @@ -917,7 +917,7 @@ def get_recursive_refs(ks, first_is_optional): res = kv_rule_name if len(rest) > 0: res += " " + self._add_rule( - f'{name}{"-" if name else ""}{k}-rest', + f"{name}{'-' if name else ''}{k}-rest", get_recursive_refs(rest, first_is_optional=True), ) return res diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index d9dfaf5fd..3dc96d2f2 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -36,7 +36,11 @@ # Specify the base name of the shared library to load 
_libllava_base_name = "llava" _libllava_override_path = os.environ.get("LLAVA_CPP_LIB") -_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path() +_libllava_base_path = ( + pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + if _libllava_override_path is None + else pathlib.Path() +) # Load the library _libllava = load_shared_library(_libllava_base_name, _libllava_base_path) @@ -73,8 +77,7 @@ class llava_image_embed(Structure): ) def llava_validate_embed_size( ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, / -) -> bool: - ... +) -> bool: ... # /** build an image embed from image file bytes */ @@ -90,8 +93,7 @@ def llava_image_embed_make_with_bytes( image_bytes: CtypesArray[c_uint8], image_bytes_length: Union[c_int, int], /, -) -> "_Pointer[llava_image_embed]": - ... +) -> "_Pointer[llava_image_embed]": ... # /** build an image embed from a path to an image filename */ @@ -103,15 +105,13 @@ def llava_image_embed_make_with_bytes( ) def llava_image_embed_make_with_filename( ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / -) -> "_Pointer[llava_image_embed]": - ... +) -> "_Pointer[llava_image_embed]": ... # LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); # /** free an embedding made with llava_image_embed_make_* */ @ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None) -def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): - ... +def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): ... # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ @@ -132,8 +132,7 @@ def llava_eval_image_embed( n_batch: Union[c_int, int], n_past: "_Pointer[c_int]", /, -) -> bool: - ... +) -> bool: ... 
################################################ @@ -146,13 +145,10 @@ def llava_eval_image_embed( @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) def clip_model_load( fname: bytes, verbosity: Union[c_int, int], / -) -> Optional[clip_ctx_p]: - ... +) -> Optional[clip_ctx_p]: ... # /** free mmproj model */ # CLIP_API void clip_free(struct clip_ctx * ctx); @ctypes_function("clip_free", [clip_ctx_p_ctypes], None) -def clip_free(ctx: clip_ctx_p, /): - ... - +def clip_free(ctx: clip_ctx_p, /): ... diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index a45f8f406..41753a7f6 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -39,7 +39,11 @@ # Specify the base name of the shared library to load _libmtmd_base_name = "mtmd" _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") -_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() +_libmtmd_base_path = ( + pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + if _libmtmd_override_path is None + else pathlib.Path() +) # Load the library _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) @@ -71,6 +75,7 @@ MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 + # Structures class mtmd_context_params(Structure): _fields_ = [ @@ -82,6 +87,7 @@ class mtmd_context_params(Structure): ("media_marker", c_char_p), ] + class mtmd_input_text(Structure): _fields_ = [ ("text", c_char_p), @@ -89,19 +95,21 @@ class mtmd_input_text(Structure): ("parse_special", c_bool), ] + ################################################ # mtmd.h functions ################################################ + # MTMD_API const char * mtmd_default_marker(void); @ctypes_function("mtmd_default_marker", [], c_char_p) -def mtmd_default_marker() -> bytes: - ... +def mtmd_default_marker() -> bytes: ... 
+ # MTMD_API struct mtmd_context_params mtmd_context_params_default(void); @ctypes_function("mtmd_context_params_default", [], mtmd_context_params) -def mtmd_context_params_default() -> mtmd_context_params: - ... +def mtmd_context_params_default() -> mtmd_context_params: ... + # MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, # const struct llama_model * text_model, @@ -109,70 +117,68 @@ def mtmd_context_params_default() -> mtmd_context_params: @ctypes_function( "mtmd_init_from_file", [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params], - mtmd_context_p_ctypes + mtmd_context_p_ctypes, ) def mtmd_init_from_file( mmproj_fname: bytes, text_model: llama_cpp.llama_model_p, ctx_params: mtmd_context_params, /, -) -> Optional[mtmd_context_p]: - ... +) -> Optional[mtmd_context_p]: ... + # MTMD_API void mtmd_free(mtmd_context * ctx); @ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None) -def mtmd_free(ctx: mtmd_context_p, /): - ... +def mtmd_free(ctx: mtmd_context_p, /): ... + # MTMD_API bool mtmd_support_vision(mtmd_context * ctx); @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) -def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: - ... +def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: ... + # MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); @ctypes_function( - "mtmd_bitmap_init", - [c_uint32, c_uint32, POINTER(c_uint8)], - mtmd_bitmap_p_ctypes + "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes ) def mtmd_bitmap_init( nx: Union[c_uint32, int], ny: Union[c_uint32, int], data: CtypesArray[c_uint8], /, -) -> Optional[mtmd_bitmap_p]: - ... +) -> Optional[mtmd_bitmap_p]: ... + # MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) -def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): - ... +def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ... 
+ # MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); @ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes) -def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: - ... +def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: ... + # MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None) -def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): - ... +def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): ... + # MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t) -def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: - ... +def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: ... + # MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx); @ctypes_function( "mtmd_input_chunks_get", [mtmd_input_chunks_p_ctypes, c_size_t], - mtmd_input_chunk_p_ctypes + mtmd_input_chunk_p_ctypes, ) def mtmd_input_chunks_get( chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], / -) -> Optional[mtmd_input_chunk_p]: - ... +) -> Optional[mtmd_input_chunk_p]: ... + # MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, # mtmd_input_chunks * output, @@ -197,52 +203,53 @@ def mtmd_tokenize( bitmaps: CtypesArray[mtmd_bitmap_p_ctypes], n_bitmaps: Union[c_size_t, int], /, -) -> int: - ... +) -> int: ... + # MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t) -def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: - ... +def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: ... 
+ # MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk); @ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int) -def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: - ... +def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: ... + # MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output); @ctypes_function( "mtmd_input_chunk_get_tokens_text", [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)], - POINTER(llama_cpp.llama_token) + POINTER(llama_cpp.llama_token), ) def mtmd_input_chunk_get_tokens_text( chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", / -) -> Optional["_Pointer[llama_cpp.llama_token]"]: - ... +) -> Optional["_Pointer[llama_cpp.llama_token]"]: ... + ################################################ # mtmd-helper.h functions ################################################ + # MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); @ctypes_function( "mtmd_helper_bitmap_init_from_buf", [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t], - mtmd_bitmap_p_ctypes + mtmd_bitmap_p_ctypes, ) def mtmd_helper_bitmap_init_from_buf( ctx: mtmd_context_p, buf: CtypesArray[c_uint8], length: Union[c_size_t, int], /, -) -> Optional[mtmd_bitmap_p]: - ... +) -> Optional[mtmd_bitmap_p]: ... + # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); @ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t) -def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: - ... +def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: ... 
+ # MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, # struct llama_context * lctx, @@ -276,5 +283,4 @@ def mtmd_helper_eval_chunk_single( logits_last: Union[c_bool, bool], new_n_past: "_Pointer[llama_cpp.llama_pos]", /, -) -> int: - ... +) -> int: ... diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5120f2416..f776fe159 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -125,9 +125,9 @@ def create_app( server_settings = ServerSettings.model_validate(settings) model_settings = [ModelSettings.model_validate(settings)] - assert ( - server_settings is not None and model_settings is not None - ), "server_settings and model_settings must be provided together" + assert server_settings is not None and model_settings is not None, ( + "server_settings and model_settings must be provided together" + ) set_server_settings(server_settings) middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py index 3dd007676..8ed029063 100644 --- a/llama_cpp/server/cli.py +++ b/llama_cpp/server/cli.py @@ -14,7 +14,9 @@ def _get_base_type(annotation: Type[Any]) -> Type[Any]: elif getattr(annotation, "__origin__", None) is Union: assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore non_optional_args: List[Type[Any]] = [ - arg for arg in annotation.__args__ if arg is not type(None) # type: ignore + arg + for arg in annotation.__args__ + if arg is not type(None) # type: ignore ] if non_optional_args: return _get_base_type(non_optional_args[0]) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 11bd363b5..9e59e8563 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -186,18 +186,18 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "hf-autotokenizer": 
- assert ( - settings.hf_pretrained_model_name_or_path is not None - ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer" + assert settings.hf_pretrained_model_name_or_path is not None, ( + "hf_pretrained_model_name_or_path must be set for hf-autotokenizer" + ) chat_handler = ( llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( settings.hf_pretrained_model_name_or_path ) ) elif settings.chat_format == "hf-tokenizer-config": - assert ( - settings.hf_tokenizer_config_path is not None - ), "hf_tokenizer_config_path must be set for hf-tokenizer-config" + assert settings.hf_tokenizer_config_path is not None, ( + "hf_tokenizer_config_path must be set for hf-tokenizer-config" + ) chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( json.load(open(settings.hf_tokenizer_config_path)) ) diff --git a/pyproject.toml b/pyproject.toml index f5ae7b59c..e0b0dc520 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ test = [ "huggingface-hub>=0.23.0" ] dev = [ - "black>=23.3.0", + "ruff>=0.15.7", "twine>=4.0.2", "mkdocs>=1.4.3", "mkdocstrings[python]>=0.22.0", @@ -78,5 +78,16 @@ Issues = "https://github.com/abetlen/llama-cpp-python/issues" Documentation = "https://llama-cpp-python.readthedocs.io/en/latest/" Changelog = "https://llama-cpp-python.readthedocs.io/en/latest/changelog/" +[tool.ruff] +target-version = "py38" +line-length = 88 +required-version = ">=0.15.7" +src = ["llama_cpp", "tests"] +extend-exclude = ["vendor", "examples/notebooks"] + +[tool.ruff.lint] +select = ["E4", "E7", "E9"] +ignore = ["E712"] + [tool.pytest.ini_options] testpaths = "tests" diff --git a/tests/test_llama.py b/tests/test_llama.py index 0a1a9f5ad..964b0895c 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -66,6 +66,7 @@ def llama_cpp_model_path(): def test_real_model(llama_cpp_model_path): import os + assert os.path.exists(llama_cpp_model_path) params = llama_cpp.llama_model_default_params() @@ 
-114,6 +115,7 @@ def test_real_model(llama_cpp_model_path): output_text = model.detokenize(output, special=True) assert output_text == b" over the lazy dog" + def test_real_llama(llama_cpp_model_path): model = llama_cpp.Llama( llama_cpp_model_path, @@ -132,11 +134,10 @@ def test_real_llama(llama_cpp_model_path): top_k=50, top_p=0.9, temperature=0.8, - seed=1337 + seed=1337, ) assert output["choices"][0]["text"] == " over the lazy dog" - output = model.create_completion( "The capital of france is paris, 'true' or 'false'?:\n", max_tokens=4, @@ -146,20 +147,19 @@ def test_real_llama(llama_cpp_model_path): seed=1337, grammar=llama_cpp.LlamaGrammar.from_string(""" root ::= "true" | "false" -""") +"""), ) assert output["choices"][0]["text"] == "true" suffix = b"rot" tokens = model.tokenize(suffix, add_bos=True, special=True) + def logit_processor_func(input_ids, logits): for token in tokens: logits[token] *= 1000 return logits - logit_processors = llama_cpp.LogitsProcessorList( - [logit_processor_func] - ) + logit_processors = llama_cpp.LogitsProcessorList([logit_processor_func]) output = model.create_completion( "The capital of france is par", @@ -168,7 +168,7 @@ def logit_processor_func(input_ids, logits): top_p=0.9, temperature=0.8, seed=1337, - logits_processor=logit_processors + logits_processor=logit_processors, ) assert output["choices"][0]["text"].lower().startswith("rot") @@ -184,7 +184,7 @@ def logit_processor_func(input_ids, logits): temperature=0.8, grammar=llama_cpp.LlamaGrammar.from_string(""" root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" -""") +"""), ) number_1 = output["choices"][0]["text"] @@ -196,7 +196,7 @@ def logit_processor_func(input_ids, logits): temperature=0.8, grammar=llama_cpp.LlamaGrammar.from_string(""" root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" -""") +"""), ) number_2 = output["choices"][0]["text"] @@ -210,7 +210,7 @@ def logit_processor_func(input_ids, logits): temperature=0.8, 
grammar=llama_cpp.LlamaGrammar.from_string(""" root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" -""") +"""), ) number_3 = output["choices"][0]["text"] @@ -228,7 +228,7 @@ def test_real_llama_embeddings(llama_cpp_model_path): n_threads_batch=multiprocessing.cpu_count(), logits_all=False, flash_attn=True, - embedding=True + embedding=True, ) # Smoke test for now model.embed("Hello World") diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72b..18c7279cf 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -10,13 +10,20 @@ from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter + def test_mistral_instruct(): chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" chat_formatter = jinja2.Template(chat_template) messages = [ - llama_types.ChatCompletionRequestUserMessage(role="user", content="Instruction"), - llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="Model answer"), - llama_types.ChatCompletionRequestUserMessage(role="user", content="Follow-up instruction"), + llama_types.ChatCompletionRequestUserMessage( + role="user", content="Instruction" + ), + llama_types.ChatCompletionRequestAssistantMessage( + role="assistant", content="Model answer" + ), + llama_types.ChatCompletionRequestUserMessage( + role="user", content="Follow-up instruction" + ), ] response = llama_chat_format.format_mistral_instruct( messages=messages, @@ -77,13 +84,11 @@ def test_mistral_instruct(): def 
test_hf_tokenizer_config_str_to_chat_formatter(): tokenizer_config = json.loads(mistral_7b_tokenizer_config) - chat_formatter = hf_tokenizer_config_to_chat_formatter( - tokenizer_config - ) + chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config) chat_formatter_respoonse = chat_formatter( messages=[ ChatCompletionRequestUserMessage(role="user", content="Hello, world!"), ] ) - assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]" "") + assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]") diff --git a/tests/test_llama_speculative.py b/tests/test_llama_speculative.py index b5d450567..d28c9ca9c 100644 --- a/tests/test_llama_speculative.py +++ b/tests/test_llama_speculative.py @@ -2,15 +2,20 @@ from llama_cpp.llama_speculative import LlamaPromptLookupDecoding + def test_find_candidate_pred_tokens(): find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens # Test Case 1: Matching ngram is found input_ids1 = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]) - result1 = find_candidate_pred_tokens(input_ids1, max_ngram_size=3, num_pred_tokens=2) + result1 = find_candidate_pred_tokens( + input_ids1, max_ngram_size=3, num_pred_tokens=2 + ) assert np.array_equal(result1, np.array([1, 2])) # Test Case 2: Matching ngram is not found input_ids2 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) - result2 = find_candidate_pred_tokens(input_ids2, max_ngram_size=3, num_pred_tokens=2) + result2 = find_candidate_pred_tokens( + input_ids2, max_ngram_size=3, num_pred_tokens=2 + ) assert np.array_equal(result2, np.array([])) From 18aa31ef1e6150cf1b24b81583f5254aabe00199 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 22 Mar 2026 18:58:55 -0700 Subject: [PATCH 04/19] feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 (#2151) * Update llama.cpp and sync bindings * Clean up binding compatibility shims * Remove flash attention property shim * Remove mtmd verbosity shim * Add docstrings for new 
bindings * Format Ruff files and add changelog entry --- CHANGELOG.md | 1 + CMakeLists.txt | 20 + Makefile | 2 - llama_cpp/_internals.py | 8 +- llama_cpp/llama.py | 22 +- llama_cpp/llama_chat_format.py | 9 +- llama_cpp/llama_cpp.py | 881 +++++++++++++++++++++------------ llama_cpp/mtmd_cpp.py | 96 +++- tests/test_llama.py | 2 +- vendor/llama.cpp | 2 +- 10 files changed, 714 insertions(+), 329 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7044f44d6..e7506eed5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main` - fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149 diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b06d98b3..9b2744cdc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,6 +153,26 @@ if (LLAMA_BUILD) add_compile_definitions(GGML_USE_METAL) endif() + # Upstream mtmd expects LLAMA_INSTALL_VERSION to be set by llama.cpp's + # top-level CMakeLists.txt. When we include tools/mtmd directly from the + # Python package build, that directory scope is skipped. 
+ if (NOT DEFINED LLAMA_INSTALL_VERSION OR "${LLAMA_INSTALL_VERSION}" STREQUAL "") + set(LLAMA_INSTALL_VERSION 0.0.0) + find_package(Git QUIET) + if (Git_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp + OUTPUT_VARIABLE LLAMA_MTMD_BUILD_NUMBER + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE LLAMA_MTMD_BUILD_NUMBER_RESULT + ) + if (LLAMA_MTMD_BUILD_NUMBER_RESULT EQUAL 0) + set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_MTMD_BUILD_NUMBER}) + endif() + endif() + endif() + # Building llava add_subdirectory(vendor/llama.cpp/tools/mtmd) diff --git a/Makefile b/Makefile index 8e6cae2c1..db45246c7 100644 --- a/Makefile +++ b/Makefile @@ -82,8 +82,6 @@ run-server: python3 -m llama_cpp.server --model ${MODEL} clean: - - cd vendor/llama.cpp && make clean - - cd vendor/llama.cpp && rm libllama.so - rm -rf _skbuild - rm llama_cpp/lib/*.so - rm llama_cpp/lib/*.dylib diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b520b7ea5..d6258d224 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -2,6 +2,7 @@ import os import ctypes +import warnings from typing import ( Dict, @@ -699,8 +700,11 @@ def add_dist(self, seed: int): llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_softmax(self): - sampler = llama_cpp.llama_sampler_init_softmax() - llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + warnings.warn( + "add_softmax is deprecated; llama_sampler_init_dist now samples directly from logits", + DeprecationWarning, + stacklevel=2, + ) def add_top_k(self, k: int): sampler = llama_cpp.llama_sampler_init_top_k(k) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 21a7430a0..1609ad16b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -341,7 +341,11 @@ def __init__( self._logits_all = logits_all if draft_model is None else True self.context_params.embeddings = embedding # TODO: Rename to embeddings 
self.context_params.offload_kqv = offload_kqv - self.context_params.flash_attn = flash_attn + self.context_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + if flash_attn + else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED + ) if op_offload is not None: self.context_params.op_offload = op_offload @@ -431,9 +435,9 @@ def free_lora_adapter(): self._stack.callback(free_lora_adapter) - if llama_cpp.llama_set_adapter_lora( - self._ctx.ctx, self._lora_adapter, self.lora_scale - ): + adapters = (llama_cpp.llama_adapter_lora_p_ctypes * 1)(self._lora_adapter) + scales = (ctypes.c_float * 1)(self.lora_scale) + if llama_cpp.llama_set_adapters_lora(self._ctx.ctx, adapters, 1, scales): raise RuntimeError( f"Failed to set LoRA adapter from lora path: {self.lora_path}" ) @@ -726,7 +730,6 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): sampler.add_grammar(self._model, grammar) if temp < 0.0: - sampler.add_softmax() sampler.add_dist(self._seed) elif temp == 0.0: sampler.add_greedy() @@ -1042,7 +1045,7 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + self._ctx.kv_cache_clear() self._ctx.decode(self._batch) self._batch.reset() @@ -1113,7 +1116,7 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + self._ctx.kv_cache_clear() self.reset() if return_count: @@ -2100,7 +2103,10 @@ def __getstate__(self): logits_all=self._logits_all, embedding=self.context_params.embeddings, offload_kqv=self.context_params.offload_kqv, - flash_attn=self.context_params.flash_attn, + flash_attn=( + self.context_params.flash_attn_type + == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + ), op_offload=self.context_params.op_offload, swa_full=self.context_params.swa_full, # Sampling Params diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 
8e8ac7bb3..d7910e984 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2755,7 +2755,14 @@ def _init_mtmd_context(self, llama_model: llama.Llama): ctx_params.use_gpu = True # TODO: Make this configurable ctx_params.print_timings = self.verbose ctx_params.n_threads = llama_model.n_threads - ctx_params.verbosity = 2 if self.verbose else 0 # GGML_LOG_LEVEL_INFO = 2 + ctx_params.flash_attn_type = ( + llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + if ( + llama_model.context_params.flash_attn_type + == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED + ) + else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED + ) # Initialize mtmd context self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file( diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f13af67f3..e51492c56 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -3,6 +3,7 @@ import os import ctypes import pathlib +import warnings from typing import ( Callable, @@ -77,6 +78,8 @@ # GGML_TYPE_I64 = 27, # GGML_TYPE_F64 = 28, # GGML_TYPE_IQ1_M = 29, +# GGML_TYPE_MXFP4 = 39, +# GGML_TYPE_NVFP4 = 40, # GGML_TYPE_COUNT, # }; GGML_TYPE_F32 = 0 @@ -107,7 +110,9 @@ GGML_TYPE_I64 = 27 GGML_TYPE_F64 = 28 GGML_TYPE_IQ1_M = 29 -GGML_TYPE_COUNT = 30 +GGML_TYPE_MXFP4 = 39 +GGML_TYPE_NVFP4 = 40 +GGML_TYPE_COUNT = 41 # from ggml-backend.h # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); @@ -173,6 +178,10 @@ llama_kv_cache_p = NewType("llama_kv_cache_p", int) llama_kv_cache_p_ctypes = ctypes.c_void_p +# struct gguf_context; +gguf_context_p = NewType("gguf_context_p", int) +gguf_context_p_ctypes = ctypes.c_void_p + # typedef int32_t llama_pos; llama_pos = ctypes.c_int32 # typedef int32_t llama_token; @@ -292,12 +301,14 @@ # LLAMA_ROPE_TYPE_NORM = 0, # LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX, # LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE, +# LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE, # LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION, # }; 
LLAMA_ROPE_TYPE_NONE = -1 LLAMA_ROPE_TYPE_NORM = 0 LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2 LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8 +LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40 LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24 @@ -386,6 +397,7 @@ # LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors # LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors # LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors # # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; @@ -425,6 +437,7 @@ LLAMA_FTYPE_MOSTLY_TQ1_0 = 36 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37 LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38 +LLAMA_FTYPE_MOSTLY_NVFP4 = 39 LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { @@ -467,6 +480,16 @@ LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1 +# enum llama_flash_attn_type { +# LLAMA_FLASH_ATTN_TYPE_AUTO = -1, +# LLAMA_FLASH_ATTN_TYPE_DISABLED = 0, +# LLAMA_FLASH_ATTN_TYPE_ENABLED = 1, +# }; +LLAMA_FLASH_ATTN_TYPE_AUTO = -1 +LLAMA_FLASH_ATTN_TYPE_DISABLED = 0 +LLAMA_FLASH_ATTN_TYPE_ENABLED = 1 + + # enum llama_split_mode { # LLAMA_SPLIT_MODE_NONE = 0, // single GPU # LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs @@ -619,6 +642,34 @@ class llama_batch(ctypes.Structure): LLAMA_KV_OVERRIDE_TYPE_STR = 3 +# enum llama_model_meta_key { +# LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE, +# LLAMA_MODEL_META_KEY_SAMPLING_TOP_K, +# LLAMA_MODEL_META_KEY_SAMPLING_TOP_P, +# LLAMA_MODEL_META_KEY_SAMPLING_MIN_P, +# LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY, +# LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD, +# LLAMA_MODEL_META_KEY_SAMPLING_TEMP, +# LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N, +# LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU, +# LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA, +# }; +LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0 +LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1 
+LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2 +LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4 +LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5 +LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7 +LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10 +LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11 + + # struct llama_model_kv_override { # enum llama_model_kv_override_type tag; @@ -695,11 +746,14 @@ class llama_model_kv_override(ctypes.Structure): # // Keep the booleans together to avoid misalignment during copy-by-value. -# bool vocab_only; // only load the vocabulary, no weights -# bool use_mmap; // use mmap if possible -# bool use_mlock; // force system to keep model in RAM -# bool check_tensors; // validate model tensor data +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_direct_io; // use direct io, takes precedence over use_mmap when supported +# bool use_mlock; // force system to keep model in RAM +# bool check_tensors; // validate model tensor data # bool use_extra_bufts; // use extra buffer types (used for weight repacking) +# bool no_host; // bypass host buffer allowing extra buffers to be used +# bool no_alloc; // only load metadata and simulate memory allocations # }; class llama_model_params(ctypes.Structure): """Parameters for llama_model @@ -716,9 +770,12 @@ class llama_model_params(ctypes.Structure): kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data vocab_only (bool): only load the vocabulary, no weights use_mmap (bool): use mmap if possible + use_direct_io (bool): use direct io, takes precedence over use_mmap when supported use_mlock (bool): force system to keep model in RAM check_tensors (bool): validate model tensor data - use_extra_bufts (bool): use extra buffer types 
(used for weight repacking)""" + use_extra_bufts (bool): use extra buffer types (used for weight repacking) + no_host (bool): bypass host buffer allowing extra buffers to be used + no_alloc (bool): only load metadata and simulate memory allocations""" if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused @@ -734,9 +791,12 @@ class llama_model_params(ctypes.Structure): kv_overrides: CtypesArray[llama_model_kv_override] vocab_only: bool use_mmap: bool + use_direct_io: bool use_mlock: bool check_tensors: bool use_extra_bufts: bool + no_host: bool + no_alloc: bool _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused @@ -750,9 +810,27 @@ class llama_model_params(ctypes.Structure): ("kv_overrides", ctypes.POINTER(llama_model_kv_override)), ("vocab_only", ctypes.c_bool), ("use_mmap", ctypes.c_bool), + ("use_direct_io", ctypes.c_bool), ("use_mlock", ctypes.c_bool), ("check_tensors", ctypes.c_bool), ("use_extra_bufts", ctypes.c_bool), + ("no_host", ctypes.c_bool), + ("no_alloc", ctypes.c_bool), + ] + + +# struct llama_sampler_seq_config { +# llama_seq_id seq_id; +# struct llama_sampler * sampler; +# }; +class llama_sampler_seq_config(ctypes.Structure): + if TYPE_CHECKING: + seq_id: int + sampler: ctypes.c_void_p + + _fields_ = [ + ("seq_id", llama_seq_id), + ("sampler", ctypes.c_void_p), ] @@ -769,6 +847,7 @@ class llama_model_params(ctypes.Structure): # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id # enum llama_attention_type attention_type; // attention type to use for embeddings +# enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention # // ref: https://github.com/ggml-org/llama.cpp/pull/2054 # float rope_freq_base; // RoPE base frequency, 0 = from model @@ -796,7 +875,6 @@ class llama_model_params(ctypes.Structure): # // Keep the booleans together and at the end of the 
struct to avoid misalignment during copy-by-value. # bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU -# bool flash_attn; // use flash attention [EXPERIMENTAL] # bool no_perf; // measure performance timings # bool op_offload; // offload host tensor operations to device # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) @@ -805,6 +883,8 @@ class llama_model_params(ctypes.Structure): # bool kv_unified; // use a unified buffer across the input sequences when computing the attention # // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix # // ref: https://github.com/ggml-org/llama.cpp/pull/14363 +# struct llama_sampler_seq_config * samplers; +# size_t n_samplers; # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -819,6 +899,7 @@ class llama_context_params(ctypes.Structure): rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) attention_type (int): attention type to use for embeddings + flash_attn_type (int): when to enable flash attention rope_freq_base (float): RoPE base frequency, 0 = from model rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model @@ -835,11 +916,12 @@ class llama_context_params(ctypes.Structure): abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU - flash_attn (bool): whether to use flash attention no_perf (bool): whether to measure performance timings op_offload (bool): offload host tensor operations to 
device swa_full (bool): use full-size SWA cache kv_unified (bool): use a unified buffer across the input sequences when computing the attention + samplers (ctypes.POINTER(llama_sampler_seq_config)): backend sampler chain configuration + n_samplers (int): number of backend sampler chain configurations """ if TYPE_CHECKING: @@ -852,6 +934,7 @@ class llama_context_params(ctypes.Structure): rope_scaling_type: int pooling_type: int attention_type: int + flash_attn_type: int rope_freq_base: float rope_freq_scale: float yarn_ext_factor: float @@ -868,11 +951,12 @@ class llama_context_params(ctypes.Structure): abort_callback_data: ctypes.c_void_p embeddings: bool offload_kqv: bool - flash_attn: bool no_perf: bool op_offload: bool swa_full: bool kv_unified: bool + samplers: ctypes.POINTER(llama_sampler_seq_config) + n_samplers: int _fields_ = [ ("n_ctx", ctypes.c_uint32), @@ -884,6 +968,7 @@ class llama_context_params(ctypes.Structure): ("rope_scaling_type", ctypes.c_int), ("pooling_type", ctypes.c_int), ("attention_type", ctypes.c_int), + ("flash_attn_type", ctypes.c_int), ("rope_freq_base", ctypes.c_float), ("rope_freq_scale", ctypes.c_float), ("yarn_ext_factor", ctypes.c_float), @@ -900,11 +985,12 @@ class llama_context_params(ctypes.Structure): ("abort_callback_data", ctypes.c_void_p), ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), - ("flash_attn", ctypes.c_bool), ("no_perf", ctypes.c_bool), ("op_offload", ctypes.c_bool), ("swa_full", ctypes.c_bool), ("kv_unified", ctypes.c_bool), + ("samplers", ctypes.POINTER(llama_sampler_seq_config)), + ("n_samplers", ctypes.c_size_t), ] @@ -935,6 +1021,7 @@ class llama_context_params(ctypes.Structure): # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored # bool pure; // quantize all tensors to the default type # bool keep_split; // quantize to the same number of shards +# bool dry_run; // calculate and show the final quantization size without performing quantization 
# void * imatrix; // pointer to importance matrix data # void * kv_overrides; // pointer to vector containing overrides # void * tensor_types; // pointer to vector containing tensor types @@ -953,6 +1040,7 @@ class llama_model_quantize_params(ctypes.Structure): only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored pure (bool): quantize all tensors to the default type keep_split (bool): quantize to the same number of shards + dry_run (bool): calculate and show the final quantization size without performing quantization imatrix (ctypes.c_void_p): pointer to importance matrix data kv_overrides (ctypes.c_void_p): pointer to vector containing overrides tensor_types (ctypes.c_void_p): pointer to vector containing tensor types @@ -969,6 +1057,7 @@ class llama_model_quantize_params(ctypes.Structure): only_copy: bool pure: bool keep_split: bool + dry_run: bool imatrix: ctypes.c_void_p kv_overrides: ctypes.c_void_p tensor_types: ctypes.c_void_p @@ -984,6 +1073,7 @@ class llama_model_quantize_params(ctypes.Structure): ("only_copy", ctypes.c_bool), ("pure", ctypes.c_bool), ("keep_split", ctypes.c_bool), + ("dry_run", ctypes.c_bool), ("imatrix", ctypes.c_void_p), ("kv_overrides", ctypes.c_void_p), ("tensor_types", ctypes.c_void_p), @@ -1095,6 +1185,13 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params: ... +# LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type); +@ctypes_function("llama_flash_attn_type_name", [ctypes.c_int], ctypes.c_char_p) +def llama_flash_attn_type_name(flash_attn_type: int, /) -> Optional[bytes]: + """Get the flash attention type name.""" + ... + + # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program @@ -1249,6 +1346,36 @@ def llama_free_model(model: llama_model_p, /): ... def llama_model_free(model: llama_model_p, /): ... 
+# typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata); +llama_model_set_tensor_data_t = ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_void_p) + + +# LLAMA_API struct llama_model * llama_model_init_from_user( +# struct gguf_context * metadata, +# llama_model_set_tensor_data_t set_tensor_data, +# void * set_tensor_data_ud, +# struct llama_model_params params); +@ctypes_function( + "llama_model_init_from_user", + [ + gguf_context_p_ctypes, + llama_model_set_tensor_data_t, + ctypes.c_void_p, + llama_model_params, + ], + llama_model_p_ctypes, +) +def llama_model_init_from_user( + metadata: gguf_context_p, + set_tensor_data: llama_model_set_tensor_data_t, + set_tensor_data_ud: ctypes.c_void_p, + params: llama_model_params, + /, +) -> Optional[llama_model_p]: + """Initialize a model from user-provided metadata and tensor data.""" + ... + + # LLAMA_API struct llama_context * llama_init_from_model( # struct llama_model * model, # struct llama_context_params params); @@ -1288,6 +1415,54 @@ def llama_free(ctx: llama_context_p, /): ... 
+# enum llama_params_fit_status { +# LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, +# LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, +# LLAMA_PARAMS_FIT_STATUS_ERROR = 2, +# }; +LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0 +LLAMA_PARAMS_FIT_STATUS_FAILURE = 1 +LLAMA_PARAMS_FIT_STATUS_ERROR = 2 + + +# LLAMA_API enum llama_params_fit_status llama_params_fit( +# const char * path_model, +# struct llama_model_params * mparams, +# struct llama_context_params * cparams, +# float * tensor_split, +# struct llama_model_tensor_buft_override * tensor_buft_overrides, +# size_t * margins, +# uint32_t n_ctx_min, +# enum ggml_log_level log_level); +@ctypes_function( + "llama_params_fit", + [ + ctypes.c_char_p, + ctypes.POINTER(llama_model_params), + ctypes.POINTER(llama_context_params), + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_size_t), + ctypes.c_uint32, + ctypes.c_int, + ], + ctypes.c_int, +) +def llama_params_fit( + path_model: bytes, + mparams: CtypesPointerOrRef[llama_model_params], + cparams: CtypesPointerOrRef[llama_context_params], + tensor_split: Optional[CtypesPointer[ctypes.c_float]], + tensor_buft_overrides: ctypes.c_void_p, + margins: Optional[CtypesPointer[ctypes.c_size_t]], + n_ctx_min: int, + log_level: int, + /, +) -> int: + """Fit model and context parameters for a model path.""" + ... + + # LLAMA_API int64_t llama_time_us(void); @ctypes_function( "llama_time_us", @@ -1307,6 +1482,13 @@ def llama_max_devices() -> int: ... def llama_max_parallel_sequences() -> int: ... +# LLAMA_API size_t llama_max_tensor_buft_overrides(void); +@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t) +def llama_max_tensor_buft_overrides() -> int: + """Get the maximum number of tensor buffer type overrides.""" + ... + + # LLAMA_API bool llama_supports_mmap (void); @ctypes_function("llama_supports_mmap", [], ctypes.c_bool) def llama_supports_mmap() -> bool: ... @@ -1332,6 +1514,13 @@ def llama_supports_rpc() -> bool: ... 
def llama_n_ctx(ctx: llama_context_p, /) -> int: ... +# LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx); +@ctypes_function("llama_n_ctx_seq", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_ctx_seq(ctx: llama_context_p, /) -> int: + """Get the context size per sequence.""" + ... + + # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_batch(ctx: llama_context_p, /) -> int: ... @@ -1389,17 +1578,6 @@ def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]: def llama_pooling_type(ctx: llama_context_p, /) -> int: ... -# DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); -@ctypes_function( - "llama_get_kv_self", - [llama_context_p_ctypes], - llama_kv_cache_p_ctypes, -) -def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: - """Get the KV cache for self-attention (DEPRECATED)""" - ... - - # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes) def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: ... @@ -1420,6 +1598,20 @@ def llama_model_n_ctx_train(model: llama_model_p, /) -> int: ... def llama_model_n_embd(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); +@ctypes_function("llama_model_n_embd_inp", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd_inp(model: llama_model_p, /) -> int: + """Get the model input embedding size.""" + ... 
+ + +# LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model); +@ctypes_function("llama_model_n_embd_out", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_embd_out(model: llama_model_p, /) -> int: + """Get the model output embedding size.""" + ... + + # LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_layer(model: llama_model_p, /) -> int: ... @@ -1515,6 +1707,14 @@ def llama_model_meta_count(model: llama_model_p, /) -> int: ... +# // Get sampling metadata key name. Returns nullptr if the key is invalid +# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key); +@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p) +def llama_model_meta_key_str(key: int, /) -> Optional[bytes]: + """Get sampling metadata key name. Returns None if the key is invalid.""" + ... + + # // Get metadata key name by index # LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size); @ctypes_function( @@ -1647,6 +1847,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool: ... +# // Returns true if the model is hybrid (like Jamba, Granite, etc.) +# LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model); +@ctypes_function("llama_model_is_hybrid", [llama_model_p_ctypes], ctypes.c_bool) +def llama_model_is_hybrid(model: llama_model_p, /) -> bool: + """Returns true if the model is hybrid (like Jamba, Granite, etc.)""" + ... + + # // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) # LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model); @ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool) @@ -1698,6 +1906,85 @@ def llama_adapter_lora_init( ) -> Optional[llama_adapter_lora_p]: ... 
+# // Get metadata value as a string by key name +# LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_val_str", + [ + llama_adapter_lora_p_ctypes, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.c_size_t, + ], + ctypes.c_int32, +) +def llama_adapter_meta_val_str( + adapter: llama_adapter_lora_p, + key: bytes, + buf: Union[bytes, CtypesArray[ctypes.c_char]], + buf_size: int, + /, +) -> int: + """Get adapter metadata value as a string by key name.""" + ... + + +# // Get the number of metadata key/value pairs +# LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_meta_count", [llama_adapter_lora_p_ctypes], ctypes.c_int32 +) +def llama_adapter_meta_count(adapter: llama_adapter_lora_p, /) -> int: + """Get the number of adapter metadata key/value pairs.""" + ... + + +# // Get metadata key name by index +# LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_key_by_index", + [ + llama_adapter_lora_p_ctypes, + ctypes.c_int32, + ctypes.c_char_p, + ctypes.c_size_t, + ], + ctypes.c_int32, +) +def llama_adapter_meta_key_by_index( + adapter: llama_adapter_lora_p, + i: int, + buf: Union[bytes, CtypesArray[ctypes.c_char]], + buf_size: int, + /, +) -> int: + """Get adapter metadata key name by index.""" + ... 
+ + +# // Get metadata value as a string by index +# LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size); +@ctypes_function( + "llama_adapter_meta_val_str_by_index", + [ + llama_adapter_lora_p_ctypes, + ctypes.c_int32, + ctypes.c_char_p, + ctypes.c_size_t, + ], + ctypes.c_int32, +) +def llama_adapter_meta_val_str_by_index( + adapter: llama_adapter_lora_p, + i: int, + buf: Union[bytes, CtypesArray[ctypes.c_char]], + buf_size: int, + /, +) -> int: + """Get adapter metadata value as a string by index.""" + ... + + # // Manually free a LoRA adapter # // Note: loaded adapters will be free when the associated model is deleted # LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); @@ -1709,56 +1996,75 @@ def llama_adapter_lora_init( def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ... -# // The following functions operate on a llama_context, hence the naming: llama_verb_... +# // Get the invocation tokens if the current lora is an alora +# LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter); +@ctypes_function( + "llama_adapter_get_alora_n_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.c_uint64, +) +def llama_adapter_get_alora_n_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> int: + """Get the invocation token count if the current LoRA is an aLoRA.""" + ... 
-# // Add a loaded LoRA adapter to given context -# // This will not modify model's weight -# LLAMA_API int32_t llama_set_adapter_lora( -# struct llama_context * ctx, -# struct llama_adapter_lora * adapter, -# float scale); +# LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter); @ctypes_function( - "llama_set_adapter_lora", - [llama_context_p_ctypes, llama_adapter_lora_p_ctypes, ctypes.c_float], - ctypes.c_int32, + "llama_adapter_get_alora_invocation_tokens", + [llama_adapter_lora_p_ctypes], + ctypes.POINTER(llama_token), ) -def llama_set_adapter_lora( - ctx: llama_context_p, adapter: llama_adapter_lora_p, scale: float, / -) -> int: - """Add a loaded LoRA adapter to given context - This will not modify model's weight""" +def llama_adapter_get_alora_invocation_tokens( + adapter: llama_adapter_lora_p, / +) -> Optional[CtypesPointer[llama_token]]: + """Get the invocation tokens if the current LoRA is an aLoRA.""" ... -# // Remove a specific LoRA adapter from given context -# // Return -1 if the adapter is not present in the context -# LLAMA_API int32_t llama_rm_adapter_lora( +# // The following functions operate on a llama_context, hence the naming: llama_verb_... + + +# // Set LoRa adapters on the context. Will only modify if the adapters currently in context are different. 
+# LLAMA_API int32_t llama_set_adapters_lora( # struct llama_context * ctx, -# struct llama_adapter_lora * adapter); +# struct llama_adapter_lora ** adapters, +# size_t n_adapters, +# float * scales); @ctypes_function( - "llama_rm_adapter_lora", - [llama_context_p_ctypes, llama_adapter_lora_p_ctypes], + "llama_set_adapters_lora", + [ + llama_context_p_ctypes, + ctypes.POINTER(llama_adapter_lora_p_ctypes), + ctypes.c_size_t, + ctypes.POINTER(ctypes.c_float), + ], ctypes.c_int32, ) -def llama_rm_adapter_lora( - ctx: llama_context_p, adapter: llama_adapter_lora_p, / +def llama_set_adapters_lora( + ctx: llama_context_p, + adapters: Optional[CtypesArray[llama_adapter_lora_p_ctypes]], + n_adapters: int, + scales: Optional[CtypesArray[ctypes.c_float]], + /, ) -> int: - """Remove a specific LoRA adapter from given context - Return -1 if the adapter is not present in the context""" + """Set LoRA adapters on the context if they differ from the current adapters.""" ... -# // Remove all LoRA adapters from given context -# LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx); -@ctypes_function( - "llama_clear_adapter_lora", - [llama_context_p_ctypes], - None, -) -def llama_clear_adapter_lora(ctx: llama_context_p, /): - """Remove all LoRA adapters from given context""" - ... +# Deprecated compatibility wrapper for the renamed llama_set_adapters_lora(). +def llama_set_adapter_lora( + ctx: llama_context_p, adapter: llama_adapter_lora_p, scale: float, / +) -> int: + warnings.warn( + "llama_set_adapter_lora is deprecated; use llama_set_adapters_lora instead", + DeprecationWarning, + stacklevel=2, + ) + adapters = (llama_adapter_lora_p_ctypes * 1)(adapter) + scales = (ctypes.c_float * 1)(scale) + return llama_set_adapters_lora(ctx, adapters, 1, scales) # // Apply a loaded control vector to a llama_context, or if data is NULL, clear @@ -1767,7 +2073,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /): # // to an n_embd x n_layers buffer starting from layer 1. 
# // il_start and il_end are the layer range the vector should apply to (both inclusive) # // See llama_control_vector_load in common to load a control vector. -# LLAMA_API int32_t llama_apply_adapter_cvec( +# LLAMA_API int32_t llama_set_adapter_cvec( # struct llama_context * ctx, # const float * data, # size_t len, @@ -1775,7 +2081,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /): # int32_t il_start, # int32_t il_end); @ctypes_function( - "llama_apply_adapter_cvec", + "llama_set_adapter_cvec", [ llama_context_p_ctypes, ctypes.POINTER(ctypes.c_float), @@ -1786,7 +2092,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /): ], ctypes.c_int32, ) -def llama_apply_adapter_cvec( +def llama_set_adapter_cvec( ctx: llama_context_p, data: CtypesPointerOrRef[ctypes.c_float], len: int, @@ -1804,6 +2110,24 @@ def llama_apply_adapter_cvec( ... +# Deprecated compatibility wrapper for the renamed llama_set_adapter_cvec(). +def llama_apply_adapter_cvec( + ctx: llama_context_p, + data: CtypesPointerOrRef[ctypes.c_float], + len: int, + n_embd: int, + il_start: int, + il_end: int, + /, +) -> int: + warnings.warn( + "llama_apply_adapter_cvec is deprecated; use llama_set_adapter_cvec instead", + DeprecationWarning, + stacklevel=2, + ) + return llama_set_adapter_cvec(ctx, data, len, n_embd, il_start, il_end) + + # // # // Memory # // @@ -2018,251 +2342,6 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: ... 
-# // -# // KV cache for self-attention (TODO: deprecate in favor of llama_memory) -# // - - -# // Returns the number of tokens in the KV cache (slow, use only for debug) -# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function("llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32) -def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" - ... - - -# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) -# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), -# "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function("llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32) -def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (DEPRECATED)""" - ... - - -# // Clear the KV cache - both cell info is erased and KV data is zeroed -# DEPRECATED(LLAMA_API void llama_kv_self_clear( -# struct llama_context * ctx), -# "Use llama_memory_clear() instead"); -@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None) -def llama_kv_self_clear(ctx: llama_context_p, /): - """Clear the KV cache (DEPRECATED)""" - ... - - -# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) -# // Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails -# // seq_id < 0 : match any sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_rm() instead"); -@ctypes_function( - "llama_kv_self_seq_rm", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ], - ctypes.c_bool, -) -def llama_kv_self_seq_rm( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -) -> bool: - """Remove tokens from KV cache (DEPRECATED)""" - ... - - -# // Copy all tokens that belong to the specified sequence to another sequence -# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( -# struct llama_context * ctx, -# llama_seq_id seq_id_src, -# llama_seq_id seq_id_dst, -# llama_pos p0, -# llama_pos p1), -# "Use llama_memory_seq_cp() instead"); -@ctypes_function( - "llama_kv_self_seq_cp", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_seq_id, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_cp( - ctx: llama_context_p, - seq_id_src: Union[llama_seq_id, int], - seq_id_dst: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -): - """Copy tokens in KV cache (DEPRECATED)""" - ... - - -# // Removes all tokens that do not belong to the specified sequence -# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_keep() instead"); -@ctypes_function("llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None) -def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Keep only specified sequence in KV cache (DEPRECATED)""" - ... 
- - -# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_add( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# llama_pos delta), -# "Use llama_memory_seq_add() instead"); -@ctypes_function( - "llama_kv_self_seq_add", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_add( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - delta: Union[llama_pos, int], - /, -): - """Add delta to sequence positions in KV cache (DEPRECATED)""" - ... - - -# // Integer division of the positions by factor of `d > 1` -# // If the KV cache is RoPEd, the KV data is updated accordingly: -# // - lazily on next llama_decode() -# // p0 < 0 : [0, p1] -# // p1 < 0 : [p0, inf) -# DEPRECATED(LLAMA_API void llama_kv_self_seq_div( -# struct llama_context * ctx, -# llama_seq_id seq_id, -# llama_pos p0, -# llama_pos p1, -# int d), -# "Use llama_memory_seq_div() instead"); -@ctypes_function( - "llama_kv_self_seq_div", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ctypes.c_int, - ], - None, -) -def llama_kv_self_seq_div( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - d: Union[ctypes.c_int, int], - /, -): - """Divide sequence positions in KV cache (DEPRECATED)""" - ... 
- - -# // Returns the smallest position present in the KV cache for the specified sequence -# // This is typically non-zero only for SWA caches -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_min() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_min( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the smallest position in KV cache for sequence (DEPRECATED)""" - ... - - -# // Returns the largest position present in the KV cache for the specified sequence -# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache -# // Return -1 if the sequence is empty -# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( -# struct llama_context * ctx, -# llama_seq_id seq_id), -# "Use llama_memory_seq_pos_max() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_max( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the largest position in KV cache for sequence (DEPRECATED)""" - ... - - -# // Defragment the KV cache -# // This will be applied: -# // - lazily on next llama_decode() -# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), -# "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); -@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) -def llama_kv_self_defrag(ctx: llama_context_p, /): - """Defragment the KV cache (DEPRECATED)""" - ... 
- - -# // Check if the context supports KV cache shifting -# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), -# "use llama_memory_can_shift() instead"); -@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting (DEPRECATED)""" - ... - - -# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), -# "simply remove this call, updates are applied lazily on the next llama_decode()"); -@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) -def llama_kv_self_update(ctx: llama_context_p, /): - """Apply the KV cache updates (DEPRECATED)""" - ... - - # // # // State / sessions # // @@ -2914,6 +2993,100 @@ def llama_get_embeddings_seq( ... +# // Get the backend sampled token for the ith token. +# // Returns LLAMA_TOKEN_NULL if no token was sampled. +# LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_token_ith", [llama_context_p_ctypes, ctypes.c_int32], llama_token +) +def llama_get_sampled_token_ith( + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / +) -> int: + """Get the backend sampled token for the ith token.""" + ... + + +# // Get the backend sampled probabilities for the ith token +# LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_probs_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.POINTER(ctypes.c_float), +) +def llama_get_sampled_probs_ith( + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / +) -> Optional[CtypesPointer[ctypes.c_float]]: + """Get the backend sampled probabilities for the ith token.""" + ... 
+ + +# LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_probs_count_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.c_uint32, +) +def llama_get_sampled_probs_count_ith( + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / +) -> int: + """Get the backend sampled probability count for the ith token.""" + ... + + +# // Get the backend sampled logits for the ith token +# LLAMA_API float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_logits_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.POINTER(ctypes.c_float), +) +def llama_get_sampled_logits_ith( + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / +) -> Optional[CtypesPointer[ctypes.c_float]]: + """Get the backend sampled logits for the ith token.""" + ... + + +# LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_logits_count_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.c_uint32, +) +def llama_get_sampled_logits_count_ith( + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / +) -> int: + """Get the backend sampled logit count for the ith token.""" + ... + + +# // Get the backend sampled candidates for the ith token +# LLAMA_API llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_candidates_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.POINTER(llama_token), +) +def llama_get_sampled_candidates_ith( + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / +) -> Optional[CtypesPointer[llama_token]]: + """Get the backend sampled candidates for the ith token.""" + ... 
+ + +# LLAMA_API uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i); +@ctypes_function( + "llama_get_sampled_candidates_count_ith", + [llama_context_p_ctypes, ctypes.c_int32], + ctypes.c_uint32, +) +def llama_get_sampled_candidates_count_ith( + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / +) -> int: + """Get the backend sampled candidate count for the ith token.""" + ... + + # // # // Vocab # // @@ -3558,6 +3731,21 @@ def llama_chat_builtin_templates( llama_sampler_context_t = ctypes.c_void_p +# struct llama_sampler_data { +# struct ggml_tensor * logits; +# struct ggml_tensor * probs; +# struct ggml_tensor * sampled; +# struct ggml_tensor * candidates; +# }; +class llama_sampler_data(ctypes.Structure): + _fields_ = [ + ("logits", ctypes.c_void_p), + ("probs", ctypes.c_void_p), + ("sampled", ctypes.c_void_p), + ("candidates", ctypes.c_void_p), + ] + + # // user code can implement the interface below in order to create custom llama_sampler # struct llama_sampler_i { # const char * (*name) (const struct llama_sampler * smpl); // can be NULL @@ -3598,6 +3786,24 @@ class llama_sampler(ctypes.Structure): llama_sampler_i_reset = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) llama_sampler_i_clone = ctypes.CFUNCTYPE(llama_sampler_p_ctypes, llama_sampler_p_ctypes) llama_sampler_i_free = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) +llama_sampler_i_backend_init = ctypes.CFUNCTYPE( + ctypes.c_bool, llama_sampler_p_ctypes, ctypes.c_void_p +) +llama_sampler_i_backend_accept = ctypes.CFUNCTYPE( + None, + llama_sampler_p_ctypes, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, +) +llama_sampler_i_backend_apply = ctypes.CFUNCTYPE( + None, + llama_sampler_p_ctypes, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.POINTER(llama_sampler_data), +) +llama_sampler_i_backend_set_input = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) llama_sampler_i._fields_ = [ ("name", llama_sampler_i_name), @@ -3606,9 +3812,27 @@ class 
llama_sampler(ctypes.Structure): ("reset", llama_sampler_i_reset), ("clone", llama_sampler_i_clone), ("free", llama_sampler_i_free), + ("backend_init", llama_sampler_i_backend_init), + ("backend_accept", llama_sampler_i_backend_accept), + ("backend_apply", llama_sampler_i_backend_apply), + ("backend_set_input", llama_sampler_i_backend_set_input), ] +# // attach a sampler to the context +# LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl); +@ctypes_function( + "llama_set_sampler", + [llama_context_p_ctypes, llama_seq_id, llama_sampler_p_ctypes], + ctypes.c_bool, +) +def llama_set_sampler( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], smpl: llama_sampler_p, / +) -> bool: + """Attach a sampler to the context.""" + ... + + # // mirror of llama_sampler_i: # LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx); @ctypes_function( @@ -3748,14 +3972,6 @@ def llama_sampler_init_greedy() -> llama_sampler_p: ... def llama_sampler_init_dist(seed: int) -> llama_sampler_p: ... -# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. -# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. -# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), -# "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); -@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) -def llama_sampler_init_softmax() -> llama_sampler_p: ... 
- - # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 # /// Setting k <= 0 makes this a noop # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); @@ -4005,6 +4221,22 @@ def llama_sampler_init_dry( ) -> llama_sampler_p: ... +# LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p( +# float target, +# float decay, +# uint32_t seed); +@ctypes_function( + "llama_sampler_init_adaptive_p", + [ctypes.c_float, ctypes.c_float, ctypes.c_uint32], + llama_sampler_p_ctypes, +) +def llama_sampler_init_adaptive_p( + target: float, decay: float, seed: int, / +) -> llama_sampler_p: + """Initialize an adaptive-p sampler.""" + ... + + # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( # int32_t n_vocab, # int32_t n_logit_bias, @@ -4102,10 +4334,26 @@ def llama_print_system_info() -> bytes: ... # // Set callback for all future logging events. # // If this is not called, or NULL is supplied, everything is output on stderr. +# // The logger state is global so these functions are NOT thread safe. +# LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data); +@ctypes_function( + "llama_log_get", + [ctypes.POINTER(llama_log_callback), ctypes.POINTER(ctypes.c_void_p)], + None, +) +def llama_log_get( + log_callback: CtypesPointerOrRef[llama_log_callback], + user_data: CtypesPointerOrRef[ctypes.c_void_p], + /, +): + """Get the current logging callback and user data.""" + ... + + # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( "llama_log_set", - [ctypes.c_void_p, ctypes.c_void_p], + [llama_log_callback, ctypes.c_void_p], None, ) def llama_log_set( @@ -4214,6 +4462,13 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /): ... def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... 
+# // print a breakdown of per-device memory use via LLAMA_LOG: +@ctypes_function("llama_memory_breakdown_print", [llama_context_p_ctypes], None) +def llama_memory_breakdown_print(ctx: llama_context_p, /): + """Print a breakdown of per-device memory use.""" + ... + + # // # // training # // diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 41753a7f6..787683179 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import warnings from ctypes import ( c_bool, c_char_p, @@ -78,13 +79,31 @@ # Structures class mtmd_context_params(Structure): + if TYPE_CHECKING: + use_gpu: bool + print_timings: bool + n_threads: int + image_marker: Optional[bytes] + media_marker: Optional[bytes] + flash_attn_type: int + warmup: bool + image_min_tokens: int + image_max_tokens: int + cb_eval: llama_cpp.ggml_backend_sched_eval_callback + cb_eval_user_data: c_void_p + _fields_ = [ ("use_gpu", c_bool), ("print_timings", c_bool), ("n_threads", c_int), - ("verbosity", c_int), # ggml_log_level ("image_marker", c_char_p), ("media_marker", c_char_p), + ("flash_attn_type", c_int), + ("warmup", c_bool), + ("image_min_tokens", c_int), + ("image_max_tokens", c_int), + ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback), + ("cb_eval_user_data", c_void_p), ] @@ -132,11 +151,49 @@ def mtmd_init_from_file( def mtmd_free(ctx: mtmd_context_p, /): ... +# MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); +@ctypes_function("mtmd_decode_use_non_causal", [mtmd_context_p_ctypes], c_bool) +def mtmd_decode_use_non_causal(ctx: mtmd_context_p, /) -> bool: + """Check whether MTMD decoding uses non-causal attention.""" + ... + + +# MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); +@ctypes_function("mtmd_decode_use_mrope", [mtmd_context_p_ctypes], c_bool) +def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool: + """Check whether MTMD decoding uses mRoPE.""" + ... 
+ + # MTMD_API bool mtmd_support_vision(mtmd_context * ctx); @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool) def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: ... +# MTMD_API bool mtmd_support_audio(mtmd_context * ctx); +@ctypes_function("mtmd_support_audio", [mtmd_context_p_ctypes], c_bool) +def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool: + """Check whether MTMD supports audio.""" + ... + + +# MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx); +@ctypes_function("mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int) +def mtmd_get_audio_sample_rate(ctx: mtmd_context_p, /) -> int: + """Get the MTMD audio sample rate.""" + ... + + +# Deprecated compatibility wrapper for the renamed mtmd_get_audio_sample_rate(). +def mtmd_get_audio_bitrate(ctx: mtmd_context_p, /) -> int: + warnings.warn( + "mtmd_get_audio_bitrate is deprecated; use mtmd_get_audio_sample_rate instead", + DeprecationWarning, + stacklevel=2, + ) + return mtmd_get_audio_sample_rate(ctx) + + # MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data); @ctypes_function( "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes @@ -149,6 +206,21 @@ def mtmd_bitmap_init( ) -> Optional[mtmd_bitmap_p]: ... +# MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); +@ctypes_function( + "mtmd_bitmap_init_from_audio", + [c_size_t, POINTER(c_float)], + mtmd_bitmap_p_ctypes, +) +def mtmd_bitmap_init_from_audio( + n_samples: Union[c_size_t, int], + data: CtypesArray[c_float], + /, +) -> Optional[mtmd_bitmap_p]: + """Initialize an MTMD bitmap from audio samples.""" + ... + + # MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap); @ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None) def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ... @@ -284,3 +356,25 @@ def mtmd_helper_eval_chunk_single( new_n_past: "_Pointer[llama_cpp.llama_pos]", /, ) -> int: ... 
+ + +# MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); +@ctypes_function( + "mtmd_log_set", + [llama_cpp.llama_log_callback, c_void_p], + None, +) +def mtmd_log_set(log_callback, user_data: c_void_p, /): + """Set the MTMD logging callback.""" + ... + + +# MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data); +@ctypes_function( + "mtmd_helper_log_set", + [llama_cpp.llama_log_callback, c_void_p], + None, +) +def mtmd_helper_log_set(log_callback, user_data: c_void_p, /): + """Set the MTMD helper logging callback.""" + ... diff --git a/tests/test_llama.py b/tests/test_llama.py index 964b0895c..619c7378d 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -83,7 +83,7 @@ def test_real_model(llama_cpp_model_path): cparams.n_threads = multiprocessing.cpu_count() cparams.n_threads_batch = multiprocessing.cpu_count() cparams.logits_all = False - cparams.flash_attn = True + cparams.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED context = internals.LlamaContext(model=model, params=cparams) tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..49bfddeca 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit 49bfddeca18e62fa3d39114a23e9fcbdf8a22388 From e1f8ac0e135201c4a6de7c828331a2f078e9763a Mon Sep 17 00:00:00 2001 From: Bruno Verachten Date: Mon, 23 Mar 2026 03:27:22 +0100 Subject: [PATCH 05/19] ci: add riscv64 wheel builds to release workflow (#2139) * ci: add riscv64 wheel builds to release workflow Add a build_wheels_riscv64 job mirroring the existing arm64 QEMU-based build. Uses cibuildwheel with QEMU emulation for linux/riscv64, targeting CPython 3.10-3.14 on manylinux. 
Closes #2138 * ci: use cibuildwheel 3.1.2 for riscv64 wheels * docs: update changelog for riscv64 wheel PR --------- Co-authored-by: abetlen --- .github/workflows/build-and-release.yaml | 31 +++++++++++++++++++++++- CHANGELOG.md | 1 + 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 7eaf017fb..0121febe8 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -85,6 +85,35 @@ jobs: name: wheels_arm64 path: ./wheelhouse/*.whl + build_wheels_riscv64: + name: Build riscv64 wheels + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: "recursive" + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + with: + platforms: linux/riscv64 + + - name: Build wheels + uses: pypa/cibuildwheel@v3.1.2 + env: + CIBW_SKIP: "*musllinux* pp*" + CIBW_REPAIR_WHEEL_COMMAND: "" + CIBW_ARCHS: "riscv64" + CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*" + with: + output-dir: wheelhouse + + - name: Upload wheels as artifacts + uses: actions/upload-artifact@v4 + with: + name: wheels_riscv64 + path: ./wheelhouse/*.whl + build_sdist: name: Build source distribution runs-on: ubuntu-latest @@ -129,7 +158,7 @@ jobs: release: name: Release - needs: [build_wheels, build_wheels_arm64, build_sdist] + needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist] runs-on: ubuntu-latest steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index e7506eed5..94666cec1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main` - fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub 
Actions and docs by @abetlen in #2149 +- ci: add riscv64 wheel builds to release workflow by @gounthar in #2139 ## [0.3.16] From 11e7a55af76303d42f2bc5a79ed0babbc89652dd Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 22 Mar 2026 22:33:31 -0700 Subject: [PATCH 06/19] fix: Qwen 3.5 support (#2152) * fix: handle Qwen 3.5 hybrid prefix reuse * test: fix Qwen runtime unit mocks * test: drop Qwen runtime unit tests * docs: credit Qwen fix contributors in changelog * docs/tests: update default Qwen model to 3.5 0.8B * test: rebaseline Qwen 3.5 outputs * test: stabilize low-level Qwen sampling check * test: tighten Qwen 3.5 completion prompts --- .github/workflows/test.yaml | 4 ++-- CHANGELOG.md | 1 + README.md | 6 +++--- examples/gradio_chat/local.py | 6 +++--- examples/hf_pull/main.py | 6 +++--- llama_cpp/_internals.py | 4 ++-- llama_cpp/llama.py | 19 +++++++++++++------ tests/test_llama.py | 33 ++++++++++++++++++++------------- 8 files changed, 47 insertions(+), 32 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index af4cacac4..8a6845ff2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -8,8 +8,8 @@ on: - main env: - REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF - MODEL_FILE: qwen2-0_5b-instruct-q8_0.gguf + REPO_ID: lmstudio-community/Qwen3.5-0.8B-GGUF + MODEL_FILE: Qwen3.5-0.8B-Q8_0.gguf jobs: download-model: diff --git a/CHANGELOG.md b/CHANGELOG.md index 94666cec1..4153406c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151 +- fix: Handle Qwen 3.5 hybrid prefix reuse by @codavidgarcia and @r-dh in #2152 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main` - fix(ci): Run macOS CI on supported Apple 
Silicon and Intel runners by @abetlen in #2150 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149 diff --git a/README.md b/README.md index b57c95807..8ba4dbb5e 100644 --- a/README.md +++ b/README.md @@ -322,8 +322,8 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i ```python llm = Llama.from_pretrained( - repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF", - filename="*q8_0.gguf", + repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF", + filename="*Q8_0.gguf", verbose=False ) ``` @@ -685,7 +685,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_ If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub. ```bash -python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf' +python3 -m llama_cpp.server --hf_model_repo_id lmstudio-community/Qwen3.5-0.8B-GGUF --model '*Q8_0.gguf' ``` ### Web Server Features diff --git a/examples/gradio_chat/local.py b/examples/gradio_chat/local.py index e16bf234a..871d8b09b 100644 --- a/examples/gradio_chat/local.py +++ b/examples/gradio_chat/local.py @@ -4,10 +4,10 @@ import gradio as gr llama = llama_cpp.Llama.from_pretrained( - repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", - filename="*q8_0.gguf", + repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF", + filename="*Q8_0.gguf", tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "Qwen/Qwen1.5-0.5B" + "Qwen/Qwen3.5-0.8B" ), verbose=False, ) diff --git a/examples/hf_pull/main.py b/examples/hf_pull/main.py index dfed17516..a9ca424d1 100644 --- a/examples/hf_pull/main.py +++ b/examples/hf_pull/main.py @@ -3,10 +3,10 @@ llama = llama_cpp.Llama.from_pretrained( - repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", - filename="*q8_0.gguf", + repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF", + filename="*Q8_0.gguf", 
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "Qwen/Qwen1.5-0.5B" + "Qwen/Qwen3.5-0.8B" ), verbose=False, ) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index d6258d224..6862135aa 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -291,10 +291,10 @@ def kv_cache_clear(self): assert self.memory is not None, "Memory is not initialized" llama_cpp.llama_memory_clear(self.memory, True) - def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): + def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool: assert self.memory is not None, "Memory is not initialized" seq_id = seq_id if seq_id >= 0 else 0 - llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1) + return llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1) def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): assert self.memory is not None, "Memory is not initialized" diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1609ad16b..88bc2e5bb 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -891,13 +891,20 @@ def generate( else: break if longest_prefix > 0: - reset = False - tokens = tokens[longest_prefix:] - self.n_tokens = longest_prefix - if self.verbose: + if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1): + reset = False + tokens = tokens[longest_prefix:] + self.n_tokens = longest_prefix + if self.verbose: + print( + f"Llama.generate: {longest_prefix} prefix-match hit, " + f"remaining {len(tokens)} prompt tokens to eval", + file=sys.stderr, + ) + elif self.verbose: print( - f"Llama.generate: {longest_prefix} prefix-match hit, " - f"remaining {len(tokens)} prompt tokens to eval", + f"Llama.generate: {longest_prefix} prefix-match found " + f"but partial kv removal not supported, re-evaluating full prompt", file=sys.stderr, ) diff --git a/tests/test_llama.py b/tests/test_llama.py index 619c7378d..1a70c74d4 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -58,8 +58,8 @@ def 
test_llama_cpp_tokenization(): @pytest.fixture def llama_cpp_model_path(): - repo_id = "Qwen/Qwen2-0.5B-Instruct-GGUF" - filename = "qwen2-0_5b-instruct-q8_0.gguf" + repo_id = "lmstudio-community/Qwen3.5-0.8B-GGUF" + filename = "Qwen3.5-0.8B-Q8_0.gguf" model_path = hf_hub_download(repo_id, filename) return model_path @@ -88,9 +88,14 @@ def test_real_model(llama_cpp_model_path): context = internals.LlamaContext(model=model, params=cparams) tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True) - assert tokens == [9707, 11, 1879, 0] + assert tokens == [9419, 11, 1814, 0] - tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True) + tokens = model.tokenize( + b"The quick brown fox jumps over the lazy dog. The quick brown fox jumps ", + add_bos=True, + special=True, + ) + prompt_token_count = len(tokens) batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1) @@ -111,9 +116,11 @@ def test_real_model(llama_cpp_model_path): tokens = [token_id] result += tokens - output = result[5:] + output = result[prompt_token_count:] output_text = model.detokenize(output, special=True) - assert output_text == b" over the lazy dog" + # Low-level sampling output varies across CPU and Metal backends. + assert len(output) == 4 + assert output_text def test_real_llama(llama_cpp_model_path): @@ -129,14 +136,14 @@ def test_real_llama(llama_cpp_model_path): ) output = model.create_completion( - "The quick brown fox jumps", - max_tokens=4, + "The quick brown fox jumps over the lazy dog. The quick brown fox", + max_tokens=6, top_k=50, top_p=0.9, - temperature=0.8, + temperature=0.0, seed=1337, ) - assert output["choices"][0]["text"] == " over the lazy dog" + assert output["choices"][0]["text"] == " jumps over the lazy dog." 
output = model.create_completion( "The capital of france is paris, 'true' or 'false'?:\n", @@ -181,7 +188,7 @@ def logit_processor_func(input_ids, logits): max_tokens=4, top_k=50, top_p=0.9, - temperature=0.8, + temperature=1.0, grammar=llama_cpp.LlamaGrammar.from_string(""" root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" """), @@ -193,7 +200,7 @@ def logit_processor_func(input_ids, logits): max_tokens=4, top_k=50, top_p=0.9, - temperature=0.8, + temperature=1.0, grammar=llama_cpp.LlamaGrammar.from_string(""" root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" """), @@ -207,7 +214,7 @@ def logit_processor_func(input_ids, logits): max_tokens=4, top_k=50, top_p=0.9, - temperature=0.8, + temperature=1.0, grammar=llama_cpp.LlamaGrammar.from_string(""" root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" """), From a6b180724a641750aeaefa51692fe12ee8c4d54f Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 22 Mar 2026 23:28:55 -0700 Subject: [PATCH 07/19] chore: Bump version (#2153) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4153406c1..d1195cc2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.17] + - feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151 - fix: Handle Qwen 3.5 hybrid prefix reuse by @codavidgarcia and @r-dh in #2152 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main` diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..a7c40478b 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.3.17" From 
f0391c5ea7159b4c4916d9f4aced2f982adbd1f4 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 24 Mar 2026 00:59:19 -0700 Subject: [PATCH 08/19] fix(ci): release wheel workflow (#2154) * fix(ci): harden release wheel workflow * fix(ci): document and pin release wheel baselines * fix(ci): speed up release arch builds * fix(ci): split riscv64 by python version * fix(ci): sanitize riscv64 artifact names --- .github/workflows/build-and-release.yaml | 49 +++++++++++++++++++----- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 0121febe8..3a9e6f369 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -46,6 +46,13 @@ jobs: env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" + # Skip cibuildwheel's default i686 sidecar and keep Linux release + # wheels on a portable x86_64 CPU baseline. + CIBW_ARCHS_LINUX: "auto64" + CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off" + # Keep macOS release wheels on a portable CPU baseline instead of + # inheriting the hosted runner's native flags. + CIBW_ENVIRONMENT_MACOS: CMAKE_ARGS="-DGGML_NATIVE=off" with: package-dir: . output-dir: wheelhouse @@ -57,24 +64,21 @@ jobs: build_wheels_arm64: name: Build arm64 wheels - runs-on: ubuntu-latest + runs-on: ubuntu-24.04-arm steps: - uses: actions/checkout@v4 with: submodules: "recursive" - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Build wheels uses: pypa/cibuildwheel@v2.22.0 env: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_ARCHS: "aarch64" - CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON" + # Keep native arm64 builds on a portable CPU baseline instead of + # tuning wheels to the hosted runner. 
+ CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off" CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" with: output-dir: wheelhouse @@ -86,8 +90,27 @@ jobs: path: ./wheelhouse/*.whl build_wheels_riscv64: - name: Build riscv64 wheels + name: Build riscv64 wheels (${{ matrix.shard.name }}) runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + shard: + - name: cp310 + build: "cp310-*" + artifact: wheels_riscv64_cp310 + - name: cp311 + build: "cp311-*" + artifact: wheels_riscv64_cp311 + - name: cp312 + build: "cp312-*" + artifact: wheels_riscv64_cp312 + - name: cp313 + build: "cp313-*" + artifact: wheels_riscv64_cp313 + - name: cp314 + build: "cp314-*" + artifact: wheels_riscv64_cp314 steps: - uses: actions/checkout@v4 with: @@ -104,14 +127,19 @@ jobs: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_ARCHS: "riscv64" - CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*" + # Build riscv64 wheels against a conservative baseline instead of + # enabling RVV-related extensions from the build container. + CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off" + # Split the emulated riscv64 build into one Python version per job + # to minimize wall-clock time without changing the release artifacts. 
+ CIBW_BUILD: ${{ matrix.shard.build }} with: output-dir: wheelhouse - name: Upload wheels as artifacts uses: actions/upload-artifact@v4 with: - name: wheels_riscv64 + name: ${{ matrix.shard.artifact }} path: ./wheelhouse/*.whl build_sdist: @@ -159,6 +187,7 @@ jobs: release: name: Release needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist] + if: startsWith(github.ref, 'refs/tags/') runs-on: ubuntu-latest steps: From 909ebf1246a52c15ebc95460c7e5957e3b64711e Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 24 Mar 2026 01:00:50 -0700 Subject: [PATCH 09/19] fix(ci): cuda wheel workflow (#2155) * fix(ci): harden cuda wheel workflow * fix(ci): pin cuda toolkit versions accurately * fix(ci): resolve exact cuda toolkit installs * fix(ci): align cuda toolkit roots and tags * fix(ci): pin cuda packages to nvidia label * fix(ci): allow cuda solver to mix non-cuda deps --- .github/workflows/build-wheels-cuda.yaml | 57 +++++++++++++++++++++--- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 07b30cfc0..b8d6c9dce 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -99,21 +99,63 @@ jobs: MAMBA_NO_LOW_SPEED_LIMIT: "1" run: | $cudaVersion = $env:CUDAVER - mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion + $cudaChannel = "nvidia/label/cuda-$cudaVersion" + if ($IsLinux) { + # Keep nvcc, cudart, and headers on the same NVIDIA label so the + # detected toolkit version matches the published wheel tag. 
+ mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" "$cudaChannel::cuda-nvcc_linux-64=$cudaVersion" "$cudaChannel::cuda-cudart" "$cudaChannel::cuda-cudart-dev" + } else { + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" + } + if ($LASTEXITCODE -ne 0) { + exit $LASTEXITCODE + } python -m pip install build wheel - name: Build Wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') $env:CUDA_PATH = $env:CONDA_PREFIX $env:CUDA_HOME = $env:CONDA_PREFIX $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX + $cudaHostCompilerArg = '' + $env:CMAKE_ARGS = '' if ($IsLinux) { - $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH + if (Test-Path '/usr/bin/g++-12') { + $env:CC = '/usr/bin/gcc-12' + $env:CXX = '/usr/bin/g++-12' + $env:CUDAHOSTCXX = '/usr/bin/g++-12' + $cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX" + } + if (Test-Path (Join-Path $env:CONDA_PREFIX 'include/cuda_runtime.h')) { + $env:CUDAToolkit_ROOT = $env:CONDA_PREFIX + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX + $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$env:CONDA_PREFIX -DCUDA_TOOLKIT_ROOT_DIR=$env:CONDA_PREFIX$cudaHostCompilerArg" + $env:CPATH = "$env:CONDA_PREFIX/include:$env:CPATH" + $env:CPLUS_INCLUDE_PATH = "$env:CONDA_PREFIX/include:$env:CPLUS_INCLUDE_PATH" + $env:LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LIBRARY_PATH" + $env:LD_LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LD_LIBRARY_PATH" + } else { + $env:CMAKE_ARGS = $cudaHostCompilerArg.Trim() + } + } + $nvccPath = Join-Path $env:CONDA_PREFIX 'bin/nvcc' + if (-not (Test-Path $nvccPath)) { + $nvccPath = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc' + } + if (-not (Test-Path $nvccPath)) { + throw 'Failed to find nvcc in the conda environment' + } + $env:CUDACXX = $nvccPath + $env:PATH = "$(Split-Path 
$nvccPath):$env:PATH" + $nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value + if (-not $nvccVersion) { + throw 'Failed to detect the installed CUDA toolkit version' } + $cudaTagVersion = $nvccVersion.Replace('.','') $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" + # Keep a portable SM set, including sm_70, instead of CMake's `all`, + # which now pulls in future targets the hosted-runner toolchains cannot assemble. + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS" # if ($env:AVXVER -eq 'AVX') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' # } @@ -124,10 +166,11 @@ jobs: # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' # } python -m build --wheel - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + # Publish tags that reflect the actual installed toolkit version. 
+ Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV - uses: softprops/action-gh-release@v2 + if: startsWith(github.ref, 'refs/tags/') with: files: dist/* # Set tag_name to -cu From ccc6bc0454b2d73431a419620aad92fda1aba162 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 24 Mar 2026 01:02:14 -0700 Subject: [PATCH 10/19] fix(ci): docker build workflow (#2156) * fix(ci): harden docker build workflow * docs: update changelog for ci workflows --- .github/workflows/build-docker.yaml | 11 ++++++++++- CHANGELOG.md | 4 ++++ docker/simple/Dockerfile | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index b290f6273..c65695847 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -16,6 +16,15 @@ jobs: with: submodules: "recursive" + - name: Set image tag + run: | + if [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then + image_tag="${GITHUB_REF_NAME}" + else + image_tag="${GITHUB_REF_NAME//\//-}" + fi + echo "IMAGE_TAG=$image_tag" >> "$GITHUB_ENV" + - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -40,7 +49,7 @@ jobs: platforms: linux/amd64,linux/arm64 tags: | ghcr.io/abetlen/llama-cpp-python:latest - ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }} + ghcr.io/abetlen/llama-cpp-python:${{ env.IMAGE_TAG }} build-args: | BUILDKIT_INLINE_CACHE=1 diff --git a/CHANGELOG.md b/CHANGELOG.md index d1195cc2a..b47613109 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156 +- fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155 +- fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154 + ## [0.3.17] - feat: Update 
llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151 diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index 06483d44e..bad4f456f 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -6,6 +6,7 @@ FROM ${IMAGE} # Re-declare the ARG after FROM ARG IMAGE +ARG CMAKE_ARGS="-DGGML_NATIVE=off" # Update and upgrade the existing packages RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ @@ -26,7 +27,7 @@ RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context -RUN pip install llama-cpp-python --verbose; +RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose # Set environment variable for the host ENV HOST=0.0.0.0 From 7b38c3122d2ff3ad23e1502de045807836ced4a7 Mon Sep 17 00:00:00 2001 From: Victor Biederbeck Date: Tue, 24 Mar 2026 02:50:15 -0700 Subject: [PATCH 11/19] feat: expose attention_type parameter in Llama.__init__ (#2143) * feat: expose attention_type parameter in Llama.__init__ * docs: preserve attention_type in pickled state * docs: update changelog for attention_type --------- Co-authored-by: Victor Biederbeck Co-authored-by: abetlen --- CHANGELOG.md | 1 + llama_cpp/llama.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b47613109..de4f070ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143 - fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156 - fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155 - fix(ci): Speed up release wheel 
builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154 diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 88bc2e5bb..ad484c4d5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -81,6 +81,7 @@ def __init__( int ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, + attention_type: int = llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED, rope_freq_base: float = 0.0, rope_freq_scale: float = 0.0, yarn_ext_factor: float = -1.0, @@ -163,6 +164,7 @@ def __init__( n_threads_batch: Number of threads to use for batch processing rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054 pooling_type: Pooling type, from `enum llama_pooling_type`. + attention_type: Attention type, from `enum llama_attention_type`. rope_freq_base: RoPE base frequency, 0 = from model rope_freq_scale: RoPE frequency scaling factor, 0 = from model yarn_ext_factor: YaRN extrapolation mix factor, negative = from model @@ -319,6 +321,7 @@ def __init__( else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ) self.context_params.pooling_type = pooling_type + self.context_params.attention_type = attention_type self.context_params.rope_freq_base = ( rope_freq_base if rope_freq_base != 0.0 else 0 ) @@ -2100,6 +2103,7 @@ def __getstate__(self): n_threads_batch=self.context_params.n_threads_batch, rope_scaling_type=self.context_params.rope_scaling_type, pooling_type=self.context_params.pooling_type, + attention_type=self.context_params.attention_type, rope_freq_base=self.context_params.rope_freq_base, rope_freq_scale=self.context_params.rope_freq_scale, yarn_ext_factor=self.context_params.yarn_ext_factor, From d6f46a50d6b4cda10460c05e2acdbaec74428c1b Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 24 Mar 2026 02:56:01 -0700 Subject: [PATCH 12/19] chore: bump version (#2157) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 
insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de4f070ff..4118f4848 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.18] + - feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143 - fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156 - fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index a7c40478b..bdaefb9e0 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.17" +__version__ = "0.3.18" From 5f9c231ce165126f38c8897fd760ecd7ef79f9fd Mon Sep 17 00:00:00 2001 From: Andrei Date: Wed, 25 Mar 2026 01:56:18 -0700 Subject: [PATCH 13/19] fix(ci): reduce CUDA binary wheel size only including cubins for current arches and one PTX target for forward compatibility (#2158) * fix(ci): shrink CUDA wheel fatbins * docs: update changelog for cuda wheel size fix --- .github/workflows/build-wheels-cuda.yaml | 7 ++++--- CHANGELOG.md | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index b8d6c9dce..17daaa12a 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -153,9 +153,10 @@ jobs: } $cudaTagVersion = $nvccVersion.Replace('.','') $env:VERBOSE = '1' - # Keep a portable SM set, including sm_70, instead of CMake's `all`, - # which now pulls in future targets the hosted-runner toolchains cannot assemble. 
- $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS" + # Build real cubins for the supported GPUs, including sm_70, and keep + # one forward-compatible PTX target instead of embedding PTX for every + # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit. + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS" # if ($env:AVXVER -eq 'AVX') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' # } diff --git a/CHANGELOG.md b/CHANGELOG.md index 4118f4848..f4a0b55d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158 + ## [0.3.18] - feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143 From ac59e5a5ae8d331d80f30d3ddfc50195061637f5 Mon Sep 17 00:00:00 2001 From: Andrei Date: Wed, 25 Mar 2026 15:03:57 -0700 Subject: [PATCH 14/19] fix: handle embedding models without KV memory (#2160) * Fix embedding models without KV memory * Add changelog entry for embedding memory fix --- CHANGELOG.md | 1 + llama_cpp/_internals.py | 4 +++- tests/test_llama.py | 16 ++++++++++++---- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4a0b55d3..d2e4937c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix: Handle embedding models without KV memory and test embeddings with a real GGUF embedding model by @abetlen in #2160 - fix(ci): Shrink CUDA 
wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158 ## [0.3.18] diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 6862135aa..9e9bcd407 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -288,7 +288,9 @@ def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) def kv_cache_clear(self): - assert self.memory is not None, "Memory is not initialized" + # Embedding models with non-causal attention may not allocate memory. + if self.memory is None: + return llama_cpp.llama_memory_clear(self.memory, True) def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool: diff --git a/tests/test_llama.py b/tests/test_llama.py index 1a70c74d4..23928fff6 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -64,6 +64,14 @@ def llama_cpp_model_path(): return model_path +@pytest.fixture +def llama_cpp_embedding_model_path(): + repo_id = "CompendiumLabs/bge-small-en-v1.5-gguf" + filename = "bge-small-en-v1.5-q4_k_m.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + + def test_real_model(llama_cpp_model_path): import os @@ -225,9 +233,9 @@ def logit_processor_func(input_ids, logits): assert number_1 == number_3 -def test_real_llama_embeddings(llama_cpp_model_path): +def test_real_llama_embeddings(llama_cpp_embedding_model_path): model = llama_cpp.Llama( - llama_cpp_model_path, + llama_cpp_embedding_model_path, n_ctx=32, n_batch=32, n_ubatch=32, @@ -237,5 +245,5 @@ def test_real_llama_embeddings(llama_cpp_model_path): flash_attn=True, embedding=True, ) - # Smoke test for now - model.embed("Hello World") + embedding = model.embed("Hello World") + assert len(embedding) > 0 From c670222c8379608aa22ac81e5f6a813620187a26 Mon Sep 17 00:00:00 2001 From: Andrei Date: Wed, 25 Mar 2026 15:21:44 -0700 Subject: [PATCH 15/19] feat: Update llama.cpp to ggerganov/llama.cpp@c0159f9c1f874da15e94f371d136f5920b4b5335 (#2161) * Update llama.cpp to c0159f9c1 * Add changelog 
entry for llama.cpp update --- CHANGELOG.md | 1 + llama_cpp/llama_cpp.py | 16 ++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2e4937c9..03240454f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Update llama.cpp to ggerganov/llama.cpp@c0159f9c1f874da15e94f371d136f5920b4b5335 by @abetlen in #2161 - fix: Handle embedding models without KV memory and test embeddings with a real GGUF embedding model by @abetlen in #2160 - fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158 diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e51492c56..5a6c06b07 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1314,6 +1314,22 @@ def llama_model_load_from_splits( ... +# // Load a model from an open FILE pointer +# LLAMA_API struct llama_model * llama_model_load_from_file_ptr( +# FILE * file, +# struct llama_model_params params); +@ctypes_function( + "llama_model_load_from_file_ptr", + [ctypes.c_void_p, llama_model_params], + llama_model_p_ctypes, +) +def llama_model_load_from_file_ptr( + file: ctypes.c_void_p, params: llama_model_params, / +) -> Optional[llama_model_p]: + """Load a model from an open FILE pointer.""" + ... 
+ + # LLAMA_API void llama_model_save_to_file( # const struct llama_model * model, # const char * path_model); diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 49bfddeca..c0159f9c1 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 49bfddeca18e62fa3d39114a23e9fcbdf8a22388 +Subproject commit c0159f9c1f874da15e94f371d136f5920b4b5335 From f54421ba71db942b262a28762bc9e035a5d4d349 Mon Sep 17 00:00:00 2001 From: Andrei Date: Wed, 25 Mar 2026 15:28:12 -0700 Subject: [PATCH 16/19] Bump version to 0.3.19 (#2162) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03240454f..d1efdc5e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.19] + - feat: Update llama.cpp to ggerganov/llama.cpp@c0159f9c1f874da15e94f371d136f5920b4b5335 by @abetlen in #2161 - fix: Handle embedding models without KV memory and test embeddings with a real GGUF embedding model by @abetlen in #2160 - fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index bdaefb9e0..72388c4e5 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.18" +__version__ = "0.3.19" From fcd932a1574b7b6fdfc6c2d652f10f3af66995b5 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 28 Mar 2026 21:36:13 -0700 Subject: [PATCH 17/19] fix(ci): publish distinct manylinux and musllinux cpu wheels (#2165) * fix(ci): publish distinct manylinux and musllinux cpu wheels * docs: add changelog entry for linux wheel repair fix --- .github/workflows/build-and-release.yaml | 5 ++++- CHANGELOG.md | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 3a9e6f369..6cbac0cb1 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -44,8 +44,11 @@ jobs: - name: Build wheels uses: pypa/cibuildwheel@v2.22.0 env: - # disable repair + # Keep repair disabled by default for non-Linux platforms in this job. CIBW_REPAIR_WHEEL_COMMAND: "" + # Linux needs auditwheel repair so manylinux and musllinux wheels are + # published with distinct platform tags instead of generic linux tags. + CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair -w {dest_dir} {wheel}" # Skip cibuildwheel's default i686 sidecar and keep Linux release # wheels on a portable x86_64 CPU baseline. CIBW_ARCHS_LINUX: "auto64" diff --git a/CHANGELOG.md b/CHANGELOG.md index d1efdc5e9..7bcad0a47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165 + ## [0.3.19] - feat: Update llama.cpp to ggerganov/llama.cpp@c0159f9c1f874da15e94f371d136f5920b4b5335 by @abetlen in #2161 From 7613aca61259820ab550626384af52eed56a731f Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 29 Mar 2026 00:31:03 -0700 Subject: [PATCH 18/19] ci: publish release wheels as py3-none (#2166) * ci: publish CPU wheels as py3-none * docs: add changelog entry for py3-none wheel tags --- CHANGELOG.md | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bcad0a47..3b4c13ee3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- ci: Publish release wheels as `py3-none` by @Bing-su in #2166 - fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165 ## [0.3.19] diff --git a/pyproject.toml 
b/pyproject.toml index e0b0dc520..b5998dd1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ all = [ [tool.scikit-build] wheel.packages = ["llama_cpp"] +wheel.py-api = "py3" cmake.verbose = true cmake.minimum-version = "3.21" minimum-version = "0.5.1" From 7257ba95fbbf65201fd5bf4b7f0bdd1c701e1345 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 29 Mar 2026 23:17:22 -0700 Subject: [PATCH 19/19] feat(server): add model-load chat_template_kwargs (#2168) --- CHANGELOG.md | 1 + docs/server.md | 25 +++++++++++++++++++++++++ llama_cpp/llama_chat_format.py | 7 ++++++- llama_cpp/server/cli.py | 34 ++++++++++++++++++++++++++++++++-- llama_cpp/server/model.py | 15 +++++++++++++++ llama_cpp/server/settings.py | 6 +++++- 6 files changed, 84 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b4c13ee3..e577324db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat(server): Add model-load `chat_template_kwargs` support and document the CLI/config usage by @abetlen in #2168 - ci: Publish release wheels as `py3-none` by @Bing-su in #2166 - fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165 diff --git a/docs/server.md b/docs/server.md index cd6f86c51..9c09a1f1c 100644 --- a/docs/server.md +++ b/docs/server.md @@ -22,6 +22,15 @@ The server can then be started by running the following command: python3 -m llama_cpp.server --model <model_path> ``` +You can also pass chat-template kwargs at model load time from the CLI: + +```bash +python3 -m llama_cpp.server \ + --model <model_path> \ + --chat_format chatml \ + --chat_template_kwargs '{"enable_thinking": true}' +``` + ### Server options For a full list of options, run: @@ -147,6 +156,22 @@ The server supports routing requests to multiple models based on the `model` par At the moment only a single model is loaded into memory at, the server will automatically load and unload
models as needed. +For a single-model config, `chat_template_kwargs` can be set directly on the model entry: + +```json +{ + "models": [ + { + "model": "models/Qwen3.5-0.8B/qwen3.5-0.8b-q8_0.gguf", + "chat_format": "chatml", + "chat_template_kwargs": { + "enable_thinking": true + } + } + ] +} +``` + ```json { "host": "0.0.0.0", diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index d7910e984..1024fb85b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -243,6 +243,7 @@ def raise_exception(message: str): tools=tools, tool_choice=tool_choice, strftime_now=self.strftime_now, + **kwargs, ) stopping_criteria = None @@ -617,6 +618,7 @@ def chat_completion_handler( function_call=function_call, tools=tools, tool_choice=tool_choice, + **kwargs, ) prompt = llama.tokenize( result.prompt.encode("utf-8"), @@ -734,7 +736,9 @@ def format_autotokenizer( **kwargs: Any, ) -> ChatFormatterResponse: tokenizer.use_default_system_prompt = False # type: ignore - prompt: str = tokenizer.apply_chat_template(messages, tokenize=False) # type: ignore + prompt: str = tokenizer.apply_chat_template( # type: ignore + messages, tokenize=False, **kwargs + ) assert isinstance(prompt, str) # Return formatted prompt and eos token by default return ChatFormatterResponse( @@ -791,6 +795,7 @@ def format_tokenizer_config( messages=messages, bos_token=bos_token, eos_token=eos_token, + **kwargs, ) return ChatFormatterResponse( prompt=prompt, stop=[eos_token, bos_token], added_special=True diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py index 8ed029063..171b8db30 100644 --- a/llama_cpp/server/cli.py +++ b/llama_cpp/server/cli.py @@ -1,8 +1,9 @@ from __future__ import annotations import argparse +import json -from typing import List, Literal, Union, Any, Type, TypeVar +from typing import List, Literal, Union, Any, Type, TypeVar, Dict from pydantic import BaseModel @@ -40,6 +41,17 @@ def _contains_list_type(annotation: Type[Any] | 
None) -> bool: return False +def _contains_dict_type(annotation: Type[Any] | None) -> bool: + origin = getattr(annotation, "__origin__", None) + + if origin is dict or origin is Dict: + return True + elif origin in (Literal, Union): + return any(_contains_dict_type(arg) for arg in annotation.__args__) # type: ignore + else: + return False + + def _parse_bool_arg(arg: str | bytes | bool) -> bool: if isinstance(arg, bytes): arg = arg.decode("utf-8") @@ -57,6 +69,16 @@ def _parse_bool_arg(arg: str | bytes | bool) -> bool: raise ValueError(f"Invalid boolean argument: {arg}") +def _parse_json_object_arg(arg: str | bytes) -> dict[str, Any]: + if isinstance(arg, bytes): + arg = arg.decode("utf-8") + + value = json.loads(arg) + if not isinstance(value, dict): + raise ValueError(f"Invalid JSON object argument: {arg}") + return value + + def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]): """Add arguments from a pydantic model to an argparse parser.""" @@ -68,7 +90,15 @@ def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]) _get_base_type(field.annotation) if field.annotation is not None else str ) list_type = _contains_list_type(field.annotation) - if base_type is not bool: + dict_type = _contains_dict_type(field.annotation) + if dict_type: + parser.add_argument( + f"--{name}", + dest=name, + type=_parse_json_object_arg, + help=description, + ) + elif base_type is not bool: parser.add_argument( f"--{name}", dest=name, diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 9e59e8563..3922ce5df 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -299,6 +299,21 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: # Misc verbose=settings.verbose, ) + if settings.chat_template_kwargs: + base_chat_handler = ( + _model.chat_handler + or _model._chat_handlers.get(_model.chat_format) + or llama_cpp.llama_chat_format.get_chat_completion_handler( + 
_model.chat_format + ) + ) + + def chat_handler_with_kwargs(*args, **kwargs): + return base_chat_handler( + *args, **{**settings.chat_template_kwargs, **kwargs} + ) + + _model.chat_handler = chat_handler_with_kwargs if settings.cache: if settings.cache_type == "disk": if settings.verbose: diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 13c951241..3c2bb7fd0 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -2,7 +2,7 @@ import multiprocessing -from typing import Optional, List, Literal, Union, Dict, cast +from typing import Any, Optional, List, Literal, Union, Dict, cast from typing_extensions import Self from pydantic import Field, model_validator @@ -131,6 +131,10 @@ class ModelSettings(BaseSettings): default=None, description="Chat format to use.", ) + chat_template_kwargs: Optional[Dict[str, Any]] = Field( + default=None, + description="Extra keyword arguments forwarded to chat templates at model load time. Matches llama.cpp server `chat_template_kwargs`.", + ) clip_model_path: Optional[str] = Field( default=None, description="Path to a CLIP model to use for multi-modal chat completion.",