From ca3b00a204d5a48b1c54eb609b20203dcdaa87be Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 22 Mar 2026 15:20:48 -0700
Subject: [PATCH 01/19] fix(ci): Rename `huggingface-cli` to `hf` (#2149)

* Fix model download in test workflow

* Use hf CLI in test workflow

* Use hf CLI name in CI and docs

* Reference PR in changelog
---
 .github/workflows/test.yaml | 2 +-
 CHANGELOG.md                | 2 ++
 README.md                   | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 95f6e5a27..1d2b1983c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -22,7 +22,7 @@ jobs:
       - name: Install huggingface-hub
         run: pip install huggingface-hub
       - name: Download model
-        run: huggingface-cli download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }}
+        run: hf download ${{ env.REPO_ID }} ${{ env.MODEL_FILE }}
       - name: Cache model
         uses: actions/cache@v4
         with:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16954eb88..1f577c1a4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149
+
 ## [0.3.16]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@4227c9be4268ac844921b90f31595f81236bd317
diff --git a/README.md b/README.md
index 382f7cbed..d2ba297ca 100644
--- a/README.md
+++ b/README.md
@@ -328,7 +328,7 @@ llm = Llama.from_pretrained(
 )
 ```
 
-By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory, you can then manage installed model files with the [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool.
+By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory, you can then manage installed model files with the [`hf`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool.
 
 ### Chat Completion
 

From 9f661ff2cf63e72aea328daab15e521230dd20b0 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 22 Mar 2026 16:10:47 -0700
Subject: [PATCH 02/19] fix(ci): Fix macos tests, support both Intel and Apple
 Silicon testing (#2150)

* fix(ci): use supported macos runner label

* fix(ci): add apple silicon macos test coverage

* fix(ci): run standard macos tests on apple silicon

* fix(ci): simplify apple silicon macos install

* fix(ci): disable ggml native on apple silicon runner

* docs: update changelog for macos ci runner fix
---
 .github/workflows/test.yaml | 11 +++++------
 CHANGELOG.md                |  1 +
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1d2b1983c..af4cacac4 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -96,7 +96,7 @@ jobs:
 
   build-macos:
     needs: download-model
-    runs-on: macos-13
+    runs-on: macos-15
     strategy:
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]
@@ -127,17 +127,16 @@ jobs:
         run: |
           python3 -m pip install --upgrade pip
           python3 -m pip install uv
-          python3 -m uv pip install -e .[all] --verbose
-          CMAKE_ARGS="-DLLAMA_METAL=off" python3 -m uv pip install .[all] --verbose
+          CMAKE_ARGS="-DGGML_NATIVE=off" python3 -m uv pip install -e .[all] --verbose
         shell: bash
 
       - name: Test with pytest
         run: |
           python3 -m pytest
 
-  build-macos-metal:
+  build-macos-intel:
     needs: download-model
-    runs-on: macos-13
+    runs-on: macos-15-intel
     steps:
       - uses: actions/checkout@v4
         with:
@@ -163,7 +162,7 @@ jobs:
       - name: Install dependencies
         run: |
           python3 -m pip install --upgrade pip
-          CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose
+          python3 -m pip install .[all] --verbose
         shell: bash
 
       - name: Test with pytest
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f577c1a4..9bc1c9a0a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150
 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149
 
 ## [0.3.16]

From a9b4a067300c89857334195518e0bb9430d1c059 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 22 Mar 2026 16:12:02 -0700
Subject: [PATCH 03/19] misc: Add Ruff formatting (#2148)

* Add Ruff formatting and safe lint baseline

* Update changelog for Ruff setup
---
 .github/workflows/lint.yaml     |  29 ++
 .gitignore                      |   1 +
 CHANGELOG.md                    |   1 +
 Makefile                        |  10 +
 README.md                       |  14 +
 llama_cpp/_ggml.py              |   2 +-
 llama_cpp/_internals.py         |  98 ++++---
 llama_cpp/_logger.py            |   5 +-
 llama_cpp/llama.py              |  22 +-
 llama_cpp/llama_cache.py        |   6 +-
 llama_cpp/llama_chat_format.py  | 192 ++++++++-----
 llama_cpp/llama_cpp.py          | 470 ++++++++++++++------------------
 llama_cpp/llama_grammar.py      |  60 ++--
 llama_cpp/llava_cpp.py          |  28 +-
 llama_cpp/mtmd_cpp.py           |  94 ++++---
 llama_cpp/server/app.py         |   6 +-
 llama_cpp/server/cli.py         |   4 +-
 llama_cpp/server/model.py       |  12 +-
 pyproject.toml                  |  13 +-
 tests/test_llama.py             |  22 +-
 tests/test_llama_chat_format.py |  19 +-
 tests/test_llama_speculative.py |   9 +-
 22 files changed, 607 insertions(+), 510 deletions(-)
 create mode 100644 .github/workflows/lint.yaml

diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
new file mode 100644
index 000000000..8b3e6322d
--- /dev/null
+++ b/.github/workflows/lint.yaml
@@ -0,0 +1,29 @@
+name: Lint
+
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Ruff
+        run: python -m pip install "ruff>=0.15.7"
+
+      - name: Lint with Ruff
+        run: python -m ruff check llama_cpp tests
+
+      - name: Check formatting with Ruff
+        run: python -m ruff format --check llama_cpp tests
diff --git a/.gitignore b/.gitignore
index 9d68dbcd9..ff773c668 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,7 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+.ruff_cache/
 cover/
 
 # Translations
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9bc1c9a0a..7044f44d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main`
 - fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150
 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149
 
diff --git a/Makefile b/Makefile
index 26ddf2c7a..8e6cae2c1 100644
--- a/Makefile
+++ b/Makefile
@@ -67,6 +67,14 @@ deploy.gh-docs:
 test:
 	python3 -m pytest --full-trace -v
 
+lint:
+	python3 -m ruff check llama_cpp tests
+	python3 -m ruff format --check llama_cpp tests
+
+format:
+	python3 -m ruff check --fix llama_cpp tests
+	python3 -m ruff format llama_cpp tests
+
 docker:
 	docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile .
 
@@ -93,5 +101,7 @@ clean:
 	build.sdist \
 	deploy.pypi \
 	deploy.gh-docs \
+	lint \
+	format \
 	docker \
 	clean
diff --git a/README.md b/README.md
index d2ba297ca..b57c95807 100644
--- a/README.md
+++ b/README.md
@@ -752,6 +752,9 @@ pip install --upgrade pip
 # Install with pip
 pip install -e .
 
+# install development tooling (tests, docs, ruff)
+pip install -e '.[dev]'
+
 # if you want to use the fastapi / openapi server
 pip install -e '.[server]'
 
@@ -768,6 +771,17 @@ Now try running the tests
 pytest
 ```
 
+And check formatting / linting before opening a PR:
+
+```bash
+python -m ruff check llama_cpp tests
+python -m ruff format --check llama_cpp tests
+
+# or use the Makefile targets
+make lint
+make format
+```
+
 There's a `Makefile` available with useful targets.
 A typical workflow would look like this:
 
diff --git a/llama_cpp/_ggml.py b/llama_cpp/_ggml.py
index 5bee8a93b..5ece01e03 100644
--- a/llama_cpp/_ggml.py
+++ b/llama_cpp/_ggml.py
@@ -2,6 +2,7 @@
 
 This module provides a minimal interface for working with ggml tensors from llama-cpp-python
 """
+
 import os
 import pathlib
 
@@ -9,4 +10,3 @@
 
 libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
 libggml = ctypes_ext.load_shared_library("ggml", libggml_base_path)
-
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index b5175a7f2..b520b7ea5 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -355,7 +355,9 @@ def get_embeddings_seq(self, seq_id: int):
     # Sampling functions - deprecated, use LlamaSampler instead
 
     def set_rng_seed(self, seed: int):
-        raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "set_rng_seed is deprecated, use LlamaSampler instead"
+        )
 
     def sample_repetition_penalties(
         self,
@@ -366,30 +368,44 @@ def sample_repetition_penalties(
         penalty_freq: float,
         penalty_present: float,
     ):
-        raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_repetition_penalties is deprecated, use LlamaSampler instead"
+        )
 
     def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
-        raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_softmax is deprecated, use LlamaSampler instead"
+        )
 
     def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
-        raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_top_k is deprecated, use LlamaSampler instead"
+        )
 
     def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
-        raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_top_p is deprecated, use LlamaSampler instead"
+        )
 
     def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
-        raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_min_p is deprecated, use LlamaSampler instead"
+        )
 
     def sample_typical(
         self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
     ):
-        raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_typical is deprecated, use LlamaSampler instead"
+        )
 
     def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
         raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead")
 
     def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
-        raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_grammar is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_mirostat(
         self,
@@ -399,7 +415,9 @@ def sample_token_mirostat(
         m: int,
         mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
     ) -> int:
-        raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_mirostat is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_mirostat_v2(
         self,
@@ -408,17 +426,25 @@ def sample_token_mirostat_v2(
         eta: float,
         mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
     ) -> int:
-        raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_mirostat_v2 is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
-        raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_greedy is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
-        raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token is deprecated, use LlamaSampler instead"
+        )
 
     # Grammar
     def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
-        raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "grammar_accept_token is deprecated, use LlamaSampler instead"
+        )
 
     def reset_timings(self):
         llama_cpp.llama_perf_context_reset(self.ctx)
@@ -602,16 +628,16 @@ def sample(
         logits_array: Optional[npt.NDArray[np.single]] = None,
     ):
         # This method is deprecated in favor of using LlamaSampler directly
-        raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "LlamaSamplingContext.sample is deprecated, use LlamaSampler instead"
+        )
 
     def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
         self.prev.append(id)
 
 
 class CustomSampler:
-    def __init__(
-        self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
-    ):
+    def __init__(self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]):
         self.apply_func = apply_func
 
         def apply_wrapper(
@@ -723,20 +749,20 @@ def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
     def add_grammar_lazy_patterns(
-        self, 
-        model: LlamaModel, 
+        self,
+        model: LlamaModel,
         grammar: LlamaGrammar,
         trigger_patterns: List[str],
-        trigger_tokens: List[int]
+        trigger_tokens: List[int],
     ):
         # Convert patterns to C array
         pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))()
         for i, pattern in enumerate(trigger_patterns):
             pattern_ptrs[i] = pattern.encode("utf-8")
-        
+
         # Convert tokens to C array
         token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens)
-        
+
         sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
             model.vocab,
             grammar._grammar.encode("utf-8"),
@@ -744,7 +770,7 @@ def add_grammar_lazy_patterns(
             pattern_ptrs,
             len(trigger_patterns),
             token_array,
-            len(trigger_tokens)
+            len(trigger_tokens),
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
@@ -771,13 +797,13 @@ def add_dry(
         dry_base: float,
         dry_allowed_length: int,
         dry_penalty_last_n: int,
-        seq_breakers: List[str]
+        seq_breakers: List[str],
     ):
         # Convert seq_breakers to C array
         breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))()
         for i, breaker in enumerate(seq_breakers):
             breaker_ptrs[i] = breaker.encode("utf-8")
-        
+
         sampler = llama_cpp.llama_sampler_init_dry(
             model.vocab,
             n_ctx_train,
@@ -786,25 +812,19 @@ def add_dry(
             dry_allowed_length,
             dry_penalty_last_n,
             breaker_ptrs,
-            len(seq_breakers)
+            len(seq_breakers),
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
-    def add_logit_bias(
-        self, 
-        n_vocab: int, 
-        logit_bias: Dict[int, float]
-    ):
+    def add_logit_bias(self, n_vocab: int, logit_bias: Dict[int, float]):
         # Convert logit_bias dict to C array
         bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))()
         for i, (token, bias) in enumerate(logit_bias.items()):
             bias_array[i].token = token
             bias_array[i].bias = bias
-        
+
         sampler = llama_cpp.llama_sampler_init_logit_bias(
-            n_vocab,
-            len(logit_bias),
-            bias_array
+            n_vocab, len(logit_bias), bias_array
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
@@ -838,15 +858,17 @@ def reset(self):
     def clone(self):
         # NOTE: Custom samplers cannot be cloned due to Python callback limitations
         if self.custom_samplers:
-            raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers")
-        
+            raise NotImplementedError(
+                "Cannot clone LlamaSampler that contains custom samplers"
+            )
+
         cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler)
         # Create a new wrapper around the cloned sampler
         new_sampler = LlamaSampler.__new__(LlamaSampler)
         new_sampler.sampler = cloned_sampler
         new_sampler.custom_samplers = []
         new_sampler._exit_stack = ExitStack()
-        
+
         def free_sampler():
             if new_sampler.sampler is not None:
                 llama_cpp.llama_sampler_free(new_sampler.sampler)
diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py
index 787b3f108..31d89d099 100644
--- a/llama_cpp/_logger.py
+++ b/llama_cpp/_logger.py
@@ -25,6 +25,7 @@
 
 _last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0]
 
+
 # typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 @llama_cpp.llama_log_callback
 def llama_log_callback(
@@ -34,7 +35,9 @@ def llama_log_callback(
 ):
     # TODO: Correctly implement continue previous log
     global _last_log_level
-    log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level
+    log_level = (
+        GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level
+    )
     if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]:
         print(text.decode("utf-8"), end="", flush=True, file=sys.stderr)
     _last_log_level = log_level
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 71d94ebd8..21a7430a0 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -934,7 +934,8 @@ def generate(
 
                 sample_idx += 1
                 if stopping_criteria is not None and stopping_criteria(
-                    self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
+                    self._input_ids[:sample_idx],
+                    self._scores[sample_idx - self.n_tokens, :],
                 ):
                     return
                 tokens_or_none = yield token
@@ -1157,9 +1158,9 @@ def _create_completion(
         bos_token_id: int = self.token_bos()
         cls_token_id: int = self._model.token_cls()
         sep_token_id: int = self._model.token_sep()
-        prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix
-        middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix
-        suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix
+        prefix_token_id: int = 0  # self._model.token_prefix() # TODO: Fix
+        middle_token_id: int = 0  # self._model.token_middle() # TODO: Fix
+        suffix_token_id: int = 0  # self._model.token_suffix() # TODO: Fix
         add_space_prefix: bool = (
             self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
         )
@@ -1315,7 +1316,7 @@ def logit_bias_processor(
         if seed is not None:
             self.set_seed(seed)
         else:
-            self.set_seed(random.Random(self._seed).randint(0, 2 ** 32))
+            self.set_seed(random.Random(self._seed).randint(0, 2**32))
 
         finish_reason = "length"
         multibyte_fix = 0
@@ -2056,7 +2057,10 @@ def create_chat_completion_openai_v1(
             stream = kwargs.get("stream", False)  # type: ignore
             assert isinstance(stream, bool)
             if stream:
-                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
+                return (
+                    ChatCompletionChunk(**chunk)
+                    for chunk in self.create_chat_completion(*args, **kwargs)
+                )  # type: ignore
             else:
                 return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
         except ImportError:
@@ -2318,7 +2322,11 @@ def from_pretrained(
         if additional_files:
             for additonal_file_name in additional_files:
                 # find the additional shard file:
-                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+                matching_additional_files = [
+                    file
+                    for file in file_list
+                    if fnmatch.fnmatch(file, additonal_file_name)
+                ]
 
                 if len(matching_additional_files) == 0:
                     raise ValueError(
diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py
index e059e98e1..5220c7933 100644
--- a/llama_cpp/llama_cache.py
+++ b/llama_cpp/llama_cache.py
@@ -52,9 +52,9 @@ class LlamaRAMCache(BaseLlamaCache):
     def __init__(self, capacity_bytes: int = (2 << 30)):
         super().__init__(capacity_bytes)
         self.capacity_bytes = capacity_bytes
-        self.cache_state: OrderedDict[
-            Tuple[int, ...], "llama_cpp.llama.LlamaState"
-        ] = OrderedDict()
+        self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp.llama.LlamaState"] = (
+            OrderedDict()
+        )
 
     @property
     def cache_size(self):
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index f738ab9bb..8e8ac7bb3 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -286,11 +286,15 @@ def _convert_text_completion_logprobs_to_chat(
                     }
                     for top_token, top_logprob in top_logprobs.items()
                 ],
-            } for (token, logprob, top_logprobs) in zip(logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"])
+            }
+            for (token, logprob, top_logprobs) in zip(
+                logprobs["tokens"], logprobs["token_logprobs"], logprobs["top_logprobs"]
+            )
         ],
         "refusal": None,
     }
 
+
 def _convert_text_completion_to_chat(
     completion: llama_types.Completion,
 ) -> llama_types.ChatCompletion:
@@ -307,7 +311,9 @@ def _convert_text_completion_to_chat(
                     "role": "assistant",
                     "content": completion["choices"][0]["text"],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": completion["choices"][0]["finish_reason"],
             }
         ],
@@ -351,7 +357,9 @@ def _convert_text_completion_chunks_to_chat(
                         if chunk["choices"][0]["finish_reason"] is None
                         else {}
                     ),
-                    "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        chunk["choices"][0]["logprobs"]
+                    ),
                     "finish_reason": chunk["choices"][0]["finish_reason"],
                 }
             ],
@@ -368,7 +376,9 @@ def _convert_completion_to_chat(
     llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk]
 ]:
     if stream:
-        chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore
+        chunks: Iterator[llama_types.CreateCompletionStreamResponse] = (
+            completion_or_chunks  # type: ignore
+        )
         return _convert_text_completion_chunks_to_chat(chunks)
     else:
         completion: llama_types.Completion = completion_or_chunks  # type: ignore
@@ -414,7 +424,9 @@ def _convert_completion_to_chat_function(
                             }
                         ],
                     },
-                    "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        completion["choices"][0]["logprobs"]
+                    ),
                     "finish_reason": "tool_calls",
                 }
             ],
@@ -422,7 +434,9 @@ def _convert_completion_to_chat_function(
         }
         return chat_completion
     else:
-        chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore
+        chunks: Iterator[llama_types.CreateCompletionStreamResponse] = (
+            completion_or_chunks  # type: ignore
+        )
 
         def _stream_response_to_function_stream(
             chunks: Iterator[llama_types.CreateCompletionStreamResponse],
@@ -467,7 +481,9 @@ def _stream_response_to_function_stream(
                             {
                                 "index": 0,
                                 "finish_reason": None,
-                                "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                                "logprobs": _convert_text_completion_logprobs_to_chat(
+                                    chunk["choices"][0]["logprobs"]
+                                ),
                                 "delta": {
                                     "role": None,
                                     "content": None,
@@ -504,7 +520,9 @@ def _stream_response_to_function_stream(
                         {
                             "index": 0,
                             "finish_reason": None,
-                            "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                            "logprobs": _convert_text_completion_logprobs_to_chat(
+                                chunk["choices"][0]["logprobs"]
+                            ),
                             "delta": {
                                 "role": None,
                                 "content": None,
@@ -702,7 +720,7 @@ def chat_completion_handler(
 
 
 def hf_autotokenizer_to_chat_formatter(
-    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
 ) -> ChatFormatter:
     # https://huggingface.co/docs/transformers/main/chat_templating
     # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
@@ -727,7 +745,7 @@ def format_autotokenizer(
 
 
 def hf_autotokenizer_to_chat_completion_handler(
-    pretrained_model_name_or_path: Union[str, os.PathLike[str]]
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
 ) -> LlamaChatCompletionHandler:
     chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path)
     return chat_formatter_to_chat_completion_handler(chat_formatter)
@@ -1552,9 +1570,9 @@ def prepare_messages_for_inference(
                 message["name"] = f"functions.{message['name']}"
             # Function call requests by assistant
             if "function_call" in message:
-                message["function_call"][
-                    "name"
-                ] = f"functions.{message['function_call']['name']}"
+                message["function_call"]["name"] = (
+                    f"functions.{message['function_call']['name']}"
+                )
             all_messages.append(message)
 
         all_messages.append(
@@ -1632,7 +1650,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
             logits_processor=logits_processor,
             grammar=grammar,
         )
-        return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream)  # type: ignore
+        return _convert_completion_to_chat(
+            completion_or_completion_chunks, stream=stream
+        )  # type: ignore
 
     if function_call is None or (
         isinstance(function_call, str) and function_call == "auto"
@@ -1748,7 +1768,9 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
                         }
                     ],
                 },
-                "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                "logprobs": _convert_text_completion_logprobs_to_chat(
+                    completion["choices"][0]["logprobs"]
+                ),
                 "finish_reason": "tool_calls",
             }
         ],
@@ -1789,9 +1811,9 @@ def functionary_v1_v2_chat_handler(
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
 
     tokenizer = llama.tokenizer_
-    assert hasattr(
-        tokenizer, "hf_tokenizer"
-    ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
+    assert hasattr(tokenizer, "hf_tokenizer"), (
+        "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
+    )
     from transformers import AutoTokenizer
 
     if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens:
@@ -1941,9 +1963,9 @@ def prepare_messages_for_inference(
                 message["name"] = f"functions.{message['name']}"
             # Function call requests by assistant
             if "function_call" in message:
-                message["function_call"][
-                    "name"
-                ] = f"functions.{message['function_call']['name']}"
+                message["function_call"]["name"] = (
+                    f"functions.{message['function_call']['name']}"
+                )
             all_messages.append(message)
 
         if version == "v1":
@@ -2005,7 +2027,9 @@ def prepare_messages_for_inference(
             completion_or_completion_chunks["choices"][0]["text"] = (
                 completion_or_completion_chunks["choices"][0]["text"].lstrip()
             )
-        return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream)  # type: ignore
+        return _convert_completion_to_chat(
+            completion_or_completion_chunks, stream=stream
+        )  # type: ignore
 
     def get_grammar(function_call):
         function_body = None
@@ -2160,7 +2184,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                         choices=[
                             {
                                 "index": 0,
-                                "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                                "logprobs": _convert_text_completion_logprobs_to_chat(
+                                    chunk["choices"][0]["logprobs"]
+                                ),
                                 "delta": {
                                     "role": None,
                                     "content": None,
@@ -2262,7 +2288,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                         choices=[
                             {
                                 "index": 0,
-                                "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                                "logprobs": _convert_text_completion_logprobs_to_chat(
+                                    chunk["choices"][0]["logprobs"]
+                                ),
                                 "delta": {
                                     "role": "assistant",
                                     "content": None,
@@ -2300,7 +2328,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                                         choices=[
                                             {
                                                 "index": 0,
-                                                "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                                                "logprobs": _convert_text_completion_logprobs_to_chat(
+                                                    chunk["choices"][0]["logprobs"]
+                                                ),
                                                 "delta": {
                                                     "role": "assistant",
                                                     "content": buffer.pop(0),
@@ -2323,7 +2353,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                                 choices=[
                                     {
                                         "index": 0,
-                                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                                            chunk["choices"][0]["logprobs"]
+                                        ),
                                         "delta": {
                                             "role": "assistant",
                                             "content": (
@@ -2409,7 +2441,9 @@ def generate_streaming(tools, functions, function_call, prompt):
                                 choices=[
                                     {
                                         "index": 0,
-                                        "logprobs": _convert_text_completion_logprobs_to_chat(chunk["choices"][0]["logprobs"]),
+                                        "logprobs": _convert_text_completion_logprobs_to_chat(
+                                            chunk["choices"][0]["logprobs"]
+                                        ),
                                         "delta": {
                                             "role": None,
                                             "content": None,
@@ -2643,7 +2677,9 @@ def generate_streaming(tools, functions, function_call, prompt):
             choices=[
                 {
                     "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        completion["choices"][0]["logprobs"]
+                    ),
                     "message": {
                         "role": "assistant",
                         "content": None if content == "" else content,
@@ -2716,20 +2752,20 @@ def _init_mtmd_context(self, llama_model: llama.Llama):
         with suppress_stdout_stderr(disable=self.verbose):
             # Get default parameters
             ctx_params = self._mtmd_cpp.mtmd_context_params_default()
-            ctx_params.use_gpu = True # TODO: Make this configurable
+            ctx_params.use_gpu = True  # TODO: Make this configurable
             ctx_params.print_timings = self.verbose
             ctx_params.n_threads = llama_model.n_threads
             ctx_params.verbosity = 2 if self.verbose else 0  # GGML_LOG_LEVEL_INFO = 2
 
             # Initialize mtmd context
             self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
-                self.clip_model_path.encode(),
-                llama_model.model,
-                ctx_params
+                self.clip_model_path.encode(), llama_model.model, ctx_params
             )
 
             if self.mtmd_ctx is None:
-                raise ValueError(f"Failed to load mtmd context from: {self.clip_model_path}")
+                raise ValueError(
+                    f"Failed to load mtmd context from: {self.clip_model_path}"
+                )
 
             # Check if vision is supported
             if not self._mtmd_cpp.mtmd_support_vision(self.mtmd_ctx):
@@ -2756,12 +2792,12 @@ def _create_bitmap_from_bytes(self, image_bytes: bytes):
             bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf(
                 self.mtmd_ctx,
                 (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
-                len(image_bytes)
+                len(image_bytes),
             )
-            
+
             if bitmap is None:
                 raise ValueError("Failed to create bitmap from image bytes")
-            
+
             return bitmap
 
     def __call__(
@@ -2820,10 +2856,10 @@ def __call__(
             trim_blocks=True,
             lstrip_blocks=True,
         ).from_string(self.CHAT_FORMAT)
-        
+
         # Get the default media marker
-        media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8')
-        
+        media_marker = self._mtmd_cpp.mtmd_default_marker().decode("utf-8")
+
         # Replace image URLs with media markers in the template
         text = template.render(
             messages=messages,
@@ -2831,7 +2867,7 @@ def __call__(
             eos_token=llama.detokenize([llama.token_eos()]),
             bos_token=llama.detokenize([llama.token_bos()]),
         )
-        
+
         # Replace image URLs in text with media markers
         for image_url in image_urls:
             text = text.replace(image_url, media_marker)
@@ -2851,7 +2887,7 @@ def __call__(
 
             # Create input text structure
             input_text = self._mtmd_cpp.mtmd_input_text()
-            input_text.text = text.encode('utf-8')
+            input_text.text = text.encode("utf-8")
             input_text.add_special = True
             input_text.parse_special = True
 
@@ -2862,13 +2898,15 @@ def __call__(
 
             try:
                 # Tokenize text and images together
-                bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps)
+                bitmap_array = (self._mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(
+                    *bitmaps
+                )
                 result = self._mtmd_cpp.mtmd_tokenize(
                     self.mtmd_ctx,
                     chunks,
                     ctypes.byref(input_text),
                     bitmap_array,
-                    len(bitmaps)
+                    len(bitmaps),
                 )
 
                 if result != 0:
@@ -2881,40 +2919,45 @@ def __call__(
                 # Process each chunk
                 n_past = llama_cpp.llama_pos(0)
                 n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)
-                
+
                 for i in range(n_chunks):
                     chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
                     if chunk is None:
                         continue
 
                     chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
-                    
+
                     if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT:
                         # Handle text chunk
                         n_tokens_out = ctypes.c_size_t()
                         tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(
                             chunk, ctypes.byref(n_tokens_out)
                         )
-                        
+
                         if tokens_ptr and n_tokens_out.value > 0:
                             # Convert ctypes array to Python list
                             tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
-                            
+
                             if llama.n_tokens + len(tokens) > llama.n_ctx():
                                 raise ValueError(
                                     f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}"
                                 )
                             llama.eval(tokens)
-                    
-                    elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]:
+
+                    elif chunk_type in [
+                        self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                        self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO,
+                    ]:
                         # Handle image/audio chunk using helper
-                        chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)
-                        
+                        chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(
+                            chunk
+                        )
+
                         if llama.n_tokens + chunk_n_tokens > llama.n_ctx():
                             raise ValueError(
                                 f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}"
                             )
-                        
+
                         new_n_past = llama_cpp.llama_pos(0)
                         result = self._mtmd_cpp.mtmd_helper_eval_chunk_single(
                             self.mtmd_ctx,
@@ -2924,12 +2967,14 @@ def __call__(
                             llama_cpp.llama_seq_id(0),
                             llama.n_batch,
                             False,  # logits_last
-                            ctypes.byref(new_n_past)
+                            ctypes.byref(new_n_past),
                         )
-                        
+
                         if result != 0:
-                            raise ValueError(f"Failed to evaluate chunk: error code {result}")
-                        
+                            raise ValueError(
+                                f"Failed to evaluate chunk: error code {result}"
+                            )
+
                         # Update llama's token count
                         llama.n_tokens = new_n_past.value
 
@@ -3019,7 +3064,7 @@ def __call__(
             grammar=grammar,
             logit_bias=logit_bias,
         )
-        
+
         if tool is not None:
             tool_name = tool["function"]["name"]
             return _convert_completion_to_chat_function(
@@ -3032,10 +3077,12 @@ def _load_image(image_url: str) -> bytes:
         # TODO: Add Pillow support for other image formats beyond (jpg, png)
         if image_url.startswith("data:"):
             import base64
+
             image_bytes = base64.b64decode(image_url.split(",")[1])
             return image_bytes
         else:
             import urllib.request
+
             with urllib.request.urlopen(image_url) as f:
                 image_bytes = f.read()
                 return image_bytes
@@ -3062,6 +3109,7 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]):
     @staticmethod
     def split_text_on_image_urls(text: str, image_urls: List[str]):
         """This method is no longer used in the new implementation."""
+
         def find_first(s: str, substrs: List[str]):
             for i, substr in enumerate(substrs):
                 pos = s.find(substr)
@@ -3443,7 +3491,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
         "{% endif %}"
         "{% endif %}"
         "{% endfor %}"
-
         "{% for content in message['content'] %}"
         "{% if content.type == 'text' %}"
         "{{ content.text }}"
@@ -3465,8 +3512,8 @@ class Qwen25VLChatHandler(Llava15ChatHandler):
     DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."
 
     CHAT_FORMAT = (
-        #"{% set image_count = namespace(value=0) %}"
-        #"{% set video_count = namespace(value=0) %}"
+        # "{% set image_count = namespace(value=0) %}"
+        # "{% set video_count = namespace(value=0) %}"
         "{% for message in messages %}"
         "{% if loop.first and message['role'] != 'system' %}"
         "<|im_start|>system\n"
@@ -3483,7 +3530,7 @@ class Qwen25VLChatHandler(Llava15ChatHandler):
         "{% else %}"
         "{{ content.image_url.url }}"
         "{% endif %}"
-        #"{% set image_count.value = image_count.value + 1 %}"
+        # "{% set image_count.value = image_count.value + 1 %}"
         "{% elif content['type'] == 'text' %}"
         "{{ content['text'] }}"
         "{% endif %}"
@@ -3495,25 +3542,28 @@ class Qwen25VLChatHandler(Llava15ChatHandler):
     )
 
     def __call__(self, **kwargs):
-        llama = kwargs['llama']
+        llama = kwargs["llama"]
 
         # Clear state for multiple runs
         llama.reset()
         llama._ctx.kv_cache_clear()
         llama.n_tokens = 0
 
-        if hasattr(llama, 'input_ids'):
+        if hasattr(llama, "input_ids"):
             llama.input_ids.fill(0)
 
         # Clear any handler state
-        if hasattr(self, '_last_image_embed'):
+        if hasattr(self, "_last_image_embed"):
             self._last_image_embed = None
             self._last_image_hash = None
 
         if self.verbose:
-            messages = kwargs.get('messages', [])
+            messages = kwargs.get("messages", [])
             image_count = len(self.get_image_urls(messages))
-            print(f"Minimal - Cleared state, processing {image_count} images", file=sys.stderr)
+            print(
+                f"Minimal - Cleared state, processing {image_count} images",
+                file=sys.stderr,
+            )
 
         # Use parent implementation
         return super().__call__(**kwargs)
@@ -3636,7 +3686,9 @@ def chatml_function_calling(
     stop = (
         [stop, "<|im_end|>"]
         if isinstance(stop, str)
-        else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]
+        else stop + ["<|im_end|>"]
+        if stop
+        else ["<|im_end|>"]
     )
 
     # Case 1: No tool choice by user
@@ -3738,7 +3790,7 @@ def chatml_function_calling(
     # Case 3: Automatic tool choice
     assert isinstance(tool_choice, str) and tool_choice == "auto"
     function_names = " | ".join(
-        [f'''"functions.{tool['function']['name']}:"''' for tool in tools]
+        [f'''"functions.{tool["function"]["name"]}:"''' for tool in tools]
     )
     initial_gbnf_tool_grammar = (
         """root   ::= functions | "message:"\n"""
@@ -3914,7 +3966,9 @@ def chatml_function_calling(
                 {
                     "finish_reason": "tool_calls",
                     "index": 0,
-                    "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]),
+                    "logprobs": _convert_text_completion_logprobs_to_chat(
+                        completion["choices"][0]["logprobs"]
+                    ),
                     "message": {
                         "role": "assistant",
                         "content": None,
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 711d42a6a..f13af67f3 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -33,7 +33,11 @@
 # Specify the base name of the shared library to load
 _lib_base_name = "llama"
 _override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH")
-_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path)
+_base_path = (
+    pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
+    if _override_base_path is None
+    else pathlib.Path(_override_base_path)
+)
 # Load the library
 _lib = load_shared_library(_lib_base_name, _base_path)
 
@@ -559,6 +563,7 @@ class llama_token_data_array(ctypes.Structure):
 # typedef struct llama_batch {
 #     int32_t n_tokens;
 
+
 #     llama_token  *  token;
 #     float        *  embd;
 #     llama_pos    *  pos;
@@ -688,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
 #     // override key-value pairs of the model meta data
 #     const struct llama_model_kv_override * kv_overrides;
 
+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only;    // only load the vocabulary, no weights
 #     bool use_mmap;      // use mmap if possible
@@ -716,7 +722,9 @@ class llama_model_params(ctypes.Structure):
 
     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p]  # NOTE: unused
-        tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused
+        tensor_buft_overrides: CtypesArray[
+            llama_model_tensor_buft_override
+        ]  # NOTE: unused
         n_gpu_layers: int
         split_mode: int
         main_gpu: int
@@ -731,8 +739,8 @@ class llama_model_params(ctypes.Structure):
         use_extra_bufts: bool
 
     _fields_ = [
-        ("devices", ctypes.c_void_p), # NOTE: unnused
-        ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused
+        ("devices", ctypes.c_void_p),  # NOTE: unnused
+        ("tensor_buft_overrides", ctypes.c_void_p),  # NOTE: unused
         ("n_gpu_layers", ctypes.c_int32),
         ("split_mode", ctypes.c_int),
         ("main_gpu", ctypes.c_int32),
@@ -784,6 +792,7 @@ class llama_model_params(ctypes.Structure):
 #     ggml_abort_callback abort_callback;
 #     void *              abort_callback_data;
 
+
 #     // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
@@ -1137,8 +1146,7 @@ def llama_backend_free():
     [ctypes.c_int],
     None,
 )
-def llama_numa_init(numa: int, /):
-    ...
+def llama_numa_init(numa: int, /): ...
 
 
 # // Optional: an auto threadpool gets created in ggml if not passed explicitly
@@ -1164,8 +1172,7 @@ def llama_numa_init(numa: int, /):
 )
 def llama_load_model_from_file(
     path_model: bytes, params: llama_model_params, /
-) -> Optional[llama_model_p]:
-    ...
+) -> Optional[llama_model_p]: ...
 
 
 # // Load the model from a file
@@ -1230,8 +1237,7 @@ def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /):
     [llama_model_p_ctypes],
     None,
 )
-def llama_free_model(model: llama_model_p, /):
-    ...
+def llama_free_model(model: llama_model_p, /): ...
 
 
 # LLAMA_API void llama_model_free(struct llama_model * model);
@@ -1240,8 +1246,7 @@ def llama_free_model(model: llama_model_p, /):
     [llama_model_p_ctypes],
     None,
 )
-def llama_model_free(model: llama_model_p, /):
-    ...
+def llama_model_free(model: llama_model_p, /): ...
 
 
 # LLAMA_API struct llama_context * llama_init_from_model(
@@ -1254,8 +1259,7 @@ def llama_model_free(model: llama_model_p, /):
 )
 def llama_init_from_model(
     model: llama_model_p, params: llama_context_params, /
-) -> Optional[llama_context_p]:
-    ...
+) -> Optional[llama_context_p]: ...
 
 
 # DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
@@ -1269,8 +1273,7 @@ def llama_init_from_model(
 )
 def llama_new_context_with_model(
     model: llama_model_p, params: llama_context_params, /
-) -> Optional[llama_context_p]:
-    ...
+) -> Optional[llama_context_p]: ...
 
 
 # // Frees all allocated memory
@@ -1291,104 +1294,87 @@ def llama_free(ctx: llama_context_p, /):
     [],
     ctypes.c_int64,
 )
-def llama_time_us() -> int:
-    ...
+def llama_time_us() -> int: ...
 
 
 # LLAMA_API size_t llama_max_devices(void);
 @ctypes_function("llama_max_devices", [], ctypes.c_size_t)
-def llama_max_devices() -> int:
-    ...
+def llama_max_devices() -> int: ...
 
 
 # LLAMA_API size_t llama_max_parallel_sequences(void);
 @ctypes_function("llama_max_parallel_sequences", [], ctypes.c_size_t)
-def llama_max_parallel_sequences() -> int:
-    ...
+def llama_max_parallel_sequences() -> int: ...
 
 
 # LLAMA_API bool llama_supports_mmap       (void);
 @ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
-def llama_supports_mmap() -> bool:
-    ...
+def llama_supports_mmap() -> bool: ...
 
 
 # LLAMA_API bool llama_supports_mlock      (void);
 @ctypes_function("llama_supports_mlock", [], ctypes.c_bool)
-def llama_supports_mlock() -> bool:
-    ...
+def llama_supports_mlock() -> bool: ...
 
 
 # LLAMA_API bool llama_supports_gpu_offload(void);
 @ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool)
-def llama_supports_gpu_offload() -> bool:
-    ...
+def llama_supports_gpu_offload() -> bool: ...
 
 
 # LLAMA_API bool llama_supports_rpc        (void);
 @ctypes_function("llama_supports_rpc", [], ctypes.c_bool)
-def llama_supports_rpc() -> bool:
-    ...
+def llama_supports_rpc() -> bool: ...
 
 
 # LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
 @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_ctx(ctx: llama_context_p, /) -> int:
-    ...
+def llama_n_ctx(ctx: llama_context_p, /) -> int: ...
 
 
 # LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
 @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_batch(ctx: llama_context_p, /) -> int:
-    ...
+def llama_n_batch(ctx: llama_context_p, /) -> int: ...
 
 
 # LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
 @ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_ubatch(ctx: llama_context_p, /) -> int:
-    ...
+def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
 
 
 # LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 @ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_seq_max(ctx: llama_context_p, /) -> int:
-    ...
+def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
 @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_ctx_train(model: llama_model_p, /) -> int:
-    ...
+def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
 @ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_embd(model: llama_model_p, /) -> int:
-    ...
+def llama_n_embd(model: llama_model_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API int32_t llama_n_layer    (const struct llama_model * model), "use llama_model_n_layer instead");
 @ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_layer(model: llama_model_p, /) -> int:
-    ...
+def llama_n_layer(model: llama_model_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API int32_t llama_n_head     (const struct llama_model * model), "use llama_model_n_head instead");
 @ctypes_function("llama_n_head", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_head(model: llama_model_p, /) -> int:
-    ...
+def llama_n_head(model: llama_model_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 @ctypes_function("llama_n_vocab", [llama_vocab_p_ctypes], ctypes.c_int32)
-def llama_n_vocab(model: llama_vocab_p, /) -> int:
-    ...
+def llama_n_vocab(model: llama_vocab_p, /) -> int: ...
 
 
 # LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
 @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
-def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
-    ...
+def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ...
 
 
 # LLAMA_API           llama_memory_t   llama_get_memory  (const struct llama_context * ctx);
@@ -1400,8 +1386,7 @@ def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]:
 
 # LLAMA_API  enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
 @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
-def llama_pooling_type(ctx: llama_context_p, /) -> int:
-    ...
+def llama_pooling_type(ctx: llama_context_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
@@ -1417,57 +1402,50 @@ def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
 
 # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
 @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes)
-def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]:
-    ...
+def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: ...
 
 
 # LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
 @ctypes_function("llama_model_rope_type", [llama_model_p_ctypes], ctypes.c_int)
-def llama_model_rope_type(model: llama_model_p, /) -> int:
-    ...
+def llama_model_rope_type(model: llama_model_p, /) -> int: ...
 
 
 # LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
 @ctypes_function("llama_model_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_ctx_train(model: llama_model_p, /) -> int:
-    ...
+def llama_model_n_ctx_train(model: llama_model_p, /) -> int: ...
 
 
 # LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
 @ctypes_function("llama_model_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_embd(model: llama_model_p, /) -> int:
-    ...
+def llama_model_n_embd(model: llama_model_p, /) -> int: ...
 
 
 # LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
 @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_layer(model: llama_model_p, /) -> int:
-    ...
+def llama_model_n_layer(model: llama_model_p, /) -> int: ...
 
 
 # LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
 @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_head(model: llama_model_p, /) -> int:
-    ...
+def llama_model_n_head(model: llama_model_p, /) -> int: ...
 
 
 # LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
 @ctypes_function("llama_model_n_head_kv", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_head_kv(model: llama_model_p, /) -> int:
-    ...
+def llama_model_n_head_kv(model: llama_model_p, /) -> int: ...
 
 
 # LLAMA_API int32_t llama_model_n_swa      (const struct llama_model * model);
 @ctypes_function("llama_model_n_swa", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_model_n_swa(model: llama_model_p, /) -> int:
-    ...
+def llama_model_n_swa(model: llama_model_p, /) -> int: ...
 
 
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
-@ctypes_function("llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
-def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float:
-    ...
+@ctypes_function(
+    "llama_model_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float
+)
+def llama_model_rope_freq_scale_train(model: llama_model_p, /) -> float: ...
 
 
 # // Returns the number of classifier outputs (only valid for classifier models)
@@ -1481,7 +1459,9 @@ def llama_model_n_cls_out(model: llama_model_p, /) -> int:
 
 # // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
 # LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
-@ctypes_function("llama_model_cls_label", [llama_model_p_ctypes, ctypes.c_uint32], ctypes.c_char_p)
+@ctypes_function(
+    "llama_model_cls_label", [llama_model_p_ctypes, ctypes.c_uint32], ctypes.c_char_p
+)
 def llama_model_cls_label(model: llama_model_p, i: int, /) -> Optional[bytes]:
     """Returns label of classifier output by index. Returns None if no label provided"""
     ...
@@ -1489,14 +1469,12 @@ def llama_model_cls_label(model: llama_model_p, i: int, /) -> Optional[bytes]:
 
 # LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
 @ctypes_function("llama_vocab_type", [llama_vocab_p_ctypes], ctypes.c_int)
-def llama_vocab_type(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_vocab_type(vocab: llama_vocab_p, /) -> int: ...
 
 
 # LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
 @ctypes_function("llama_vocab_n_tokens", [llama_vocab_p_ctypes], ctypes.c_int32)
-def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_vocab_n_tokens(vocab: llama_vocab_p, /) -> int: ...
 
 
 # // Functions to access the model's GGUF metadata scalar values
@@ -1611,8 +1589,14 @@ def llama_model_size(model: llama_model_p, /) -> int:
 # // Get the default chat template. Returns nullptr if not available
 # // If name is NULL, returns the default chat template
 # LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
-@ctypes_function("llama_model_chat_template", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_char_p)
-def llama_model_chat_template(model: llama_model_p, name: Optional[bytes], /) -> Optional[bytes]:
+@ctypes_function(
+    "llama_model_chat_template",
+    [llama_model_p_ctypes, ctypes.c_char_p],
+    ctypes.c_char_p,
+)
+def llama_model_chat_template(
+    model: llama_model_p, name: Optional[bytes], /
+) -> Optional[bytes]:
     """Get the default chat template. Returns None if not available
     If name is None, returns the default chat template"""
     ...
@@ -1699,6 +1683,7 @@ def llama_model_quantize(
 # // Adapters
 # //
 
+
 # // Load a LoRA adapter from file
 # LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
 #         struct llama_model * model,
@@ -1710,8 +1695,7 @@ def llama_model_quantize(
 )
 def llama_adapter_lora_init(
     model: llama_model_p, path_lora: bytes, /
-) -> Optional[llama_adapter_lora_p]:
-    ...
+) -> Optional[llama_adapter_lora_p]: ...
 
 
 # // Manually free a LoRA adapter
@@ -1722,8 +1706,7 @@ def llama_adapter_lora_init(
     [llama_adapter_lora_p_ctypes],
     None,
 )
-def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /):
-    ...
+def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ...
 
 
 # // The following functions operate on a llama_context, hence the naming: llama_verb_...
@@ -1825,6 +1808,7 @@ def llama_apply_adapter_cvec(
 # // Memory
 # //
 
+
 # // Clear the memory contents
 # // If data == true, the data buffers will also be cleared together with the metadata
 # LLAMA_API void llama_memory_clear(
@@ -1916,9 +1900,7 @@ def llama_memory_seq_cp(
 # LLAMA_API void llama_memory_seq_keep(
 #         llama_memory_t mem,
 #           llama_seq_id seq_id);
-@ctypes_function(
-    "llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None
-)
+@ctypes_function("llama_memory_seq_keep", [llama_memory_t_ctypes, llama_seq_id], None)
 def llama_memory_seq_keep(mem: llama_memory_t, seq_id: Union[llama_seq_id, int], /):
     """Removes all tokens that do not belong to the specified sequence"""
     ...
@@ -2040,13 +2022,12 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool:
 # // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
 # //
 
+
 # // Returns the number of tokens in the KV cache (slow, use only for debug)
 # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
 # DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
 #            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-@ctypes_function(
-    "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
-)
+@ctypes_function("llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32)
 def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
     """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)"""
     ...
@@ -2055,9 +2036,7 @@ def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
 # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
 # DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
 #            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-@ctypes_function(
-    "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32
-)
+@ctypes_function("llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32)
 def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
     """Returns the number of used KV cells (DEPRECATED)"""
     ...
@@ -2067,9 +2046,7 @@ def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
 # DEPRECATED(LLAMA_API void llama_kv_self_clear(
 #             struct llama_context * ctx),
 #         "Use llama_memory_clear() instead");
-@ctypes_function(
-    "llama_kv_self_clear", [llama_context_p_ctypes], None
-)
+@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
 def llama_kv_self_clear(ctx: llama_context_p, /):
     """Clear the KV cache (DEPRECATED)"""
     ...
@@ -2146,9 +2123,7 @@ def llama_kv_self_seq_cp(
 #         struct llama_context * ctx,
 #                 llama_seq_id   seq_id),
 #         "Use llama_memory_seq_keep() instead");
-@ctypes_function(
-    "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
-)
+@ctypes_function("llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None)
 def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
     """Keep only specified sequence in KV cache (DEPRECATED)"""
     ...
@@ -2292,6 +2267,7 @@ def llama_kv_self_update(ctx: llama_context_p, /):
 # // State / sessions
 # //
 
+
 # // Returns the *actual* size in bytes of the state
 # // (logits, embedding and memory)
 # // Only use when saving the state, not when restoring it, otherwise the size may be too small.
@@ -2420,8 +2396,7 @@ def llama_state_load_file(
     n_token_capacity: Union[ctypes.c_size_t, int],
     n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
     /,
-) -> bool:
-    ...
+) -> bool: ...
 
 
 # LLAMA_API DEPRECATED(bool llama_load_session_file(
@@ -2449,8 +2424,7 @@ def llama_load_session_file(
     n_token_capacity: Union[ctypes.c_size_t, int],
     n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
     /,
-) -> bool:
-    ...
+) -> bool: ...
 
 
 # LLAMA_API bool llama_state_save_file(
@@ -2474,8 +2448,7 @@ def llama_state_save_file(
     tokens: CtypesArray[llama_token],
     n_token_count: Union[ctypes.c_size_t, int],
     /,
-) -> bool:
-    ...
+) -> bool: ...
 
 
 # LLAMA_API DEPRECATED(bool llama_save_session_file(
@@ -2500,8 +2473,7 @@ def llama_save_session_file(
     tokens: CtypesArray[llama_token],
     n_token_count: Union[ctypes.c_size_t, int],
     /,
-) -> bool:
-    ...
+) -> bool: ...
 
 
 # // Get the exact size needed to copy the state of a single sequence
@@ -2599,8 +2571,7 @@ def llama_state_seq_save_file(
     tokens: CtypesArray[llama_token],
     n_token_count: Union[ctypes.c_size_t, int],
     /,
-) -> int:
-    ...
+) -> int: ...
 
 
 # LLAMA_API size_t llama_state_seq_load_file(
@@ -2630,14 +2601,14 @@ def llama_state_seq_load_file(
     n_token_capacity: Union[ctypes.c_size_t, int],
     n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
     /,
-) -> int:
-    ...
+) -> int: ...
 
 
 # //
 # // Decoding
 # //
 
+
 # // Return batch for single sequence of tokens
 # // The sequence ID will be fixed to 0
 # // The position of the tokens will be tracked automatically by llama_decode
@@ -2947,14 +2918,14 @@ def llama_get_embeddings_seq(
 # // Vocab
 # //
 
+
 # LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
 @ctypes_function(
     "llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p
 )
 def llama_vocab_get_text(
     vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> bytes:
-    ...
+) -> bytes: ...
 
 
 # LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
@@ -2963,8 +2934,7 @@ def llama_vocab_get_text(
 )
 def llama_vocab_get_score(
     vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> float:
-    ...
+) -> float: ...
 
 
 # LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
@@ -2973,8 +2943,7 @@ def llama_vocab_get_score(
 )
 def llama_vocab_get_attr(
     vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> int:
-    ...
+) -> int: ...
 
 
 # // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
@@ -3055,8 +3024,7 @@ def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     ctypes.c_bool,
 )
-def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool:
-    ...
+def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool: ...
 
 
 # LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
@@ -3065,8 +3033,7 @@ def llama_vocab_get_add_bos(vocab: llama_vocab_p, /) -> bool:
     [llama_vocab_p_ctypes],
     ctypes.c_bool,
 )
-def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool:
-    ...
+def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool: ...
 
 
 # LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
@@ -3075,8 +3042,7 @@ def llama_vocab_get_add_eos(vocab: llama_vocab_p, /) -> bool:
     [llama_vocab_p_ctypes],
     ctypes.c_bool,
 )
-def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool:
-    ...
+def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool: ...
 
 
 # LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
@@ -3085,8 +3051,7 @@ def llama_vocab_get_add_sep(vocab: llama_vocab_p, /) -> bool:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
 # LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -3095,8 +3060,7 @@ def llama_vocab_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
 # LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
@@ -3105,8 +3069,7 @@ def llama_vocab_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
 # LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
@@ -3115,8 +3078,7 @@ def llama_vocab_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
 # LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
@@ -3125,8 +3087,7 @@ def llama_vocab_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
 # LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
@@ -3135,8 +3096,7 @@ def llama_vocab_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
 # DEPRECATED functions
@@ -3148,8 +3108,7 @@ def llama_vocab_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
 )
 def llama_token_get_text(
     vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> bytes:
-    ...
+) -> bytes: ...
 
 
 # DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
@@ -3160,8 +3119,8 @@ def llama_token_get_text(
 )
 def llama_token_get_score(
     vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> float:
-    ...
+) -> float: ...
+
 
 # DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
 @ctypes_function(
@@ -3171,8 +3130,8 @@ def llama_token_get_score(
 )
 def llama_token_get_attr(
     vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> int:
-    ...
+) -> int: ...
+
 
 # DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
 @ctypes_function(
@@ -3182,8 +3141,8 @@ def llama_token_get_attr(
 )
 def llama_token_is_eog(
     vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> bool:
-    ...
+) -> bool: ...
+
 
 # DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
 @ctypes_function(
@@ -3193,8 +3152,8 @@ def llama_token_is_eog(
 )
 def llama_token_is_control(
     vocab: llama_vocab_p, token: Union[llama_token, int], /
-) -> bool:
-    ...
+) -> bool: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
 @ctypes_function(
@@ -3202,8 +3161,8 @@ def llama_token_is_control(
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_bos(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_token_bos(vocab: llama_vocab_p, /) -> int: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
 @ctypes_function(
@@ -3211,8 +3170,8 @@ def llama_token_bos(vocab: llama_vocab_p, /) -> int:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_eos(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_token_eos(vocab: llama_vocab_p, /) -> int: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
 @ctypes_function(
@@ -3220,8 +3179,8 @@ def llama_token_eos(vocab: llama_vocab_p, /) -> int:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_eot(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_token_eot(vocab: llama_vocab_p, /) -> int: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
 @ctypes_function(
@@ -3229,8 +3188,8 @@ def llama_token_eot(vocab: llama_vocab_p, /) -> int:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_cls(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_token_cls(vocab: llama_vocab_p, /) -> int: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
 @ctypes_function(
@@ -3238,8 +3197,7 @@ def llama_token_cls(vocab: llama_vocab_p, /) -> int:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_sep(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_token_sep(vocab: llama_vocab_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
@@ -3248,8 +3206,7 @@ def llama_token_sep(vocab: llama_vocab_p, /) -> int:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_nl(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_token_nl(vocab: llama_vocab_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
@@ -3258,8 +3215,7 @@ def llama_token_nl(vocab: llama_vocab_p, /) -> int:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_pad(vocab: llama_vocab_p, /) -> int:
-    ...
+def llama_token_pad(vocab: llama_vocab_p, /) -> int: ...
 
 
 # DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
@@ -3268,8 +3224,8 @@ def llama_token_pad(vocab: llama_vocab_p, /) -> int:
     [llama_vocab_p_ctypes],
     ctypes.c_bool,
 )
-def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool:
-    ...
+def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: ...
+
 
 # DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
 @ctypes_function(
@@ -3277,8 +3233,7 @@ def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool:
     [llama_vocab_p_ctypes],
     ctypes.c_bool,
 )
-def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool:
-    ...
+def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool: ...
 
 
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
@@ -3287,8 +3242,8 @@ def llama_add_eos_token(vocab: llama_vocab_p, /) -> bool:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
 @ctypes_function(
@@ -3296,8 +3251,8 @@ def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
 @ctypes_function(
@@ -3305,8 +3260,8 @@ def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
 @ctypes_function(
@@ -3314,8 +3269,8 @@ def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
 @ctypes_function(
@@ -3323,8 +3278,8 @@ def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ...
+
 
 # DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
 @ctypes_function(
@@ -3332,8 +3287,8 @@ def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: ...
+
 
 # // CLS is equivalent to BOS
 # DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
@@ -3343,8 +3298,7 @@ def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token:
     [llama_vocab_p_ctypes],
     llama_token,
 )
-def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token:
-    ...
+def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token: ...
 
 
 # //
@@ -3353,6 +3307,7 @@ def llama_vocab_cls(vocab: llama_vocab_p, /) -> llama_token:
 # // The API is thread-safe.
 # //
 
+
 # /// @details Convert the provided text into tokens.
 # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
 # /// @return Returns the number of tokens on success, no more than n_tokens_max
@@ -3512,6 +3467,7 @@ def llama_detokenize(
 # // Chat templates
 # //
 
+
 # /// Apply chat template. Inspired by hf apply_chat_template() on python.
 # /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
 # /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@@ -3535,9 +3491,9 @@ def llama_detokenize(
         ctypes.c_char_p,  # tmpl
         ctypes.POINTER(llama_chat_message),  # chat
         ctypes.c_size_t,  # n_msg
-        ctypes.c_bool,    # add_ass (added)
+        ctypes.c_bool,  # add_ass (added)
         ctypes.c_char_p,  # buf
-        ctypes.c_int32,   # length
+        ctypes.c_int32,  # length
     ],
     ctypes.c_int32,
 )
@@ -3611,11 +3567,11 @@ def llama_chat_builtin_templates(
 #     struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
 #     void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
 
+
 #     // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
 #     //void (*apply_ggml) (struct llama_sampler * smpl, ...);
 # };
-class llama_sampler_i(ctypes.Structure):
-    ...
+class llama_sampler_i(ctypes.Structure): ...
 
 
 # struct llama_sampler {
@@ -3662,8 +3618,7 @@ class llama_sampler(ctypes.Structure):
 )
 def llama_sampler_init(
     iface: ctypes.POINTER(llama_sampler_i), ctx: llama_sampler_context_t, /
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
@@ -3672,8 +3627,7 @@ def llama_sampler_init(
     [llama_sampler_p_ctypes],
     ctypes.c_char_p,
 )
-def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes:
-    ...
+def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: ...
 
 
 # LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
@@ -3682,8 +3636,7 @@ def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes:
     [llama_sampler_p_ctypes, llama_token],
     None,
 )
-def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /):
-    ...
+def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /): ...
 
 
 # LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -3694,8 +3647,7 @@ def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int],
 )
 def llama_sampler_apply(
     smpl: llama_sampler_p, cur_p: CtypesArray[llama_token_data_array], /
-):
-    ...
+): ...
 
 
 # LLAMA_API void                   llama_sampler_reset (      struct llama_sampler * smpl);
@@ -3704,8 +3656,7 @@ def llama_sampler_apply(
     [llama_sampler_p_ctypes],
     None,
 )
-def llama_sampler_reset(smpl: llama_sampler_p, /):
-    ...
+def llama_sampler_reset(smpl: llama_sampler_p, /): ...
 
 
 # LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
@@ -3714,8 +3665,7 @@ def llama_sampler_reset(smpl: llama_sampler_p, /):
     [llama_sampler_p_ctypes],
     llama_sampler_p_ctypes,
 )
-def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p:
-    ...
+def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: ...
 
 
 # // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
@@ -3725,21 +3675,22 @@ def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p:
     [llama_sampler_p_ctypes],
     None,
 )
-def llama_sampler_free(smpl: llama_sampler_p, /):
-    ...
+def llama_sampler_free(smpl: llama_sampler_p, /): ...
 
 
 # // llama_sampler_chain
 # // a type of llama_sampler that can chain multiple samplers one after another
 
+
 # LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
 @ctypes_function(
     "llama_sampler_chain_init",
     [llama_sampler_chain_params],
     llama_sampler_p_ctypes,
 )
-def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sampler_p:
-    ...
+def llama_sampler_chain_init(
+    params: llama_sampler_chain_params, /
+) -> llama_sampler_p: ...
 
 
 # // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
@@ -3749,8 +3700,7 @@ def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sam
     [llama_sampler_p_ctypes, llama_sampler_p_ctypes],
     None,
 )
-def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /):
-    ...
+def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): ...
 
 
 # LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
@@ -3761,8 +3711,7 @@ def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /):
 )
 def llama_sampler_chain_get(
     chain: llama_sampler_p, i: Union[ctypes.c_int32, int], /
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
@@ -3771,8 +3720,7 @@ def llama_sampler_chain_get(
     [llama_sampler_p_ctypes],
     ctypes.c_int,
 )
-def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int:
-    ...
+def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: ...
 
 
 # // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -3784,22 +3732,20 @@ def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int:
 )
 def llama_sampler_chain_remove(
     chain: llama_sampler_p, i: Union[ctypes.c_int32, int], /
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # // available samplers:
 
+
 # LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
 @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes)
-def llama_sampler_init_greedy() -> llama_sampler_p:
-    ...
+def llama_sampler_init_greedy() -> llama_sampler_p: ...
 
 
 # LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes)
-def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
-    ...
+def llama_sampler_init_dist(seed: int) -> llama_sampler_p: ...
 
 
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
@@ -3807,16 +3753,14 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
 # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
 #     "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
-def llama_sampler_init_softmax() -> llama_sampler_p:
-    ...
+def llama_sampler_init_softmax() -> llama_sampler_p: ...
 
 
 # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # /// Setting k <= 0 makes this a noop
 # LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);
 @ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes)
-def llama_sampler_init_top_k(k: int) -> llama_sampler_p:
-    ...
+def llama_sampler_init_top_k(k: int) -> llama_sampler_p: ...
 
 
 # /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -3826,8 +3770,7 @@ def llama_sampler_init_top_k(k: int) -> llama_sampler_p:
     [ctypes.c_float, ctypes.c_size_t],
     llama_sampler_p_ctypes,
 )
-def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p:
-    ...
+def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: ...
 
 
 # /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
@@ -3837,8 +3780,7 @@ def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p:
     [ctypes.c_float, ctypes.c_size_t],
     llama_sampler_p_ctypes,
 )
-def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p:
-    ...
+def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: ...
 
 
 # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
@@ -3848,15 +3790,13 @@ def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p:
     [ctypes.c_float, ctypes.c_size_t],
     llama_sampler_p_ctypes,
 )
-def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p:
-    ...
+def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: ...
 
 
 # /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
 # LLAMA_API struct llama_sampler * llama_sampler_init_temp       (float   t);
 @ctypes_function("llama_sampler_init_temp", [ctypes.c_float], llama_sampler_p_ctypes)
-def llama_sampler_init_temp(t: float) -> llama_sampler_p:
-    ...
+def llama_sampler_init_temp(t: float) -> llama_sampler_p: ...
 
 
 # /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
@@ -3868,8 +3808,7 @@ def llama_sampler_init_temp(t: float) -> llama_sampler_p:
 )
 def llama_sampler_init_temp_ext(
     t: float, delta: float, exponent: float
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
@@ -3881,8 +3820,7 @@ def llama_sampler_init_temp_ext(
 )
 def llama_sampler_init_xtc(
     p: float, t: float, min_keep: int, seed: int, /
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
@@ -3892,8 +3830,7 @@ def llama_sampler_init_xtc(
     [ctypes.c_float],
     llama_sampler_p_ctypes,
 )
-def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p:
-    ...
+def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p: ...
 
 
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -3910,8 +3847,7 @@ def llama_sampler_init_top_n_sigma(n: float, /) -> llama_sampler_p:
 )
 def llama_sampler_init_mirostat(
     n_vocab: int, seed: int, tau: float, eta: float, m: int, /
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -3926,8 +3862,7 @@ def llama_sampler_init_mirostat(
 )
 def llama_sampler_init_mirostat_v2(
     seed: int, tau: float, eta: float, /
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # /// @details Intializes a GBNF grammar, see grammars/README.md for details.
@@ -3942,8 +3877,7 @@ def llama_sampler_init_mirostat_v2(
 )
 def llama_sampler_init_grammar(
     vocab: llama_vocab_p, grammar_str: bytes, grammar_root: bytes, /
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -3977,8 +3911,7 @@ def llama_sampler_init_grammar_lazy(
     trigger_tokens: CtypesArray[llama_token],
     num_trigger_tokens: int,
     /,
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
@@ -4012,8 +3945,7 @@ def llama_sampler_init_grammar_lazy_patterns(
     trigger_tokens: CtypesArray[llama_token],
     num_trigger_tokens: int,
     /,
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
@@ -4033,8 +3965,7 @@ def llama_sampler_init_penalties(
     penalty_freq: float,
     penalty_present: float,
     /,
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # ///  @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
@@ -4071,8 +4002,7 @@ def llama_sampler_init_dry(
     seq_breakers,
     num_breakers: int,
     /,
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
@@ -4086,8 +4016,7 @@ def llama_sampler_init_dry(
 )
 def llama_sampler_init_logit_bias(
     n_vocab: int, n_logit_bias: int, logit_bias: CtypesArray[llama_logit_bias], /
-) -> llama_sampler_p:
-    ...
+) -> llama_sampler_p: ...
 
 
 # // this sampler is meant to be used for fill-in-the-middle infilling
@@ -4097,8 +4026,7 @@ def llama_sampler_init_logit_bias(
     [llama_vocab_p_ctypes],
     llama_sampler_p_ctypes,
 )
-def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p:
-    ...
+def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p: ...
 
 
 # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
@@ -4108,8 +4036,7 @@ def llama_sampler_init_infill(vocab: llama_vocab_p, /) -> llama_sampler_p:
     [llama_sampler_p_ctypes],
     ctypes.c_uint32,
 )
-def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int:
-    ...
+def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: ...
 
 
 # /// @details Sample and accept a token from the idx-th output of the last evaluation
@@ -4121,14 +4048,14 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int:
 )
 def llama_sampler_sample(
     smpl: llama_sampler_p, ctx: llama_context_p, idx: int, /
-) -> int:
-    ...
+) -> int: ...
 
 
 # //
 # // Model split
 # //
 
+
 # /// @details Build a split GGUF final path for this chunk.
 # LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
 @ctypes_function(
@@ -4170,8 +4097,7 @@ def llama_split_prefix(
 # // Print system information
 # LLAMA_API const char * llama_print_system_info(void);
 @ctypes_function("llama_print_system_info", [], ctypes.c_char_p)
-def llama_print_system_info() -> bytes:
-    ...
+def llama_print_system_info() -> bytes: ...
 
 
 # // Set callback for all future logging events.
@@ -4203,6 +4129,7 @@ def llama_log_set(
 #     double t_p_eval_ms;
 #     double t_eval_ms;
 
+
 #     int32_t n_p_eval;
 #     int32_t n_eval;
 #     int32_t n_reused; // number of times a ggml compute graph had been reused
@@ -4222,6 +4149,7 @@ class llama_perf_context_data(ctypes.Structure):
 # struct llama_perf_sampler_data {
 #     double t_sample_ms;
 
+
 #     int32_t n_sample;
 # };
 class llama_perf_sampler_data(ctypes.Structure):
@@ -4237,8 +4165,7 @@ class llama_perf_sampler_data(ctypes.Structure):
     [llama_context_p_ctypes],
     llama_perf_context_data,
 )
-def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data:
-    ...
+def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: ...
 
 
 # LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
@@ -4247,8 +4174,7 @@ def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data:
     [llama_context_p_ctypes],
     None,
 )
-def llama_perf_context_print(ctx: llama_context_p, /):
-    ...
+def llama_perf_context_print(ctx: llama_context_p, /): ...
 
 
 # LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
@@ -4257,8 +4183,7 @@ def llama_perf_context_print(ctx: llama_context_p, /):
     [llama_context_p_ctypes],
     None,
 )
-def llama_perf_context_reset(ctx: llama_context_p, /):
-    ...
+def llama_perf_context_reset(ctx: llama_context_p, /): ...
 
 
 # // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
@@ -4268,8 +4193,7 @@ def llama_perf_context_reset(ctx: llama_context_p, /):
     [llama_sampler_p_ctypes],
     llama_perf_sampler_data,
 )
-def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data:
-    ...
+def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: ...
 
 
 # LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
@@ -4278,8 +4202,7 @@ def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data:
     [llama_sampler_p_ctypes],
     None,
 )
-def llama_perf_sampler_print(chain: llama_sampler_p, /):
-    ...
+def llama_perf_sampler_print(chain: llama_sampler_p, /): ...
 
 
 # LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
@@ -4288,8 +4211,7 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /):
     [llama_sampler_p_ctypes],
     None,
 )
-def llama_perf_sampler_reset(chain: llama_sampler_p, /):
-    ...
+def llama_perf_sampler_reset(chain: llama_sampler_p, /): ...
 
 
 # //
@@ -4298,7 +4220,10 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /):
 
 # // function that returns whether or not a given tensor contains trainable parameters
 # typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
-llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)
+llama_opt_param_filter = ctypes.CFUNCTYPE(
+    ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p
+)
+
 
 # // always returns true
 # LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
@@ -4307,8 +4232,9 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /):
     [ctypes.c_void_p, ctypes.c_void_p],
     ctypes.c_bool,
 )
-def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /) -> bool:
-    ...
+def llama_opt_param_filter_all(
+    tensor: ctypes.c_void_p, userdata: ctypes.c_void_p, /
+) -> bool: ...
 
 
 # struct llama_opt_params {
@@ -4317,6 +4243,7 @@ def llama_opt_param_filter_all(tensor: ctypes.c_void_p, userdata: ctypes.c_void_
 #     llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
 #     void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
 
+
 #     ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
 #     void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
 # };
@@ -4325,7 +4252,10 @@ class llama_opt_params(ctypes.Structure):
         ("n_ctx_train", ctypes.c_uint32),
         ("param_filter", llama_opt_param_filter),
         ("param_filter_ud", ctypes.c_void_p),
-        ("get_opt_pars", ctypes.c_void_p),  # ggml_opt_get_optimizer_params - not implemented here
+        (
+            "get_opt_pars",
+            ctypes.c_void_p,
+        ),  # ggml_opt_get_optimizer_params - not implemented here
         ("get_opt_pars_ud", ctypes.c_void_p),
     ]
 
@@ -4336,8 +4266,9 @@ class llama_opt_params(ctypes.Structure):
     [llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params],
     None,
 )
-def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /):
-    ...
+def llama_opt_init(
+    lctx: llama_context_p, model: llama_model_p, lopt_params: llama_opt_params, /
+): ...
 
 
 # LLAMA_API void llama_opt_epoch(
@@ -4353,7 +4284,7 @@ def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: lla
     [
         llama_context_p_ctypes,
         ctypes.c_void_p,  # ggml_opt_dataset_t
-        ctypes.c_void_p,  # ggml_opt_result_t  
+        ctypes.c_void_p,  # ggml_opt_result_t
         ctypes.c_void_p,  # ggml_opt_result_t
         ctypes.c_int64,
         ctypes.c_void_p,  # ggml_opt_epoch_callback
@@ -4370,5 +4301,4 @@ def llama_opt_epoch(
     callback_train: ctypes.c_void_p,
     callback_eval: ctypes.c_void_p,
     /,
-):
-    ...
+): ...
diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py
index b95c77ab5..ba34dda83 100644
--- a/llama_cpp/llama_grammar.py
+++ b/llama_cpp/llama_grammar.py
@@ -297,7 +297,7 @@ def opt_repetitions(up_to_n, prefix_with_sep=False):
     if max_items is not None:
         result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
     else:
-        item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
+        item_operator = f"({separator_rule + ' ' if separator_rule else ''}{item_rule})"
 
         if min_items == 0 and separator_rule:
             result = f"({item_rule} {item_operator}*)?"
@@ -450,9 +450,9 @@ def visit(n: dict):
                 ref = n.get("$ref")
                 if ref is not None and ref not in self._refs:
                     if ref.startswith("https://"):
-                        assert (
-                            self._allow_fetch
-                        ), "Fetching remote schemas is not allowed (use --allow-fetch for force)"
+                        assert self._allow_fetch, (
+                            "Fetching remote schemas is not allowed (use --allow-fetch for force)"
+                        )
                         import requests
 
                         frag_split = ref.split("#")
@@ -475,9 +475,9 @@ def visit(n: dict):
                         raise ValueError(f"Unsupported ref {ref}")
 
                     for sel in ref.split("#")[-1].split("/")[1:]:
-                        assert (
-                            target is not None and sel in target
-                        ), f"Error resolving ref {ref}: {sel} not in {target}"
+                        assert target is not None and sel in target, (
+                            f"Error resolving ref {ref}: {sel} not in {target}"
+                        )
                         target = target[sel]
 
                     self._refs[ref] = target
@@ -492,7 +492,7 @@ def visit(n: dict):
     def _generate_union_rule(self, name, alt_schemas):
         return " | ".join(
             (
-                self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}')
+                self.visit(alt_schema, f"{name}{'-' if name else 'alternative-'}{i}")
                 for i, alt_schema in enumerate(alt_schemas)
             )
         )
@@ -510,9 +510,9 @@ def _visit_pattern(self, pattern, name):
         we define sub-rules to keep the output lean.
         """
 
-        assert pattern.startswith("^") and pattern.endswith(
-            "$"
-        ), 'Pattern must start with "^" and end with "$"'
+        assert pattern.startswith("^") and pattern.endswith("$"), (
+            'Pattern must start with "^" and end with "$"'
+        )
         pattern = pattern[1:-1]
         sub_rule_ids = {}
 
@@ -566,15 +566,15 @@ def join_seq():
                 elif c == "(":
                     i += 1
                     if i < length:
-                        assert (
-                            pattern[i] != "?"
-                        ), f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
+                        assert pattern[i] != "?", (
+                            f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
+                        )
                     seq.append((f"({to_rule(transform())})", False))
                 elif c == ")":
                     i += 1
-                    assert (
-                        start > 0 and pattern[start - 1] == "("
-                    ), f"Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}"
+                    assert start > 0 and pattern[start - 1] == "(", (
+                        f"Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}"
+                    )
                     return join_seq()
                 elif c == "[":
                     square_brackets = c
@@ -586,9 +586,9 @@ def join_seq():
                         else:
                             square_brackets += pattern[i]
                             i += 1
-                    assert (
-                        i < length
-                    ), f"Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}"
+                    assert i < length, (
+                        f"Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}"
+                    )
                     square_brackets += "]"
                     i += 1
                     seq.append((square_brackets, False))
@@ -604,9 +604,9 @@ def join_seq():
                     while i < length and pattern[i] != "}":
                         curly_brackets += pattern[i]
                         i += 1
-                    assert (
-                        i < length
-                    ), f"Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}"
+                    assert i < length, (
+                        f"Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}"
+                    )
                     curly_brackets += "}"
                     i += 1
                     nums = [s.strip() for s in curly_brackets[1:-1].split(",")]
@@ -777,13 +777,13 @@ def add_component(comp_schema, is_required):
                     rule_name,
                     '"[" space '
                     + ' "," space '.join(
-                        self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
+                        self.visit(item, f"{name}{'-' if name else ''}tuple-{i}")
                         for i, item in enumerate(items)
                     )
                     + ' "]" space',
                 )
             else:
-                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
+                item_rule_name = self.visit(items, f"{name}{'-' if name else ''}item")
                 min_items = schema.get("minItems", 0)
                 max_items = schema.get("maxItems")
                 return self._add_rule(
@@ -873,17 +873,17 @@ def _build_object_rule(
         prop_kv_rule_names = {}
         for prop_name, prop_schema in properties:
             prop_rule_name = self.visit(
-                prop_schema, f'{name}{"-" if name else ""}{prop_name}'
+                prop_schema, f"{name}{'-' if name else ''}{prop_name}"
             )
             prop_kv_rule_names[prop_name] = self._add_rule(
-                f'{name}{"-" if name else ""}{prop_name}-kv',
+                f"{name}{'-' if name else ''}{prop_name}-kv",
                 rf'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}',
             )
         required_props = [k for k in sorted_props if k in required]
         optional_props = [k for k in sorted_props if k not in required]
 
         if additional_properties == True or isinstance(additional_properties, dict):
-            sub_name = f'{name}{"-" if name else ""}additional'
+            sub_name = f"{name}{'-' if name else ''}additional"
             value_rule = self.visit(
                 {} if additional_properties == True else additional_properties,
                 f"{sub_name}-value",
@@ -908,7 +908,7 @@ def get_recursive_refs(ks, first_is_optional):
                 kv_rule_name = prop_kv_rule_names[k]
                 if k == "*":
                     res = self._add_rule(
-                        f'{name}{"-" if name else ""}additional-kvs',
+                        f"{name}{'-' if name else ''}additional-kvs",
                         f'{kv_rule_name} ( "," space ' + kv_rule_name + " )*",
                     )
                 elif first_is_optional:
@@ -917,7 +917,7 @@ def get_recursive_refs(ks, first_is_optional):
                     res = kv_rule_name
                 if len(rest) > 0:
                     res += " " + self._add_rule(
-                        f'{name}{"-" if name else ""}{k}-rest',
+                        f"{name}{'-' if name else ''}{k}-rest",
                         get_recursive_refs(rest, first_is_optional=True),
                     )
                 return res
diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py
index d9dfaf5fd..3dc96d2f2 100644
--- a/llama_cpp/llava_cpp.py
+++ b/llama_cpp/llava_cpp.py
@@ -36,7 +36,11 @@
 # Specify the base name of the shared library to load
 _libllava_base_name = "llava"
 _libllava_override_path = os.environ.get("LLAVA_CPP_LIB")
-_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path()
+_libllava_base_path = (
+    pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
+    if _libllava_override_path is None
+    else pathlib.Path()
+)
 
 # Load the library
 _libllava = load_shared_library(_libllava_base_name, _libllava_base_path)
@@ -73,8 +77,7 @@ class llava_image_embed(Structure):
 )
 def llava_validate_embed_size(
     ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /
-) -> bool:
-    ...
+) -> bool: ...
 
 
 # /** build an image embed from image file bytes */
@@ -90,8 +93,7 @@ def llava_image_embed_make_with_bytes(
     image_bytes: CtypesArray[c_uint8],
     image_bytes_length: Union[c_int, int],
     /,
-) -> "_Pointer[llava_image_embed]":
-    ...
+) -> "_Pointer[llava_image_embed]": ...
 
 
 # /** build an image embed from a path to an image filename */
@@ -103,15 +105,13 @@ def llava_image_embed_make_with_bytes(
 )
 def llava_image_embed_make_with_filename(
     ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /
-) -> "_Pointer[llava_image_embed]":
-    ...
+) -> "_Pointer[llava_image_embed]": ...
 
 
 # LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 # /** free an embedding made with llava_image_embed_make_* */
 @ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None)
-def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /):
-    ...
+def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): ...
 
 
 # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
@@ -132,8 +132,7 @@ def llava_eval_image_embed(
     n_batch: Union[c_int, int],
     n_past: "_Pointer[c_int]",
     /,
-) -> bool:
-    ...
+) -> bool: ...
 
 
 ################################################
@@ -146,13 +145,10 @@ def llava_eval_image_embed(
 @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes)
 def clip_model_load(
     fname: bytes, verbosity: Union[c_int, int], /
-) -> Optional[clip_ctx_p]:
-    ...
+) -> Optional[clip_ctx_p]: ...
 
 
 # /** free mmproj model */
 # CLIP_API void clip_free(struct clip_ctx * ctx);
 @ctypes_function("clip_free", [clip_ctx_p_ctypes], None)
-def clip_free(ctx: clip_ctx_p, /):
-    ...
-
+def clip_free(ctx: clip_ctx_p, /): ...
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index a45f8f406..41753a7f6 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -39,7 +39,11 @@
 # Specify the base name of the shared library to load
 _libmtmd_base_name = "mtmd"
 _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB")
-_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path()
+_libmtmd_base_path = (
+    pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
+    if _libmtmd_override_path is None
+    else pathlib.Path()
+)
 
 # Load the library
 _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path)
@@ -71,6 +75,7 @@
 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2
 
+
 # Structures
 class mtmd_context_params(Structure):
     _fields_ = [
@@ -82,6 +87,7 @@ class mtmd_context_params(Structure):
         ("media_marker", c_char_p),
     ]
 
+
 class mtmd_input_text(Structure):
     _fields_ = [
         ("text", c_char_p),
@@ -89,19 +95,21 @@ class mtmd_input_text(Structure):
         ("parse_special", c_bool),
     ]
 
+
 ################################################
 # mtmd.h functions
 ################################################
 
+
 # MTMD_API const char * mtmd_default_marker(void);
 @ctypes_function("mtmd_default_marker", [], c_char_p)
-def mtmd_default_marker() -> bytes:
-    ...
+def mtmd_default_marker() -> bytes: ...
+
 
 # MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
 @ctypes_function("mtmd_context_params_default", [], mtmd_context_params)
-def mtmd_context_params_default() -> mtmd_context_params:
-    ...
+def mtmd_context_params_default() -> mtmd_context_params: ...
+
 
 # MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
 #                                             const struct llama_model * text_model,
@@ -109,70 +117,68 @@ def mtmd_context_params_default() -> mtmd_context_params:
 @ctypes_function(
     "mtmd_init_from_file",
     [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params],
-    mtmd_context_p_ctypes
+    mtmd_context_p_ctypes,
 )
 def mtmd_init_from_file(
     mmproj_fname: bytes,
     text_model: llama_cpp.llama_model_p,
     ctx_params: mtmd_context_params,
     /,
-) -> Optional[mtmd_context_p]:
-    ...
+) -> Optional[mtmd_context_p]: ...
+
 
 # MTMD_API void mtmd_free(mtmd_context * ctx);
 @ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None)
-def mtmd_free(ctx: mtmd_context_p, /):
-    ...
+def mtmd_free(ctx: mtmd_context_p, /): ...
+
 
 # MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool)
-def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool:
-    ...
+def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: ...
+
 
 # MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data);
 @ctypes_function(
-    "mtmd_bitmap_init",
-    [c_uint32, c_uint32, POINTER(c_uint8)],
-    mtmd_bitmap_p_ctypes
+    "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes
 )
 def mtmd_bitmap_init(
     nx: Union[c_uint32, int],
     ny: Union[c_uint32, int],
     data: CtypesArray[c_uint8],
     /,
-) -> Optional[mtmd_bitmap_p]:
-    ...
+) -> Optional[mtmd_bitmap_p]: ...
+
 
 # MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap);
 @ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None)
-def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /):
-    ...
+def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ...
+
 
 # MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
 @ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes)
-def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]:
-    ...
+def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]: ...
+
 
 # MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
 @ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None)
-def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /):
-    ...
+def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /): ...
+
 
 # MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
 @ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t)
-def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int:
-    ...
+def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int: ...
+
 
 # MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx);
 @ctypes_function(
     "mtmd_input_chunks_get",
     [mtmd_input_chunks_p_ctypes, c_size_t],
-    mtmd_input_chunk_p_ctypes
+    mtmd_input_chunk_p_ctypes,
 )
 def mtmd_input_chunks_get(
     chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], /
-) -> Optional[mtmd_input_chunk_p]:
-    ...
+) -> Optional[mtmd_input_chunk_p]: ...
+
 
 # MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
 #                                mtmd_input_chunks * output,
@@ -197,52 +203,53 @@ def mtmd_tokenize(
     bitmaps: CtypesArray[mtmd_bitmap_p_ctypes],
     n_bitmaps: Union[c_size_t, int],
     /,
-) -> int:
-    ...
+) -> int: ...
+
 
 # MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk);
 @ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t)
-def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int:
-    ...
+def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int: ...
+
 
 # MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk);
 @ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int)
-def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int:
-    ...
+def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int: ...
+
 
 # MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output);
 @ctypes_function(
     "mtmd_input_chunk_get_tokens_text",
     [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)],
-    POINTER(llama_cpp.llama_token)
+    POINTER(llama_cpp.llama_token),
 )
 def mtmd_input_chunk_get_tokens_text(
     chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", /
-) -> Optional["_Pointer[llama_cpp.llama_token]"]:
-    ...
+) -> Optional["_Pointer[llama_cpp.llama_token]"]: ...
+
 
 ################################################
 # mtmd-helper.h functions
 ################################################
 
+
 # MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
 @ctypes_function(
     "mtmd_helper_bitmap_init_from_buf",
     [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t],
-    mtmd_bitmap_p_ctypes
+    mtmd_bitmap_p_ctypes,
 )
 def mtmd_helper_bitmap_init_from_buf(
     ctx: mtmd_context_p,
     buf: CtypesArray[c_uint8],
     length: Union[c_size_t, int],
     /,
-) -> Optional[mtmd_bitmap_p]:
-    ...
+) -> Optional[mtmd_bitmap_p]: ...
+
 
 # MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
 @ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t)
-def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int:
-    ...
+def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int: ...
+
 
 # MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
 #                                                struct llama_context * lctx,
@@ -276,5 +283,4 @@ def mtmd_helper_eval_chunk_single(
     logits_last: Union[c_bool, bool],
     new_n_past: "_Pointer[llama_cpp.llama_pos]",
     /,
-) -> int:
-    ...
+) -> int: ...
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 5120f2416..f776fe159 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -125,9 +125,9 @@ def create_app(
         server_settings = ServerSettings.model_validate(settings)
         model_settings = [ModelSettings.model_validate(settings)]
 
-    assert (
-        server_settings is not None and model_settings is not None
-    ), "server_settings and model_settings must be provided together"
+    assert server_settings is not None and model_settings is not None, (
+        "server_settings and model_settings must be provided together"
+    )
 
     set_server_settings(server_settings)
     middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))]
diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py
index 3dd007676..8ed029063 100644
--- a/llama_cpp/server/cli.py
+++ b/llama_cpp/server/cli.py
@@ -14,7 +14,9 @@ def _get_base_type(annotation: Type[Any]) -> Type[Any]:
     elif getattr(annotation, "__origin__", None) is Union:
         assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1  # type: ignore
         non_optional_args: List[Type[Any]] = [
-            arg for arg in annotation.__args__ if arg is not type(None)  # type: ignore
+            arg
+            for arg in annotation.__args__
+            if arg is not type(None)  # type: ignore
         ]
         if non_optional_args:
             return _get_base_type(non_optional_args[0])
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 11bd363b5..9e59e8563 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -186,18 +186,18 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
                     clip_model_path=settings.clip_model_path, verbose=settings.verbose
                 )
         elif settings.chat_format == "hf-autotokenizer":
-            assert (
-                settings.hf_pretrained_model_name_or_path is not None
-            ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer"
+            assert settings.hf_pretrained_model_name_or_path is not None, (
+                "hf_pretrained_model_name_or_path must be set for hf-autotokenizer"
+            )
             chat_handler = (
                 llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler(
                     settings.hf_pretrained_model_name_or_path
                 )
             )
         elif settings.chat_format == "hf-tokenizer-config":
-            assert (
-                settings.hf_tokenizer_config_path is not None
-            ), "hf_tokenizer_config_path must be set for hf-tokenizer-config"
+            assert settings.hf_tokenizer_config_path is not None, (
+                "hf_tokenizer_config_path must be set for hf-tokenizer-config"
+            )
             chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler(
                 json.load(open(settings.hf_tokenizer_config_path))
             )
diff --git a/pyproject.toml b/pyproject.toml
index f5ae7b59c..e0b0dc520 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,7 @@ test = [
     "huggingface-hub>=0.23.0"
 ]
 dev = [
-    "black>=23.3.0",
+    "ruff>=0.15.7",
     "twine>=4.0.2",
     "mkdocs>=1.4.3",
     "mkdocstrings[python]>=0.22.0",
@@ -78,5 +78,16 @@ Issues = "https://github.com/abetlen/llama-cpp-python/issues"
 Documentation = "https://llama-cpp-python.readthedocs.io/en/latest/"
 Changelog = "https://llama-cpp-python.readthedocs.io/en/latest/changelog/"
 
+[tool.ruff]
+target-version = "py38"
+line-length = 88
+required-version = ">=0.15.7"
+src = ["llama_cpp", "tests"]
+extend-exclude = ["vendor", "examples/notebooks"]
+
+[tool.ruff.lint]
+select = ["E4", "E7", "E9"]
+ignore = ["E712"]
+
 [tool.pytest.ini_options]
 testpaths = "tests"
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 0a1a9f5ad..964b0895c 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -66,6 +66,7 @@ def llama_cpp_model_path():
 
 def test_real_model(llama_cpp_model_path):
     import os
+
     assert os.path.exists(llama_cpp_model_path)
 
     params = llama_cpp.llama_model_default_params()
@@ -114,6 +115,7 @@ def test_real_model(llama_cpp_model_path):
     output_text = model.detokenize(output, special=True)
     assert output_text == b" over the lazy dog"
 
+
 def test_real_llama(llama_cpp_model_path):
     model = llama_cpp.Llama(
         llama_cpp_model_path,
@@ -132,11 +134,10 @@ def test_real_llama(llama_cpp_model_path):
         top_k=50,
         top_p=0.9,
         temperature=0.8,
-        seed=1337
+        seed=1337,
     )
     assert output["choices"][0]["text"] == " over the lazy dog"
 
-
     output = model.create_completion(
         "The capital of france is paris, 'true' or 'false'?:\n",
         max_tokens=4,
@@ -146,20 +147,19 @@ def test_real_llama(llama_cpp_model_path):
         seed=1337,
         grammar=llama_cpp.LlamaGrammar.from_string("""
 root ::= "true" | "false"
-""")
+"""),
     )
     assert output["choices"][0]["text"] == "true"
 
     suffix = b"rot"
     tokens = model.tokenize(suffix, add_bos=True, special=True)
+
     def logit_processor_func(input_ids, logits):
         for token in tokens:
             logits[token] *= 1000
         return logits
 
-    logit_processors = llama_cpp.LogitsProcessorList(
-        [logit_processor_func]
-    )
+    logit_processors = llama_cpp.LogitsProcessorList([logit_processor_func])
 
     output = model.create_completion(
         "The capital of france is par",
@@ -168,7 +168,7 @@ def logit_processor_func(input_ids, logits):
         top_p=0.9,
         temperature=0.8,
         seed=1337,
-        logits_processor=logit_processors
+        logits_processor=logit_processors,
     )
     assert output["choices"][0]["text"].lower().startswith("rot")
 
@@ -184,7 +184,7 @@ def logit_processor_func(input_ids, logits):
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
 root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
     )
     number_1 = output["choices"][0]["text"]
 
@@ -196,7 +196,7 @@ def logit_processor_func(input_ids, logits):
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
 root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
     )
     number_2 = output["choices"][0]["text"]
 
@@ -210,7 +210,7 @@ def logit_processor_func(input_ids, logits):
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
 root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
     )
     number_3 = output["choices"][0]["text"]
 
@@ -228,7 +228,7 @@ def test_real_llama_embeddings(llama_cpp_model_path):
         n_threads_batch=multiprocessing.cpu_count(),
         logits_all=False,
         flash_attn=True,
-        embedding=True
+        embedding=True,
     )
     # Smoke test for now
     model.embed("Hello World")
diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py
index f031bf72b..18c7279cf 100644
--- a/tests/test_llama_chat_format.py
+++ b/tests/test_llama_chat_format.py
@@ -10,13 +10,20 @@
 
 from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter
 
+
 def test_mistral_instruct():
     chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
     chat_formatter = jinja2.Template(chat_template)
     messages = [
-        llama_types.ChatCompletionRequestUserMessage(role="user", content="Instruction"),
-        llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="Model answer"),
-        llama_types.ChatCompletionRequestUserMessage(role="user", content="Follow-up instruction"),
+        llama_types.ChatCompletionRequestUserMessage(
+            role="user", content="Instruction"
+        ),
+        llama_types.ChatCompletionRequestAssistantMessage(
+            role="assistant", content="Model answer"
+        ),
+        llama_types.ChatCompletionRequestUserMessage(
+            role="user", content="Follow-up instruction"
+        ),
     ]
     response = llama_chat_format.format_mistral_instruct(
         messages=messages,
@@ -77,13 +84,11 @@ def test_mistral_instruct():
 
 def test_hf_tokenizer_config_str_to_chat_formatter():
     tokenizer_config = json.loads(mistral_7b_tokenizer_config)
-    chat_formatter = hf_tokenizer_config_to_chat_formatter(
-        tokenizer_config
-    )
+    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config)
     chat_formatter_respoonse = chat_formatter(
         messages=[
             ChatCompletionRequestUserMessage(role="user", content="Hello, world!"),
         ]
     )
 
-    assert chat_formatter_respoonse.prompt == ("<s>[INST] Hello, world! [/INST]</s>" "")
+    assert chat_formatter_respoonse.prompt == ("<s>[INST] Hello, world! [/INST]</s>")
diff --git a/tests/test_llama_speculative.py b/tests/test_llama_speculative.py
index b5d450567..d28c9ca9c 100644
--- a/tests/test_llama_speculative.py
+++ b/tests/test_llama_speculative.py
@@ -2,15 +2,20 @@
 
 from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
 
+
 def test_find_candidate_pred_tokens():
     find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens
 
     # Test Case 1: Matching ngram is found
     input_ids1 = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
-    result1 = find_candidate_pred_tokens(input_ids1, max_ngram_size=3, num_pred_tokens=2)
+    result1 = find_candidate_pred_tokens(
+        input_ids1, max_ngram_size=3, num_pred_tokens=2
+    )
     assert np.array_equal(result1, np.array([1, 2]))
 
     # Test Case 2: Matching ngram is not found
     input_ids2 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
-    result2 = find_candidate_pred_tokens(input_ids2, max_ngram_size=3, num_pred_tokens=2)
+    result2 = find_candidate_pred_tokens(
+        input_ids2, max_ngram_size=3, num_pred_tokens=2
+    )
     assert np.array_equal(result2, np.array([]))

From 18aa31ef1e6150cf1b24b81583f5254aabe00199 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 22 Mar 2026 18:58:55 -0700
Subject: [PATCH 04/19] feat: Update llama.cpp to
 ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 (#2151)

* Update llama.cpp and sync bindings

* Clean up binding compatibility shims

* Remove flash attention property shim

* Remove mtmd verbosity shim

* Add docstrings for new bindings

* Format Ruff files and add changelog entry
---
 CHANGELOG.md                   |   1 +
 CMakeLists.txt                 |  20 +
 Makefile                       |   2 -
 llama_cpp/_internals.py        |   8 +-
 llama_cpp/llama.py             |  22 +-
 llama_cpp/llama_chat_format.py |   9 +-
 llama_cpp/llama_cpp.py         | 881 +++++++++++++++++++++------------
 llama_cpp/mtmd_cpp.py          |  96 +++-
 tests/test_llama.py            |   2 +-
 vendor/llama.cpp               |   2 +-
 10 files changed, 714 insertions(+), 329 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7044f44d6..e7506eed5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main`
 - fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150
 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4b06d98b3..9b2744cdc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -153,6 +153,26 @@ if (LLAMA_BUILD)
             add_compile_definitions(GGML_USE_METAL)
         endif()
 
+        # Upstream mtmd expects LLAMA_INSTALL_VERSION to be set by llama.cpp's
+        # top-level CMakeLists.txt. When we include tools/mtmd directly from the
+        # Python package build, that directory scope is skipped.
+        if (NOT DEFINED LLAMA_INSTALL_VERSION OR "${LLAMA_INSTALL_VERSION}" STREQUAL "")
+            set(LLAMA_INSTALL_VERSION 0.0.0)
+            find_package(Git QUIET)
+            if (Git_FOUND)
+                execute_process(
+                    COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
+                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp
+                    OUTPUT_VARIABLE LLAMA_MTMD_BUILD_NUMBER
+                    OUTPUT_STRIP_TRAILING_WHITESPACE
+                    RESULT_VARIABLE LLAMA_MTMD_BUILD_NUMBER_RESULT
+                )
+                if (LLAMA_MTMD_BUILD_NUMBER_RESULT EQUAL 0)
+                    set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_MTMD_BUILD_NUMBER})
+                endif()
+            endif()
+        endif()
+
         # Building llava
         add_subdirectory(vendor/llama.cpp/tools/mtmd)
 
diff --git a/Makefile b/Makefile
index 8e6cae2c1..db45246c7 100644
--- a/Makefile
+++ b/Makefile
@@ -82,8 +82,6 @@ run-server:
 	python3 -m llama_cpp.server --model ${MODEL}
 
 clean:
-	- cd vendor/llama.cpp && make clean
-	- cd vendor/llama.cpp && rm libllama.so
 	- rm -rf _skbuild
 	- rm llama_cpp/lib/*.so
 	- rm llama_cpp/lib/*.dylib
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index b520b7ea5..d6258d224 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -2,6 +2,7 @@
 
 import os
 import ctypes
+import warnings
 
 from typing import (
     Dict,
@@ -699,8 +700,11 @@ def add_dist(self, seed: int):
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
     def add_softmax(self):
-        sampler = llama_cpp.llama_sampler_init_softmax()
-        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+        warnings.warn(
+            "add_softmax is deprecated; llama_sampler_init_dist now samples directly from logits",
+            DeprecationWarning,
+            stacklevel=2,
+        )
 
     def add_top_k(self, k: int):
         sampler = llama_cpp.llama_sampler_init_top_k(k)
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 21a7430a0..1609ad16b 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -341,7 +341,11 @@ def __init__(
         self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-        self.context_params.flash_attn = flash_attn
+        self.context_params.flash_attn_type = (
+            llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            if flash_attn
+            else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+        )
 
         if op_offload is not None:
             self.context_params.op_offload = op_offload
@@ -431,9 +435,9 @@ def free_lora_adapter():
 
             self._stack.callback(free_lora_adapter)
 
-            if llama_cpp.llama_set_adapter_lora(
-                self._ctx.ctx, self._lora_adapter, self.lora_scale
-            ):
+            adapters = (llama_cpp.llama_adapter_lora_p_ctypes * 1)(self._lora_adapter)
+            scales = (ctypes.c_float * 1)(self.lora_scale)
+            if llama_cpp.llama_set_adapters_lora(self._ctx.ctx, adapters, 1, scales):
                 raise RuntimeError(
                     f"Failed to set LoRA adapter from lora path: {self.lora_path}"
                 )
@@ -726,7 +730,6 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
             sampler.add_grammar(self._model, grammar)
 
         if temp < 0.0:
-            sampler.add_softmax()
             sampler.add_dist(self._seed)
         elif temp == 0.0:
             sampler.add_greedy()
@@ -1042,7 +1045,7 @@ def embed(
         data: Union[List[List[float]], List[List[List[float]]]] = []
 
         def decode_batch(seq_sizes: List[int]):
-            llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+            self._ctx.kv_cache_clear()
             self._ctx.decode(self._batch)
             self._batch.reset()
 
@@ -1113,7 +1116,7 @@ def decode_batch(seq_sizes: List[int]):
 
         output = data[0] if isinstance(input, str) else data
 
-        llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+        self._ctx.kv_cache_clear()
         self.reset()
 
         if return_count:
@@ -2100,7 +2103,10 @@ def __getstate__(self):
             logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
-            flash_attn=self.context_params.flash_attn,
+            flash_attn=(
+                self.context_params.flash_attn_type
+                == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            ),
             op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 8e8ac7bb3..d7910e984 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2755,7 +2755,14 @@ def _init_mtmd_context(self, llama_model: llama.Llama):
             ctx_params.use_gpu = True  # TODO: Make this configurable
             ctx_params.print_timings = self.verbose
             ctx_params.n_threads = llama_model.n_threads
-            ctx_params.verbosity = 2 if self.verbose else 0  # GGML_LOG_LEVEL_INFO = 2
+            ctx_params.flash_attn_type = (
+                llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+                if (
+                    llama_model.context_params.flash_attn_type
+                    == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+                )
+                else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+            )
 
             # Initialize mtmd context
             self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index f13af67f3..e51492c56 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -3,6 +3,7 @@
 import os
 import ctypes
 import pathlib
+import warnings
 
 from typing import (
     Callable,
@@ -77,6 +78,8 @@
 #     GGML_TYPE_I64     = 27,
 #     GGML_TYPE_F64     = 28,
 #     GGML_TYPE_IQ1_M   = 29,
+#     GGML_TYPE_MXFP4   = 39,
+#     GGML_TYPE_NVFP4   = 40,
 #     GGML_TYPE_COUNT,
 # };
 GGML_TYPE_F32 = 0
@@ -107,7 +110,9 @@
 GGML_TYPE_I64 = 27
 GGML_TYPE_F64 = 28
 GGML_TYPE_IQ1_M = 29
-GGML_TYPE_COUNT = 30
+GGML_TYPE_MXFP4 = 39
+GGML_TYPE_NVFP4 = 40
+GGML_TYPE_COUNT = 41
 
 # from ggml-backend.h
 # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
@@ -173,6 +178,10 @@
 llama_kv_cache_p = NewType("llama_kv_cache_p", int)
 llama_kv_cache_p_ctypes = ctypes.c_void_p
 
+# struct gguf_context;
+gguf_context_p = NewType("gguf_context_p", int)
+gguf_context_p_ctypes = ctypes.c_void_p
+
 # typedef int32_t llama_pos;
 llama_pos = ctypes.c_int32
 # typedef int32_t llama_token;
@@ -292,12 +301,14 @@
 #     LLAMA_ROPE_TYPE_NORM   = 0,
 #     LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
 #     LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+#     LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE,
 #     LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
 # };
 LLAMA_ROPE_TYPE_NONE = -1
 LLAMA_ROPE_TYPE_NORM = 0
 LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX = 2
 LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE = 8
+LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE = 40
 LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION = 24
 
 
@@ -386,6 +397,7 @@
 #     LLAMA_FTYPE_MOSTLY_TQ1_0         = 36, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_MXFP4_MOE     = 38, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_NVFP4         = 39, // except 1d tensors
 #
 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -425,6 +437,7 @@
 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
 LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
+LLAMA_FTYPE_MOSTLY_NVFP4 = 39
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
@@ -467,6 +480,16 @@
 LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
 
 
+# enum llama_flash_attn_type {
+#     LLAMA_FLASH_ATTN_TYPE_AUTO     = -1,
+#     LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+#     LLAMA_FLASH_ATTN_TYPE_ENABLED  = 1,
+# };
+LLAMA_FLASH_ATTN_TYPE_AUTO = -1
+LLAMA_FLASH_ATTN_TYPE_DISABLED = 0
+LLAMA_FLASH_ATTN_TYPE_ENABLED = 1
+
+
 # enum llama_split_mode {
 #     LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
 #     LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -619,6 +642,34 @@ class llama_batch(ctypes.Structure):
 LLAMA_KV_OVERRIDE_TYPE_STR = 3
 
 
+# enum llama_model_meta_key {
+#     LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
+#     LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
+#     LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
+#     LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
+#     LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
+#     LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
+#     LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
+#     LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
+#     LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
+#     LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
+#     LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
+#     LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
+# };
+LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE = 0
+LLAMA_MODEL_META_KEY_SAMPLING_TOP_K = 1
+LLAMA_MODEL_META_KEY_SAMPLING_TOP_P = 2
+LLAMA_MODEL_META_KEY_SAMPLING_MIN_P = 3
+LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY = 4
+LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD = 5
+LLAMA_MODEL_META_KEY_SAMPLING_TEMP = 6
+LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N = 7
+LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT = 8
+LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT = 9
+LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU = 10
+LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA = 11
+
+
 # struct llama_model_kv_override {
 #     enum llama_model_kv_override_type tag;
 
@@ -695,11 +746,14 @@ class llama_model_kv_override(ctypes.Structure):
 
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool vocab_only;    // only load the vocabulary, no weights
-#     bool use_mmap;      // use mmap if possible
-#     bool use_mlock;     // force system to keep model in RAM
-#     bool check_tensors; // validate model tensor data
+#     bool vocab_only;      // only load the vocabulary, no weights
+#     bool use_mmap;        // use mmap if possible
+#     bool use_direct_io;   // use direct io, takes precedence over use_mmap when supported
+#     bool use_mlock;       // force system to keep model in RAM
+#     bool check_tensors;   // validate model tensor data
 #     bool use_extra_bufts; // use extra buffer types (used for weight repacking)
+#     bool no_host;         // bypass host buffer allowing extra buffers to be used
+#     bool no_alloc;        // only load metadata and simulate memory allocations
 # };
 class llama_model_params(ctypes.Structure):
     """Parameters for llama_model
@@ -716,9 +770,12 @@ class llama_model_params(ctypes.Structure):
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
+        use_direct_io (bool): use direct io, takes precedence over use_mmap when supported
         use_mlock (bool): force system to keep model in RAM
         check_tensors (bool): validate model tensor data
-        use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""
+        use_extra_bufts (bool): use extra buffer types (used for weight repacking)
+        no_host (bool): bypass host buffer allowing extra buffers to be used
+        no_alloc (bool): only load metadata and simulate memory allocations"""
 
     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p]  # NOTE: unused
@@ -734,9 +791,12 @@ class llama_model_params(ctypes.Structure):
         kv_overrides: CtypesArray[llama_model_kv_override]
         vocab_only: bool
         use_mmap: bool
+        use_direct_io: bool
         use_mlock: bool
         check_tensors: bool
         use_extra_bufts: bool
+        no_host: bool
+        no_alloc: bool
 
     _fields_ = [
         ("devices", ctypes.c_void_p),  # NOTE: unnused
@@ -750,9 +810,27 @@ class llama_model_params(ctypes.Structure):
         ("kv_overrides", ctypes.POINTER(llama_model_kv_override)),
         ("vocab_only", ctypes.c_bool),
         ("use_mmap", ctypes.c_bool),
+        ("use_direct_io", ctypes.c_bool),
         ("use_mlock", ctypes.c_bool),
         ("check_tensors", ctypes.c_bool),
         ("use_extra_bufts", ctypes.c_bool),
+        ("no_host", ctypes.c_bool),
+        ("no_alloc", ctypes.c_bool),
+    ]
+
+
+# struct llama_sampler_seq_config {
+#     llama_seq_id           seq_id;
+#     struct llama_sampler * sampler;
+# };
+class llama_sampler_seq_config(ctypes.Structure):
+    if TYPE_CHECKING:
+        seq_id: int
+        sampler: ctypes.c_void_p
+
+    _fields_ = [
+        ("seq_id", llama_seq_id),
+        ("sampler", ctypes.c_void_p),
     ]
 
 
@@ -769,6 +847,7 @@ class llama_model_params(ctypes.Structure):
 #     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 #     enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
 #     enum llama_attention_type    attention_type;    // attention type to use for embeddings
+#     enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention
 
 #     // ref: https://github.com/ggml-org/llama.cpp/pull/2054
 #     float    rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -796,7 +875,6 @@ class llama_model_params(ctypes.Structure):
 #     // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-#     bool flash_attn;  // use flash attention [EXPERIMENTAL]
 #     bool no_perf;     // measure performance timings
 #     bool op_offload;  // offload host tensor operations to device
 #     bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
@@ -805,6 +883,8 @@ class llama_model_params(ctypes.Structure):
 #     bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
 #                       // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
 #                       // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+#     struct llama_sampler_seq_config * samplers;
+#     size_t                            n_samplers;
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -819,6 +899,7 @@ class llama_context_params(ctypes.Structure):
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
         pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         attention_type (int): attention type to use for embeddings
+        flash_attn_type (int): when to enable flash attention
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -835,11 +916,12 @@ class llama_context_params(ctypes.Structure):
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        flash_attn (bool): whether to use flash attention
         no_perf (bool): whether to measure performance timings
         op_offload (bool): offload host tensor operations to device
         swa_full (bool): use full-size SWA cache
         kv_unified (bool): use a unified buffer across the input sequences when computing the attention
+        samplers (ctypes.POINTER(llama_sampler_seq_config)): backend sampler chain configuration
+        n_samplers (int): number of backend sampler chain configurations
     """
 
     if TYPE_CHECKING:
@@ -852,6 +934,7 @@ class llama_context_params(ctypes.Structure):
         rope_scaling_type: int
         pooling_type: int
         attention_type: int
+        flash_attn_type: int
         rope_freq_base: float
         rope_freq_scale: float
         yarn_ext_factor: float
@@ -868,11 +951,12 @@ class llama_context_params(ctypes.Structure):
         abort_callback_data: ctypes.c_void_p
         embeddings: bool
         offload_kqv: bool
-        flash_attn: bool
         no_perf: bool
         op_offload: bool
         swa_full: bool
         kv_unified: bool
+        samplers: ctypes.POINTER(llama_sampler_seq_config)
+        n_samplers: int
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -884,6 +968,7 @@ class llama_context_params(ctypes.Structure):
         ("rope_scaling_type", ctypes.c_int),
         ("pooling_type", ctypes.c_int),
         ("attention_type", ctypes.c_int),
+        ("flash_attn_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -900,11 +985,12 @@ class llama_context_params(ctypes.Structure):
         ("abort_callback_data", ctypes.c_void_p),
         ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
-        ("flash_attn", ctypes.c_bool),
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),
         ("kv_unified", ctypes.c_bool),
+        ("samplers", ctypes.POINTER(llama_sampler_seq_config)),
+        ("n_samplers", ctypes.c_size_t),
     ]
 
 
@@ -935,6 +1021,7 @@ class llama_context_params(ctypes.Structure):
 #     bool only_copy;                       // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 #     bool pure;                            // quantize all tensors to the default type
 #     bool keep_split;                      // quantize to the same number of shards
+#     bool dry_run;                         // calculate and show the final quantization size without performing quantization
 #     void * imatrix;                       // pointer to importance matrix data
 #     void * kv_overrides;                  // pointer to vector containing overrides
 #     void * tensor_types;                  // pointer to vector containing tensor types
@@ -953,6 +1040,7 @@ class llama_model_quantize_params(ctypes.Structure):
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
         keep_split (bool): quantize to the same number of shards
+        dry_run (bool): calculate and show the final quantization size without performing quantization
         imatrix (ctypes.c_void_p): pointer to importance matrix data
         kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
         tensor_types (ctypes.c_void_p): pointer to vector containing tensor types
@@ -969,6 +1057,7 @@ class llama_model_quantize_params(ctypes.Structure):
         only_copy: bool
         pure: bool
         keep_split: bool
+        dry_run: bool
         imatrix: ctypes.c_void_p
         kv_overrides: ctypes.c_void_p
         tensor_types: ctypes.c_void_p
@@ -984,6 +1073,7 @@ class llama_model_quantize_params(ctypes.Structure):
         ("only_copy", ctypes.c_bool),
         ("pure", ctypes.c_bool),
         ("keep_split", ctypes.c_bool),
+        ("dry_run", ctypes.c_bool),
         ("imatrix", ctypes.c_void_p),
         ("kv_overrides", ctypes.c_void_p),
         ("tensor_types", ctypes.c_void_p),
@@ -1095,6 +1185,13 @@ def llama_model_quantize_default_params() -> llama_model_quantize_params:
     ...
 
 
+# LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+@ctypes_function("llama_flash_attn_type_name", [ctypes.c_int], ctypes.c_char_p)
+def llama_flash_attn_type_name(flash_attn_type: int, /) -> Optional[bytes]:
+    """Get the flash attention type name."""
+    ...
+
+
 # // Initialize the llama + ggml backend
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
@@ -1249,6 +1346,36 @@ def llama_free_model(model: llama_model_p, /): ...
 def llama_model_free(model: llama_model_p, /): ...
 
 
+# typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
+llama_model_set_tensor_data_t = ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_void_p)
+
+
+# LLAMA_API struct llama_model * llama_model_init_from_user(
+#                 struct gguf_context * metadata,
+#       llama_model_set_tensor_data_t   set_tensor_data,
+#                                void * set_tensor_data_ud,
+#           struct llama_model_params   params);
+@ctypes_function(
+    "llama_model_init_from_user",
+    [
+        gguf_context_p_ctypes,
+        llama_model_set_tensor_data_t,
+        ctypes.c_void_p,
+        llama_model_params,
+    ],
+    llama_model_p_ctypes,
+)
+def llama_model_init_from_user(
+    metadata: gguf_context_p,
+    set_tensor_data: llama_model_set_tensor_data_t,
+    set_tensor_data_ud: ctypes.c_void_p,
+    params: llama_model_params,
+    /,
+) -> Optional[llama_model_p]:
+    """Initialize a model from user-provided metadata and tensor data."""
+    ...
+
+
 # LLAMA_API struct llama_context * llama_init_from_model(
 #                  struct llama_model * model,
 #         struct llama_context_params   params);
@@ -1288,6 +1415,54 @@ def llama_free(ctx: llama_context_p, /):
     ...
 
 
+# enum llama_params_fit_status {
+#     LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0,
+#     LLAMA_PARAMS_FIT_STATUS_FAILURE = 1,
+#     LLAMA_PARAMS_FIT_STATUS_ERROR   = 2,
+# };
+LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0
+LLAMA_PARAMS_FIT_STATUS_FAILURE = 1
+LLAMA_PARAMS_FIT_STATUS_ERROR = 2
+
+
+# LLAMA_API enum llama_params_fit_status llama_params_fit(
+#                                const char   * path_model,
+#                 struct llama_model_params   * mparams,
+#                 struct llama_context_params * cparams,
+#                                       float * tensor_split,
+#     struct llama_model_tensor_buft_override * tensor_buft_overrides,
+#                                      size_t * margins,
+#                                    uint32_t   n_ctx_min,
+#                         enum ggml_log_level   log_level);
+@ctypes_function(
+    "llama_params_fit",
+    [
+        ctypes.c_char_p,
+        ctypes.POINTER(llama_model_params),
+        ctypes.POINTER(llama_context_params),
+        ctypes.POINTER(ctypes.c_float),
+        ctypes.c_void_p,
+        ctypes.POINTER(ctypes.c_size_t),
+        ctypes.c_uint32,
+        ctypes.c_int,
+    ],
+    ctypes.c_int,
+)
+def llama_params_fit(
+    path_model: bytes,
+    mparams: CtypesPointerOrRef[llama_model_params],
+    cparams: CtypesPointerOrRef[llama_context_params],
+    tensor_split: Optional[CtypesPointer[ctypes.c_float]],
+    tensor_buft_overrides: ctypes.c_void_p,
+    margins: Optional[CtypesPointer[ctypes.c_size_t]],
+    n_ctx_min: int,
+    log_level: int,
+    /,
+) -> int:
+    """Fit model and context parameters for a model path."""
+    ...
+
+
 # LLAMA_API int64_t llama_time_us(void);
 @ctypes_function(
     "llama_time_us",
@@ -1307,6 +1482,13 @@ def llama_max_devices() -> int: ...
 def llama_max_parallel_sequences() -> int: ...
 
 
+# LLAMA_API size_t llama_max_tensor_buft_overrides(void);
+@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t)
+def llama_max_tensor_buft_overrides() -> int:
+    """Get the maximum number of tensor buffer type overrides."""
+    ...
+
+
 # LLAMA_API bool llama_supports_mmap       (void);
 @ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
 def llama_supports_mmap() -> bool: ...
@@ -1332,6 +1514,13 @@ def llama_supports_rpc() -> bool: ...
 def llama_n_ctx(ctx: llama_context_p, /) -> int: ...
 
 
+# LLAMA_API uint32_t llama_n_ctx_seq  (const struct llama_context * ctx);
+@ctypes_function("llama_n_ctx_seq", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_ctx_seq(ctx: llama_context_p, /) -> int:
+    """Get the context size per sequence."""
+    ...
+
+
 # LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
 @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_batch(ctx: llama_context_p, /) -> int: ...
@@ -1389,17 +1578,6 @@ def llama_get_memory(ctx: llama_context_p, /) -> Optional[llama_memory_t]:
 def llama_pooling_type(ctx: llama_context_p, /) -> int: ...
 
 
-# DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-@ctypes_function(
-    "llama_get_kv_self",
-    [llama_context_p_ctypes],
-    llama_kv_cache_p_ctypes,
-)
-def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
-    """Get the KV cache for self-attention (DEPRECATED)"""
-    ...
-
-
 # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
 @ctypes_function("llama_model_get_vocab", [llama_model_p_ctypes], llama_vocab_p_ctypes)
 def llama_model_get_vocab(model: llama_model_p, /) -> Optional[llama_vocab_p]: ...
@@ -1420,6 +1598,20 @@ def llama_model_n_ctx_train(model: llama_model_p, /) -> int: ...
 def llama_model_n_embd(model: llama_model_p, /) -> int: ...
 
 
+# LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
+@ctypes_function("llama_model_n_embd_inp", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_embd_inp(model: llama_model_p, /) -> int:
+    """Get the model input embedding size."""
+    ...
+
+
+# LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
+@ctypes_function("llama_model_n_embd_out", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_embd_out(model: llama_model_p, /) -> int:
+    """Get the model output embedding size."""
+    ...
+
+
 # LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
 @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_model_n_layer(model: llama_model_p, /) -> int: ...
@@ -1515,6 +1707,14 @@ def llama_model_meta_count(model: llama_model_p, /) -> int:
     ...
 
 
+# // Get sampling metadata key name. Returns nullptr if the key is invalid
+# LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
+@ctypes_function("llama_model_meta_key_str", [ctypes.c_int], ctypes.c_char_p)
+def llama_model_meta_key_str(key: int, /) -> Optional[bytes]:
+    """Get sampling metadata key name. Returns None if the key is invalid."""
+    ...
+
+
 # // Get metadata key name by index
 # LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 @ctypes_function(
@@ -1647,6 +1847,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
     ...
 
 
+# // Returns true if the model is hybrid (like Jamba, Granite, etc.)
+# LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);
+@ctypes_function("llama_model_is_hybrid", [llama_model_p_ctypes], ctypes.c_bool)
+def llama_model_is_hybrid(model: llama_model_p, /) -> bool:
+    """Returns true if the model is hybrid (like Jamba, Granite, etc.)"""
+    ...
+
+
 # // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
 # LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
 @ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
@@ -1698,6 +1906,85 @@ def llama_adapter_lora_init(
 ) -> Optional[llama_adapter_lora_p]: ...
 
 
+# // Get metadata value as a string by key name
+# LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+@ctypes_function(
+    "llama_adapter_meta_val_str",
+    [
+        llama_adapter_lora_p_ctypes,
+        ctypes.c_char_p,
+        ctypes.c_char_p,
+        ctypes.c_size_t,
+    ],
+    ctypes.c_int32,
+)
+def llama_adapter_meta_val_str(
+    adapter: llama_adapter_lora_p,
+    key: bytes,
+    buf: Union[bytes, CtypesArray[ctypes.c_char]],
+    buf_size: int,
+    /,
+) -> int:
+    """Get adapter metadata value as a string by key name."""
+    ...
+
+
+# // Get the number of metadata key/value pairs
+# LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+@ctypes_function(
+    "llama_adapter_meta_count", [llama_adapter_lora_p_ctypes], ctypes.c_int32
+)
+def llama_adapter_meta_count(adapter: llama_adapter_lora_p, /) -> int:
+    """Get the number of adapter metadata key/value pairs."""
+    ...
+
+
+# // Get metadata key name by index
+# LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+@ctypes_function(
+    "llama_adapter_meta_key_by_index",
+    [
+        llama_adapter_lora_p_ctypes,
+        ctypes.c_int32,
+        ctypes.c_char_p,
+        ctypes.c_size_t,
+    ],
+    ctypes.c_int32,
+)
+def llama_adapter_meta_key_by_index(
+    adapter: llama_adapter_lora_p,
+    i: int,
+    buf: Union[bytes, CtypesArray[ctypes.c_char]],
+    buf_size: int,
+    /,
+) -> int:
+    """Get adapter metadata key name by index."""
+    ...
+
+
+# // Get metadata value as a string by index
+# LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+@ctypes_function(
+    "llama_adapter_meta_val_str_by_index",
+    [
+        llama_adapter_lora_p_ctypes,
+        ctypes.c_int32,
+        ctypes.c_char_p,
+        ctypes.c_size_t,
+    ],
+    ctypes.c_int32,
+)
+def llama_adapter_meta_val_str_by_index(
+    adapter: llama_adapter_lora_p,
+    i: int,
+    buf: Union[bytes, CtypesArray[ctypes.c_char]],
+    buf_size: int,
+    /,
+) -> int:
+    """Get adapter metadata value as a string by index."""
+    ...
+
+
 # // Manually free a LoRA adapter
 # // Note: loaded adapters will be free when the associated model is deleted
 # LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
@@ -1709,56 +1996,75 @@ def llama_adapter_lora_init(
 def llama_adapter_lora_free(adapter: llama_adapter_lora_p, /): ...
 
 
-# // The following functions operate on a llama_context, hence the naming: llama_verb_...
+# // Get the invocation tokens if the current lora is an alora
+# LLAMA_API uint64_t            llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
+@ctypes_function(
+    "llama_adapter_get_alora_n_invocation_tokens",
+    [llama_adapter_lora_p_ctypes],
+    ctypes.c_uint64,
+)
+def llama_adapter_get_alora_n_invocation_tokens(
+    adapter: llama_adapter_lora_p, /
+) -> int:
+    """Get the invocation token count if the current LoRA is an aLoRA."""
+    ...
 
 
-# // Add a loaded LoRA adapter to given context
-# // This will not modify model's weight
-# LLAMA_API int32_t llama_set_adapter_lora(
-#         struct llama_context * ctx,
-#         struct llama_adapter_lora * adapter,
-#         float scale);
+# LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens  (const struct llama_adapter_lora * adapter);
 @ctypes_function(
-    "llama_set_adapter_lora",
-    [llama_context_p_ctypes, llama_adapter_lora_p_ctypes, ctypes.c_float],
-    ctypes.c_int32,
+    "llama_adapter_get_alora_invocation_tokens",
+    [llama_adapter_lora_p_ctypes],
+    ctypes.POINTER(llama_token),
 )
-def llama_set_adapter_lora(
-    ctx: llama_context_p, adapter: llama_adapter_lora_p, scale: float, /
-) -> int:
-    """Add a loaded LoRA adapter to given context
-    This will not modify model's weight"""
+def llama_adapter_get_alora_invocation_tokens(
+    adapter: llama_adapter_lora_p, /
+) -> Optional[CtypesPointer[llama_token]]:
+    """Get the invocation tokens if the current LoRA is an aLoRA."""
     ...
 
 
-# // Remove a specific LoRA adapter from given context
-# // Return -1 if the adapter is not present in the context
-# LLAMA_API int32_t llama_rm_adapter_lora(
+# // The following functions operate on a llama_context, hence the naming: llama_verb_...
+
+
+# // Set LoRa adapters on the context. Will only modify if the adapters currently in context are different.
+# LLAMA_API int32_t llama_set_adapters_lora(
 #         struct llama_context * ctx,
-#         struct llama_adapter_lora * adapter);
+#         struct llama_adapter_lora ** adapters,
+#         size_t n_adapters,
+#         float * scales);
 @ctypes_function(
-    "llama_rm_adapter_lora",
-    [llama_context_p_ctypes, llama_adapter_lora_p_ctypes],
+    "llama_set_adapters_lora",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(llama_adapter_lora_p_ctypes),
+        ctypes.c_size_t,
+        ctypes.POINTER(ctypes.c_float),
+    ],
     ctypes.c_int32,
 )
-def llama_rm_adapter_lora(
-    ctx: llama_context_p, adapter: llama_adapter_lora_p, /
+def llama_set_adapters_lora(
+    ctx: llama_context_p,
+    adapters: Optional[CtypesArray[llama_adapter_lora_p_ctypes]],
+    n_adapters: int,
+    scales: Optional[CtypesArray[ctypes.c_float]],
+    /,
 ) -> int:
-    """Remove a specific LoRA adapter from given context
-    Return -1 if the adapter is not present in the context"""
+    """Set LoRA adapters on the context if they differ from the current adapters."""
     ...
 
 
-# // Remove all LoRA adapters from given context
-# LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
-@ctypes_function(
-    "llama_clear_adapter_lora",
-    [llama_context_p_ctypes],
-    None,
-)
-def llama_clear_adapter_lora(ctx: llama_context_p, /):
-    """Remove all LoRA adapters from given context"""
-    ...
+# Deprecated compatibility wrapper for the renamed llama_set_adapters_lora().
+def llama_set_adapter_lora(
+    ctx: llama_context_p, adapter: llama_adapter_lora_p, scale: float, /
+) -> int:
+    warnings.warn(
+        "llama_set_adapter_lora is deprecated; use llama_set_adapters_lora instead",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    adapters = (llama_adapter_lora_p_ctypes * 1)(adapter)
+    scales = (ctypes.c_float * 1)(scale)
+    return llama_set_adapters_lora(ctx, adapters, 1, scales)
 
 
 # // Apply a loaded control vector to a llama_context, or if data is NULL, clear
@@ -1767,7 +2073,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /):
 # // to an n_embd x n_layers buffer starting from layer 1.
 # // il_start and il_end are the layer range the vector should apply to (both inclusive)
 # // See llama_control_vector_load in common to load a control vector.
-# LLAMA_API int32_t llama_apply_adapter_cvec(
+# LLAMA_API int32_t llama_set_adapter_cvec(
 #         struct llama_context * ctx,
 #                  const float * data,
 #                       size_t   len,
@@ -1775,7 +2081,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /):
 #                      int32_t   il_start,
 #                      int32_t   il_end);
 @ctypes_function(
-    "llama_apply_adapter_cvec",
+    "llama_set_adapter_cvec",
     [
         llama_context_p_ctypes,
         ctypes.POINTER(ctypes.c_float),
@@ -1786,7 +2092,7 @@ def llama_clear_adapter_lora(ctx: llama_context_p, /):
     ],
     ctypes.c_int32,
 )
-def llama_apply_adapter_cvec(
+def llama_set_adapter_cvec(
     ctx: llama_context_p,
     data: CtypesPointerOrRef[ctypes.c_float],
     len: int,
@@ -1804,6 +2110,24 @@ def llama_apply_adapter_cvec(
     ...
 
 
+# Deprecated compatibility wrapper for the renamed llama_set_adapter_cvec().
+def llama_apply_adapter_cvec(
+    ctx: llama_context_p,
+    data: CtypesPointerOrRef[ctypes.c_float],
+    len: int,
+    n_embd: int,
+    il_start: int,
+    il_end: int,
+    /,
+) -> int:
+    warnings.warn(
+        "llama_apply_adapter_cvec is deprecated; use llama_set_adapter_cvec instead",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return llama_set_adapter_cvec(ctx, data, len, n_embd, il_start, il_end)
+
+
 # //
 # // Memory
 # //
@@ -2018,251 +2342,6 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool:
     ...
 
 
-# //
-# // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-# //
-
-
-# // Returns the number of tokens in the KV cache (slow, use only for debug)
-# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-#            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-@ctypes_function("llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32)
-def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
-    """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)"""
-    ...
-
-
-# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-#            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-@ctypes_function("llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32)
-def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
-    """Returns the number of used KV cells (DEPRECATED)"""
-    ...
-
-
-# // Clear the KV cache - both cell info is erased and KV data is zeroed
-# DEPRECATED(LLAMA_API void llama_kv_self_clear(
-#             struct llama_context * ctx),
-#         "Use llama_memory_clear() instead");
-@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
-def llama_kv_self_clear(ctx: llama_context_p, /):
-    """Clear the KV cache (DEPRECATED)"""
-    ...
-
-
-# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-# // seq_id < 0 : match any sequence
-# // p0 < 0     : [0,  p1]
-# // p1 < 0     : [p0, inf)
-# DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id,
-#                    llama_pos   p0,
-#                    llama_pos   p1),
-#         "Use llama_memory_seq_rm() instead");
-@ctypes_function(
-    "llama_kv_self_seq_rm",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-    ],
-    ctypes.c_bool,
-)
-def llama_kv_self_seq_rm(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    /,
-) -> bool:
-    """Remove tokens from KV cache (DEPRECATED)"""
-    ...
-
-
-# // Copy all tokens that belong to the specified sequence to another sequence
-# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-# // p0 < 0 : [0,  p1]
-# // p1 < 0 : [p0, inf)
-# DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id_src,
-#                 llama_seq_id   seq_id_dst,
-#                    llama_pos   p0,
-#                    llama_pos   p1),
-#         "Use llama_memory_seq_cp() instead");
-@ctypes_function(
-    "llama_kv_self_seq_cp",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_self_seq_cp(
-    ctx: llama_context_p,
-    seq_id_src: Union[llama_seq_id, int],
-    seq_id_dst: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    /,
-):
-    """Copy tokens in KV cache (DEPRECATED)"""
-    ...
-
-
-# // Removes all tokens that do not belong to the specified sequence
-# DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id),
-#         "Use llama_memory_seq_keep() instead");
-@ctypes_function("llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None)
-def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
-    """Keep only specified sequence in KV cache (DEPRECATED)"""
-    ...
-
-
-# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // If the KV cache is RoPEd, the KV data is updated accordingly:
-# //   - lazily on next llama_decode()
-# // p0 < 0 : [0,  p1]
-# // p1 < 0 : [p0, inf)
-# DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id,
-#                    llama_pos   p0,
-#                    llama_pos   p1,
-#                    llama_pos   delta),
-#         "Use llama_memory_seq_add() instead");
-@ctypes_function(
-    "llama_kv_self_seq_add",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_self_seq_add(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    delta: Union[llama_pos, int],
-    /,
-):
-    """Add delta to sequence positions in KV cache (DEPRECATED)"""
-    ...
-
-
-# // Integer division of the positions by factor of `d > 1`
-# // If the KV cache is RoPEd, the KV data is updated accordingly:
-# //   - lazily on next llama_decode()
-# // p0 < 0 : [0,  p1]
-# // p1 < 0 : [p0, inf)
-# DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id,
-#                    llama_pos   p0,
-#                    llama_pos   p1,
-#                          int   d),
-#         "Use llama_memory_seq_div() instead");
-@ctypes_function(
-    "llama_kv_self_seq_div",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        ctypes.c_int,
-    ],
-    None,
-)
-def llama_kv_self_seq_div(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    d: Union[ctypes.c_int, int],
-    /,
-):
-    """Divide sequence positions in KV cache (DEPRECATED)"""
-    ...
-
-
-# // Returns the smallest position present in the KV cache for the specified sequence
-# // This is typically non-zero only for SWA caches
-# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-# // Return -1 if the sequence is empty
-# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id),
-#         "Use llama_memory_seq_pos_min() instead");
-@ctypes_function(
-    "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos
-)
-def llama_kv_self_seq_pos_min(
-    ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /
-) -> int:
-    """Returns the smallest position in KV cache for sequence (DEPRECATED)"""
-    ...
-
-
-# // Returns the largest position present in the KV cache for the specified sequence
-# // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-# // Return -1 if the sequence is empty
-# DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id),
-#         "Use llama_memory_seq_pos_max() instead");
-@ctypes_function(
-    "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos
-)
-def llama_kv_self_seq_pos_max(
-    ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /
-) -> int:
-    """Returns the largest position in KV cache for sequence (DEPRECATED)"""
-    ...
-
-
-# // Defragment the KV cache
-# // This will be applied:
-# //   - lazily on next llama_decode()
-# DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-#         "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None)
-def llama_kv_self_defrag(ctx: llama_context_p, /):
-    """Defragment the KV cache (DEPRECATED)"""
-    ...
-
-
-# // Check if the context supports KV cache shifting
-# DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-#         "use llama_memory_can_shift() instead");
-@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
-def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
-    """Check if the context supports KV cache shifting (DEPRECATED)"""
-    ...
-
-
-# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-#         "simply remove this call, updates are applied lazily on the next llama_decode()");
-@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
-def llama_kv_self_update(ctx: llama_context_p, /):
-    """Apply the KV cache updates (DEPRECATED)"""
-    ...
-
-
 # //
 # // State / sessions
 # //
@@ -2914,6 +2993,100 @@ def llama_get_embeddings_seq(
     ...
 
 
+# // Get the backend sampled token for the ith token.
+# // Returns LLAMA_TOKEN_NULL if no token was sampled.
+# LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
+@ctypes_function(
+    "llama_get_sampled_token_ith", [llama_context_p_ctypes, ctypes.c_int32], llama_token
+)
+def llama_get_sampled_token_ith(
+    ctx: llama_context_p, i: Union[ctypes.c_int32, int], /
+) -> int:
+    """Get the backend sampled token for the ith token."""
+    ...
+
+
+# // Get the backend sampled probabilities for the ith token
+# LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
+@ctypes_function(
+    "llama_get_sampled_probs_ith",
+    [llama_context_p_ctypes, ctypes.c_int32],
+    ctypes.POINTER(ctypes.c_float),
+)
+def llama_get_sampled_probs_ith(
+    ctx: llama_context_p, i: Union[ctypes.c_int32, int], /
+) -> Optional[CtypesPointer[ctypes.c_float]]:
+    """Get the backend sampled probabilities for the ith token."""
+    ...
+
+
+# LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
+@ctypes_function(
+    "llama_get_sampled_probs_count_ith",
+    [llama_context_p_ctypes, ctypes.c_int32],
+    ctypes.c_uint32,
+)
+def llama_get_sampled_probs_count_ith(
+    ctx: llama_context_p, i: Union[ctypes.c_int32, int], /
+) -> int:
+    """Get the backend sampled probability count for the ith token."""
+    ...
+
+
+# // Get the backend sampled logits for the ith token
+# LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
+@ctypes_function(
+    "llama_get_sampled_logits_ith",
+    [llama_context_p_ctypes, ctypes.c_int32],
+    ctypes.POINTER(ctypes.c_float),
+)
+def llama_get_sampled_logits_ith(
+    ctx: llama_context_p, i: Union[ctypes.c_int32, int], /
+) -> Optional[CtypesPointer[ctypes.c_float]]:
+    """Get the backend sampled logits for the ith token."""
+    ...
+
+
+# LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
+@ctypes_function(
+    "llama_get_sampled_logits_count_ith",
+    [llama_context_p_ctypes, ctypes.c_int32],
+    ctypes.c_uint32,
+)
+def llama_get_sampled_logits_count_ith(
+    ctx: llama_context_p, i: Union[ctypes.c_int32, int], /
+) -> int:
+    """Get the backend sampled logit count for the ith token."""
+    ...
+
+
+# // Get the backend sampled candidates for the ith token
+# LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
+@ctypes_function(
+    "llama_get_sampled_candidates_ith",
+    [llama_context_p_ctypes, ctypes.c_int32],
+    ctypes.POINTER(llama_token),
+)
+def llama_get_sampled_candidates_ith(
+    ctx: llama_context_p, i: Union[ctypes.c_int32, int], /
+) -> Optional[CtypesPointer[llama_token]]:
+    """Get the backend sampled candidates for the ith token."""
+    ...
+
+
+# LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
+@ctypes_function(
+    "llama_get_sampled_candidates_count_ith",
+    [llama_context_p_ctypes, ctypes.c_int32],
+    ctypes.c_uint32,
+)
+def llama_get_sampled_candidates_count_ith(
+    ctx: llama_context_p, i: Union[ctypes.c_int32, int], /
+) -> int:
+    """Get the backend sampled candidate count for the ith token."""
+    ...
+
+
 # //
 # // Vocab
 # //
@@ -3558,6 +3731,21 @@ def llama_chat_builtin_templates(
 llama_sampler_context_t = ctypes.c_void_p
 
 
+# struct llama_sampler_data {
+#     struct ggml_tensor * logits;
+#     struct ggml_tensor * probs;
+#     struct ggml_tensor * sampled;
+#     struct ggml_tensor * candidates;
+# };
+class llama_sampler_data(ctypes.Structure):
+    _fields_ = [
+        ("logits", ctypes.c_void_p),
+        ("probs", ctypes.c_void_p),
+        ("sampled", ctypes.c_void_p),
+        ("candidates", ctypes.c_void_p),
+    ]
+
+
 # // user code can implement the interface below in order to create custom llama_sampler
 # struct llama_sampler_i {
 #     const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
@@ -3598,6 +3786,24 @@ class llama_sampler(ctypes.Structure):
 llama_sampler_i_reset = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes)
 llama_sampler_i_clone = ctypes.CFUNCTYPE(llama_sampler_p_ctypes, llama_sampler_p_ctypes)
 llama_sampler_i_free = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes)
+llama_sampler_i_backend_init = ctypes.CFUNCTYPE(
+    ctypes.c_bool, llama_sampler_p_ctypes, ctypes.c_void_p
+)
+llama_sampler_i_backend_accept = ctypes.CFUNCTYPE(
+    None,
+    llama_sampler_p_ctypes,
+    ctypes.c_void_p,
+    ctypes.c_void_p,
+    ctypes.c_void_p,
+)
+llama_sampler_i_backend_apply = ctypes.CFUNCTYPE(
+    None,
+    llama_sampler_p_ctypes,
+    ctypes.c_void_p,
+    ctypes.c_void_p,
+    ctypes.POINTER(llama_sampler_data),
+)
+llama_sampler_i_backend_set_input = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes)
 
 llama_sampler_i._fields_ = [
     ("name", llama_sampler_i_name),
@@ -3606,9 +3812,27 @@ class llama_sampler(ctypes.Structure):
     ("reset", llama_sampler_i_reset),
     ("clone", llama_sampler_i_clone),
     ("free", llama_sampler_i_free),
+    ("backend_init", llama_sampler_i_backend_init),
+    ("backend_accept", llama_sampler_i_backend_accept),
+    ("backend_apply", llama_sampler_i_backend_apply),
+    ("backend_set_input", llama_sampler_i_backend_set_input),
 ]
 
 
+# // attach a sampler to the context
+# LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
+@ctypes_function(
+    "llama_set_sampler",
+    [llama_context_p_ctypes, llama_seq_id, llama_sampler_p_ctypes],
+    ctypes.c_bool,
+)
+def llama_set_sampler(
+    ctx: llama_context_p, seq_id: Union[llama_seq_id, int], smpl: llama_sampler_p, /
+) -> bool:
+    """Attach a sampler to the context."""
+    ...
+
+
 # // mirror of llama_sampler_i:
 # LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
 @ctypes_function(
@@ -3748,14 +3972,6 @@ def llama_sampler_init_greedy() -> llama_sampler_p: ...
 def llama_sampler_init_dist(seed: int) -> llama_sampler_p: ...
 
 
-# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
-#     "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
-def llama_sampler_init_softmax() -> llama_sampler_p: ...
-
-
 # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # /// Setting k <= 0 makes this a noop
 # LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);
@@ -4005,6 +4221,22 @@ def llama_sampler_init_dry(
 ) -> llama_sampler_p: ...
 
 
+# LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
+#                            float   target,
+#                            float   decay,
+#                         uint32_t   seed);
+@ctypes_function(
+    "llama_sampler_init_adaptive_p",
+    [ctypes.c_float, ctypes.c_float, ctypes.c_uint32],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_adaptive_p(
+    target: float, decay: float, seed: int, /
+) -> llama_sampler_p:
+    """Initialize an adaptive-p sampler."""
+    ...
+
+
 # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
 #                          int32_t   n_vocab,
 #                          int32_t   n_logit_bias,
@@ -4102,10 +4334,26 @@ def llama_print_system_info() -> bytes: ...
 
 # // Set callback for all future logging events.
 # // If this is not called, or NULL is supplied, everything is output on stderr.
+# // The logger state is global so these functions are NOT thread safe.
+# LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+@ctypes_function(
+    "llama_log_get",
+    [ctypes.POINTER(llama_log_callback), ctypes.POINTER(ctypes.c_void_p)],
+    None,
+)
+def llama_log_get(
+    log_callback: CtypesPointerOrRef[llama_log_callback],
+    user_data: CtypesPointerOrRef[ctypes.c_void_p],
+    /,
+):
+    """Get the current logging callback and user data."""
+    ...
+
+
 # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 @ctypes_function(
     "llama_log_set",
-    [ctypes.c_void_p, ctypes.c_void_p],
+    [llama_log_callback, ctypes.c_void_p],
     None,
 )
 def llama_log_set(
@@ -4214,6 +4462,13 @@ def llama_perf_sampler_print(chain: llama_sampler_p, /): ...
 def llama_perf_sampler_reset(chain: llama_sampler_p, /): ...
 
 
+# // print a breakdown of per-device memory use via LLAMA_LOG:
+@ctypes_function("llama_memory_breakdown_print", [llama_context_p_ctypes], None)
+def llama_memory_breakdown_print(ctx: llama_context_p, /):
+    """Print a breakdown of per-device memory use."""
+    ...
+
+
 # //
 # // training
 # //
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 41753a7f6..787683179 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import warnings
 from ctypes import (
     c_bool,
     c_char_p,
@@ -78,13 +79,31 @@
 
 # Structures
 class mtmd_context_params(Structure):
+    if TYPE_CHECKING:
+        use_gpu: bool
+        print_timings: bool
+        n_threads: int
+        image_marker: Optional[bytes]
+        media_marker: Optional[bytes]
+        flash_attn_type: int
+        warmup: bool
+        image_min_tokens: int
+        image_max_tokens: int
+        cb_eval: llama_cpp.ggml_backend_sched_eval_callback
+        cb_eval_user_data: c_void_p
+
     _fields_ = [
         ("use_gpu", c_bool),
         ("print_timings", c_bool),
         ("n_threads", c_int),
-        ("verbosity", c_int),  # ggml_log_level
         ("image_marker", c_char_p),
         ("media_marker", c_char_p),
+        ("flash_attn_type", c_int),
+        ("warmup", c_bool),
+        ("image_min_tokens", c_int),
+        ("image_max_tokens", c_int),
+        ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback),
+        ("cb_eval_user_data", c_void_p),
     ]
 
 
@@ -132,11 +151,49 @@ def mtmd_init_from_file(
 def mtmd_free(ctx: mtmd_context_p, /): ...
 
 
+# MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+@ctypes_function("mtmd_decode_use_non_causal", [mtmd_context_p_ctypes], c_bool)
+def mtmd_decode_use_non_causal(ctx: mtmd_context_p, /) -> bool:
+    """Check whether MTMD decoding uses non-causal attention."""
+    ...
+
+
+# MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+@ctypes_function("mtmd_decode_use_mrope", [mtmd_context_p_ctypes], c_bool)
+def mtmd_decode_use_mrope(ctx: mtmd_context_p, /) -> bool:
+    """Check whether MTMD decoding uses mRoPE."""
+    ...
+
+
 # MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 @ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool)
 def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool: ...
 
 
+# MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
+@ctypes_function("mtmd_support_audio", [mtmd_context_p_ctypes], c_bool)
+def mtmd_support_audio(ctx: mtmd_context_p, /) -> bool:
+    """Check whether MTMD supports audio."""
+    ...
+
+
+# MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
+@ctypes_function("mtmd_get_audio_sample_rate", [mtmd_context_p_ctypes], c_int)
+def mtmd_get_audio_sample_rate(ctx: mtmd_context_p, /) -> int:
+    """Get the MTMD audio sample rate."""
+    ...
+
+
+# Deprecated compatibility wrapper for the renamed mtmd_get_audio_sample_rate().
+def mtmd_get_audio_bitrate(ctx: mtmd_context_p, /) -> int:
+    warnings.warn(
+        "mtmd_get_audio_bitrate is deprecated; use mtmd_get_audio_sample_rate instead",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return mtmd_get_audio_sample_rate(ctx)
+
+
 # MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data);
 @ctypes_function(
     "mtmd_bitmap_init", [c_uint32, c_uint32, POINTER(c_uint8)], mtmd_bitmap_p_ctypes
@@ -149,6 +206,21 @@ def mtmd_bitmap_init(
 ) -> Optional[mtmd_bitmap_p]: ...
 
 
+# MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
+@ctypes_function(
+    "mtmd_bitmap_init_from_audio",
+    [c_size_t, POINTER(c_float)],
+    mtmd_bitmap_p_ctypes,
+)
+def mtmd_bitmap_init_from_audio(
+    n_samples: Union[c_size_t, int],
+    data: CtypesArray[c_float],
+    /,
+) -> Optional[mtmd_bitmap_p]:
+    """Initialize an MTMD bitmap from audio samples."""
+    ...
+
+
 # MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap);
 @ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None)
 def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /): ...
@@ -284,3 +356,25 @@ def mtmd_helper_eval_chunk_single(
     new_n_past: "_Pointer[llama_cpp.llama_pos]",
     /,
 ) -> int: ...
+
+
+# MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
+@ctypes_function(
+    "mtmd_log_set",
+    [llama_cpp.llama_log_callback, c_void_p],
+    None,
+)
+def mtmd_log_set(log_callback, user_data: c_void_p, /):
+    """Set the MTMD logging callback."""
+    ...
+
+
+# MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);
+@ctypes_function(
+    "mtmd_helper_log_set",
+    [llama_cpp.llama_log_callback, c_void_p],
+    None,
+)
+def mtmd_helper_log_set(log_callback, user_data: c_void_p, /):
+    """Set the MTMD helper logging callback."""
+    ...
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 964b0895c..619c7378d 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -83,7 +83,7 @@ def test_real_model(llama_cpp_model_path):
     cparams.n_threads = multiprocessing.cpu_count()
     cparams.n_threads_batch = multiprocessing.cpu_count()
     cparams.logits_all = False
-    cparams.flash_attn = True
+    cparams.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
 
     context = internals.LlamaContext(model=model, params=cparams)
     tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True)
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 4227c9be4..49bfddeca 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 4227c9be4268ac844921b90f31595f81236bd317
+Subproject commit 49bfddeca18e62fa3d39114a23e9fcbdf8a22388

From e1f8ac0e135201c4a6de7c828331a2f078e9763a Mon Sep 17 00:00:00 2001
From: Bruno Verachten <gounthar@gmail.com>
Date: Mon, 23 Mar 2026 03:27:22 +0100
Subject: [PATCH 05/19] ci: add riscv64 wheel builds to release workflow
 (#2139)

* ci: add riscv64 wheel builds to release workflow

Add a build_wheels_riscv64 job mirroring the existing arm64 QEMU-based
build. Uses cibuildwheel with QEMU emulation for linux/riscv64, targeting
CPython 3.10-3.14 on manylinux.

Closes #2138

* ci: use cibuildwheel 3.1.2 for riscv64 wheels

* docs: update changelog for riscv64 wheel PR

---------

Co-authored-by: abetlen <abetlen@gmail.com>
---
 .github/workflows/build-and-release.yaml | 31 +++++++++++++++++++++++-
 CHANGELOG.md                             |  1 +
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 7eaf017fb..0121febe8 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -85,6 +85,35 @@ jobs:
           name: wheels_arm64
           path: ./wheelhouse/*.whl
 
+  build_wheels_riscv64:
+    name: Build riscv64 wheels
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+        with:
+          platforms: linux/riscv64
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v3.1.2
+        env:
+          CIBW_SKIP: "*musllinux* pp*"
+          CIBW_REPAIR_WHEEL_COMMAND: ""
+          CIBW_ARCHS: "riscv64"
+          CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*"
+        with:
+          output-dir: wheelhouse
+
+      - name: Upload wheels as artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels_riscv64
+          path: ./wheelhouse/*.whl
+
   build_sdist:
     name: Build source distribution
     runs-on: ubuntu-latest
@@ -129,7 +158,7 @@ jobs:
 
   release:
     name: Release
-    needs: [build_wheels, build_wheels_arm64, build_sdist]
+    needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist]
     runs-on: ubuntu-latest
 
     steps:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7506eed5..94666cec1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main`
 - fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150
 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149
+- ci: add riscv64 wheel builds to release workflow by @gounthar in #2139
 
 ## [0.3.16]
 

From 11e7a55af76303d42f2bc5a79ed0babbc89652dd Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 22 Mar 2026 22:33:31 -0700
Subject: [PATCH 06/19] fix: Qwen 3.5 support (#2152)

* fix: handle Qwen 3.5 hybrid prefix reuse

* test: fix Qwen runtime unit mocks

* test: drop Qwen runtime unit tests

* docs: credit Qwen fix contributors in changelog

* docs/tests: update default Qwen model to 3.5 0.8B

* test: rebaseline Qwen 3.5 outputs

* test: stabilize low-level Qwen sampling check

* test: tighten Qwen 3.5 completion prompts
---
 .github/workflows/test.yaml   |  4 ++--
 CHANGELOG.md                  |  1 +
 README.md                     |  6 +++---
 examples/gradio_chat/local.py |  6 +++---
 examples/hf_pull/main.py      |  6 +++---
 llama_cpp/_internals.py       |  4 ++--
 llama_cpp/llama.py            | 19 +++++++++++++------
 tests/test_llama.py           | 33 ++++++++++++++++++++-------------
 8 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index af4cacac4..8a6845ff2 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -8,8 +8,8 @@ on:
       - main
 
 env:
-  REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF
-  MODEL_FILE: qwen2-0_5b-instruct-q8_0.gguf
+  REPO_ID: lmstudio-community/Qwen3.5-0.8B-GGUF
+  MODEL_FILE: Qwen3.5-0.8B-Q8_0.gguf
 
 jobs:
   download-model:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94666cec1..4153406c1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
+- fix: Handle Qwen 3.5 hybrid prefix reuse by @codavidgarcia and @r-dh in #2152
 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main`
 - fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150
 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149
diff --git a/README.md b/README.md
index b57c95807..8ba4dbb5e 100644
--- a/README.md
+++ b/README.md
@@ -322,8 +322,8 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i
 
 ```python
 llm = Llama.from_pretrained(
-    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
-    filename="*q8_0.gguf",
+    repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
+    filename="*Q8_0.gguf",
     verbose=False
 )
 ```
@@ -685,7 +685,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
 
 ```bash
-python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
+python3 -m llama_cpp.server --hf_model_repo_id lmstudio-community/Qwen3.5-0.8B-GGUF --model '*Q8_0.gguf'
 ```
 
 ### Web Server Features
diff --git a/examples/gradio_chat/local.py b/examples/gradio_chat/local.py
index e16bf234a..871d8b09b 100644
--- a/examples/gradio_chat/local.py
+++ b/examples/gradio_chat/local.py
@@ -4,10 +4,10 @@
 import gradio as gr
 
 llama = llama_cpp.Llama.from_pretrained(
-    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
-    filename="*q8_0.gguf",
+    repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
+    filename="*Q8_0.gguf",
     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-        "Qwen/Qwen1.5-0.5B"
+        "Qwen/Qwen3.5-0.8B"
     ),
     verbose=False,
 )
diff --git a/examples/hf_pull/main.py b/examples/hf_pull/main.py
index dfed17516..a9ca424d1 100644
--- a/examples/hf_pull/main.py
+++ b/examples/hf_pull/main.py
@@ -3,10 +3,10 @@
 
 
 llama = llama_cpp.Llama.from_pretrained(
-    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
-    filename="*q8_0.gguf",
+    repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
+    filename="*Q8_0.gguf",
     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-        "Qwen/Qwen1.5-0.5B"
+        "Qwen/Qwen3.5-0.8B"
     ),
     verbose=False,
 )
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index d6258d224..6862135aa 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -291,10 +291,10 @@ def kv_cache_clear(self):
         assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_clear(self.memory, True)
 
-    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
+    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool:
         assert self.memory is not None, "Memory is not initialized"
         seq_id = seq_id if seq_id >= 0 else 0
-        llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
+        return llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
 
     def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
         assert self.memory is not None, "Memory is not initialized"
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 1609ad16b..88bc2e5bb 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -891,13 +891,20 @@ def generate(
                 else:
                     break
             if longest_prefix > 0:
-                reset = False
-                tokens = tokens[longest_prefix:]
-                self.n_tokens = longest_prefix
-                if self.verbose:
+                if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
+                    reset = False
+                    tokens = tokens[longest_prefix:]
+                    self.n_tokens = longest_prefix
+                    if self.verbose:
+                        print(
+                            f"Llama.generate: {longest_prefix} prefix-match hit, "
+                            f"remaining {len(tokens)} prompt tokens to eval",
+                            file=sys.stderr,
+                        )
+                elif self.verbose:
                     print(
-                        f"Llama.generate: {longest_prefix} prefix-match hit, "
-                        f"remaining {len(tokens)} prompt tokens to eval",
+                        f"Llama.generate: {longest_prefix} prefix-match found "
+                        f"but partial kv removal not supported, re-evaluating full prompt",
                         file=sys.stderr,
                     )
 
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 619c7378d..1a70c74d4 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -58,8 +58,8 @@ def test_llama_cpp_tokenization():
 
 @pytest.fixture
 def llama_cpp_model_path():
-    repo_id = "Qwen/Qwen2-0.5B-Instruct-GGUF"
-    filename = "qwen2-0_5b-instruct-q8_0.gguf"
+    repo_id = "lmstudio-community/Qwen3.5-0.8B-GGUF"
+    filename = "Qwen3.5-0.8B-Q8_0.gguf"
     model_path = hf_hub_download(repo_id, filename)
     return model_path
 
@@ -88,9 +88,14 @@ def test_real_model(llama_cpp_model_path):
     context = internals.LlamaContext(model=model, params=cparams)
     tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True)
 
-    assert tokens == [9707, 11, 1879, 0]
+    assert tokens == [9419, 11, 1814, 0]
 
-    tokens = model.tokenize(b"The quick brown fox jumps", add_bos=True, special=True)
+    tokens = model.tokenize(
+        b"The quick brown fox jumps over the lazy dog. The quick brown fox jumps ",
+        add_bos=True,
+        special=True,
+    )
+    prompt_token_count = len(tokens)
 
     batch = internals.LlamaBatch(n_tokens=len(tokens), embd=0, n_seq_max=1)
 
@@ -111,9 +116,11 @@ def test_real_model(llama_cpp_model_path):
         tokens = [token_id]
         result += tokens
 
-    output = result[5:]
+    output = result[prompt_token_count:]
     output_text = model.detokenize(output, special=True)
-    assert output_text == b" over the lazy dog"
+    # Low-level sampling output varies across CPU and Metal backends.
+    assert len(output) == 4
+    assert output_text
 
 
 def test_real_llama(llama_cpp_model_path):
@@ -129,14 +136,14 @@ def test_real_llama(llama_cpp_model_path):
     )
 
     output = model.create_completion(
-        "The quick brown fox jumps",
-        max_tokens=4,
+        "The quick brown fox jumps over the lazy dog. The quick brown fox",
+        max_tokens=6,
         top_k=50,
         top_p=0.9,
-        temperature=0.8,
+        temperature=0.0,
         seed=1337,
     )
-    assert output["choices"][0]["text"] == " over the lazy dog"
+    assert output["choices"][0]["text"] == " jumps over the lazy dog."
 
     output = model.create_completion(
         "The capital of france is paris, 'true' or 'false'?:\n",
@@ -181,7 +188,7 @@ def logit_processor_func(input_ids, logits):
         max_tokens=4,
         top_k=50,
         top_p=0.9,
-        temperature=0.8,
+        temperature=1.0,
         grammar=llama_cpp.LlamaGrammar.from_string("""
 root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
 """),
@@ -193,7 +200,7 @@ def logit_processor_func(input_ids, logits):
         max_tokens=4,
         top_k=50,
         top_p=0.9,
-        temperature=0.8,
+        temperature=1.0,
         grammar=llama_cpp.LlamaGrammar.from_string("""
 root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
 """),
@@ -207,7 +214,7 @@ def logit_processor_func(input_ids, logits):
         max_tokens=4,
         top_k=50,
         top_p=0.9,
-        temperature=0.8,
+        temperature=1.0,
         grammar=llama_cpp.LlamaGrammar.from_string("""
 root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
 """),

From a6b180724a641750aeaefa51692fe12ee8c4d54f Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 22 Mar 2026 23:28:55 -0700
Subject: [PATCH 07/19] chore: Bump version (#2153)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4153406c1..d1195cc2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.17]
+
 - feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
 - fix: Handle Qwen 3.5 hybrid prefix reuse by @codavidgarcia and @r-dh in #2152
 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main`
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index c1dde7046..a7c40478b 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.16"
+__version__ = "0.3.17"

From f0391c5ea7159b4c4916d9f4aced2f982adbd1f4 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Tue, 24 Mar 2026 00:59:19 -0700
Subject: [PATCH 08/19] fix(ci): release wheel workflow (#2154)

* fix(ci): harden release wheel workflow

* fix(ci): document and pin release wheel baselines

* fix(ci): speed up release arch builds

* fix(ci): split riscv64 by python version

* fix(ci): sanitize riscv64 artifact names
---
 .github/workflows/build-and-release.yaml | 49 +++++++++++++++++++-----
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 0121febe8..3a9e6f369 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -46,6 +46,13 @@ jobs:
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
+          # Skip cibuildwheel's default i686 sidecar and keep Linux release
+          # wheels on a portable x86_64 CPU baseline.
+          CIBW_ARCHS_LINUX: "auto64"
+          CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off"
+          # Keep macOS release wheels on a portable CPU baseline instead of
+          # inheriting the hosted runner's native flags.
+          CIBW_ENVIRONMENT_MACOS: CMAKE_ARGS="-DGGML_NATIVE=off"
         with:
           package-dir: .
           output-dir: wheelhouse
@@ -57,24 +64,21 @@ jobs:
 
   build_wheels_arm64:
     name: Build arm64 wheels
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04-arm
     steps:
       - uses: actions/checkout@v4
         with:
           submodules: "recursive"
 
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-        with:
-          platforms: linux/arm64
-
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.22.0
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""
           CIBW_ARCHS: "aarch64"
-          CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON"
+          # Keep native arm64 builds on a portable CPU baseline instead of
+          # tuning wheels to the hosted runner.
+          CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
           CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
         with:
           output-dir: wheelhouse
@@ -86,8 +90,27 @@ jobs:
           path: ./wheelhouse/*.whl
 
   build_wheels_riscv64:
-    name: Build riscv64 wheels
+    name: Build riscv64 wheels (${{ matrix.shard.name }})
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        shard:
+          - name: cp310
+            build: "cp310-*"
+            artifact: wheels_riscv64_cp310
+          - name: cp311
+            build: "cp311-*"
+            artifact: wheels_riscv64_cp311
+          - name: cp312
+            build: "cp312-*"
+            artifact: wheels_riscv64_cp312
+          - name: cp313
+            build: "cp313-*"
+            artifact: wheels_riscv64_cp313
+          - name: cp314
+            build: "cp314-*"
+            artifact: wheels_riscv64_cp314
     steps:
       - uses: actions/checkout@v4
         with:
@@ -104,14 +127,19 @@ jobs:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""
           CIBW_ARCHS: "riscv64"
-          CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*"
+          # Build riscv64 wheels against a conservative baseline instead of
+          # enabling RVV-related extensions from the build container.
+          CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off"
+          # Split the emulated riscv64 build into one Python version per job
+          # to minimize wall-clock time without changing the release artifacts.
+          CIBW_BUILD: ${{ matrix.shard.build }}
         with:
           output-dir: wheelhouse
 
       - name: Upload wheels as artifacts
         uses: actions/upload-artifact@v4
         with:
-          name: wheels_riscv64
+          name: ${{ matrix.shard.artifact }}
           path: ./wheelhouse/*.whl
 
   build_sdist:
@@ -159,6 +187,7 @@ jobs:
   release:
     name: Release
     needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist]
+    if: startsWith(github.ref, 'refs/tags/')
     runs-on: ubuntu-latest
 
     steps:

From 909ebf1246a52c15ebc95460c7e5957e3b64711e Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Tue, 24 Mar 2026 01:00:50 -0700
Subject: [PATCH 09/19] fix(ci): cuda wheel workflow (#2155)

* fix(ci): harden cuda wheel workflow

* fix(ci): pin cuda toolkit versions accurately

* fix(ci): resolve exact cuda toolkit installs

* fix(ci): align cuda toolkit roots and tags

* fix(ci): pin cuda packages to nvidia label

* fix(ci): allow cuda solver to mix non-cuda deps
---
 .github/workflows/build-wheels-cuda.yaml | 57 +++++++++++++++++++++---
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index 07b30cfc0..b8d6c9dce 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -99,21 +99,63 @@ jobs:
           MAMBA_NO_LOW_SPEED_LIMIT: "1"
         run: |
           $cudaVersion = $env:CUDAVER
-          mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
+          $cudaChannel = "nvidia/label/cuda-$cudaVersion"
+          if ($IsLinux) {
+            # Keep nvcc, cudart, and headers on the same NVIDIA label so the
+            # detected toolkit version matches the published wheel tag.
+            mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" "$cudaChannel::cuda-nvcc_linux-64=$cudaVersion" "$cudaChannel::cuda-cudart" "$cudaChannel::cuda-cudart-dev"
+          } else {
+            mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion"
+          }
+          if ($LASTEXITCODE -ne 0) {
+            exit $LASTEXITCODE
+          }
           python -m pip install build wheel
 
       - name: Build Wheel
         run: |
-          $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
           $env:CUDA_PATH = $env:CONDA_PREFIX
           $env:CUDA_HOME = $env:CONDA_PREFIX
           $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
+          $cudaHostCompilerArg = ''
+          $env:CMAKE_ARGS = ''
           if ($IsLinux) {
-            $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
+            if (Test-Path '/usr/bin/g++-12') {
+              $env:CC = '/usr/bin/gcc-12'
+              $env:CXX = '/usr/bin/g++-12'
+              $env:CUDAHOSTCXX = '/usr/bin/g++-12'
+              $cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX"
+            }
+            if (Test-Path (Join-Path $env:CONDA_PREFIX 'include/cuda_runtime.h')) {
+              $env:CUDAToolkit_ROOT = $env:CONDA_PREFIX
+              $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
+              $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$env:CONDA_PREFIX -DCUDA_TOOLKIT_ROOT_DIR=$env:CONDA_PREFIX$cudaHostCompilerArg"
+              $env:CPATH = "$env:CONDA_PREFIX/include:$env:CPATH"
+              $env:CPLUS_INCLUDE_PATH = "$env:CONDA_PREFIX/include:$env:CPLUS_INCLUDE_PATH"
+              $env:LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LIBRARY_PATH"
+              $env:LD_LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LD_LIBRARY_PATH"
+            } else {
+              $env:CMAKE_ARGS = $cudaHostCompilerArg.Trim()
+            }
+          }
+          $nvccPath = Join-Path $env:CONDA_PREFIX 'bin/nvcc'
+          if (-not (Test-Path $nvccPath)) {
+            $nvccPath = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc'
+          }
+          if (-not (Test-Path $nvccPath)) {
+            throw 'Failed to find nvcc in the conda environment'
+          }
+          $env:CUDACXX = $nvccPath
+          $env:PATH = "$(Split-Path $nvccPath):$env:PATH"
+          $nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value
+          if (-not $nvccVersion) {
+            throw 'Failed to detect the installed CUDA toolkit version'
           }
+          $cudaTagVersion = $nvccVersion.Replace('.','')
           $env:VERBOSE = '1'
-          $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all'
-          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
+          # Keep a portable SM set, including sm_70, instead of CMake's `all`,
+          # which now pulls in future targets the hosted-runner toolchains cannot assemble.
+          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
           # if ($env:AVXVER -eq 'AVX') {
           $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
           # }
@@ -124,10 +166,11 @@ jobs:
           #  $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
           # }
           python -m build --wheel
-          # write the build tag to the output
-          Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
+          # Publish tags that reflect the actual installed toolkit version.
+          Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV
 
       - uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
         with:
           files: dist/*
           # Set tag_name to <tag>-cu<cuda_version>

From ccc6bc0454b2d73431a419620aad92fda1aba162 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Tue, 24 Mar 2026 01:02:14 -0700
Subject: [PATCH 10/19] fix(ci): docker build workflow (#2156)

* fix(ci): harden docker build workflow

* docs: update changelog for ci workflows
---
 .github/workflows/build-docker.yaml | 11 ++++++++++-
 CHANGELOG.md                        |  4 ++++
 docker/simple/Dockerfile            |  3 ++-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml
index b290f6273..c65695847 100644
--- a/.github/workflows/build-docker.yaml
+++ b/.github/workflows/build-docker.yaml
@@ -16,6 +16,15 @@ jobs:
         with:
           submodules: "recursive"
 
+      - name: Set image tag
+        run: |
+          if [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then
+            image_tag="${GITHUB_REF_NAME}"
+          else
+            image_tag="${GITHUB_REF_NAME//\//-}"
+          fi
+          echo "IMAGE_TAG=$image_tag" >> "$GITHUB_ENV"
+
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
 
@@ -40,7 +49,7 @@ jobs:
           platforms: linux/amd64,linux/arm64
           tags: |
             ghcr.io/abetlen/llama-cpp-python:latest
-            ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }}
+            ghcr.io/abetlen/llama-cpp-python:${{ env.IMAGE_TAG }}
           build-args: |
             BUILDKIT_INLINE_CACHE=1
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d1195cc2a..b47613109 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156
+- fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155
+- fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154
+
 ## [0.3.17]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile
index 06483d44e..bad4f456f 100644
--- a/docker/simple/Dockerfile
+++ b/docker/simple/Dockerfile
@@ -6,6 +6,7 @@ FROM ${IMAGE}
 
 # Re-declare the ARG after FROM
 ARG IMAGE
+ARG CMAKE_ARGS="-DGGML_NATIVE=off"
 
 # Update and upgrade the existing packages 
 RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
@@ -26,7 +27,7 @@ RUN python3 -m pip install --upgrade pip
 
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 
-RUN pip install llama-cpp-python --verbose;
+RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose
 
 # Set environment variable for the host
 ENV HOST=0.0.0.0

From 7b38c3122d2ff3ad23e1502de045807836ced4a7 Mon Sep 17 00:00:00 2001
From: Victor Biederbeck <james@jamesbiederbeck.com>
Date: Tue, 24 Mar 2026 02:50:15 -0700
Subject: [PATCH 11/19] feat: expose attention_type parameter in Llama.__init__
 (#2143)

* feat: expose attention_type parameter in Llama.__init__

* docs: preserve attention_type in pickled state

* docs: update changelog for attention_type

---------

Co-authored-by: Victor Biederbeck <victor@moria.hiddencove.xyz>
Co-authored-by: abetlen <abetlen@gmail.com>
---
 CHANGELOG.md       | 1 +
 llama_cpp/llama.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b47613109..de4f070ff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143
 - fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156
 - fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155
 - fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 88bc2e5bb..ad484c4d5 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -81,6 +81,7 @@ def __init__(
             int
         ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
+        attention_type: int = llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED,
         rope_freq_base: float = 0.0,
         rope_freq_scale: float = 0.0,
         yarn_ext_factor: float = -1.0,
@@ -163,6 +164,7 @@ def __init__(
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
             pooling_type: Pooling type, from `enum llama_pooling_type`.
+            attention_type: Attention type, from `enum llama_attention_type`.
             rope_freq_base: RoPE base frequency, 0 = from model
             rope_freq_scale: RoPE frequency scaling factor, 0 = from model
             yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
@@ -319,6 +321,7 @@ def __init__(
             else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
         )
         self.context_params.pooling_type = pooling_type
+        self.context_params.attention_type = attention_type
         self.context_params.rope_freq_base = (
             rope_freq_base if rope_freq_base != 0.0 else 0
         )
@@ -2100,6 +2103,7 @@ def __getstate__(self):
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
             pooling_type=self.context_params.pooling_type,
+            attention_type=self.context_params.attention_type,
             rope_freq_base=self.context_params.rope_freq_base,
             rope_freq_scale=self.context_params.rope_freq_scale,
             yarn_ext_factor=self.context_params.yarn_ext_factor,

From d6f46a50d6b4cda10460c05e2acdbaec74428c1b Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Tue, 24 Mar 2026 02:56:01 -0700
Subject: [PATCH 12/19] chore: bump version (#2157)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index de4f070ff..4118f4848 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.18]
+
 - feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143
 - fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156
 - fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index a7c40478b..bdaefb9e0 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.17"
+__version__ = "0.3.18"

From 5f9c231ce165126f38c8897fd760ecd7ef79f9fd Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Wed, 25 Mar 2026 01:56:18 -0700
Subject: [PATCH 13/19] fix(ci): reduce CUDA binary wheel size only including
 cubins for current arches and one PTX target for forward compatibility
 (#2158)

* fix(ci): shrink CUDA wheel fatbins

* docs: update changelog for cuda wheel size fix
---
 .github/workflows/build-wheels-cuda.yaml | 7 ++++---
 CHANGELOG.md                             | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index b8d6c9dce..17daaa12a 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -153,9 +153,10 @@ jobs:
           }
           $cudaTagVersion = $nvccVersion.Replace('.','')
           $env:VERBOSE = '1'
-          # Keep a portable SM set, including sm_70, instead of CMake's `all`,
-          # which now pulls in future targets the hosted-runner toolchains cannot assemble.
-          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
+          # Build real cubins for the supported GPUs, including sm_70, and keep
+          # one forward-compatible PTX target instead of embedding PTX for every
+          # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
+          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
           # if ($env:AVXVER -eq 'AVX') {
           $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
           # }
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4118f4848..f4a0b55d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158
+
 ## [0.3.18]
 
 - feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143

From ac59e5a5ae8d331d80f30d3ddfc50195061637f5 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Wed, 25 Mar 2026 15:03:57 -0700
Subject: [PATCH 14/19] fix: handle embedding models without KV memory (#2160)

* Fix embedding models without KV memory

* Add changelog entry for embedding memory fix
---
 CHANGELOG.md            |  1 +
 llama_cpp/_internals.py |  4 +++-
 tests/test_llama.py     | 16 ++++++++++++----
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f4a0b55d3..d2e4937c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: Handle embedding models without KV memory and test embeddings with a real GGUF embedding model by @abetlen in #2160
 - fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158
 
 ## [0.3.18]
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 6862135aa..9e9bcd407 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -288,7 +288,9 @@ def pooling_type(self) -> int:
         return llama_cpp.llama_pooling_type(self.ctx)
 
     def kv_cache_clear(self):
-        assert self.memory is not None, "Memory is not initialized"
+        # Embedding models with non-causal attention may not allocate memory.
+        if self.memory is None:
+            return
         llama_cpp.llama_memory_clear(self.memory, True)
 
     def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool:
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 1a70c74d4..23928fff6 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -64,6 +64,14 @@ def llama_cpp_model_path():
     return model_path
 
 
+@pytest.fixture
+def llama_cpp_embedding_model_path():
+    repo_id = "CompendiumLabs/bge-small-en-v1.5-gguf"
+    filename = "bge-small-en-v1.5-q4_k_m.gguf"
+    model_path = hf_hub_download(repo_id, filename)
+    return model_path
+
+
 def test_real_model(llama_cpp_model_path):
     import os
 
@@ -225,9 +233,9 @@ def logit_processor_func(input_ids, logits):
     assert number_1 == number_3
 
 
-def test_real_llama_embeddings(llama_cpp_model_path):
+def test_real_llama_embeddings(llama_cpp_embedding_model_path):
     model = llama_cpp.Llama(
-        llama_cpp_model_path,
+        llama_cpp_embedding_model_path,
         n_ctx=32,
         n_batch=32,
         n_ubatch=32,
@@ -237,5 +245,5 @@ def test_real_llama_embeddings(llama_cpp_model_path):
         flash_attn=True,
         embedding=True,
     )
-    # Smoke test for now
-    model.embed("Hello World")
+    embedding = model.embed("Hello World")
+    assert len(embedding) > 0

From c670222c8379608aa22ac81e5f6a813620187a26 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Wed, 25 Mar 2026 15:21:44 -0700
Subject: [PATCH 15/19] feat: Update llama.cpp to
 ggerganov/llama.cpp@c0159f9c1f874da15e94f371d136f5920b4b5335 (#2161)

* Update llama.cpp to c0159f9c1

* Add changelog entry for llama.cpp update
---
 CHANGELOG.md           |  1 +
 llama_cpp/llama_cpp.py | 16 ++++++++++++++++
 vendor/llama.cpp       |  2 +-
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2e4937c9..03240454f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: Update llama.cpp to ggerganov/llama.cpp@c0159f9c1f874da15e94f371d136f5920b4b5335 by @abetlen in #2161
 - fix: Handle embedding models without KV memory and test embeddings with a real GGUF embedding model by @abetlen in #2160
 - fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158
 
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index e51492c56..5a6c06b07 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1314,6 +1314,22 @@ def llama_model_load_from_splits(
     ...
 
 
+# // Load a model from an open FILE pointer
+# LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
+#                                FILE * file,
+#           struct llama_model_params   params);
+@ctypes_function(
+    "llama_model_load_from_file_ptr",
+    [ctypes.c_void_p, llama_model_params],
+    llama_model_p_ctypes,
+)
+def llama_model_load_from_file_ptr(
+    file: ctypes.c_void_p, params: llama_model_params, /
+) -> Optional[llama_model_p]:
+    """Load a model from an open FILE pointer."""
+    ...
+
+
 # LLAMA_API void llama_model_save_to_file(
 #         const struct llama_model * model,
 #                     const char * path_model);
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 49bfddeca..c0159f9c1 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 49bfddeca18e62fa3d39114a23e9fcbdf8a22388
+Subproject commit c0159f9c1f874da15e94f371d136f5920b4b5335

From f54421ba71db942b262a28762bc9e035a5d4d349 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Wed, 25 Mar 2026 15:28:12 -0700
Subject: [PATCH 16/19] Bump version to 0.3.19 (#2162)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 03240454f..d1efdc5e9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.19]
+
 - feat: Update llama.cpp to ggerganov/llama.cpp@c0159f9c1f874da15e94f371d136f5920b4b5335 by @abetlen in #2161
 - fix: Handle embedding models without KV memory and test embeddings with a real GGUF embedding model by @abetlen in #2160
 - fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index bdaefb9e0..72388c4e5 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.18"
+__version__ = "0.3.19"

From fcd932a1574b7b6fdfc6c2d652f10f3af66995b5 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sat, 28 Mar 2026 21:36:13 -0700
Subject: [PATCH 17/19] fix(ci): publish distinct manylinux and musllinux cpu
 wheels (#2165)

* fix(ci): publish distinct manylinux and musllinux cpu wheels

* docs: add changelog entry for linux wheel repair fix
---
 .github/workflows/build-and-release.yaml | 5 ++++-
 CHANGELOG.md                             | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 3a9e6f369..6cbac0cb1 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -44,8 +44,11 @@ jobs:
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.22.0
         env:
-          # disable repair
+          # Keep repair disabled by default for non-Linux platforms in this job.
           CIBW_REPAIR_WHEEL_COMMAND: ""
+          # Linux needs auditwheel repair so manylinux and musllinux wheels are
+          # published with distinct platform tags instead of generic linux tags.
+          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair -w {dest_dir} {wheel}"
           # Skip cibuildwheel's default i686 sidecar and keep Linux release
           # wheels on a portable x86_64 CPU baseline.
           CIBW_ARCHS_LINUX: "auto64"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d1efdc5e9..7bcad0a47 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165
+
 ## [0.3.19]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@c0159f9c1f874da15e94f371d136f5920b4b5335 by @abetlen in #2161

From 7613aca61259820ab550626384af52eed56a731f Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 29 Mar 2026 00:31:03 -0700
Subject: [PATCH 18/19] ci: publish release wheels as py3-none (#2166)

* ci: publish CPU wheels as py3-none

* docs: add changelog entry for py3-none wheel tags
---
 CHANGELOG.md   | 1 +
 pyproject.toml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7bcad0a47..3b4c13ee3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- ci: Publish release wheels as `py3-none` by @Bing-su in #2166
 - fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165
 
 ## [0.3.19]
diff --git a/pyproject.toml b/pyproject.toml
index e0b0dc520..b5998dd1c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,6 +63,7 @@ all = [
 
 [tool.scikit-build]
 wheel.packages = ["llama_cpp"]
+wheel.py-api = "py3"
 cmake.verbose = true
 cmake.minimum-version = "3.21"
 minimum-version = "0.5.1"

From 7257ba95fbbf65201fd5bf4b7f0bdd1c701e1345 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 29 Mar 2026 23:17:22 -0700
Subject: [PATCH 19/19] feat(server): add model-load chat_template_kwargs
 (#2168)

---
 CHANGELOG.md                   |  1 +
 docs/server.md                 | 25 +++++++++++++++++++++++++
 llama_cpp/llama_chat_format.py |  7 ++++++-
 llama_cpp/server/cli.py        | 34 ++++++++++++++++++++++++++++++++--
 llama_cpp/server/model.py      | 15 +++++++++++++++
 llama_cpp/server/settings.py   |  6 +++++-
 6 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3b4c13ee3..e577324db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat(server): Add model-load `chat_template_kwargs` support and document the CLI/config usage by @abetlen in #2168
 - ci: Publish release wheels as `py3-none` by @Bing-su in #2166
 - fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165
 
diff --git a/docs/server.md b/docs/server.md
index cd6f86c51..9c09a1f1c 100644
--- a/docs/server.md
+++ b/docs/server.md
@@ -22,6 +22,15 @@ The server can then be started by running the following command:
 python3 -m llama_cpp.server --model <model_path>
 ```
 
+You can also pass chat-template kwargs at model load time from the CLI:
+
+```bash
+python3 -m llama_cpp.server \
+  --model <model_path> \
+  --chat_format chatml \
+  --chat_template_kwargs '{"enable_thinking": true}'
+```
+
 ### Server options
 
 For a full list of options, run:
@@ -147,6 +156,22 @@ The server supports routing requests to multiple models based on the `model` par
 
 At the moment only a single model is loaded into memory at, the server will automatically load and unload models as needed.
 
+For a single-model config, `chat_template_kwargs` can be set directly on the model entry:
+
+```json
+{
+    "models": [
+        {
+            "model": "models/Qwen3.5-0.8B/qwen3.5-0.8b-q8_0.gguf",
+            "chat_format": "chatml",
+            "chat_template_kwargs": {
+                "enable_thinking": true
+            }
+        }
+    ]
+}
+```
+
 ```json
 {
     "host": "0.0.0.0",
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index d7910e984..1024fb85b 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -243,6 +243,7 @@ def raise_exception(message: str):
             tools=tools,
             tool_choice=tool_choice,
             strftime_now=self.strftime_now,
+            **kwargs,
         )
 
         stopping_criteria = None
@@ -617,6 +618,7 @@ def chat_completion_handler(
             function_call=function_call,
             tools=tools,
             tool_choice=tool_choice,
+            **kwargs,
         )
         prompt = llama.tokenize(
             result.prompt.encode("utf-8"),
@@ -734,7 +736,9 @@ def format_autotokenizer(
         **kwargs: Any,
     ) -> ChatFormatterResponse:
         tokenizer.use_default_system_prompt = False  # type: ignore
-        prompt: str = tokenizer.apply_chat_template(messages, tokenize=False)  # type: ignore
+        prompt: str = tokenizer.apply_chat_template(  # type: ignore
+            messages, tokenize=False, **kwargs
+        )
         assert isinstance(prompt, str)
         # Return formatted prompt and eos token by default
         return ChatFormatterResponse(
@@ -791,6 +795,7 @@ def format_tokenizer_config(
             messages=messages,
             bos_token=bos_token,
             eos_token=eos_token,
+            **kwargs,
         )
         return ChatFormatterResponse(
             prompt=prompt, stop=[eos_token, bos_token], added_special=True
diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py
index 8ed029063..171b8db30 100644
--- a/llama_cpp/server/cli.py
+++ b/llama_cpp/server/cli.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
 
 import argparse
+import json
 
-from typing import List, Literal, Union, Any, Type, TypeVar
+from typing import List, Literal, Union, Any, Type, TypeVar, Dict
 
 from pydantic import BaseModel
 
@@ -40,6 +41,17 @@ def _contains_list_type(annotation: Type[Any] | None) -> bool:
         return False
 
 
+def _contains_dict_type(annotation: Type[Any] | None) -> bool:
+    origin = getattr(annotation, "__origin__", None)
+
+    if origin is dict or origin is Dict:
+        return True
+    elif origin in (Literal, Union):
+        return any(_contains_dict_type(arg) for arg in annotation.__args__)  # type: ignore
+    else:
+        return False
+
+
 def _parse_bool_arg(arg: str | bytes | bool) -> bool:
     if isinstance(arg, bytes):
         arg = arg.decode("utf-8")
@@ -57,6 +69,16 @@ def _parse_bool_arg(arg: str | bytes | bool) -> bool:
         raise ValueError(f"Invalid boolean argument: {arg}")
 
 
+def _parse_json_object_arg(arg: str | bytes) -> dict[str, Any]:
+    if isinstance(arg, bytes):
+        arg = arg.decode("utf-8")
+
+    value = json.loads(arg)
+    if not isinstance(value, dict):
+        raise ValueError(f"Invalid JSON object argument: {arg}")
+    return value
+
+
 def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]):
     """Add arguments from a pydantic model to an argparse parser."""
 
@@ -68,7 +90,15 @@ def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel])
             _get_base_type(field.annotation) if field.annotation is not None else str
         )
         list_type = _contains_list_type(field.annotation)
-        if base_type is not bool:
+        dict_type = _contains_dict_type(field.annotation)
+        if dict_type:
+            parser.add_argument(
+                f"--{name}",
+                dest=name,
+                type=_parse_json_object_arg,
+                help=description,
+            )
+        elif base_type is not bool:
             parser.add_argument(
                 f"--{name}",
                 dest=name,
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 9e59e8563..3922ce5df 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -299,6 +299,21 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
             # Misc
             verbose=settings.verbose,
         )
+        if settings.chat_template_kwargs:
+            base_chat_handler = (
+                _model.chat_handler
+                or _model._chat_handlers.get(_model.chat_format)
+                or llama_cpp.llama_chat_format.get_chat_completion_handler(
+                    _model.chat_format
+                )
+            )
+
+            def chat_handler_with_kwargs(*args, **kwargs):
+                return base_chat_handler(
+                    *args, **{**settings.chat_template_kwargs, **kwargs}
+                )
+
+            _model.chat_handler = chat_handler_with_kwargs
         if settings.cache:
             if settings.cache_type == "disk":
                 if settings.verbose:
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 13c951241..3c2bb7fd0 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -2,7 +2,7 @@
 
 import multiprocessing
 
-from typing import Optional, List, Literal, Union, Dict, cast
+from typing import Any, Optional, List, Literal, Union, Dict, cast
 from typing_extensions import Self
 
 from pydantic import Field, model_validator
@@ -131,6 +131,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="Chat format to use.",
     )
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Extra keyword arguments forwarded to chat templates at model load time. Matches llama.cpp server `chat_template_kwargs`.",
+    )
     clip_model_path: Optional[str] = Field(
         default=None,
         description="Path to a CLIP model to use for multi-modal chat completion.",