Commit 103f671

Author: Ralf Waldukat (committed)
Update llama.cpp to 2026-01-01
- Update vendor/llama.cpp submodule to be47fb92 (2026-01-01)
- Bump version from 0.3.16 to 0.4.0

Breaking changes:
- Migrate flash_attn bool to flash_attn_type enum (backward compatible via None=AUTO)
- Replace llama_kv_self_* API with llama_memory_* API

New features:
- Add LLAMA_FLASH_ATTN_TYPE_* enum (AUTO/DISABLED/ENABLED)
- Add llama_model_params fields: no_host, no_alloc
- Add mtmd_context_params fields: flash_attn_type, warmup, image_min/max_tokens
- Add LLAMA_ROPE_TYPE_IMROPE, LLAMA_PARAMS_FIT_STATUS_* enums
- Add 15+ new functions: llama_max_tensor_buft_overrides, llama_n_ctx_seq, llama_model_n_embd_inp, llama_model_is_hybrid, llama_log_*, llama_memory_*, llama_attach/detach_threadpool, llama_adapter_meta_* (4 functions)

Fixes:
- Server settings: flash_attn default None (AUTO) instead of False (DISABLED)
- Enable FIM token functions: token_prefix/middle/suffix
- Fix typos: additonal→additional, unnused→unused
- Remove deprecated verbosity field from mtmd_context_params
- Add CMake version workaround documentation

Code quality:
- Consistent stub style (... not pass)
- Struct alignment verified against llama.h and mtmd.h
- Minimal whitespace noise (0.4% of diff)
1 parent c37132b commit 103f671
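For users of the high-level API, the flash_attn change is backward compatible: the parameter is now a tri-state Optional[bool]. A minimal sketch of the new behavior (the model path is a placeholder, not taken from this commit):

from llama_cpp import Llama

# None is the new default and maps to LLAMA_FLASH_ATTN_TYPE_AUTO internally;
# True and False force the ENABLED and DISABLED enum values respectively.
llm_auto = Llama(model_path="./model.gguf")                    # AUTO
llm_on   = Llama(model_path="./model.gguf", flash_attn=True)   # ENABLED
llm_off  = Llama(model_path="./model.gguf", flash_attn=False)  # DISABLED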

7 files changed · +434 −334 lines changed


CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -153,6 +153,15 @@ if (LLAMA_BUILD)
         add_compile_definitions(GGML_USE_METAL)
     endif()
 
+    # Set version for mtmd (required by upstream CMakeLists.txt)
+    # NOTE: This is a workaround for mtmd build requirements.
+    # Version is set to 0.0.0 for local builds. If upstream adds version
+    # compatibility checks, this may need to match llama.cpp version.
+    if (NOT DEFINED LLAMA_BUILD_NUMBER)
+        set(LLAMA_BUILD_NUMBER 0)
+    endif()
+    set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
+
     # Building llava
     add_subdirectory(vendor/llama.cpp/tools/mtmd)

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.16"
+__version__ = "0.4.0"

llama_cpp/llama.py

Lines changed: 37 additions & 16 deletions
@@ -91,9 +91,9 @@ def __init__(
         logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
-        flash_attn: bool = False,
         op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
+        flash_attn: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
@@ -173,7 +173,7 @@ def __init__(
             logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
-            flash_attn: Use flash attention.
+            flash_attn: Use flash attention. None = auto, True = enabled, False = disabled.
             op_offload: offload host tensor operations to device
             swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
@@ -341,7 +341,16 @@ def __init__(
         self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-        self.context_params.flash_attn = flash_attn
+        if flash_attn is None:
+            self.context_params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_AUTO
+        elif flash_attn:
+            self.context_params.flash_attn_type = (
+                llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            )
+        else:
+            self.context_params.flash_attn_type = (
+                llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+            )
 
         if op_offload is not None:
             self.context_params.op_offload = op_offload
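Callers that build the low-level context params themselves need the same switch from the removed flash_attn bool field to the new enum field. A rough sketch, assuming the constant names added by this commit (llama_context_default_params is the existing default-params helper in the bindings):

import llama_cpp

ctx_params = llama_cpp.llama_context_default_params()
# Previously: ctx_params.flash_attn = True
ctx_params.flash_attn_type = llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED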
@@ -934,7 +943,8 @@ def generate(
 
                 sample_idx += 1
                 if stopping_criteria is not None and stopping_criteria(
-                    self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
+                    self._input_ids[:sample_idx],
+                    self._scores[sample_idx - self.n_tokens, :],
                 ):
                     return
                 tokens_or_none = yield token
@@ -1041,7 +1051,9 @@ def embed(
         data: Union[List[List[float]], List[List[List[float]]]] = []
 
         def decode_batch(seq_sizes: List[int]):
-            llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+            mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+            if mem is not None:
+                llama_cpp.llama_memory_clear(mem, True)
             self._ctx.decode(self._batch)
             self._batch.reset()
 
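Low-level code that used to clear the KV cache directly needs the same migration shown in embed() above. A hedged sketch, assuming ctx is a low-level llama_context pointer and that the second argument of llama_memory_clear also clears the data buffers (as documented in upstream llama.h):

# Old API (removed upstream):
# llama_cpp.llama_kv_self_clear(ctx)

# New API introduced by this update:
mem = llama_cpp.llama_get_memory(ctx)
if mem is not None:
    llama_cpp.llama_memory_clear(mem, True)  # True = also clear the data buffers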
@@ -1112,7 +1124,9 @@ def decode_batch(seq_sizes: List[int]):
 
         output = data[0] if isinstance(input, str) else data
 
-        llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+        mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+        if mem is not None:
+            llama_cpp.llama_memory_clear(mem, True)
         self.reset()
 
         if return_count:
@@ -1157,9 +1171,9 @@ def _create_completion(
         bos_token_id: int = self.token_bos()
         cls_token_id: int = self._model.token_cls()
         sep_token_id: int = self._model.token_sep()
-        prefix_token_id: int = 0  # self._model.token_prefix() # TODO: Fix
-        middle_token_id: int = 0  # self._model.token_middle() # TODO: Fix
-        suffix_token_id: int = 0  # self._model.token_suffix() # TODO: Fix
+        prefix_token_id: int = self._model.token_prefix()
+        middle_token_id: int = self._model.token_middle()
+        suffix_token_id: int = self._model.token_suffix()
         add_space_prefix: bool = (
             self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
         )
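With the FIM token functions enabled again, infill prompts can use the model's real prefix/middle/suffix token ids instead of the previous 0 placeholders. A sketch of the common fill-in-middle layout (the exact assembly inside _create_completion may differ; prefix/suffix here are the text before and after the gap):

tokens = (
    [prefix_token_id]
    + llm.tokenize(prefix.encode("utf-8"), add_bos=False, special=False)
    + [suffix_token_id]
    + llm.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
    + [middle_token_id]  # model generates the middle after this token
)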
@@ -1315,7 +1329,7 @@ def logit_bias_processor(
         if seed is not None:
             self.set_seed(seed)
         else:
-            self.set_seed(random.Random(self._seed).randint(0, 2 ** 32))
+            self.set_seed(random.Random(self._seed).randint(0, 2**32))
 
         finish_reason = "length"
         multibyte_fix = 0
@@ -2056,7 +2070,10 @@ def create_chat_completion_openai_v1(
             stream = kwargs.get("stream", False)  # type: ignore
             assert isinstance(stream, bool)
             if stream:
-                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
+                return (
+                    ChatCompletionChunk(**chunk)
+                    for chunk in self.create_chat_completion(*args, **kwargs)
+                )  # type: ignore
             else:
                 return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
         except ImportError:
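The reformatting above does not change the helper's behavior: it still returns OpenAI-typed objects and requires the optional openai package. A small usage sketch, assuming llm is an existing Llama instance:

completion = llm.create_chat_completion_openai_v1(
    messages=[{"role": "user", "content": "Hello"}],
)
print(completion.choices[0].message.content)

# Streaming returns ChatCompletionChunk objects instead.
for chunk in llm.create_chat_completion_openai_v1(
    messages=[{"role": "user", "content": "Hello"}], stream=True
):
    print(chunk.choices[0].delta.content or "", end="")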
@@ -2096,7 +2113,7 @@ def __getstate__(self):
             logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
-            flash_attn=self.context_params.flash_attn,
+            flash_attn=self.context_params.flash_attn_type,
             op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
@@ -2316,19 +2333,23 @@ def from_pretrained(
         )
 
         if additional_files:
-            for additonal_file_name in additional_files:
+            for additional_file_name in additional_files:
                 # find the additional shard file:
-                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+                matching_additional_files = [
+                    file
+                    for file in file_list
+                    if fnmatch.fnmatch(file, additional_file_name)
+                ]
 
                 if len(matching_additional_files) == 0:
                     raise ValueError(
-                        f"No file found in {repo_id} that match {additonal_file_name}\n\n"
+                        f"No file found in {repo_id} that match {additional_file_name}\n\n"
                         f"Available Files:\n{json.dumps(file_list)}"
                     )
 
                 if len(matching_additional_files) > 1:
                     raise ValueError(
-                        f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n"
+                        f"Multiple files found in {repo_id} matching {additional_file_name}\n\n"
                         f"Available Files:\n{json.dumps(files)}"
                     )
 
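The renamed loop variable does not change the from_pretrained interface; additional_files still takes fnmatch patterns for extra files to download alongside the main GGUF. A usage sketch with placeholder repo and file patterns (not taken from this commit):

llm = Llama.from_pretrained(
    repo_id="someuser/some-model-GGUF",
    filename="*Q4_K_M.gguf",
    additional_files=["*mmproj*"],  # extra files matched by fnmatch patterns
)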