
Commit 8cb4df5

Merge remote-tracking branch 'origin/master' into GraniteFour

* origin/master:
  metal : disable fast-math for some cpy kernels (ggml-org#14460)
  ggml-cpu: sycl: Re-enable exp f16 (ggml-org#14462)
  test-backend-ops : disable llama test (ggml-org#14461)
  cmake : Remove redundant include path in CMakeLists.txt (ggml-org#14452)
  scripts : make the shell scripts cross-platform (ggml-org#14341)
  server : support jinja extra template kwargs (Qwen3 enable_thinking feature), from command line and from client (ggml-org#13196)
  server : fix appearance of the chats list context menu for Safari (ggml-org#14322)
  SYCL: disable faulty fp16 exp kernel (ggml-org#14395)
  ggml : fix unmerged GGML_FPxx_TO_FPxx refactoring (ggml-org#14443)
  ggml : implement REGLU/GEGLU/SWIGLU ops (ggml-org#14158)
  vulkan: Add fusion support for RMS_NORM+MUL (ggml-org#14366)
  CUDA: add bf16 and f32 support to cublas_mul_mat_batched (ggml-org#14361)
  vulkan: handle noncontig in the final case of ggml_vk_get_cpy_pipeline (ggml-org#14378)
  vulkan: lock accesses of pinned_memory vector (ggml-org#14333)
  model : add support for ERNIE 4.5 0.3B model (ggml-org#14408)
  fix async_mode bug (ggml-org#14432)
  ci : fix windows build and release (ggml-org#14431)
  vulkan: Fix GGML_VULKAN_SHADER_DEBUG_INFO (ggml-org#14427)
  graph : make llm_graph_context destructor virtual (ggml-org#14410)

2 parents 2b263e6 + 5dd942d

File tree

99 files changed (+2903, -1296 lines)


.devops/tools.sh

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -e
 
 # Read the first argument into a variable

.github/workflows/build.yml

Lines changed: 13 additions & 5 deletions

@@ -664,7 +664,7 @@ jobs:
           ./build-xcframework.sh
 
   windows-msys2:
-    runs-on: windows-latest
+    runs-on: windows-2025
 
     strategy:
       fail-fast: false
@@ -714,7 +714,7 @@ jobs:
           cmake --build build --config ${{ matrix.build }} -j $(nproc)
 
   windows-latest-cmake:
-    runs-on: windows-latest
+    runs-on: windows-2025
 
     env:
       OPENBLAS_VERSION: 0.3.23
@@ -725,16 +725,22 @@ jobs:
       matrix:
         include:
           - build: 'cpu-x64 (static)'
+            arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
           - build: 'openblas-x64'
+            arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
+            arch: 'x64'
             defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'llvm-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'llvm-arm64-opencl-adreno'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
           # - build: 'kompute-x64'
+          #   arch: 'x64'
           #   defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
 
     steps:
@@ -805,6 +811,8 @@ jobs:
       - name: libCURL
        id: get_libcurl
        uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
 
       - name: Build
         id: cmake_build
@@ -825,7 +833,7 @@ jobs:
 
       - name: Test
         id: cmake_test
-        if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
+        if: ${{ matrix.arch == 'x64' }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900
@@ -930,7 +938,7 @@ jobs:
           cmake --build build --config Release
 
   windows-latest-cmake-sycl:
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     defaults:
       run:
@@ -964,7 +972,7 @@ jobs:
 
   windows-latest-cmake-hip:
     if: ${{ github.event.inputs.create_release != 'true' }}
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     steps:
       - name: Clone

.github/workflows/release.yml

Lines changed: 6 additions & 6 deletions

@@ -235,7 +235,7 @@ jobs:
           name: llama-bin-ubuntu-vulkan-x64.zip
 
   windows-cpu:
-    runs-on: windows-latest
+    runs-on: windows-2025
 
     strategy:
       matrix:
@@ -271,7 +271,7 @@ jobs:
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
           cmake -S . -B build -G "Ninja Multi-Config" ^
             -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
             -DGGML_NATIVE=OFF ^
@@ -288,7 +288,7 @@ jobs:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
-          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
 
       - name: Upload artifacts
@@ -298,7 +298,7 @@ jobs:
           name: llama-bin-win-cpu-${{ matrix.arch }}.zip
 
   windows:
-    runs-on: windows-latest
+    runs-on: windows-2025
 
     env:
       OPENBLAS_VERSION: 0.3.23
@@ -448,7 +448,7 @@ jobs:
           name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
 
   windows-sycl:
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     defaults:
       run:
@@ -520,7 +520,7 @@ jobs:
           name: llama-bin-win-sycl-x64.zip
 
   windows-hip:
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     strategy:
       matrix:

build-xcframework.sh

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # Options
 IOS_MIN_OS_VERSION=16.4

ci/run.sh

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # sample usage:
 #

common/arg.cpp

Lines changed: 10 additions & 0 deletions

@@ -2794,6 +2794,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+    add_opt(common_arg(
+        {"--chat-template-kwargs"}, "STRING",
+        string_format("sets additional params for the json template parser"),
+        [](common_params & params, const std::string & value) {
+            auto parsed = json::parse(value);
+            for (const auto & item : parsed.items()) {
+                params.default_template_kwargs[item.key()] = item.value().dump();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
    add_opt(common_arg(
        {"-to", "--timeout"}, "N",
        string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),

common/chat.cpp

Lines changed: 37 additions & 20 deletions

@@ -17,6 +17,8 @@
 #include <string>
 #include <vector>
 
+using json = nlohmann::ordered_json;
+
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
     auto time = std::chrono::system_clock::to_time_t(now);
     auto local_time = *std::localtime(&time);
@@ -140,6 +142,7 @@ struct templates_params {
     bool add_generation_prompt = true;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    json extra_context;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const
 
 static std::string apply(
     const common_chat_template & tmpl,
-    const nlohmann::ordered_json & messages,
-    const nlohmann::ordered_json & tools,
-    bool add_generation_prompt,
-    const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
+    const struct templates_params & inputs,
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt)
 {
     minja::chat_template_inputs tmpl_inputs;
-    tmpl_inputs.messages = messages;
-    tmpl_inputs.tools = tools;
-    tmpl_inputs.add_generation_prompt = add_generation_prompt;
-    tmpl_inputs.extra_context = extra_context;
+    tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+    if (tools_override) {
+        tmpl_inputs.tools = *tools_override;
+    } else {
+        tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+    }
+    tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+    tmpl_inputs.extra_context = inputs.extra_context;
+    if (additional_context) {
+        tmpl_inputs.extra_context.merge_patch(*additional_context);
+    }
     // TODO: add flag to control date/time, if only for testing purposes.
     // tmpl_inputs.now = std::chrono::system_clock::now();
 
@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
         inputs.messages,
         "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
 
-    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
     data.format = COMMON_CHAT_FORMAT_GENERIC;
     return data;
 }
@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     data.preserved_tokens = {
         "[TOOL_CALLS]",
     };
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
     return data;
 }
@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
             adjusted_messages.push_back(msg);
         }
     }
-    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
     data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
     if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
         if (!inputs.enable_thinking) {
@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     } else {
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
         {"date_string", format_time(inputs.now, "%d %b %Y")},
         {"tools_in_user_message", false},
         {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
 
 static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    auto prompt = apply(tmpl, inputs);
 
     // Hacks to fix the official (broken) prompt.
     // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
     });
@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
     // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
     // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
 
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     // TODO: if (has_raw_python)
     return data;
 }
@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
 
-    json additional_context = {
+    json extra_context = json {
         {"enable_thinking", inputs.enable_thinking},
     };
+    extra_context.update(inputs.extra_context);
 
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
     data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
     if (string_ends_with(data.prompt, "<think>\n")) {
-        if (!inputs.enable_thinking) {
+        if (!extra_context["enable_thinking"]) {
             data.prompt += "</think>";
         } else {
             data.thinking_forced_open = true;
@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     data.grammar_lazy = false;
     if (!inputs.json_schema.is_null()) {
@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+
+    params.extra_context = json::object();
+    for (auto el : inputs.chat_template_kwargs) {
+        params.extra_context[el.first] = json::parse(el.second);
+    }
+
     if (!inputs.json_schema.empty()) {
         params.json_schema = json::parse(inputs.json_schema);
     }
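Two different merge operations appear in the diff above and are worth distinguishing: apply() layers a per-format additional_context over the request-level extra_context with merge_patch(), while the Hermes handler starts from its own default ({"enable_thinking", ...}) and calls update() with the user's kwargs, so a client-supplied enable_thinking overrides the default. A minimal sketch of the two nlohmann operations, illustrative only and not taken from this commit:

    // Sketch: json::update() does a shallow key-by-key overwrite; merge_patch()
    // is the recursive RFC 7386 merge, where a null value deletes a key.
    // For the flat kwargs used here they behave almost identically.
    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
        using json = nlohmann::ordered_json;

        // hermes_2_pro path: format default first, then user kwargs win
        json extra_context = json {
            {"enable_thinking", true},   // handler default
        };
        json user_kwargs = {{"enable_thinking", false}};
        extra_context.update(user_kwargs);
        std::cout << extra_context.dump() << "\n"; // {"enable_thinking":false}

        // apply() path: per-format additional_context layered on top
        json ctx = {{"date_string", "01 Jul 2025"}};
        ctx.merge_patch({{"tools_in_user_message", false}});
        std::cout << ctx.dump() << "\n"; // both keys present
        return 0;
    }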

common/chat.h

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
 #include <chrono>
 #include <string>
 #include <vector>
+#include <map>
 
 struct common_chat_templates;
 
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    std::map<std::string, std::string> chat_template_kwargs;
 };
 
 struct common_chat_params {

common/common.h

Lines changed: 3 additions & 0 deletions

@@ -8,6 +8,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include <map>
 #include <sstream>
 
 #ifdef _WIN32
@@ -381,6 +382,8 @@ struct common_params {
     std::string ssl_file_key = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT
 
+    std::map<std::string, std::string> default_template_kwargs;
+
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
     bool endpoint_slots = false;
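Note the representation choice: both headers store the kwargs as std::map<std::string, std::string> rather than as JSON values, presumably so that common.h and chat.h carry no nlohmann dependency. Each value is a serialized JSON fragment that common_chat_templates_apply_jinja() parses back (the params.extra_context[el.first] = json::parse(el.second) loop in the chat.cpp diff). A minimal round-trip sketch of that invariant, again an illustration rather than code from this commit:

    // Sketch: the dump()/parse() round trip implied by the string map.
    // Types survive: a JSON false is stored as the string "false" and
    // comes back as a boolean, not as the string "false".
    #include <nlohmann/json.hpp>
    #include <cassert>
    #include <map>
    #include <string>

    int main() {
        using json = nlohmann::ordered_json;

        std::map<std::string, std::string> kwargs;
        kwargs["enable_thinking"] = json(false).dump(); // stored as "false"

        json extra_context = json::object();
        for (const auto & el : kwargs) {
            extra_context[el.first] = json::parse(el.second);
        }
        assert(extra_context["enable_thinking"].is_boolean());
        assert(extra_context["enable_thinking"] == false);
        return 0;
    }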
