Add PaliGemma support #1777

Draft · wants to merge 3 commits into main
77 changes: 77 additions & 0 deletions llama_cpp/_llava.py
@@ -0,0 +1,77 @@
from __future__ import annotations

import os
import ctypes
import typing
import contextlib

import numpy as np

import llama_cpp
import llama_cpp.llava_cpp as llava_cpp


class LlavaEmbedding:
    """Owns a llava_image_embed pointer and registers llava_image_embed_free as its cleanup callback."""

def __init__(self, embedding: ctypes._Pointer[llava_cpp.llava_image_embed]):
self._embedding = embedding
self._exit_stack = contextlib.ExitStack()

def llava_image_embed_free():
llava_cpp.llava_image_embed_free(self._embedding)

self._exit_stack.callback(llava_image_embed_free)

@property
def n_image_pos(self) -> int:
return self._embedding.contents.n_image_pos

def embed(
self, llama_ctx: llama_cpp.llama_context_p, n_tokens: int, n_batch: int
) -> int:
n_past = ctypes.c_int(n_tokens)
n_past_p = ctypes.pointer(n_past)
llava_cpp.llava_eval_image_embed(
llama_ctx,
self._embedding,
n_batch,
n_past_p,
)
return n_past.value

def numpy_view(self, shape: typing.Tuple[int, int]) -> np.ndarray:
return np.ctypeslib.as_array(
self._embedding.contents.embed, shape=shape
)


class LlavaModel:
    """Loads a CLIP/projector model from `path` and registers clip_free as its cleanup callback."""

def __init__(self, path: str, n_threads: int = 1):
self._path = path
self._n_threads = n_threads
self._exit_stack = contextlib.ExitStack()

if not os.path.exists(self._path):
raise ValueError(f"Clip model path does not exist: {self._path}")

clip_ctx = llava_cpp.clip_model_load(self._path.encode(), 0)

if clip_ctx is None:
raise ValueError(f"Failed to load clip model: {self._path}")

self._clip_ctx = clip_ctx

def clip_free():
llava_cpp.clip_free(self._clip_ctx)
print("Clip model freed")

self._exit_stack.callback(clip_free)

def embed_bytes(self, image_bytes: bytes):
embed = llava_cpp.llava_image_embed_make_with_bytes(
self._clip_ctx,
self._n_threads,
(ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
len(image_bytes),
)
return LlavaEmbedding(embed)
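
A minimal usage sketch of the new wrapper classes (the model and image paths below are placeholders, not part of this PR):

# Sketch only: exercises LlavaModel / LlavaEmbedding as added in llama_cpp/_llava.py.
from llama_cpp import Llama
from llama_cpp._llava import LlavaModel

llm = Llama(model_path="model.gguf", n_ctx=2048)   # placeholder model
clip = LlavaModel("mmproj.gguf", n_threads=4)      # placeholder projector

with open("image.png", "rb") as f:
    embedding = clip.embed_bytes(f.read())

# View the raw image embedding as an (n_image_pos, n_embd) array.
embed_array = embedding.numpy_view((embedding.n_image_pos, llm.n_embd()))

# Decode the image embedding into the context, starting at position 0.
n_past = embedding.embed(llm.ctx, n_tokens=0, n_batch=512)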

200 changes: 200 additions & 0 deletions llama_cpp/llama_chat_format.py
@@ -28,9 +28,11 @@
import numpy as np
import numpy.typing as npt

import llama_cpp
import llama_cpp.llama as llama
import llama_cpp.llama_types as llama_types
import llama_cpp.llama_grammar as llama_grammar
import llama_cpp._internals as internals

from ._logger import logger
from ._utils import suppress_stdout_stderr, Singleton
@@ -3350,6 +3352,204 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
)


class PaliGemmaChatHandler(Llava15ChatHandler):
    """Chat handler for PaliGemma-style models: single-turn, at most one image,
    with the image + text prompt evaluated as a non-causal prefix before generation."""

def __call__(
self,
*,
llama: llama.Llama,
messages: List[llama_types.ChatCompletionRequestMessage],
functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
tools: Optional[List[llama_types.ChatCompletionTool]] = None,
tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
temperature: float = 0.2,
top_p: float = 0.95,
top_k: int = 40,
min_p: float = 0.05,
typical_p: float = 1.0,
stream: bool = False,
stop: Optional[Union[str, List[str]]] = [],
seed: Optional[int] = None,
response_format: Optional[
llama_types.ChatCompletionRequestResponseFormat
] = None,
max_tokens: Optional[int] = None,
presence_penalty: float = 0.0,
frequency_penalty: float = 0.0,
repeat_penalty: float = 1.1,
tfs_z: float = 1.0,
mirostat_mode: int = 0,
mirostat_tau: float = 5.0,
mirostat_eta: float = 0.1,
model: Optional[str] = None,
logits_processor: Optional[llama.LogitsProcessorList] = None,
grammar: Optional[llama.LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
Iterator[llama_types.CreateChatCompletionStreamResponse],
]:
assert self.clip_ctx is not None

if len(messages) != 1:
raise ValueError("PaligemmaChatHandler only supports single-turn conversations.")

image_urls = self.get_image_urls(messages)

if len(image_urls) > 1:
raise ValueError("PaligemmaChatHandler only supports single image per turn.")

text = "<s>answer en "
message = messages[0]
        if isinstance(message["content"], str):
            text += message["content"]
elif isinstance(message["content"], list):
for content in message["content"]:
if content["type"] == "text":
text += content["text"]
text += "\n"

        if self.verbose:
            print(text, file=sys.stderr)

tokens = llama.tokenize(text.encode("utf-8"), special=True)
embedding_dim = llama_cpp.llama_n_embd(llama.model)
tokens_np = np.array(tokens).astype(np.int32)
token_embedding = np.empty((len(tokens), embedding_dim), dtype=np.single)
llama_cpp.llama_token_inp_embd(
llama.ctx,
tokens_np.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
len(tokens),
token_embedding.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
)

if len(image_urls) > 0:
image_embedding = self._embed_image_bytes(self.load_image(image_urls[0]))
n_image_pos = image_embedding.contents.n_image_pos
            embeds = np.concatenate(
                [
                    np.ctypeslib.as_array(
                        image_embedding.contents.embed,
                        shape=(n_image_pos, embedding_dim),
                    ),
                    token_embedding,
                ],
                axis=0,
            )
            n_tokens = n_image_pos + len(tokens)
            llama.input_ids[:n_tokens] = (
                llama.tokenize(b"<image>", add_bos=False, special=True) * n_image_pos
                + tokens
            )
        else:
            n_tokens = len(tokens)
            llama.input_ids[:n_tokens] = tokens
            embeds = token_embedding

n_batch = 512
batch = internals.LlamaBatch(n_tokens=n_batch, embd=embedding_dim, n_seq_max=1)

batch.batch.n_tokens = n_tokens

np.ctypeslib.as_array(batch.batch.embd, shape=(n_batch, embedding_dim))[
:n_tokens, :
] = embeds
np.ctypeslib.as_array(batch.batch.pos, shape=(n_batch,))[:n_tokens] = np.arange(n_tokens)
np.ctypeslib.as_array(batch.batch.n_seq_id, shape=(n_batch,))[:] = 1
np.ctypeslib.as_array(batch.batch.logits, shape=(n_batch,))[:] = False
np.ctypeslib.as_array(batch.batch.logits, shape=(n_batch,))[n_tokens - 1] = True

for i in range(n_tokens):
batch.batch.seq_id[i][0] = 0

        # Evaluate the prompt. PaliGemma applies full (non-causal) attention over the
        # image + text prefix, so causal attention is disabled while decoding the prefix
        # and re-enabled afterwards for generation.
        llama.reset()
        llama._ctx.kv_cache_clear()
        llama_cpp.llama_set_causal_attn(llama._ctx.ctx, False)
llama._ctx.decode(batch)
llama.n_tokens += n_tokens
llama_cpp.llama_set_causal_attn(llama._ctx.ctx, True)

# Get prompt tokens to avoid a cache miss
prompt = llama.input_ids[: llama.n_tokens].tolist()

if response_format is not None and response_format["type"] == "json_object":
grammar = _grammar_for_response_format(response_format)

# Convert legacy functions to tools
if functions is not None:
tools = [
{
"type": "function",
"function": function,
}
for function in functions
]

# Convert legacy function_call to tool_choice
if function_call is not None:
if isinstance(function_call, str) and (
function_call == "none" or function_call == "auto"
):
tool_choice = function_call
if isinstance(function_call, dict) and "name" in function_call:
tool_choice = {
"type": "function",
"function": {
"name": function_call["name"],
},
}

tool = None
if (
tool_choice is not None
and isinstance(tool_choice, dict)
and tools is not None
):
name = tool_choice["function"]["name"]
tool = next((t for t in tools if t["function"]["name"] == name), None)
if tool is None:
raise ValueError(f"Tool choice '{name}' not found in tools.")
schema = tool["function"]["parameters"]
try:
# create grammar from json schema
grammar = llama_grammar.LlamaGrammar.from_json_schema(
json.dumps(schema), verbose=llama.verbose
)
except Exception as e:
if llama.verbose:
print(str(e), file=sys.stderr)
grammar = llama_grammar.LlamaGrammar.from_string(
llama_grammar.JSON_GBNF, verbose=llama.verbose
)

completion_or_chunks = llama.create_completion(
prompt=prompt,
temperature=temperature,
top_p=top_p,
top_k=top_k,
min_p=min_p,
typical_p=typical_p,
logprobs=top_logprobs if logprobs else None,
stream=stream,
stop=stop,
seed=seed,
max_tokens=max_tokens,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
repeat_penalty=repeat_penalty,
tfs_z=tfs_z,
mirostat_mode=mirostat_mode,
mirostat_tau=mirostat_tau,
mirostat_eta=mirostat_eta,
model=model,
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
)
if tool is not None:
tool_name = tool["function"]["name"]
return _convert_completion_to_chat_function(
tool_name, completion_or_chunks, stream
)
return _convert_completion_to_chat(completion_or_chunks, stream=stream)

@register_chat_completion_handler("chatml-function-calling")
def chatml_function_calling(
llama: llama.Llama,
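
For reference, using the new handler through the high-level API would presumably mirror the existing Llava15ChatHandler pattern; a hypothetical sketch (model and projector filenames are placeholders):

from llama_cpp import Llama
from llama_cpp.llama_chat_format import PaliGemmaChatHandler

chat_handler = PaliGemmaChatHandler(clip_model_path="mmproj-paligemma.gguf")  # placeholder path
llm = Llama(
    model_path="paligemma-3b-mix-224-q4_k_m.gguf",  # placeholder path
    chat_handler=chat_handler,
    n_ctx=2048,
)
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "caption en"},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])
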
41 changes: 41 additions & 0 deletions llama_cpp/llama_cpp.py
@@ -2836,6 +2836,47 @@ def llama_detokenize(
...


# // @details Get the input embeddings for a sequence of tokens
# // @param tokens The tokens to embed
# // @param n_tokens The number of tokens
# // @param embeddings The embeddings pointer must be large enough to hold the resulting embeddings.
# // @param n_embd The number of embeddings per token
# // @return Returns a negative number on failure
# LLAMA_API int32_t llama_token_inp_embd(
# struct llama_context * ctx,
# llama_token * tokens,
# int32_t n_tokens,
# float * embeddings);
@ctypes_function(
"llama_token_inp_embd",
[
llama_context_p_ctypes,
llama_token_p,
ctypes.c_int32,
ctypes.POINTER(ctypes.c_float),
],
ctypes.c_int32,
)
def llama_token_inp_embd(
ctx: llama_context_p,
tokens: CtypesArray[llama_token],
n_tokens: Union[ctypes.c_int32, int],
embeddings: CtypesArray[ctypes.c_float],
/,
) -> int:
"""Get the input embeddings for a sequence of tokens

Args:
ctx: The model context.
tokens: The tokens to embed.
n_tokens: The number of tokens.
        embeddings: Output buffer for the embeddings; must be large enough to hold
            n_tokens * n_embd floats.

    Returns:
        A negative number on failure."""
...


# //
# // Chat templates
# //
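
A minimal sketch of calling the new binding directly (it only resolves against the llama.cpp revision vendored by this PR; the helper below is illustrative, not part of the diff):

import ctypes
import numpy as np
import llama_cpp

def token_input_embeddings(ctx, model, tokens):
    """Return the input embeddings for `tokens` as an (n_tokens, n_embd) float32 array."""
    n_embd = llama_cpp.llama_n_embd(model)
    tokens_np = np.asarray(tokens, dtype=np.int32)
    out = np.empty((len(tokens), n_embd), dtype=np.float32)
    ret = llama_cpp.llama_token_inp_embd(
        ctx,
        tokens_np.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        len(tokens),
        out.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    )
    if ret < 0:
        raise RuntimeError("llama_token_inp_embd failed")
    return out
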
14 changes: 14 additions & 0 deletions llama_cpp/server/model.py
@@ -171,6 +171,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
chat_handler = llama_cpp.llama_chat_format.MiniCPMv26ChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "paligemma":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.PaliGemmaChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.PaliGemmaChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "hf-autotokenizer":
assert (
settings.hf_pretrained_model_name_or_path is not None
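
Serving a PaliGemma model would then presumably follow the existing multimodal setup; a hypothetical sketch built on the settings objects (filenames are placeholders):

from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

settings = ModelSettings(
    model="paligemma-3b-mix-224-q4_k_m.gguf",  # placeholder path
    chat_format="paligemma",
    clip_model_path="mmproj-paligemma.gguf",   # placeholder path
)
llm = load_llama_from_model_settings(settings)
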
2 changes: 1 addition & 1 deletion vendor/llama.cpp