From cdb722cd058d294fcf70d661b1c0d223955f29c3 Mon Sep 17 00:00:00 2001
From: Prince Canuma
Date: Tue, 31 Dec 2024 17:51:22 +0100
Subject: [PATCH 1/2] fix pixtral language only

---
 mlx_vlm/models/pixtral/pixtral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlx_vlm/models/pixtral/pixtral.py b/mlx_vlm/models/pixtral/pixtral.py
index fb51a84..37dbc3c 100644
--- a/mlx_vlm/models/pixtral/pixtral.py
+++ b/mlx_vlm/models/pixtral/pixtral.py
@@ -70,7 +70,7 @@ def get_input_embeddings(
         pixel_values: Optional[mx.array] = None,
     ):
         if pixel_values is None:
-            return self.language_model(input_ids)
+            return self.language_model.model.embed_tokens(input_ids)
 
         # Get the input embeddings from the language model
         inputs_embeds = self.language_model.model.embed_tokens(input_ids)

From 575bb39fee1510d1771c00242c521e17587d6cb9 Mon Sep 17 00:00:00 2001
From: Prince Canuma
Date: Tue, 31 Dec 2024 17:51:43 +0100
Subject: [PATCH 2/2] fix tokenizer processor for DS-VL2

---
 mlx_vlm/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlx_vlm/utils.py b/mlx_vlm/utils.py
index 218ffef..ffc71e4 100644
--- a/mlx_vlm/utils.py
+++ b/mlx_vlm/utils.py
@@ -1000,7 +1000,7 @@ def stream_generate(
     Yields:
         Generator[Tuple[mx.array, mx.array]]: A generator producing text.
     """
-    tokenizer = processor if hasattr(processor, "encode") else processor.tokenizer
+    tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
     prompt_tokens = mx.array(tokenizer.encode(prompt))
 
     resize_shape = kwargs.pop("resize_shape", None)