From cdb722cd058d294fcf70d661b1c0d223955f29c3 Mon Sep 17 00:00:00 2001
From: Prince Canuma <prince.gdt@gmail.com>
Date: Tue, 31 Dec 2024 17:51:22 +0100
Subject: [PATCH 1/2] fix pixtral language-only path to return token embeddings

---
 mlx_vlm/models/pixtral/pixtral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlx_vlm/models/pixtral/pixtral.py b/mlx_vlm/models/pixtral/pixtral.py
index fb51a84..37dbc3c 100644
--- a/mlx_vlm/models/pixtral/pixtral.py
+++ b/mlx_vlm/models/pixtral/pixtral.py
@@ -70,7 +70,7 @@ def get_input_embeddings(
         pixel_values: Optional[mx.array] = None,
     ):
         if pixel_values is None:
-            return self.language_model(input_ids)
+            return self.language_model.model.embed_tokens(input_ids)
 
         # Get the input embeddings from the language model
         inputs_embeds = self.language_model.model.embed_tokens(input_ids)

From 575bb39fee1510d1771c00242c521e17587d6cb9 Mon Sep 17 00:00:00 2001
From: Prince Canuma <prince.gdt@gmail.com>
Date: Tue, 31 Dec 2024 17:51:43 +0100
Subject: [PATCH 2/2] fix tokenizer lookup from processor for DS-VL2

---
 mlx_vlm/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlx_vlm/utils.py b/mlx_vlm/utils.py
index 218ffef..ffc71e4 100644
--- a/mlx_vlm/utils.py
+++ b/mlx_vlm/utils.py
@@ -1000,7 +1000,7 @@ def stream_generate(
     Yields:
         Generator[Tuple[mx.array, mx.array]]: A generator producing text.
     """
-    tokenizer = processor if hasattr(processor, "encode") else processor.tokenizer
+    tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
     prompt_tokens = mx.array(tokenizer.encode(prompt))
 
     resize_shape = kwargs.pop("resize_shape", None)