fix: critical fixes for recurrent/hybrid model support

Ralf Waldukat · Ralf Waldukat · commit f42739945a70 · 2026-01-14T08:35:09.000+07:00
After external code review (GPT-5.2), fixed 4 critical issues: 1. CRITICAL: Fixed tokens[:-1] bug in prefix matching - Was silently breaking prefix matching for ALL models - Caused false rewind detection and cache inefficiency - Impact: Transformers AND recurrent models 2. CRITICAL: Implement proper reset() for recurrent models - Now actually clears llama_memory backend state - Root cause fix for 'sequence positions not consecutive' crash - Without this, reset was a no-op for recurrent models 3. CRITICAL: Enforce strict append policy for recurrent models - Prevents KV cache rewinding that's impossible without state snapshots - Forces full reset on history edits instead of crashing 4. Performance: Cache _is_recurrent to avoid repeated FFI calls 5. Documentation: Simplified comments and updated docstring 6. Testing: All existing tests pass + Mistral-Small-3.2-24B validated Resolves multi-turn crashes for Nemotron-A3B, Mamba, RWKV, Jamba models. Reviewed-by: GPT-5.2 (OpenAI) Tested-by: pytest + Mistral-Small-3.2-24B Fixes: #2108 (recurrent model crashes) Compatible-with: #2109 (Granite-Docling/SmolVLM special tokens)
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -190,6 +190,11 @@ def __init__(
             type_v: KV cache data type for V (default: f16)
             spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
+        Note:
+            Recurrent and hybrid models (Mamba, RWKV, Nemotron-A3B, Jamba) cannot
+            rewind their state and require full reset on history edits. This is handled
+            automatically to maintain compatibility. Standard transformers are unaffected.
+
         Raises:
             ValueError: If the model path does not exist.
 
@@ -555,6 +560,11 @@ def free_lora_adapter():
 
         self._sampler = None
 
+        # Cache recurrent/hybrid model detection to avoid repeated FFI calls
+        self._is_recurrent_model = llama_cpp.llama_model_is_recurrent(
+            self._model.model
+        ) or llama_cpp.llama_model_is_hybrid(self._model.model)
+
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
         return self._ctx.ctx
@@ -582,6 +592,19 @@ def eval_logits(self) -> Deque[List[float]]:
             maxlen=self._n_ctx if self._logits_all else 1,
         )
 
+    @property
+    def _is_recurrent(self) -> bool:
+        """Check if model is recurrent (SSM) or hybrid (SSM+Attention).
+
+        These models (Mamba, RWKV, Nemotron, Jamba, etc.) cannot rewind their
+        recurrent state without snapshots. Only strict forward progression or
+        full reset is allowed.
+
+        Returns:
+            True if model has recurrent state that cannot be rewound.
+        """
+        return self._is_recurrent_model
+
     def tokenize(
         self, text: bytes, add_bos: bool = True, special: bool = False
     ) -> List[int]:
@@ -640,6 +663,11 @@ def reset(self):
         """Reset the model state."""
         self.n_tokens = 0
 
+        if self._is_recurrent:
+            mem = llama_cpp.llama_get_memory(self._ctx.ctx)
+            if mem is not None:
+                llama_cpp.llama_memory_clear(mem, True)
+
     def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
 
@@ -891,19 +919,29 @@ def generate(
         # Check for kv cache prefix match
         if reset and self.n_tokens > 0:
             longest_prefix = 0
-            for a, b in zip(self._input_ids, tokens[:-1]):
+            for a, b in zip(self._input_ids, tokens):
                 if a == b:
                     longest_prefix += 1
                 else:
                     break
+
+            # Recurrent models cannot rewind state; reset if needed
+            if self._is_recurrent and longest_prefix < self.n_tokens:
+                longest_prefix = 0
+                reset = True
+                if self.verbose:
+                    print(
+                        "Llama.generate: recurrent model requires full state reset",
+                        file=sys.stderr,
+                    )
+
             if longest_prefix > 0:
                 reset = False
                 tokens = tokens[longest_prefix:]
                 self.n_tokens = longest_prefix
                 if self.verbose:
                     print(
-                        f"Llama.generate: {longest_prefix} prefix-match hit, "
-                        f"remaining {len(tokens)} prompt tokens to eval",
+                        f"Llama.generate: {longest_prefix} prefix-match hit, {len(tokens)} tokens to eval",
                         file=sys.stderr,
                     )