Skip to content

Commit 9c2f561

Browse files
committed
fix thread safety issues
- support for ml-explore/mlx-swift-examples#454 - ModelContainer appeared to provide thread-safe access to the KVCache and model, but in fact it did not -- async token generation could use the KVCache concurrently - if you were to break the async stream early, the previous call could still be running. Also ran swift-format.
1 parent a1addb4 commit 9c2f561

File tree

7 files changed

+539
-272
lines changed

7 files changed

+539
-272
lines changed

Libraries/MLXLMCommon/ChatSession.swift

Lines changed: 139 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,15 @@ import MLX
2020
/// model operations.
2121
public final class ChatSession {
2222

23-
private enum Model {
24-
case container(ModelContainer)
25-
case context(ModelContext)
23+
enum Cache {
24+
case empty
25+
case kvcache([KVCache])
26+
case history([Chat.Message])
2627
}
2728

28-
private let model: Model
29-
private var messages: [Chat.Message]
30-
private var cache: [KVCache]
29+
private let model: ModelContainer
30+
private let instructions: String?
31+
private let cache: SerialAccessContainer<Cache>
3132
private let processing: UserInput.Processing
3233
private let generateParameters: GenerateParameters
3334
private let additionalContext: [String: any Sendable]?
@@ -47,9 +48,9 @@ public final class ChatSession {
4748
processing: UserInput.Processing = .init(resize: CGSize(width: 512, height: 512)),
4849
additionalContext: [String: any Sendable]? = nil
4950
) {
50-
self.model = .container(model)
51-
self.messages = instructions.map { [.system($0)] } ?? []
52-
self.cache = []
51+
self.model = model
52+
self.instructions = instructions
53+
self.cache = .init(.empty)
5354
self.processing = processing
5455
self.generateParameters = generateParameters
5556
self.additionalContext = additionalContext
@@ -70,9 +71,9 @@ public final class ChatSession {
7071
processing: UserInput.Processing = .init(resize: CGSize(width: 512, height: 512)),
7172
additionalContext: [String: any Sendable]? = nil
7273
) {
73-
self.model = .context(model)
74-
self.messages = instructions.map { [.system($0)] } ?? []
75-
self.cache = []
74+
self.model = ModelContainer(context: model)
75+
self.instructions = instructions
76+
self.cache = .init(.empty)
7677
self.processing = processing
7778
self.generateParameters = generateParameters
7879
self.additionalContext = additionalContext
@@ -88,21 +89,20 @@ public final class ChatSession {
8889
/// - generateParameters: parameters that control generation
8990
/// - processing: media processing configuration for images/videos
9091
/// - additionalContext: optional model-specific context
91-
public convenience init(
92+
public init(
9293
_ model: ModelContainer,
93-
history: [Chat.Message],
94+
instructions: String? = nil,
95+
history: consuming [Chat.Message],
9496
generateParameters: GenerateParameters = .init(),
9597
processing: UserInput.Processing = .init(resize: CGSize(width: 512, height: 512)),
9698
additionalContext: [String: any Sendable]? = nil
9799
) {
98-
self.init(
99-
model,
100-
instructions: nil,
101-
generateParameters: generateParameters,
102-
processing: processing,
103-
additionalContext: additionalContext
104-
)
105-
self.messages = history
100+
self.model = model
101+
self.instructions = instructions
102+
self.cache = .init(.history(history))
103+
self.processing = processing
104+
self.generateParameters = generateParameters
105+
self.additionalContext = additionalContext
106106
}
107107

108108
/// Initialize the `ChatSession` with an existing message history.
@@ -115,21 +115,20 @@ public final class ChatSession {
115115
/// - generateParameters: parameters that control generation
116116
/// - processing: media processing configuration for images/videos
117117
/// - additionalContext: optional model-specific context
118-
public convenience init(
118+
public init(
119119
_ model: ModelContext,
120+
instructions: String? = nil,
120121
history: [Chat.Message],
121122
generateParameters: GenerateParameters = .init(),
122123
processing: UserInput.Processing = .init(resize: CGSize(width: 512, height: 512)),
123124
additionalContext: [String: any Sendable]? = nil
124125
) {
125-
self.init(
126-
model,
127-
instructions: nil,
128-
generateParameters: generateParameters,
129-
processing: processing,
130-
additionalContext: additionalContext
131-
)
132-
self.messages = history
126+
self.model = ModelContainer(context: model)
127+
self.instructions = instructions
128+
self.cache = .init(.history(history))
129+
self.processing = processing
130+
self.generateParameters = generateParameters
131+
self.additionalContext = additionalContext
133132
}
134133

135134
/// Produces a response to a prompt.
@@ -141,45 +140,13 @@ public final class ChatSession {
141140
/// - Returns: the model's response
142141
public func respond(
143142
to prompt: String,
144-
images: [UserInput.Image],
145-
videos: [UserInput.Video]
143+
images: consuming [UserInput.Image],
144+
videos: consuming [UserInput.Video]
146145
) async throws -> String {
147-
messages.append(.user(prompt, images: images, videos: videos))
148-
149-
func generate(context: ModelContext) async throws -> String {
150-
let userInput = UserInput(
151-
chat: messages, processing: processing, additionalContext: additionalContext)
152-
let input = try await context.processor.prepare(input: userInput)
153-
154-
if cache.isEmpty {
155-
cache = context.model.newCache(parameters: generateParameters)
156-
}
157-
158-
var output = ""
159-
for await generation in try MLXLMCommon.generate(
160-
input: input, cache: cache, parameters: generateParameters, context: context
161-
) {
162-
if let chunk = generation.chunk {
163-
output += chunk
164-
}
165-
}
166-
167-
Stream().synchronize()
168-
169-
return output
170-
}
171-
172-
let output: String
173-
switch model {
174-
case .container(let container):
175-
output = try await container.perform { context in
176-
try await generate(context: context)
177-
}
178-
case .context(let context):
179-
output = try await generate(context: context)
146+
var output = ""
147+
for try await chunk in streamResponse(to: prompt, images: images, videos: videos) {
148+
output += chunk
180149
}
181-
182-
messages.append(.assistant(output))
183150
return output
184151
}
185152

@@ -211,16 +178,105 @@ public final class ChatSession {
211178
/// - Returns: a stream of string chunks from the model
212179
public func streamResponse(
213180
to prompt: String,
214-
images: [UserInput.Image],
215-
videos: [UserInput.Video]
181+
images: consuming [UserInput.Image],
182+
videos: consuming [UserInput.Video]
216183
) -> AsyncThrowingStream<String, Error> {
217-
messages.append(.user(prompt, images: images, videos: videos))
218-
219184
let (stream, continuation) = AsyncThrowingStream<String, Error>.makeStream()
220185

186+
// images and videos are not Sendable (MLXArray) but they are consumed
187+
// and are only being sent to the inner async
188+
let message = SendableBox<Chat.Message>(
189+
.user(prompt, images: images, videos: videos)
190+
)
191+
221192
let task = Task {
193+
[
194+
model,
195+
instructions, processing, additionalContext, cache, generateParameters
196+
] in
222197
do {
223-
try await self.performStreaming(continuation: continuation)
198+
try await cache.update { cache in
199+
200+
// these are all Sendable
201+
let processor = await model.processor
202+
let tokenizer = await model.tokenizer
203+
let modelConfiguration = await model.configuration
204+
205+
var messages: [Chat.Message] = []
206+
if let instructions {
207+
messages.append(.system(instructions))
208+
}
209+
210+
// prepare the cache, if needed. note:
211+
// this is using the LanguageModel (not Sendable) outside
212+
// the protective lock. Assuming the weights are not
213+
// being mutated behind the scenes, this will obey the MLXArray
214+
// contract that they be evaluated if used across threads.
215+
// This is internal to the implementation and this technique
216+
// should not be used in calling code.
217+
//
218+
// The benefit is that callers can be running multiple
219+
// ChatSessions in parallel, as long as the instances
220+
// are distinct. In particular the KVCache cannot
221+
// be shared and that is the lock that is held here.
222+
223+
let model = await model.perform { context in
224+
SendableBox(context.model)
225+
}.consume()
226+
227+
var kvCache: [KVCache]
228+
switch cache {
229+
case .empty:
230+
kvCache = model.newCache(parameters: generateParameters)
231+
cache = .kvcache(kvCache)
232+
233+
case .kvcache(let array):
234+
kvCache = array
235+
236+
case .history(let history):
237+
// the KVCache is represented by a chat history
238+
kvCache = model.newCache(parameters: generateParameters)
239+
cache = .kvcache(kvCache)
240+
messages.append(contentsOf: history)
241+
}
242+
243+
// prepare the input
244+
messages.append(message.consume())
245+
246+
let userInput = UserInput(
247+
chat: messages, processing: processing,
248+
additionalContext: additionalContext)
249+
let input = try await processor.prepare(input: userInput)
250+
251+
// generate output
252+
let iterator = try TokenIterator(
253+
input: input, model: model, cache: kvCache,
254+
parameters: generateParameters)
255+
256+
let (stream, task) = MLXLMCommon.generateTask(
257+
promptTokenCount: input.text.tokens.size,
258+
modelConfiguration: modelConfiguration,
259+
tokenizer: tokenizer,
260+
iterator: iterator
261+
)
262+
263+
var fullResponse = ""
264+
for await item in stream {
265+
if let chunk = item.chunk {
266+
fullResponse += chunk
267+
if case .terminated = continuation.yield(chunk) {
268+
break
269+
}
270+
}
271+
}
272+
273+
// wait for the task to complete -- this is important in
274+
// the case where we broke the loop early as the generation
275+
// work may continue (briefly) and use the KVCache
276+
await task.value
277+
278+
continuation.finish()
279+
}
224280
} catch {
225281
continuation.finish(throwing: error)
226282
}
@@ -253,48 +309,17 @@ public final class ChatSession {
253309
}
254310

255311
/// Clear the session history and cache, preserving system instructions.
256-
public func clear() {
257-
messages = messages.filter { $0.role == .system }
258-
cache = []
259-
}
260-
261-
// MARK: - Private
262-
263-
private func performStreaming(
264-
continuation: AsyncThrowingStream<String, Error>.Continuation
265-
) async throws {
266-
func stream(context: ModelContext) async throws {
267-
let userInput = UserInput(
268-
chat: messages, processing: processing, additionalContext: additionalContext)
269-
let input = try await context.processor.prepare(input: userInput)
270-
271-
if cache.isEmpty {
272-
cache = context.model.newCache(parameters: generateParameters)
273-
}
274-
275-
var fullResponse = ""
276-
for await item in try MLXLMCommon.generate(
277-
input: input, cache: cache, parameters: generateParameters, context: context
278-
) {
279-
if let chunk = item.chunk {
280-
fullResponse += chunk
281-
continuation.yield(chunk)
282-
}
283-
}
284-
285-
Stream().synchronize()
286-
287-
messages.append(.assistant(fullResponse))
288-
continuation.finish()
312+
public func clear() async {
313+
await cache.update { cache in
314+
cache = .empty
289315
}
316+
}
290317

291-
switch model {
292-
case .container(let container):
293-
try await container.perform { context in
294-
try await stream(context: context)
295-
}
296-
case .context(let context):
297-
try await stream(context: context)
298-
}
318+
/// Wait for exclusive access to the KVCache.
319+
///
320+
/// This is useful for cases where a program is terminating and wants to ensure that any
321+
/// async operations are complete.
322+
public func synchronize() async {
323+
await cache.read { _ in }
299324
}
300325
}

0 commit comments

Comments
 (0)