From f038a82c33003d77e817b7df347af6b9057f6233 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:32:41 +0800 Subject: [PATCH 01/33] cache tokens in Python side to reduce pybind reading overhead Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 13 +++++++++++++ tensorrt_llm/_torch/pyexecutor/model_engine.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 63ac568f4d..91e80d07bd 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -246,6 +246,8 @@ def __init__( self.is_cuda_graph_dummy = False self.py_lora_task_layer_module_configs = None + self.py_tokens = super().get_tokens() + self.py_return_log_probs = return_log_probs self.py_return_context_logits = return_context_logits self.py_return_generation_logits = return_generation_logits @@ -260,6 +262,17 @@ def __init__( return_log_probs, return_context_logits, return_generation_logits) + def get_tokens(self, beam: int): + return self.py_tokens[beam] + + def get_last_tokens(self, beam: int): + return self.py_tokens[beam][-1] + + def add_new_token(self, token: int, beam: int): + self.py_tokens[beam].append(token) + # sync to C++ side + super().add_new_token(token, beam) + def create_response( self, use_fast_logits=False, diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 49bf6194b2..db9b421d7a 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -1120,7 +1120,7 @@ def _prepare_tp_inputs( gather_ids.append(len(input_ids) - 1) sequence_lengths.append(len(prompt_tokens)) prompt_lengths.append(len(prompt_tokens)) - past_seen_token_num = request.context_current_position + past_seen_token_num = begin_compute num_cached_tokens_per_seq.append(past_seen_token_num) multimodal_embedding = request.multimodal_embedding if multimodal_embedding is not None: From bba671970cdd2b5341a8d6842f59978976bc32a5 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:36:41 +0800 Subject: [PATCH 02/33] refine Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 91e80d07bd..0c29be89f3 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -262,16 +262,16 @@ def __init__( return_log_probs, return_context_logits, return_generation_logits) - def get_tokens(self, beam: int): + def get_tokens(self, beam: int) -> int: return self.py_tokens[beam] - def get_last_tokens(self, beam: int): + def get_last_tokens(self, beam: int) -> int: return self.py_tokens[beam][-1] - def add_new_token(self, token: int, beam: int): + def add_new_token(self, token: int, beam: int) -> int: self.py_tokens[beam].append(token) # sync to C++ side - super().add_new_token(token, beam) + return super().add_new_token(token, beam) def create_response( self, From 3c836448fb422479406fe8f1856549730bde9e40 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 18:57:54 +0800 Subject: [PATCH 03/33] pure 
Python LlmResponse Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 2 + cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 15 ++++--- .../pybind/batch_manager/bindings.cpp | 2 + tensorrt_llm/_torch/pyexecutor/llm_request.py | 43 ++++++++++--------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 24 ++++++----- tensorrt_llm/executor/result.py | 12 +++--- tensorrt_llm/executor/worker.py | 7 ++- 7 files changed, 57 insertions(+), 48 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 086dc2bf4a..dca20816db 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2328,6 +2328,8 @@ class LlmRequest : public GenericLlmRequest /// @return An optional Response std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0); + executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 6fc7051ad7..a722587b79 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -39,8 +39,16 @@ runtime::SizeType32 GenericLlmRequest::getBeamWidthByIter(bool template class GenericLlmRequest; -/// Note that there is some dependency on the order of operations in this method. Modify with care! std::optional LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank) +{ + auto requestId = isChild() ? mParentRequestId : mRequestId; + auto response = executor::Response(requestId, std::move(createResult(useFastLogits, mpiWorldRank)), mClientId); + + return response; +} + +/// Note that there is some dependency on the order of operations in this method. Modify with care! +executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) @@ -192,11 +200,6 @@ std::optional LlmRequest::createResponse(bool useFastLogits, // Update position of last sent response setMaxSentTokenLen(maxNbTokens); - - auto requestId = isChild() ? 
mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(result), mClientId); - - return response; } void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 35f32a3b12..a3399e1833 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -360,6 +360,8 @@ void initBindings(pybind11::module_& m) py::arg("enable_kv_cache_reuse") = false) .def("create_response", &tb::LlmRequest::createResponse, py::arg("use_fast_logits") = false, py::arg("mpi_world_rank") = 0) + .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false, + py::arg("mpi_world_rank") = 0) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")); diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index c89ad56ec6..ef9bdbde63 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -219,25 +219,18 @@ def __getattr__(self, item): class LlmResponse: """LlmResponse wraps `bindings.executor.Response` but detour some features to Python implementation""" - def __init__(self, response: tensorrt_llm.bindings.executor.Response, - py_result: PyResult): - self._response = response - self._py_result = py_result - - def __getstate__(self): - return self._response, self._py_result - - def __setstate__(self, state): - self._response, self._py_result = state - - @property - def result(self) -> tensorrt_llm.bindings.executor.Result: - return LlmResult( - self._response.result, - self._py_result) # LlmResult masquerades bindings.executor.Result + def __init__(self, + request_id: int, + error: str = None, + result: LlmResult = None, + client_id: int = None): + self.request_id = request_id + self.error = error + self.result = result + self.client_id = client_id - def __getattr__(self, item): - return getattr(self._response, item) + def has_error(self): + return self.error is not None class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): @@ -269,6 +262,7 @@ def __init__( **kwargs) self.py_client_id = client_id self.py_request_id = self.request_id + self.py_llm_request_type = self.llm_request_type self.py_end_id = self.end_id self.py_prompt_len = self.prompt_len self.py_orig_prompt_len = self.orig_prompt_len @@ -299,6 +293,9 @@ def __init__( return_generation_logits, exclude_last_generation_logits) + def is_generation_only_request(self): + return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY + def get_tokens(self, beam: int) -> int: return self.py_tokens[beam] @@ -314,9 +311,13 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: - response = super().create_response(use_fast_logits, mpi_world_rank) - return LlmResponse(response, - self.py_result) if response is not None else None + return LlmResponse( + request_id=self.py_request_id, + result=LlmResult( + super().create_result(use_fast_logits, mpi_world_rank), + self.py_result), + client_id=self.py_client_id, + ) @property def is_dummy(self): diff --git 
a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 54ccc55650..37ae2da196 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -30,8 +30,8 @@ from ..distributed import Distributed from .kv_cache_transceiver import KvCacheTransceiver -from .llm_request import (ExecutorRequest, ExecutorResponse, LlmRequest, - LlmRequestState, executor_request_to_llm_request) +from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState, + LlmResponse, executor_request_to_llm_request) from .model_engine import ModelEngine from .sampler import Sampler, SampleState, SampleStateTensors, TorchSampler from .scheduler import ScheduledRequests @@ -323,14 +323,14 @@ def await_responses( self, id: Optional[Union[List[int], int]] = None, timeout: Optional[datetime.timedelta] = None, - ) -> Union[List[List[ExecutorResponse]], List[ExecutorResponse]]: + ) -> Union[List[List[LlmResponse]], List[LlmResponse]]: """ Await for ready responses Args: id (Optional[Union[List[int], int]]): Request id timeout (Optional[datetime.timedelta]): The maximum time to wait for new responses Returns: - Union[List[tensorrt_llm.bindings.executor.Response], List[List[tensorrt_llm.bindings.executor.Response]]]: Responses + Union[List[LlmResponse], List[List[LlmResponse]]]: Responses """ timeout = timeout.total_seconds() if timeout is not None else None if id is None: @@ -1934,8 +1934,10 @@ def _handle_errors(self, error_msg: Optional[str] = None): req_id = request.py_request_id request.state = LlmRequestState.GENERATION_COMPLETE self._terminate_request(request) - error_responses[req_id] = ExecutorResponse( - req_id, error_msg, client_id=request.py_client_id) + error_responses[req_id] = LlmResponse( + request_id=req_id, + error=error_msg, + client_id=request.py_client_id) self.active_requests.clear() self._enqueue_responses(error_responses) @@ -1979,7 +1981,7 @@ def _handle_cancelled_requests(self): self._enqueue_responses(cancelled_responses) @nvtx_range("_enqueue_responses") - def _enqueue_responses(self, responses: Dict[int, ExecutorResponse]): + def _enqueue_responses(self, responses: Dict[int, LlmResponse]): if 0 not in self.dist.mapping.tp_group and not self.gather_all_responses: return @@ -2036,7 +2038,7 @@ def _handle_responses(self): requests_to_terminate.append(request) continue - if request.is_generation_only_request: + if request.is_generation_only_request(): # If request is in transmission, so we don't need to emit a response # Also, for the first iteration with overlap, we should skip since first # token has already been emitted previously @@ -2048,7 +2050,7 @@ def _handle_responses(self): request.draft_tokens = request.py_draft_tokens request.decoding_iter = request.py_decoding_iter - response: Response = request.create_response(False, self.dist.rank) + response = request.create_response(False, self.dist.rank) request_done = False if response: request_done = response.result.is_final @@ -2075,7 +2077,7 @@ def _terminate_ctx_finished_requests(self): def _await_any_response(self, timeout: Optional[float] = None - ) -> List[ExecutorResponse]: + ) -> List[LlmResponse]: def any_responses_ready(): return len(self.responses) > 0 or self.is_shutdown @@ -2092,7 +2094,7 @@ def any_responses_ready(): def _await_single_response( self, id: int, - timeout: Optional[float] = None) -> List[ExecutorResponse]: + timeout: Optional[float] = None) -> List[LlmResponse]: with self.response_cv: def key_has_response(): diff --git 
a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 0f2e1581ca..acfdc00795 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -16,7 +16,7 @@ from ..llmapi.tracer import global_tracer from ..llmapi.utils import AsyncQueue from ..sampling_params import LogprobParams, SamplingParams -from .utils import ErrorResponse, has_event_loop, is_llm_response +from .utils import ErrorResponse, has_event_loop if TYPE_CHECKING: from .executor import GenerationExecutor @@ -282,7 +282,11 @@ def _handle_response(self, if self._background_error_handler is not None and ( handler := self._background_error_handler()): handler(response.error) - elif is_llm_response(response): + elif isinstance(response, ErrorResponse): + if self._background_error_handler is not None and ( + handler := self._background_error_handler()): + handler(response.error_msg) + elif hasattr(response, "request_id"): if response.has_error(): if self._background_error_handler is not None and ( handler := self._background_error_handler()): @@ -318,10 +322,6 @@ def _handle_response(self, if self._background_error_handler and ( handler := self._background_error_handler()): handler() - elif isinstance(response, ErrorResponse): - if self._background_error_handler is not None and ( - handler := self._background_error_handler()): - handler(response.error_msg) else: raise ValueError(f"Unknown response type: {response}") diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 5f057c6501..0a421e4668 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -39,7 +39,7 @@ from .result import (GenerationResult, IterationResult, LogProbsResult, ResponseWrapper, compute_logprobs) from .utils import (ErrorResponse, IntraProcessQueue, RequestError, - WorkerCommIpcAddrs, has_event_loop, is_llm_response) + WorkerCommIpcAddrs, has_event_loop) __all__ = [ "GenerationExecutorWorker", @@ -994,9 +994,8 @@ def _send_rsp( # Eliminate the finished GenerationRequest instances timely, which may # take considerable memory. 
- if is_llm_response(response): - if response.has_error() or response.result.is_final: - worker._pop_result(response.client_id) + if response.has_error() or response.result.is_final: + worker._pop_result(response.client_id) elif isinstance(response, ErrorResponse): worker._pop_result(response.client_id) else: From f0bb7c8234e89ee0b57ec241a0a88db7c1dd63ad Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:04:38 +0800 Subject: [PATCH 04/33] pure Python LlmResponse Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 6 +++--- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- tensorrt_llm/executor/proxy.py | 6 ++---- tensorrt_llm/executor/utils.py | 9 --------- 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index ef9bdbde63..522f39babf 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -221,16 +221,16 @@ class LlmResponse: def __init__(self, request_id: int, - error: str = None, + error_msg: str = None, result: LlmResult = None, client_id: int = None): self.request_id = request_id - self.error = error + self.error_msg = error_msg self.result = result self.client_id = client_id def has_error(self): - return self.error is not None + return self.error_msg is not None class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 37ae2da196..5b84cd7e37 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -1936,7 +1936,7 @@ def _handle_errors(self, error_msg: Optional[str] = None): self._terminate_request(request) error_responses[req_id] = LlmResponse( request_id=req_id, - error=error_msg, + error_msg=error_msg, client_id=request.py_client_id) self.active_requests.clear() self._enqueue_responses(error_responses) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 76cb2737c6..4e54be2168 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -24,8 +24,7 @@ from .request import CancellingRequest, GenerationRequest from .result import GenerationResult, IterationResult from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs, - create_mpi_comm_session, get_spawn_proxy_process_env, - is_llm_response) + create_mpi_comm_session, get_spawn_proxy_process_env) from .worker import GenerationExecutorWorker, worker_main __all__ = [ @@ -172,8 +171,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) - - if (is_llm_response(res) and res.result.is_final) or isinstance( + if (hasattr(res, "result") and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index bb6466373f..e7b9975a5d 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -8,7 +8,6 @@ from strenum import StrEnum from tensorrt_llm._utils import mpi_rank -from tensorrt_llm.bindings.executor import Response from tensorrt_llm.llmapi.utils import print_colored_debug from ..llmapi.mpi_session import (MpiCommSession, MpiPoolSession, MpiSession, @@ -141,11 +140,3 @@ class WorkerCommIpcAddrs(NamedTuple): result_queue_addr: tuple[str, 
Optional[bytes]] stats_queue_addr: tuple[str, Optional[bytes]] kv_cache_events_queue_addr: tuple[str, Optional[bytes]] - - -def is_llm_response(instance): - from tensorrt_llm._torch.pyexecutor.llm_request import \ - LlmResponse as PyLlmResponse - - from .result import ResponseWrapper - return isinstance(instance, (Response, PyLlmResponse, ResponseWrapper)) From 60ca761095701150ee7c842bfd0bafd655d28660 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:12:04 +0800 Subject: [PATCH 05/33] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 5 +++-- tensorrt_llm/executor/result.py | 12 ++++++------ tensorrt_llm/executor/utils.py | 4 ++++ tensorrt_llm/executor/worker.py | 7 ++++--- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 4e54be2168..8f16031a00 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -24,7 +24,8 @@ from .request import CancellingRequest, GenerationRequest from .result import GenerationResult, IterationResult from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs, - create_mpi_comm_session, get_spawn_proxy_process_env) + create_mpi_comm_session, get_spawn_proxy_process_env, + is_llm_response) from .worker import GenerationExecutorWorker, worker_main __all__ = [ @@ -171,7 +172,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) - if (hasattr(res, "result") and res.result.is_final) or isinstance( + if (is_llm_response(res) and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index acfdc00795..0f2e1581ca 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -16,7 +16,7 @@ from ..llmapi.tracer import global_tracer from ..llmapi.utils import AsyncQueue from ..sampling_params import LogprobParams, SamplingParams -from .utils import ErrorResponse, has_event_loop +from .utils import ErrorResponse, has_event_loop, is_llm_response if TYPE_CHECKING: from .executor import GenerationExecutor @@ -282,11 +282,7 @@ def _handle_response(self, if self._background_error_handler is not None and ( handler := self._background_error_handler()): handler(response.error) - elif isinstance(response, ErrorResponse): - if self._background_error_handler is not None and ( - handler := self._background_error_handler()): - handler(response.error_msg) - elif hasattr(response, "request_id"): + elif is_llm_response(response): if response.has_error(): if self._background_error_handler is not None and ( handler := self._background_error_handler()): @@ -322,6 +318,10 @@ def _handle_response(self, if self._background_error_handler and ( handler := self._background_error_handler()): handler() + elif isinstance(response, ErrorResponse): + if self._background_error_handler is not None and ( + handler := self._background_error_handler()): + handler(response.error_msg) else: raise ValueError(f"Unknown response type: {response}") diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index e7b9975a5d..fd4cd8444e 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -140,3 +140,7 @@ class WorkerCommIpcAddrs(NamedTuple): result_queue_addr: tuple[str, Optional[bytes]] stats_queue_addr: tuple[str, Optional[bytes]] kv_cache_events_queue_addr: tuple[str, 
Optional[bytes]] + + +def is_llm_response(instance): + return hasattr(instance, "result") diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 0a421e4668..5f057c6501 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -39,7 +39,7 @@ from .result import (GenerationResult, IterationResult, LogProbsResult, ResponseWrapper, compute_logprobs) from .utils import (ErrorResponse, IntraProcessQueue, RequestError, - WorkerCommIpcAddrs, has_event_loop) + WorkerCommIpcAddrs, has_event_loop, is_llm_response) __all__ = [ "GenerationExecutorWorker", @@ -994,8 +994,9 @@ def _send_rsp( # Eliminate the finished GenerationRequest instances timely, which may # take considerable memory. - if response.has_error() or response.result.is_final: - worker._pop_result(response.client_id) + if is_llm_response(response): + if response.has_error() or response.result.is_final: + worker._pop_result(response.client_id) elif isinstance(response, ErrorResponse): worker._pop_result(response.client_id) else: From 5f7e9ea233acdcd741c52d06ff6968600331a8fe Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:32:15 +0800 Subject: [PATCH 06/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index a722587b79..2cdb8a5bec 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -48,7 +48,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } /// Note that there is some dependency on the order of operations in this method. Modify with care! -executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) +executor::Result LlmRequest::createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From 1b3a7b7a3286956b18d188b5b1795d80d371b827 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:34:05 +0800 Subject: [PATCH 07/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 2cdb8a5bec..492b45f1d8 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -48,7 +48,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } /// Note that there is some dependency on the order of operations in this method. Modify with care! 
-executor::Result LlmRequest::createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) +executor::Result LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From df6007306b59862662494a2517924a1f5f17612d Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:39:35 +0800 Subject: [PATCH 08/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/include/tensorrt_llm/batch_manager/llmRequest.h | 2 +- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index dca20816db..ac5b11d882 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2328,7 +2328,7 @@ class LlmRequest : public GenericLlmRequest /// @return An optional Response std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0); - executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + std::optional createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 492b45f1d8..1c11688e9e 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -42,13 +42,16 @@ template class GenericLlmRequest; std::optional LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank) { auto requestId = isChild() ? mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(createResult(useFastLogits, mpiWorldRank)), mClientId); - - return response; + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + return executor::Response(requestId, std::move(result), mClientId); + } + return std::nullopt; } /// Note that there is some dependency on the order of operations in this method. Modify with care! 
-executor::Result LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) +std::optional LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From 91c904e85e3f50afb0f1ad1aabbea3fad1758f66 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:41:44 +0800 Subject: [PATCH 09/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 1c11688e9e..702f208375 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -45,7 +45,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) { - return executor::Response(requestId, std::move(result), mClientId); + return executor::Response(requestId, result, mClientId); } return std::nullopt; } From 5d62ceac9b53c522cb12d57c33ae21ab2c5bde6b Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:54:53 +0800 Subject: [PATCH 10/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 702f208375..1a55482b07 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -45,7 +45,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) { - return executor::Response(requestId, result, mClientId); + return executor::Response(requestId, result.value(), mClientId); } return std::nullopt; } From 50eb5b964acb634ebab93dc885b359b26ba07143 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:12:32 +0800 Subject: [PATCH 11/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 1a55482b07..cfa10eb405 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -203,6 +203,7 @@ std::optional LlmRequest::createResult(bool useFastLogits, int // Update position of last sent response setMaxSentTokenLen(maxNbTokens); + return result; } void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, From ea2f8cc9e4fd01cfa9a24538ccf38675a4648ac8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:20:45 +0800 Subject: [PATCH 12/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 5b84cd7e37..3470737f04 100644 --- 
a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response: + if response and response.result: request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 431926d21274e2a2be7483b00b35a51d0e204671 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:27:19 +0800 Subject: [PATCH 13/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 3 +++ tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 522f39babf..cecadd6e51 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -232,6 +232,9 @@ def __init__(self, def has_error(self): return self.error_msg is not None + def has_result(self): + return self.result._result + class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 3470737f04..900aa71127 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response and response.result: + if response.has_result(): request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 3fb4d84591a09268932b08ff1e3a4290f2b0708c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 23:10:53 +0800 Subject: [PATCH 14/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 11 +++-------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index cecadd6e51..752305555a 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -232,9 +232,6 @@ def __init__(self, def has_error(self): return self.error_msg is not None - def has_result(self): - return self.result._result - class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` @@ -314,13 +311,11 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: + result = super().create_result(use_fast_logits, mpi_world_rank) return LlmResponse( request_id=self.py_request_id, - result=LlmResult( - super().create_result(use_fast_logits, mpi_world_rank), - self.py_result), - client_id=self.py_client_id, - ) + result=LlmResult(result, self.py_result), + client_id=self.py_client_id) if result is not None else None @property def is_dummy(self): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py 
b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 900aa71127..5b84cd7e37 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response.has_result(): + if response: request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 04370bad2ae9279aa8831e2c7ccf72aae2c570eb Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 00:07:21 +0800 Subject: [PATCH 15/33] polish Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 8f16031a00..76cb2737c6 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -172,6 +172,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) + if (is_llm_response(res) and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) From acd09a45940f4b70f5e61b12798ffe4b7f111204 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:37:14 +0800 Subject: [PATCH 16/33] expose createSerializedResult api Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 3 +++ cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 25 +++++++++++++++++++ .../pybind/batch_manager/bindings.cpp | 8 ++++++ 3 files changed, 36 insertions(+) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index ac5b11d882..d71b6e89f6 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2330,6 +2330,9 @@ class LlmRequest : public GenericLlmRequest std::optional createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + void createSerializedResult( + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0); + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index cfa10eb405..ea1cffa254 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -16,6 +16,7 @@ */ #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/executor/serializeUtils.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" namespace tensorrt_llm::batch_manager @@ -50,6 +51,21 @@ std::optional LlmRequest::createResponse(bool useFastLogits, return std::nullopt; } +void LlmRequest::createSerializedResult( + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0) +{ + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + std::ostringstream oStream; + executor::serialize_utils::serialize(result.value(), oStream); + auto str = oStream.str(); + serializedResult.resize(str.size()); + std::copy(serializedResult.begin(), str.begin(), str.end()); + isFinal = result.isFinal; + } 
+} + /// Note that there is some dependency on the order of operations in this method. Modify with care! std::optional LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { @@ -206,6 +222,15 @@ std::optional LlmRequest::createResult(bool useFastLogits, int return result; } +bool LlmRequest::createSerializedResult(std::string& serializedResult, bool useFastLogits, int32_t mpiWorldRank) +{ + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + executor::serialize + } +} + void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen, bool enableKVCacheReuse) { diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index a3399e1833..62ef664ac6 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -362,6 +362,14 @@ void initBindings(pybind11::module_& m) py::arg("mpi_world_rank") = 0) .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false, py::arg("mpi_world_rank") = 0) + .def("create_serialized_result", + [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) + { + std::vector serialized_result; + bool is_final = False; + self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); + return py::str(serialized_result.data(), serialized_result.size()), is_final; + }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")); From 48a999ce2abe906d86751119ef11897821885c8c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:38:49 +0800 Subject: [PATCH 17/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 752305555a..3ea0eb7ec4 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -213,7 +213,8 @@ def __init__(self, result: tensorrt_llm.bindings.executor.Result, def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - return getattr(self._result, item) + result = object.__getattribute__(self, '_result') + return getattr(result, item) class LlmResponse: From 1e36d773786d98d17d5b6d8f172a6d738e27b74d Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:45:18 +0800 Subject: [PATCH 18/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index ea1cffa254..56e2addf61 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -52,7 +52,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } void LlmRequest::createSerializedResult( - std::vector& serializedResult, bool& 
isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0) + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank) { auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) @@ -61,8 +61,8 @@ void LlmRequest::createSerializedResult( executor::serialize_utils::serialize(result.value(), oStream); auto str = oStream.str(); serializedResult.resize(str.size()); - std::copy(serializedResult.begin(), str.begin(), str.end()); - isFinal = result.isFinal; + std::copy(str.begin(), str.end(), serializedResult.begin()); + isFinal = result.value().isFinal; } } From 4fd71bc78730c7a26cc3759b2a231ae5d8d7ac62 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:47:36 +0800 Subject: [PATCH 19/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 56e2addf61..cd119dccf9 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -222,15 +222,6 @@ std::optional LlmRequest::createResult(bool useFastLogits, int return result; } -bool LlmRequest::createSerializedResult(std::string& serializedResult, bool useFastLogits, int32_t mpiWorldRank) -{ - auto result = createResult(useFastLogits, mpiWorldRank); - if (result.has_value()) - { - executor::serialize - } -} - void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen, bool enableKVCacheReuse) { From 1a4920ba5fdb345b6dd86f7159f765bd4fd6d693 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:38:20 +0800 Subject: [PATCH 20/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index cd119dccf9..433f349b07 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -52,7 +52,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } void LlmRequest::createSerializedResult( - std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank) + std::vector& serializedResult, bool& isFinal, bool useFastLogits, int32_t mpiWorldRank) { auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) From 5e9888a3e774c51a98eb3d478ca45b24b50c25d3 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:40:53 +0800 Subject: [PATCH 21/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 62ef664ac6..ab299af0ba 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -366,7 +366,7 @@ void initBindings(pybind11::module_& m) [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) { std::vector serialized_result; 
- bool is_final = False; + bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); return py::str(serialized_result.data(), serialized_result.size()), is_final; }) From 9bfa834cafbdf7368c72e5aad9aed5b7eff604a8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:49:17 +0800 Subject: [PATCH 22/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index ab299af0ba..e287550535 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -36,6 +36,7 @@ #include #include #include +#include namespace py = pybind11; namespace tb = tensorrt_llm::batch_manager; @@ -368,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return py::str(serialized_result.data(), serialized_result.size()), is_final; + return std::make_tuple < (py::str(serialized_result.data(), serialized_result.size()), is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) From b37703cfade9c0d16f0f8107297d08d6b609bf24 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:27:36 +0800 Subject: [PATCH 23/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index e287550535..9f8f95e8ed 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return std::make_tuple < (py::str(serialized_result.data(), serialized_result.size()), is_final); + return std::make_tuple(serialized_result, is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) From 3e4844d42ef9c35c1d03bb64bd198ce007541408 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:59:59 +0800 Subject: [PATCH 24/33] serialize result Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/executor/request.cpp | 11 ++++++++++- tensorrt_llm/_torch/pyexecutor/llm_request.py | 15 +++++++++------ tensorrt_llm/executor/result.py | 4 ++++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/cpp/tensorrt_llm/pybind/executor/request.cpp b/cpp/tensorrt_llm/pybind/executor/request.cpp index 85fb83ce27..66f7a26e53 100644 --- a/cpp/tensorrt_llm/pybind/executor/request.cpp +++ b/cpp/tensorrt_llm/pybind/executor/request.cpp @@ -19,6 +19,7 @@ #include 
"tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/executor/serializeUtils.h" #include "tensorrt_llm/executor/tensor.h" #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/runtime/cudaStream.h" @@ -775,7 +776,15 @@ void initRequestBindings(pybind11::module_& m) .def_readwrite("context_phase_params", &tle::Result::contextPhaseParams) .def(py::pickle(resultGetstate, resultSetstate)); - auto responseGetstate = [](tle::Response const& self) + m.def("deserialize_result", + [](std::string& x) + { + std::istream is(&x); + return tle::serialize_utils::deserialize(is); + }) + + auto responseGetstate + = [](tle::Response const& self) { return py::make_tuple(self.getRequestId(), self.getResult(), self.getClientId()); }; auto responseSetstate = [](py::tuple const& state) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 3ea0eb7ec4..fd0cd1f969 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -205,16 +205,18 @@ class LlmResult: py_result_properties = frozenset( ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs')) - def __init__(self, result: tensorrt_llm.bindings.executor.Result, - py_result: PyResult): + def __init__(self, + result: str, + py_result: PyResult, + is_final: bool = False): self._result = result self._py_result = py_result + self.is_final = is_final def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - result = object.__getattribute__(self, '_result') - return getattr(result, item) + return getattr(self, item) class LlmResponse: @@ -312,10 +314,11 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: - result = super().create_result(use_fast_logits, mpi_world_rank) + result, is_final = super().create_serialized_result( + use_fast_logits, mpi_world_rank) return LlmResponse( request_id=self.py_request_id, - result=LlmResult(result, self.py_result), + result=LlmResult(result, self.py_result, is_final), client_id=self.py_client_id) if result is not None else None @property diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 0f2e1581ca..a67c35e16d 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -289,6 +289,10 @@ def _handle_response(self, handler(response.error_msg) response_result = response.result + if hasattr(response_result.result, "_result"): + response_result = tllm.deserialize_result( + response_result.result._result) + self._done = response_result.is_final context_phase_params = response_result.context_phase_params self.decoding_iter = response_result.decoding_iter From e5e38731b2bc54dfe22ee7f701479ae2a72375ff Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 16:13:03 +0800 Subject: [PATCH 25/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/executor/request.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/pybind/executor/request.cpp b/cpp/tensorrt_llm/pybind/executor/request.cpp index 66f7a26e53..c8020d7ccb 100644 --- a/cpp/tensorrt_llm/pybind/executor/request.cpp +++ b/cpp/tensorrt_llm/pybind/executor/request.cpp @@ -781,10 +781,9 @@ void initRequestBindings(pybind11::module_& m) { std::istream is(&x); return 
tle::serialize_utils::deserialize(is); - }) + }); - auto responseGetstate - = [](tle::Response const& self) + auto responseGetstate = [](tle::Response const& self) { return py::make_tuple(self.getRequestId(), self.getResult(), self.getClientId()); }; auto responseSetstate = [](py::tuple const& state) From 39a791ac44c5bef1725d52cf8c6ececbe1bad13a Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 16:20:10 +0800 Subject: [PATCH 26/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/executor/request.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/executor/request.cpp b/cpp/tensorrt_llm/pybind/executor/request.cpp index c8020d7ccb..e43ad80ce1 100644 --- a/cpp/tensorrt_llm/pybind/executor/request.cpp +++ b/cpp/tensorrt_llm/pybind/executor/request.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -779,7 +780,7 @@ void initRequestBindings(pybind11::module_& m) m.def("deserialize_result", [](std::string& x) { - std::istream is(&x); + std::istringstream is(x); return tle::serialize_utils::deserialize(is); }); From 883b9f0e670936e579cbfe1c21fc0ccb415b41fa Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 16:58:33 +0800 Subject: [PATCH 27/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 3 ++- tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 9f8f95e8ed..75ccb06774 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,7 +369,8 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return std::make_tuple(serialized_result, is_final); + std::string str(serialized_result.begin(), serialized_result.end()); + return std::make_tuple(str, is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index fd0cd1f969..61c521cd38 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -216,7 +216,7 @@ def __init__(self, def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - return getattr(self, item) + return object.__getattribute__(self, item) class LlmResponse: From ab84d66572fe4a2a3ff07aae8b59c7fbfaddd1a0 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:47:58 +0800 Subject: [PATCH 28/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 3 +-- tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 75ccb06774..95173b545d 100644 --- 
a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,8 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - std::string str(serialized_result.begin(), serialized_result.end()); - return std::make_tuple(str, is_final); + return std::make_tuple(py::bytes(serialized_result.data(), serialized_result.size()), , is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 61c521cd38..5a6ac2a96a 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -206,7 +206,7 @@ class LlmResult: ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs')) def __init__(self, - result: str, + result: bytes, py_result: PyResult, is_final: bool = False): self._result = result From 3c53e791058cc6822b6d002dae158f709c47b314 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:56:27 +0800 Subject: [PATCH 29/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 95173b545d..a6fd8b8e9d 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return std::make_tuple(py::bytes(serialized_result.data(), serialized_result.size()), , is_final); + return std::make_tuple(py::bytes(serialized_result.data(), serialized_result.size()), is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) From 144677be4498a28407c75371a7cdc85557fb025e Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 20:54:18 +0800 Subject: [PATCH 30/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 7 +++++-- tensorrt_llm/executor/result.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 5a6ac2a96a..c3c92a942f 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -216,7 +216,10 @@ def __init__(self, def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - return object.__getattribute__(self, item) + if item == 'is_final': + return object.__getattribute__(self, 'is_final') + result = object.__getattribute__(self, '_result') + return getattr(result, item) class LlmResponse: @@ -319,7 +322,7 @@ def create_response( return LlmResponse( 
request_id=self.py_request_id, result=LlmResult(result, self.py_result, is_final), - client_id=self.py_client_id) if result is not None else None + client_id=self.py_client_id) if len(result) > 0 else None @property def is_dummy(self): diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index a67c35e16d..2f6d61ac3b 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -289,9 +289,9 @@ def _handle_response(self, handler(response.error_msg) response_result = response.result - if hasattr(response_result.result, "_result"): + if hasattr(response_result, "_result"): response_result = tllm.deserialize_result( - response_result.result._result) + response_result._result) self._done = response_result.is_final context_phase_params = response_result.context_phase_params From e5a6efff7a2124b0c428d8fb5687adbaf9802272 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Sat, 14 Jun 2025 14:52:01 +0800 Subject: [PATCH 31/33] use HIGHEST_PROTOCOL Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/serialization.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/executor/serialization.py b/tensorrt_llm/executor/serialization.py index 79a3006b0f..f291a5a746 100644 --- a/tensorrt_llm/executor/serialization.py +++ b/tensorrt_llm/executor/serialization.py @@ -2,6 +2,7 @@ # pickle is not secure, but but this whole file is a wrapper to make it # possible to mitigate the primary risk of code injection via pickle. import pickle # nosec B403 +from functools import partial # These are the base classes that are generally serialized by the ZeroMQ IPC. # If a class is needed by ZMQ routinely it should be added here. If @@ -155,8 +156,8 @@ def find_class(self, module, name): # dump and dumps are just aliases because the serucity controls are on the deserialization # side. However they are included here so that in the future if a more secure serialization # soliton is identified, it can be added with less impact to the rest of the application. 
-dump = pickle.dump # nosec B301 -dumps = pickle.dumps # nosec B301 +dump = partial(pickle.dump, protocol=pickle.HIGHEST_PROTOCOL) # nosec B301 +dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) # nosec B301 def load(file, From 11ffdb80746e97acd8ff8a121aef748bb965e41c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Mon, 16 Jun 2025 12:51:13 +0800 Subject: [PATCH 32/33] fix ci Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 33d44d059b..a7f4b6b2cf 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -295,7 +295,7 @@ def _handle_response(self, response_result = response.result if hasattr(response_result, "_result"): - response_result = tllm.deserialize_result( + response_result._result = tllm.deserialize_result( response_result._result) self._done = response_result.is_final From a54beae0f6551aa3fc19fa6d7be3d88523790436 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Mon, 16 Jun 2025 15:51:09 +0800 Subject: [PATCH 33/33] polish code Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 29 ++++++------- tensorrt_llm/executor/result.py | 3 +- tests/unittest/_torch/test_return_logits.py | 41 ------------------- 3 files changed, 13 insertions(+), 60 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index fe90917469..e792be72ed 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -1,4 +1,5 @@ -from typing import List, Optional +from dataclasses import dataclass +from typing import List, Optional, Union import torch @@ -206,7 +207,7 @@ class LlmResult: ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs')) def __init__(self, - result: bytes, + result: Union[bytes, tensorrt_llm.bindings.executor.Result], py_result: PyResult, is_final: bool = False): self._result = result @@ -221,27 +222,21 @@ def __getattr__(self, item): result = object.__getattribute__(self, '_result') return getattr(result, item) + def deserialize(self): + self._result = tensorrt_llm.bindings.executor.deserialize_result( + self._result) -class LlmResponse: - """LlmResponse wraps `bindings.executor.Response` but detour some features to Python implementation""" - def __init__(self, - request_id: int, - error_msg: str = None, - result: LlmResult = None, - client_id: int = None): - self.request_id = request_id - self.error_msg = error_msg - self.result = result - self.client_id = client_id +@dataclass +class LlmResponse: + request_id: int + error_msg: Optional[str] = None + result: Optional[LlmResult] = None + client_id: Optional[int] = None def has_error(self): return self.error_msg is not None - @property - def _is_llm_response(self) -> bool: - return True - class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index a7f4b6b2cf..2716f80d16 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -295,8 +295,7 @@ def _handle_response(self, response_result = response.result if hasattr(response_result, "_result"): - 
response_result._result = tllm.deserialize_result( - response_result._result) + response_result.deserialize() self._done = response_result.is_final context_phase_params = response_result.context_phase_params diff --git a/tests/unittest/_torch/test_return_logits.py b/tests/unittest/_torch/test_return_logits.py index 8555b8bc5a..2fa21ad417 100644 --- a/tests/unittest/_torch/test_return_logits.py +++ b/tests/unittest/_torch/test_return_logits.py @@ -1,5 +1,4 @@ import os -import pickle import pytest import torch @@ -8,52 +7,12 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.llm_request import LlmResponse, PyResult -from tensorrt_llm.bindings.executor import Response, Result -from tensorrt_llm.executor.result import Logprob from tensorrt_llm.llmapi.llm_utils import BuildConfig, KvCacheConfig prompts = ["A B C"] global_kvcache_config = KvCacheConfig(max_tokens=10000) -def test_LlmResponse_pickle(): - result = Result() - result.decoding_iter = 1 - result.sequence_index = 1 - binding_response = Response(request_id=1, result=result, client_id=1) - py_result = PyResult(prompt_len=1, - max_new_tokens=1, - use_device_memory=True, - streaming=False, - return_log_probs=True, - return_context_logits=True, - return_generation_logits=True) - context_logits = torch.randn([1, 1, 128], device='cuda') - generation_logits = torch.randn([1, 1, 128], device='cuda') - logprobs = [[{1: Logprob(0.8, 1)}]] - py_result.append_context_logits(context_logits) - py_result.append_generation_logits(generation_logits) - py_result.append_log_probs(logprobs) - - response = LlmResponse(binding_response, py_result) - - data = pickle.dumps(response) - pickle_response: LlmResponse = pickle.loads(data) - - assert pickle_response._response.request_id == 1 - assert pickle_response._response.client_id == 1 - - pickle_result = pickle_response.result - - assert pickle_result.decoding_iter == 1 - assert pickle_result.sequence_index == 1 - assert torch.all(torch.eq(pickle_result.context_logits, context_logits)) - assert torch.all( - torch.eq(pickle_result.generation_logits, generation_logits)) - assert pickle_result.log_probs == logprobs - - @force_ampere # Save H100 resource @pytest.mark.parametrize("return_log_probs", [False, True]) @pytest.mark.parametrize("gather_generation_logits", [False, True])
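
A few illustrative notes on the recurring patterns in this series follow. Every code sketch below uses simplified stand-in classes, and pickle in place of the real TensorRT-LLM bindings, purely to keep the examples self-contained and runnable; none of it is the project's actual API.

The pybind __getstate__/__setstate__ lambdas added for Response mirror Python's own pickling hooks: state is reduced to the (request_id, result, client_id) triple and rebuilt from it. A minimal Python-side sketch of that contract, with a hypothetical _Response class standing in for the C++ tle::Response:

    import pickle


    class _Response:
        def __init__(self, request_id, result, client_id=None):
            self.request_id = request_id
            self.result = result
            self.client_id = client_id

        def __getstate__(self):
            # Same triple the binding lambda packs into a py::make_tuple.
            return (self.request_id, self.result, self.client_id)

        def __setstate__(self, state):
            self.request_id, self.result, self.client_id = state


    r = _Response(request_id=1, result={"is_final": True}, client_id=7)
    clone = pickle.loads(pickle.dumps(r))
    assert (clone.request_id, clone.client_id) == (1, 7)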
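
On the llm_request.py __getattr__ fix: __getattr__ is only invoked when normal attribute lookup fails, so the earlier fallback `return getattr(self, item)` re-entered __getattr__ for the same missing name and recursed until RecursionError. Delegating through object.__getattribute__ (and from there to the wrapped result) breaks the cycle. A minimal sketch of the final delegation pattern, with stand-ins for PyResult and the deserialized result:

    class _PyResult:
        """Stand-in for the Python-side extras (logits, log probs, ...)."""
        log_probs = [[{1: 0.8}]]


    class _BindingResult:
        """Stand-in for the deserialized executor result."""
        decoding_iter = 3


    class _LlmResult:
        py_result_properties = frozenset(
            ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs'))

        def __init__(self, result, py_result, is_final=False):
            self._result = result
            self._py_result = py_result
            self.is_final = is_final

        def __getattr__(self, item):
            # Reached only when normal lookup fails, so instance attributes
            # such as is_final never land here; calling getattr(self, item)
            # at this point would re-enter __getattr__ and never terminate.
            if item in self.py_result_properties:
                return getattr(self._py_result, item)
            return getattr(object.__getattribute__(self, '_result'), item)


    r = _LlmResult(_BindingResult(), _PyResult(), is_final=True)
    print(r.log_probs, r.decoding_iter, r.is_final)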
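
On the serialized result path: the later patches ship the executor result across the pybind boundary as py::bytes rather than a std::string, since the buffer is arbitrary binary and a str return would be decoded as UTF-8 by pybind11, and they defer deserialization to the consumer side (the deserialize() helper called from _handle_response). The sketch below mirrors that flow, with pickle standing in for the real serialize/deserialize_result bindings and simplified stand-ins for LlmResult and the dataclass LlmResponse:

    import pickle
    from dataclasses import dataclass
    from typing import Optional, Union


    @dataclass
    class _ExecutorResult:          # what the C++ side would serialize
        is_final: bool
        decoding_iter: int


    class _LlmResult:
        def __init__(self, result: Union[bytes, _ExecutorResult], is_final=False):
            self._result = result   # raw bytes while the response is in flight
            self.is_final = is_final

        def deserialize(self):
            # Paid once, by whichever consumer actually reads the fields.
            self._result = pickle.loads(self._result)

        def __getattr__(self, item):  # same delegation trick as above
            return getattr(object.__getattribute__(self, '_result'), item)


    @dataclass
    class _LlmResponse:             # mirrors the @dataclass form of LlmResponse
        request_id: int
        error_msg: Optional[str] = None
        result: Optional[_LlmResult] = None
        client_id: Optional[int] = None

        def has_error(self):
            return self.error_msg is not None


    payload = pickle.dumps(_ExecutorResult(is_final=True, decoding_iter=2))
    resp = _LlmResponse(request_id=1, result=_LlmResult(payload, is_final=True))

    # Consumer side: unwrap once, then read plain attributes.
    resp.result.deserialize()
    print(resp.result.decoding_iter, resp.result.is_final, resp.has_error())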
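
On the serialization.py change: pinning dump/dumps to pickle.HIGHEST_PROTOCOL through functools.partial keeps the familiar pickle call signatures while the writer always uses the newest protocol (protocol 5 on Python 3.8+), which is generally more compact and faster than the default and supports out-of-band buffers. Only the writing side needs pinning, because loads() detects the protocol from the stream. A small sketch of the behavior:

    import pickle
    from functools import partial

    # partial keeps dumps() a drop-in replacement while fixing the protocol once.
    dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL)

    data = {"tokens": list(range(8)), "is_final": True}
    blob = dumps(data)

    # Readers are unchanged: loads() auto-detects the protocol.
    assert pickle.loads(blob) == data
    print(pickle.HIGHEST_PROTOCOL, len(blob))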