From f038a82c33003d77e817b7df347af6b9057f6233 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:32:41 +0800 Subject: [PATCH 01/33] cache tokens in Python side to reduce pybind reading overhead Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 13 +++++++++++++ tensorrt_llm/_torch/pyexecutor/model_engine.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 63ac568f4d..91e80d07bd 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -246,6 +246,8 @@ def __init__( self.is_cuda_graph_dummy = False self.py_lora_task_layer_module_configs = None + self.py_tokens = super().get_tokens() + self.py_return_log_probs = return_log_probs self.py_return_context_logits = return_context_logits self.py_return_generation_logits = return_generation_logits @@ -260,6 +262,17 @@ def __init__( return_log_probs, return_context_logits, return_generation_logits) + def get_tokens(self, beam: int): + return self.py_tokens[beam] + + def get_last_tokens(self, beam: int): + return self.py_tokens[beam][-1] + + def add_new_token(self, token: int, beam: int): + self.py_tokens[beam].append(token) + # sync to C++ side + super().add_new_token(token, beam) + def create_response( self, use_fast_logits=False, diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 49bf6194b2..db9b421d7a 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -1120,7 +1120,7 @@ def _prepare_tp_inputs( gather_ids.append(len(input_ids) - 1) sequence_lengths.append(len(prompt_tokens)) prompt_lengths.append(len(prompt_tokens)) - past_seen_token_num = request.context_current_position + past_seen_token_num = begin_compute num_cached_tokens_per_seq.append(past_seen_token_num) multimodal_embedding = request.multimodal_embedding if multimodal_embedding is not None: From bba671970cdd2b5341a8d6842f59978976bc32a5 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:36:41 +0800 Subject: [PATCH 02/33] refine Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 91e80d07bd..0c29be89f3 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -262,16 +262,16 @@ def __init__( return_log_probs, return_context_logits, return_generation_logits) - def get_tokens(self, beam: int): + def get_tokens(self, beam: int) -> int: return self.py_tokens[beam] - def get_last_tokens(self, beam: int): + def get_last_tokens(self, beam: int) -> int: return self.py_tokens[beam][-1] - def add_new_token(self, token: int, beam: int): + def add_new_token(self, token: int, beam: int) -> int: self.py_tokens[beam].append(token) # sync to C++ side - super().add_new_token(token, beam) + return super().add_new_token(token, beam) def create_response( self, From 3c836448fb422479406fe8f1856549730bde9e40 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 18:57:54 +0800 Subject: [PATCH 03/33] pure 
Python LlmResponse Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 2 + cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 15 ++++--- .../pybind/batch_manager/bindings.cpp | 2 + tensorrt_llm/_torch/pyexecutor/llm_request.py | 43 ++++++++++--------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 24 ++++++----- tensorrt_llm/executor/result.py | 12 +++--- tensorrt_llm/executor/worker.py | 7 ++- 7 files changed, 57 insertions(+), 48 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 086dc2bf4a..dca20816db 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2328,6 +2328,8 @@ class LlmRequest : public GenericLlmRequest /// @return An optional Response std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0); + executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 6fc7051ad7..a722587b79 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -39,8 +39,16 @@ runtime::SizeType32 GenericLlmRequest::getBeamWidthByIter(bool template class GenericLlmRequest; -/// Note that there is some dependency on the order of operations in this method. Modify with care! std::optional LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank) +{ + auto requestId = isChild() ? mParentRequestId : mRequestId; + auto response = executor::Response(requestId, std::move(createResult(useFastLogits, mpiWorldRank)), mClientId); + + return response; +} + +/// Note that there is some dependency on the order of operations in this method. Modify with care! +executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) @@ -192,11 +200,6 @@ std::optional LlmRequest::createResponse(bool useFastLogits, // Update position of last sent response setMaxSentTokenLen(maxNbTokens); - - auto requestId = isChild() ? 
mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(result), mClientId); - - return response; } void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 35f32a3b12..a3399e1833 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -360,6 +360,8 @@ void initBindings(pybind11::module_& m) py::arg("enable_kv_cache_reuse") = false) .def("create_response", &tb::LlmRequest::createResponse, py::arg("use_fast_logits") = false, py::arg("mpi_world_rank") = 0) + .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false, + py::arg("mpi_world_rank") = 0) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")); diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index c89ad56ec6..ef9bdbde63 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -219,25 +219,18 @@ def __getattr__(self, item): class LlmResponse: """LlmResponse wraps `bindings.executor.Response` but detour some features to Python implementation""" - def __init__(self, response: tensorrt_llm.bindings.executor.Response, - py_result: PyResult): - self._response = response - self._py_result = py_result - - def __getstate__(self): - return self._response, self._py_result - - def __setstate__(self, state): - self._response, self._py_result = state - - @property - def result(self) -> tensorrt_llm.bindings.executor.Result: - return LlmResult( - self._response.result, - self._py_result) # LlmResult masquerades bindings.executor.Result + def __init__(self, + request_id: int, + error: str = None, + result: LlmResult = None, + client_id: int = None): + self.request_id = request_id + self.error = error + self.result = result + self.client_id = client_id - def __getattr__(self, item): - return getattr(self._response, item) + def has_error(self): + return self.error is not None class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): @@ -269,6 +262,7 @@ def __init__( **kwargs) self.py_client_id = client_id self.py_request_id = self.request_id + self.py_llm_request_type = self.llm_request_type self.py_end_id = self.end_id self.py_prompt_len = self.prompt_len self.py_orig_prompt_len = self.orig_prompt_len @@ -299,6 +293,9 @@ def __init__( return_generation_logits, exclude_last_generation_logits) + def is_generation_only_request(self): + return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY + def get_tokens(self, beam: int) -> int: return self.py_tokens[beam] @@ -314,9 +311,13 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: - response = super().create_response(use_fast_logits, mpi_world_rank) - return LlmResponse(response, - self.py_result) if response is not None else None + return LlmResponse( + request_id=self.py_request_id, + result=LlmResult( + super().create_result(use_fast_logits, mpi_world_rank), + self.py_result), + client_id=self.py_client_id, + ) @property def is_dummy(self): diff --git 
a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 54ccc55650..37ae2da196 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -30,8 +30,8 @@ from ..distributed import Distributed from .kv_cache_transceiver import KvCacheTransceiver -from .llm_request import (ExecutorRequest, ExecutorResponse, LlmRequest, - LlmRequestState, executor_request_to_llm_request) +from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState, + LlmResponse, executor_request_to_llm_request) from .model_engine import ModelEngine from .sampler import Sampler, SampleState, SampleStateTensors, TorchSampler from .scheduler import ScheduledRequests @@ -323,14 +323,14 @@ def await_responses( self, id: Optional[Union[List[int], int]] = None, timeout: Optional[datetime.timedelta] = None, - ) -> Union[List[List[ExecutorResponse]], List[ExecutorResponse]]: + ) -> Union[List[List[LlmResponse]], List[LlmResponse]]: """ Await for ready responses Args: id (Optional[Union[List[int], int]]): Request id timeout (Optional[datetime.timedelta]): The maximum time to wait for new responses Returns: - Union[List[tensorrt_llm.bindings.executor.Response], List[List[tensorrt_llm.bindings.executor.Response]]]: Responses + Union[List[LlmResponse], List[List[LlmResponse]]]: Responses """ timeout = timeout.total_seconds() if timeout is not None else None if id is None: @@ -1934,8 +1934,10 @@ def _handle_errors(self, error_msg: Optional[str] = None): req_id = request.py_request_id request.state = LlmRequestState.GENERATION_COMPLETE self._terminate_request(request) - error_responses[req_id] = ExecutorResponse( - req_id, error_msg, client_id=request.py_client_id) + error_responses[req_id] = LlmResponse( + request_id=req_id, + error=error_msg, + client_id=request.py_client_id) self.active_requests.clear() self._enqueue_responses(error_responses) @@ -1979,7 +1981,7 @@ def _handle_cancelled_requests(self): self._enqueue_responses(cancelled_responses) @nvtx_range("_enqueue_responses") - def _enqueue_responses(self, responses: Dict[int, ExecutorResponse]): + def _enqueue_responses(self, responses: Dict[int, LlmResponse]): if 0 not in self.dist.mapping.tp_group and not self.gather_all_responses: return @@ -2036,7 +2038,7 @@ def _handle_responses(self): requests_to_terminate.append(request) continue - if request.is_generation_only_request: + if request.is_generation_only_request(): # If request is in transmission, so we don't need to emit a response # Also, for the first iteration with overlap, we should skip since first # token has already been emitted previously @@ -2048,7 +2050,7 @@ def _handle_responses(self): request.draft_tokens = request.py_draft_tokens request.decoding_iter = request.py_decoding_iter - response: Response = request.create_response(False, self.dist.rank) + response = request.create_response(False, self.dist.rank) request_done = False if response: request_done = response.result.is_final @@ -2075,7 +2077,7 @@ def _terminate_ctx_finished_requests(self): def _await_any_response(self, timeout: Optional[float] = None - ) -> List[ExecutorResponse]: + ) -> List[LlmResponse]: def any_responses_ready(): return len(self.responses) > 0 or self.is_shutdown @@ -2092,7 +2094,7 @@ def any_responses_ready(): def _await_single_response( self, id: int, - timeout: Optional[float] = None) -> List[ExecutorResponse]: + timeout: Optional[float] = None) -> List[LlmResponse]: with self.response_cv: def key_has_response(): diff --git 
a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 0f2e1581ca..acfdc00795 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -16,7 +16,7 @@ from ..llmapi.tracer import global_tracer from ..llmapi.utils import AsyncQueue from ..sampling_params import LogprobParams, SamplingParams -from .utils import ErrorResponse, has_event_loop, is_llm_response +from .utils import ErrorResponse, has_event_loop if TYPE_CHECKING: from .executor import GenerationExecutor @@ -282,7 +282,11 @@ def _handle_response(self, if self._background_error_handler is not None and ( handler := self._background_error_handler()): handler(response.error) - elif is_llm_response(response): + elif isinstance(response, ErrorResponse): + if self._background_error_handler is not None and ( + handler := self._background_error_handler()): + handler(response.error_msg) + elif hasattr(response, "request_id"): if response.has_error(): if self._background_error_handler is not None and ( handler := self._background_error_handler()): @@ -318,10 +322,6 @@ def _handle_response(self, if self._background_error_handler and ( handler := self._background_error_handler()): handler() - elif isinstance(response, ErrorResponse): - if self._background_error_handler is not None and ( - handler := self._background_error_handler()): - handler(response.error_msg) else: raise ValueError(f"Unknown response type: {response}") diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 5f057c6501..0a421e4668 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -39,7 +39,7 @@ from .result import (GenerationResult, IterationResult, LogProbsResult, ResponseWrapper, compute_logprobs) from .utils import (ErrorResponse, IntraProcessQueue, RequestError, - WorkerCommIpcAddrs, has_event_loop, is_llm_response) + WorkerCommIpcAddrs, has_event_loop) __all__ = [ "GenerationExecutorWorker", @@ -994,9 +994,8 @@ def _send_rsp( # Eliminate the finished GenerationRequest instances timely, which may # take considerable memory. 
- if is_llm_response(response): - if response.has_error() or response.result.is_final: - worker._pop_result(response.client_id) + if response.has_error() or response.result.is_final: + worker._pop_result(response.client_id) elif isinstance(response, ErrorResponse): worker._pop_result(response.client_id) else: From f0bb7c8234e89ee0b57ec241a0a88db7c1dd63ad Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:04:38 +0800 Subject: [PATCH 04/33] pure Python LlmResponse Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 6 +++--- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- tensorrt_llm/executor/proxy.py | 6 ++---- tensorrt_llm/executor/utils.py | 9 --------- 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index ef9bdbde63..522f39babf 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -221,16 +221,16 @@ class LlmResponse: def __init__(self, request_id: int, - error: str = None, + error_msg: str = None, result: LlmResult = None, client_id: int = None): self.request_id = request_id - self.error = error + self.error_msg = error_msg self.result = result self.client_id = client_id def has_error(self): - return self.error is not None + return self.error_msg is not None class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 37ae2da196..5b84cd7e37 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -1936,7 +1936,7 @@ def _handle_errors(self, error_msg: Optional[str] = None): self._terminate_request(request) error_responses[req_id] = LlmResponse( request_id=req_id, - error=error_msg, + error_msg=error_msg, client_id=request.py_client_id) self.active_requests.clear() self._enqueue_responses(error_responses) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 76cb2737c6..4e54be2168 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -24,8 +24,7 @@ from .request import CancellingRequest, GenerationRequest from .result import GenerationResult, IterationResult from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs, - create_mpi_comm_session, get_spawn_proxy_process_env, - is_llm_response) + create_mpi_comm_session, get_spawn_proxy_process_env) from .worker import GenerationExecutorWorker, worker_main __all__ = [ @@ -172,8 +171,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) - - if (is_llm_response(res) and res.result.is_final) or isinstance( + if (hasattr(res, "result") and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index bb6466373f..e7b9975a5d 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -8,7 +8,6 @@ from strenum import StrEnum from tensorrt_llm._utils import mpi_rank -from tensorrt_llm.bindings.executor import Response from tensorrt_llm.llmapi.utils import print_colored_debug from ..llmapi.mpi_session import (MpiCommSession, MpiPoolSession, MpiSession, @@ -141,11 +140,3 @@ class WorkerCommIpcAddrs(NamedTuple): result_queue_addr: tuple[str, 
Optional[bytes]] stats_queue_addr: tuple[str, Optional[bytes]] kv_cache_events_queue_addr: tuple[str, Optional[bytes]] - - -def is_llm_response(instance): - from tensorrt_llm._torch.pyexecutor.llm_request import \ - LlmResponse as PyLlmResponse - - from .result import ResponseWrapper - return isinstance(instance, (Response, PyLlmResponse, ResponseWrapper)) From 60ca761095701150ee7c842bfd0bafd655d28660 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:12:04 +0800 Subject: [PATCH 05/33] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 5 +++-- tensorrt_llm/executor/result.py | 12 ++++++------ tensorrt_llm/executor/utils.py | 4 ++++ tensorrt_llm/executor/worker.py | 7 ++++--- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 4e54be2168..8f16031a00 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -24,7 +24,8 @@ from .request import CancellingRequest, GenerationRequest from .result import GenerationResult, IterationResult from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs, - create_mpi_comm_session, get_spawn_proxy_process_env) + create_mpi_comm_session, get_spawn_proxy_process_env, + is_llm_response) from .worker import GenerationExecutorWorker, worker_main __all__ = [ @@ -171,7 +172,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) - if (hasattr(res, "result") and res.result.is_final) or isinstance( + if (is_llm_response(res) and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index acfdc00795..0f2e1581ca 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -16,7 +16,7 @@ from ..llmapi.tracer import global_tracer from ..llmapi.utils import AsyncQueue from ..sampling_params import LogprobParams, SamplingParams -from .utils import ErrorResponse, has_event_loop +from .utils import ErrorResponse, has_event_loop, is_llm_response if TYPE_CHECKING: from .executor import GenerationExecutor @@ -282,11 +282,7 @@ def _handle_response(self, if self._background_error_handler is not None and ( handler := self._background_error_handler()): handler(response.error) - elif isinstance(response, ErrorResponse): - if self._background_error_handler is not None and ( - handler := self._background_error_handler()): - handler(response.error_msg) - elif hasattr(response, "request_id"): + elif is_llm_response(response): if response.has_error(): if self._background_error_handler is not None and ( handler := self._background_error_handler()): @@ -322,6 +318,10 @@ def _handle_response(self, if self._background_error_handler and ( handler := self._background_error_handler()): handler() + elif isinstance(response, ErrorResponse): + if self._background_error_handler is not None and ( + handler := self._background_error_handler()): + handler(response.error_msg) else: raise ValueError(f"Unknown response type: {response}") diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index e7b9975a5d..fd4cd8444e 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -140,3 +140,7 @@ class WorkerCommIpcAddrs(NamedTuple): result_queue_addr: tuple[str, Optional[bytes]] stats_queue_addr: tuple[str, Optional[bytes]] kv_cache_events_queue_addr: tuple[str, 
Optional[bytes]] + + +def is_llm_response(instance): + return hasattr(instance, "result") diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 0a421e4668..5f057c6501 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -39,7 +39,7 @@ from .result import (GenerationResult, IterationResult, LogProbsResult, ResponseWrapper, compute_logprobs) from .utils import (ErrorResponse, IntraProcessQueue, RequestError, - WorkerCommIpcAddrs, has_event_loop) + WorkerCommIpcAddrs, has_event_loop, is_llm_response) __all__ = [ "GenerationExecutorWorker", @@ -994,8 +994,9 @@ def _send_rsp( # Eliminate the finished GenerationRequest instances timely, which may # take considerable memory. - if response.has_error() or response.result.is_final: - worker._pop_result(response.client_id) + if is_llm_response(response): + if response.has_error() or response.result.is_final: + worker._pop_result(response.client_id) elif isinstance(response, ErrorResponse): worker._pop_result(response.client_id) else: From 5f7e9ea233acdcd741c52d06ff6968600331a8fe Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:32:15 +0800 Subject: [PATCH 06/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index a722587b79..2cdb8a5bec 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -48,7 +48,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } /// Note that there is some dependency on the order of operations in this method. Modify with care! -executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) +executor::Result LlmRequest::createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From 1b3a7b7a3286956b18d188b5b1795d80d371b827 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:34:05 +0800 Subject: [PATCH 07/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 2cdb8a5bec..492b45f1d8 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -48,7 +48,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } /// Note that there is some dependency on the order of operations in this method. Modify with care! 
-executor::Result LlmRequest::createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) +executor::Result LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From df6007306b59862662494a2517924a1f5f17612d Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:39:35 +0800 Subject: [PATCH 08/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/include/tensorrt_llm/batch_manager/llmRequest.h | 2 +- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index dca20816db..ac5b11d882 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2328,7 +2328,7 @@ class LlmRequest : public GenericLlmRequest /// @return An optional Response std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0); - executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + std::optional createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 492b45f1d8..1c11688e9e 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -42,13 +42,16 @@ template class GenericLlmRequest; std::optional LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank) { auto requestId = isChild() ? mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(createResult(useFastLogits, mpiWorldRank)), mClientId); - - return response; + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + return executor::Response(requestId, std::move(result), mClientId); + } + return std::nullopt; } /// Note that there is some dependency on the order of operations in this method. Modify with care! 
-executor::Result LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) +std::optional LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From 91c904e85e3f50afb0f1ad1aabbea3fad1758f66 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:41:44 +0800 Subject: [PATCH 09/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 1c11688e9e..702f208375 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -45,7 +45,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) { - return executor::Response(requestId, std::move(result), mClientId); + return executor::Response(requestId, result, mClientId); } return std::nullopt; } From 5d62ceac9b53c522cb12d57c33ae21ab2c5bde6b Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:54:53 +0800 Subject: [PATCH 10/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 702f208375..1a55482b07 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -45,7 +45,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) { - return executor::Response(requestId, result, mClientId); + return executor::Response(requestId, result.value(), mClientId); } return std::nullopt; } From 50eb5b964acb634ebab93dc885b359b26ba07143 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:12:32 +0800 Subject: [PATCH 11/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 1a55482b07..cfa10eb405 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -203,6 +203,7 @@ std::optional LlmRequest::createResult(bool useFastLogits, int // Update position of last sent response setMaxSentTokenLen(maxNbTokens); + return result; } void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, From ea2f8cc9e4fd01cfa9a24538ccf38675a4648ac8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:20:45 +0800 Subject: [PATCH 12/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 5b84cd7e37..3470737f04 100644 --- 
a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response: + if response and response.result: request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 431926d21274e2a2be7483b00b35a51d0e204671 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:27:19 +0800 Subject: [PATCH 13/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 3 +++ tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 522f39babf..cecadd6e51 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -232,6 +232,9 @@ def __init__(self, def has_error(self): return self.error_msg is not None + def has_result(self): + return self.result._result + class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 3470737f04..900aa71127 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response and response.result: + if response.has_result(): request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 3fb4d84591a09268932b08ff1e3a4290f2b0708c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 23:10:53 +0800 Subject: [PATCH 14/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 11 +++-------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index cecadd6e51..752305555a 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -232,9 +232,6 @@ def __init__(self, def has_error(self): return self.error_msg is not None - def has_result(self): - return self.result._result - class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` @@ -314,13 +311,11 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: + result = super().create_result(use_fast_logits, mpi_world_rank) return LlmResponse( request_id=self.py_request_id, - result=LlmResult( - super().create_result(use_fast_logits, mpi_world_rank), - self.py_result), - client_id=self.py_client_id, - ) + result=LlmResult(result, self.py_result), + client_id=self.py_client_id) if result is not None else None @property def is_dummy(self): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py 
b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 900aa71127..5b84cd7e37 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response.has_result(): + if response: request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 04370bad2ae9279aa8831e2c7ccf72aae2c570eb Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 00:07:21 +0800 Subject: [PATCH 15/33] polish Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 8f16031a00..76cb2737c6 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -172,6 +172,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) + if (is_llm_response(res) and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) From acd09a45940f4b70f5e61b12798ffe4b7f111204 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:37:14 +0800 Subject: [PATCH 16/33] expose createSerializedResult api Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 3 +++ cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 25 +++++++++++++++++++ .../pybind/batch_manager/bindings.cpp | 8 ++++++ 3 files changed, 36 insertions(+) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index ac5b11d882..d71b6e89f6 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2330,6 +2330,9 @@ class LlmRequest : public GenericLlmRequest std::optional createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + void createSerializedResult( + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0); + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index cfa10eb405..ea1cffa254 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -16,6 +16,7 @@ */ #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/executor/serializeUtils.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" namespace tensorrt_llm::batch_manager @@ -50,6 +51,21 @@ std::optional LlmRequest::createResponse(bool useFastLogits, return std::nullopt; } +void LlmRequest::createSerializedResult( + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0) +{ + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + std::ostringstream oStream; + executor::serialize_utils::serialize(result.value(), oStream); + auto str = oStream.str(); + serializedResult.resize(str.size()); + std::copy(serializedResult.begin(), str.begin(), str.end()); + isFinal = result.isFinal; + } 
+} + /// Note that there is some dependency on the order of operations in this method. Modify with care! std::optional LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { @@ -206,6 +222,15 @@ std::optional LlmRequest::createResult(bool useFastLogits, int return result; } +bool LlmRequest::createSerializedResult(std::string& serializedResult, bool useFastLogits, int32_t mpiWorldRank) +{ + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + executor::serialize + } +} + void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen, bool enableKVCacheReuse) { diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index a3399e1833..62ef664ac6 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -362,6 +362,14 @@ void initBindings(pybind11::module_& m) py::arg("mpi_world_rank") = 0) .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false, py::arg("mpi_world_rank") = 0) + .def("create_serialized_result", + [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) + { + std::vector serialized_result; + bool is_final = False; + self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); + return py::str(serialized_result.data(), serialized_result.size()), is_final; + }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")); From 48a999ce2abe906d86751119ef11897821885c8c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:38:49 +0800 Subject: [PATCH 17/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 752305555a..3ea0eb7ec4 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -213,7 +213,8 @@ def __init__(self, result: tensorrt_llm.bindings.executor.Result, def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - return getattr(self._result, item) + result = object.__getattribute__(self, '_result') + return getattr(result, item) class LlmResponse: From 1e36d773786d98d17d5b6d8f172a6d738e27b74d Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:45:18 +0800 Subject: [PATCH 18/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index ea1cffa254..56e2addf61 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -52,7 +52,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } void LlmRequest::createSerializedResult( - std::vector& serializedResult, bool& 
isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0) + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank) { auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) @@ -61,8 +61,8 @@ void LlmRequest::createSerializedResult( executor::serialize_utils::serialize(result.value(), oStream); auto str = oStream.str(); serializedResult.resize(str.size()); - std::copy(serializedResult.begin(), str.begin(), str.end()); - isFinal = result.isFinal; + std::copy(str.begin(), str.end(), serializedResult.begin()); + isFinal = result.value().isFinal; } } From 4fd71bc78730c7a26cc3759b2a231ae5d8d7ac62 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:47:36 +0800 Subject: [PATCH 19/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 56e2addf61..cd119dccf9 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -222,15 +222,6 @@ std::optional LlmRequest::createResult(bool useFastLogits, int return result; } -bool LlmRequest::createSerializedResult(std::string& serializedResult, bool useFastLogits, int32_t mpiWorldRank) -{ - auto result = createResult(useFastLogits, mpiWorldRank); - if (result.has_value()) - { - executor::serialize - } -} - void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen, bool enableKVCacheReuse) { From 1a4920ba5fdb345b6dd86f7159f765bd4fd6d693 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:38:20 +0800 Subject: [PATCH 20/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index cd119dccf9..433f349b07 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -52,7 +52,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } void LlmRequest::createSerializedResult( - std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank) + std::vector& serializedResult, bool& isFinal, bool useFastLogits, int32_t mpiWorldRank) { auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) From 5e9888a3e774c51a98eb3d478ca45b24b50c25d3 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:40:53 +0800 Subject: [PATCH 21/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 62ef664ac6..ab299af0ba 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -366,7 +366,7 @@ void initBindings(pybind11::module_& m) [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) { std::vector serialized_result; 
- bool is_final = False; + bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); return py::str(serialized_result.data(), serialized_result.size()), is_final; }) From 9bfa834cafbdf7368c72e5aad9aed5b7eff604a8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:49:17 +0800 Subject: [PATCH 22/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index ab299af0ba..e287550535 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -36,6 +36,7 @@ #include #include #include +#include namespace py = pybind11; namespace tb = tensorrt_llm::batch_manager; @@ -368,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return py::str(serialized_result.data(), serialized_result.size()), is_final; + return std::make_tuple < (py::str(serialized_result.data(), serialized_result.size()), is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) From b37703cfade9c0d16f0f8107297d08d6b609bf24 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:27:36 +0800 Subject: [PATCH 23/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index e287550535..9f8f95e8ed 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return std::make_tuple < (py::str(serialized_result.data(), serialized_result.size()), is_final); + return std::make_tuple(serialized_result, is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) From 3e4844d42ef9c35c1d03bb64bd198ce007541408 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:59:59 +0800 Subject: [PATCH 24/33] serialize result Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/executor/request.cpp | 11 ++++++++++- tensorrt_llm/_torch/pyexecutor/llm_request.py | 15 +++++++++------ tensorrt_llm/executor/result.py | 4 ++++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/cpp/tensorrt_llm/pybind/executor/request.cpp b/cpp/tensorrt_llm/pybind/executor/request.cpp index 85fb83ce27..66f7a26e53 100644 --- a/cpp/tensorrt_llm/pybind/executor/request.cpp +++ b/cpp/tensorrt_llm/pybind/executor/request.cpp @@ -19,6 +19,7 @@ #include 
"tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/executor/serializeUtils.h" #include "tensorrt_llm/executor/tensor.h" #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/runtime/cudaStream.h" @@ -775,7 +776,15 @@ void initRequestBindings(pybind11::module_& m) .def_readwrite("context_phase_params", &tle::Result::contextPhaseParams) .def(py::pickle(resultGetstate, resultSetstate)); - auto responseGetstate = [](tle::Response const& self) + m.def("deserialize_result", + [](std::string& x) + { + std::istream is(&x); + return tle::serialize_utils::deserialize(is); + }) + + auto responseGetstate + = [](tle::Response const& self) { return py::make_tuple(self.getRequestId(), self.getResult(), self.getClientId()); }; auto responseSetstate = [](py::tuple const& state) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 3ea0eb7ec4..fd0cd1f969 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -205,16 +205,18 @@ class LlmResult: py_result_properties = frozenset( ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs')) - def __init__(self, result: tensorrt_llm.bindings.executor.Result, - py_result: PyResult): + def __init__(self, + result: str, + py_result: PyResult, + is_final: bool = False): self._result = result self._py_result = py_result + self.is_final = is_final def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - result = object.__getattribute__(self, '_result') - return getattr(result, item) + return getattr(self, item) class LlmResponse: @@ -312,10 +314,11 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: - result = super().create_result(use_fast_logits, mpi_world_rank) + result, is_final = super().create_serialized_result( + use_fast_logits, mpi_world_rank) return LlmResponse( request_id=self.py_request_id, - result=LlmResult(result, self.py_result), + result=LlmResult(result, self.py_result, is_final), client_id=self.py_client_id) if result is not None else None @property diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 0f2e1581ca..a67c35e16d 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -289,6 +289,10 @@ def _handle_response(self, handler(response.error_msg) response_result = response.result + if hasattr(response_result.result, "_result"): + response_result = tllm.deserialize_result( + response_result.result._result) + self._done = response_result.is_final context_phase_params = response_result.context_phase_params self.decoding_iter = response_result.decoding_iter From e5e38731b2bc54dfe22ee7f701479ae2a72375ff Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 16:13:03 +0800 Subject: [PATCH 25/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/executor/request.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/pybind/executor/request.cpp b/cpp/tensorrt_llm/pybind/executor/request.cpp index 66f7a26e53..c8020d7ccb 100644 --- a/cpp/tensorrt_llm/pybind/executor/request.cpp +++ b/cpp/tensorrt_llm/pybind/executor/request.cpp @@ -781,10 +781,9 @@ void initRequestBindings(pybind11::module_& m) { std::istream is(&x); return 
tle::serialize_utils::deserialize(is); - }) + }); - auto responseGetstate - = [](tle::Response const& self) + auto responseGetstate = [](tle::Response const& self) { return py::make_tuple(self.getRequestId(), self.getResult(), self.getClientId()); }; auto responseSetstate = [](py::tuple const& state) From 39a791ac44c5bef1725d52cf8c6ececbe1bad13a Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 16:20:10 +0800 Subject: [PATCH 26/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/executor/request.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/executor/request.cpp b/cpp/tensorrt_llm/pybind/executor/request.cpp index c8020d7ccb..e43ad80ce1 100644 --- a/cpp/tensorrt_llm/pybind/executor/request.cpp +++ b/cpp/tensorrt_llm/pybind/executor/request.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -779,7 +780,7 @@ void initRequestBindings(pybind11::module_& m) m.def("deserialize_result", [](std::string& x) { - std::istream is(&x); + std::istringstream is(x); return tle::serialize_utils::deserialize(is); }); From 883b9f0e670936e579cbfe1c21fc0ccb415b41fa Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 16:58:33 +0800 Subject: [PATCH 27/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 3 ++- tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 9f8f95e8ed..75ccb06774 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,7 +369,8 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return std::make_tuple(serialized_result, is_final); + std::string str(serialized_result.begin(), serialized_result.end()); + return std::make_tuple(str, is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index fd0cd1f969..61c521cd38 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -216,7 +216,7 @@ def __init__(self, def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - return getattr(self, item) + return object.__getattribute__(self, item) class LlmResponse: From ab84d66572fe4a2a3ff07aae8b59c7fbfaddd1a0 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:47:58 +0800 Subject: [PATCH 28/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 3 +-- tensorrt_llm/_torch/pyexecutor/llm_request.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 75ccb06774..95173b545d 100644 --- 
a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,8 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - std::string str(serialized_result.begin(), serialized_result.end()); - return std::make_tuple(str, is_final); + return std::make_tuple(py::bytes(serialized_result.data(), serialized_result.size()), , is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 61c521cd38..5a6ac2a96a 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -206,7 +206,7 @@ class LlmResult: ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs')) def __init__(self, - result: str, + result: bytes, py_result: PyResult, is_final: bool = False): self._result = result From 3c53e791058cc6822b6d002dae158f709c47b314 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:56:27 +0800 Subject: [PATCH 29/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 95173b545d..a6fd8b8e9d 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return std::make_tuple(py::bytes(serialized_result.data(), serialized_result.size()), , is_final); + return std::make_tuple(py::bytes(serialized_result.data(), serialized_result.size()), is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) From 144677be4498a28407c75371a7cdc85557fb025e Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 20:54:18 +0800 Subject: [PATCH 30/33] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 7 +++++-- tensorrt_llm/executor/result.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 5a6ac2a96a..c3c92a942f 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -216,7 +216,10 @@ def __init__(self, def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - return object.__getattribute__(self, item) + if item == 'is_final': + return object.__getattribute__(self, 'is_final') + result = object.__getattribute__(self, '_result') + return getattr(result, item) class LlmResponse: @@ -319,7 +322,7 @@ def create_response( return LlmResponse( 
request_id=self.py_request_id, result=LlmResult(result, self.py_result, is_final), - client_id=self.py_client_id) if result is not None else None + client_id=self.py_client_id) if len(result) > 0 else None @property def is_dummy(self): diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index a67c35e16d..2f6d61ac3b 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -289,9 +289,9 @@ def _handle_response(self, handler(response.error_msg) response_result = response.result - if hasattr(response_result.result, "_result"): + if hasattr(response_result, "_result"): response_result = tllm.deserialize_result( - response_result.result._result) + response_result._result) self._done = response_result.is_final context_phase_params = response_result.context_phase_params From e5a6efff7a2124b0c428d8fb5687adbaf9802272 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Sat, 14 Jun 2025 14:52:01 +0800 Subject: [PATCH 31/33] use HIGHEST_PROTOCOL Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/serialization.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/executor/serialization.py b/tensorrt_llm/executor/serialization.py index 79a3006b0f..f291a5a746 100644 --- a/tensorrt_llm/executor/serialization.py +++ b/tensorrt_llm/executor/serialization.py @@ -2,6 +2,7 @@ # pickle is not secure, but but this whole file is a wrapper to make it # possible to mitigate the primary risk of code injection via pickle. import pickle # nosec B403 +from functools import partial # These are the base classes that are generally serialized by the ZeroMQ IPC. # If a class is needed by ZMQ routinely it should be added here. If @@ -155,8 +156,8 @@ def find_class(self, module, name): # dump and dumps are just aliases because the serucity controls are on the deserialization # side. However they are included here so that in the future if a more secure serialization # soliton is identified, it can be added with less impact to the rest of the application. 
-dump = pickle.dump # nosec B301 -dumps = pickle.dumps # nosec B301 +dump = partial(pickle.dump, protocol=pickle.HIGHEST_PROTOCOL) # nosec B301 +dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) # nosec B301 def load(file, From 11ffdb80746e97acd8ff8a121aef748bb965e41c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Mon, 16 Jun 2025 12:51:13 +0800 Subject: [PATCH 32/33] fix ci Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 33d44d059b..a7f4b6b2cf 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -295,7 +295,7 @@ def _handle_response(self, response_result = response.result if hasattr(response_result, "_result"): - response_result = tllm.deserialize_result( + response_result._result = tllm.deserialize_result( response_result._result) self._done = response_result.is_final From a54beae0f6551aa3fc19fa6d7be3d88523790436 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Mon, 16 Jun 2025 15:51:09 +0800 Subject: [PATCH 33/33] polish code Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 29 ++++++------- tensorrt_llm/executor/result.py | 3 +- tests/unittest/_torch/test_return_logits.py | 41 ------------------- 3 files changed, 13 insertions(+), 60 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index fe90917469..e792be72ed 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -1,4 +1,5 @@ -from typing import List, Optional +from dataclasses import dataclass +from typing import List, Optional, Union import torch @@ -206,7 +207,7 @@ class LlmResult: ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs')) def __init__(self, - result: bytes, + result: Union[bytes, tensorrt_llm.bindings.executor.Result], py_result: PyResult, is_final: bool = False): self._result = result @@ -221,27 +222,21 @@ def __getattr__(self, item): result = object.__getattribute__(self, '_result') return getattr(result, item) + def deserialize(self): + self._result = tensorrt_llm.bindings.executor.deserialize_result( + self._result) -class LlmResponse: - """LlmResponse wraps `bindings.executor.Response` but detour some features to Python implementation""" - def __init__(self, - request_id: int, - error_msg: str = None, - result: LlmResult = None, - client_id: int = None): - self.request_id = request_id - self.error_msg = error_msg - self.result = result - self.client_id = client_id +@dataclass +class LlmResponse: + request_id: int + error_msg: Optional[str] = None + result: Optional[LlmResult] = None + client_id: Optional[int] = None def has_error(self): return self.error_msg is not None - @property - def _is_llm_response(self) -> bool: - return True - class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index a7f4b6b2cf..2716f80d16 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -295,8 +295,7 @@ def _handle_response(self, response_result = response.result if hasattr(response_result, "_result"): - 
response_result._result = tllm.deserialize_result( - response_result._result) + response_result.deserialize() self._done = response_result.is_final context_phase_params = response_result.context_phase_params diff --git a/tests/unittest/_torch/test_return_logits.py b/tests/unittest/_torch/test_return_logits.py index 8555b8bc5a..2fa21ad417 100644 --- a/tests/unittest/_torch/test_return_logits.py +++ b/tests/unittest/_torch/test_return_logits.py @@ -1,5 +1,4 @@ import os -import pickle import pytest import torch @@ -8,52 +7,12 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.llm_request import LlmResponse, PyResult -from tensorrt_llm.bindings.executor import Response, Result -from tensorrt_llm.executor.result import Logprob from tensorrt_llm.llmapi.llm_utils import BuildConfig, KvCacheConfig prompts = ["A B C"] global_kvcache_config = KvCacheConfig(max_tokens=10000) -def test_LlmResponse_pickle(): - result = Result() - result.decoding_iter = 1 - result.sequence_index = 1 - binding_response = Response(request_id=1, result=result, client_id=1) - py_result = PyResult(prompt_len=1, - max_new_tokens=1, - use_device_memory=True, - streaming=False, - return_log_probs=True, - return_context_logits=True, - return_generation_logits=True) - context_logits = torch.randn([1, 1, 128], device='cuda') - generation_logits = torch.randn([1, 1, 128], device='cuda') - logprobs = [[{1: Logprob(0.8, 1)}]] - py_result.append_context_logits(context_logits) - py_result.append_generation_logits(generation_logits) - py_result.append_log_probs(logprobs) - - response = LlmResponse(binding_response, py_result) - - data = pickle.dumps(response) - pickle_response: LlmResponse = pickle.loads(data) - - assert pickle_response._response.request_id == 1 - assert pickle_response._response.client_id == 1 - - pickle_result = pickle_response.result - - assert pickle_result.decoding_iter == 1 - assert pickle_result.sequence_index == 1 - assert torch.all(torch.eq(pickle_result.context_logits, context_logits)) - assert torch.all( - torch.eq(pickle_result.generation_logits, generation_logits)) - assert pickle_result.log_probs == logprobs - - @force_ampere # Save H100 resource @pytest.mark.parametrize("return_log_probs", [False, True]) @pytest.mark.parametrize("gather_generation_logits", [False, True])
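
A few illustrative notes on the recurring patterns in this series follow. Every code sketch below uses simplified stand-in classes, and pickle in place of the real TensorRT-LLM bindings, purely to keep the examples self-contained and runnable; none of it is the project's actual API.

The pybind __getstate__/__setstate__ lambdas added for Response mirror Python's own pickling hooks: state is reduced to the (request_id, result, client_id) triple and rebuilt from it. A minimal Python-side sketch of that contract, with a hypothetical _Response class standing in for the C++ tle::Response:

    import pickle


    class _Response:
        def __init__(self, request_id, result, client_id=None):
            self.request_id = request_id
            self.result = result
            self.client_id = client_id

        def __getstate__(self):
            # Same triple the binding lambda packs into a py::make_tuple.
            return (self.request_id, self.result, self.client_id)

        def __setstate__(self, state):
            self.request_id, self.result, self.client_id = state


    r = _Response(request_id=1, result={"is_final": True}, client_id=7)
    clone = pickle.loads(pickle.dumps(r))
    assert (clone.request_id, clone.client_id) == (1, 7)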
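
On the llm_request.py __getattr__ fix: __getattr__ is only invoked when normal attribute lookup fails, so the earlier fallback `return getattr(self, item)` re-entered __getattr__ for the same missing name and recursed until RecursionError. Delegating through object.__getattribute__ (and from there to the wrapped result) breaks the cycle. A minimal sketch of the final delegation pattern, with stand-ins for PyResult and the deserialized result:

    class _PyResult:
        """Stand-in for the Python-side extras (logits, log probs, ...)."""
        log_probs = [[{1: 0.8}]]


    class _BindingResult:
        """Stand-in for the deserialized executor result."""
        decoding_iter = 3


    class _LlmResult:
        py_result_properties = frozenset(
            ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs'))

        def __init__(self, result, py_result, is_final=False):
            self._result = result
            self._py_result = py_result
            self.is_final = is_final

        def __getattr__(self, item):
            # Reached only when normal lookup fails, so instance attributes
            # such as is_final never land here; calling getattr(self, item)
            # at this point would re-enter __getattr__ and never terminate.
            if item in self.py_result_properties:
                return getattr(self._py_result, item)
            return getattr(object.__getattribute__(self, '_result'), item)


    r = _LlmResult(_BindingResult(), _PyResult(), is_final=True)
    print(r.log_probs, r.decoding_iter, r.is_final)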
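
On the serialized result path: the later patches ship the executor result across the pybind boundary as py::bytes rather than a std::string, since the buffer is arbitrary binary and a str return would be decoded as UTF-8 by pybind11, and they defer deserialization to the consumer side (the deserialize() helper called from _handle_response). The sketch below mirrors that flow, with pickle standing in for the real serialize/deserialize_result bindings and simplified stand-ins for LlmResult and the dataclass LlmResponse:

    import pickle
    from dataclasses import dataclass
    from typing import Optional, Union


    @dataclass
    class _ExecutorResult:          # what the C++ side would serialize
        is_final: bool
        decoding_iter: int


    class _LlmResult:
        def __init__(self, result: Union[bytes, _ExecutorResult], is_final=False):
            self._result = result   # raw bytes while the response is in flight
            self.is_final = is_final

        def deserialize(self):
            # Paid once, by whichever consumer actually reads the fields.
            self._result = pickle.loads(self._result)

        def __getattr__(self, item):  # same delegation trick as above
            return getattr(object.__getattribute__(self, '_result'), item)


    @dataclass
    class _LlmResponse:             # mirrors the @dataclass form of LlmResponse
        request_id: int
        error_msg: Optional[str] = None
        result: Optional[_LlmResult] = None
        client_id: Optional[int] = None

        def has_error(self):
            return self.error_msg is not None


    payload = pickle.dumps(_ExecutorResult(is_final=True, decoding_iter=2))
    resp = _LlmResponse(request_id=1, result=_LlmResult(payload, is_final=True))

    # Consumer side: unwrap once, then read plain attributes.
    resp.result.deserialize()
    print(resp.result.decoding_iter, resp.result.is_final, resp.has_error())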
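
On the serialization.py change: pinning dump/dumps to pickle.HIGHEST_PROTOCOL through functools.partial keeps the familiar pickle call signatures while the writer always uses the newest protocol (protocol 5 on Python 3.8+), which is generally more compact and faster than the default and supports out-of-band buffers. Only the writing side needs pinning, because loads() detects the protocol from the stream. A small sketch of the behavior:

    import pickle
    from functools import partial

    # partial keeps dumps() a drop-in replacement while fixing the protocol once.
    dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL)

    data = {"tokens": list(range(8)), "is_final": True}
    blob = dumps(data)

    # Readers are unchanged: loads() auto-detects the protocol.
    assert pickle.loads(blob) == data
    print(pickle.HIGHEST_PROTOCOL, len(blob))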