@@ -84,6 +84,8 @@ def __init__(self, deployable: Deployment) -> None:
84
84
if is_high_scale_model is not None and is_high_scale_model .lower () == "true" :
85
85
self ._update_upscale_delay (120 )
86
86
self ._update_downscale_delay (600 )
87
+ self ._update_target_ongoing_requests (4 )
88
+ self ._update_max_concurrent_requests (6 )
87
89
88
90
def _determine_vram_usage (self , model_path : str , total_vram : str ):
89
91
warn (
@@ -189,6 +191,21 @@ def _update_max_replicas(self, num_replicas: int):
189
191
190
192
return self
191
193
194
def _update_target_ongoing_requests(self, target_ongoing_requests: int):
    """Set the ongoing-requests autoscaling target and rebuild the deployment.

    The value is written into ``self._autoscaling_config`` under both
    ``"target_num_ongoing_requests_per_replica"`` and
    ``"target_ongoing_requests"``; the deployment is then re-created with
    ``options(autoscaling_config=...)``.

    Args:
        target_ongoing_requests: Value stored under both config keys.

    Returns:
        This instance, matching ``_update_max_replicas`` so that updater
        calls can be chained.
    """
    # NOTE(review): the two keys look like the legacy and the current
    # spelling of the same autoscaling setting -- confirm the pinned serving
    # library version accepts both keys in one config.
    self._autoscaling_config.update(
        {
            "target_num_ongoing_requests_per_replica": target_ongoing_requests,
            "target_ongoing_requests": target_ongoing_requests,
        }
    )
    self._deployment = self._deployment.options(
        autoscaling_config=self._autoscaling_config
    )
    return self
202
+
203
def _update_max_concurrent_requests(self, max_concurrent_requests: int):
    """Apply a concurrent-request cap to the deployment.

    Re-creates ``self._deployment`` via ``options(...)``, passing the same
    value as both ``max_concurrent_queries`` and ``max_ongoing_requests``.

    Args:
        max_concurrent_requests: Value passed for both keyword options.

    Returns:
        This instance, matching ``_update_max_replicas`` so that updater
        calls can be chained.
    """
    # NOTE(review): both keywords are passed in one call; they look like the
    # legacy and the current name of the same option -- confirm the pinned
    # serving library version accepts both without raising.
    limit_options = {
        "max_concurrent_queries": max_concurrent_requests,
        "max_ongoing_requests": max_concurrent_requests,
    }
    self._deployment = self._deployment.options(**limit_options)
    return self
208
+
192
209
def _update_upscale_delay (self , upscale_delay_s : int ):
193
210
self ._autoscaling_config ["upscale_delay_s" ] = upscale_delay_s
194
211
self ._deployment = self ._deployment .options (
0 commit comments