
Commit 515d530

Author: sampan
Commit message: refactor changes
Signed-off-by: sampan <[email protected]>
1 parent 7a300e0 commit 515d530

File tree: 8 files changed, +92 -85 lines changed

python/ray/dashboard/modules/aggregator/aggregator_agent.py

Lines changed: 37 additions & 37 deletions
@@ -89,43 +89,43 @@
     namespace="ray",
 )
 events_published_to_http_svc = Counter(
-    f"{metrics_prefix}_http_events_published_total",
+    f"{metrics_prefix}_http_publisher_published_events_total",
     "Total number of events successfully published to the HTTP service.",
     tuple(dashboard_consts.COMPONENT_METRICS_TAG_KEYS),
     namespace="ray",
 )
 events_filtered_out_before_http_svc_publish = Counter(
-    f"{metrics_prefix}_http_events_filtered_total",
+    f"{metrics_prefix}_http_publisher_filtered_events_total",
     "Total number of events filtered out before publishing to the HTTP service.",
     tuple(dashboard_consts.COMPONENT_METRICS_TAG_KEYS),
     namespace="ray",
 )
 events_failed_to_publish_to_http_svc = Counter(
-    f"{metrics_prefix}_http_publish_failures_total",
+    f"{metrics_prefix}_http_publisher_failures_total",
     "Total number of events that failed to publish to the HTTP service after retries.",
     tuple(dashboard_consts.COMPONENT_METRICS_TAG_KEYS),
     namespace="ray",
 )
 events_dropped_in_http_svc_publish_queue = Counter(
-    f"{metrics_prefix}_http_publish_queue_dropped_events_total",
+    f"{metrics_prefix}_http_publisher_queue_dropped_events_total",
     "Total number of events dropped because the HTTP publish queue was full.",
     tuple(dashboard_consts.COMPONENT_METRICS_TAG_KEYS) + ("event_type",),
     namespace="ray",
 )
 http_publish_latency_seconds = Histogram(
-    f"{metrics_prefix}_http_publish_duration_seconds",
+    f"{metrics_prefix}_http_publisher_publish_duration_seconds",
     "Duration of HTTP publish calls in seconds.",
     tuple(dashboard_consts.COMPONENT_METRICS_TAG_KEYS) + ("Outcome",),
     namespace="ray",
 )
 http_failed_attempts_since_last_success = Gauge(
-    f"{metrics_prefix}_http_publish_consecutive_failures",
+    f"{metrics_prefix}_http_publisher_consecutive_failures_since_last_success",
     "Number of consecutive failed publish attempts since the last success.",
     tuple(dashboard_consts.COMPONENT_METRICS_TAG_KEYS),
     namespace="ray",
 )
 http_time_since_last_success_seconds = Gauge(
-    f"{metrics_prefix}_http_time_since_last_success_seconds",
+    f"{metrics_prefix}_http_publisher_time_since_last_success_seconds",
     "Seconds since the last successful publish to the HTTP service.",
     tuple(dashboard_consts.COMPONENT_METRICS_TAG_KEYS),
     namespace="ray",
@@ -174,7 +174,7 @@ def __init__(self, dashboard_agent) -> None:
                 f"Publishing events to external HTTP service is enabled. events_export_addr: {self._events_export_addr}"
             )
             self._event_processing_enabled = True
-            self._HttpEndpointPublisher = RayEventsPublisher(
+            self._http_endpoint_publisher = RayEventsPublisher(
                 name="http-endpoint-publisher",
                 publish_client=AsyncHttpPublisherClient(
                     endpoint=self._events_export_addr,
@@ -187,7 +187,7 @@ def __init__(self, dashboard_agent) -> None:
             logger.info(
                 f"Event HTTP target is not enabled or publishing events to external HTTP service is disabled. Skipping sending events to external HTTP service. events_export_addr: {self._events_export_addr}"
             )
-            self._HttpEndpointPublisher = NoopPublisher()
+            self._http_endpoint_publisher = NoopPublisher()

     async def AddEvents(self, request, context) -> None:
         """
@@ -245,11 +245,11 @@ async def _update_metrics(self) -> None:
         }

         http_endpoint_publisher_metrics = await (
-            self._HttpEndpointPublisher.get_and_reset_metrics()
+            self._http_endpoint_publisher.get_and_reset_metrics()
         )

+        # Aggregator agent metrics
         async with self._lock:
-            # Aggregator agent metrics
             _events_received = self._events_received_since_last_metrics_update
             _events_failed_to_add_to_aggregator = (
                 self._events_failed_to_add_to_aggregator_since_last_metrics_update
@@ -258,31 +258,31 @@ async def _update_metrics(self) -> None:
             self._events_received_since_last_metrics_update = 0
             self._events_failed_to_add_to_aggregator_since_last_metrics_update = 0

-            # HTTP service publisher metrics
-            _events_published_to_http_svc = http_endpoint_publisher_metrics.get(
-                "published", 0
-            )
-            _events_filtered_out_before_http_svc_publish = (
-                http_endpoint_publisher_metrics.get("filtered_out", 0)
-            )
-            _events_failed_to_publish_to_http_svc = http_endpoint_publisher_metrics.get(
-                "failed", 0
-            )
-            _events_dropped_in_http_publish_queue_by_type = (
-                http_endpoint_publisher_metrics.get("dropped_events", {})
-            )
-            _http_publish_latency_success_samples = http_endpoint_publisher_metrics.get(
-                "success_latency_seconds", []
-            )
-            _http_publish_latency_failure_samples = http_endpoint_publisher_metrics.get(
-                "failure_latency_seconds", []
-            )
-            _failed_attempts_since_last_success = http_endpoint_publisher_metrics.get(
-                "failed_attempts_since_last_success", 0
-            )
-            _time_since_last_success_seconds = http_endpoint_publisher_metrics.get(
-                "time_since_last_success_seconds", None
-            )
+        # HTTP service publisher metrics
+        _events_published_to_http_svc = http_endpoint_publisher_metrics.get(
+            "published", 0
+        )
+        _events_filtered_out_before_http_svc_publish = (
+            http_endpoint_publisher_metrics.get("filtered_out", 0)
+        )
+        _events_failed_to_publish_to_http_svc = http_endpoint_publisher_metrics.get(
+            "failed", 0
+        )
+        _events_dropped_in_http_publish_queue_by_type = (
+            http_endpoint_publisher_metrics.get("dropped_events", {})
+        )
+        _http_publish_latency_success_samples = http_endpoint_publisher_metrics.get(
+            "success_latency_seconds", []
+        )
+        _http_publish_latency_failure_samples = http_endpoint_publisher_metrics.get(
+            "failure_latency_seconds", []
+        )
+        _failed_attempts_since_last_success = http_endpoint_publisher_metrics.get(
+            "failed_attempts_since_last_success", 0
+        )
+        _time_since_last_success_seconds = http_endpoint_publisher_metrics.get(
+            "time_since_last_success_seconds", None
+        )

         events_received.labels(**common_labels).inc(_events_received)
         events_failed_to_add_to_aggregator.labels(**common_labels).inc(
@@ -328,7 +328,7 @@ async def run(self, server) -> None:
         )

         await asyncio.gather(
-            self._HttpEndpointPublisher.run_forever(),
+            self._http_endpoint_publisher.run_forever(),
             self._update_metrics(),
         )
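
For context on the pattern in these hunks, here is a minimal, self-contained sketch of defining and incrementing one of the renamed counters, assuming prometheus_client-style metric objects (the prefix and label keys below are illustrative, not the module's actual values):

from prometheus_client import Counter

metrics_prefix = "event_aggregator_agent"  # illustrative prefix
events_published_to_http_svc = Counter(
    f"{metrics_prefix}_http_publisher_published_events_total",
    "Total number of events successfully published to the HTTP service.",
    ("Component", "SessionName"),  # hypothetical label keys
    namespace="ray",
)

# Resolve the child for a concrete label set, then add the delta
# accumulated since the last metrics update.
common_labels = {"Component": "aggregator_agent", "SessionName": "session_abc"}
events_published_to_http_svc.labels(**common_labels).inc(42)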

python/ray/dashboard/modules/aggregator/multi_consumer_event_buffer.py

Lines changed: 18 additions & 20 deletions
@@ -13,19 +13,19 @@

 @dataclass
 class _ConsumerState:
-    # index of the next event to be consumed by this consumer
+    # Index of the next event to be consumed by this consumer
     cursor_index: int
-    # map of event type to the number of events evicted for this consumer since last metric update
+    # Map of event type to the number of events evicted for this consumer since last metric update
     evicted_events_count: Dict[str, int]
-    # event to signal that there are new events to consume
+    # Condition variable to signal that there are new events to consume
     has_new_events_to_consume: asyncio.Event


 class MultiConsumerEventBuffer:
     """A buffer which allows adding one event at a time and consuming events in batches.
     Supports multiple consumers, each with their own cursor index. Tracks the number of events evicted for each consumer.

-    Buffer is not thread-safe but is asyncio-friendly. All operations must be called from the same event loop.
+    Buffer is not thread-safe but is asyncio-friendly. All operations must be called from within the same event loop.
     """

     def __init__(self, max_size: int, max_batch_size: int):
@@ -48,20 +48,20 @@ async def add_event(self, event: events_base_event_pb2.RayEvent):
         self._buffer.append(event)

         for _, consumer_state in self._consumers.items():
-            # update consumer cursor index and evicted events count if the event was dropped
+            # Update consumer cursor index and evicted events count if the event was dropped
             if dropped_event is not None:
                 if consumer_state.cursor_index == 0:
-                    # the dropped event was the next event this consumer would have consumed
+                    # The dropped event was the next event this consumer would have consumed, update the evicted events count
                     event_type_name = RayEvent.EventType.Name(
                         dropped_event.event_type
                     )
                     if event_type_name not in consumer_state.evicted_events_count:
                         consumer_state.evicted_events_count[event_type_name] = 0
                     consumer_state.evicted_events_count[event_type_name] += 1
                 else:
-                    # the dropped event was before the consumer's current position, so adjust cursor
+                    # The dropped event was already consumed by the consumer, so we need to adjust the cursor
                     consumer_state.cursor_index -= 1
-            # signal that there are new events to consume
+            # Signal all consumers that there are new events to consume
             consumer_state.has_new_events_to_consume.set()

     async def wait_for_batch(
@@ -81,28 +81,26 @@ async def wait_for_batch(
             timeout_seconds: maximum time to wait for a batch
         """
         max_batch = self._max_batch_size
-        consumer_state = None
-        has_events_to_consume = None
         async with self._lock:
             consumer_state = self._consumers.get(consumer_id)
             if consumer_state is None:
                 raise KeyError(f"unknown consumer '{consumer_id}'")
             has_events_to_consume = consumer_state.has_new_events_to_consume

-        # phase 1: read the first event, we wait indefinitely until there is at least one event to consume
-        # we wait inside a loop to deal with spurious wakeups.
+        # Phase 1: read the first event, wait indefinitely until there is at least one event to consume
+        # Wait inside a loop to deal with spurious wakeups.
         while True:
-            # we wait outside the lock to avoid deadlocks
+            # Wait outside the lock to avoid deadlocks
             await has_events_to_consume.wait()
             async with self._lock:
                 if consumer_state.cursor_index < len(self._buffer):
-                    # add the first event to the batch
+                    # Add the first event to the batch
                     event = self._buffer[consumer_state.cursor_index]
                     consumer_state.cursor_index += 1
                     batch = [event]
                     break

-            # there is no new events to consume, clear the condition variable and wait for it to be set again
+            # There are no new events to consume, clear the condition variable and wait for it to be set again
             has_events_to_consume.clear()

         # Phase 2: add items to the batch up to timeout or until full
@@ -113,7 +111,7 @@ async def wait_for_batch(
                 break

             async with self._lock:
-                # drain whatever is available
+                # Drain whatever is available
                 while len(batch) < max_batch and consumer_state.cursor_index < len(
                     self._buffer
                 ):
@@ -123,12 +121,12 @@ async def wait_for_batch(
                 if len(batch) >= max_batch:
                     break

-            # there is still room in the batch, but no new events to consume, clear the condition variable and wait for it to be set again
+            # There is still room in the batch, but no new events to consume, clear the condition variable and wait for it to be set again
             has_events_to_consume.clear()
             try:
                 await asyncio.wait_for(has_events_to_consume.wait(), remaining)
             except asyncio.TimeoutError:
-                # timeout, we return the batch as is
+                # Timeout, return the current batch
                 break

         return batch
@@ -137,7 +135,7 @@ async def register_consumer(self) -> str:
         """Register a new consumer.

         Returns:
-            id of the consumer
+            Id of the consumer, used to identify the consumer in other methods.
         """
         async with self._lock:
             consumer_id = str(uuid.uuid4())
@@ -149,7 +147,7 @@ async def register_consumer(self) -> str:
         return consumer_id

     async def size(self) -> int:
-        """Get the number of events in the buffer."""
+        """Get total number of events in the buffer. Does not take consumer cursors into account."""
         async with self._lock:
             return len(self._buffer)
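
For reference, a minimal usage sketch of the buffer as refactored above, assuming the import paths below and an empty RayEvent proto purely for illustration (the exact wait_for_batch signature may differ):

import asyncio

from ray.core.generated import events_base_event_pb2  # assumed proto location
from ray.dashboard.modules.aggregator.multi_consumer_event_buffer import (
    MultiConsumerEventBuffer,
)

async def main() -> None:
    buffer = MultiConsumerEventBuffer(max_size=100, max_batch_size=10)
    consumer_id = await buffer.register_consumer()
    # An empty event proto, purely for illustration.
    await buffer.add_event(events_base_event_pb2.RayEvent())
    # Returns at least one event; batches more if they arrive before the timeout.
    batch = await buffer.wait_for_batch(consumer_id, timeout_seconds=0.1)
    assert len(batch) == 1

asyncio.run(main())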

python/ray/dashboard/modules/aggregator/publisher/async_publisher_client.py

Lines changed: 2 additions & 1 deletion
@@ -66,14 +66,15 @@ async def publish(
         self, events_batch: list[events_base_event_pb2.RayEvent]
     ) -> PublishStats:
         if not events_batch:
+            # Nothing to publish -> success but nothing published
             return PublishStats(True, 0, 0)
         filtered = [e for e in events_batch if self._events_filter_fn(e)]
         num_filtered_out = len(events_batch) - len(filtered)
         if not filtered:
             # All filtered out -> success but nothing published
             return PublishStats(True, 0, num_filtered_out)

-        # Convert protobuf objects to python dictionaries for HTTP POST
+        # Convert protobuf objects to python dictionaries for HTTP POST. Run in executor to avoid blocking the event loop.
         filtered_json = await get_or_create_event_loop().run_in_executor(
             self._executor,
             lambda: [
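
The updated comment describes a general pattern: protobuf-to-dict conversion is CPU-bound, so it runs on an executor rather than on the event loop. A standalone sketch of that pattern (function and variable names here are illustrative, not the module's API):

import asyncio
from concurrent.futures import ThreadPoolExecutor

from google.protobuf.json_format import MessageToDict

async def batch_to_dicts(batch, executor: ThreadPoolExecutor) -> list:
    # Serialization is CPU-bound; run it in a worker thread so the event
    # loop stays responsive while large batches are converted.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        executor,
        lambda: [MessageToDict(message) for message in batch],
    )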

python/ray/dashboard/modules/aggregator/publisher/configs.py

Lines changed: 7 additions & 7 deletions
@@ -1,22 +1,22 @@
-# Environment variables for the aggregator agent
+# Environment variables for the aggregator agent publisher component.
 from ray._private import ray_constants

 env_var_prefix = "RAY_DASHBOARD_AGGREGATOR_AGENT_PUBLISHER"
-# timeout for the publisher to publish events to the destination
+# Timeout for the publisher to publish events to the destination
 PUBLISHER_TIMEOUT_SECONDS = ray_constants.env_integer(
-    f"{env_var_prefix}_TIMEOUT_SECONDS", 5
+    f"{env_var_prefix}_TIMEOUT_SECONDS", 3
 )
-# maximum number of retries for publishing events to the destination, if less than 0, will retry indefinitely
+# Maximum number of retries for publishing events to the destination, if less than 0, will retry indefinitely
 PUBLISHER_MAX_RETRIES = ray_constants.env_integer(f"{env_var_prefix}_MAX_RETRIES", -1)
-# initial backoff time for publishing events to the destination
+# Initial backoff time for publishing events to the destination
 PUBLISHER_INITIAL_BACKOFF_SECONDS = ray_constants.env_float(
     f"{env_var_prefix}_INITIAL_BACKOFF_SECONDS", 0.01
 )
-# maximum backoff time for publishing events to the destination
+# Maximum backoff time for publishing events to the destination
 PUBLISHER_MAX_BACKOFF_SECONDS = ray_constants.env_float(
     f"{env_var_prefix}_MAX_BACKOFF_SECONDS", 5.0
 )
-# jitter ratio for publishing events to the destination
+# Jitter ratio for publishing events to the destination
 PUBLISHER_JITTER_RATIO = ray_constants.env_float(f"{env_var_prefix}_JITTER_RATIO", 0.1)
 # Maximum sleep time between sending batches of events to the destination, should be greater than 0.0 to avoid busy looping
 PUBLISHER_MAX_BUFFER_SEND_INTERVAL_SECONDS = ray_constants.env_float(
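
These constants describe a standard retry policy. A minimal sketch of how exponential backoff with jitter is typically derived from them (illustrative only, not necessarily the publisher's exact formula):

import random

def backoff_delay(attempt: int) -> float:
    # Grow exponentially from the initial backoff, capped at the maximum.
    base = min(
        PUBLISHER_INITIAL_BACKOFF_SECONDS * (2 ** attempt),
        PUBLISHER_MAX_BACKOFF_SECONDS,
    )
    # Spread retries out by up to +/- PUBLISHER_JITTER_RATIO of the base.
    jitter = base * PUBLISHER_JITTER_RATIO
    return base + random.uniform(-jitter, jitter)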

python/ray/dashboard/modules/aggregator/publisher/ray_event_publisher.py

Lines changed: 13 additions & 5 deletions
@@ -41,10 +41,9 @@ async def wait_until_running(self, timeout: Optional[float] = None) -> bool:


 class RayEventsPublisher(RayEventsPublisherInterface):
-    """RayEvents publisher that publishes batches of events to a destination using a dedicated async worker.
+    """RayEvents publisher that publishes batches of events to a destination by running a worker loop.

-    The publisher is single-threaded and uses a queue to store batches of events.
-    The worker loop continuously pulls batches from the queue and publishes them.
+    The worker loop continuously pulls batches from the event buffer and publishes them to the destination.
     """

     def __init__(
@@ -78,7 +77,6 @@ def __init__(
         self._event_buffer_consumer_id = None

         # Internal metrics (since last get_and_reset_metrics call)
-        # using thread lock as non publisher threads can also call get_and_reset_metrics
         self._metrics_lock = asyncio.Lock()
         self._metric_events_published_since_last: int = 0
         self._metric_events_filtered_out_since_last: int = 0
@@ -110,17 +108,27 @@ async def run_forever(self) -> None:
                 await self._async_publish_with_retries(batch)
         except asyncio.CancelledError:
             logger.info(f"Publisher {self._name} cancelled, shutting down gracefully")
+            self._started_event.clear()
             await self._publish_client.close()
             raise
         except Exception as e:
             logger.error(f"Publisher {self._name} encountered error: {e}")
+            self._started_event.clear()
             await self._publish_client.close()
             raise

     async def get_and_reset_metrics(self) -> Dict[str, int]:
         """Return a snapshot of internal metrics since last call and reset them.

-        Returns a dict with keys: 'published', 'filtered_out', 'failed', 'queue_dropped'.
+        Returns a dict with the following keys:
+            published: Number of events successfully published since last call
+            filtered_out: Number of events filtered out before publishing since last call
+            failed: Number of events that failed to publish since last call
+            success_latency_seconds: List of publish latencies for successful attempts since last call
+            failure_latency_seconds: List of publish latencies for failed attempts since last call
+            failed_attempts_since_last_success: Number of consecutive failed publish attempts since last successful publish
+            time_since_last_success_seconds: Time elapsed since last successful publish, or None if never succeeded
+            dropped_events: Dict mapping event types to counts of events dropped from buffer
         """
         async with self._metrics_lock:
             if self._metric_last_publish_success_timestamp is None:
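
A short sketch of how a caller might consume the snapshot documented above (key names follow the docstring; the reporting logic is illustrative):

async def report_publisher_metrics(publisher) -> None:
    metrics = await publisher.get_and_reset_metrics()
    dropped_total = sum(metrics.get("dropped_events", {}).values())
    print(
        f"published={metrics.get('published', 0)} "
        f"failed={metrics.get('failed', 0)} "
        f"dropped={dropped_total}"
    )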

python/ray/dashboard/modules/aggregator/tests/test_aggregator_agent.py

Lines changed: 1 addition & 1 deletion
@@ -880,7 +880,7 @@ def test_aggregator_agent_receive_driver_job_execution_event(
     ],
     indirect=True,
 )
-def test_aggregator_agent_publish_disabled_does_not_send_http(
+def test_aggregator_agent_http_svc_publish_disabled(
     ray_start_cluster_head_with_env_vars, httpserver, fake_timestamp
 ):
     cluster = ray_start_cluster_head_with_env_vars

python/ray/dashboard/modules/aggregator/tests/test_multi_consumer_event_buffer.py

Lines changed: 2 additions & 2 deletions
@@ -83,8 +83,8 @@ async def test_add_event_buffer_overflow(self):
         assert evicted_events_count[first_event_type_name] == 1

     @pytest.mark.asyncio
-    async def test_wait_for_batch_multiple_events_immediate(self):
-        """Test waiting for batch when multiple events are immediately available."""
+    async def test_wait_for_batch_multiple_events(self):
+        """Test waiting for batch when multiple events are immediately available and when not all events are available."""
         buffer = MultiConsumerEventBuffer(max_size=10, max_batch_size=3)
         consumer_id = await buffer.register_consumer()