437 changes: 165 additions & 272 deletions python/ray/dashboard/modules/aggregator/aggregator_agent.py

Large diffs are not rendered by default.

@@ -0,0 +1,166 @@

```python
from collections import deque
import asyncio
import time
from typing import Dict, List
from dataclasses import dataclass
import uuid

from ray.core.generated import (
    events_base_event_pb2,
)
from ray.core.generated.events_base_event_pb2 import RayEvent


@dataclass
class _ConsumerState:
    # index of the next event to be consumed by this consumer
    cursor_index: int
    # map of event type to the number of events evicted for this consumer since the last metric update
    evicted_events_count: Dict[str, int]
    # event to signal that there are new events to consume
    has_new_events_to_consume: asyncio.Event


class MultiConsumerEventBuffer:
    """A buffer that supports adding one event at a time and consuming events in batches.

    Supports multiple consumers, each with its own cursor index, and tracks the
    number of events evicted for each consumer.

    The buffer is not thread-safe but is asyncio-friendly. All operations must be
    called from the same event loop.
    """

    def __init__(self, max_size: int, max_batch_size: int):
        self._buffer = deque(maxlen=max_size)
        self._max_size = max_size
        self._lock = asyncio.Lock()
        self._consumers: Dict[str, _ConsumerState] = {}

        self._max_batch_size = max_batch_size

    async def add_event(self, event: events_base_event_pb2.RayEvent):
        """Add an event to the buffer.

        If the buffer is full, the oldest event is dropped.
        """
        async with self._lock:
            dropped_event = None
            if len(self._buffer) >= self._max_size:
                dropped_event = self._buffer.popleft()
            self._buffer.append(event)

            for consumer_state in self._consumers.values():
                # update the consumer's cursor index and evicted-events count if an event was dropped
                if dropped_event is not None:
                    if consumer_state.cursor_index == 0:
                        # the dropped event was the next event this consumer would have consumed
                        event_type_name = RayEvent.EventType.Name(
                            dropped_event.event_type
                        )
                        if event_type_name not in consumer_state.evicted_events_count:
                            consumer_state.evicted_events_count[event_type_name] = 0
                        consumer_state.evicted_events_count[event_type_name] += 1
                    else:
                        # the dropped event was before the consumer's current position, so shift the cursor back
                        consumer_state.cursor_index -= 1
                # signal that there are new events to consume
                consumer_state.has_new_events_to_consume.set()
```

**Contributor** (on the `wait_for_batch` signature):

This API is a bit unusual; I would expect the timeout to be global rather than a timeout that starts after the first event is received.

**Contributor Author** (@sampan-s-nayak, Sep 3, 2025):

I went with this design so that I can avoid using an explicit sleep in the publisher loop.

The loop we have today:

```python
while True:
    batch = await self._event_buffer.wait_for_batch(
        self._event_buffer_consumer_id,
        PUBLISHER_MAX_BUFFER_SEND_INTERVAL_SECONDS,
    )
    await self._async_publish_with_retries(batch)
```

Because of the way `wait_for_batch` is defined, if there are no events to form batches, the loop simply waits and other async tasks can run during this time. If we used a global timeout, we would either have to check whether the batch is empty and retry `wait_for_batch`, or call `asyncio.sleep(x)` every time we receive an empty batch. Both feel inefficient to me.

**Contributor:**

How long is `PUBLISHER_MAX_BUFFER_SEND_INTERVAL_SECONDS`? Unless that value is in milliseconds, it might already serve as the effective sleep interval in the while loop, meaning you don't need an extra `asyncio.sleep(x)` between iterations?

In the Phase 2 implementation of `wait_for_batch`, the logic is nearly the same as Phase 1, except that it uses `await asyncio.wait_for(has_events_to_consume.wait(), remaining)` instead of `await has_events_to_consume.wait()`. The difference is essentially waiting forever (Phase 1) versus waiting with a timeout of a few seconds (Phase 2), at the cost of looping back into the while loop, which is negligible compared to the already seconds-long waits in Phase 2 and still lets other async tasks run.

Your implementation is certainly more efficient in theory, but I wonder if we can keep it simpler while achieving similar efficiency.

**Contributor Author:**

`PUBLISHER_MAX_BUFFER_SEND_INTERVAL_SECONDS`: the default is 0.1 seconds.

As you said, when there are items in the buffer, the approach you are proposing and my implementation are equivalent. The only advantage is when the buffer is empty: instead of busy-waiting, the API I defined blocks until a new event shows up.

If we want to simplify this, we can expose another API called `wait_for_events_to_consume(consumer_id)` and call it before calling `wait_for_batch()` in the worker loop. But in the future, if we have more than one publisher worker, we might get notified that there are events to consume and then find none left when we check (a spurious wakeup). This may still be OK, but I feel the worker loop logic would end up becoming a little complex.

I think I am OK with either approach; I can't really think of any strong reason to pick one over the other. Let me know your preference.

**Contributor:**

Got you; if `PUBLISHER_MAX_BUFFER_SEND_INTERVAL_SECONDS` is that frequent, I think your design makes sense.

Unrelated question: is 0.1s also how frequently we push to the GCS and the external HTTP server? That seems like a very high QPS for the GCS or the external HTTP server to handle and process.

**Collaborator:**

0.1s is indeed very frequent pushing. What's the reason for tuning it so low?

**Contributor Author:**

This was present even before my change (it's controlled by an env variable). I am not aware of the reason behind it. Should I increase it to 0.5/1 second instead?

**Contributor:**

I think so; the current reporting interval to GCS is 1s (https://github.com/ray-project/ray/blob/master/src/ray/common/ray_config_def.h#L457). Let's keep the same value here; 1s should also be sufficient for the external HTTP server. Personally, I think 1s is still a bit too frequent, but let's stick with what we've had so far.

CC: @MengjinYan for context about this value.
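
For comparison, here is a minimal sketch (not in this PR) of the simpler alternative discussed above, assuming a hypothetical `wait_for_batch` variant that uses a global timeout and may return an empty batch:

```python
# Hypothetical alternative loop: the timeout doubles as the sleep interval,
# because an empty batch means the wait expired with nothing to send and the
# loop immediately blocks again for up to the same interval.
while True:
    batch = await self._event_buffer.wait_for_batch_global_timeout(
        self._event_buffer_consumer_id,
        PUBLISHER_MAX_BUFFER_SEND_INTERVAL_SECONDS,
    )
    if not batch:
        continue
    await self._async_publish_with_retries(batch)
```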

```python
    async def wait_for_batch(
        self, consumer_id: str, timeout_seconds: float = 1.0
    ) -> List[events_base_event_pb2.RayEvent]:
        """Wait for a batch, respecting self._max_batch_size and timeout_seconds.

        Returns a batch of up to self._max_batch_size items. Waits for up to
        timeout_seconds after receiving the first event that will be in the
        next batch; after the timeout, returns as many items as are ready.

        Always returns a batch with at least one item: blocks indefinitely
        until the first item comes in.

        Arguments:
            consumer_id: id of the consumer consuming the batch
            timeout_seconds: maximum time to wait after the first event is available
        """
        max_batch = self._max_batch_size
        consumer_state = None
        has_events_to_consume = None
        async with self._lock:
            consumer_state = self._consumers.get(consumer_id)
            if consumer_state is None:
                raise KeyError(f"unknown consumer '{consumer_id}'")
            has_events_to_consume = consumer_state.has_new_events_to_consume

        # Phase 1: read the first event, waiting indefinitely until there is at
        # least one event to consume. We wait inside a loop to deal with
        # spurious wakeups.
        while True:
            # we wait outside the lock to avoid deadlocks
            await has_events_to_consume.wait()
            async with self._lock:
                if consumer_state.cursor_index < len(self._buffer):
                    # add the first event to the batch
                    event = self._buffer[consumer_state.cursor_index]
                    consumer_state.cursor_index += 1
                    batch = [event]
                    break

                # there are no new events to consume; clear the event and wait
                # for it to be set again
                has_events_to_consume.clear()

        # Phase 2: add items to the batch until the timeout expires or the batch is full
        deadline = time.monotonic() + max(0.0, float(timeout_seconds))
        while len(batch) < max_batch:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break

            async with self._lock:
                # drain whatever is available
                while len(batch) < max_batch and consumer_state.cursor_index < len(
                    self._buffer
                ):
                    batch.append(self._buffer[consumer_state.cursor_index])
                    consumer_state.cursor_index += 1

                if len(batch) >= max_batch:
                    break

                # there is still room in the batch but no new events to consume;
                # clear the event and wait for it to be set again
                has_events_to_consume.clear()
            try:
                await asyncio.wait_for(has_events_to_consume.wait(), remaining)
            except asyncio.TimeoutError:
                # timeout; return the batch as is
                break

        return batch

    async def register_consumer(self) -> str:
        """Register a new consumer.

        Returns:
            id of the consumer
        """
        async with self._lock:
            consumer_id = str(uuid.uuid4())
            self._consumers[consumer_id] = _ConsumerState(
                cursor_index=0,
                evicted_events_count={},
                has_new_events_to_consume=asyncio.Event(),
            )
            return consumer_id

    async def size(self) -> int:
        """Get the number of events in the buffer."""
        async with self._lock:
            return len(self._buffer)

    async def get_and_reset_evicted_events_count(
        self, consumer_id: str
    ) -> Dict[str, int]:
        """Get the number of events evicted for a consumer and reset the count."""
        async with self._lock:
            consumer_state = self._consumers.get(consumer_id)
            if consumer_state is None:
                raise KeyError(f"unknown consumer '{consumer_id}'")
            evicted_events_count = consumer_state.evicted_events_count
            consumer_state.evicted_events_count = {}
            return evicted_events_count
```
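
A minimal usage sketch (not part of this PR) showing how a single consumer might drive the buffer, with `MultiConsumerEventBuffer` from the file above in scope; the default-constructed `RayEvent()` instances are illustrative only:

```python
import asyncio

from ray.core.generated.events_base_event_pb2 import RayEvent


async def main():
    # Hold at most 3 events; hand out batches of at most 2.
    buffer = MultiConsumerEventBuffer(max_size=3, max_batch_size=2)
    consumer_id = await buffer.register_consumer()

    # Adding a 4th event evicts the oldest one. Since this consumer has not
    # consumed anything yet, the eviction is counted against it.
    for _ in range(4):
        await buffer.add_event(RayEvent())

    batch = await buffer.wait_for_batch(consumer_id, timeout_seconds=0.1)
    assert len(batch) == 2  # capped by max_batch_size

    evicted = await buffer.get_and_reset_evicted_events_count(consumer_id)
    assert sum(evicted.values()) == 1  # the one event dropped above


asyncio.run(main())
```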
Empty file.
@@ -0,0 +1,121 @@

```python
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
import json
import logging
from typing import Callable

import aiohttp

from ray._common.utils import get_or_create_event_loop
from ray._private.protobuf_compat import message_to_json
from ray.core.generated import events_base_event_pb2
from ray.dashboard.modules.aggregator.publisher.configs import PUBLISHER_TIMEOUT_SECONDS

logger = logging.getLogger(__name__)


@dataclass
class PublishStats:
    """Data class that represents the stats of publishing a batch of events."""

    is_publish_successful: bool
    num_events_published: int
    num_events_filtered_out: int
```

**Contributor** (on `PublishStats`):

Same; can we log metrics directly inside the publisher vs. bubbling them up?

```python
class PublisherClientInterface(ABC):
    """Abstract interface for publishing Ray event batches to external destinations.

    Implementations should handle the actual publishing logic, filtering,
    and format conversion appropriate for their specific destination type.
    """

    @abstractmethod
    async def publish(self, batch) -> PublishStats:
        """Publish a batch of events to the destination."""
        pass

    @abstractmethod
    def count_num_events_in_batch(self, batch) -> int:
        """Count the number of events in a given batch."""
        pass

    @abstractmethod
    async def close(self) -> None:
        """Clean up any resources used by this client.

        Should be called when the publisher client is no longer required.
        """
        pass
```

**Contributor** (on `count_num_events_in_batch`):

nit: `count_num_events_in_batch` -> `batch_size`

**Contributor Author:**

In this PR the batch is just a list of Ray events, but in the next PR, where we add the GCS publisher, the batch will be a tuple of dropped task attempts and a list of events. That is why I made the name a bit more explicit.

```python


class AsyncHttpPublisherClient(PublisherClientInterface):
    """Client for publishing Ray event batches to an external HTTP service."""

    def __init__(
        self,
        endpoint: str,
        executor: ThreadPoolExecutor,
        events_filter_fn: Callable[[object], bool],
        timeout: float = PUBLISHER_TIMEOUT_SECONDS,
    ) -> None:
        self._endpoint = endpoint
        self._executor = executor
        self._events_filter_fn = events_filter_fn
        self._timeout = aiohttp.ClientTimeout(total=timeout)
        self._session = None

    async def publish(
        self, events_batch: list[events_base_event_pb2.RayEvent]
    ) -> PublishStats:
        if not events_batch:
            return PublishStats(True, 0, 0)
        filtered = [e for e in events_batch if self._events_filter_fn(e)]
        num_filtered_out = len(events_batch) - len(filtered)
        if not filtered:
            # All filtered out -> success but nothing published
            return PublishStats(True, 0, num_filtered_out)

        # Convert protobuf objects to python dictionaries for HTTP POST
        filtered_json = await get_or_create_event_loop().run_in_executor(
            self._executor,
            lambda: [
                json.loads(
                    message_to_json(e, always_print_fields_with_no_presence=True)
                )
                for e in filtered
            ],
        )

        try:
            # Create session on first use (lazy initialization)
            if not self._session:
                self._session = aiohttp.ClientSession(timeout=self._timeout)

            return await self._send_http_request(filtered_json, num_filtered_out)
        except Exception as e:
            logger.error("Failed to send events to external service. Error: %s", e)
            return PublishStats(False, 0, 0)
```

**Contributor** (on the filtering in `publish`):

Maybe move the filtering inside the consumer info of `multi_consumer_event_buffer` so we can fetch a more meaningful batch here; maybe a TODO, not in this PR.

```python

    async def _send_http_request(self, json_data, num_filtered_out):
        async with self._session.post(
            self._endpoint,
            json=json_data,
        ) as resp:
            resp.raise_for_status()
            return PublishStats(True, len(json_data), num_filtered_out)

    def count_num_events_in_batch(
        self, events_batch: list[events_base_event_pb2.RayEvent]
    ) -> int:
        return len(events_batch)

    async def close(self) -> None:
        """Close the HTTP session if one was created.

        Should be called when the publisher client is no longer required.
        """
        if self._session:
            await self._session.close()
            self._session = None

    def set_session(self, session) -> None:
        """Inject an HTTP client session. Intended for testing.

        If a session is set explicitly, it will be used and managed by close().
        """
        self._session = session
```
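
A minimal sketch (not part of this PR) of driving the client directly, with `AsyncHttpPublisherClient` from the file above in scope; the endpoint URL is a hypothetical receiver:

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

from ray.core.generated.events_base_event_pb2 import RayEvent


async def main():
    client = AsyncHttpPublisherClient(
        endpoint="http://localhost:8080/events",  # hypothetical receiver
        executor=ThreadPoolExecutor(max_workers=1),
        events_filter_fn=lambda event: True,  # keep every event
    )
    try:
        stats = await client.publish([RayEvent()])
        print(stats.is_publish_successful, stats.num_events_published)
    finally:
        # Always release the lazily created aiohttp session.
        await client.close()


asyncio.run(main())
```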
24 changes: 24 additions & 0 deletions python/ray/dashboard/modules/aggregator/publisher/configs.py
@@ -0,0 +1,24 @@

```python
# Environment variables for the aggregator agent
from ray._private import ray_constants

env_var_prefix = "RAY_DASHBOARD_AGGREGATOR_AGENT_PUBLISHER"
# timeout for the publisher to publish events to the destination
PUBLISHER_TIMEOUT_SECONDS = ray_constants.env_integer(
    f"{env_var_prefix}_TIMEOUT_SECONDS", 5
)
# maximum number of retries for publishing events to the destination;
# if less than 0, retries indefinitely
PUBLISHER_MAX_RETRIES = ray_constants.env_integer(f"{env_var_prefix}_MAX_RETRIES", -1)
# initial backoff time for publishing events to the destination
PUBLISHER_INITIAL_BACKOFF_SECONDS = ray_constants.env_float(
    f"{env_var_prefix}_INITIAL_BACKOFF_SECONDS", 0.01
)
# maximum backoff time for publishing events to the destination
PUBLISHER_MAX_BACKOFF_SECONDS = ray_constants.env_float(
    f"{env_var_prefix}_MAX_BACKOFF_SECONDS", 5.0
)
# jitter ratio for publishing events to the destination
PUBLISHER_JITTER_RATIO = ray_constants.env_float(f"{env_var_prefix}_JITTER_RATIO", 0.1)
# maximum sleep time between sending batches of events to the destination;
# should be greater than 0.0 to avoid busy looping
PUBLISHER_MAX_BUFFER_SEND_INTERVAL_SECONDS = ray_constants.env_float(
    f"{env_var_prefix}_MAX_BUFFER_SEND_INTERVAL_SECONDS", 0.1
)
```
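
These knobs suggest a capped exponential backoff with jitter between publish retries. The retry helper itself is not in this file, so the following is only a sketch of how the constants might combine:

```python
import random

from ray.dashboard.modules.aggregator.publisher.configs import (
    PUBLISHER_INITIAL_BACKOFF_SECONDS,
    PUBLISHER_JITTER_RATIO,
    PUBLISHER_MAX_BACKOFF_SECONDS,
)


def backoff_seconds(attempt: int) -> float:
    """Hypothetical helper: delay before retry number `attempt` (0-based)."""
    base = min(
        PUBLISHER_INITIAL_BACKOFF_SECONDS * (2**attempt),
        PUBLISHER_MAX_BACKOFF_SECONDS,
    )
    # Spread retries out by +/- PUBLISHER_JITTER_RATIO of the base delay.
    jitter = base * PUBLISHER_JITTER_RATIO
    return base + random.uniform(-jitter, jitter)
```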