1- """ZMQ Broker Management Client - lifecycle management for ZmqBrokerService.
1+ """ZMQ Broker Management Client - read-only query interface for ZmqBrokerService.
22
3- This module provides a management interface to start/stop the broker service
4- and query its status via PID/status files. Analogous to RabbitmqManagementClient.
3+ This module provides a read-only interface to query broker service status
4+ via PID/status files. The broker lifecycle is managed by circus (production)
5+ or test helpers (testing).
56"""
67
78from __future__ import annotations
89
910import json
1011import os
1112import shutil
12- import signal
13- import subprocess
14- import sys
15- import time
1613from pathlib import Path
1714from typing import Any
1815
2522
2623
2724class ZmqBrokerManagementClient :
28- """Management client for ZmqBrokerService.
25+ """Read-only management client for ZmqBrokerService.
2926
30- Allows external code to:
31- - Start/stop the broker service
32- - Check if service is running
33- - Get service status
27+ Provides:
28+ - Status queries (is_running, get_status, get_pid)
29+ - Endpoint discovery (router_endpoint, pub_endpoint)
3430
3531 Interacts with the service via PID/status files, not direct IPC.
36- Analogous to RabbitmqManagementClient for the RabbitMQ broker.
3732 """
3833
3934 def __init__ (self , base_path : Path | str ):
40- """Initialize the controller .
35+ """Initialize the client .
4136
4237 :param base_path: Base path for broker data (same as ZmqBrokerService)
4338 """
@@ -64,9 +59,6 @@ def status_file(self) -> Path:
6459 def _get_sockets_path (self ) -> Path | None :
6560 """Read the socket directory path from file.
6661
67- The socket directory is created in a temp location by ZmqBrokerService
68- to avoid Unix domain socket path length limits.
69-
7062 :return: Path to socket directory, or None if not available
7163 """
7264 if not self ._sockets_file .exists ():
@@ -121,16 +113,12 @@ def _validate_pid(self, pid: int) -> bool:
121113 if HAS_PSUTIL :
122114 try :
123115 proc = psutil .Process (pid )
124- # Check if process is running and is a Python process
125116 if proc .is_running () and proc .status () != psutil .STATUS_ZOMBIE :
126- # Verify it's our broker by checking command line
127117 cmdline = proc .cmdline ()
128118 return any ('aiida.brokers.zmq' in arg for arg in cmdline )
129119 except (psutil .NoSuchProcess , psutil .AccessDenied ):
130120 return False
131121 else :
132- # Fallback: check if process exists and verify it's our broker
133- # via /proc on Linux, or accept the PID-reuse risk on other platforms
134122 try :
135123 os .kill (pid , 0 )
136124 except OSError :
@@ -152,10 +140,6 @@ def _validate_pid(self, pid: int) -> bool:
152140 def is_running (self ) -> bool :
153141 """Check if broker service is running.
154142
155- Validates that:
156- 1. PID file exists
157- 2. PID in file corresponds to a running broker process
158-
159143 :return: True if service is running
160144 """
161145 pid = self .get_pid ()
@@ -177,169 +161,13 @@ def get_status(self) -> dict[str, Any] | None:
177161 except (json .JSONDecodeError , OSError ):
178162 return None
179163
180- def start (
181- self ,
182- foreground : bool = False ,
183- wait : bool = True ,
184- timeout : float = 10.0 ,
185- ) -> bool :
186- """Start the broker service.
187-
188- :param foreground: If True, run in foreground (blocking); else daemonize
189- :param wait: If True and not foreground, wait for service to start
190- :param timeout: Timeout in seconds when waiting for service to start
191- :return: True if service started successfully
192- """
193- if self .is_running ():
194- return True
195-
196- # Ensure base path exists
197- self ._base_path .mkdir (parents = True , exist_ok = True )
198-
199- # Build command
200- cmd = [
201- sys .executable ,
202- '-m' ,
203- 'aiida.brokers.zmq.service' ,
204- '--base-path' ,
205- str (self ._base_path ),
206- ]
207-
208- if foreground :
209- # Run in foreground (blocking)
210- subprocess .run (cmd , check = True )
211- return True
212- else :
213- # Run as daemon (detached process)
214- # Use subprocess with appropriate flags for daemon behavior
215- if sys .platform == 'win32' :
216- # Windows: use CREATE_NEW_PROCESS_GROUP
217- subprocess .Popen (
218- cmd ,
219- creationflags = subprocess .CREATE_NEW_PROCESS_GROUP | subprocess .DETACHED_PROCESS ,
220- stdout = subprocess .DEVNULL ,
221- stderr = subprocess .DEVNULL ,
222- stdin = subprocess .DEVNULL ,
223- )
224- else :
225- # Unix: use start_new_session
226- subprocess .Popen (
227- cmd ,
228- start_new_session = True ,
229- stdout = subprocess .DEVNULL ,
230- stderr = subprocess .DEVNULL ,
231- stdin = subprocess .DEVNULL ,
232- )
233-
234- if wait :
235- return self ._wait_for_start (timeout )
236-
237- return True
238-
239- def _wait_for_start (self , timeout : float ) -> bool :
240- """Wait for service to start.
241-
242- :param timeout: Timeout in seconds
243- :return: True if service started within timeout
244- """
245- start_time = time .time ()
246- while time .time () - start_time < timeout :
247- if self .is_running ():
248- return True
249- time .sleep (0.1 )
250- return False
251-
252- def stop (self , timeout : float = 5.0 ) -> bool :
253- """Stop the broker service.
254-
255- Uses SIGINT for cross-platform graceful shutdown.
256- Falls back to hard kill if timeout expires.
257-
258- :param timeout: Seconds to wait for graceful shutdown
259- :return: True if stopped successfully
260- """
261- pid = self .get_pid ()
262- if pid is None :
263- return True
264-
265- if not self ._validate_pid (pid ):
266- # PID file exists but process is not running, clean up
267- self ._cleanup_stale_files ()
268- return True
269-
270- # Send SIGINT for graceful shutdown (works on all platforms)
271- try :
272- os .kill (pid , signal .SIGINT )
273- except OSError :
274- # Process already gone
275- self ._cleanup_stale_files ()
276- return True
277-
278- # Wait for graceful shutdown
279- if self ._wait_for_stop (pid , timeout ):
280- return True
281-
282- # Graceful shutdown failed, try hard kill
283- return self ._force_kill (pid )
284-
285- def _wait_for_stop (self , pid : int , timeout : float ) -> bool :
286- """Wait for process to stop.
287-
288- :param pid: Process ID
289- :param timeout: Timeout in seconds
290- :return: True if process stopped within timeout
291- """
292- start_time = time .time ()
293- while time .time () - start_time < timeout :
294- if not self ._validate_pid (pid ):
295- self ._cleanup_stale_files ()
296- return True
297- time .sleep (0.1 )
298- return False
299-
300- def _force_kill (self , pid : int ) -> bool :
301- """Force kill a process.
302-
303- :param pid: Process ID
304- :return: True if killed successfully
305- """
306- if HAS_PSUTIL :
307- try :
308- proc = psutil .Process (pid )
309- proc .terminate () # Sends SIGTERM on Unix, TerminateProcess on Windows
310- proc .wait (timeout = 2.0 )
311- self ._cleanup_stale_files ()
312- return True
313- except (psutil .NoSuchProcess , psutil .TimeoutExpired ):
314- try :
315- proc .kill () # SIGKILL on Unix, TerminateProcess on Windows
316- self ._cleanup_stale_files ()
317- return True
318- except psutil .NoSuchProcess :
319- self ._cleanup_stale_files ()
320- return True
321- except psutil .AccessDenied :
322- return False
323- else :
324- # Without psutil, try SIGKILL on Unix (not available on Windows)
325- if sys .platform != 'win32' :
326- try :
327- os .kill (pid , signal .SIGKILL )
328- self ._cleanup_stale_files ()
329- return True
330- except OSError :
331- self ._cleanup_stale_files ()
332- return True
333- return False # type: ignore[unreachable] # Windows without psutil
334-
335164 def _cleanup_stale_files (self ) -> None :
336165 """Clean up stale PID, status, and socket files."""
337166 if self ._pid_file .exists ():
338167 self ._pid_file .unlink (missing_ok = True )
339168 if self ._status_file .exists ():
340169 self ._status_file .unlink (missing_ok = True )
341170
342- # Clean up orphaned socket directory
343171 sockets_path = self ._get_sockets_path ()
344172 if sockets_path is not None and sockets_path .exists ():
345173 try :
@@ -348,12 +176,3 @@ def _cleanup_stale_files(self) -> None:
348176 pass
349177 if self ._sockets_file .exists ():
350178 self ._sockets_file .unlink (missing_ok = True )
351-
352- def restart (self , timeout : float = 5.0 ) -> bool :
353- """Restart the broker service.
354-
355- :param timeout: Timeout for stop operation
356- :return: True if restarted successfully
357- """
358- self .stop (timeout = timeout )
359- return self .start (wait = True )
0 commit comments