Skip to content

Commit 8d41621

Browse files
michaelchu and claude authored
Fix Railway 502: add /health endpoint and DB init retry logic (#234)
Co-authored-by: Claude <noreply@anthropic.com>
1 parent 45e15b6 commit 8d41621

File tree

2 files changed

+58
-20
lines changed

2 files changed

+58
-20
lines changed

optopsy/ui/app.py

Lines changed: 57 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343
from chainlit.data.sql_alchemy import SQLAlchemyDataLayer
4444
from chainlit.server import app as chainlit_app
4545
from fastapi import HTTPException
46-
from fastapi.responses import FileResponse
46+
from fastapi.responses import FileResponse, JSONResponse
4747
from fastapi.routing import APIRoute
4848

4949
from optopsy.ui.agent import OptopsyAgent, _sanitize_tool_messages
@@ -244,7 +244,13 @@ def _init_db_sync() -> None:
244244
245245
Uses SQLAlchemy so the same DDL works for both SQLite and PostgreSQL.
246246
The sync Postgres path requires ``psycopg2`` (included in the ``ui`` extra).
247+
248+
Retries up to 5 times with exponential backoff so the app survives
249+
transient database unavailability (e.g. Railway starting the DB service
250+
concurrently with the app).
247251
"""
252+
import time
253+
248254
from sqlalchemy import create_engine, text
249255
from sqlalchemy.exc import OperationalError, ProgrammingError
250256

@@ -254,24 +260,47 @@ def _init_db_sync() -> None:
254260
if sync_url.startswith("sqlite"):
255261
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
256262

257-
engine = create_engine(sync_url)
258-
try:
259-
with engine.begin() as conn:
260-
for stmt in _DB_SCHEMA_STATEMENTS:
261-
conn.execute(text(stmt))
262-
# Add columns introduced in newer Chainlit versions.
263-
for col, definition in [
264-
("defaultOpen", "INTEGER DEFAULT 0"),
265-
("waitForAnswer", "INTEGER"),
266-
]:
267-
try:
268-
conn.execute(
269-
text(f'ALTER TABLE steps ADD COLUMN "{col}" {definition}')
270-
)
271-
except (OperationalError, ProgrammingError):
272-
pass # column already exists
273-
finally:
274-
engine.dispose()
263+
_log_init = logging.getLogger(__name__)
264+
max_retries = 5
265+
for attempt in range(max_retries):
266+
engine = create_engine(sync_url)
267+
try:
268+
with engine.begin() as conn:
269+
for stmt in _DB_SCHEMA_STATEMENTS:
270+
conn.execute(text(stmt))
271+
# Add columns introduced in newer Chainlit versions.
272+
for col, definition in [
273+
("defaultOpen", "INTEGER DEFAULT 0"),
274+
("waitForAnswer", "INTEGER"),
275+
]:
276+
try:
277+
conn.execute(
278+
text(f'ALTER TABLE steps ADD COLUMN "{col}" {definition}')
279+
)
280+
except (OperationalError, ProgrammingError):
281+
pass # column already exists
282+
return # success
283+
except Exception:
284+
engine.dispose()
285+
if attempt < max_retries - 1:
286+
delay = 2 ** (attempt + 1)
287+
_log_init.warning(
288+
"DB init attempt %d/%d failed, retrying in %ds…",
289+
attempt + 1,
290+
max_retries,
291+
delay,
292+
exc_info=True,
293+
)
294+
time.sleep(delay)
295+
else:
296+
_log_init.error(
297+
"DB init failed after %d attempts; "
298+
"app will start but persistence may be unavailable",
299+
max_retries,
300+
exc_info=True,
301+
)
302+
finally:
303+
engine.dispose()
275304

276305

277306
_init_db_sync()
@@ -352,6 +381,15 @@ async def _serve_storage_file(file_path: str):
352381
chainlit_app.routes.insert(_insert_idx, _storage_route)
353382

354383

384+
# --- Health check endpoint for Railway / container orchestrators -----------
385+
async def _health():
386+
return JSONResponse({"status": "ok"})
387+
388+
389+
_health_route = APIRoute(path="/health", endpoint=_health, methods=["GET"])
390+
chainlit_app.routes.insert(_insert_idx, _health_route)
391+
392+
355393
@cl.data_layer
356394
def get_data_layer() -> SQLAlchemyDataLayer:
357395
return SQLAlchemyDataLayer(

railway.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ builder = "DOCKERFILE"
33
dockerfilePath = "Dockerfile"
44

55
[deploy]
6-
healthcheckPath = "/"
6+
healthcheckPath = "/health"
77
healthcheckTimeout = 300
88
restartPolicyType = "ON_FAILURE"
99
restartPolicyMaxRetries = 10

0 commit comments

Comments
 (0)