4343from chainlit .data .sql_alchemy import SQLAlchemyDataLayer
4444from chainlit .server import app as chainlit_app
4545from fastapi import HTTPException
46- from fastapi .responses import FileResponse
46+ from fastapi .responses import FileResponse , JSONResponse
4747from fastapi .routing import APIRoute
4848
4949from optopsy .ui .agent import OptopsyAgent , _sanitize_tool_messages
@@ -244,7 +244,13 @@ def _init_db_sync() -> None:
244244
245245 Uses SQLAlchemy so the same DDL works for both SQLite and PostgreSQL.
246246 The sync Postgres path requires ``psycopg2`` (included in the ``ui`` extra).
247+
248+ Retries up to 5 times with exponential backoff so the app survives
249+ transient database unavailability (e.g. Railway starting the DB service
250+ concurrently with the app).
247251 """
252+ import time
253+
248254 from sqlalchemy import create_engine , text
249255 from sqlalchemy .exc import OperationalError , ProgrammingError
250256
@@ -254,24 +260,47 @@ def _init_db_sync() -> None:
254260 if sync_url .startswith ("sqlite" ):
255261 DB_PATH .parent .mkdir (parents = True , exist_ok = True )
256262
257- engine = create_engine (sync_url )
258- try :
259- with engine .begin () as conn :
260- for stmt in _DB_SCHEMA_STATEMENTS :
261- conn .execute (text (stmt ))
262- # Add columns introduced in newer Chainlit versions.
263- for col , definition in [
264- ("defaultOpen" , "INTEGER DEFAULT 0" ),
265- ("waitForAnswer" , "INTEGER" ),
266- ]:
267- try :
268- conn .execute (
269- text (f'ALTER TABLE steps ADD COLUMN "{ col } " { definition } ' )
270- )
271- except (OperationalError , ProgrammingError ):
272- pass # column already exists
273- finally :
274- engine .dispose ()
263+ _log_init = logging .getLogger (__name__ )
264+ max_retries = 5
265+ for attempt in range (max_retries ):
266+ engine = create_engine (sync_url )
267+ try :
268+ with engine .begin () as conn :
269+ for stmt in _DB_SCHEMA_STATEMENTS :
270+ conn .execute (text (stmt ))
271+ # Add columns introduced in newer Chainlit versions.
272+ for col , definition in [
273+ ("defaultOpen" , "INTEGER DEFAULT 0" ),
274+ ("waitForAnswer" , "INTEGER" ),
275+ ]:
276+ try :
277+ conn .execute (
278+ text (f'ALTER TABLE steps ADD COLUMN "{ col } " { definition } ' )
279+ )
280+ except (OperationalError , ProgrammingError ):
281+ pass # column already exists
282+ return # success
283+ except Exception :
284+ engine .dispose ()
285+ if attempt < max_retries - 1 :
286+ delay = 2 ** (attempt + 1 )
287+ _log_init .warning (
288+ "DB init attempt %d/%d failed, retrying in %ds…" ,
289+ attempt + 1 ,
290+ max_retries ,
291+ delay ,
292+ exc_info = True ,
293+ )
294+ time .sleep (delay )
295+ else :
296+ _log_init .error (
297+ "DB init failed after %d attempts; "
298+ "app will start but persistence may be unavailable" ,
299+ max_retries ,
300+ exc_info = True ,
301+ )
302+ finally :
303+ engine .dispose ()
275304
276305
277306_init_db_sync ()
@@ -352,6 +381,15 @@ async def _serve_storage_file(file_path: str):
352381chainlit_app .routes .insert (_insert_idx , _storage_route )
353382
354383
384+ # --- Health check endpoint for Railway / container orchestrators -----------
async def _health():
    """Answer a health probe with a fixed JSON body.

    The handler takes no input and inspects no application state (the
    database is not touched); it always wraps ``{"status": "ok"}`` in a
    ``JSONResponse``.
    """
    payload = {"status": "ok"}
    return JSONResponse(payload)
387+
388+
# Expose the health handler as GET /health.
_health_route = APIRoute(
    path="/health",
    endpoint=_health,
    methods=["GET"],
)
# Registered at ``_insert_idx`` -- the same position used for the storage
# route -- rather than appended to the end of the route list.
# NOTE(review): presumably this keeps it ahead of a Chainlit catch-all
# route; confirm where ``_insert_idx`` is computed.
chainlit_app.routes.insert(_insert_idx, _health_route)
391+
392+
355393@cl .data_layer
356394def get_data_layer () -> SQLAlchemyDataLayer :
357395 return SQLAlchemyDataLayer (
0 commit comments