Skip to content

Disable healthchecks for Loki and Promtail in `docker-compose.monitor… #62

Disable healthchecks for Loki and Promtail in `docker-compose.monitor…

Disable healthchecks for Loki and Promtail in `docker-compose.monitor… #62

name: Monitoring Stack Tests

# Run on pushes and pull requests against the long-lived branches, but only
# when a file the monitoring stack or its tests depend on has changed.
on:
  push:
    branches: [main, develop]
    paths:
      - 'docker-compose.monitoring-rpi.yml'
      # The CI override is layered on top of the base compose file by the
      # `docker compose` calls in this workflow, so edits to it must re-run it.
      - 'docker-compose.monitoring-ci.yml'
      - 'grafana-config/**'
      - 'tests/integration/test_monitoring_stack.py'
      - 'scripts/test-monitoring-integration.sh'
      - '.github/workflows/test-monitoring-stack.yml'
  pull_request:
    branches: [main, develop]
    paths:
      - 'docker-compose.monitoring-rpi.yml'
      # Keep this list identical to the `push` filter above.
      - 'docker-compose.monitoring-ci.yml'
      - 'grafana-config/**'
      - 'tests/integration/test_monitoring_stack.py'
      - 'scripts/test-monitoring-integration.sh'
      - '.github/workflows/test-monitoring-stack.yml'
  workflow_dispatch:  # Allow manual trigger
jobs:
  # Brings the monitoring stack up with Docker Compose, waits for it to become
  # reachable, runs the shell and pytest integration suites against it, and
  # always tears the stack down afterwards.
  test-monitoring-stack:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.13
        uses: actions/setup-python@v6
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: '3.13'

      - name: Install UV
        run: pip install uv

      - name: Install dependencies
        run: uv sync --all-groups

      - name: Start monitoring stack
        run: |
          # Use CI override to remove non-root user directives (volume permission issue)
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml up -d
          echo "Services started, waiting for readiness..."
          echo ""

          # Wait up to 120 seconds for critical services to be healthy
          TIMEOUT=120
          ELAPSED=0
          INTERVAL=5
          ALL_READY=false

          # Prints "ready" if the URL answers with a success status, otherwise
          # "not_ready". --max-time bounds each probe so a stalled endpoint
          # cannot block the polling loop indefinitely.
          check_http_health() {
            local url=$1
            if curl -sf --max-time "$INTERVAL" "$url" >/dev/null 2>&1; then
              echo "ready"
            else
              echo "not_ready"
            fi
          }

          # Prints the container's run state (e.g. "running"), or "not_found"
          # when the container cannot be inspected.
          container_state() {
            docker inspect --format='{{.State.Status}}' "$1" 2>/dev/null || echo "not_found"
          }

          # Prints the container's healthcheck status. Containers without a
          # healthcheck have no .State.Health, so the template guards for it
          # and reports "no_healthcheck" instead of failing into "not_found".
          container_health() {
            docker inspect \
              --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no_healthcheck{{end}}' \
              "$1" 2>/dev/null || echo "not_found"
          }

          while [ $ELAPSED -lt $TIMEOUT ]; do
            echo "[$ELAPSED/$TIMEOUT seconds] Checking service health..."

            # Get health status of all services (informational output only)
            TEMPO_STATUS=$(container_state tempo)
            LOKI_HEALTH=$(container_health loki)
            PROMETHEUS_HEALTH=$(container_health prometheus)
            OTEL_STATUS=$(container_state otel-collector)
            GRAFANA_HEALTH=$(container_health grafana)
            PROMTAIL_HEALTH=$(container_health promtail)

            # HTTP probes are what actually gate readiness below
            LOKI_HTTP=$(check_http_health "http://localhost:3100/ready")
            PROMETHEUS_HTTP=$(check_http_health "http://localhost:9090/-/healthy")
            GRAFANA_HTTP=$(check_http_health "http://localhost:3000/api/health")
            PROMTAIL_HTTP=$(check_http_health "http://localhost:9080/ready")

            echo " ├─ Tempo: $TEMPO_STATUS"
            echo " ├─ Loki: $LOKI_HEALTH (HTTP: $LOKI_HTTP)"
            echo " ├─ Prometheus: $PROMETHEUS_HEALTH (HTTP: $PROMETHEUS_HTTP)"
            echo " ├─ OTEL Collector: $OTEL_STATUS"
            echo " ├─ Grafana: $GRAFANA_HEALTH (HTTP: $GRAFANA_HTTP)"
            echo " └─ Promtail: $PROMTAIL_HEALTH (HTTP: $PROMTAIL_HTTP)"

            # Check if critical services are healthy/running. Tempo and the
            # OTEL Collector are judged by container state; the rest by HTTP.
            if [ "$TEMPO_STATUS" = "running" ] && \
               [ "$LOKI_HTTP" = "ready" ] && \
               [ "$PROMETHEUS_HTTP" = "ready" ] && \
               [ "$OTEL_STATUS" = "running" ] && \
               [ "$GRAFANA_HTTP" = "ready" ] && \
               [ "$PROMTAIL_HTTP" = "ready" ]; then
              echo ""
              echo "✅ All critical services are ready!"
              ALL_READY=true
              break
            fi

            sleep $INTERVAL
            ELAPSED=$((ELAPSED + INTERVAL))
            echo ""
          done

          # Deliberately non-fatal: the test steps that follow report the
          # concrete failures if a service never came up.
          if [ "$ALL_READY" = "false" ]; then
            echo "⚠️ WARNING: Services did not become healthy within $TIMEOUT seconds"
            echo "Continuing with tests anyway..."
          fi

          echo ""
          echo "=== Final Service Status ==="
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml ps

      - name: Check service status
        run: |
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml ps
          echo "---"
          echo "Service logs:"
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml logs --tail=20

      - name: Run shell-based smoke tests
        run: |
          chmod +x scripts/test-monitoring-integration.sh
          ./scripts/test-monitoring-integration.sh

      - name: Run Python integration tests
        run: |
          uv run pytest tests/integration/test_monitoring_stack.py -v --run-integration

      # Dump the full logs of every service, but only when an earlier step failed.
      - name: Collect logs on failure
        if: failure()
        run: |
          echo "=== Docker Compose Status ==="
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml ps
          echo ""
          echo "=== Tempo Logs ==="
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml logs tempo
          echo ""
          echo "=== Loki Logs ==="
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml logs loki
          echo ""
          echo "=== Prometheus Logs ==="
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml logs prometheus
          echo ""
          echo "=== OTEL Collector Logs ==="
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml logs otel-collector
          echo ""
          echo "=== Grafana Logs ==="
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml logs grafana
          echo ""
          echo "=== Promtail Logs ==="
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml logs promtail

      # Runs regardless of outcome so containers and volumes (-v) never leak.
      - name: Cleanup
        if: always()
        run: |
          docker compose -f docker-compose.monitoring-rpi.yml -f docker-compose.monitoring-ci.yml down -v