feat(health): add /health/ endpoint for OpsLog monitoring #56

Merged
Merged 2 commits from feature/health-endpoint-opslog into main on 2026-03-06 17:42:10 +00:00
13 changed files with 462 additions and 2 deletions
Showing only changes of commit 10e39b8331 - Show all commits

View File

@@ -50,4 +50,9 @@ RUN pip install --upgrade pip && pip install -r requirements/base.txt
COPY . /app
ARG GIT_SHA=unknown
ARG BUILD_ID=unknown
ENV GIT_SHA=${GIT_SHA} \
BUILD_ID=${BUILD_ID}
CMD ["python", "manage.py", "runserver", "0.0.0.0:8000"]

1
apps/health/__init__.py Normal file
View File

@@ -0,0 +1 @@

6
apps/health/apps.py Normal file
View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class HealthConfig(AppConfig):
    """Django app configuration for the health-check endpoint app."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "apps.health"

80
apps/health/checks.py Normal file
View File

@@ -0,0 +1,80 @@
from __future__ import annotations
import importlib
import os
import time
import uuid
from pathlib import Path
from django.core.cache import cache
from django.db import connection
BACKUP_MAX_AGE_SECONDS = 48 * 60 * 60
def check_db() -> dict[str, float | str]:
started = time.perf_counter()
try:
with connection.cursor() as cursor:
cursor.execute("SELECT 1")
except Exception as exc:
return {"status": "fail", "detail": str(exc)}
return {"status": "ok", "latency_ms": (time.perf_counter() - started) * 1000}
def check_cache() -> dict[str, float | str]:
cache_key = f"health:{uuid.uuid4().hex}"
probe_value = uuid.uuid4().hex
started = time.perf_counter()
try:
cache.set(cache_key, probe_value, timeout=5)
cached_value = cache.get(cache_key)
if cached_value != probe_value:
return {"status": "fail", "detail": "Cache probe returned unexpected value"}
cache.delete(cache_key)
except Exception as exc:
return {"status": "fail", "detail": str(exc)}
return {"status": "ok", "latency_ms": (time.perf_counter() - started) * 1000}
def check_celery() -> dict[str, str]:
broker_url = os.environ.get("CELERY_BROKER_URL")
if not broker_url:
return {"status": "ok", "detail": "Celery not configured: CELERY_BROKER_URL is unset"}
try:
kombu = importlib.import_module("kombu")
except ImportError:
return {"status": "ok", "detail": "Celery broker check skipped: kombu is not installed"}
try:
with kombu.Connection(broker_url, connect_timeout=3) as broker_connection:
broker_connection.ensure_connection(max_retries=1)
except Exception as exc:
return {"status": "fail", "detail": str(exc)}
return {"status": "ok"}
def check_backup() -> dict[str, str]:
backup_status_file = os.environ.get("BACKUP_STATUS_FILE")
if not backup_status_file:
return {"status": "fail", "detail": "Backup monitoring not configured: BACKUP_STATUS_FILE is unset"}
try:
raw_timestamp = Path(backup_status_file).read_text(encoding="utf-8").strip()
except FileNotFoundError:
return {"status": "fail", "detail": f"Backup status file not found: {backup_status_file}"}
except OSError as exc:
return {"status": "fail", "detail": str(exc)}
try:
last_backup_at = float(raw_timestamp)
except ValueError:
return {"status": "fail", "detail": "Invalid backup status file"}
age_seconds = time.time() - last_backup_at
if age_seconds > BACKUP_MAX_AGE_SECONDS:
age_hours = age_seconds / 3600
return {"status": "fail", "detail": f"Last backup is {age_hours:.1f} hours old (> 48 h)"}
return {"status": "ok"}

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,205 @@
from __future__ import annotations
import importlib
import time
from types import SimpleNamespace
import pytest
from django.db.utils import OperationalError
from apps.health import checks
class SuccessfulCursor:
    """Context-managed cursor double whose execute() succeeds silently."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Never suppress exceptions raised inside the with-block.
        return False

    def execute(self, query):
        # Record the last query so a test could inspect it if needed.
        self.query = query
class FailingCursor:
    """Context-managed cursor double that fails every query with OperationalError."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Never suppress exceptions raised inside the with-block.
        return False

    def execute(self, query):
        raise OperationalError("database unavailable")
class FakeCache:
    """In-memory stand-in for the Django cache API used by check_cache().

    When ``value_to_return`` is given, ``get`` always yields that value
    regardless of what was stored, letting tests simulate a corrupted
    probe round-trip.
    """

    def __init__(self, value_to_return=None):
        self.value_to_return = value_to_return
        self.stored = {}

    def set(self, key, value, timeout=None):
        self.stored[key] = value

    def get(self, key):
        # A forced value (if configured) wins over the real store.
        if self.value_to_return is None:
            return self.stored.get(key)
        return self.value_to_return

    def delete(self, key):
        self.stored.pop(key, None)
@pytest.mark.django_db
def test_db_ok(monkeypatch):
    """check_db reports ok and includes a latency when the query succeeds."""
    monkeypatch.setattr(checks.connection, "cursor", lambda: SuccessfulCursor())
    result = checks.check_db()
    assert result["status"] == "ok"
    assert "latency_ms" in result


@pytest.mark.django_db
def test_db_fail(monkeypatch):
    """check_db surfaces the database error text when the query raises."""
    monkeypatch.setattr(checks.connection, "cursor", lambda: FailingCursor())
    result = checks.check_db()
    assert result == {"status": "fail", "detail": "database unavailable"}


@pytest.mark.django_db
def test_cache_ok(monkeypatch):
    """check_cache reports ok when the probe value round-trips intact."""
    monkeypatch.setattr(checks, "cache", FakeCache())
    result = checks.check_cache()
    assert result["status"] == "ok"
    assert "latency_ms" in result


@pytest.mark.django_db
def test_cache_fail(monkeypatch):
    """check_cache fails when the cache returns a value other than the probe."""
    monkeypatch.setattr(checks, "cache", FakeCache(value_to_return="wrong-value"))
    result = checks.check_cache()
    assert result == {"status": "fail", "detail": "Cache probe returned unexpected value"}
def test_celery_no_broker(monkeypatch):
    """Without CELERY_BROKER_URL the check passes and explains why."""
    monkeypatch.delenv("CELERY_BROKER_URL", raising=False)
    result = checks.check_celery()
    assert result["status"] == "ok"
    assert "CELERY_BROKER_URL is unset" in result["detail"]


def test_celery_no_kombu(monkeypatch):
    """With a broker configured but kombu missing, the check is skipped as ok."""
    monkeypatch.setenv("CELERY_BROKER_URL", "redis://broker")

    def raise_import_error(name):
        raise ImportError(name)

    monkeypatch.setattr(importlib, "import_module", raise_import_error)
    result = checks.check_celery()
    assert result["status"] == "ok"
    assert "kombu is not installed" in result["detail"]


def test_celery_ok(monkeypatch):
    """A broker connection that opens and pings cleanly yields plain ok."""
    monkeypatch.setenv("CELERY_BROKER_URL", "redis://broker")

    class FakeBrokerConnection:
        def __init__(self, url, connect_timeout):
            self.url = url
            self.connect_timeout = connect_timeout

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def ensure_connection(self, max_retries):
            self.max_retries = max_retries

    monkeypatch.setattr(importlib, "import_module", lambda name: SimpleNamespace(Connection=FakeBrokerConnection))
    result = checks.check_celery()
    assert result == {"status": "ok"}


def test_celery_fail(monkeypatch):
    """A broker that errors while connecting is reported as fail with detail."""
    monkeypatch.setenv("CELERY_BROKER_URL", "redis://broker")

    class BrokenBrokerConnection:
        def __init__(self, url, connect_timeout):
            self.url = url
            self.connect_timeout = connect_timeout

        def __enter__(self):
            # Simulate the connection failing on entry to the context manager.
            raise OSError("broker down")

        def __exit__(self, exc_type, exc, tb):
            return False

    monkeypatch.setattr(importlib, "import_module", lambda name: SimpleNamespace(Connection=BrokenBrokerConnection))
    result = checks.check_celery()
    assert result == {"status": "fail", "detail": "broker down"}
def test_backup_no_env(monkeypatch):
    """Without BACKUP_STATUS_FILE the backup check fails as unconfigured."""
    monkeypatch.delenv("BACKUP_STATUS_FILE", raising=False)
    result = checks.check_backup()
    assert result["status"] == "fail"
    assert "BACKUP_STATUS_FILE is unset" in result["detail"]


def test_backup_missing_file(monkeypatch, tmp_path):
    """A configured but nonexistent status file is reported as fail."""
    status_file = tmp_path / "missing-backup-status"
    monkeypatch.setenv("BACKUP_STATUS_FILE", str(status_file))
    result = checks.check_backup()
    assert result == {"status": "fail", "detail": f"Backup status file not found: {status_file}"}


def test_backup_fresh(monkeypatch, tmp_path):
    """A timestamp from one minute ago is well within the freshness window."""
    status_file = tmp_path / "backup-status"
    status_file.write_text(str(time.time() - 60), encoding="utf-8")
    monkeypatch.setenv("BACKUP_STATUS_FILE", str(status_file))
    result = checks.check_backup()
    assert result == {"status": "ok"}


def test_backup_stale(monkeypatch, tmp_path):
    """A timestamp just past BACKUP_MAX_AGE_SECONDS triggers a stale failure."""
    status_file = tmp_path / "backup-status"
    stale_timestamp = time.time() - (checks.BACKUP_MAX_AGE_SECONDS + 1)
    status_file.write_text(str(stale_timestamp), encoding="utf-8")
    monkeypatch.setenv("BACKUP_STATUS_FILE", str(status_file))
    result = checks.check_backup()
    assert result["status"] == "fail"
    assert "Last backup is" in result["detail"]


def test_backup_invalid(monkeypatch, tmp_path):
    """Non-numeric file content is reported as an invalid status file."""
    status_file = tmp_path / "backup-status"
    status_file.write_text("not-a-timestamp", encoding="utf-8")
    monkeypatch.setenv("BACKUP_STATUS_FILE", str(status_file))
    result = checks.check_backup()
    assert result == {"status": "fail", "detail": "Invalid backup status file"}

View File

@@ -0,0 +1,103 @@
from __future__ import annotations
import re
import pytest
def _mock_checks(monkeypatch, **overrides):
    """Patch all four check functions as imported into apps.health.views.

    Every check defaults to a passing payload; keyword overrides
    (db/cache/celery/backup) replace individual results so a test can
    simulate one failing subsystem at a time.
    """
    payloads = {
        "db": {"status": "ok", "latency_ms": 1.0},
        "cache": {"status": "ok", "latency_ms": 1.0},
        "celery": {"status": "ok"},
        "backup": {"status": "ok"},
    }
    payloads.update(overrides)
    # Patch the names in the views module, not in apps.health.checks,
    # because health_view binds them at import time.
    monkeypatch.setattr("apps.health.views.check_db", lambda: payloads["db"])
    monkeypatch.setattr("apps.health.views.check_cache", lambda: payloads["cache"])
    monkeypatch.setattr("apps.health.views.check_celery", lambda: payloads["celery"])
    monkeypatch.setattr("apps.health.views.check_backup", lambda: payloads["backup"])
@pytest.mark.django_db
def test_healthy(client, monkeypatch):
    """All checks passing yields HTTP 200 with overall status ok."""
    _mock_checks(monkeypatch)
    response = client.get("/health/")
    assert response.status_code == 200
    assert response.json()["status"] == "ok"


@pytest.mark.django_db
def test_degraded_celery(client, monkeypatch):
    """A failing celery check is non-critical: 200 but status degraded."""
    _mock_checks(monkeypatch, celery={"status": "fail", "detail": "broker down"})
    response = client.get("/health/")
    assert response.status_code == 200
    assert response.json()["status"] == "degraded"


@pytest.mark.django_db
def test_degraded_backup(client, monkeypatch):
    """A failing backup check is non-critical: 200 but status degraded."""
    _mock_checks(monkeypatch, backup={"status": "fail", "detail": "backup missing"})
    response = client.get("/health/")
    assert response.status_code == 200
    assert response.json()["status"] == "degraded"


@pytest.mark.django_db
def test_unhealthy_db(client, monkeypatch):
    """A failing db check is critical: HTTP 503 with status unhealthy."""
    _mock_checks(monkeypatch, db={"status": "fail", "detail": "db down"})
    response = client.get("/health/")
    assert response.status_code == 503
    assert response.json()["status"] == "unhealthy"


@pytest.mark.django_db
def test_unhealthy_cache(client, monkeypatch):
    """A failing cache check is critical: HTTP 503 with status unhealthy."""
    _mock_checks(monkeypatch, cache={"status": "fail", "detail": "cache down"})
    response = client.get("/health/")
    assert response.status_code == 503
    assert response.json()["status"] == "unhealthy"


@pytest.mark.django_db
def test_response_shape(client, monkeypatch):
    """The JSON payload has exactly the documented keys and timestamp format."""
    _mock_checks(monkeypatch)
    payload = client.get("/health/").json()
    assert set(payload) == {"status", "version", "checks", "timestamp"}
    assert set(payload["version"]) == {"git_sha", "build"}
    assert set(payload["checks"]) == {"db", "cache", "celery", "backup"}
    # ISO-8601 UTC with a trailing Z, seconds precision.
    assert re.fullmatch(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z", payload["timestamp"])


@pytest.mark.django_db
def test_version_fields(client, monkeypatch):
    """Version fields are populated (from GIT_SHA / BUILD_ID env vars)."""
    _mock_checks(monkeypatch)
    monkeypatch.setenv("GIT_SHA", "59cc1c4")
    monkeypatch.setenv("BUILD_ID", "build-20260306-59cc1c4")
    payload = client.get("/health/").json()
    assert payload["version"]["git_sha"]
    assert payload["version"]["build"]


@pytest.mark.django_db
def test_no_cache_headers(client, monkeypatch):
    """The endpoint sends no-cache headers so monitors never see stale data."""
    _mock_checks(monkeypatch)
    response = client.get("/health/")
    assert "no-cache" in response["Cache-Control"]

7
apps/health/urls.py Normal file
View File

@@ -0,0 +1,7 @@
from django.urls import path
from apps.health.views import health_view
# Mounted under /health/ by the project-level urlconf.
urlpatterns = [
    path("", health_view, name="health"),
]

42
apps/health/views.py Normal file
View File

@@ -0,0 +1,42 @@
from __future__ import annotations
import os
from collections.abc import Mapping
from datetime import UTC, datetime
from typing import cast
from django.http import JsonResponse
from django.views.decorators.cache import never_cache
from apps.health.checks import check_backup, check_cache, check_celery, check_db
# Checks whose failure makes the whole service unhealthy (HTTP 503);
# all other failures only degrade the reported status.
CRITICAL_CHECKS = {"db", "cache"}


@never_cache
def health_view(request):
    """Aggregate the subsystem checks into a JSON health report.

    Returns HTTP 503 when a critical check (db, cache) fails and HTTP 200
    otherwise; failures of non-critical checks (celery, backup) only set
    the overall status to "degraded".
    """
    checks: dict[str, Mapping[str, object]] = {
        "db": check_db(),
        "cache": check_cache(),
        "celery": check_celery(),
        "backup": check_backup(),
    }
    failing = {name for name, result in checks.items() if cast(str, result["status"]) == "fail"}
    if failing & CRITICAL_CHECKS:
        overall_status = "unhealthy"
    elif failing:
        overall_status = "degraded"
    else:
        overall_status = "ok"
    payload = {
        "status": overall_status,
        "version": {
            # Baked into the image at build time; "unknown" outside Docker.
            "git_sha": os.environ.get("GIT_SHA", "unknown"),
            "build": os.environ.get("BUILD_ID", "unknown"),
        },
        "checks": checks,
        "timestamp": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    return JsonResponse(payload, status=503 if overall_status == "unhealthy" else 200)

View File

@@ -49,6 +49,7 @@ INSTALLED_APPS = [
"tailwind",
"theme",
"django_htmx",
"apps.health",
"apps.core",
"apps.blog",
"apps.authors",

View File

@@ -15,6 +15,7 @@ urlpatterns = [
path("cms/", include("wagtail.admin.urls")),
path("documents/", include("wagtail.documents.urls")),
path("comments/", include("apps.comments.urls")),
path("health/", include("apps.health.urls")),
path("newsletter/", include("apps.newsletter.urls")),
path("consent/", consent_view, name="consent"),
path("robots.txt", robots_txt, name="robots_txt"),

View File

@@ -11,6 +11,10 @@ cd "${SITE_DIR}"
echo "==> Pulling latest code"
git -C "${APP_DIR}" pull origin main
GIT_SHA=$(git -C "${APP_DIR}" rev-parse --short HEAD)
BUILD_ID="build-$(date +%Y%m%d)-${GIT_SHA}"
export GIT_SHA BUILD_ID
echo "==> Updating compose file"
cp "${APP_DIR}/docker-compose.prod.yml" "${SITE_DIR}/docker-compose.prod.yml"
@@ -22,7 +26,7 @@ docker compose -f "${SITE_DIR}/docker-compose.prod.yml" up -d --no-deps --build
echo "==> Waiting for health check"
for i in $(seq 1 30); do
if curl -fsS -H "Host: nohypeai.net" http://localhost:8001/ >/dev/null 2>&1; then
if curl -fsS -H "Host: nohypeai.net" http://localhost:8001/health/ >/dev/null 2>&1; then
echo "==> Site is up"
exit 0
fi

View File

@@ -1,6 +1,10 @@
services:
web:
build: app
build:
context: app
args:
GIT_SHA: ${GIT_SHA:-unknown}
BUILD_ID: ${BUILD_ID:-unknown}
working_dir: /app
command: /app/deploy/entrypoint.prod.sh
env_file: .env