Skip to content

Commit

Permalink
unify healthchecks
Browse files Browse the repository at this point in the history
  • Loading branch information
felixrindt committed Sep 13, 2023
1 parent 9601db4 commit b861866
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 82 deletions.
178 changes: 178 additions & 0 deletions ephios/core/services/health/healthchecks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
import os
from pathlib import Path

from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.humanize.templatetags.humanize import naturaltime
from django.dispatch import receiver
from django.utils.safestring import mark_safe
from django.utils.translation import gettext_lazy as _

from ephios.core.dynamic_preferences_registry import LastRunPeriodicCall
from ephios.core.signals import register_healthchecks

# Health checks monitor the health of the application while it is running.
# In contrast, Django's system checks validate the configuration of the application.


def run_healthchecks():
    """
    Run all registered health checks and yield their results one by one.

    The ``register_healthchecks`` signal is sent to collect health check classes.
    Each class is instantiated without arguments and its ``check()`` method is called.
    Yields tuples of (check instance, status, message).
    """
    # The receiver part of each response is not needed. It is deliberately not
    # named ``_`` so the gettext alias imported at module level is not shadowed.
    for _receiver, check_classes in register_healthchecks.send(None):
        for check_class in check_classes:
            instance = check_class()
            status, message = instance.check()
            yield instance, status, message


class HealthCheckStatus:
    """Namespace holding the status strings a health check can report."""

    # Plain strings, so they can be compared directly against string literals.
    OK, WARNING, ERROR = "ok", "warning", "error"


class AbstractHealthCheck:
    """
    Interface that every health check implements.

    Subclasses supply ``slug``, ``name`` and ``description`` and implement ``check()``.
    ``documentation_link`` is optional and defaults to ``None``.
    """

    @property
    def slug(self):
        """Unique slug identifying this health check."""
        raise NotImplementedError

    @property
    def name(self):
        """Short name of this health check."""
        raise NotImplementedError

    @property
    def description(self):
        """Short description of what this health check covers."""
        raise NotImplementedError

    @property
    def documentation_link(self):
        """Link to the documentation of this health check, ``None`` if there is none."""
        return None

    def check(self):
        """
        Perform the check.

        Must return a tuple of (status, message) where status is one of the
        values defined on HealthCheckStatus.
        """
        raise NotImplementedError


class DBHealthCheck(AbstractHealthCheck):
    """Health check verifying that the default database can be reached and queried."""

    slug = "db"
    name = _("Database")
    description = _("The database is the central storage for all data.")
    documentation_link = "https://docs.djangoproject.com/en/stable/ref/databases/"

    def check(self):
        """
        Return a tuple of (status, message) describing the database health.

        ERROR with the exception text if opening a cursor or running a query raises,
        WARNING if the default database uses the SQLite backend, OK otherwise.
        """
        from django.db import connection

        try:
            # Use the cursor as a context manager so it is closed again right away
            # instead of being left open until it gets garbage collected.
            with connection.cursor():
                pass
            Permission.objects.exists()
        except Exception as e:  # any failure here means the database is unhealthy
            return HealthCheckStatus.ERROR, str(e)

        if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.sqlite3":
            return HealthCheckStatus.WARNING, _(
                "Using SQLite, this is not recommended in production."
            )

        return HealthCheckStatus.OK, _("Database connection established.")


class CacheHealthCheck(AbstractHealthCheck):
    """Health check verifying that a value can be written to and read back from the cache."""

    slug = "cache"
    name = _("Cache")
    description = _("The cache is used to store temporary data.")
    documentation_link = "https://docs.djangoproject.com/en/stable/topics/cache/"

    def check(self):
        """
        Return a tuple of (status, message) describing the cache health.

        ERROR if the set/get round trip raises or returns a different value,
        WARNING if the default cache backend is LocMemCache, OK otherwise.
        """
        from django.core import cache

        try:
            cache.cache.set("_healthcheck", "1")
            roundtrip_ok = cache.cache.get("_healthcheck") == "1"
        except Exception as e:
            return HealthCheckStatus.ERROR, str(e)
        if not roundtrip_ok:
            return HealthCheckStatus.ERROR, "Cache not available"

        default_backend = settings.CACHES.get("default", {}).get("BACKEND")
        if default_backend == "django.core.cache.backends.locmem.LocMemCache":
            return HealthCheckStatus.WARNING, _(
                "Using LocMemCache, this is not recommended in production."
            )

        return HealthCheckStatus.OK, _("Cache connection established.")


class CronJobHealthCheck(AbstractHealthCheck):
    """Health check reporting whether the periodic call has run recently."""

    slug = "cronjob"
    name = _("Cronjob")
    description = _(
        "A cron job must regularly call ephios to do recurring tasks like sending notifications."
    )
    documentation_link = (
        "https://docs.ephios.de/en/stable/admin/deployment/manual/index.html#setup-cron"
    )

    def check(self):
        """
        Return a tuple of (status, message) describing the cron job health.

        OK if the periodic call is not stuck, WARNING if it is stuck but has run
        before, ERROR if it is stuck and no last run is known.
        """
        last_call = LastRunPeriodicCall.get_last_call()

        if not LastRunPeriodicCall.is_stuck():
            ok_message = _("Last run {last_call}.").format(last_call=naturaltime(last_call))
            return HealthCheckStatus.OK, mark_safe(ok_message)

        if not last_call:
            return HealthCheckStatus.ERROR, mark_safe(_("Cronjob stuck, no last run."))

        stuck_message = _("Cronjob stuck, last run {last_call}.").format(
            last_call=naturaltime(last_call),
        )
        return HealthCheckStatus.WARNING, mark_safe(stuck_message)


class WritableMediaRootHealthCheck(AbstractHealthCheck):
    """Health check verifying that the process has write access to ``settings.MEDIA_ROOT``."""

    slug = "writable_media_root"
    name = _("Writable Media Root")
    description = _("The media root must be writable by the application server.")
    documentation_link = (
        "https://docs.ephios.de/en/stable/admin/deployment/manual/index.html#data-directory"
    )

    def check(self):
        """
        Return a tuple of (status, message) describing media root writability.

        OK if ``os.access`` reports write permission on the media root, ERROR otherwise.
        """
        writable = os.access(Path(settings.MEDIA_ROOT), os.W_OK)
        if writable:
            return (
                HealthCheckStatus.OK,
                mark_safe(_("Media root writable by application server.")),
            )
        return (
            HealthCheckStatus.ERROR,
            mark_safe(_("Media root not writable by application server.")),
        )


@receiver(register_healthchecks, dispatch_uid="ephios.core.healthchecks.register_core_healthchecks")
def register_core_healthchecks(sender, **kwargs):
    """
    Receiver for ``register_healthchecks`` providing the health checks defined in this module.

    Yields the health check classes (not instances).
    """
    yield from (
        DBHealthCheck,
        CacheHealthCheck,
        CronJobHealthCheck,
        WritableMediaRootHealthCheck,
    )
6 changes: 6 additions & 0 deletions ephios/core/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@
Receivers should return a list of subclasses of ``ephios.core.notifications.backends.AbstractNotificationBackend``
"""

register_healthchecks = PluginSignal()
"""
This signal is sent out to get all health checks that can be run to monitor the health of the application.
Receivers should return a list of subclasses of ``ephios.core.services.health.healthchecks.AbstractHealthCheck``
"""

periodic_signal = PluginSignal()
"""
This signal is called periodically, at least every 15 minutes.
Expand Down
83 changes: 40 additions & 43 deletions ephios/core/templates/core/settings/settings_instance.html
Original file line number Diff line number Diff line change
Expand Up @@ -10,50 +10,47 @@
<button class="btn btn-primary" type="submit">{% translate "Save" %}</button>
</form>

{% if show_system_health %}
<div class="mb-3">
<h3>
{% translate "System health" %}
</h3>
<div class="card">
<div class="card-body">
<h5 class="card-title">
{% translate "Cron job" %}
{% if last_run_periodic_call_stuck %}
<i class="fas fa-times-circle text-danger"></i>
{% else %}
<i class="fas fa-check-circle text-success"></i>
{% endif %}
</h5>
<p class="card-text">
{% blocktranslate trimmed %}
A cron job must regularly call ephios to do recurring tasks
like sending reminder emails.
{% endblocktranslate %}
<a href="https://docs.ephios.de/en/stable/admin/deployment/manual/index.html#setup-cron"
target="_blank">
<i class="fas fa-question-circle"></i>
<span class="visually-hidden">
{% translate "Learn more" %}
</span>
</a>
<br/>
{% if last_run_periodic_call == None %}
{% translate "Last run:" %}
<span class="text-danger fw-bold">
{% translate "never" %}
</span>
{% elif last_run_periodic_call_stuck %}
{% translate "Last run:" %}
<span class="text-danger fw-bold">
{{ last_run_periodic_call|naturaltime }}
</span>
{% else %}
{% translate "Last run:" %} {{ last_run_periodic_call|naturaltime }}
{% endif %}
</p>
{% if healthchecks %}

<h3>
{% translate "System health" %}
</h3>
<div class="row row-cols-1 row-cols-md-2 g-3">
{% for check, status, message in healthchecks %}
<div class="col">
<div class="card mb-1">
<div class="card-body">
<h5 class="card-title">
{{ check.name }}
{% if status == "error" %}
<i class="fas fa-times-circle text-danger"></i>
<span class="visually-hidden">{% translate "Error" %}</span>
{% elif status == "warning" %}
<i class="fas fa-exclamation-circle text-warning"></i>
<span class="visually-hidden">{% translate "Warning" %}</span>
{% elif status == "ok" %}
<i class="fas fa-check-circle text-success"></i>
<span class="visually-hidden">{% translate "OK" %}</span>
{% endif %}
</h5>
<p class="card-text">
{{ check.description }}
{% if check.documentation_link %}
<a href="{{ check.documentation_link }}" target="_blank" rel="noreferrer">
<i class="fas fa-question-circle"></i>
<span class="visually-hidden">
{% translate "Learn more" %}
</span>
</a>
{% endif %}
</p>
<p class="card-text">
{{ message }}
</p>
</div>
</div>
</div>
</div>
{% endfor %}
</div>
{% endif %}

Expand Down
4 changes: 2 additions & 2 deletions ephios/core/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
EventTypeListView,
EventTypeUpdateView,
)
from ephios.core.views.healthcheck import HealthcheckView
from ephios.core.views.healthcheck import HealthCheckView
from ephios.core.views.log import LogView
from ephios.core.views.pwa import OfflineView, PWAManifestView, ServiceWorkerView
from ephios.core.views.settings import (
Expand Down Expand Up @@ -72,7 +72,7 @@
path("manifest.json", PWAManifestView.as_view(), name="pwa_manifest"),
path("serviceworker.js", ServiceWorkerView.as_view(), name="pwa_serviceworker"),
path("offline/", OfflineView.as_view(), name="pwa_offline"),
path("healthcheck/", HealthcheckView.as_view(), name="healthcheck"),
path("healthcheck/", HealthCheckView.as_view(), name="healthcheck"),
path("events/", EventListView.as_view(), name="event_list"),
path(
"events/<int:pk>/edit/",
Expand Down
35 changes: 7 additions & 28 deletions ephios/core/views/healthcheck.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,20 @@
from django.contrib.auth.models import Permission
from django.core import cache
from django.db import Error as DjangoDBError
from django.http import HttpResponse
from django.utils.formats import date_format
from django.views import View

from ephios.core.dynamic_preferences_registry import LastRunPeriodicCall
from ephios.core.services.health.healthchecks import HealthCheckStatus, run_healthchecks


class HealthcheckView(View):
class HealthCheckView(View):
def get(self, request, *args, **kwargs):
messages = []
errors = []
# check db access
try:
Permission.objects.exists()
messages.append("DB OK")
except DjangoDBError:
errors.append("DB not available")

# check cache access
cache.cache.set("_healthcheck", "1")
if not cache.cache.get("_healthcheck") == "1":
errors.append("Cache not available")
else:
messages.append("Cache OK")

# check cronjob
if LastRunPeriodicCall.is_stuck():
if last_call := LastRunPeriodicCall.get_last_call():
errors.append(
f"Cronjob stuck, last run {date_format(last_call,format='SHORT_DATETIME_FORMAT')}"
)
for check, status, message in run_healthchecks():
text = f"{check.name}: {message}"
if status == HealthCheckStatus.OK:
messages.append(text)
else:
errors.append("Cronjob stuck, no last run")
else:
messages.append("Cronjob OK")
errors.append(text)

if errors:
return HttpResponse(
Expand Down
11 changes: 2 additions & 9 deletions ephios/core/views/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from django.views.generic import FormView, TemplateView
from dynamic_preferences.forms import global_preference_form_builder

from ephios.core.dynamic_preferences_registry import LastRunPeriodicCall
from ephios.core.forms.users import UserNotificationPreferenceForm
from ephios.core.services.health.healthchecks import run_healthchecks
from ephios.core.signals import management_settings_sections
from ephios.extra.mixins import StaffRequiredMixin

Expand Down Expand Up @@ -58,16 +58,9 @@ def get_success_url(self):

def get_context_data(self, **kwargs):
if self.request.user.is_superuser:
kwargs.update(self._get_healthcheck_context())
kwargs["healthchecks"] = list(run_healthchecks())
return super().get_context_data(**kwargs)

def _get_healthcheck_context(self):
return {
"show_system_health": True,
"last_run_periodic_call": LastRunPeriodicCall.get_last_call(),
"last_run_periodic_call_stuck": LastRunPeriodicCall.is_stuck(),
}


class PersonalDataSettingsView(LoginRequiredMixin, TemplateView):
template_name = "core/settings/settings_personal_data.html"
Expand Down
6 changes: 6 additions & 0 deletions tests/core/test_views_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,11 @@ def test_settings_calendar(django_app, volunteer):
response = django_app.get(reverse("core:settings_calendar"), user=volunteer)
calendar_url = response.html.find("input", id="calendar-url")["value"]
assert calendar_url

response = django_app.get(calendar_url, user=volunteer)
assert response


def test_settings_instance(django_app, superuser):
    """The instance settings page rendered for a superuser contains the system health heading."""
    settings_url = reverse("core:settings_instance")
    page = django_app.get(settings_url, user=superuser)
    assert "System health" in page.html.text

0 comments on commit b861866

Please sign in to comment.