Skip to content

Commit

Permalink
Merge pull request #186 from neutrons/metrics
Browse files Browse the repository at this point in the history
Add new endpoint /metrics with relevant system monitoring metrics
  • Loading branch information
rosswhitfield authored Sep 30, 2024
2 parents b1c5715 + c766206 commit 7dc0e6b
Show file tree
Hide file tree
Showing 11 changed files with 365 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Welcome to Web Monitor's documentation!
developer/index
users/usecases/index
admin
metrics

Indices and tables
==================
Expand Down
11 changes: 11 additions & 0 deletions docs/metrics.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Metrics
=======

To assist in monitoring, a collection of metrics is gathered and returned as JSON.

* https://monitor.sns.gov/metrics/ (for all the data at once)
* https://monitor.sns.gov/metrics/workflow_diagnostics/ (for workflow diagnostics)
* https://monitor.sns.gov/metrics/postprocessing_diagnostics/ (for postprocessing diagnostics)
* https://monitor.sns.gov/metrics/instrument_status/ (for instrument status)
* https://monitor.sns.gov/metrics/run_statuses/ (returns the count of the current reduction statuses for all the runs started in the last hour)
* https://monitor.sns.gov/metrics/run_statuses/{n}/ (returns the count of the current reduction statuses for all the runs started in the last *n* minutes, *e.g.* https://monitor.sns.gov/metrics/run_statuses/525600/ will return the counts for the last year)
10 changes: 5 additions & 5 deletions src/webmon_app/reporting/dasmon/view_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,13 @@ def get_latest(instrument_id, key_id):
# First get it from the cache
try:
last_value = StatusCache.objects.filter(instrument_id=instrument_id, key_id=key_id).latest("timestamp")
except: # noqa: E722
except StatusCache.DoesNotExist:
# If that didn't work, get it from the table of values
values = StatusVariable.objects.filter(instrument_id=instrument_id, key_id=key_id)
# If we don't have any entry yet, just return Non
if len(values) == 0:
try:
last_value = StatusVariable.objects.filter(instrument_id=instrument_id, key_id=key_id).latest("timestamp")
except StatusVariable.DoesNotExist:
# If we don't have any entry yet, just return None
return None
last_value = values.latest("timestamp")

# Put the latest value in the cache so we don't have to go through this again
cached = StatusCache(
Expand Down
Empty file.
13 changes: 13 additions & 0 deletions src/webmon_app/reporting/metrics/urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from django.urls import path
from . import views

# Application namespace for these routes (reversed as "metrics:<name>").
app_name = "metrics"

# Routes for the JSON metrics endpoints.
# NOTE: every route begins with "/" because the project URLconf includes this
# module under the prefix "metrics" (no trailing slash), so the resulting paths
# are /metrics/, /metrics/workflow_diagnostics/, etc. Changing the slashes here
# requires changing that include() prefix at the same time.
urlpatterns = [
# All metrics combined into a single response.
path("/", views.metrics, name="metrics"),
path("/workflow_diagnostics/", views.workflow_diagnostics, name="workflow_diagnostics"),
path("/postprocessing_diagnostics/", views.postprocessing_diagnostics, name="postprocessing_diagnostics"),
path("/instrument_status/", views.instrument_status, name="instrument_status"),
# The two run_statuses routes share a name and a view; the second one passes
# the look-back window (in minutes) captured from the URL.
path("/run_statuses/", views.run_statuses, name="run_statuses"),
path("/run_statuses/<int:minutes>/", views.run_statuses, name="run_statuses"),
]
85 changes: 85 additions & 0 deletions src/webmon_app/reporting/metrics/view_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from reporting.report.models import Instrument, DataRun, WorkflowSummary, Information
from reporting.dasmon.models import Parameter, StatusCache, ActiveInstrument
from reporting.report.view_util import is_acquisition_complete
from reporting.dasmon.view_util import is_running
from django.conf import settings
from django.utils import timezone
from django.db.models import Q


def postprocessing_diagnostics():
    """Collect and return Cataloging & Reduction diagnostics.

    For each prefix in ``settings.POSTPROCESS_NODE_PREFIX``, look up every
    status parameter whose name starts with ``settings.SYSTEM_STATUS_PREFIX``
    followed by that prefix (parameters ending in ``_pid`` are excluded) and
    build one entry per parameter.

    Returns:
        list[dict]: one dict per agent. ``name`` is always present;
        ``timestamp``, ``PID``, ``last_message`` and ``last_message_timestamp``
        are included only when the corresponding record exists.

    Raises:
        Instrument.DoesNotExist: if there is no instrument named "common".
    """
    common_services = Instrument.objects.get(name="common")
    agents = []

    for node_prefix in settings.POSTPROCESS_NODE_PREFIX:
        params = Parameter.objects.filter(
            ~Q(name__endswith="_pid"), name__startswith=settings.SYSTEM_STATUS_PREFIX + node_prefix
        )
        for param in params:
            # Retain only the node name
            node = param.name.removeprefix(settings.SYSTEM_STATUS_PREFIX)
            info = {"name": node}

            # A parameter may have no cached value; treat the timestamp as
            # optional (like PID and last_message below) so that one missing
            # row does not fail the whole report.
            try:
                value = StatusCache.objects.filter(instrument_id=common_services, key_id=param).latest("timestamp")
            except StatusCache.DoesNotExist:
                pass
            else:
                info["timestamp"] = value.timestamp

            # The PID is stored under a companion "<name>_pid" parameter
            try:
                pid = Parameter.objects.get(name=param.name + "_pid")
                info["PID"] = (
                    StatusCache.objects.filter(instrument_id=common_services, key_id=pid).latest("timestamp").value
                )
            except (Parameter.DoesNotExist, StatusCache.DoesNotExist):
                pass

            # Most recent Information record whose description is this node
            try:
                last_status = Information.objects.filter(description=node).latest("id")
                info["last_message"] = str(last_status.run_status_id)
                info["last_message_timestamp"] = last_status.run_status_id.created_on
            except Information.DoesNotExist:
                pass

            agents.append(info)

    return agents


def instrument_status():
    """Map each live instrument's name to its running status.

    Instruments are visited in name order; those for which
    ``ActiveInstrument.objects.is_alive`` is false are left out of the result.
    """
    return {
        str(instrument): is_running(instrument)
        for instrument in Instrument.objects.all().order_by("name")
        if ActiveInstrument.objects.is_alive(instrument)
    }


def run_statuses(minutes=60):
    """Count the reduction statuses of recently created runs.

    Args:
        minutes: size of the look-back window, in minutes, measured from now.

    Returns:
        dict: the keys ``total``, ``acquiring``, ``incomplete``, ``complete``,
        ``error`` and ``unknown``, each mapped to a count of runs created
        within the window. ``unknown`` counts runs without a WorkflowSummary.
    """
    # Use the standard-library timedelta directly rather than reaching it
    # through django.utils.timezone, which does not export it publicly.
    from datetime import timedelta

    cutoff = timezone.now() - timedelta(minutes=minutes)
    runs = DataRun.objects.filter(created_on__gte=cutoff).order_by("created_on")

    # len() evaluates the queryset; the loop below reuses the fetched rows.
    statuses = {"total": len(runs), "acquiring": 0, "incomplete": 0, "complete": 0, "error": 0, "unknown": 0}

    for run_id in runs:
        try:
            s = WorkflowSummary.objects.get(run_id=run_id)
        except WorkflowSummary.DoesNotExist:
            statuses["unknown"] += 1
            continue

        # Order matters: a run still acquiring is reported as such regardless
        # of what its summary or error state says.
        if not is_acquisition_complete(run_id):
            statuses["acquiring"] += 1
        elif s.complete:
            statuses["complete"] += 1
        elif run_id.last_error() is None:
            statuses["incomplete"] += 1
        else:
            statuses["error"] += 1

    return statuses
41 changes: 41 additions & 0 deletions src/webmon_app/reporting/metrics/views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from django.http import JsonResponse
from django.conf import settings
from django.views.decorators.cache import cache_page
import reporting.users.view_util as users_view_util
import reporting.dasmon.view_util as dasmon_view_util
from . import view_util


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def metrics(request):
    """Return every metric group in a single JSON object.

    The response is cached for ``settings.FAST_PAGE_CACHE_TIMEOUT``.
    """
    # Keys are listed in the order the collectors are called.
    payload = {
        "workflow_diagnostics": dasmon_view_util.workflow_diagnostics(),
        "postprocessing_diagnostics": view_util.postprocessing_diagnostics(),
        "instrument_status": view_util.instrument_status(),
        "run_statuses": view_util.run_statuses(),
    }
    return JsonResponse(payload)


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def workflow_diagnostics(request):
    """Return the workflow diagnostics as JSON.

    The response is cached for ``settings.FAST_PAGE_CACHE_TIMEOUT``.
    """
    diagnostics = dasmon_view_util.workflow_diagnostics()
    return JsonResponse(diagnostics)


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def postprocessing_diagnostics(request):
    """Return the post-processing diagnostics as JSON.

    The response is cached for ``settings.FAST_PAGE_CACHE_TIMEOUT``.
    """
    agents = view_util.postprocessing_diagnostics()
    # safe=False: the payload is serialized with JsonResponse's dict-only
    # check disabled.
    return JsonResponse(agents, safe=False)


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def instrument_status(request):
    """Return the instrument status map as JSON.

    The response is cached for ``settings.FAST_PAGE_CACHE_TIMEOUT``.
    """
    status_by_instrument = view_util.instrument_status()
    return JsonResponse(status_by_instrument)


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def run_statuses(request, minutes=60):
    """Return run status counts for the last ``minutes`` minutes as JSON.

    ``minutes`` defaults to 60 when the URL does not supply a value. The
    response is cached for ``settings.FAST_PAGE_CACHE_TIMEOUT``.
    """
    counts = view_util.run_statuses(minutes)
    return JsonResponse(counts)
6 changes: 3 additions & 3 deletions src/webmon_app/reporting/report/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,16 +114,16 @@ def summary(request):
adara_start = datetime.datetime(2012, 10, 1).replace(tzinfo=timezone.get_current_timezone())
today = datetime.datetime.today().replace(tzinfo=timezone.get_current_timezone())
# Fill in the partial data for the current month
runs = DataRun.objects.filter(created_on__gte=max_date)
number_of_runs = DataRun.objects.filter(created_on__gte=max_date).count()
run_rate = []
run_summary = [
{
"min_date": max_date,
"max_date": datetime.datetime.today(),
"number_of_runs": len(runs),
"number_of_runs": number_of_runs,
}
]
run_rate.append([1000 * int((today - epoch).total_seconds()), len(runs)])
run_rate.append([1000 * int((today - epoch).total_seconds()), number_of_runs])
while True:
# Make sure we don't display zeros for the period before
# the system was installed
Expand Down
1 change: 1 addition & 0 deletions src/webmon_app/reporting/reporting_app/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def validate_ldap_settings(server_uri, user_dn_template):
"reporting.dasmon",
"reporting.pvmon",
"reporting.reduction",
"reporting.metrics",
"health_check",
"health_check.db",
"health_check.cache",
Expand Down
1 change: 1 addition & 0 deletions src/webmon_app/reporting/reporting_app/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
path("reduction/", include("reporting.reduction.urls", namespace="reduction")),
path("pvmon/", include("reporting.pvmon.urls", namespace="pvmon")),
path("users/", include("reporting.users.urls", namespace="users")),
path("metrics", include("reporting.metrics.urls", namespace="metrics")),
path("database/", admin.site.urls),
path("ht/", include("health_check.urls")),
]
Loading

0 comments on commit 7dc0e6b

Please sign in to comment.