Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new endpoint /metrics with relevant system monitoring metrics #186

Merged
merged 3 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Welcome to Web Monitor's documentation!
developer/index
users/usecases/index
admin
metrics

Indices and tables
==================
Expand Down
11 changes: 11 additions & 0 deletions docs/metrics.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Metrics
=======

To assist in monitoring, a collection of metrics is gathered and returned as JSON.

* https://monitor.sns.gov/metrics/ (for all the data at once)
* https://monitor.sns.gov/metrics/workflow_diagnostics/ (for workflow diagnostics)
* https://monitor.sns.gov/metrics/postprocessing_diagnostics/ (for postprocessing diagnostics)
* https://monitor.sns.gov/metrics/instrument_status/ (for instrument status)
* https://monitor.sns.gov/metrics/run_statuses/ (returns the count of the current reduction statuses for all the runs started in the last hour)
* https://monitor.sns.gov/metrics/run_statuses/{n}/ (returns the count of the current reduction statuses for all the runs started in the last *n* minutes, *e.g.* https://monitor.sns.gov/metrics/run_statuses/525600/ will return the last year)
10 changes: 5 additions & 5 deletions src/webmon_app/reporting/dasmon/view_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,13 @@ def get_latest(instrument_id, key_id):
# First get it from the cache
try:
last_value = StatusCache.objects.filter(instrument_id=instrument_id, key_id=key_id).latest("timestamp")
except: # noqa: E722
except StatusCache.DoesNotExist:
# If that didn't work, get it from the table of values
values = StatusVariable.objects.filter(instrument_id=instrument_id, key_id=key_id)
# If we don't have any entry yet, just return Non
if len(values) == 0:
try:
last_value = StatusVariable.objects.filter(instrument_id=instrument_id, key_id=key_id).latest("timestamp")
except StatusVariable.DoesNotExist:
# If we don't have any entry yet, just return None
return None
last_value = values.latest("timestamp")

# Put the latest value in the cache so we don't have to go through this again
cached = StatusCache(
Expand Down
Empty file.
13 changes: 13 additions & 0 deletions src/webmon_app/reporting/metrics/urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from django.urls import path
from . import views

app_name = "metrics"

# Routes for the "metrics" app.
#
# Every route below starts with "/" because the project-level URLconf mounts
# this module with path("metrics", include(...)), i.e. a prefix WITHOUT a
# trailing slash. Prefix + route therefore yields "metrics/", "metrics/run_statuses/", ...
# NOTE(review): Django's urls.W002 system check warns about routes beginning
# with "/"; the conventional layout is a "metrics/" prefix with slash-less
# routes here. Both files would have to change together.
urlpatterns = [
# All sections combined in a single JSON document.
path("/", views.metrics, name="metrics"),
path("/workflow_diagnostics/", views.workflow_diagnostics, name="workflow_diagnostics"),
path("/postprocessing_diagnostics/", views.postprocessing_diagnostics, name="postprocessing_diagnostics"),
path("/instrument_status/", views.instrument_status, name="instrument_status"),
# Two routes share the name "run_statuses": the first relies on the view's
# default window, the second takes the window length in minutes from the URL.
path("/run_statuses/", views.run_statuses, name="run_statuses"),
path("/run_statuses/<int:minutes>/", views.run_statuses, name="run_statuses"),
]
83 changes: 83 additions & 0 deletions src/webmon_app/reporting/metrics/view_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from reporting.report.models import Instrument, DataRun, WorkflowSummary, Information
from reporting.dasmon.models import Parameter, StatusCache, ActiveInstrument
from reporting.report.view_util import is_acquisition_complete
from reporting.dasmon.view_util import is_running
from django.conf import settings
from django.utils import timezone
from django.db.models import Q


def postprocessing_diagnostics():
    """Collect status information for every post-processing node.

    For each prefix in ``settings.POSTPROCESS_NODE_PREFIX`` the matching
    status parameters (excluding the ``*_pid`` ones) are looked up and one
    dictionary per parameter is produced.

    Returns:
        list[dict]: one entry per node. ``"name"`` is always present; the
        keys ``"timestamp"``, ``"PID"``, ``"last_message"`` and
        ``"last_message_timestamp"`` are only present when the corresponding
        record exists.

    Raises:
        Instrument.DoesNotExist: if there is no instrument named ``"common"``.
    """
    common_services = Instrument.objects.get(name="common")
    agents = []

    for node_prefix in settings.POSTPROCESS_NODE_PREFIX:
        params = Parameter.objects.filter(
            ~Q(name__endswith="_pid"), name__startswith=settings.SYSTEM_STATUS_PREFIX + node_prefix
        )
        for param in params:
            node = param.name.removeprefix(settings.SYSTEM_STATUS_PREFIX)
            info = {"name": node}

            # A parameter can exist without any cached status value yet.
            # Skip the timestamp rather than failing the whole report, the
            # same way the PID lookup below tolerates missing rows.
            try:
                value = StatusCache.objects.filter(instrument_id=common_services, key_id=param).latest("timestamp")
            except StatusCache.DoesNotExist:
                pass
            else:
                info["timestamp"] = value.timestamp

            # The PID is stored under a sibling parameter named "<name>_pid".
            try:
                pid = Parameter.objects.get(name=param.name + "_pid")
                info["PID"] = (
                    StatusCache.objects.filter(instrument_id=common_services, key_id=pid).latest("timestamp").value
                )
            except (Parameter.DoesNotExist, StatusCache.DoesNotExist):
                pass

            # Most recent Information record whose description is this node.
            try:
                last_status = Information.objects.filter(description=node).latest("id")
                info["last_message"] = str(last_status.run_status_id)
                info["last_message_timestamp"] = last_status.run_status_id.created_on
            except Information.DoesNotExist:
                pass

            agents.append(info)

    return agents


def instrument_status():
    """Map each live instrument's name to its running status.

    Instruments are visited in name order; those for which
    ``ActiveInstrument.objects.is_alive`` is falsy are left out.

    Returns:
        dict: ``str(instrument)`` -> result of ``is_running(instrument)``.
    """
    return {
        str(instrument): is_running(instrument)
        for instrument in Instrument.objects.all().order_by("name")
        if ActiveInstrument.objects.is_alive(instrument)
    }


def run_statuses(minutes=60):
    """Count the reduction statuses of the runs created in the last *minutes*.

    Args:
        minutes: length of the look-back window, in minutes.

    Returns:
        dict: ``"count"`` is the total number of runs in the window;
        ``"acquiring"``, ``"complete"``, ``"incomplete"`` and ``"error"``
        are the per-status tallies. Runs without a ``WorkflowSummary`` are
        included in ``"count"`` but in none of the tallies.
    """
    # Use the stdlib timedelta directly: ``timezone.timedelta`` only works
    # because django.utils.timezone happens to import it internally.
    import datetime

    window_start = timezone.now() - datetime.timedelta(minutes=minutes)
    runs = DataRun.objects.filter(created_on__gte=window_start).order_by("created_on")

    statuses = {"count": len(runs), "acquiring": 0, "incomplete": 0, "complete": 0, "error": 0}

    for run_id in runs:
        try:
            s = WorkflowSummary.objects.get(run_id=run_id)
        except WorkflowSummary.DoesNotExist:
            # No summary for this run: nothing to classify.
            continue

        if not is_acquisition_complete(run_id):
            statuses["acquiring"] += 1
        elif s.complete:
            statuses["complete"] += 1
        elif run_id.last_error() is None:
            statuses["incomplete"] += 1
        else:
            statuses["error"] += 1

    return statuses
41 changes: 41 additions & 0 deletions src/webmon_app/reporting/metrics/views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from django.http import JsonResponse
from django.conf import settings
from django.views.decorators.cache import cache_page
import reporting.users.view_util as users_view_util
import reporting.dasmon.view_util as dasmon_view_util
from . import view_util


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def metrics(request):
    """Return all metrics sections combined into a single JSON object."""
    # Dict literal values are evaluated top to bottom, so the helpers run in
    # the same order as the keys appear.
    payload = {
        "workflow_diagnostics": dasmon_view_util.workflow_diagnostics(),
        "postprocessing_diagnostics": view_util.postprocessing_diagnostics(),
        "instrument_status": view_util.instrument_status(),
        "run_statuses": view_util.run_statuses(),
    }
    return JsonResponse(payload)


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def workflow_diagnostics(request):
    """Return the workflow diagnostics from the dasmon utilities as JSON."""
    diagnostics = dasmon_view_util.workflow_diagnostics()
    return JsonResponse(diagnostics)


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def postprocessing_diagnostics(request):
    """Return the post-processing node diagnostics as a JSON array."""
    agents = view_util.postprocessing_diagnostics()
    # The payload is a list, which JsonResponse only serializes with safe=False.
    return JsonResponse(agents, safe=False)


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def instrument_status(request):
    """Return the instrument-name to status mapping as JSON."""
    status_by_instrument = view_util.instrument_status()
    return JsonResponse(status_by_instrument)


@users_view_util.login_or_local_required_401
@cache_page(settings.FAST_PAGE_CACHE_TIMEOUT)
def run_statuses(request, minutes=60):
    """Return the run status counts for the last *minutes* minutes as JSON.

    ``minutes`` comes from the URL when present and defaults to 60.
    """
    counts = view_util.run_statuses(minutes)
    return JsonResponse(counts)
6 changes: 3 additions & 3 deletions src/webmon_app/reporting/report/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,16 +114,16 @@ def summary(request):
adara_start = datetime.datetime(2012, 10, 1).replace(tzinfo=timezone.get_current_timezone())
today = datetime.datetime.today().replace(tzinfo=timezone.get_current_timezone())
# Fill in the partial data for the current month
runs = DataRun.objects.filter(created_on__gte=max_date)
number_of_runs = DataRun.objects.filter(created_on__gte=max_date).count()
run_rate = []
run_summary = [
{
"min_date": max_date,
"max_date": datetime.datetime.today(),
"number_of_runs": len(runs),
"number_of_runs": number_of_runs,
}
]
run_rate.append([1000 * int((today - epoch).total_seconds()), len(runs)])
run_rate.append([1000 * int((today - epoch).total_seconds()), number_of_runs])
while True:
# Make sure we don't display zeros for the period before
# the system was installed
Expand Down
1 change: 1 addition & 0 deletions src/webmon_app/reporting/reporting_app/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def validate_ldap_settings(server_uri, user_dn_template):
"reporting.dasmon",
"reporting.pvmon",
"reporting.reduction",
"reporting.metrics",
"health_check",
"health_check.db",
"health_check.cache",
Expand Down
1 change: 1 addition & 0 deletions src/webmon_app/reporting/reporting_app/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
path("reduction/", include("reporting.reduction.urls", namespace="reduction")),
path("pvmon/", include("reporting.pvmon.urls", namespace="pvmon")),
path("users/", include("reporting.users.urls", namespace="users")),
path("metrics", include("reporting.metrics.urls", namespace="metrics")),
path("database/", admin.site.urls),
path("ht/", include("health_check.urls")),
]
Loading