From fc46185953b6bd7ffc96b80baf4c06730daac5ab Mon Sep 17 00:00:00 2001 From: Mathieu Leplatre Date: Thu, 19 Sep 2019 19:09:25 +0200 Subject: [PATCH] Fix #61: Normandy checks for uptake and reported recipes (#65) --- checks/normandy/reported_recipes.py | 46 +++++++ checks/normandy/uptake_error_rate.py | 118 ++++++++++++++++++ checks/remotesettings/uptake_error_rate.py | 19 +-- poucave/utils.py | 12 ++ .../test_normandy_uptake_error_rate.py | 89 +++++++++++++ .../checks/normandy/test_reported_recipes.py | 67 ++++++++++ .../remotesettings/test_uptake_error_rate.py | 22 +--- tests/test_utils.py | 22 +++- 8 files changed, 357 insertions(+), 38 deletions(-) create mode 100644 checks/normandy/reported_recipes.py create mode 100644 checks/normandy/uptake_error_rate.py create mode 100644 tests/checks/normandy/test_normandy_uptake_error_rate.py create mode 100644 tests/checks/normandy/test_reported_recipes.py diff --git a/checks/normandy/reported_recipes.py b/checks/normandy/reported_recipes.py new file mode 100644 index 00000000..d32efdb9 --- /dev/null +++ b/checks/normandy/reported_recipes.py @@ -0,0 +1,46 @@ +""" +Recipes available on the server should match the recipes clients are reporting +Uptake Telemetry about. + +The list of recipes for which no event was received is returned. The min/max +timestamps give the datetime range of the dataset obtained from +https://sql.telemetry.mozilla.org/queries/64921/ +""" +import aiohttp + +from poucave.typings import CheckResult +from poucave.utils import fetch_redash + + +EXPOSED_PARAMETERS = ["server"] + +REDASH_QUERY_ID = 64921 + +NORMANDY_URL = "{server}/api/v1/recipe/signed/" + + +async def run(api_key: str, server: str) -> CheckResult: + # Fetch latest results from Redash JSON API. 
+ rows = await fetch_redash(REDASH_QUERY_ID, api_key) + + min_timestamp = min(r["min_timestamp"] for r in rows) + max_timestamp = max(r["max_timestamp"] for r in rows) + + reported_recipes = set(int(r["source"].split("/")[-1]) for r in rows) + + async with aiohttp.ClientSession() as session: + # Recipes from source of truth. + normandy_url = NORMANDY_URL.format(server=server) + async with session.get(normandy_url) as response: + normandy_recipes = await response.json() + + normandy_recipes = set(r["recipe"]["id"] for r in normandy_recipes) + + missing = list(normandy_recipes - reported_recipes) + + data = { + "min_timestamp": min_timestamp, + "max_timestamp": max_timestamp, + "missing": missing, + } + return len(missing) == 0, data diff --git a/checks/normandy/uptake_error_rate.py b/checks/normandy/uptake_error_rate.py new file mode 100644 index 00000000..5212a3e3 --- /dev/null +++ b/checks/normandy/uptake_error_rate.py @@ -0,0 +1,118 @@ +""" +The percentage of reported errors in Uptake Telemetry should be under the specified +maximum. + +For each recipe whose error rate is above the maximum, the total number of events +for each status is returned. The min/max timestamps give the datetime range of the +dataset obtained from https://sql.telemetry.mozilla.org/queries/64921/ +""" +from collections import defaultdict +from typing import Dict, List + +from poucave.typings import CheckResult +from poucave.utils import fetch_redash + + +EXPOSED_PARAMETERS = ["max_error_percentage", "min_total_events"] + +REDASH_QUERY_ID = 64921 + +# Normandy uses the Uptake telemetry statuses in a specific way. 
# See https://searchfox.org/mozilla-central/rev/4218cb868d8deed13e902718ba2595d85e12b86b/toolkit/components/normandy/lib/Uptake.jsm#23-43
+NORMANDY_STATUSES = {
+ "custom_1_error": "recipe_action_disabled",
+ "backoff": "recipe_didnt_match_filter",
+ "apply_error": "recipe_execution_error",
+ "content_error": "recipe_filter_broken",
+ "download_error": "recipe_invalid_action",
+ "signature_error": "runner_invalid_signature",
+}
+UPTAKE_STATUSES = {v: k for k, v in NORMANDY_STATUSES.items()}
+
+
+def sort_dict_desc(d, key):
+ return dict(sorted(d.items(), key=key, reverse=True))
+
+
+async def run(
+ api_key: str,
+ max_error_percentage: float,
+ min_total_events: int = 100,
+ ignore_status: List[str] = [],
+) -> CheckResult:
+ # Ignored statuses are specified using the Normandy ones.
+ ignored_status = [UPTAKE_STATUSES.get(s, s) for s in ignore_status]
+ # A client reporting that recipe didn't match filter is not an error.
+ ignored_status += [UPTAKE_STATUSES["recipe_didnt_match_filter"]]
+
+ # Fetch latest results from Redash JSON API.
+ rows = await fetch_redash(REDASH_QUERY_ID, api_key)
+
+ min_timestamp = min(r["min_timestamp"] for r in rows)
+ max_timestamp = max(r["max_timestamp"] for r in rows)
+
+ by_collection: Dict[str, Dict[str, int]] = defaultdict(dict)
+ for row in rows:
+ rid = int(row["source"].split("/")[-1])
+ by_collection[rid][row["status"]] = row["total"]
+
+ error_rates = {}
+ for rid, all_statuses in by_collection.items():
+ total_statuses = sum(total for status, total in all_statuses.items())
+
+ # Ignore uptake Telemetry of a certain recipe if the total of collected
+ # events is too small. 
+ if total_statuses < min_total_events: + continue + + statuses = { + NORMANDY_STATUSES.get(status, status): total + for status, total in all_statuses.items() + if status not in ignored_status + } + ignored = { + NORMANDY_STATUSES.get(status, status): total + for status, total in all_statuses.items() + if status in ignored_status + } + total_errors = sum( + total for status, total in statuses.items() if status.endswith("_error") + ) + error_rate = round(total_errors * 100 / total_statuses, 2) + + if error_rate < max_error_percentage: + continue + + error_rates[rid] = { + "error_rate": error_rate, + "statuses": sort_dict_desc(statuses, key=lambda item: item[1]), + "ignored": sort_dict_desc(ignored, key=lambda item: item[1]), + } + + sort_by_rate = sort_dict_desc(error_rates, key=lambda item: item[1]["error_rate"]) + + data = { + "recipes": sort_by_rate, + "min_timestamp": min_timestamp, + "max_timestamp": max_timestamp, + } + """ + { + "recipes": { + 532: { + "error_rate": 60.4, + "statuses": { + "recipe_execution_error": 56, + "success": 35, + "action_post_execution_error": 5 + }, + "ignored": { + "recipe_didnt_match_filter": 5 + } + } + }, + "min_timestamp": "2019-09-19T03:47:42.773", + "max_timestamp": "2019-09-19T09:43:26.083" + } + """ + return len(sort_by_rate) == 0, data diff --git a/checks/remotesettings/uptake_error_rate.py b/checks/remotesettings/uptake_error_rate.py index 6940eb28..8559bded 100644 --- a/checks/remotesettings/uptake_error_rate.py +++ b/checks/remotesettings/uptake_error_rate.py @@ -10,31 +10,18 @@ from typing import Dict, List from poucave.typings import CheckResult -from poucave.utils import fetch_json +from poucave.utils import fetch_redash EXPOSED_PARAMETERS = ["max_error_percentage", "min_total_events"] -REDASH_URI = ( - f"https://sql.telemetry.mozilla.org/api/queries/64808/results.json?api_key=" -) +REDASH_QUERY_ID = 64808 def sort_dict_desc(d, key): return dict(sorted(d.items(), key=key, reverse=True)) -async def 
fetch_redash(api_key): - redash_uri = REDASH_URI + api_key - - body = await fetch_json(redash_uri) - - query_result = body["query_result"] - data = query_result["data"] - rows = data["rows"] - return rows - - async def run( api_key: str, max_error_percentage: float, @@ -42,7 +29,7 @@ async def run( ignore_status: List[str] = [], ) -> CheckResult: # Fetch latest results from Redash JSON API. - rows = await fetch_redash(api_key) + rows = await fetch_redash(REDASH_QUERY_ID, api_key) min_timestamp = min(r["min_timestamp"] for r in rows) max_timestamp = max(r["max_timestamp"] for r in rows) diff --git a/poucave/utils.py b/poucave/utils.py index c9e7227e..c3d7ea5d 100644 --- a/poucave/utils.py +++ b/poucave/utils.py @@ -37,6 +37,18 @@ def get(self, key: str) -> Optional[Any]: return None +REDASH_URI = "https://sql.telemetry.mozilla.org/api/queries/{}/results.json?api_key={}" + + +async def fetch_redash(query_id, api_key): + redash_uri = REDASH_URI.format(query_id, api_key) + body = await fetch_json(redash_uri) + query_result = body["query_result"] + data = query_result["data"] + rows = data["rows"] + return rows + + retry_decorator = backoff.on_exception( backoff.expo, (aiohttp.ClientError, asyncio.TimeoutError), diff --git a/tests/checks/normandy/test_normandy_uptake_error_rate.py b/tests/checks/normandy/test_normandy_uptake_error_rate.py new file mode 100644 index 00000000..42b4305d --- /dev/null +++ b/tests/checks/normandy/test_normandy_uptake_error_rate.py @@ -0,0 +1,89 @@ +from checks.normandy.uptake_error_rate import run +from tests.utils import patch_async + + +MODULE = "checks.normandy.uptake_error_rate" + +FAKE_ROWS = [ + { + "status": "success", + "source": "normandy/recipe/123", + "total": 20000, + "min_timestamp": "2019-09-16T02:36:12.348", + "max_timestamp": "2019-09-16T06:24:58.741", + }, + { + "status": "apply_error", + "source": "normandy/recipe/123", + "total": 15000, + "min_timestamp": "2019-09-16T03:36:12.348", + "max_timestamp": 
"2019-09-16T05:24:58.741", + }, + { + "status": "backoff", + "source": "normandy/recipe/123", + "total": 5000, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + }, +] + + +async def test_positive(): + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run(api_key="", max_error_percentage=100.0) + + assert status is True + assert data == { + "recipes": {}, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } + + +async def test_negative(): + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run(api_key="", max_error_percentage=0.1) + + assert status is False + assert data == { + "recipes": { + 123: { + "error_rate": 37.5, + "statuses": {"success": 20000, "recipe_execution_error": 15000}, + "ignored": {"recipe_didnt_match_filter": 5000}, + } + }, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } + + +async def test_ignore_status(): + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run( + api_key="", + max_error_percentage=0.1, + ignore_status=["recipe_execution_error"], + ) + + assert status is True + assert data == { + "recipes": {}, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } + + +async def test_min_total_events(): + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run( + api_key="", max_error_percentage=0.1, min_total_events=40001 + ) + + assert status is True + assert data == { + "recipes": {}, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } diff --git a/tests/checks/normandy/test_reported_recipes.py b/tests/checks/normandy/test_reported_recipes.py new file mode 100644 index 00000000..3b728f0b --- /dev/null +++ b/tests/checks/normandy/test_reported_recipes.py @@ -0,0 
+1,67 @@ +from checks.normandy.reported_recipes import run +from tests.utils import patch_async + + +MODULE = "checks.normandy.reported_recipes" + +FAKE_ROWS = [ + { + "status": "success", + "source": "normandy/recipe/123", + "total": 20000, + "min_timestamp": "2019-09-16T02:36:12.348", + "max_timestamp": "2019-09-16T06:24:58.741", + }, + { + "status": "backoff", + "source": "normandy/recipe/456", + "total": 15000, + "min_timestamp": "2019-09-16T03:36:12.348", + "max_timestamp": "2019-09-16T05:24:58.741", + }, + { + "status": "apply_error", + "source": "normandy/recipe/123", + "total": 5000, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + }, +] + + +async def test_positive(mock_aioresponses): + mock_aioresponses.get( + "http://normandy/api/v1/recipe/signed/", + payload=[{"recipe": {"id": 123}}, {"recipe": {"id": 456}}], + ) + + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run(server="http://normandy", api_key="") + + assert status is True + assert data == { + "missing": [], + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } + + +async def test_negative(mock_aioresponses): + mock_aioresponses.get( + "http://normandy/api/v1/recipe/signed/", + payload=[ + {"recipe": {"id": 123}}, + {"recipe": {"id": 456}}, + {"recipe": {"id": 789}}, + ], + ) + + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run(server="http://normandy", api_key="") + + assert status is False + assert data == { + "missing": [789], + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } diff --git a/tests/checks/remotesettings/test_uptake_error_rate.py b/tests/checks/remotesettings/test_uptake_error_rate.py index bc77d59f..6fa97628 100644 --- a/tests/checks/remotesettings/test_uptake_error_rate.py +++ b/tests/checks/remotesettings/test_uptake_error_rate.py @@ -1,27 +1,7 @@ -from 
checks.remotesettings.uptake_error_rate import run, fetch_redash +from checks.remotesettings.uptake_error_rate import run from tests.utils import patch_async -async def test_fetch_redash(mock_aioresponses): - url = "https://sql.telemetry.mozilla.org/api/queries/64808/results.json?api_key=abc" - - row = { - "status": "network_error", - "source": "blocklists/addons", - "min_timestamp": "2019-09-16T01:36:12.348", - "total": 1360, - "max_timestamp": "2019-09-16T07:24:58.741", - } - - mock_aioresponses.get( - url, status=200, payload={"query_result": {"data": {"rows": [row]}}} - ) - - rows = await fetch_redash(api_key="abc") - - assert rows == [row] - - MODULE = "checks.remotesettings.uptake_error_rate" FAKE_ROWS = [ diff --git a/tests/test_utils.py b/tests/test_utils.py index c6caaca6..7ac84a27 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,4 @@ -from poucave.utils import Cache +from poucave.utils import Cache, fetch_redash def test_cache_set_get(): @@ -9,3 +9,23 @@ def test_cache_set_get(): assert cache.get("a") == 42 assert cache.get("b") is None assert cache.get("expired") is None + + +async def test_fetch_redash(mock_aioresponses): + url = "https://sql.telemetry.mozilla.org/api/queries/64921/results.json?api_key=abc" + + row = { + "status": "network_error", + "source": "normandy/recipe/123", + "min_timestamp": "2019-09-16T01:36:12.348", + "total": 333, + "max_timestamp": "2019-09-16T07:24:58.741", + } + + mock_aioresponses.get( + url, status=200, payload={"query_result": {"data": {"rows": [row]}}} + ) + + rows = await fetch_redash(query_id=64921, api_key="abc") + + assert rows == [row]