From fc46185953b6bd7ffc96b80baf4c06730daac5ab Mon Sep 17 00:00:00 2001 From: Mathieu Leplatre Date: Thu, 19 Sep 2019 19:09:25 +0200 Subject: [PATCH] Fix #61: Normandy checks for uptake and reported recipes (#65) --- checks/normandy/reported_recipes.py | 46 +++++++ checks/normandy/uptake_error_rate.py | 118 ++++++++++++++++++ checks/remotesettings/uptake_error_rate.py | 19 +-- poucave/utils.py | 12 ++ .../test_normandy_uptake_error_rate.py | 89 +++++++++++++ .../checks/normandy/test_reported_recipes.py | 67 ++++++++++ .../remotesettings/test_uptake_error_rate.py | 22 +--- tests/test_utils.py | 22 +++- 8 files changed, 357 insertions(+), 38 deletions(-) create mode 100644 checks/normandy/reported_recipes.py create mode 100644 checks/normandy/uptake_error_rate.py create mode 100644 tests/checks/normandy/test_normandy_uptake_error_rate.py create mode 100644 tests/checks/normandy/test_reported_recipes.py diff --git a/checks/normandy/reported_recipes.py b/checks/normandy/reported_recipes.py new file mode 100644 index 00000000..d32efdb9 --- /dev/null +++ b/checks/normandy/reported_recipes.py @@ -0,0 +1,46 @@ +""" +Recipes available on the server should match the recipes clients are reporting +Uptake Telemetry about. + +The list of recipes for which no event was received is returned. The min/max +timestamps give the datetime range of the dataset obtained from +https://sql.telemetry.mozilla.org/queries/64921/ +""" +import aiohttp + +from poucave.typings import CheckResult +from poucave.utils import fetch_redash + + +EXPOSED_PARAMETERS = ["server"] + +REDASH_QUERY_ID = 64921 + +NORMANDY_URL = "{server}/api/v1/recipe/signed/" + + +async def run(api_key: str, server: str) -> CheckResult: + # Fetch latest results from Redash JSON API. 
+ rows = await fetch_redash(REDASH_QUERY_ID, api_key) + + min_timestamp = min(r["min_timestamp"] for r in rows) + max_timestamp = max(r["max_timestamp"] for r in rows) + + reported_recipes = set(int(r["source"].split("/")[-1]) for r in rows) + + async with aiohttp.ClientSession() as session: + # Recipes from source of truth. + normandy_url = NORMANDY_URL.format(server=server) + async with session.get(normandy_url) as response: + normandy_recipes = await response.json() + + normandy_recipes = set(r["recipe"]["id"] for r in normandy_recipes) + + missing = list(normandy_recipes - reported_recipes) + + data = { + "min_timestamp": min_timestamp, + "max_timestamp": max_timestamp, + "missing": missing, + } + return len(missing) == 0, data diff --git a/checks/normandy/uptake_error_rate.py b/checks/normandy/uptake_error_rate.py new file mode 100644 index 00000000..5212a3e3 --- /dev/null +++ b/checks/normandy/uptake_error_rate.py @@ -0,0 +1,118 @@ +""" +The percentage of reported errors in Uptake Telemetry should be under the specified +maximum. + +For each recipe whose error rate is above the maximum, the total number of events +for each status is returned. The min/max timestamps give the datetime range of the +dataset obtained from https://sql.telemetry.mozilla.org/queries/64921/ +""" +from collections import defaultdict +from typing import Dict, List + +from poucave.typings import CheckResult +from poucave.utils import fetch_redash + + +EXPOSED_PARAMETERS = ["max_error_percentage", "min_total_events"] + +REDASH_QUERY_ID = 64921 + +# Normandy uses the Uptake telemetry statuses in a specific way. 
# See https://searchfox.org/mozilla-central/rev/4218cb868d8deed13e902718ba2595d85e12b86b/toolkit/components/normandy/lib/Uptake.jsm#23-43
+NORMANDY_STATUSES = {
+ "custom_1_error": "recipe_action_disabled",
+ "backoff": "recipe_didnt_match_filter",
+ "apply_error": "recipe_execution_error",
+ "content_error": "recipe_filter_broken",
+ "download_error": "recipe_invalid_action",
+ "signature_error": "runner_invalid_signature",
+}
+UPTAKE_STATUSES = {v: k for k, v in NORMANDY_STATUSES.items()}
+
+
+def sort_dict_desc(d, key):
+ return dict(sorted(d.items(), key=key, reverse=True))
+
+
+async def run(
+ api_key: str,
+ max_error_percentage: float,
+ min_total_events: int = 100,
+ ignore_status: List[str] = [],
+) -> CheckResult:
+ # Ignored statuses are specified using the Normandy ones.
+ ignored_status = [UPTAKE_STATUSES.get(s, s) for s in ignore_status]
+ # A client reporting that recipe didn't match filter is not an error.
+ ignored_status += [UPTAKE_STATUSES["recipe_didnt_match_filter"]]
+
+ # Fetch latest results from Redash JSON API.
+ rows = await fetch_redash(REDASH_QUERY_ID, api_key)
+
+ min_timestamp = min(r["min_timestamp"] for r in rows)
+ max_timestamp = max(r["max_timestamp"] for r in rows)
+
+ by_collection: Dict[str, Dict[str, int]] = defaultdict(dict)
+ for row in rows:
+ rid = int(row["source"].split("/")[-1])
+ by_collection[rid][row["status"]] = row["total"]
+
+ error_rates = {}
+ for rid, all_statuses in by_collection.items():
+ total_statuses = sum(total for status, total in all_statuses.items())
+
+ # Ignore uptake Telemetry of a certain recipe if the total of collected
+ # events is too small. 
+ if total_statuses < min_total_events: + continue + + statuses = { + NORMANDY_STATUSES.get(status, status): total + for status, total in all_statuses.items() + if status not in ignored_status + } + ignored = { + NORMANDY_STATUSES.get(status, status): total + for status, total in all_statuses.items() + if status in ignored_status + } + total_errors = sum( + total for status, total in statuses.items() if status.endswith("_error") + ) + error_rate = round(total_errors * 100 / total_statuses, 2) + + if error_rate < max_error_percentage: + continue + + error_rates[rid] = { + "error_rate": error_rate, + "statuses": sort_dict_desc(statuses, key=lambda item: item[1]), + "ignored": sort_dict_desc(ignored, key=lambda item: item[1]), + } + + sort_by_rate = sort_dict_desc(error_rates, key=lambda item: item[1]["error_rate"]) + + data = { + "recipes": sort_by_rate, + "min_timestamp": min_timestamp, + "max_timestamp": max_timestamp, + } + """ + { + "recipes": { + 532: { + "error_rate": 60.4, + "statuses": { + "recipe_execution_error": 56, + "success": 35, + "action_post_execution_error": 5 + }, + "ignored": { + "recipe_didnt_match_filter": 5 + } + } + }, + "min_timestamp": "2019-09-19T03:47:42.773", + "max_timestamp": "2019-09-19T09:43:26.083" + } + """ + return len(sort_by_rate) == 0, data diff --git a/checks/remotesettings/uptake_error_rate.py b/checks/remotesettings/uptake_error_rate.py index 6940eb28..8559bded 100644 --- a/checks/remotesettings/uptake_error_rate.py +++ b/checks/remotesettings/uptake_error_rate.py @@ -10,31 +10,18 @@ from typing import Dict, List from poucave.typings import CheckResult -from poucave.utils import fetch_json +from poucave.utils import fetch_redash EXPOSED_PARAMETERS = ["max_error_percentage", "min_total_events"] -REDASH_URI = ( - f"https://sql.telemetry.mozilla.org/api/queries/64808/results.json?api_key=" -) +REDASH_QUERY_ID = 64808 def sort_dict_desc(d, key): return dict(sorted(d.items(), key=key, reverse=True)) -async def 
fetch_redash(api_key): - redash_uri = REDASH_URI + api_key - - body = await fetch_json(redash_uri) - - query_result = body["query_result"] - data = query_result["data"] - rows = data["rows"] - return rows - - async def run( api_key: str, max_error_percentage: float, @@ -42,7 +29,7 @@ async def run( ignore_status: List[str] = [], ) -> CheckResult: # Fetch latest results from Redash JSON API. - rows = await fetch_redash(api_key) + rows = await fetch_redash(REDASH_QUERY_ID, api_key) min_timestamp = min(r["min_timestamp"] for r in rows) max_timestamp = max(r["max_timestamp"] for r in rows) diff --git a/poucave/utils.py b/poucave/utils.py index c9e7227e..c3d7ea5d 100644 --- a/poucave/utils.py +++ b/poucave/utils.py @@ -37,6 +37,18 @@ def get(self, key: str) -> Optional[Any]: return None +REDASH_URI = "https://sql.telemetry.mozilla.org/api/queries/{}/results.json?api_key={}" + + +async def fetch_redash(query_id, api_key): + redash_uri = REDASH_URI.format(query_id, api_key) + body = await fetch_json(redash_uri) + query_result = body["query_result"] + data = query_result["data"] + rows = data["rows"] + return rows + + retry_decorator = backoff.on_exception( backoff.expo, (aiohttp.ClientError, asyncio.TimeoutError), diff --git a/tests/checks/normandy/test_normandy_uptake_error_rate.py b/tests/checks/normandy/test_normandy_uptake_error_rate.py new file mode 100644 index 00000000..42b4305d --- /dev/null +++ b/tests/checks/normandy/test_normandy_uptake_error_rate.py @@ -0,0 +1,89 @@ +from checks.normandy.uptake_error_rate import run +from tests.utils import patch_async + + +MODULE = "checks.normandy.uptake_error_rate" + +FAKE_ROWS = [ + { + "status": "success", + "source": "normandy/recipe/123", + "total": 20000, + "min_timestamp": "2019-09-16T02:36:12.348", + "max_timestamp": "2019-09-16T06:24:58.741", + }, + { + "status": "apply_error", + "source": "normandy/recipe/123", + "total": 15000, + "min_timestamp": "2019-09-16T03:36:12.348", + "max_timestamp": 
"2019-09-16T05:24:58.741", + }, + { + "status": "backoff", + "source": "normandy/recipe/123", + "total": 5000, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + }, +] + + +async def test_positive(): + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run(api_key="", max_error_percentage=100.0) + + assert status is True + assert data == { + "recipes": {}, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } + + +async def test_negative(): + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run(api_key="", max_error_percentage=0.1) + + assert status is False + assert data == { + "recipes": { + 123: { + "error_rate": 37.5, + "statuses": {"success": 20000, "recipe_execution_error": 15000}, + "ignored": {"recipe_didnt_match_filter": 5000}, + } + }, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } + + +async def test_ignore_status(): + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run( + api_key="", + max_error_percentage=0.1, + ignore_status=["recipe_execution_error"], + ) + + assert status is True + assert data == { + "recipes": {}, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } + + +async def test_min_total_events(): + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run( + api_key="", max_error_percentage=0.1, min_total_events=40001 + ) + + assert status is True + assert data == { + "recipes": {}, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } diff --git a/tests/checks/normandy/test_reported_recipes.py b/tests/checks/normandy/test_reported_recipes.py new file mode 100644 index 00000000..3b728f0b --- /dev/null +++ b/tests/checks/normandy/test_reported_recipes.py @@ -0,0 
+1,67 @@ +from checks.normandy.reported_recipes import run +from tests.utils import patch_async + + +MODULE = "checks.normandy.reported_recipes" + +FAKE_ROWS = [ + { + "status": "success", + "source": "normandy/recipe/123", + "total": 20000, + "min_timestamp": "2019-09-16T02:36:12.348", + "max_timestamp": "2019-09-16T06:24:58.741", + }, + { + "status": "backoff", + "source": "normandy/recipe/456", + "total": 15000, + "min_timestamp": "2019-09-16T03:36:12.348", + "max_timestamp": "2019-09-16T05:24:58.741", + }, + { + "status": "apply_error", + "source": "normandy/recipe/123", + "total": 5000, + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + }, +] + + +async def test_positive(mock_aioresponses): + mock_aioresponses.get( + "http://normandy/api/v1/recipe/signed/", + payload=[{"recipe": {"id": 123}}, {"recipe": {"id": 456}}], + ) + + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run(server="http://normandy", api_key="") + + assert status is True + assert data == { + "missing": [], + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } + + +async def test_negative(mock_aioresponses): + mock_aioresponses.get( + "http://normandy/api/v1/recipe/signed/", + payload=[ + {"recipe": {"id": 123}}, + {"recipe": {"id": 456}}, + {"recipe": {"id": 789}}, + ], + ) + + with patch_async(f"{MODULE}.fetch_redash", return_value=FAKE_ROWS): + status, data = await run(server="http://normandy", api_key="") + + assert status is False + assert data == { + "missing": [789], + "min_timestamp": "2019-09-16T01:36:12.348", + "max_timestamp": "2019-09-16T07:24:58.741", + } diff --git a/tests/checks/remotesettings/test_uptake_error_rate.py b/tests/checks/remotesettings/test_uptake_error_rate.py index bc77d59f..6fa97628 100644 --- a/tests/checks/remotesettings/test_uptake_error_rate.py +++ b/tests/checks/remotesettings/test_uptake_error_rate.py @@ -1,27 +1,7 @@ -from 
checks.remotesettings.uptake_error_rate import run, fetch_redash +from checks.remotesettings.uptake_error_rate import run from tests.utils import patch_async -async def test_fetch_redash(mock_aioresponses): - url = "https://sql.telemetry.mozilla.org/api/queries/64808/results.json?api_key=abc" - - row = { - "status": "network_error", - "source": "blocklists/addons", - "min_timestamp": "2019-09-16T01:36:12.348", - "total": 1360, - "max_timestamp": "2019-09-16T07:24:58.741", - } - - mock_aioresponses.get( - url, status=200, payload={"query_result": {"data": {"rows": [row]}}} - ) - - rows = await fetch_redash(api_key="abc") - - assert rows == [row] - - MODULE = "checks.remotesettings.uptake_error_rate" FAKE_ROWS = [ diff --git a/tests/test_utils.py b/tests/test_utils.py index c6caaca6..7ac84a27 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,4 @@ -from poucave.utils import Cache +from poucave.utils import Cache, fetch_redash def test_cache_set_get(): @@ -9,3 +9,23 @@ def test_cache_set_get(): assert cache.get("a") == 42 assert cache.get("b") is None assert cache.get("expired") is None + + +async def test_fetch_redash(mock_aioresponses): + url = "https://sql.telemetry.mozilla.org/api/queries/64921/results.json?api_key=abc" + + row = { + "status": "network_error", + "source": "normandy/recipe/123", + "min_timestamp": "2019-09-16T01:36:12.348", + "total": 333, + "max_timestamp": "2019-09-16T07:24:58.741", + } + + mock_aioresponses.get( + url, status=200, payload={"query_result": {"data": {"rows": [row]}}} + ) + + rows = await fetch_redash(query_id=64921, api_key="abc") + + assert rows == [row]