|
| 1 | +#! /usr/bin/env python3 |
| 2 | + |
| 3 | +import click |
| 4 | +import logging |
| 5 | +import os |
| 6 | +import requests |
| 7 | +import time |
| 8 | +import sys |
| 9 | + |
class SyntheticTest:
    """Identifying attributes of a Datadog synthetic test plus the state of its run."""

    def __init__(self, name, public_id):
        # Human-readable test name, as it appears in the Datadog UI.
        self.name = name
        # Public test ID assigned by Datadog when the test was created.
        self.public_id = public_id
        # Run ID assigned by Datadog when this test is triggered; None until then.
        self.test_run_id = None
        # True/False once the run completes; None while the result is unknown.
        self.success = None
| 19 | + |
class DatadogClient:
    '''
    Client class to invoke the Datadog API to run and monitor synthetic tests.

    Usage: construct with API credentials, call trigger_synthetic_tests() with the
    tests to report on, optionally gate on the deployment-testing enable switch,
    then collect results via get_and_record_test_results() / get_failed_tests().
    '''

    DATADOG_SYNTHETIC_TESTS_API_URL = "https://api.datadoghq.com/api/v1/synthetics/tests"
    MAX_ALLOWABLE_TIME_SECS = 600  # 10 minutes, measured from batch trigger time

    # Designated test whose pass/fail result acts as an on/off switch for CI/CD
    # synthetic testing (see gate_on_deployment_testing_enable_switch below).
    DEPLOYMENT_TESTING_ENABLED_SWITCH = SyntheticTest(
        '''
        Deployment testing enable test governing CI/CD synthetic testing
        ''',
        "sad-hqu-h33"
    )

    def __init__(self, api_key, app_key):
        self.api_key = api_key  # Datadog API key (sent as DD-API-KEY header)
        self.app_key = app_key  # Datadog application key (sent as DD-APPLICATION-KEY header)
        self.test_batch_id = None  # A 'batch' is a set of tests intended to be run in parallel
        self.trigger_time = None  # The system time at which a batch's execution was requested
        self.tests_by_public_id = {}  # Maps Datadog test public ID to all info we have for a specific test

    def trigger_synthetic_tests(self, tests_to_report: "list[SyntheticTest]"):
        '''
        Trigger running of a batch of synthetic tests.

        :param tests_to_report: List of tests to run and report on
        :return: None, but saves test info including batch ID and test run IDs in 'self'
        :raises Exception: if the Datadog API call fails or returns an unexpected body
        '''
        # Note that the list of tests to be run is one longer than the list of tests to be reported on.
        # The extra test is the so-called "deployment testing enable switch test".
        # That test should be modified via the Datadog UI to either always pass or always fail, depending
        # on whether synthetic testing is to be enabled at runtime or not, respectively.
        # While the test's result does affect how the pipeline operates, the result is not treated as reportable.
        tests_to_run = [self.DEPLOYMENT_TESTING_ENABLED_SWITCH] + tests_to_report
        self._record_requested_test_particulars(tests_to_run)
        self.trigger_time = time.time()  # Key timeouts off of this
        logging.info(f'CI batch triggered at time {self.trigger_time}')

        try:
            response = self._trigger_batch_tests()  # Kicks off asynchronous test execution for a batch of tests
            response_body = response.json()
            self._record_batch_id(response_body)  # a single batch ID has now been assigned. Save for future reference
            self._map_test_run_ids(response_body)  # one test run ID per test has been assigned. Save for reference.
        except Exception as e:
            # Chain the original exception so the root cause survives in tracebacks.
            raise Exception("Datadog error on triggering tests: " + str(e)) from e

    def gate_on_deployment_testing_enable_switch(self):
        '''
        This is a bit hacky, but there's a designated test that's used as a deployment testing enable switch.
        If the test passes, it means that the synthetic testing GoCD pipeline is enabled, and the
        build should only proceed if all reportable tests pass; if the test fails, the build should proceed irrespective
        of any failures among the synthetic tests (which will be allowed to run, nonetheless). When this is intended,
        the GoCD pipeline responsible for running the tests should just return a success code without waiting
        for the reportable tests to complete their runs.

        :return: Nothing, but terminates task with a success code if the synthetic testing feature is disabled
        and logs the decision to skip testing on this build
        '''
        deployment_testing_enabled = self._poll_for_test_result(self.DEPLOYMENT_TESTING_ENABLED_SWITCH)
        # _poll_for_test_result returns only True or False (it raises on timeout),
        # so an identity check against False is safe and idiomatic here.
        if deployment_testing_enabled is False:
            switch_test_name = self.DEPLOYMENT_TESTING_ENABLED_SWITCH.name
            logging.warning(
                f'*** Datadog Synthetic testing disabled via failing test {switch_test_name} ***')
            sys.exit(0)

    def get_and_record_test_results(self):
        '''
        Poll for pass/fail results for all batch tests.

        :return: Nothing, but saves pass/fail results in 'self'
        '''
        for test in list(self.tests_by_public_id.values()):
            test.success = self._poll_for_test_result(test)

    def get_failed_tests(self):
        '''
        Compile a list of all failed tests from the set of all tests that were run.

        :return: A list of failed test objects; Empty list if all tests passed
        '''
        return [test for test in self.tests_by_public_id.values() if not test.success]

    # ***************** Private methods **********************

    def _record_requested_test_particulars(self, test_requests):
        '''
        Save list of requested tests in this dictionary for later reference, indexed by test public ID
        '''
        for test in test_requests:
            self.tests_by_public_id[test.public_id] = test

    def _trigger_batch_tests(self):
        '''
        Ask Datadog to run the set of selected synthetic tests.
        Returns the response from the Datadog API call.

        Note that using the ci (continuous integration) route leverages
        the parallel execution Datadog feature we pay extra for.

        :raises Exception: on any non-200 HTTP status from Datadog
        '''
        url = f"{self.DATADOG_SYNTHETIC_TESTS_API_URL}/trigger/ci"
        headers = {
            "Content-Type": "application/json",
            "DD-API-KEY": self.api_key,
            "DD-APPLICATION-KEY": self.app_key
        }
        test_public_ids = self.tests_by_public_id.keys()
        json_request_body = {"tests": [{"public_id": public_id} for public_id in test_public_ids]}
        response = requests.post(url, headers=headers, json=json_request_body)
        if response.status_code != 200:
            raise Exception(f"Datadog API error. Status = {response.status_code}")
        return response

    def _record_batch_id(self, response_body):
        '''
        Datadog generates a single batch ID associated with the request for all the requested tests. This is distinct
        from the run ids, which are uniquely assigned to each test run.
        '''
        # BUG FIX: this previously assigned to self.batch_id, a name never declared in
        # __init__, leaving the intended self.test_batch_id attribute forever None.
        self.test_batch_id = response_body['batch_id']

    def _map_test_run_ids(self, response_body):
        '''
        Saves the test run ID values assigned by Datadog to this batch request's tests, as
        a dictionary keyed off of each test's (unique) public id.

        A test's public ID is assigned by Datadog when the test is created, and is entered as hard-coded
        test configuration data in this module. It is the public ids that are used in the test run results
        to identify which test is being reported on.

        While we do care as to the result for the "deployment testing enabled switch test", we use that
        result differently from all other test results, and do not save it in the dictionary
        with results we intend to report on.
        '''
        for result in response_body['results']:
            public_id = result['public_id']
            test_run_id = result['result_id']
            if public_id == self.DEPLOYMENT_TESTING_ENABLED_SWITCH.public_id:
                self.DEPLOYMENT_TESTING_ENABLED_SWITCH.test_run_id = test_run_id
            else:
                self.tests_by_public_id[public_id].test_run_id = test_run_id

    def _poll_for_test_result(self, test):
        """
        Poll every few seconds for test run results for a single, specified test, until available.

        Note that if all tests take 90 seconds or more to run, the call into this method will take 90 or
        more seconds, but subsequent calls may just take a few seconds each, depending on
        test execution time variability.

        The timeout on this operation is relative to when the batch request for test execution was made,
        not relative to the last time we polled on a test result.

        :return: True on test success, False on test failure
        :raises Exception: if the batch-wide timeout expires before a result is available
        """
        test_result = None
        while test_result is None and (time.time() - self.trigger_time) < self.MAX_ALLOWABLE_TIME_SECS:
            time.sleep(5)  # Poll every 5 seconds
            test_result = self._get_test_result(test)
            logging.info(f'{test_result=}')

        if test_result is None:
            raise Exception("The test run timed out.")

        completion_time = time.time()
        logging.info(f"Test {test.public_id} finished at time {completion_time} with {test_result=}")
        return test_result

    def _get_test_result(self, test):
        """
        Issue a single request to the Datadog API to fetch test results for a single, specified test.

        :return: the test's boolean pass/fail result if the test run has completed; None otherwise
        (Datadog returns a non-200 status while the run is still in progress)
        """
        url = f"{self.DATADOG_SYNTHETIC_TESTS_API_URL}/{test.public_id}/results/{test.test_run_id}"
        headers = {
            "DD-API-KEY": self.api_key,
            "DD-APPLICATION-KEY": self.app_key
        }

        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None

        response_json = response.json()
        return response_json['result']['passed']
| 208 | + |
# NOTE(review): this string literal is a no-op at runtime — it documents the script
# portion of the file but is NOT the module docstring, since it appears mid-file.
"""
Command-line script to run Datadog synthetic tests in the production environment and then slack notify and/or roll back
"""

# Configure root logging at import time so all module loggers emit to stdout at INFO.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
| 214 | + |
@click.option(
    '--enable-automated-rollbacks',
    is_flag=True,
    default=False,
    help='When set and synthetic tests fail, the most recent deployment to production is automatically rolled back'
)
@click.option(
    '--slack-notification-channel',
    required=False,
    help='When set and synthetic tests fail, an alert Slack message is sent to this channel'
)
def run_synthetic_tests(tests_to_report_on, enable_automated_rollbacks, slack_notification_channel):
    '''
    Trigger a batch of Datadog synthetic tests and exit with a status code reflecting their results.

    :param tests_to_report_on: List of SyntheticTest objects to trigger and report on
    :param enable_automated_rollbacks: Failing tests trigger a rollback in the build pipeline when true
        (not yet supported; setting it terminates with an error code)
    :param slack_notification_channel: Newly failing tests deliver a slack message to this channel; none on repeat fails
        (NOTE(review): currently unused in this function body — confirm intended wiring)
    :return: exits thread with success or fail code indicating tests' collective success or failure (of one or more)
    '''
    if enable_automated_rollbacks:
        # BUG FIX: was logging.Error(...), which raises AttributeError at runtime;
        # the stdlib function is logging.error.
        logging.error("Automated rollbacks are not yet supported")
        sys.exit(1)

    try:
        api_key = os.getenv("DATADOG_API_KEY")
        app_key = os.getenv("DATADOG_APP_KEY")
        dd_client = DatadogClient(api_key, app_key)

        dd_client.trigger_synthetic_tests(tests_to_report_on)
        dd_client.gate_on_deployment_testing_enable_switch()  # Exits summarily if test results are to be ignored
        for test in tests_to_report_on:
            logging.info(f"\t Running test {test.public_id}: {test.name}")
        dd_client.get_and_record_test_results()
        failed_tests = dd_client.get_failed_tests()

        for failed_test in failed_tests:
            logging.warning(f'Test failed: {failed_test.public_id} -- {failed_test.name}')

        task_failed_code = 1 if failed_tests else 0

    except Exception as e:
        # BUG FIX: the original passed str(e) as a %-format argument to a message with no
        # placeholder, so the exception text was silently dropped from the log output.
        logging.error("GoCD/Datadog integration error: %s", e)
        task_failed_code = 1

    sys.exit(task_failed_code)
| 259 | + |
if __name__ == "__main__":
    # Hard-coded invocation settings. Note that run_synthetic_tests() is called
    # directly below rather than through the click CLI runner, so the click
    # options declared on it are bypassed in this entry point.
    SLACK_NOTIFICATION_CHANNEL = 'project-edxapp-deployment-future'
    ENABLE_AUTOMATED_ROLLBACKS = False
    # With this list empty, only the deployment-testing enable switch test is run.
    TESTS_TO_REPORT_ON = [
        # All tests disabled for now. Will reinstate
        # them after the deployment testing enable switch functionality has been tested on stage.
        #
        # TODO: Two tests are disabled behind two layers of comment symbols. These are broken and should not
        # be reinstated until fixed.
        #
        # SyntheticTest(
        #     '''
        #     [Synthetics] edX Smoke Test - [Verified student] A verified student can
        #     access a graded course problem
        #     ''',
        #     "tck-hrr-ubp"
        # ),
        # SyntheticTest(
        #     '''
        #     [Synthetics] edX Smoke Test - [Verified student] An enrolled verified student can
        #     access a course’s landing page, course content, and course forum
        #     ''',
        #     "zbz-r28-jjx"
        # ),
        # # SyntheticTest(
        # #     '''
        # #     [Synthetics] edX Smoke Test - [Audit student] An enrolled audit student cannot load
        # #     a graded problem, and sees the upsell screen
        # #     ''',
        # #     "75p-sez-5wg"
        # # ),
        # # SyntheticTest(
        # #     '''
        # #     [Synthetics] edX Smoke Test - [Audit student] An enrolled audit student can access
        # #     a course’s landing page, course content, and course forum
        # #     ''',
        # #     "jvx-2jw-agj"
        # # ),
        # SyntheticTest(
        #     '''
        #     edX Smoke Test - [Unenrolled student] An unenrolled student cannot load a
        #     course’s landing page, and sees the “Enroll Now” screen
        #     ''',
        #     "zkx-36f-kui"
        # ),
        # SyntheticTest(
        #     '''
        #     edX Smoke Test - [Anonymous user] An anonymous user is directed to the
        #     Logistration page (authn.edx.org) when trying to access content behind log-in wall
        #     ''',
        #     "6tq-u28-hwa"
        # ),
    ]
    #TODO: Pick up these settings from GoCD invocation
    run_synthetic_tests(TESTS_TO_REPORT_ON, ENABLE_AUTOMATED_ROLLBACKS, SLACK_NOTIFICATION_CHANNEL)
0 commit comments