|
| 1 | +#! /usr/bin/env python3 |
| 2 | + |
| 3 | +import click |
| 4 | +import logging |
| 5 | +import os |
| 6 | +import requests |
| 7 | +import time |
| 8 | +import sys |
| 9 | + |
class SyntheticTest:
    """Identifying attributes of a Datadog synthetic test plus the state of its run."""

    def __init__(self, name, public_id):
        # Human-readable test name, as it appears in the Datadog UI.
        self.name = name
        # Public test ID assigned by Datadog when the test was created.
        self.public_id = public_id
        # Run ID assigned by Datadog when this test is triggered; None until then.
        self.test_run_id = None
        # True/False once the run completes; None while the result is unknown.
        self.success = None
| 19 | + |
class DatadogClient:
    '''
    Client class to invoke the Datadog API to run and monitor synthetic tests.

    Usage: construct with API credentials, call trigger_synthetic_tests() with the
    tests to report on, optionally gate on the deployment-testing enable switch,
    then collect results via get_and_record_test_results() / get_failed_tests().
    '''

    DATADOG_SYNTHETIC_TESTS_API_URL = "https://api.datadoghq.com/api/v1/synthetics/tests"
    MAX_ALLOWABLE_TIME_SECS = 600  # 10 minutes, measured from batch trigger time

    # Designated test whose pass/fail result acts as an on/off switch for CI/CD
    # synthetic testing (see gate_on_deployment_testing_enable_switch below).
    DEPLOYMENT_TESTING_ENABLED_SWITCH = SyntheticTest(
        '''
        Deployment testing enable test governing CI/CD synthetic testing
        ''',
        "sad-hqu-h33"
    )

    def __init__(self, api_key, app_key):
        self.api_key = api_key  # Datadog API key (sent as DD-API-KEY header)
        self.app_key = app_key  # Datadog application key (sent as DD-APPLICATION-KEY header)
        self.test_batch_id = None  # A 'batch' is a set of tests intended to be run in parallel
        self.trigger_time = None  # The system time at which a batch's execution was requested
        self.tests_by_public_id = {}  # Maps Datadog test public ID to all info we have for a specific test

    def trigger_synthetic_tests(self, tests_to_report: "list[SyntheticTest]"):
        '''
        Trigger running of a batch of synthetic tests.

        :param tests_to_report: List of tests to run and report on
        :return: None, but saves test info including batch ID and test run IDs in 'self'
        :raises Exception: if the Datadog API call fails or returns an unexpected body
        '''
        # Note that the list of tests to be run is one longer than the list of tests to be reported on.
        # The extra test is the so-called "deployment testing enable switch test".
        # That test should be modified via the Datadog UI to either always pass or always fail, depending
        # on whether synthetic testing is to be enabled at runtime or not, respectively.
        # While the test's result does affect how the pipeline operates, the result is not treated as reportable.
        tests_to_run = [self.DEPLOYMENT_TESTING_ENABLED_SWITCH] + tests_to_report
        self._record_requested_test_particulars(tests_to_run)
        self.trigger_time = time.time()  # Key timeouts off of this
        logging.info(f'CI batch triggered at time {self.trigger_time}')

        try:
            response = self._trigger_batch_tests()  # Kicks off asynchronous test execution for a batch of tests
            response_body = response.json()
            self._record_batch_id(response_body)  # a single batch ID has now been assigned. Save for future reference
            self._map_test_run_ids(response_body)  # one test run ID per test has been assigned. Save for reference.
        except Exception as e:
            # Chain the original exception so the root cause survives in tracebacks.
            raise Exception("Datadog error on triggering tests: " + str(e)) from e

    def gate_on_deployment_testing_enable_switch(self):
        '''
        This is a bit hacky, but there's a designated test that's used as a deployment testing enable switch.
        If the test passes, it means that the synthetic testing GoCD pipeline is enabled, and the
        build should only proceed if all reportable tests pass; if the test fails, the build should proceed irrespective
        of any failures among the synthetic tests (which will be allowed to run, nonetheless). When this is intended,
        the GoCD pipeline responsible for running the tests should just return a success code without waiting
        for the reportable tests to complete their runs.

        :return: Nothing, but terminates task with a success code if the synthetic testing feature is disabled
        and logs the decision to skip testing on this build
        '''
        deployment_testing_enabled = self._poll_for_test_result(self.DEPLOYMENT_TESTING_ENABLED_SWITCH)
        # _poll_for_test_result returns only True or False (it raises on timeout),
        # so an identity check against False is safe and idiomatic here.
        if deployment_testing_enabled is False:
            switch_test_name = self.DEPLOYMENT_TESTING_ENABLED_SWITCH.name
            logging.warning(
                f'*** Datadog Synthetic testing disabled via failing test {switch_test_name} ***')
            sys.exit(0)

    def get_and_record_test_results(self):
        '''
        Poll for pass/fail results for all batch tests.

        :return: Nothing, but saves pass/fail results in 'self'
        '''
        for test in list(self.tests_by_public_id.values()):
            test.success = self._poll_for_test_result(test)

    def get_failed_tests(self):
        '''
        Compile a list of all failed tests from the set of all tests that were run.

        :return: A list of failed test objects; Empty list if all tests passed
        '''
        return [test for test in self.tests_by_public_id.values() if not test.success]

    # ***************** Private methods **********************

    def _record_requested_test_particulars(self, test_requests):
        '''
        Save list of requested tests in this dictionary for later reference, indexed by test public ID
        '''
        for test in test_requests:
            self.tests_by_public_id[test.public_id] = test

    def _trigger_batch_tests(self):
        '''
        Ask Datadog to run the set of selected synthetic tests.
        Returns the response from the Datadog API call.

        Note that using the ci (continuous integration) route leverages
        the parallel execution Datadog feature we pay extra for.

        :raises Exception: on any non-200 HTTP status from Datadog
        '''
        url = f"{self.DATADOG_SYNTHETIC_TESTS_API_URL}/trigger/ci"
        headers = {
            "Content-Type": "application/json",
            "DD-API-KEY": self.api_key,
            "DD-APPLICATION-KEY": self.app_key
        }
        test_public_ids = self.tests_by_public_id.keys()
        json_request_body = {"tests": [{"public_id": public_id} for public_id in test_public_ids]}
        response = requests.post(url, headers=headers, json=json_request_body)
        if response.status_code != 200:
            raise Exception(f"Datadog API error. Status = {response.status_code}")
        return response

    def _record_batch_id(self, response_body):
        '''
        Datadog generates a single batch ID associated with the request for all the requested tests. This is distinct
        from the run ids, which are uniquely assigned to each test run.
        '''
        # BUG FIX: this previously assigned to self.batch_id, a name never declared in
        # __init__, leaving the intended self.test_batch_id attribute forever None.
        self.test_batch_id = response_body['batch_id']

    def _map_test_run_ids(self, response_body):
        '''
        Saves the test run ID values assigned by Datadog to this batch request's tests, as
        a dictionary keyed off of each test's (unique) public id.

        A test's public ID is assigned by Datadog when the test is created, and is entered as hard-coded
        test configuration data in this module. It is the public ids that are used in the test run results
        to identify which test is being reported on.

        While we do care as to the result for the "deployment testing enabled switch test", we use that
        result differently from all other test results, and do not save it in the dictionary
        with results we intend to report on.
        '''
        for result in response_body['results']:
            public_id = result['public_id']
            test_run_id = result['result_id']
            if public_id == self.DEPLOYMENT_TESTING_ENABLED_SWITCH.public_id:
                self.DEPLOYMENT_TESTING_ENABLED_SWITCH.test_run_id = test_run_id
            else:
                self.tests_by_public_id[public_id].test_run_id = test_run_id

    def _poll_for_test_result(self, test):
        """
        Poll every few seconds for test run results for a single, specified test, until available.

        Note that if all tests take 90 seconds or more to run, the call into this method will take 90 or
        more seconds, but subsequent calls may just take a few seconds each, depending on
        test execution time variability.

        The timeout on this operation is relative to when the batch request for test execution was made,
        not relative to the last time we polled on a test result.

        :return: True on test success, False on test failure
        :raises Exception: if the batch-wide timeout expires before a result is available
        """
        test_result = None
        while test_result is None and (time.time() - self.trigger_time) < self.MAX_ALLOWABLE_TIME_SECS:
            time.sleep(5)  # Poll every 5 seconds
            test_result = self._get_test_result(test)
            logging.info(f'{test_result=}')

        if test_result is None:
            raise Exception("The test run timed out.")

        completion_time = time.time()
        logging.info(f"Test {test.public_id} finished at time {completion_time} with {test_result=}")
        return test_result

    def _get_test_result(self, test):
        """
        Issue a single request to the Datadog API to fetch test results for a single, specified test.

        :return: the test's boolean pass/fail result if the test run has completed; None otherwise
        (Datadog returns a non-200 status while the run is still in progress)
        """
        url = f"{self.DATADOG_SYNTHETIC_TESTS_API_URL}/{test.public_id}/results/{test.test_run_id}"
        headers = {
            "DD-API-KEY": self.api_key,
            "DD-APPLICATION-KEY": self.app_key
        }

        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None

        response_json = response.json()
        return response_json['result']['passed']
| 208 | + |
# NOTE(review): this string literal is a no-op at runtime — it documents the script
# portion of the file but is NOT the module docstring, since it appears mid-file.
"""
Command-line script to run Datadog synthetic tests in the production environment and then slack notify and/or roll back
"""

# Configure root logging at import time so all module loggers emit to stdout at INFO.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
| 214 | + |
@click.option(
    '--enable-automated-rollbacks',
    is_flag=True,
    default=False,
    help='When set and synthetic tests fail, the most recent deployment to production is automatically rolled back'
)
@click.option(
    '--slack-notification-channel',
    required=False,
    help='When set and synthetic tests fail, an alert Slack message is sent to this channel'
)
def run_synthetic_tests(tests_to_report_on, enable_automated_rollbacks, slack_notification_channel):
    '''
    Trigger a batch of Datadog synthetic tests and exit with a status code reflecting their results.

    :param tests_to_report_on: List of SyntheticTest objects to trigger and report on
    :param enable_automated_rollbacks: Failing tests trigger a rollback in the build pipeline when true
        (not yet supported; setting it terminates with an error code)
    :param slack_notification_channel: Newly failing tests deliver a slack message to this channel; none on repeat fails
        (NOTE(review): currently unused in this function body — confirm intended wiring)
    :return: exits thread with success or fail code indicating tests' collective success or failure (of one or more)
    '''
    if enable_automated_rollbacks:
        # BUG FIX: was logging.Error(...), which raises AttributeError at runtime;
        # the stdlib function is logging.error.
        logging.error("Automated rollbacks are not yet supported")
        sys.exit(1)

    try:
        api_key = os.getenv("DATADOG_API_KEY")
        app_key = os.getenv("DATADOG_APP_KEY")
        dd_client = DatadogClient(api_key, app_key)

        dd_client.trigger_synthetic_tests(tests_to_report_on)
        dd_client.gate_on_deployment_testing_enable_switch()  # Exits summarily if test results are to be ignored
        for test in tests_to_report_on:
            logging.info(f"\t Running test {test.public_id}: {test.name}")
        dd_client.get_and_record_test_results()
        failed_tests = dd_client.get_failed_tests()

        for failed_test in failed_tests:
            logging.warning(f'Test failed: {failed_test.public_id} -- {failed_test.name}')

        task_failed_code = 1 if failed_tests else 0

    except Exception as e:
        # BUG FIX: the original passed str(e) as a %-format argument to a message with no
        # placeholder, so the exception text was silently dropped from the log output.
        logging.error("GoCD/Datadog integration error: %s", e)
        task_failed_code = 1

    sys.exit(task_failed_code)
| 259 | + |
if __name__ == "__main__":
    # Hard-coded invocation settings. Note that run_synthetic_tests() is called
    # directly below rather than through the click CLI runner, so the click
    # options declared on it are bypassed in this entry point.
    SLACK_NOTIFICATION_CHANNEL = 'project-edxapp-deployment-future'
    ENABLE_AUTOMATED_ROLLBACKS = False
    # With this list empty, only the deployment-testing enable switch test is run.
    TESTS_TO_REPORT_ON = [
        # All tests disabled for now. Will reinstate
        # them after the deployment testing enable switch functionality has been tested on stage.
        #
        # TODO: Two tests are disabled behind two layers of comment symbols. These are broken and should not
        # be reinstated until fixed.
        #
        # SyntheticTest(
        #     '''
        #     [Synthetics] edX Smoke Test - [Verified student] A verified student can
        #     access a graded course problem
        #     ''',
        #     "tck-hrr-ubp"
        # ),
        # SyntheticTest(
        #     '''
        #     [Synthetics] edX Smoke Test - [Verified student] An enrolled verified student can
        #     access a course’s landing page, course content, and course forum
        #     ''',
        #     "zbz-r28-jjx"
        # ),
        # # SyntheticTest(
        # #     '''
        # #     [Synthetics] edX Smoke Test - [Audit student] An enrolled audit student cannot load
        # #     a graded problem, and sees the upsell screen
        # #     ''',
        # #     "75p-sez-5wg"
        # # ),
        # # SyntheticTest(
        # #     '''
        # #     [Synthetics] edX Smoke Test - [Audit student] An enrolled audit student can access
        # #     a course’s landing page, course content, and course forum
        # #     ''',
        # #     "jvx-2jw-agj"
        # # ),
        # SyntheticTest(
        #     '''
        #     edX Smoke Test - [Unenrolled student] An unenrolled student cannot load a
        #     course’s landing page, and sees the “Enroll Now” screen
        #     ''',
        #     "zkx-36f-kui"
        # ),
        # SyntheticTest(
        #     '''
        #     edX Smoke Test - [Anonymous user] An anonymous user is directed to the
        #     Logistration page (authn.edx.org) when trying to access content behind log-in wall
        #     ''',
        #     "6tq-u28-hwa"
        # ),
    ]
    #TODO: Pick up these settings from GoCD invocation
    run_synthetic_tests(TESTS_TO_REPORT_ON, ENABLE_AUTOMATED_ROLLBACKS, SLACK_NOTIFICATION_CHANNEL)
0 commit comments