From 38009870a1cf9d2e1a69beaacd7e5ae22e3f91e3 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 4 Jan 2021 10:54:34 -0600 Subject: [PATCH 001/109] Update RELEASE_NOTES.md --- RELEASE_NOTES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 8328e9839..3c2bddee2 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -4,6 +4,7 @@ * Fix up tests * Remove dependency on slack * Add batch endpoints + * Rename prod branch to "main" ## 0.0.3.4 * Change 7 day periodic_remove to 7 day hold From 6017c7d3fb49d53428c51da5ef51a0bf788266cf Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 5 Jan 2021 01:01:40 -0600 Subject: [PATCH 002/109] Update README.md --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 425bac778..8a61a2723 100644 --- a/README.md +++ b/README.md @@ -20,10 +20,15 @@ You can also learn more about the apps implemented in this module from its [cata See the .travis file for information on how to test locally -# Setup and test with docker-compose on MacOS +# Setup and test with docker-compose on MacOS/Linux ## Build and exec into the dev container +Make sure you have the latest versions of + +* docker +* docker-compose + ``` git clone https://github.com/kbase/execution_engine2.git cd execution_engine2 From ddd50fbdb9ee3e938681d7b4cb24c573d6e2d9f9 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Sun, 31 Jan 2021 23:02:51 -0600 Subject: [PATCH 003/109] Create codeql.yml (#290) --- .github/codeql.yml | 52 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/codeql.yml diff --git a/.github/codeql.yml b/.github/codeql.yml new file mode 100644 index 000000000..9771ca0f4 --- /dev/null +++ b/.github/codeql.yml @@ -0,0 +1,52 @@ +name: "Code scanning - action" + +on: + push: + pull_request: + schedule: + - cron: '0 19 * * 0' + +jobs: + CodeQL-Build: + + # CodeQL runs on ubuntu-latest and windows-latest + runs-on: 
ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + # We must fetch at least the immediate parents so that if this is + # a pull request then we can checkout the head. + fetch-depth: 2 + + # If this run was triggered by a pull request event, then checkout + # the head of the pull request instead of the merge commit. + - run: git checkout HEAD^2 + if: ${{ github.event_name == 'pull_request' }} + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v1 + # Override language selection by uncommenting this and choosing your languages + # with: + # languages: go, javascript, csharp, python, cpp, java + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v1 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v1 From d65a6d466a75f82a97e0991d36519bdd75fbfeb0 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Sun, 31 Jan 2021 23:05:33 -0600 Subject: [PATCH 004/109] Update README.md (#278) --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a61a2723..50585c12c 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,13 @@ PYTHONPATH=.:lib:test pytest --cov-report=xml --cov lib/execution_engine2/ --ver ## To run a specific test file via PyCharm See [Testing with Pycharm](docs/testing_with_pycharm.md) - +## Installing HTCondor Bindings from the mac +* You may not be able to load without disabling the mac Security 
Gatekeeper with `sudo spctl --master-disable` +* The HTCondor bindings only work on the Python.org install of python or your system install of python2.7. They will not work with anaconda. So download python from python.org +* Download the mac bindings at https://research.cs.wisc.edu/htcondor/tarball/current/8.9.10/release/ +* Current version is [8.9.10](https://research.cs.wisc.edu/htcondor/tarball/current/8.9.10/release/condor-8.9.10-x86_64_MacOSX-unstripped.tar.gz) +* Add /lib/python3 to PYTHONPATH. +* `import htcondor` ## Test Running Options ### PyCharm From 1634640119e32b7d3090413dc1aed84119edb1e8 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 2 Feb 2021 13:57:39 -0600 Subject: [PATCH 005/109] Update Dockerfile (#301) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 50144e7a3..f6f2615ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM kbase/sdkbase2:python +FROM quay.io/kbase/sdkbase2:python MAINTAINER KBase Developer RUN apt-get clean all && apt-get update --fix-missing -y From 15f65b3dedf880eafd00b9d1fa985ca08b649327 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 2 Feb 2021 17:49:32 -0600 Subject: [PATCH 006/109] Attempt to save to ghcr.io (#291) * Save feature branches when merging against develop or master --- .../workflows/build_and_push_docker_image.yml | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/build_and_push_docker_image.yml diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml new file mode 100644 index 000000000..85b0292b0 --- /dev/null +++ b/.github/workflows/build_and_push_docker_image.yml @@ -0,0 +1,28 @@ +name: Build Develop/Master + +on: + pull_request: + branches: [master,develop] + +jobs: + main: + runs-on: ubuntu-20.04 + steps: + - + name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ 
secrets.GHCR_USERNAME }} + password: ${{ secrets.GHCR_TOKEN }} + - + name: Build and push + id: docker_build + uses: docker/build-push-action@v2 + with: + push: true + tags: ghcr.io/${{ github.repository }}:${{ github.head_ref }} + + - + name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} From 5033d1cb75d6c0fb95ba11e2c4f3144a35f80d6a Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 4 Feb 2021 13:50:19 -0600 Subject: [PATCH 007/109] dev-require_resources (#281) * Remove cruft.py (#279) I suspect it may contain some cruft. * Correct spec, improve documentation (#284) * Remove cruft.py I suspect it may contain some cruft. * Correct spec, improve documentation Corrected / improved spec and spec documentation for date range methods * Fix spec typo. Also clarify what needs to be done to run tests after making code changes * make WorkspaceAuth and authstrategy unit-testable (#288) * Remove cruft.py I suspect it may contain some cruft. * Correct spec, improve documentation Corrected / improved spec and spec documentation for date range methods * Fix spec typo. Also clarify what needs to be done to run tests after making code changes * make WorkspaceAuth and authstrategy unit-testable Inject dependencies so they can be mocked. * Fix flake8 * python formatting rules are stupid * more flake8 fixes really, flake8? really? * Make token and user id required for sdkmr (#294) Always provided, so no reason to be optional, and makes the code easier to understand. * pass a UserClientSet into SDKMethodRunner (#296) * pass a UserClientSet into SDKMethodRunner UCS contains the clients that need to be instantiated on a per user basis. 
UCS is passed in to SDKMR to allow for eventual unit testing of SDKMR, since the UCS and workspace client can be mocked via create_autospec * remove extraneous period Co-authored-by: MrCreosote Co-authored-by: Gavin --- .gitignore | 1 + README.md | 18 ++ bin/PurgeHeldJobs.py | 1 - execution_engine2.spec | 61 ++++-- .../authorization/authstrategy.py | 50 ++--- .../authorization/workspaceauth.py | 10 +- lib/execution_engine2/cruft.py | 180 ------------------ .../execution_engine2Impl.py | 72 +++---- .../sdk/EE2Authentication.py | 17 +- lib/execution_engine2/sdk/EE2Constants.py | 9 + lib/execution_engine2/sdk/EE2Runjob.py | 4 +- lib/execution_engine2/sdk/EE2Status.py | 5 +- lib/execution_engine2/sdk/EE2StatusRange.py | 14 +- lib/execution_engine2/sdk/SDKMethodRunner.py | 34 +--- lib/execution_engine2/utils/APIHelpers.py | 31 +++ lib/execution_engine2/utils/clients.py | 119 ++++++++++++ test/tests_for_auth/ee2_admin_mode_test.py | 38 ++-- test/tests_for_auth/ee2_authstrategy_test.py | 27 ++- test/tests_for_auth/ee2_workspaceauth_test.py | 21 +- .../ee2_SDKMethodRunner_EE2Logs_test.py | 3 +- .../ee2_SDKMethodRunner_test.py | 17 +- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 3 +- ...ee2_SDKMethodRunner_test_EE2Status_test.py | 3 +- test/tests_for_sdkmr/ee2_load_test.py | 5 +- test/tests_for_utils/clients_test.py | 70 +++++++ test/utils_shared/test_utils.py | 5 + 26 files changed, 471 insertions(+), 347 deletions(-) delete mode 100644 lib/execution_engine2/cruft.py create mode 100644 lib/execution_engine2/utils/APIHelpers.py create mode 100644 lib/execution_engine2/utils/clients.py create mode 100644 test/tests_for_utils/clients_test.py diff --git a/.gitignore b/.gitignore index 5a4d9b298..1f650694b 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ sdk.cfg .pytest_cache lib/execution_engine2/execution_engine2Impl.py.bak* +coverage.xml diff --git a/README.md b/README.md index 50585c12c..264834207 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,9 @@ cd /ee2 
make test-coverage ``` +Once the docker image is built, it does not need to be rebuilt after code changes to rerun tests. +Just ensure the services are up, exec into the container, and run the tests. + ## To run a specific test directory or specific file ``` PYTHONPATH=.:lib:test pytest --cov-report=xml --cov lib/execution_engine2/ --verbose test/tests_for_db/ @@ -49,6 +52,21 @@ PYTHONPATH=.:lib:test pytest --cov-report=xml --cov lib/execution_engine2/ --ver ## To run a specific test file via PyCharm See [Testing with Pycharm](docs/testing_with_pycharm.md) +## To run pre-commit hooks + +`exec` into the docker container as before and switch to the `/ee2` directory. + +``` +pip install pre-commit +pre-commit install +pre-commit run --all-files +``` + +To remove the pre commit hooks: +``` +pre-commit uninstall +``` + ## Installing HTCondor Bindings from the mac * You may not be able to load without disabling the mac Security Gatekeeper with `sudo spctl --master-disable` * The HTCondor bindings only work on the Python.org install of python or your system install of python2.7. They will not work with anaconda. So download python from python.org diff --git a/bin/PurgeHeldJobs.py b/bin/PurgeHeldJobs.py index f675de91c..294a7623b 100644 --- a/bin/PurgeHeldJobs.py +++ b/bin/PurgeHeldJobs.py @@ -173,4 +173,3 @@ def handle_hold_event(event): time.sleep(5) except Exception as e: slack_client.ee2_reaper_failure(endpoint=ee2_endpoint) - diff --git a/execution_engine2.spec b/execution_engine2.spec index 9cac4d958..63cdc60e5 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -525,30 +525,59 @@ /* - Results of check_jobs_date_range - TODO : DOCUMENT THE RETURN OF STATS mapping + Results of check_jobs_date_range methods. + + jobs - the jobs matching the query, up to `limit` jobs. + count - the number of jobs returned. + query_count - the number of jobs that matched the filters. + filter - DEPRECATED - this field may change in the future. 
The filters that were + applied to the jobs. + skip - the number of jobs that were skipped prior to beginning to return jobs. + projection - the list of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. + sort_order - the order in which the results were sorted by the job ID - + for + ascending, - for descending. + + TODO: DOCUMENT THE RETURN OF STATS mapping */ typedef structure { - mapping jobs; + list jobs; int count; int query_count; - list filter; + mapping filter; int skip; list projection; int limit; string sort_order; } CheckJobsDateRangeResults; - - /* Check job for all jobs in a given date/time range for all users (Admin function) - float start_time; # Filter based on creation timestamp since epoch - float end_time; # Filter based on creation timestamp since epoch - list projection; # A list of fields to include in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, such as error_code=1, wsid=1234, terminated_code = 1 - int limit; # The maximum number of records to return - string user; # Optional. Defaults off of your token + Notes on start_time and end_time: + These fields are designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: + - if the field is a float or a string that contains a float and only a float, + the field value is treated as seconds since the epoch. + - if the field is an int or a string that contains an int and only an int, + the field value is treated as milliseconds since the epoch. + - if the field is a string not matching the criteria above, it is treated as + a date and time. Nearly any unambigous format can be parsed. 
+ + float start_time - Filter based on job creation timestamp since epoch + float end_time - Filter based on job creation timestamp since epoch + list projection - A list of fields to include in the projection, default ALL + See "Projection Fields" above + list filter - DEPRECATED: this field may change or be removed in the future. + A list of simple filters to "AND" together, such as error_code=1, wsid=1234, + terminated_code = 1 + int limit - The maximum number of records to return + string user - The user whose job records will be returned. Optional. Default is the + current user. + int offset - the number of jobs to skip before returning records. + boolean ascending - true to sort by job ID ascending, false descending. + boolean as_admin - true to run the query as an admin; user must have admin EE2 + permissions. Required if setting `user` to something other than your own. + TODO: this seems to have no effect @optional projection @optional filter @optional limit @@ -568,8 +597,10 @@ boolean as_admin; } CheckJobsDateRangeParams; - funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params) returns (CheckJobsResults) authentication required; - funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params) returns (CheckJobsResults) authentication required; + funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params) + returns (CheckJobsDateRangeResults) authentication required; + funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params) + returns (CheckJobsDateRangeResults) authentication required; typedef structure { UnspecifiedObject held_job; @@ -586,7 +617,7 @@ /* - str permission; # One of 'r|w|x' (('read' | 'write' | 'none')) + str permission - One of 'r|w|x' (('read' | 'write' | 'none')) */ typedef structure { string permission; diff --git a/lib/execution_engine2/authorization/authstrategy.py b/lib/execution_engine2/authorization/authstrategy.py index 5ba8c2fe7..8ac55a034 100644 --- 
a/lib/execution_engine2/authorization/authstrategy.py +++ b/lib/execution_engine2/authorization/authstrategy.py @@ -10,68 +10,60 @@ KBASE_WS_AUTHSTRAT = "kbaseworkspace" -def can_read_job(job: Job, user_id: str, token: str, config: Dict[str, str]) -> bool: +def can_read_job(job: Job, user_id: str, ws_auth: WorkspaceAuth) -> bool: """ Returns True if the user has read access to the job, False otherwise. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config + :param ws_auth: a workspace authorization instance initialized with the user's token. :returns: bool - True if the user can read the job info """ - return _check_permissions(job, user_id, token, config, level="read") + return _check_permissions(job, user_id, ws_auth, level="read") -def can_write_job(job: Job, user_id: str, token: str, config: Dict[str, str]) -> bool: +def can_write_job(job: Job, user_id: str, ws_auth: WorkspaceAuth) -> bool: """ Returns True if the user has write access to the job, False otherwise. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config + :param ws_auth: a workspace authorization instance initialized with the user's token. :returns: bool - True if the user can read the job info """ - return _check_permissions(job, user_id, token, config, level="write") + return _check_permissions(job, user_id, ws_auth, level="write") -def can_read_jobs( - jobs: List[Job], user_id: str, token: str, config: Dict[str, str] -) -> List[bool]: +def can_read_jobs(jobs: List[Job], user_id: str, ws_auth: WorkspaceAuth) -> List[bool]: """ Returns a list of job permissions in the same order as the given list of Jobs. 
:param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config + :param ws_auth: a workspace authorization instance initialized with the user's token. :returns: List[bool] - Has True values if the user can read job info, False otherwise """ - return _check_permissions_list(jobs, user_id, token, config, level="read") + return _check_permissions_list(jobs, user_id, ws_auth, level="read") -def can_write_jobs( - jobs: List[Job], user_id: str, token: str, config: Dict[str, str] -) -> List[bool]: +def can_write_jobs(jobs: List[Job], user_id: str, ws_auth: WorkspaceAuth) -> List[bool]: """ Returns a list of job write permissions in the same order as the given list of Jobs. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config + :param ws_auth: a workspace authorization instance initialized with the user's token. :returns: List[bool] - Has True values if the user can write job info, False otherwise """ - return _check_permissions_list(jobs, user_id, token, config, level="write") + return _check_permissions_list(jobs, user_id, ws_auth, level="write") def _check_permissions( - job: Job, user_id: str, token: str, config: Dict[str, str], level="read" + job: Job, user_id: str, ws_auth: WorkspaceAuth, level="read" ) -> bool: """ Returns a job permissions, for either read or write ability :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config - :param level: string - if "read", then returns the read value, if "write", return whether the user can write. + :param ws_auth: a workspace authorization instance initialized with the user's token. + :param level: string - if "read", then returns the read value, if "write", return whether + the user can write. 
:returns: bool - True if the permission is valid, False otherwise. """ if user_id == job.user: @@ -79,7 +71,6 @@ def _check_permissions( if job.authstrat == KBASE_WS_AUTHSTRAT: if job.wsid is None: return False - ws_auth = WorkspaceAuth(token, user_id, config["workspace-url"]) if level == "read": return ws_auth.can_read(job.wsid) else: @@ -89,15 +80,15 @@ def _check_permissions( def _check_permissions_list( - jobs: List[Job], user_id: str, token: str, config: Dict[str, str], level="read" + jobs: List[Job], user_id: str, ws_auth: WorkspaceAuth, level="read" ) -> List[bool]: """ Returns True for each job the user has read access to, and False for the ones they don't. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config - :param level: string - if "read" then tests if the Job can be read, otherwise checks if it can be written + :param ws_auth: a workspace authorization instance initialized with the user's token + :param level: string - if "read" then tests if the Job can be read, otherwise checks if it + can be written :returns: List[bool] - Has True values if the user can write job info, False otherwise """ @@ -134,7 +125,6 @@ def _check_permissions_list( if len(ws_ids_to_jobs): # If there's workspaces to look up, go do it. 
- ws_auth = WorkspaceAuth(token, user_id, config["workspace-url"]) if level == "read": ws_perms = ws_auth.can_read_list( list(ws_ids_to_jobs.keys()) diff --git a/lib/execution_engine2/authorization/workspaceauth.py b/lib/execution_engine2/authorization/workspaceauth.py index 06865873f..cecdc84f8 100644 --- a/lib/execution_engine2/authorization/workspaceauth.py +++ b/lib/execution_engine2/authorization/workspaceauth.py @@ -1,8 +1,8 @@ from typing import List, Dict from enum import Enum -from lib.execution_engine2.authorization.basestrategy import AuthStrategy -from lib.installed_clients.WorkspaceClient import Workspace -from lib.installed_clients.baseclient import ServerError +from execution_engine2.authorization.basestrategy import AuthStrategy +from installed_clients.WorkspaceClient import Workspace +from installed_clients.baseclient import ServerError STRATEGY = "kbaseworkspace" @@ -15,8 +15,8 @@ class WorkspacePermission(Enum): class WorkspaceAuth(AuthStrategy): - def __init__(self, token: str, user_id: str, ws_url: str): - self.ws_client = Workspace(url=ws_url, token=token) + def __init__(self, user_id: str, workspace: Workspace): + self.ws_client = workspace self.user_id = user_id def can_read(self, auth_param: str) -> bool: diff --git a/lib/execution_engine2/cruft.py b/lib/execution_engine2/cruft.py deleted file mode 100644 index 1d290cbaf..000000000 --- a/lib/execution_engine2/cruft.py +++ /dev/null @@ -1,180 +0,0 @@ -# # def _run_admin_command(self, command, params): -# # available_commands = ["cancel_job", "view_job_logs"] -# # if command not in available_commands: -# # raise ValueError(f"{command} not an admin command. 
See {available_commands} ") -# # commands = {"cancel_job": self.cancel_job, "view_job_logs": self.view_job_logs} -# # p = { -# # "cancel_job": { -# # "job_id": params.get("job_id"), -# # "terminated_code": params.get( -# # "terminated_code", TerminatedCode.terminated_by_admin.value -# # ), -# # }, -# # "view_job_logs": {"job_id": params.get("job_id")}, -# # } -# # return commands[command](**p[command]) -# # -# # def admin_role(self, token): -# # """ -# # Check to see which role the given token has -# # :param token: Token to inspect -# # :return: One of 'EE2_ADMIN_RO' or 'EE2_ADMIN` or None -# # """ -# # return AdminAuthUtil(self.auth_url, self.admin_roles).get_admin_role( -# # token=token, read_role="EE2_ADMIN_RO", write_role="EE2_ADMIN" -# # ) -# # -# # def get_job_wrapper(self, job_id, required_admin_role=None): -# # """ -# # If you are an admin, you can -# # If you are not an admin, you -# # :param job_id: -# # :return: -# # """ -# # if required_admin_role is not None and required_admin_role in self.roles: -# # job = self.get_mongo_util().get_job(job_id=job_id) -# # logging.info(f"ADMIN USER has permission to cancel job {job_id}") -# # self.logger.debug(f"ADMIN USER has permission to cancel job {job_id}") -# # else: -# # job = self.get_job_with_permission(job_id, JobPermissions.WRITE) -# # logging.info(f"User has permission to cancel job {job_id}") -# # self.logger.debug(f"User has permission to cancel job {job_id}") -# # return job -# # -# # def administer(self, command, params, token): -# # """ -# # Run commands as an administrator. Requires a token for a user with an EE2 administrative role. -# # Currently allowed commands are cancel_job and view_job_logs. -# # -# # Commands are given as strings, and their parameters are given as a dictionary of keys and values. -# # For example: -# # administer("cancel_job", {"job_id": 12345}, auth_token) -# # is the same as running -# # cancel_job(12345) -# # but with administrative privileges. 
-# # :param command: The command to run (See specfile) -# # :param params: The parameters for that command that will be expanded (See specfile) -# # :param token: The auth token (Will be checked for the correct auth role) -# # :return: -# # """ -# # logging.info( -# # f'Attempting to run administrative command "{command}" as user {self.user_id}' -# # ) -# # # set admin privs, one way or the other -# # self.is_admin = self._is_admin(token) -# # if not self.is_admin: -# # raise PermissionError( -# # f"User {self.user_id} is not authorized to run administrative commands." -# # ) -# # self._run_admin_command(command, params) -# # self.is_admin = False -# -# -# def process_old_format(self, cg_resources_requirements): -# """ -# Old format is njs,request_cpu=1,request_memory=1,request_disk=1,request_color=blue -# Regex is assumed to be true -# -# :param cg_resources_requirements: -# :return: -# """ -# cg_res_req_split = cg_resources_requirements.split(",") # List -# -# # Access and remove clientgroup from the statement -# client_group = cg_res_req_split.pop(0) -# -# requirements = dict() -# for item in cg_res_req_split: -# (req, value) = item.split("=") -# requirements[req] = value -# -# # Set up default resources -# resources = self.get_default_resources(client_group) -# -# if client_group is None or client_group is "": -# client_group = resources[self.CG] -# -# requirements_statement = [] -# -# for key, value in requirements.items(): -# if key in resources: -# # Overwrite the resources with catalog entries -# resources[key] = value -# else: -# # Otherwise add it to the requirements statement -# requirements_statement.append(f"{key}={value}") -# -# # Delete special keys -# print(resources) -# print(requirements) -# -# del requirements[self.REQUEST_MEMORY] -# del requirements[self.REQUEST_CPUS] -# del requirements[self.REQUEST_DISK] -# -# # Set the clientgroup just in case it was blank -# -# # Add clientgroup to resources because it is special -# # Regex is enabled by 
default -# cge = f'regexp("{client_group}",CLIENTGROUP)' -# requirements_statement.append(cge) -# -# rv = dict() -# rv[self.CG] = client_group -# rv["client_group_expression"] = cge -# rv["requirements"] = "".join(requirements_statement) -# rv["requirements_statement"] = cge -# for key, value in resources.items(): -# rv[key] = value -# -# return rv -# -# -# def process_new_format(self, client_group_and_requirements): -# """ -# New format is {'client_group' : 'njs', 'request_cpu' : 1, 'request_disk' : -# :param client_group_and_requirements: -# :return: -# """ -# reqs = json.loads(client_group_and_requirements) -# -# def generate_requirements(self, cg_resources_requirements): -# print(cg_resources_requirements) -# if "{" in cg_resources_requirements: -# reqs = self.process_new_format(cg_resources_requirements) -# else: -# reqs = self.process_old_format(cg_resources_requirements) -# -# self.check_for_missing_requirements(reqs) -# -# return self.resource_requirements( -# request_cpus=reqs["request_cpus"], -# request_disk=reqs["request_disk"], -# request_memory=reqs["request_memory"], -# requirements_statement=reqs["requirements"], -# ) -# return r -# -# @staticmethod -# def check_for_missing_requirements(requirements): -# for item in ( -# "client_group_expression", -# "request_cpus", -# "request_disk", -# "request_memory", -# ): -# if item not in requirements: -# raise MissingCondorRequirementsException( -# f"{item} not found in requirements" -# ) -# -# def _process_requirements_new_format(self, requirements): -# requirements = dict() -# cg = requirements.get("client_group", "") -# if cg is "": -# # requirements[ -# -# if bool(requirements.get("regex", False)) is True: -# cg["client_group_requirement"] = f'regexp("{cg}",CLIENTGROUP)' -# else: -# cg["client_group_requirement"] = f"+CLIENTGROUP == {client_group} " diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index fdd718e89..caecb561a 100644 --- 
a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -6,6 +6,7 @@ from lib.execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.APIHelpers import GenerateFromConfig #END_HEADER @@ -59,9 +60,7 @@ def __init__(self, config): maxsize=self.ADMIN_ROLES_CACHE_SIZE, ttl=self.ADMIN_ROLES_CACHE_EXPIRE_TIME ) self.mongo_util = MongoUtil(config) - - - + self.gen_cfg = GenerateFromConfig(config) #END_CONSTRUCTOR pass @@ -227,7 +226,8 @@ def run_job(self, ctx, params): # return variables are: job_id #BEGIN run_job mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util ) @@ -298,7 +298,8 @@ def run_job_batch(self, ctx, params, batch_params): # return variables are: job_ids #BEGIN run_job_batch mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util ) @@ -326,7 +327,8 @@ def abandon_children(self, ctx, params): # return variables are: parent_and_child_ids #BEGIN abandon_children mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util ) @@ -408,7 +410,8 @@ def run_job_concierge(self, ctx, params, concierge_params): # return variables are: job_id #BEGIN run_job_concierge mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + 
self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) job_id = mr.run_job_concierge(params=params,concierge_params=concierge_params) @@ -478,8 +481,7 @@ def get_job_params(self, ctx, params): #BEGIN get_job_params mr = SDKMethodRunner( self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util @@ -508,8 +510,7 @@ def update_job_status(self, ctx, params): #BEGIN update_job_status mr = SDKMethodRunner( self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util @@ -547,8 +548,7 @@ def add_job_logs(self, ctx, params, lines): #BEGIN add_job_logs mr = SDKMethodRunner( self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util @@ -599,8 +599,7 @@ def get_job_logs(self, ctx, params): mr = SDKMethodRunner( self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util @@ -641,8 +640,7 @@ def finish_job(self, ctx, params): #BEGIN finish_job mr = SDKMethodRunner( self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util @@ -671,8 +669,7 @@ def start_job(self, ctx, params): #BEGIN start_job mr = 
SDKMethodRunner( self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util @@ -784,7 +781,8 @@ def check_job(self, ctx, params): # return variables are: job_state #BEGIN check_job mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) job_state = mr.check_job( @@ -990,7 +988,8 @@ def check_job_batch(self, ctx, params): # return variables are: returnVal #BEGIN check_job_batch mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.check_job_batch( @@ -1108,7 +1107,8 @@ def check_jobs(self, ctx, params): # return variables are: returnVal #BEGIN check_jobs mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.check_jobs( @@ -1228,7 +1228,8 @@ def check_workspace_jobs(self, ctx, params): # ctx is the context object # return variables are: returnVal #BEGIN check_workspace_jobs - mr = SDKMethodRunner(self.config, user_id=ctx["user_id"], token=ctx["token"], + mr = SDKMethodRunner(self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util) returnVal = mr.check_workspace_jobs( params.get("workspace_id"), @@ -1261,8 +1262,7 @@ def cancel_job(self, ctx, params): #BEGIN cancel_job mr = SDKMethodRunner( self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util @@ 
-1300,7 +1300,8 @@ def check_job_canceled(self, ctx, params): # return variables are: result #BEGIN check_job_canceled mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) result = mr.check_job_canceled(job_id=params["job_id"], as_admin=params.get('as_admin')) @@ -1327,8 +1328,7 @@ def get_job_status(self, ctx, params): #BEGIN get_job_status mr = SDKMethodRunner( self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util @@ -1455,7 +1455,8 @@ def check_jobs_date_range_for_user(self, ctx, params): # return variables are: returnVal #BEGIN check_jobs_date_range_for_user mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.check_jobs_date_range_for_user( @@ -1590,7 +1591,8 @@ def check_jobs_date_range_for_all(self, ctx, params): # return variables are: returnVal #BEGIN check_jobs_date_range_for_all mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.check_jobs_date_range_for_user( @@ -1624,7 +1626,8 @@ def handle_held_job(self, ctx, cluster_id): # return variables are: returnVal #BEGIN handle_held_job mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.handle_held_job(cluster_id=cluster_id) @@ -1646,7 +1649,8 @@ def is_admin(self, ctx): # return variables are: returnVal #BEGIN is_admin mr = SDKMethodRunner( - self.config, 
user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.get_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.check_is_admin() @@ -1671,7 +1675,8 @@ def get_admin_permission(self, ctx): # return variables are: returnVal #BEGIN get_admin_permission mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + self.config, + user_clients=self.get_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.get_admin_permission() @@ -1692,6 +1697,7 @@ def get_client_groups(self, ctx): # ctx is the context object # return variables are: client_groups #BEGIN get_client_groups + # TODO I think this needs to be actually extracted from the config file client_groups = ['njs', 'bigmem', 'bigmemlong', 'extreme', 'concierge', 'hpc', 'kb_upload', 'terabyte', 'multi_tb', 'kb_upload_bulk'] #END get_client_groups diff --git a/lib/execution_engine2/sdk/EE2Authentication.py b/lib/execution_engine2/sdk/EE2Authentication.py index 94ded3964..973103a67 100644 --- a/lib/execution_engine2/sdk/EE2Authentication.py +++ b/lib/execution_engine2/sdk/EE2Authentication.py @@ -5,6 +5,7 @@ from lib.execution_engine2.authorization.authstrategy import can_read_job, can_write_job from lib.execution_engine2.authorization.roles import AdminAuthUtil from lib.execution_engine2.db.models.models import Job +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE class JobPermissions(Enum): @@ -33,12 +34,12 @@ def _lookup_admin_permissions(self): aau = AdminAuthUtil(self.sdkmr.auth_url, self.sdkmr.admin_roles) p = aau.get_admin_role( token=self.sdkmr.token, - read_role=self.sdkmr.ADMIN_READ_ROLE, - write_role=self.sdkmr.ADMIN_WRITE_ROLE, + read_role=ADMIN_READ_ROLE, + write_role=ADMIN_WRITE_ROLE, ) - if p == self.sdkmr.ADMIN_READ_ROLE: + if p == ADMIN_READ_ROLE: return AdminPermissions.READ - elif p == self.sdkmr.ADMIN_WRITE_ROLE: + elif p == ADMIN_WRITE_ROLE: return AdminPermissions.WRITE 
else: return AdminPermissions.NONE @@ -149,16 +150,12 @@ def test_job_permissions( perm = False try: if level.value == JobPermissions.READ.value: - perm = can_read_job( - job, self.sdkmr.user_id, self.sdkmr.token, self.sdkmr.config - ) + perm = can_read_job(job, self.sdkmr.user_id, self.sdkmr.workspace_auth) self._update_job_permission_cache( job_id, self.sdkmr.user_id, level, perm ) elif level.value == JobPermissions.WRITE.value: - perm = can_write_job( - job, self.sdkmr.user_id, self.sdkmr.token, self.sdkmr.config - ) + perm = can_write_job(job, self.sdkmr.user_id, self.sdkmr.workspace_auth) self._update_job_permission_cache( job_id, self.sdkmr.user_id, level, perm ) diff --git a/lib/execution_engine2/sdk/EE2Constants.py b/lib/execution_engine2/sdk/EE2Constants.py index dff073399..7d3f3cd0f 100644 --- a/lib/execution_engine2/sdk/EE2Constants.py +++ b/lib/execution_engine2/sdk/EE2Constants.py @@ -1,9 +1,18 @@ from dataclasses import dataclass from typing import Optional, NamedTuple +# May want to make this configurable. Hardcoded for now as we want concierge data to be owned +# by this user. +# An alternative approach would be to configure a kbaseconcierge token in the config, and then +# specify an auth2 role that allows users to replace their token with the kbaseconcierge token +# when running jobs. Needs more thought. KBASE_CONCIERGE_USERNAME = "kbaseconcierge" CONCIERGE_CLIENTGROUP = "kbase_concierge" +# these also probably should be configurable. 
+ADMIN_READ_ROLE = "EE2_ADMIN_RO" +ADMIN_WRITE_ROLE = "EE2_ADMIN" + class JobError(NamedTuple): name: str diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 5ffcf01d6..33c259f44 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -143,7 +143,7 @@ def _check_ws_objects(self, source_objects) -> None: def _check_workspace_permissions(self, wsid): if wsid: - if not self.sdkmr.get_workspace_auth().can_write(wsid): + if not self.sdkmr.workspace_auth.can_write(wsid): self.logger.debug( f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {wsid}." ) @@ -152,7 +152,7 @@ def _check_workspace_permissions(self, wsid): ) def _check_workspace_permissions_list(self, wsids): - perms = self.sdkmr.get_workspace_auth().can_write_list(wsids) + perms = self.sdkmr.workspace_auth.can_write_list(wsids) bad_ws = [key for key in perms.keys() if perms[key] is False] if bad_ws: self.logger.debug( diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index eb4bfa5f6..c8e4c16ac 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -445,7 +445,7 @@ def check_jobs( "Checking for read permission to: {}".format(job_ids) ) perms = can_read_jobs( - jobs, self.sdkmr.user_id, self.sdkmr.token, self.sdkmr.config + jobs, self.sdkmr.user_id, self.sdkmr.workspace_auth ) except RuntimeError as e: self.sdkmr.logger.error( @@ -502,8 +502,7 @@ def check_workspace_jobs(self, workspace_id, exclude_fields=None, return_list=No if exclude_fields is None: exclude_fields = [] - ws_auth = self.sdkmr.get_workspace_auth() - if not ws_auth.can_read(workspace_id): + if not self.sdkmr.workspace_auth.can_read(workspace_id): self.sdkmr.logger.debug( f"User {self.sdkmr.user_id} doesn't have permission to read jobs in workspace {workspace_id}." 
) diff --git a/lib/execution_engine2/sdk/EE2StatusRange.py b/lib/execution_engine2/sdk/EE2StatusRange.py index ae64f2c15..715008801 100644 --- a/lib/execution_engine2/sdk/EE2StatusRange.py +++ b/lib/execution_engine2/sdk/EE2StatusRange.py @@ -74,18 +74,12 @@ def check_jobs_date_range_for_user( if offset is None: offset = 0 - if self.sdkmr.token is None: - raise AuthError("Please provide a token to check jobs date range") - - token_user = self.sdkmr.auth.get_user(self.sdkmr.token) - if user is None: - user = token_user - # Admins can view "ALL" or check_jobs for other users - if user != token_user: + if user != self.sdkmr.user_id: if not self.sdkmr.check_is_admin(): raise AuthError( - f"You are not authorized to view all records or records for others. user={user} token={token_user}" + "You are not authorized to view all records or records for others. " + + f"user={user} token={self.sdkmr.user_id}" ) dummy_ids = self._get_dummy_dates(creation_start_time, creation_end_time) @@ -161,6 +155,8 @@ def check_jobs_date_range_for_user( # TODO USE AS_PYMONGO() FOR SPEED # TODO Better define default fields # TODO Instead of SKIP use ID GT LT https://www.codementor.io/arpitbhayani/fast-and-efficient-pagination-in-mongodb-9095flbqr + # ^ this one is important - the workspace was DOSed by a single open narrative at one + # point due to skip abuse, which is why it was removed def _get_dummy_dates(self, creation_start_time, creation_end_time): diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 02440f791..14399dc9d 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -16,9 +16,7 @@ import dateutil -from installed_clients.WorkspaceClient import Workspace from installed_clients.authclient import KBaseAuth -from lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth from lib.execution_engine2.db.MongoUtil import MongoUtil from 
lib.execution_engine2.db.models.models import Job from lib.execution_engine2.exceptions import AuthError @@ -35,6 +33,7 @@ from lib.execution_engine2.utils.EE2Logger import get_logger from lib.execution_engine2.utils.KafkaUtils import KafkaClient from lib.execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.clients import UserClientSet class JobPermissions(Enum): @@ -53,33 +52,31 @@ class SDKMethodRunner: """ JOB_PERMISSION_CACHE_SIZE = 500 JOB_PERMISSION_CACHE_EXPIRE_TIME = 300 # seconds - ADMIN_READ_ROLE = "EE2_ADMIN_RO" - ADMIN_WRITE_ROLE = "EE2_ADMIN" def __init__( self, config, - user_id=None, - token=None, + user_clients: UserClientSet, job_permission_cache=None, admin_permissions_cache=None, mongo_util=None, ): + if not user_clients: + raise ValueError("user_clients is required") self.deployment_config_fp = os.environ["KB_DEPLOYMENT_CONFIG"] self.config = config self.mongo_util = mongo_util self.condor = None - self.workspace = None - self.workspace_auth = None + self.workspace = user_clients.workspace + self.workspace_auth = user_clients.workspace_auth self.admin_roles = config.get("admin_roles", ["EE2_ADMIN", "EE2_ADMIN_RO"]) self.catalog_utils = CatalogUtils( config["catalog-url"], config["catalog-token"] ) - self.workspace_url = config.get("workspace-url") self.auth_url = config.get("auth-url") self.auth = KBaseAuth(auth_url=config.get("auth-service-url")) - self.user_id = user_id - self.token = token + self.user_id = user_clients.user_id + self.token = user_clients.token self.debug = SDKMethodRunner.parse_bool_from_string(config.get("debug")) self.logger = get_logger() @@ -133,13 +130,6 @@ def get_jobs_status(self) -> EE2Status.JobsStatus: self._ee2_status = EE2Status.JobsStatus(self) return self._ee2_status - def get_workspace_auth(self) -> WorkspaceAuth: - if self.workspace_auth is None: - self.workspace_auth = WorkspaceAuth( - self.token, self.user_id, self.workspace_url - ) - return self.workspace_auth - def 
get_mongo_util(self) -> MongoUtil: if self.mongo_util is None: self.mongo_util = MongoUtil(self.config) @@ -150,11 +140,6 @@ def get_condor(self) -> Condor: self.condor = Condor(self.deployment_config_fp) return self.condor - def get_workspace(self) -> Workspace: - if self.workspace is None: - self.workspace = Workspace(token=self.token, url=self.workspace_url) - return self.workspace - # Permissions Decorators #TODO Verify these actually work #TODO add as_admin to these def allow_job_read(func): @@ -452,8 +437,7 @@ def check_workspace_jobs( if as_admin: self.check_as_admin(requested_perm=JobPermissions.READ) else: - ws_auth = self.get_workspace_auth() - if not ws_auth.can_read(workspace_id): + if not self.workspace_auth.can_read(workspace_id): self.logger.debug( f"User {self.user_id} doesn't have permission to read jobs in workspace {workspace_id}." ) diff --git a/lib/execution_engine2/utils/APIHelpers.py b/lib/execution_engine2/utils/APIHelpers.py new file mode 100644 index 000000000..65e66ed74 --- /dev/null +++ b/lib/execution_engine2/utils/APIHelpers.py @@ -0,0 +1,31 @@ +""" +Contains classes and fuctions for use with the EE2 SDK API class (e.g. the *Impl.py file). +""" + +from typing import Dict +from execution_engine2.utils.clients import UserClientSet, get_user_client_set + + +# this class is only tested as part of integration tests. +class GenerateFromConfig: + """ + Utility methods to generate constructs from the service configuration. + """ + + def __init__(self, cfg: Dict[str, str]): + """ + Create an instance from a configuration. + + cfg - the configuration. + """ + self.cfg = cfg + + def get_user_clients(self, ctx) -> UserClientSet: + """ + Create a user client set from an SDK context object. + + ctx - the context object. This is passed in to SDK methods in the *Impl.py file. It is + expected that the context object contains the user_id and token keys, and this method + will fail with a KeyError if it does not. 
+ """ + return get_user_client_set(self.cfg, ctx["user_id"], ctx["token"]) diff --git a/lib/execution_engine2/utils/clients.py b/lib/execution_engine2/utils/clients.py new file mode 100644 index 000000000..97aa13fb3 --- /dev/null +++ b/lib/execution_engine2/utils/clients.py @@ -0,0 +1,119 @@ +""" Contains the various clients EE2 needs to communicate with other services it depends on. """ + +# Note on testing - this class is not generally unit-testable, and is only tested fully in +# integration tests. + +from typing import Dict + +from execution_engine2.authorization.roles import AdminAuthUtil +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.utils.CatalogUtils import CatalogUtils +from execution_engine2.utils.Condor import Condor +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient + +from installed_clients.authclient import KBaseAuth +from installed_clients.WorkspaceClient import Workspace + + +class UserClientSet: + """ + Clients required by EE2 for communicating with other services that need to be instantiated + on a per user basis. Also contains the user credentials for ease of use. + """ + + def __init__( + self, + user_id: str, + token: str, + workspace: Workspace, + workspace_auth: WorkspaceAuth, + ): + """ + Initialize the client set. + + user_id - The user's ID. + token - The users's token + workspace - A workspace client initialized with the user's token. + workspace_auth - A workspace auth client initialized with the user's token. 
+ """ + if not user_id or not user_id.strip(): + raise ValueError("user_id is required") + if not token or not token.strip(): + raise ValueError("token is required") + if not workspace: + raise ValueError("workspace is required") + if not workspace_auth: + raise ValueError("workspace_auth is required") + self.user_id = user_id + self.token = token + self.workspace = workspace + self.workspace_auth = workspace_auth + + +def get_user_client_set(cfg: Dict[str, str], user_id: str, token: str): + """ + Create the client set from a configuration dictionary. + + cfg - the configuration dictionary + user_id - the ID of the user to be used to initialize the client set. + token - the token of the user to be used to initialize the client set. Note that the set + trusts that the token actually belongs to the user ID, and currently does not + independently check the validity of the user ID. + + Expected keys in config: + workspace-url - the URL of the kbase workspace service + """ + if not cfg: + raise ValueError("cfg is required") + # Do a check that the url actually points to the workspace? + # Also maybe consider passing in the workspace url rather than the dict, but the ClientSet + # below will need lots of params so a dict makes sense there, maybe keep the apis similar? + # TODO the client throws a 'X is not a valid url' error if the url isn't valid, improve + # by catching & rethrowing with a more clear message that the config is wrong + ws_url = cfg.get("workspace-url") # may want to make the keys constants? + if not ws_url or not ws_url.strip(): + raise ValueError("missing workspace-url in configuration") + workspace = Workspace(ws_url, token=token) + workspace_auth = WorkspaceAuth(user_id, workspace) + return UserClientSet(user_id, token, workspace, workspace_auth) + + +class ClientSet: + """ + Clients required by EE2 for communicating with other services. + + These are not user-specific and can be reused throughout the application. 
+ """ + + def __init__(self, cfg: Dict[str, str], cfg_path: str, debug: bool = False): + """ + Initialize the client set from a configuration dictionary. + + cfg - the configuration dictionary + cfg_path - the path to the configuration file + debug - set clients that support it to debug mode + + Expected keys in config: + auth-url - the root URL of the kbase auth service + catalog-url - the URL of the catalog service + catalog-token - a token to use with the catalog service. Ideally a service token + kafka-host - the host string for a Kafka service + slack-token - a token for contacting Slack + """ + # TODO seems like it'd make sense to init Condor from a config dict like everything else + self.condor = Condor(cfg_path) + self.catalog_utils = CatalogUtils(cfg["catalog-url"], cfg["catalog-token"]) + auth_url = cfg["auth-url"] + self.auth = KBaseAuth(auth_url=auth_url + "/api/legacy/KBase/Sessions/Login") + # TODO using hardcoded roles for now to avoid possible bugs with mismatched cfg roles + # these should probably be configurable + self.auth_admin = AdminAuthUtil(auth_url, [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE]) + + # KafkaClient has a nice error message when the arg is None + self.kafka_client = KafkaClient(cfg.get("kafka-host")) + # SlackClient handles None arguments + self.slack_client = SlackClient( + cfg.get("slack-token"), debug=debug, endpoint=cfg.get("ee2-url") + ) diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index abf89cbe3..b2c3edc6d 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -3,17 +3,22 @@ import unittest from configparser import ConfigParser +from unittest.mock import create_autospec + import bson from mock import MagicMock from mock import patch from installed_clients.CatalogClient import Catalog +from installed_clients.WorkspaceClient import Workspace from lib.execution_engine2.authorization.roles import AdminAuthUtil from 
lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from lib.execution_engine2.db.models.models import Status from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.Condor import Condor from lib.execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.utils.clients import get_user_client_set, UserClientSet from test.utils_shared.test_utils import ( get_sample_job_params, get_sample_condor_info, @@ -41,7 +46,7 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token + cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) ) def setUp(self) -> None: @@ -82,11 +87,11 @@ def tearDown(self) -> None: self.condor_patch.stop() self.condor_patch2.start() - def getRunner(self) -> SDKMethodRunner: + def getRunner(self, user_clients=None) -> SDKMethodRunner: # Initialize these clients from None - runner = SDKMethodRunner( - self.cfg, user_id=self.user_id, token=self.token - ) # type : SDKMethodRunner + if not user_clients: + user_clients = get_user_client_set(self.cfg, self.user_id, self.token) + runner = SDKMethodRunner(self.cfg, user_clients) # type : SDKMethodRunner runner.get_jobs_status() runner.get_runjob() runner.get_job_logs() @@ -104,13 +109,23 @@ def get_runner_with_condor(self) -> SDKMethodRunner: # TODO How do you test ADMIN_MODE without increasing too much coverage + def get_mocks( + self, user_id=None, token="fake_token" + ) -> (UserClientSet, Workspace, WorkspaceAuth): + user_id = user_id if user_id else self.user_id + ws = create_autospec(Workspace, instance=True, spec_set=True) + wsa = create_autospec(WorkspaceAuth, instance=True, spec_set=True) + ucs = UserClientSet(user_id, token, ws, wsa) + return ucs, ws, wsa + @patch.object(Catalog, "get_module_version", return_value="module.version") - 
@patch.object(WorkspaceAuth, "can_write", return_value=True) @patch.object(AdminAuthUtil, "_fetch_user_roles") - def test_regular_user(self, aau, workspace, catalog): + def test_regular_user(self, aau, catalog): # Regular User lowly_user = "Access Denied: You are not an administrator" - runner = self.getRunner() + user_client_set, _, ws_auth = self.get_mocks() + ws_auth.can_write.return_value = True + runner = self.getRunner(user_client_set) aau.return_value = ["RegularJoe"] method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -127,6 +142,7 @@ def test_regular_user(self, aau, workspace, catalog): job_id = runner.run_job(params=job_params_1, as_admin=False) self.assertTrue(bson.objectid.ObjectId.is_valid(job_id)) + ws_auth.can_write.assert_called_once_with(self.ws_id) # RUNJOB BUT ATTEMPT TO BE AN ADMIN with self.assertRaisesRegexp( @@ -189,7 +205,7 @@ def test_admin_writer(self, aau, workspace, catalog): # Admin User with WRITE runner = self.getRunner() - aau.return_value = [runner.ADMIN_READ_ROLE] + aau.return_value = [ADMIN_READ_ROLE] method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -201,7 +217,7 @@ def test_admin_writer(self, aau, workspace, catalog): runner = self.getRunner() # SET YOUR ADMIN STATUS HERE - aau.return_value = [runner.ADMIN_WRITE_ROLE] + aau.return_value = [ADMIN_WRITE_ROLE] method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -247,7 +263,7 @@ def test_admin_reader(self, aau): # Admin User with WRITE lowly_admin = r"Access Denied: You are a read-only admin. 
This function requires write access" runner = self.getRunner() - aau.return_value = [runner.ADMIN_READ_ROLE] + aau.return_value = [ADMIN_READ_ROLE] method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) diff --git a/test/tests_for_auth/ee2_authstrategy_test.py b/test/tests_for_auth/ee2_authstrategy_test.py index cae61d41e..a4cfd776d 100644 --- a/test/tests_for_auth/ee2_authstrategy_test.py +++ b/test/tests_for_auth/ee2_authstrategy_test.py @@ -15,6 +15,9 @@ custom_ws_perm_maker, ) +from installed_clients.WorkspaceClient import Workspace +from lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth + class AuthStrategyTestCase(unittest.TestCase): @classmethod @@ -73,13 +76,18 @@ def _mock_ws_deleted(self, rq_mock, ws_id): "POST", self.ws_url, [{"json": response, "status_code": 500}] ) + def _get_workspace_auth(self, token) -> WorkspaceAuth: + # TODO these tests can be converted to unit tests by mocking the WorkspaceAuth class + return WorkspaceAuth(self.user, Workspace(url=self.ws_url, token=token)) + @requests_mock.Mocker() def test_can_read_job_ok(self, rq_mock): rq_mock.add_matcher(custom_ws_perm_maker(self.user, self.ws_access)) (jobs, expected_perms) = self._generate_all_test_jobs(perm="read") for idx, job in enumerate(jobs): self.assertEqual( - expected_perms[idx], can_read_job(job, self.user, "foo", self.cfg) + expected_perms[idx], + can_read_job(job, self.user, self._get_workspace_auth("foo")), ) @requests_mock.Mocker() @@ -89,7 +97,7 @@ def test_can_read_job_fail(self, rq_mock): user=self.other_user, wsid=123, authstrat="kbaseworkspace" ) with self.assertRaises(RuntimeError) as e: - can_read_job(job, self.user, "token", self.cfg) + can_read_job(job, self.user, self._get_workspace_auth("token")) self.assertIn("Workspace 123 is deleted", str(e.exception)) @requests_mock.Mocker() @@ -98,7 +106,8 @@ def test_can_write_job_ok(self, rq_mock): (jobs, expected_perms) = 
self._generate_all_test_jobs(perm="write") for idx, job in enumerate(jobs): self.assertEqual( - expected_perms[idx], can_write_job(job, self.user, "foo", self.cfg) + expected_perms[idx], + can_write_job(job, self.user, self._get_workspace_auth("foo")), ) @requests_mock.Mocker() @@ -108,7 +117,7 @@ def test_can_write_job_fail(self, rq_mock): user=self.other_user, wsid=123, authstrat="kbaseworkspace" ) with self.assertRaises(RuntimeError) as e: - can_write_job(job, self.user, "token", self.cfg) + can_write_job(job, self.user, self._get_workspace_auth("token")) self.assertIn("Workspace 123 is deleted", str(e.exception)) @requests_mock.Mocker() @@ -121,7 +130,8 @@ def test_can_read_jobs_ok(self, rq_mock): (jobs, expected_perms) = self._generate_all_test_jobs(perm="read") for idx, job in enumerate(jobs): self.assertEqual( - [expected_perms[idx]], can_read_jobs([job], self.user, "foo", self.cfg) + [expected_perms[idx]], + can_read_jobs([job], self.user, self._get_workspace_auth("foo")), ) @requests_mock.Mocker() @@ -131,7 +141,7 @@ def test_can_read_jobs_fail(self, rq_mock): user=self.other_user, wsid=123, authstrat="kbaseworkspace" ) with self.assertRaises(RuntimeError) as e: - can_read_jobs([job], self.user, "token", self.cfg) + can_read_jobs([job], self.user, self._get_workspace_auth("token")) self.assertIn("Workspace 123 is deleted", str(e.exception)) @requests_mock.Mocker() @@ -140,7 +150,8 @@ def test_can_write_jobs_ok(self, rq_mock): (jobs, expected_perms) = self._generate_all_test_jobs(perm="write") for idx, job in enumerate(jobs): self.assertEqual( - [expected_perms[idx]], can_write_jobs([job], self.user, "foo", self.cfg) + [expected_perms[idx]], + can_write_jobs([job], self.user, self._get_workspace_auth("foo")), ) @requests_mock.Mocker() @@ -150,5 +161,5 @@ def test_can_write_jobs_fail(self, rq_mock): user=self.other_user, wsid=123, authstrat="kbaseworkspace" ) with self.assertRaises(RuntimeError) as e: - can_write_jobs([job], self.user, "token", self.cfg) + 
can_write_jobs([job], self.user, self._get_workspace_auth("token")) self.assertIn("Workspace 123 is deleted", str(e.exception)) diff --git a/test/tests_for_auth/ee2_workspaceauth_test.py b/test/tests_for_auth/ee2_workspaceauth_test.py index af4d614c0..4412afb3b 100644 --- a/test/tests_for_auth/ee2_workspaceauth_test.py +++ b/test/tests_for_auth/ee2_workspaceauth_test.py @@ -3,6 +3,7 @@ import requests_mock +from installed_clients.WorkspaceClient import Workspace from execution_engine2.authorization.workspaceauth import WorkspaceAuth from test.utils_shared.test_utils import read_config_into_dict @@ -37,13 +38,17 @@ def _mock_ws_deleted(self, rq_mock, ws_id): "POST", self.ws_url, [{"json": response, "status_code": 500}] ) + def _get_ws(self, token) -> Workspace: + # TODO these tests can be converted to unit tests by mocking the Workspace class + return Workspace(url=self.ws_url, token=token) + @requests_mock.Mocker() def test_can_read_ok(self, rq_mock): cases = {"123": True, "456": True, "789": False, "321": True} ws_id_map = {"123": "r", "456": "a", "789": "n", "321": "w"} for ws_id in ws_id_map.keys(): self._mock_ok_ws_perms(rq_mock, self.user, {ws_id: ws_id_map[ws_id]}) - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) perms = wsauth.can_read(ws_id) self.assertEqual(perms, cases[ws_id]) @@ -52,7 +57,7 @@ def test_can_read_fail(self, rq_mock): ws_id = 67890 self._mock_ws_deleted(rq_mock, ws_id) with self.assertRaises(RuntimeError) as e: - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) wsauth.can_read(ws_id) self.assertIn( "An error occurred while fetching user permissions from the Workspace", @@ -65,7 +70,7 @@ def test_can_write_ok(self, rq_mock): ws_id_map = {"123": "r", "456": "a", "789": "n", "321": "w"} for ws_id in ws_id_map.keys(): self._mock_ok_ws_perms(rq_mock, self.user, {ws_id: ws_id_map[ws_id]}) - wsauth = WorkspaceAuth("foo", 
self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) perms = wsauth.can_write(ws_id) self.assertEqual(perms, cases[ws_id]) @@ -74,7 +79,7 @@ def test_can_write_fail(self, rq_mock): ws_id = 67890 self._mock_ws_deleted(rq_mock, ws_id) with self.assertRaises(RuntimeError) as e: - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) wsauth.can_write(ws_id) self.assertIn( "An error occurred while fetching user permissions from the Workspace", @@ -86,7 +91,7 @@ def test_can_read_list_ok(self, rq_mock): ws_id_map = {"123": "r", "456": "a", "789": "n", "321": "w"} cases = {"123": True, "456": True, "789": False, "321": True} self._mock_ok_ws_perms(rq_mock, self.user, ws_id_map) - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) perms = wsauth.can_read_list(list(ws_id_map.keys())) self.assertEqual(perms, cases) @@ -95,7 +100,7 @@ def test_can_read_list_fail(self, rq_mock): ws_id = 67890 self._mock_ws_deleted(rq_mock, ws_id) with self.assertRaises(RuntimeError) as e: - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) wsauth.can_read_list([ws_id]) self.assertIn( "An error occurred while fetching user permissions from the Workspace", @@ -107,7 +112,7 @@ def test_can_write_list_ok(self, rq_mock): ws_id_map = {"123": "r", "456": "a", "789": "n", "321": "w"} cases = {"123": False, "456": True, "789": False, "321": True} self._mock_ok_ws_perms(rq_mock, self.user, ws_id_map) - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) perms = wsauth.can_write_list(list(ws_id_map.keys())) self.assertEqual(perms, cases) @@ -116,7 +121,7 @@ def test_can_write_list_fail(self, rq_mock): ws_id = 67890 self._mock_ws_deleted(rq_mock, ws_id) with self.assertRaises(RuntimeError) as e: - wsauth = WorkspaceAuth("foo", self.user, 
self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) wsauth.can_write_list([ws_id]) self.assertIn( "An error occurred while fetching user permissions from the Workspace", diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py index a70178c41..9a9b68d55 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py @@ -10,6 +10,7 @@ from lib.execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.db.models.models import Job, JobLog from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.clients import get_user_client_set from test.utils_shared.test_utils import ( bootstrap, run_job_adapter, @@ -35,7 +36,7 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token + cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index 5a0cd0933..c63310142 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -9,6 +9,7 @@ from datetime import datetime, timedelta from pprint import pprint from unittest.mock import patch +from pytest import raises import bson import dateutil @@ -22,12 +23,14 @@ from lib.execution_engine2.exceptions import InvalidStatusTransitionException from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from execution_engine2.utils.clients import get_user_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper from test.utils_shared.test_utils import ( bootstrap, 
get_example_job, validate_job_state, run_job_adapter, + assert_exception_correct, ) from tests_for_db.mongo_test_helper import MongoTestHelper @@ -37,6 +40,7 @@ from lib.execution_engine2.sdk.EE2Runjob import EE2RunJob +# TODO this isn't necessary with pytest, can just use regular old functions class ee2_SDKMethodRunner_test(unittest.TestCase): @classmethod def setUpClass(cls): @@ -60,7 +64,7 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token + cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -121,6 +125,14 @@ def create_job_rec(self): # self.assertEqual(len(git_commit_1), len(git_commit_2)) # self.assertNotEqual(git_commit_1, git_commit_2) + def test_init_fail(self): + self._init_fail({}, None, ValueError("user_clients is required")) + + def _init_fail(self, cfg, user_clients, expected): + with raises(Exception) as e: + SDKMethodRunner(cfg, user_clients) + assert_exception_correct(e.value, expected) + # Status @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_cancel_job(self, condor): @@ -658,7 +670,8 @@ def test_check_job_global_perm(self, rq_mock): # now test with a different user other_method_runner = SDKMethodRunner( - self.cfg, user_id="some_other_user", token="other_token" + self.cfg, + get_user_client_set(self.cfg, "some_other_user", "other_token"), ) job_states = other_method_runner.get_jobs_status().check_workspace_jobs( self.ws_id diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 039edfdfa..33fc9389d 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -13,6 +13,7 @@ from lib.execution_engine2.db.models.models import Job from 
lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from execution_engine2.utils.clients import get_user_client_set from test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -50,7 +51,7 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token + cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) ) cls.mongo_util = MongoUtil(cls.cfg) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index c34ec4f0a..3ffc5e19f 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -13,6 +13,7 @@ from lib.execution_engine2.db.models.models import Job from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from execution_engine2.utils.clients import get_user_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper from test.utils_shared.test_utils import bootstrap, get_example_job @@ -45,7 +46,7 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token + cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) ) cls.cr = CondorResources( request_cpus="1", diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 3671d2c1e..0fa577495 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -9,7 +9,7 @@ from configparser import ConfigParser from unittest.mock import patch -from lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.authorization.workspaceauth import WorkspaceAuth 
from lib.execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.db.models.models import Job, Status from lib.execution_engine2.execution_engine2Impl import execution_engine2 @@ -17,6 +17,7 @@ from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.Condor import Condor from lib.execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.utils.clients import get_user_client_set from test.utils_shared.test_utils import ( bootstrap, get_sample_job_params, @@ -43,7 +44,7 @@ def setUpClass(cls): cls.ctx = {"token": cls.token, "user_id": cls.user_id} cls.impl = execution_engine2(cls.cfg) cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token + cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) diff --git a/test/tests_for_utils/clients_test.py b/test/tests_for_utils/clients_test.py new file mode 100644 index 000000000..3e109e73e --- /dev/null +++ b/test/tests_for_utils/clients_test.py @@ -0,0 +1,70 @@ +# This test only tests code that can be exercised without a network connection to services. +# That code is tested in integration tests. 
+ +from pytest import raises +from unittest.mock import create_autospec + +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.utils.clients import UserClientSet, get_user_client_set +from utils_shared.test_utils import assert_exception_correct +from installed_clients.WorkspaceClient import Workspace + + +def test_get_user_client_set_fail(): + ws_err = "missing workspace-url in configuration" + get_user_client_set_fail(None, "foo", "bar", ValueError("cfg is required")) + get_user_client_set_fail({}, "foo", "bar", ValueError("cfg is required")) + get_user_client_set_fail({"a": "b"}, "foo", "bar", ValueError(ws_err)) + get_user_client_set_fail({"workspace-url": None}, "foo", "bar", ValueError(ws_err)) + get_user_client_set_fail( + {"workspace-url": " \t "}, "foo", "bar", ValueError(ws_err) + ) + get_user_client_set_fail( + {"workspace-url": "https://ws.com"}, + None, + "bar", + ValueError("user_id is required"), + ) + get_user_client_set_fail( + {"workspace-url": "https://ws.com"}, + " \t ", + "bar", + ValueError("user_id is required"), + ) + get_user_client_set_fail( + {"workspace-url": "https://ws.com"}, + "foo", + None, + ValueError("token is required"), + ) + get_user_client_set_fail( + {"workspace-url": "https://ws.com"}, + "foo", + " \t ", + ValueError("token is required"), + ) + + +def get_user_client_set_fail(cfg, user, token, expected): + with raises(Exception) as e: + get_user_client_set(cfg, user, token) + assert_exception_correct(e.value, expected) + + +def test_user_client_set_init_fail(): + ws = create_autospec(Workspace, spec_set=True, instance=True) + wsa = WorkspaceAuth("u", ws) + user_client_set_init_fail(None, "t", ws, wsa, ValueError("user_id is required")) + user_client_set_init_fail(" \t ", "t", ws, wsa, ValueError("user_id is required")) + user_client_set_init_fail("u", None, ws, wsa, ValueError("token is required")) + user_client_set_init_fail("u", " \t ", ws, wsa, ValueError("token is required")) + 
user_client_set_init_fail("u", "t", None, wsa, ValueError("workspace is required")) + user_client_set_init_fail( + "u", "t", ws, None, ValueError("workspace_auth is required") + ) + + +def user_client_set_init_fail(user, token, ws_client, ws_auth, expected): + with raises(Exception) as e: + UserClientSet(user, token, ws_client, ws_auth) + assert_exception_correct(e.value, expected) diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index a49517f11..a2c4fe7b2 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -379,3 +379,8 @@ def get_sample_job_params(method=None, wsid="123"): } return job_params + + +def assert_exception_correct(got: Exception, expected: Exception): + assert got.args == expected.args + assert type(got) == type(expected) From 9785b5469557c1192049bf4b7e0daa9a8ec1b748 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Sun, 7 Feb 2021 15:05:17 -0800 Subject: [PATCH 008/109] Move boolean parsing to a separate module (#306) * Move boolean parsing to a separate module Ran into import cycles when making other changes. Probably makes more sense to have it in a separate module anyway. 
* remove unused import not sure why flake8 isn't catching this * Make boolean parser more robust to unusual inputs * Fix allowed types in parse_bool * Update documentation for parse_bool --- lib/execution_engine2/sdk/EE2Status.py | 3 +- lib/execution_engine2/sdk/EE2StatusRange.py | 3 +- lib/execution_engine2/sdk/SDKMethodRunner.py | 17 +----- lib/execution_engine2/utils/arg_processing.py | 36 ++++++++++++ test/tests_for_utils/arg_processing_test.py | 57 +++++++++++++++++++ 5 files changed, 99 insertions(+), 17 deletions(-) create mode 100644 lib/execution_engine2/utils/arg_processing.py create mode 100644 test/tests_for_utils/arg_processing_test.py diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index c8e4c16ac..96aeb70f6 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -18,6 +18,7 @@ ErrorCode, TerminatedCode, ) +from execution_engine2.utils.arg_processing import parse_bool from lib.execution_engine2.utils.KafkaUtils import ( KafkaCancelJob, KafkaCondorCommand, @@ -486,7 +487,7 @@ def check_jobs( {job_id: job_states.get(job_id, []) for job_id in job_ids} ) - if return_list is not None and self.sdkmr.parse_bool_from_string(return_list): + if return_list is not None and parse_bool(return_list): job_states = {"job_states": list(job_states.values())} return job_states diff --git a/lib/execution_engine2/sdk/EE2StatusRange.py b/lib/execution_engine2/sdk/EE2StatusRange.py index 715008801..ac5f17d97 100644 --- a/lib/execution_engine2/sdk/EE2StatusRange.py +++ b/lib/execution_engine2/sdk/EE2StatusRange.py @@ -6,6 +6,7 @@ from bson import ObjectId +from execution_engine2.utils.arg_processing import parse_bool from lib.execution_engine2.db.models.models import Job from lib.execution_engine2.exceptions import AuthError @@ -187,7 +188,7 @@ def get_sort_order(self, ascending): if ascending is None: return "+" else: - if self.sdkmr.parse_bool_from_string(ascending): + if
parse_bool(ascending): return "+" else: return "-" diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 14399dc9d..97cef00d8 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -8,7 +8,6 @@ * Clients are only loaded if they are necessary """ -import json import os import time from datetime import datetime @@ -34,6 +33,7 @@ from lib.execution_engine2.utils.KafkaUtils import KafkaClient from lib.execution_engine2.utils.SlackUtils import SlackClient from execution_engine2.utils.clients import UserClientSet +from execution_engine2.utils.arg_processing import parse_bool class JobPermissions(Enum): @@ -77,7 +77,7 @@ def __init__( self.auth = KBaseAuth(auth_url=config.get("auth-service-url")) self.user_id = user_clients.user_id self.token = user_clients.token - self.debug = SDKMethodRunner.parse_bool_from_string(config.get("debug")) + self.debug = parse_bool(config.get("debug")) self.logger = get_logger() self.job_permission_cache = EE2Authentication.EE2Auth.get_cache( @@ -459,19 +459,6 @@ def check_workspace_jobs( return job_states - @staticmethod - def parse_bool_from_string(str_or_bool): - if isinstance(str_or_bool, bool): - return str_or_bool - - if isinstance(str_or_bool, int): - return str_or_bool - - if isinstance(json.loads(str_or_bool.lower()), bool): - return json.loads(str_or_bool.lower()) - - raise Exception("Not a boolean value") - @staticmethod def check_and_convert_time(time_input, assign_default_time=False): """ diff --git a/lib/execution_engine2/utils/arg_processing.py b/lib/execution_engine2/utils/arg_processing.py new file mode 100644 index 000000000..252822838 --- /dev/null +++ b/lib/execution_engine2/utils/arg_processing.py @@ -0,0 +1,36 @@ +""" +Functions for processing arguments / parameters, including argument validity checkers and +normalizers. 
+""" + +from typing import Union + + +def parse_bool(putative_bool: Union[str, bool, int, float, None]) -> bool: + """ + Parse a string, bool, int, or float to a boolean value. + Strings containing 'true' or 'false', regardless of capitalization, are considered booleans. + Strings containing ints or floats are parsed to floats before processing. + + Raises ValueError if the value cannot be parsed. + """ + pb = putative_bool + if pb is None: + return False + + if isinstance(pb, bool) or isinstance(pb, int) or isinstance(pb, float): + return bool(pb) + + if isinstance(pb, str): + try: + return bool(float(pb)) + except ValueError: + pass # check for 'true' and 'false' strings next + # they're more likely and if we really wanted to optimize they should go first. + # probably doesn't matter at all and it makes the code a bit simpler + if pb.lower() == "true": + return True + if pb.lower() == "false": + return False + + raise ValueError(f"{pb} is not a boolean value") diff --git a/test/tests_for_utils/arg_processing_test.py b/test/tests_for_utils/arg_processing_test.py new file mode 100644 index 000000000..11164fad1 --- /dev/null +++ b/test/tests_for_utils/arg_processing_test.py @@ -0,0 +1,57 @@ +from pytest import raises + +from execution_engine2.utils.arg_processing import parse_bool +from utils_shared.test_utils import assert_exception_correct + + +def test_parse_bool_success(): + testcases = { + None: False, + True: True, + False: False, + # ints + -1: True, + 1: True, + 0: False, + 100: True, + -100: True, + # floats + -1.3: True, + 1.7: True, + 100.89: True, + -100.7: True, + # ints as strings + "-1": True, + "1": True, + "0": False, + "100": True, + "-100": True, + # floats as strings + "-1.3": True, + "1.7": True, + "0.0": False, + "100.89": True, + "-100.7": True, + # booleans as strings + "True": True, + "TRUE": True, + "true": True, + "False": False, + "FALSE": False, + "false": False, + } + + for arg, expected in testcases.items(): + assert parse_bool(arg) is 
expected, f"Testcase: {arg}" + + # can't go in the hash since equivalent to 0 + assert parse_bool(0.0) is False + + +def test_parse_bool_failure(): + testcases = ["Truthy", "fawlse", " ", "f1", "f1.3"] + + for tc in testcases: + with raises(Exception) as e: + parse_bool(tc) + assert_exception_correct(e.value, ValueError(f"{tc} is not a boolean value")) From a3042d77387886d5a1506a53ffffd767d3115cf8 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 11 Feb 2021 11:22:18 -0800 Subject: [PATCH 009/109] Fix timezone related bug / test failures (#308) * Fix timezone related bug / test failures Parts of the code were using naive datetimes for epoch time processing. This meant that when run in an environment where the timezone was UTC (like the docker container built from the provided dockerfile) everything worked. However, when run in an environment with a different timezone (like my laptop, PST) the tests would fail. There were two problems. Firstly, the test was creating a naive timestamp using utcnow() instead of now(), which causes the underlying timestamp to differ when regenerated from the datetime object: ``` In [1]: from datetime import datetime, timezone In [2]: utcnow = datetime.utcnow() In [3]: now = datetime.now(tz=timezone.utc) In [4]: print(utcnow, utcnow.timestamp()) 2021-02-11 01:53:58.919172 1613037238.919172 In [5]: print(now, now.timestamp()) 2021-02-11 01:54:01.017436+00:00 1613008441.017436 ``` Note that `now` is timezone aware, while `utcnow` is not.
Secondly, timestamps were being parsed without timezone awareness in the _get_dummy_dates() method, which caused the MongoDB ObjectIDs to be generated from the datetime incorrectly: ``` In [24]: ts = datetime.fromtimestamp(100000) In [25]: ts Out[25]: datetime.datetime(1970, 1, 1, 19, 46, 40) In [26]: ObjectId.from_datetime(ts) Out[26]: ObjectId('000116200000000000000000') In [27]: ts = datetime.fromtimestamp(100000, tz=timezone.utc) In [28]: ts Out[28]: datetime.datetime(1970, 1, 2, 3, 46, 40, tzinfo=datetime.timezone.utc) In [29]: ObjectId.from_datetime(ts) Out[29]: ObjectId('000186a00000000000000000') ``` This commit also adds a Pipfile for ease of use of dependency managers that take that format. * Forgot to run black --- Pipfile | 83 ++ Pipfile.lock | 963 ++++++++++++++++++ Pipfile_notes.md | 5 + lib/execution_engine2/sdk/EE2StatusRange.py | 8 +- .../ee2_SDKMethodRunner_test.py | 34 +- 5 files changed, 1079 insertions(+), 14 deletions(-) create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 Pipfile_notes.md diff --git a/Pipfile b/Pipfile new file mode 100644 index 000000000..86fe313c5 --- /dev/null +++ b/Pipfile @@ -0,0 +1,83 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +aiofiles = "==0.4.0" +aiohttp = "==3.6.3" +asn1crypto = "==1.3.0" +async-timeout = "==3.0.1" +attrs = "==20.2.0" +cachetools = "==3.1.1" +certifi = "==2019.6.16" +cffi = "==1.14.0" +chardet = "==3.0.4" +codecov = "==2.0.15" +configparser = "==3.7.4" +confluent-kafka = "==1.5.0" +coverage = "==4.5.3" +cryptography = "==3.2" +docker = "==4.3.1" +gevent = "==20.9.0" +gprof2dot = "==2019.11.30" +greenlet = "==0.4.17" +gunicorn = "==20.0.4" +h11 = "==0.8.1" +h2 = "==3.1.0" +hpack = "==3.0.0" +htcondor = "==8.9.8" +httpcore = "==0.3.0" +httptools = "==0.0.13" +hyperframe = "==5.2.0" +idna = "==2.8" +importlib-metadata = "==2.0.0" +iniconfig = "==1.1.1" +memory-profiler = "==0.55.0" +mock = "==3.0.5" 
+mongoengine = "==0.18.2" +multidict = "==4.5.2" +nose = "==1.3.7" +pluggy = "==0.13.1" +psutil = "==5.6.6" +py = "==1.9.0" +pycosat = "==0.6.3" +pycparser = "==2.19" +pymongo = "==3.8.0" +pyparsing = "==2.4.7" +pytest = "==6.1.1" +pytest-cov = "==2.8.1" +pytest-profiling = "==1.7.0" +python-dateutil = "==2.8.0" +python-dotenv = "==0.10.3" +requests = "==2.22.0" +requests-async = "==0.5.0" +requests-mock = "==1.7.0" +rfc3986 = "==1.3.2" +sanic = "==19.6.0" +sentry-sdk = "==0.14.3" +six = "==1.14.0" +slackclient = "==2.7.1" +toml = "==0.10.1" +tqdm = "==4.42.1" +typing-extensions = "==3.7.4.3" +ujson = "==1.35" +urllib3 = "==1.25.3" +uvloop = "==0.12.2" +websockets = "==6.0" +yarl = "==1.5.1" +zipp = "==3.3.1" +Jinja2 = "==2.10.3" +JSONRPCBase = "==0.2.0" +MarkupSafe = "==1.1.1" +pyOpenSSL = "==19.1.0" +PySocks = "==1.7.1" +"ruamel.yaml" = "==0.15.87" +websocket_client = "==0.57.0" +"zope.event" = "==4.5.0" +"zope.interface" = "==5.1.2" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 000000000..081653316 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,963 @@ +{ + "_meta": { + "hash": { + "sha256": "57c0c06ffdcb9f25ba60e8aa673adf703113ce8deeff306215c3c25c700e76a7" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "aiofiles": { + "hashes": [ + "sha256:021ea0ba314a86027c166ecc4b4c07f2d40fc0f4b3a950d1868a0f2571c2bbee", + "sha256:1e644c2573f953664368de28d2aa4c89dfd64550429d0c27c4680ccd3aa4985d" + ], + "index": "pypi", + "version": "==0.4.0" + }, + "aiohttp": { + "hashes": [ + "sha256:1a4160579ffbc1b69e88cb6ca8bb0fbd4947dfcbf9fb1e2a4fc4c7a4a986c1fe", + "sha256:206c0ccfcea46e1bddc91162449c20c72f308aebdcef4977420ef329c8fcc599", + "sha256:2ad493de47a8f926386fa6d256832de3095ba285f325db917c7deae0b54a9fc8", + 
"sha256:319b490a5e2beaf06891f6711856ea10591cfe84fe9f3e71a721aa8f20a0872a", + "sha256:470e4c90da36b601676fe50c49a60d34eb8c6593780930b1aa4eea6f508dfa37", + "sha256:60f4caa3b7f7a477f66ccdd158e06901e1d235d572283906276e3803f6b098f5", + "sha256:66d64486172b032db19ea8522328b19cfb78a3e1e5b62ab6a0567f93f073dea0", + "sha256:687461cd974722110d1763b45c5db4d2cdee8d50f57b00c43c7590d1dd77fc5c", + "sha256:698cd7bc3c7d1b82bb728bae835724a486a8c376647aec336aa21a60113c3645", + "sha256:797456399ffeef73172945708810f3277f794965eb6ec9bd3a0c007c0476be98", + "sha256:a885432d3cabc1287bcf88ea94e1826d3aec57fd5da4a586afae4591b061d40d", + "sha256:c506853ba52e516b264b106321c424d03f3ddef2813246432fa9d1cefd361c81", + "sha256:fb83326d8295e8840e4ba774edf346e87eca78ba8a89c55d2690352842c15ba5" + ], + "index": "pypi", + "version": "==3.6.3" + }, + "asn1crypto": { + "hashes": [ + "sha256:5a215cb8dc12f892244e3a113fe05397ee23c5c4ca7a69cd6e69811755efc42d", + "sha256:831d2710d3274c8a74befdddaf9f17fcbf6e350534565074818722d6d615b315" + ], + "index": "pypi", + "version": "==1.3.0" + }, + "async-timeout": { + "hashes": [ + "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f", + "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3" + ], + "index": "pypi", + "version": "==3.0.1" + }, + "attrs": { + "hashes": [ + "sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594", + "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc" + ], + "index": "pypi", + "version": "==20.2.0" + }, + "cachetools": { + "hashes": [ + "sha256:428266a1c0d36dc5aca63a2d7c5942e88c2c898d72139fca0e97fdd2380517ae", + "sha256:8ea2d3ce97850f31e4a08b0e2b5e6c34997d7216a9d2c98e0f3978630d4da69a" + ], + "index": "pypi", + "version": "==3.1.1" + }, + "certifi": { + "hashes": [ + "sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939", + "sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695" + ], + "index": "pypi", + "version": "==2019.6.16" 
+ }, + "cffi": { + "hashes": [ + "sha256:001bf3242a1bb04d985d63e138230802c6c8d4db3668fb545fb5005ddf5bb5ff", + "sha256:00789914be39dffba161cfc5be31b55775de5ba2235fe49aa28c148236c4e06b", + "sha256:028a579fc9aed3af38f4892bdcc7390508adabc30c6af4a6e4f611b0c680e6ac", + "sha256:14491a910663bf9f13ddf2bc8f60562d6bc5315c1f09c704937ef17293fb85b0", + "sha256:1cae98a7054b5c9391eb3249b86e0e99ab1e02bb0cc0575da191aedadbdf4384", + "sha256:2089ed025da3919d2e75a4d963d008330c96751127dd6f73c8dc0c65041b4c26", + "sha256:2d384f4a127a15ba701207f7639d94106693b6cd64173d6c8988e2c25f3ac2b6", + "sha256:337d448e5a725bba2d8293c48d9353fc68d0e9e4088d62a9571def317797522b", + "sha256:399aed636c7d3749bbed55bc907c3288cb43c65c4389964ad5ff849b6370603e", + "sha256:3b911c2dbd4f423b4c4fcca138cadde747abdb20d196c4a48708b8a2d32b16dd", + "sha256:3d311bcc4a41408cf5854f06ef2c5cab88f9fded37a3b95936c9879c1640d4c2", + "sha256:62ae9af2d069ea2698bf536dcfe1e4eed9090211dbaafeeedf5cb6c41b352f66", + "sha256:66e41db66b47d0d8672d8ed2708ba91b2f2524ece3dee48b5dfb36be8c2f21dc", + "sha256:675686925a9fb403edba0114db74e741d8181683dcf216be697d208857e04ca8", + "sha256:7e63cbcf2429a8dbfe48dcc2322d5f2220b77b2e17b7ba023d6166d84655da55", + "sha256:8a6c688fefb4e1cd56feb6c511984a6c4f7ec7d2a1ff31a10254f3c817054ae4", + "sha256:8c0ffc886aea5df6a1762d0019e9cb05f825d0eec1f520c51be9d198701daee5", + "sha256:95cd16d3dee553f882540c1ffe331d085c9e629499ceadfbda4d4fde635f4b7d", + "sha256:99f748a7e71ff382613b4e1acc0ac83bf7ad167fb3802e35e90d9763daba4d78", + "sha256:b8c78301cefcf5fd914aad35d3c04c2b21ce8629b5e4f4e45ae6812e461910fa", + "sha256:c420917b188a5582a56d8b93bdd8e0f6eca08c84ff623a4c16e809152cd35793", + "sha256:c43866529f2f06fe0edc6246eb4faa34f03fe88b64a0a9a942561c8e22f4b71f", + "sha256:cab50b8c2250b46fe738c77dbd25ce017d5e6fb35d3407606e7a4180656a5a6a", + "sha256:cef128cb4d5e0b3493f058f10ce32365972c554572ff821e175dbc6f8ff6924f", + "sha256:cf16e3cf6c0a5fdd9bc10c21687e19d29ad1fe863372b5543deaec1039581a30", + 
"sha256:e56c744aa6ff427a607763346e4170629caf7e48ead6921745986db3692f987f", + "sha256:e577934fc5f8779c554639376beeaa5657d54349096ef24abe8c74c5d9c117c3", + "sha256:f2b0fa0c01d8a0c7483afd9f31d7ecf2d71760ca24499c8697aeb5ca37dc090c" + ], + "index": "pypi", + "version": "==1.14.0" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "index": "pypi", + "version": "==3.0.4" + }, + "codecov": { + "hashes": [ + "sha256:8ed8b7c6791010d359baed66f84f061bba5bd41174bf324c31311e8737602788", + "sha256:ae00d68e18d8a20e9c3288ba3875ae03db3a8e892115bf9b83ef20507732bed4" + ], + "index": "pypi", + "version": "==2.0.15" + }, + "configparser": { + "hashes": [ + "sha256:8be81d89d6e7b4c0d4e44bcc525845f6da25821de80cb5e06e7e0238a2899e32", + "sha256:da60d0014fd8c55eb48c1c5354352e363e2d30bbf7057e5e171a468390184c75" + ], + "index": "pypi", + "version": "==3.7.4" + }, + "confluent-kafka": { + "hashes": [ + "sha256:00acc73f7d49961bf427f5e4fd6c0a220a6bfa5ccc91e0ad1f9ffa1751a169b0", + "sha256:0a59afbb90bdd22b9acdd3bb134f5ee1dff3cc5df55eaf52bf97b2f8d0d00de3", + "sha256:13b0e2011560f461ff39daf38089dd7f91404b3e66dba0456ccce0700f93c4f2", + "sha256:175c7064c8f19975616974558c45f42c147a202d4b1c0b0a83afefb920367696", + "sha256:22d7201d1aa89f1c5546749e781492925ed3eb0d7bd8f781fc57294cd45ddde3", + "sha256:3034cacc3b0d03eb3ce39cc5a64c1070d223870246f5d90c9113996be9db7df8", + "sha256:3e2d4f55ca952aeada3831d6615dc13a8a42c8e97175855ca08bbc6e6091b080", + "sha256:5a1c47320d6afc5b2599f8f8e143aed6845a2d903facde984606e02f10f11221", + "sha256:7b03bd9cc7b5e4df0a27eed359762c61a35313d4981ef1d9b418069eee454e66", + "sha256:85ff4823770ce2efaabb46d88e5ae26a840e0051fd481abaa805f21a5a84d003", + "sha256:9534cd2c0313df75b70eb4cf729382998970d97bbdda5cf3aef7081b855ccebe", + "sha256:99b13d0957a5967c85aee6138ef5f9acec90294267a549c5683744f20cf5d7b4", + 
"sha256:9a1c77291c1ac4b991aa0358f2f44636686eb8f52fb628502d30c312160a14e9", + "sha256:9ac812006000887f76c95b8a33a9f0b65845bf072fbc54a42a1acffd34e41120", + "sha256:9c47b8aacfe347bffd86bf75b98626718912b63df87f256dff1abc06a0355410", + "sha256:a116382ae67e0d6a54684bab4ee9b1be54e789d031a6e5e74c3edc657c79d23c", + "sha256:b1c89f3653385acc5da71570e03281f35ac6960367f2b2a426ae431deb1a1a35", + "sha256:bb77276d569f511abe4a5b32a53f8a30285bc7be68219e5711a44720bf356ac2", + "sha256:bbd9633552840ab9367fb762ea21272759db8caec2c34ff16ee28be177644cdf", + "sha256:bfdfa81e4e72d2c24e408a5e199aae0a477499ae40647dfa6906d002d9b07f38", + "sha256:c7461d6db081c23a6d38ceba348e7c178d7e974cf22c45ba8a4918ecb8855a44", + "sha256:d6a5d4c72360a75e875e88f7cce42b66a786d037ca2002303ab1c580d49caf53", + "sha256:dabed41cc60d1fc6d3cb44a90fe02e5192c9bf0f73c7b35761981e62ecabc592", + "sha256:dd544847c713eeeb525031348ff6ffea4ecdd11c13590893e599a9d4676a9bd4", + "sha256:eba169a9de8c978c9f33c763857c5279eceac46a4fd55a381c2528b9d4b3359e", + "sha256:f2d1ee0bfdf618017bbfaa42406546155c1a86263e4f286295318578c723803b" + ], + "index": "pypi", + "version": "==1.5.0" + }, + "coverage": { + "hashes": [ + "sha256:3684fabf6b87a369017756b551cef29e505cb155ddb892a7a29277b978da88b9", + "sha256:39e088da9b284f1bd17c750ac672103779f7954ce6125fd4382134ac8d152d74", + "sha256:3c205bc11cc4fcc57b761c2da73b9b72a59f8d5ca89979afb0c1c6f9e53c7390", + "sha256:465ce53a8c0f3a7950dfb836438442f833cf6663d407f37d8c52fe7b6e56d7e8", + "sha256:48020e343fc40f72a442c8a1334284620f81295256a6b6ca6d8aa1350c763bbe", + "sha256:5296fc86ab612ec12394565c500b412a43b328b3907c0d14358950d06fd83baf", + "sha256:5f61bed2f7d9b6a9ab935150a6b23d7f84b8055524e7be7715b6513f3328138e", + "sha256:68a43a9f9f83693ce0414d17e019daee7ab3f7113a70c79a3dd4c2f704e4d741", + "sha256:6b8033d47fe22506856fe450470ccb1d8ba1ffb8463494a15cfc96392a288c09", + "sha256:7ad7536066b28863e5835e8cfeaa794b7fe352d99a8cded9f43d1161be8e9fbd", + 
"sha256:7bacb89ccf4bedb30b277e96e4cc68cd1369ca6841bde7b005191b54d3dd1034", + "sha256:839dc7c36501254e14331bcb98b27002aa415e4af7ea039d9009409b9d2d5420", + "sha256:8f9a95b66969cdea53ec992ecea5406c5bd99c9221f539bca1e8406b200ae98c", + "sha256:932c03d2d565f75961ba1d3cec41ddde00e162c5b46d03f7423edcb807734eab", + "sha256:988529edadc49039d205e0aa6ce049c5ccda4acb2d6c3c5c550c17e8c02c05ba", + "sha256:998d7e73548fe395eeb294495a04d38942edb66d1fa61eb70418871bc621227e", + "sha256:9de60893fb447d1e797f6bf08fdf0dbcda0c1e34c1b06c92bd3a363c0ea8c609", + "sha256:9e80d45d0c7fcee54e22771db7f1b0b126fb4a6c0a2e5afa72f66827207ff2f2", + "sha256:a545a3dfe5082dc8e8c3eb7f8a2cf4f2870902ff1860bd99b6198cfd1f9d1f49", + "sha256:a5d8f29e5ec661143621a8f4de51adfb300d7a476224156a39a392254f70687b", + "sha256:aca06bfba4759bbdb09bf52ebb15ae20268ee1f6747417837926fae990ebc41d", + "sha256:bb23b7a6fd666e551a3094ab896a57809e010059540ad20acbeec03a154224ce", + "sha256:bfd1d0ae7e292105f29d7deaa9d8f2916ed8553ab9d5f39ec65bcf5deadff3f9", + "sha256:c62ca0a38958f541a73cf86acdab020c2091631c137bd359c4f5bddde7b75fd4", + "sha256:c709d8bda72cf4cd348ccec2a4881f2c5848fd72903c185f363d361b2737f773", + "sha256:c968a6aa7e0b56ecbd28531ddf439c2ec103610d3e2bf3b75b813304f8cb7723", + "sha256:df785d8cb80539d0b55fd47183264b7002077859028dfe3070cf6359bf8b2d9c", + "sha256:f406628ca51e0ae90ae76ea8398677a921b36f0bd71aab2099dfed08abd0322f", + "sha256:f46087bbd95ebae244a0eda01a618aff11ec7a069b15a3ef8f6b520db523dcf1", + "sha256:f8019c5279eb32360ca03e9fac40a12667715546eed5c5eb59eb381f2f501260", + "sha256:fc5f4d209733750afd2714e9109816a29500718b32dd9a5db01c0cb3a019b96a" + ], + "index": "pypi", + "version": "==4.5.3" + }, + "cryptography": { + "hashes": [ + "sha256:22f8251f68953553af4f9c11ec5f191198bc96cff9f0ac5dd5ff94daede0ee6d", + "sha256:284e275e3c099a80831f9898fb5c9559120d27675c3521278faba54e584a7832", + "sha256:3e17d02941c0f169c5b877597ca8be895fca0e5e3eb882526a74aa4804380a98", + 
"sha256:52a47e60953679eea0b4d490ca3c241fb1b166a7b161847ef4667dfd49e7699d", + "sha256:57b8c1ed13b8aa386cabbfde3be175d7b155682470b0e259fecfe53850967f8a", + "sha256:6a8f64ed096d13f92d1f601a92d9fd1f1025dc73a2ca1ced46dcf5e0d4930943", + "sha256:6e8a3c7c45101a7eeee93102500e1b08f2307c717ff553fcb3c1127efc9b6917", + "sha256:7ef41304bf978f33cfb6f43ca13bb0faac0c99cda33693aa20ad4f5e34e8cb8f", + "sha256:87c2fffd61e934bc0e2c927c3764c20b22d7f5f7f812ee1a477de4c89b044ca6", + "sha256:88069392cd9a1e68d2cfd5c3a2b0d72a44ef3b24b8977a4f7956e9e3c4c9477a", + "sha256:8a0866891326d3badb17c5fd3e02c926b635e8923fa271b4813cd4d972a57ff3", + "sha256:8f0fd8b0751d75c4483c534b209e39e918f0d14232c0d8a2a76e687f64ced831", + "sha256:9a07e6d255053674506091d63ab4270a119e9fc83462c7ab1dbcb495b76307af", + "sha256:9a8580c9afcdcddabbd064c0a74f337af74ff4529cdf3a12fa2e9782d677a2e5", + "sha256:bd80bc156d3729b38cb227a5a76532aef693b7ac9e395eea8063ee50ceed46a5", + "sha256:d1cbc3426e6150583b22b517ef3720036d7e3152d428c864ff0f3fcad2b97591", + "sha256:e15ac84dcdb89f92424cbaca4b0b34e211e7ce3ee7b0ec0e4f3c55cee65fae5a", + "sha256:e4789b84f8dedf190148441f7c5bfe7244782d9cbb194a36e17b91e7d3e1cca9", + "sha256:f01c9116bfb3ad2831e125a73dcd957d173d6ddca7701528eff1e7d97972872c", + "sha256:f0e3986f6cce007216b23c490f093f35ce2068f3c244051e559f647f6731b7ae", + "sha256:f2aa3f8ba9e2e3fd49bd3de743b976ab192fbf0eb0348cebde5d2a9de0090a9f", + "sha256:fb70a4cedd69dc52396ee114416a3656e011fb0311fca55eb55c7be6ed9c8aef" + ], + "index": "pypi", + "version": "==3.2" + }, + "docker": { + "hashes": [ + "sha256:13966471e8bc23b36bfb3a6fb4ab75043a5ef1dac86516274777576bed3b9828", + "sha256:bad94b8dd001a8a4af19ce4becc17f41b09f228173ffe6a4e0355389eef142f2" + ], + "index": "pypi", + "version": "==4.3.1" + }, + "gevent": { + "hashes": [ + "sha256:10110d4881aec04f218c316cb796b18c8b2cac67ae0eb5b0c5780056757268a2", + "sha256:1628a403fc9c3ea9b35924638a4d4fbe236f60ecdf4e22ed133fbbaf0bc7cb6b", + 
"sha256:1cfa3674866294623e324fa5b76eba7b96744d1956a605cfe24d26c5cd890f91", + "sha256:2269574444113cb4ca1c1808ab9460a87fe25e1c34a6e36d975d4af46e4afff9", + "sha256:283a021a2e14adfad718346f18982b80569d9c3a59e97cfae1b7d4c5b017941a", + "sha256:2aa70726ad1883fe7c17774e5ccc91ac6e30334efa29bafb9b8fe8ca6091b219", + "sha256:315a63a35068183dfb9bc0331c7bb3c265ee7db8a11797cbe98dadbdb45b5d35", + "sha256:324808a8558c733f7a9734525483795d52ca3bbd5662b24b361d81c075414b1f", + "sha256:33a63f230755c6813fca39d9cea2a8894df32df2ee58fd69d8bf8fcc1d8e018e", + "sha256:5f6d48051d336561ec08995431ee4d265ac723a64bba99cc58c3eb1a4d4f5c8d", + "sha256:8d338cd6d040fe2607e5305dd7991b5960b3780ae01f804c2ac5760d31d3b2c6", + "sha256:906175e3fb25f377a0b581e79d3ed5a7d925c136ff92fd022bb3013e25f5f3a9", + "sha256:93980e51dd2e5f81899d644a0b6ef4a73008c679fcedd50e3b21cc3451ba2424", + "sha256:9bb477f514cf39dc20651b479bf1ad4f38b9a679be2bfa3e162ec0c3785dfa2a", + "sha256:a8733a01974433d91308f8c44fa6cc13428b15bb39d46540657e260ff8852cb1", + "sha256:adbb267067f56696b2babced3d0856aa39dcf14b8ccd2dffa1fab587b00c6f80", + "sha256:afc177c37de41ce9c27d351ac84cbaf34407effcab5d6641645838f39d365be1", + "sha256:b07fcbca3e819296979d82fac3d8b44f0d5ced57b9a04dffcfd194da99c8eb2d", + "sha256:b2948566003a1030e47507755fe1f446995e8671c0c67571091539e01faf94cc", + "sha256:db208e74a32cff7f55f5aa1ba5d7d1c1a086a6325c8702ae78a5c741155552ff", + "sha256:dd4c6b2f540b25c3d0f277a725bc1a900ce30a681b90a081216e31f814be453b", + "sha256:e11de4b4d107ca2f35000eb08e9c4c4621c153103b400f48a9ea95b96d8c7e0b", + "sha256:eba19bae532d0c48d489fa16815b242ce074b1f4b63e8a8e663232cbe311ead9", + "sha256:fb33dc1ab27557bccd64ad4bf81e68c8b0d780fe937b1e2c0814558798137229" + ], + "index": "pypi", + "version": "==20.9.0" + }, + "gprof2dot": { + "hashes": [ + "sha256:b43fe04ebb3dfe181a612bbfc69e90555b8957022ad6a466f0308ed9c7f22e99" + ], + "index": "pypi", + "version": "==2019.11.30" + }, + "greenlet": { + "hashes": [ + 
"sha256:1023d7b43ca11264ab7052cb09f5635d4afdb43df55e0854498fc63070a0b206", + "sha256:124a3ae41215f71dc91d1a3d45cbf2f84e46b543e5d60b99ecc20e24b4c8f272", + "sha256:13037e2d7ab2145300676852fa069235512fdeba4ed1e3bb4b0677a04223c525", + "sha256:3af587e9813f9bd8be9212722321a5e7be23b2bc37e6323a90e592ab0c2ef117", + "sha256:41d8835c69a78de718e466dd0e6bfd4b46125f21a67c3ff6d76d8d8059868d6b", + "sha256:4481002118b2f1588fa3d821936ffdc03db80ef21186b62b90c18db4ba5e743b", + "sha256:47825c3a109f0331b1e54c1173d4e57fa000aa6c96756b62852bfa1af91cd652", + "sha256:5494e3baeacc371d988345fbf8aa4bd15555b3077c40afcf1994776bb6d77eaf", + "sha256:75e4c27188f28149b74e7685809f9227410fd15432a4438fc48627f518577fa5", + "sha256:97f2b01ab622a4aa4b3724a3e1fba66f47f054c434fbaa551833fa2b41e3db51", + "sha256:a34023b9eabb3525ee059f3bf33a417d2e437f7f17e341d334987d4091ae6072", + "sha256:ac85db59aa43d78547f95fc7b6fd2913e02b9e9b09e2490dfb7bbdf47b2a4914", + "sha256:be7a79988b8fdc5bbbeaed69e79cfb373da9759242f1565668be4fb7f3f37552", + "sha256:bee111161420f341a346731279dd976be161b465c1286f82cc0779baf7b729e8", + "sha256:ccd62f09f90b2730150d82f2f2ffc34d73c6ce7eac234aed04d15dc8a3023994", + "sha256:d3436110ca66fe3981031cc6aff8cc7a40d8411d173dde73ddaa5b8445385e2d", + "sha256:e495096e3e2e8f7192afb6aaeba19babc4fb2bdf543d7b7fed59e00c1df7f170", + "sha256:e66a824f44892bc4ec66c58601a413419cafa9cec895e63d8da889c8a1a4fa4a" + ], + "index": "pypi", + "version": "==0.4.17" + }, + "gunicorn": { + "hashes": [ + "sha256:1904bb2b8a43658807108d59c3f3d56c2b6121a701161de0ddf9ad140073c626", + "sha256:cd4a810dd51bf497552cf3f863b575dabd73d6ad6a91075b65936b151cbf4f9c" + ], + "index": "pypi", + "version": "==20.0.4" + }, + "h11": { + "hashes": [ + "sha256:acca6a44cb52a32ab442b1779adf0875c443c689e9e028f8d831a3769f9c5208", + "sha256:f2b1ca39bfed357d1f19ac732913d5f9faa54a5062eca7d2ec3a916cfb7ae4c7" + ], + "index": "pypi", + "version": "==0.8.1" + }, + "h2": { + "hashes": [ + 
"sha256:c8f387e0e4878904d4978cd688a3195f6b169d49b1ffa572a3d347d7adc5e09f", + "sha256:fd07e865a3272ac6ef195d8904de92dc7b38dc28297ec39cfa22716b6d62e6eb" + ], + "index": "pypi", + "version": "==3.1.0" + }, + "hpack": { + "hashes": [ + "sha256:0edd79eda27a53ba5be2dfabf3b15780928a0dff6eb0c60a3d6767720e970c89", + "sha256:8eec9c1f4bfae3408a3f30500261f7e6a65912dc138526ea054f9ad98892e9d2" + ], + "index": "pypi", + "version": "==3.0.0" + }, + "htcondor": { + "hashes": [ + "sha256:34ea1e214284aca5a06cee4d756c8873a1787477f4fe6a045d3e1a0b42702b52", + "sha256:3bed2a0c4138e37c6bf41b18a559b0513b90fac4a01f0cd97f99ce02b12d6e83", + "sha256:7acc1bde00339634806b3e35010b62ab605aa83bfa56ae1040301c6008983371", + "sha256:7c6dd6524a4f986801cc3a65c69c6b2946e9fa0e1243dffd004bfd52b56e06fa", + "sha256:937daed135d2153cd6d29562cf8253674df0c6748f2887f67e38ba9c42906e1f", + "sha256:b3f9e7557061fd6c3dd8a0ac7d75f045e0b99c6037c9bd7a120e271b6d79b02d", + "sha256:ca436eac7a27f353045278b7276f5146f3f41af618cb6c4234019e19ea6631a7" + ], + "index": "pypi", + "version": "==8.9.8" + }, + "httpcore": { + "hashes": [ + "sha256:96f910b528d47b683242ec207050c7bbaa99cd1b9a07f78ea80cf61e55556b58" + ], + "index": "pypi", + "version": "==0.3.0" + }, + "httptools": { + "hashes": [ + "sha256:e00cbd7ba01ff748e494248183abc6e153f49181169d8a3d41bb49132ca01dfc" + ], + "index": "pypi", + "version": "==0.0.13" + }, + "hyperframe": { + "hashes": [ + "sha256:5187962cb16dcc078f23cb5a4b110098d546c3f41ff2d4038a9896893bbd0b40", + "sha256:a9f5c17f2cc3c719b917c4f33ed1c61bd1f8dfac4b1bd23b7c80b3400971b41f" + ], + "index": "pypi", + "version": "==5.2.0" + }, + "idna": { + "hashes": [ + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + ], + "index": "pypi", + "version": "==2.8" + }, + "importlib-metadata": { + "hashes": [ + "sha256:77a540690e24b0305878c37ffd421785a6f7e53c8b5720d211b211de8d0e95da", + 
"sha256:cefa1a2f919b866c5beb7c9f7b0ebb4061f30a8a9bf16d609b000e2dfaceb9c3" + ], + "index": "pypi", + "version": "==2.0.0" + }, + "iniconfig": { + "hashes": [ + "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", + "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" + ], + "index": "pypi", + "version": "==1.1.1" + }, + "jinja2": { + "hashes": [ + "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", + "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + ], + "index": "pypi", + "version": "==2.10.3" + }, + "jsonrpcbase": { + "hashes": [ + "sha256:7ea67fc1a7c87756e9a876e18a342e431e80d0ef3ba867dfd6f3fac5bf3fcc0d" + ], + "index": "pypi", + "version": "==0.2.0" + }, + "markupsafe": { + "hashes": [ + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", + "sha256:195d7d2c4fbb0ee8139a6cf67194f3973a6b3042d742ebe0a9ed36d8b6f0c07f", + "sha256:22c178a091fc6630d0d045bdb5992d2dfe14e3259760e713c490da5323866c39", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:2beec1e0de6924ea551859edb9e7679da6e4870d32cb766240ce17e0a0ba2014", + "sha256:3b8a6499709d29c2e2399569d96719a1b21dcd94410a586a18526b143ec8470f", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", + "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + 
"sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", + "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:6f1e273a344928347c1290119b493a1f0303c52f5a5eae5f16d74f48c15d4a85", + "sha256:6fffc775d90dcc9aed1b89219549b329a9250d918fd0b8fa8d93d154918422e1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:7fed13866cf14bba33e7176717346713881f56d9d2bcebab207f7a036f41b850", + "sha256:84dee80c15f1b560d55bcfe6d47b27d070b4681c699c572af2e3c7cc90a3b8e0", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98bae9582248d6cf62321dcb52aaf5d9adf0bad3b40582925ef7c7f0ed85fceb", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + "sha256:a6a744282b7718a2a62d2ed9d993cad6f5f585605ad352c11de459f4108df0a1", + "sha256:acf08ac40292838b3cbbb06cfe9b2cb9ec78fce8baca31ddb87aaac2e2dc3bc2", + "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + "sha256:b1dba4527182c95a0db8b6060cc98ac49b9e2f5e64320e2b56e47cb2831978c7", + "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:b7d644ddb4dbd407d31ffb699f1d140bc35478da613b441c582aeb7c43838dd8", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", 
+ "sha256:bf5aa3cbcfdf57fa2ee9cd1822c862ef23037f5c832ad09cfea57fa846dec193", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:caabedc8323f1e93231b52fc32bdcde6db817623d33e100708d9a68e1f53b26b", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:d53bc011414228441014aa71dbec320c66468c1030aae3a6e29778a3382d96e5", + "sha256:d73a845f227b0bfe8a7455ee623525ee656a9e2e749e4742706d80a6065d5e2c", + "sha256:d9be0ba6c527163cbed5e0857c451fcd092ce83947944d6c14bc95441203f032", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be", + "sha256:feb7b34d6325451ef96bc0e36e1a6c0c1c64bc1fbec4b854f4529e51887b1621" + ], + "index": "pypi", + "version": "==1.1.1" + }, + "memory-profiler": { + "hashes": [ + "sha256:5fa47b274c929dd2cbcd9190afb62fec110701251d2ac2d301caaf545c81afc1" + ], + "index": "pypi", + "version": "==0.55.0" + }, + "mock": { + "hashes": [ + "sha256:83657d894c90d5681d62155c82bda9c1187827525880eda8ff5df4ec813437c3", + "sha256:d157e52d4e5b938c550f39eb2fd15610db062441a9c2747d3dbfa9298211d0f8" + ], + "index": "pypi", + "version": "==3.0.5" + }, + "mongoengine": { + "hashes": [ + "sha256:9301ca84ada9377a200a50541f9be7d5308081bf2112049d00e1dd163f80b940", + "sha256:fa3e73c966fca2b814cc1103ac4f55bcca7aae05028b112ef0cc8b321ee4a2f7" + ], + "index": "pypi", + "version": "==0.18.2" + }, + "multidict": { + "hashes": [ + "sha256:024b8129695a952ebd93373e45b5d341dbb87c17ce49637b34000093f243dd4f", + "sha256:041e9442b11409be5e4fc8b6a97e4bcead758ab1e11768d1e69160bdde18acc3", + "sha256:045b4dd0e5f6121e6f314d81759abd2c257db4634260abcfe0d3f7083c4908ef", + "sha256:047c0a04e382ef8bd74b0de01407e8d8632d7d1b4db6f2561106af812a68741b", + "sha256:068167c2d7bbeebd359665ac4fff756be5ffac9cda02375b5c5a7c4777038e73", + 
"sha256:148ff60e0fffa2f5fad2eb25aae7bef23d8f3b8bdaf947a65cdbe84a978092bc", + "sha256:1d1c77013a259971a72ddaa83b9f42c80a93ff12df6a4723be99d858fa30bee3", + "sha256:1d48bc124a6b7a55006d97917f695effa9725d05abe8ee78fd60d6588b8344cd", + "sha256:31dfa2fc323097f8ad7acd41aa38d7c614dd1960ac6681745b6da124093dc351", + "sha256:34f82db7f80c49f38b032c5abb605c458bac997a6c3142e0d6c130be6fb2b941", + "sha256:3d5dd8e5998fb4ace04789d1d008e2bb532de501218519d70bb672c4c5a2fc5d", + "sha256:4a6ae52bd3ee41ee0f3acf4c60ceb3f44e0e3bc52ab7da1c2b2aa6703363a3d1", + "sha256:4b02a3b2a2f01d0490dd39321c74273fed0568568ea0e7ea23e02bd1fb10a10b", + "sha256:4b843f8e1dd6a3195679d9838eb4670222e8b8d01bc36c9894d6c3538316fa0a", + "sha256:5de53a28f40ef3c4fd57aeab6b590c2c663de87a5af76136ced519923d3efbb3", + "sha256:61b2b33ede821b94fa99ce0b09c9ece049c7067a33b279f343adfe35108a4ea7", + "sha256:6a3a9b0f45fd75dc05d8e93dc21b18fc1670135ec9544d1ad4acbcf6b86781d0", + "sha256:76ad8e4c69dadbb31bad17c16baee61c0d1a4a73bed2590b741b2e1a46d3edd0", + "sha256:7ba19b777dc00194d1b473180d4ca89a054dd18de27d0ee2e42a103ec9b7d014", + "sha256:7c1b7eab7a49aa96f3db1f716f0113a8a2e93c7375dd3d5d21c4941f1405c9c5", + "sha256:7fc0eee3046041387cbace9314926aa48b681202f8897f8bff3809967a049036", + "sha256:8ccd1c5fff1aa1427100ce188557fc31f1e0a383ad8ec42c559aabd4ff08802d", + "sha256:8e08dd76de80539d613654915a2f5196dbccc67448df291e69a88712ea21e24a", + "sha256:c18498c50c59263841862ea0501da9f2b3659c00db54abfbf823a80787fde8ce", + "sha256:c49db89d602c24928e68c0d510f4fcf8989d77defd01c973d6cbe27e684833b1", + "sha256:ce20044d0317649ddbb4e54dab3c1bcc7483c78c27d3f58ab3d0c7e6bc60d26a", + "sha256:d1071414dd06ca2eafa90c85a079169bfeb0e5f57fd0b45d44c092546fcd6fd9", + "sha256:d3be11ac43ab1a3e979dac80843b42226d5d3cccd3986f2e03152720a4297cd7", + "sha256:db603a1c235d110c860d5f39988ebc8218ee028f07a7cbc056ba6424372ca31b" + ], + "index": "pypi", + "version": "==4.5.2" + }, + "nose": { + "hashes": [ + "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", 
+ "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a", + "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98" + ], + "index": "pypi", + "version": "==1.3.7" + }, + "packaging": { + "hashes": [ + "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5", + "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a" + ], + "version": "==20.9" + }, + "pluggy": { + "hashes": [ + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" + ], + "index": "pypi", + "version": "==0.13.1" + }, + "psutil": { + "hashes": [ + "sha256:06660136ab88762309775fd47290d7da14094422d915f0466e0adf8e4b22214e", + "sha256:0c11adde31011a286197630ba2671e34651f004cc418d30ae06d2033a43c9e20", + "sha256:0c211eec4185725847cb6c28409646c7cfa56fdb531014b35f97b5dc7fe04ff9", + "sha256:0fc7a5619b47f74331add476fbc6022d7ca801c22865c7069ec0867920858963", + "sha256:3004361c6b93dbad71330d992c1ae409cb8314a6041a0b67507cc882357f583e", + "sha256:5e8dbf31871b0072bcba8d1f2861c0ec6c84c78f13c723bb6e981bce51b58f12", + "sha256:6d81b9714791ef9a3a00b2ca846ee547fc5e53d259e2a6258c3d2054928039ff", + "sha256:724390895cff80add7a1c4e7e0a04d9c94f3ee61423a2dcafd83784fabbd1ee9", + "sha256:ad21281f7bd6c57578dd53913d2d44218e9e29fd25128d10ff7819ef16fa46e7", + "sha256:f21a7bb4b207e4e7c60b3c40ffa89d790997619f04bbecec9db8e3696122bc78", + "sha256:f60042bef7dc50a78c06334ca8e25580455948ba2fa98f240d034a4fed9141a5" + ], + "index": "pypi", + "version": "==5.6.6" + }, + "py": { + "hashes": [ + "sha256:366389d1db726cd2fcfc79732e75410e5fe4d31db13692115529d34069a043c2", + "sha256:9ca6883ce56b4e8da7e79ac18787889fa5206c79dcc67fb065376cd2fe03f342" + ], + "index": "pypi", + "version": "==1.9.0" + }, + "pycosat": { + "hashes": [ + "sha256:4c99874946a7e939bb941bbb019dd2c20e6068e3107c91366e7779c69d70e0ed" + ], + "index": "pypi", + "version": "==0.6.3" + }, + "pycparser": { + "hashes": [ + 
"sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" + ], + "index": "pypi", + "version": "==2.19" + }, + "pymongo": { + "hashes": [ + "sha256:32421df60d06f479d71b6b539642e410ece3006e8910688e68df962c8eb40a21", + "sha256:324b22a8443e11faca44c96b20e7ec8a9e59a1e664457edeeb4f796080b31cde", + "sha256:4505ff8b7923dd7a8bed1bf25c9c4d0df5ab0b8b2821f2296533f2149a55f401", + "sha256:460b224681ea711e48e3638d15be2249024031b7dcb9622ba19c2e85bd5a26cc", + "sha256:47473b70c5f3cd5ddd2c49ab3b9ceafdafbbed5bc963f147df22a9343d7978f5", + "sha256:49375839af76834e9c5c3cc78c78386873fd0b2ad9a0860a7dc4ec9fe73af9dd", + "sha256:4a65f0f71ece86c860d30a1436b646db8ea32aec518845ef2903ca569faec32e", + "sha256:530621906c5dd6d27305b39c4e017701e5f4299aa68b93cde70eb985f94ca26f", + "sha256:54f4770b5810e8dc3cbeed675874195f02bb2bc4e95a9d665068edfb3baff4f7", + "sha256:5ed9382410e938b0ff76041c34018210504729a83bcf4f6a70c7092c28169f6f", + "sha256:61cad83637ae12c1c825130d7f9325cd6c162e3a64e8747a8144866020be3ff4", + "sha256:61e8e1c58b4fdf47ab79b7c7db8bb022c1e40b3b5fcbbaeea5fc94dc5c75638d", + "sha256:6e04e496af7d156b66cce70460011c621ecbadf5dcdce325c7acbb3cd6ea245d", + "sha256:7ef89ec435e89da902451dde6845066fe2770befaf0301fe2a1ac426b51fced3", + "sha256:854e8425e5eb775ccfffad04ecd094c99923d60a2c2d49babb5c435e836a91fa", + "sha256:9569796d48498e4db4e1d56284b626a8ed15f641ce3a8b2085f06bb03f4c2c88", + "sha256:9d50c99c6388863cbfdc5db9bad62e3a7c2e5fc151554a07c7f3c2530334a34f", + "sha256:9ea016c2c011df21f77c1f806ce45129a344ba2d414bd50f9e065b13a4a134be", + "sha256:a8421f0823174888fb12a5fa675322e756499d71e77ff712b4412d4b8f3c6503", + "sha256:aef7d88384ada699976350a285c7a333f96ebc959e98e7d2c98589f47bbf3b7f", + "sha256:b4d7ff9957ee770cf03bd7156a68a2f2e838e60712d9608eadc8741c15d01e72", + "sha256:c1db85c39e6a60588f855dbc7bd68fb0dab796096148ab5aa4abecaff19e1c6e", + "sha256:cee2fc0b94e66e7230da12fc4b3d34793c49957e16ee04f6468a94e264a1e41d", + 
"sha256:cf1dea28379a16b23e47db312883f07b3ba8d9d6abc1c59e51d4c8ae1820ab43", + "sha256:d1cd175df7c8b5fc976bade78bf4d9fb5aa7ab465c0f59931e380bbe188ef8fc", + "sha256:d48a94edf3cdd34524936a72ea01b352682b337f33a42db10ba29a96c37147d3", + "sha256:d9cc103a4e97f78bc77a1d72759ab3722f6cdf0374ad4fb4b0c53bd3238bdf98", + "sha256:fcb9ae8aa9158106c5d98a4349ec0d90b68f052d620b2d24622ba03b91e4d81d" + ], + "index": "pypi", + "version": "==3.8.0" + }, + "pyopenssl": { + "hashes": [ + "sha256:621880965a720b8ece2f1b2f54ea2071966ab00e2970ad2ce11d596102063504", + "sha256:9a24494b2602aaf402be5c9e30a0b82d4a5c67528fe8fb475e3f3bc00dd69507" + ], + "index": "pypi", + "version": "==19.1.0" + }, + "pyparsing": { + "hashes": [ + "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1", + "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b" + ], + "index": "pypi", + "version": "==2.4.7" + }, + "pysocks": { + "hashes": [ + "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", + "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", + "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" + ], + "index": "pypi", + "version": "==1.7.1" + }, + "pytest": { + "hashes": [ + "sha256:7a8190790c17d79a11f847fba0b004ee9a8122582ebff4729a082c109e81a4c9", + "sha256:8f593023c1a0f916110285b6efd7f99db07d59546e3d8c36fc60e2ab05d3be92" + ], + "index": "pypi", + "version": "==6.1.1" + }, + "pytest-cov": { + "hashes": [ + "sha256:cc6742d8bac45070217169f5f72ceee1e0e55b0221f54bcf24845972d3a47f2b", + "sha256:cdbdef4f870408ebdbfeb44e63e07eb18bb4619fae852f6e760645fa36172626" + ], + "index": "pypi", + "version": "==2.8.1" + }, + "pytest-profiling": { + "hashes": [ + "sha256:93938f147662225d2b8bd5af89587b979652426a8a6ffd7e73ec4a23e24b7f29", + "sha256:999cc9ac94f2e528e3f5d43465da277429984a1c237ae9818f8cfd0b06acb019" + ], + "index": "pypi", + "version": "==1.7.0" + }, + "python-dateutil": { + "hashes": [ + 
"sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", + "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" + ], + "index": "pypi", + "version": "==2.8.0" + }, + "python-dotenv": { + "hashes": [ + "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", + "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + ], + "index": "pypi", + "version": "==0.10.3" + }, + "requests": { + "hashes": [ + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + ], + "index": "pypi", + "version": "==2.22.0" + }, + "requests-async": { + "hashes": [ + "sha256:8731420451383196ecf2fd96082bfc8ae5103ada90aba185888499d7784dde6f" + ], + "index": "pypi", + "version": "==0.5.0" + }, + "requests-mock": { + "hashes": [ + "sha256:510df890afe08d36eca5bb16b4aa6308a6f85e3159ad3013bac8b9de7bd5a010", + "sha256:88d3402dd8b3c69a9e4f9d3a73ad11b15920c6efd36bc27bf1f701cf4a8e4646" + ], + "index": "pypi", + "version": "==1.7.0" + }, + "rfc3986": { + "hashes": [ + "sha256:0344d0bd428126ce554e7ca2b61787b6a28d2bbd19fc70ed2dd85efe31176405", + "sha256:df4eba676077cefb86450c8f60121b9ae04b94f65f85b69f3f731af0516b7b18" + ], + "index": "pypi", + "version": "==1.3.2" + }, + "ruamel.yaml": { + "hashes": [ + "sha256:18078354bfcf00d51bcc17984aded80840379aed36036f078479e191b59bc059", + "sha256:211e6ef2530f44fc3197c713892678e7fbfbc40a1db6741179d6981514be1674", + "sha256:2e8f7cee12a2372cec4480fe81086b1fdab163f4b56e58b5592a105c52973b78", + "sha256:48cc8e948a7ec4917bf94adff2cc1255e98f1eef5e1961889886acc4ff3a7194", + "sha256:4a0c7f970aa0e30bc541f690fbd14aca19de1cab70787180de5083b902ec40b5", + "sha256:5dd0ea7c5c703e8675f3caf2898a50b4dadaa52838f8e104637a452a05e03030", + "sha256:612fb4833f1978ceb7fd7a24d86a5ebd103bcc408394f3af621293194658cf1b", + "sha256:61c421a7a2b8e2886a94fbe29866df6b99451998abaa1584b9fdc9c10c33e40b", + 
"sha256:6483416847980aa7090b697d177a8754c4f340683cc84abd38da7b850826687d", + "sha256:6622f3b0cae7ed6fe5d3d6a6d8d8cb9413a05b408d69a789a57b77a616bb6562", + "sha256:80b2acde0d1b9d25e5c041960a9149480c15c6d9f4c24b8ddb381b14e9e70ea4", + "sha256:8f9ed94be17f306485df8fd0274a30f130a73f127798657d4dc65b1f89ec7a36", + "sha256:9a6b94cc9b6e738036426498ac9fe8ca05afea4249fb9dec1be32ce4823d5756", + "sha256:a4b11dfe421a9836c723107a4ccc9cab9674de611ba60b8212e85526ea8bf254", + "sha256:a55e55c6ecb5725ba472f9b811940e8d258a32fb36f5793dbc38582d6f377f3f", + "sha256:a736ab1d8c2d5566254a1a2ee38e7c5460520bcccd4a8f0feb25a4463735e5a7", + "sha256:c29d0a3cffa5a25f5259bfeac06ffdc5e7d1fd38a0a26a6664d160192730434f", + "sha256:c33458217a8c352b59c86065c4f05f3f1ac28b01c3e1a422845c306237446bf3", + "sha256:cc9bd3c3fa8a928f7b6e19fe8de13a61deb91f257eccbe0d16114ce8c54cdc81", + "sha256:d63b7c828a7358ce5b03a3e2c2a3e5a7058a954f8919334cb09b3d8541d1fff6", + "sha256:fbd301680a3563e84d667042dac1c5d50ef402ecf1f4b1763507a6877b8181ad", + "sha256:fc67e79e2f5083be6fd1000c4646e13a891585772a503f56f51f845b547fe621" + ], + "index": "pypi", + "version": "==0.15.87" + }, + "sanic": { + "hashes": [ + "sha256:cc64978266025afb0e7c0f8be928e2b81670c5d58ddac290d04c9d0da6ec2112", + "sha256:ebd806298782400db811ea9d63e8096e835e67f0b5dc5e66e507532984a82bb3" + ], + "index": "pypi", + "version": "==19.6.0" + }, + "sentry-sdk": { + "hashes": [ + "sha256:23808d571d2461a4ce3784ec12bbee5bdb8c026c143fe79d36cef8a6d653e71f", + "sha256:bb90a4e19c7233a580715fc986cc44be2c48fc10b31e71580a2037e1c94b6950" + ], + "index": "pypi", + "version": "==0.14.3" + }, + "six": { + "hashes": [ + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" + ], + "index": "pypi", + "version": "==1.14.0" + }, + "slackclient": { + "hashes": [ + "sha256:b1b24df115e78b908565d9fa67bb3a86e66dd9a133954b953eb1c0559e7205b9", + 
"sha256:ccb0b8b203bc6087f7ab995fb4d2971dbe8925472afb078087ed76d1d8f939ca" + ], + "index": "pypi", + "version": "==2.7.1" + }, + "toml": { + "hashes": [ + "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f", + "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88" + ], + "index": "pypi", + "version": "==0.10.1" + }, + "tqdm": { + "hashes": [ + "sha256:251ee8440dbda126b8dfa8a7c028eb3f13704898caaef7caa699b35e119301e2", + "sha256:fe231261cfcbc6f4a99165455f8f6b9ef4e1032a6e29bccf168b4bf42012f09c" + ], + "index": "pypi", + "version": "==4.42.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918", + "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c", + "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f" + ], + "index": "pypi", + "version": "==3.7.4.3" + }, + "ujson": { + "hashes": [ + "sha256:f66073e5506e91d204ab0c614a148d5aa938bdbf104751be66f8ad7a222f5f86" + ], + "index": "pypi", + "version": "==1.35" + }, + "urllib3": { + "hashes": [ + "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", + "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" + ], + "index": "pypi", + "version": "==1.25.3" + }, + "uvloop": { + "hashes": [ + "sha256:0fcd894f6fc3226a962ee7ad895c4f52e3f5c3c55098e21efb17c071849a0573", + "sha256:2f31de1742c059c96cb76b91c5275b22b22b965c886ee1fced093fa27dde9e64", + "sha256:459e4649fcd5ff719523de33964aa284898e55df62761e7773d088823ccbd3e0", + "sha256:67867aafd6e0bc2c30a079603a85d83b94f23c5593b3cc08ec7e58ac18bf48e5", + "sha256:8c200457e6847f28d8bb91c5e5039d301716f5f2fce25646f5fb3fd65eda4a26", + "sha256:958906b9ca39eb158414fbb7d6b8ef1b7aee4db5c8e8e5d00fcbb69a1ce9dca7", + "sha256:ac1dca3d8f3ef52806059e81042ee397ac939e5a86c8a3cea55d6b087db66115", + "sha256:b284c22d8938866318e3b9d178142b8be316c52d16fcfe1560685a686718a021", + 
"sha256:c48692bf4587ce281d641087658eca275a5ad3b63c78297bbded96570ae9ce8f", + "sha256:fefc3b2b947c99737c348887db2c32e539160dcbeb7af9aa6b53db7a283538fe" + ], + "index": "pypi", + "version": "==0.12.2" + }, + "websocket-client": { + "hashes": [ + "sha256:0fc45c961324d79c781bab301359d5a1b00b13ad1b10415a4780229ef71a5549", + "sha256:d735b91d6d1692a6a181f2a8c9e0238e5f6373356f561bb9dc4c7af36f452010" + ], + "index": "pypi", + "version": "==0.57.0" + }, + "websockets": { + "hashes": [ + "sha256:0e2f7d6567838369af074f0ef4d0b802d19fa1fee135d864acc656ceefa33136", + "sha256:2a16dac282b2fdae75178d0ed3d5b9bc3258dabfae50196cbb30578d84b6f6a6", + "sha256:5a1fa6072405648cb5b3688e9ed3b94be683ce4a4e5723e6f5d34859dee495c1", + "sha256:5c1f55a1274df9d6a37553fef8cff2958515438c58920897675c9bc70f5a0538", + "sha256:669d1e46f165e0ad152ed8197f7edead22854a6c90419f544e0f234cc9dac6c4", + "sha256:695e34c4dbea18d09ab2c258994a8bf6a09564e762655408241f6a14592d2908", + "sha256:6b2e03d69afa8d20253455e67b64de1a82ff8612db105113cccec35d3f8429f0", + "sha256:79ca7cdda7ad4e3663ea3c43bfa8637fc5d5604c7737f19a8964781abbd1148d", + "sha256:7fd2dd9a856f72e6ed06f82facfce01d119b88457cd4b47b7ae501e8e11eba9c", + "sha256:82c0354ac39379d836719a77ee360ef865377aa6fdead87909d50248d0f05f4d", + "sha256:8f3b956d11c5b301206382726210dc1d3bee1a9ccf7aadf895aaf31f71c3716c", + "sha256:91ec98640220ae05b34b79ee88abf27f97ef7c61cf525eec57ea8fcea9f7dddb", + "sha256:952be9540d83dba815569d5cb5f31708801e0bbfc3a8c5aef1890b57ed7e58bf", + "sha256:99ac266af38ba1b1fe13975aea01ac0e14bb5f3a3200d2c69f05385768b8568e", + "sha256:9fa122e7adb24232247f8a89f2d9070bf64b7869daf93ac5e19546b409e47e96", + "sha256:a0873eadc4b8ca93e2e848d490809e0123eea154aa44ecd0109c4d0171869584", + "sha256:cb998bd4d93af46b8b49ecf5a72c0a98e5cc6d57fdca6527ba78ad89d6606484", + "sha256:e02e57346f6a68523e3c43bbdf35dde5c440318d1f827208ae455f6a2ace446d", + "sha256:e79a5a896bcee7fff24a788d72e5c69f13e61369d055f28113e71945a7eb1559", + 
"sha256:ee55eb6bcf23ecc975e6b47c127c201b913598f38b6a300075f84eeef2d3baff", + "sha256:f1414e6cbcea8d22843e7eafdfdfae3dd1aba41d1945f6ca66e4806c07c4f454" + ], + "index": "pypi", + "version": "==6.0" + }, + "yarl": { + "hashes": [ + "sha256:040b237f58ff7d800e6e0fd89c8439b841f777dd99b4a9cca04d6935564b9409", + "sha256:17668ec6722b1b7a3a05cc0167659f6c95b436d25a36c2d52db0eca7d3f72593", + "sha256:3a584b28086bc93c888a6c2aa5c92ed1ae20932f078c46509a66dce9ea5533f2", + "sha256:4439be27e4eee76c7632c2427ca5e73703151b22cae23e64adb243a9c2f565d8", + "sha256:48e918b05850fffb070a496d2b5f97fc31d15d94ca33d3d08a4f86e26d4e7c5d", + "sha256:9102b59e8337f9874638fcfc9ac3734a0cfadb100e47d55c20d0dc6087fb4692", + "sha256:9b930776c0ae0c691776f4d2891ebc5362af86f152dd0da463a6614074cb1b02", + "sha256:b3b9ad80f8b68519cc3372a6ca85ae02cc5a8807723ac366b53c0f089db19e4a", + "sha256:bc2f976c0e918659f723401c4f834deb8a8e7798a71be4382e024bcc3f7e23a8", + "sha256:c22c75b5f394f3d47105045ea551e08a3e804dc7e01b37800ca35b58f856c3d6", + "sha256:c52ce2883dc193824989a9b97a76ca86ecd1fa7955b14f87bf367a61b6232511", + "sha256:ce584af5de8830d8701b8979b18fcf450cef9a382b1a3c8ef189bedc408faf1e", + "sha256:da456eeec17fa8aa4594d9a9f27c0b1060b6a75f2419fe0c00609587b2695f4a", + "sha256:db6db0f45d2c63ddb1a9d18d1b9b22f308e52c83638c26b422d520a815c4b3fb", + "sha256:df89642981b94e7db5596818499c4b2219028f2a528c9c37cc1de45bf2fd3a3f", + "sha256:f18d68f2be6bf0e89f1521af2b1bb46e66ab0018faafa81d70f358153170a317", + "sha256:f379b7f83f23fe12823085cd6b906edc49df969eb99757f58ff382349a3303c6" + ], + "index": "pypi", + "version": "==1.5.1" + }, + "zipp": { + "hashes": [ + "sha256:16522f69653f0d67be90e8baa4a46d66389145b734345d68a257da53df670903", + "sha256:c1532a8030c32fd52ff6a288d855fe7adef5823ba1d26a29a68fd6314aa72baa" + ], + "index": "pypi", + "version": "==3.3.1" + }, + "zope.event": { + "hashes": [ + "sha256:2666401939cdaa5f4e0c08cf7f20c9b21423b95e88f4675b1443973bdb080c42", + 
"sha256:5e76517f5b9b119acf37ca8819781db6c16ea433f7e2062c4afc2b6fbedb1330" + ], + "index": "pypi", + "version": "==4.5.0" + }, + "zope.interface": { + "hashes": [ + "sha256:040f833694496065147e76581c0bf32b229a8b8c5eda120a0293afb008222387", + "sha256:11198b44e4a3d8c7a80cc20bbdd65522258a4d82fe467cd310c9fcce8ffe2ed2", + "sha256:121a9dccfe0c34be9c33b2c28225f0284f9b8e090580ffdff26c38fa16c7ffe1", + "sha256:15f3082575e7e19581a80b866664f843719b647a7f7189c811ba7f9ab3309f83", + "sha256:1d73d8986f948525536956ddd902e8a587a6846ebf4492117db16daba2865ddf", + "sha256:208e82f73b242275b8566ac07a25158e7b21fa2f14e642a7881048430612d1a6", + "sha256:2557833df892558123d791d6ff80ac4a2a0351f69c7421c7d5f0c07db72c8865", + "sha256:25ea6906f9987d42546329d06f9750e69f0ee62307a2e7092955ed0758e64f09", + "sha256:2c867914f7608674a555ac8daf20265644ac7be709e1da7d818089eebdfe544e", + "sha256:2eadac20711a795d3bb7a2bfc87c04091cb5274d9c3281b43088a1227099b662", + "sha256:37999d5ebd5d7bcd32438b725ca3470df05a7de8b1e9c0395bef24296b31ca99", + "sha256:3ae8946d51789779f76e4fa326fd6676d8c19c1c3b4c4c5e9342807185264875", + "sha256:5636cd7e60583b1608044ae4405e91575399430e66a5e1812f4bf30bcc55864e", + "sha256:570e637cb6509998555f7e4af13006d89fad6c09cfc5c4795855385391063e4b", + "sha256:590a40447ff3803c44050ce3c17c3958f11ca028dae3eacdd7b96775184394fa", + "sha256:5aab51b9c1af1b8a84f40aa49ffe1684d41810b18d6c3e94aa50194e0a563f01", + "sha256:5ffe4e0753393bcbcfc9a58133ed3d3a584634cc7cc2e667f8e3e6fbcbb2155d", + "sha256:663982381bd428a275a841009e52983cc69c471a4979ce01344fadbf72cf353d", + "sha256:6d06bf8e24dd6c473c4fbd8e16a83bd2e6d74add6ba25169043deb46d497b211", + "sha256:6e5b9a4bf133cf1887b4a04c21c10ca9f548114f19c83957b2820d5c84254940", + "sha256:70a2aed9615645bbe9d82c0f52bc7e676d2c0f8a63933d68418e0cb307f30536", + "sha256:7750746421c4395e3d2cc3d805919f4f57bb9f2a9a0ccd955566a9341050a1b4", + "sha256:7fc8708bc996e50fc7a9a2ad394e1f015348e389da26789fa6916630237143d7", + 
"sha256:91abd2f080065a7c007540f6bbd93ef7bdbbffa6df4a4cfab3892d8623b83c98", + "sha256:988f8b2281f3d95c66c01bdb141cefef1cc97db0d473c25c3fe2927ef00293b9", + "sha256:9f56121d8a676802044584e6cc41250bbcde069d8adf725b9b817a6b0fd87f09", + "sha256:a0f51536ce6e817a7aa25b0dca8b62feb210d4dc22cabfe8d1a92d47979372cd", + "sha256:a1cdd7390d7f66ddcebf545203ca3728c4890d605f9f2697bc8e31437906e8e7", + "sha256:b10eb4d0a77609679bf5f23708e20b1cd461a1643bd8ea42b1ca4149b1a5406c", + "sha256:b274ac8e511b55ffb62e8292316bd2baa80c10e9fe811b1aa5ce81da6b6697d8", + "sha256:c75b502af2c83fcfa2ee9c2257c1ba5806634a91a50db6129ff70e67c42c7e7b", + "sha256:c9c8e53a5472b77f6a391b515c771105011f4b40740ce53af8428d1c8ca20004", + "sha256:d867998a56c5133b9d31992beb699892e33b72150a8bf40f86cb52b8c606c83f", + "sha256:eb566cab630ec176b2d6115ed08b2cf4d921b47caa7f02cca1b4a9525223ee94", + "sha256:f61e6b95b414431ffe9dc460928fe9f351095fde074e2c2f5c6dda7b67a2192d", + "sha256:f718675fd071bcce4f7cbf9250cbaaf64e2e91ef1b0b32a1af596e7412647556", + "sha256:f9d4bfbd015e4b80dbad11c97049975f94592a6a0440e903ee647309f6252a1f", + "sha256:fae50fc12a5e8541f6f1cc4ed744ca8f76a9543876cf63f618fb0e6aca8f8375", + "sha256:fcf9c8edda7f7b2fd78069e97f4197815df5e871ec47b0f22580d330c6dec561", + "sha256:fdedce3bc5360bd29d4bb90396e8d4d3c09af49bc0383909fe84c7233c5ee675" + ], + "index": "pypi", + "version": "==5.1.2" + } + }, + "develop": {} +} diff --git a/Pipfile_notes.md b/Pipfile_notes.md new file mode 100644 index 000000000..a6a4c6600 --- /dev/null +++ b/Pipfile_notes.md @@ -0,0 +1,5 @@ +Note that requirements*.txt is currently the source of truth for which modules and versions +are required for this software. The Pipfile is provided as a convenience for users of tools +that consume one. + +You should verify that it is equivalent to requirements*.txt before using it. 
\ No newline at end of file diff --git a/lib/execution_engine2/sdk/EE2StatusRange.py b/lib/execution_engine2/sdk/EE2StatusRange.py index ac5f17d97..9abc06e84 100644 --- a/lib/execution_engine2/sdk/EE2StatusRange.py +++ b/lib/execution_engine2/sdk/EE2StatusRange.py @@ -1,6 +1,6 @@ from collections import Counter from collections import namedtuple -from datetime import datetime +from datetime import datetime, timezone from enum import Enum from typing import Dict @@ -167,14 +167,16 @@ def _get_dummy_dates(self, creation_start_time, creation_end_time): ) creation_start_time = self.sdkmr.check_and_convert_time(creation_start_time) - creation_start_date = datetime.fromtimestamp(creation_start_time) + creation_start_date = datetime.fromtimestamp( + creation_start_time, tz=timezone.utc + ) dummy_start_id = ObjectId.from_datetime(creation_start_date) if creation_end_time is None: raise Exception("Please provide a valid end time for when job was created") creation_end_time = self.sdkmr.check_and_convert_time(creation_end_time) - creation_end_date = datetime.fromtimestamp(creation_end_time) + creation_end_date = datetime.fromtimestamp(creation_end_time, tz=timezone.utc) dummy_end_id = ObjectId.from_datetime(creation_end_date) if creation_start_time > creation_end_time: diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index c63310142..1134e1f64 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -6,7 +6,7 @@ import time import unittest from configparser import ConfigParser -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from pprint import pprint from unittest.mock import patch from pytest import raises @@ -864,7 +864,12 @@ def replace_job_id(self, job1, new_id): with self.mongo_util.mongo_engine_connection(): job2 = self.create_job_from_job(job1, new_id) job2.save() - print("Saved job with id", 
job2.id, job2.id.generation_time) + print( + "Saved job with id", + job2.id, + job2.id.generation_time, + job2.id.generation_time.timestamp(), + ) job1.delete() # flake8: noqa: C901 @@ -912,7 +917,7 @@ def test_check_jobs_date_range(self, condor_mock): new_job_ids = [] - now = datetime.utcnow() + now = datetime.now(tz=timezone.utc) last_month = now - timedelta(days=30) last_month_and_1_hour = now - timedelta(days=30) - timedelta(hours=1) @@ -921,6 +926,17 @@ def test_check_jobs_date_range(self, condor_mock): tomorrow = now + timedelta(days=1) day_after = now + timedelta(days=2) + print( + f"Last month - 1 hour: {last_month_and_1_hour} " + + f"ts: {last_month_and_1_hour.timestamp()}" + ) + print(f"Last month: {last_month} ts: {last_month.timestamp()}") + print(f"Last Week: {last_week} ts: {last_week.timestamp()}") + print(f"Yesterday: {yesterday} ts: {yesterday.timestamp()}") + print(f"Now: {now} ts: {now.timestamp()}") + print(f"Tomorrow: {tomorrow} ts: {tomorrow.timestamp()}") + print(f"Day after: {day_after} ts: {day_after.timestamp()}") + with self.mongo_util.mongo_engine_connection(): # Last Month job = Job.objects.with_id(job_id1) # type : Job @@ -1028,10 +1044,8 @@ def test_check_jobs_date_range(self, condor_mock): if job_id in new_job_ids: count += 1 self.assertIn(js["status"], ["created", "queued"]) - date = SDKMethodRunner.check_and_convert_time(js["created"]) - ts = date - print(date, last_week, tomorrow) - print(ts, last_week.timestamp(), tomorrow.timestamp()) + ts = SDKMethodRunner.check_and_convert_time(js["created"]) + print(f"Timestamp: {ts}") self.assertTrue(ts > last_month_and_1_hour.timestamp()) self.assertTrue(ts < tomorrow.timestamp()) self.assertEqual(4, count) @@ -1079,10 +1093,8 @@ def test_check_jobs_date_range(self, condor_mock): if job_id in new_job_ids: count += 1 self.assertIn(js["status"], ["created", "queued"]) - date = SDKMethodRunner.check_and_convert_time(js["created"]) - ts = date - print(date, last_week, tomorrow) - print(ts, 
last_week.timestamp(), tomorrow.timestamp()) + ts = SDKMethodRunner.check_and_convert_time(js["created"]) + print(f"Timestamp: {ts}") self.assertTrue(ts > last_month_and_1_hour.timestamp()) self.assertTrue(ts < tomorrow.timestamp()) From e1f9af375ed6b4d1a6dc72e6267d4e4a180c482e Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Wed, 17 Feb 2021 09:37:20 -0800 Subject: [PATCH 010/109] API to DB tests step 1: authentication harness (#309) * API to DB tests step 1: authentication harness This commit is the first step towards having API to DB tests running in the ee2 test suite. As part of a noop test, it starts and stops the auth service. Tests were manually verified to pass when run inside the Dockerfile. The noop test was manually verified to print out the root HTML page from the auth server. * Run black * Fix travis Why are we running both travis and GHA...? * remove unused imports * make retry count a constant in the auth controller --- .github/workflows/ee2-tests.yml | 3 + .gitignore | 1 + .travis.yml | 3 + Dockerfile | 6 + test/tests_for_integration/api_to_db_test.py | 95 +++++++++++ test/tests_for_integration/auth_controller.py | 161 ++++++++++++++++++ test/tests_for_integration/authjars | 74 ++++++++ test/utils_shared/test_utils.py | 72 ++++++++ 8 files changed, 415 insertions(+) create mode 100644 test/tests_for_integration/api_to_db_test.py create mode 100644 test/tests_for_integration/auth_controller.py create mode 100644 test/tests_for_integration/authjars diff --git a/.github/workflows/ee2-tests.yml b/.github/workflows/ee2-tests.yml index 0972e556c..8faedc45a 100644 --- a/.github/workflows/ee2-tests.yml +++ b/.github/workflows/ee2-tests.yml @@ -27,6 +27,9 @@ jobs: python -m pip install --upgrade pip pip install flake8 black pytest if [ -f requirements.txt ]; then pip install -r requirements-dev.txt; fi + cd /opt + git clone https://github.com/kbase/jars + cd - - name: Lint with flake8 and black run: | flake8 ./lib ./test diff --git a/.gitignore b/.gitignore 
index 1f650694b..4d823b979 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ sdk.cfg lib/execution_engine2/execution_engine2Impl.py.bak* coverage.xml +test_temp_can_delete \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index d70fe1dc9..2c3180e9d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,9 @@ before_install: install: + - cd /opt + - git clone https://github.com/kbase/jars + - cd - - pip install -r requirements.txt - pip install black flake8 - flake8 ./lib ./test diff --git a/Dockerfile b/Dockerfile index f6f2615ac..252b32794 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,12 @@ RUN DEBIAN_FRONTEND=noninteractive wget -qO - https://research.cs.wisc.edu/htcon && apt-get update -y \ && apt-get install -y condor +# install jars +# perhaps we should have test and prod dockerfiles to avoid jars and mongo installs in prod +RUN cd /opt \ + && git clone https://github.com/kbase/jars \ + && cd - + # install mongodb RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2930ADAE8CAF5059EE73BB4B58712A2291FA4AD5 \ && echo "deb http://repo.mongodb.org/apt/debian stretch/mongodb-org/3.6 main" | tee /etc/apt/sources.list.d/mongodb-org-3.6.list \ diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py new file mode 100644 index 000000000..61bc54f25 --- /dev/null +++ b/test/tests_for_integration/api_to_db_test.py @@ -0,0 +1,95 @@ +""" +Integration tests that cover the entire codebase from API to database. + +NOTE 1: These tests are designed to only be runnable after running docker-compose up + +NOTE 2: These tests were set up quickly in order to debug a problem with administration related +calls. As such, the auth server was set up to run in test mode locally. If more integrations +(e.g. the workspace) are needed, they will need to be added either locally or as docker containers. 
+If the latter, the test auth integration will likely need to be converted to a docker container or +exposed to other containers. +""" + +from pathlib import Path +import pymongo +from pytest import fixture +from tests_for_integration.auth_controller import AuthController +from utils_shared.test_utils import get_test_config + +KEEP_TEMP_FILES = True +AUTH_DB = "api_to_db_test" +AUTH_MONGO_USER = "auth" +TEMP_DIR = Path("test_temp_can_delete") + +# may need to make this configurable +JARS_DIR = Path("/opt/jars/lib/jars") + + +@fixture(scope="module") +def config(): + yield get_test_config() + + +@fixture(scope="module") +def mongo_client(config): + mc = pymongo.MongoClient( + config["mongo-host"], + username=config["mongo-user"], + password=config["mongo-password"], + ) + yield mc + + mc.close() + + +def _clean_auth_db(mongo_client): + try: + mongo_client[AUTH_DB].command("dropUser", AUTH_MONGO_USER) + except pymongo.errors.OperationFailure as e: + if f"User '{AUTH_MONGO_USER}@{AUTH_DB}' not found" not in e.args[0]: + raise # otherwise ignore and continue, user is already toast + mongo_client.drop_database(AUTH_DB) + + +@fixture(scope="module") +def auth_url(config, mongo_client): + # clean up from any previously failed test runs that left the db in place + _clean_auth_db(mongo_client) + + # make a user for the auth db + mongo_client[AUTH_DB].command( + "createUser", AUTH_MONGO_USER, pwd="authpwd", roles=["readWrite"] + ) + auth = AuthController( + JARS_DIR, + config["mongo-host"], + AUTH_DB, + TEMP_DIR, + mongo_user=AUTH_MONGO_USER, + mongo_pwd="authpwd", + ) + print( + f"Started KBase Auth2 {auth.version} on port {auth.port} " + + f"in dir {auth.temp_dir} in {auth.startup_count}s" + ) + url = f"http://localhost:{auth.port}" + + yield url + + print(f"shutting down auth, KEEP_TEMP_FILES={KEEP_TEMP_FILES}") + auth.destroy(not KEEP_TEMP_FILES) + + # Because the tests are run with mongo in a persistent docker container via docker-compose, + # we need to clean up after 
ourselves. + _clean_auth_db(mongo_client) + + +# TODO start the ee2 service +# TODO wipe the ee2 database between every test + + +def test_is_admin(auth_url): + import requests + + print(requests.get(auth_url).text) + # TODO add a test diff --git a/test/tests_for_integration/auth_controller.py b/test/tests_for_integration/auth_controller.py new file mode 100644 index 000000000..fc7484dc2 --- /dev/null +++ b/test/tests_for_integration/auth_controller.py @@ -0,0 +1,161 @@ +""" +A controller for the KBase Auth2 service (https://github.com/kbase/auth2) for use in testing +auth-enabled applications. +""" + +# Ported from: +# https://github.com/kbase/sample_service/blob/master/test/auth_controller.py +# May want to set up a python package for this...? + +import os +import requests +import shutil +import subprocess +import tempfile +import time +import zipfile + +from pathlib import Path +from utils_shared.test_utils import TestException +from utils_shared import test_utils + +_AUTH_CLASS = "us.kbase.test.auth2.StandaloneAuthServer" +_JARS_FILE = Path(__file__).resolve().parent.joinpath("authjars") +_RETRY_COUNT = 40 + + +class AuthController: + """ + The main Auth controller class. + + Attributes: + version - the version of the auth service + port - the port for the Auth service. + temp_dir - the location of the Auth data and logs. + """ + + def __init__( + self, + jars_dir: Path, + mongo_host: str, + mongo_db: str, + root_temp_dir: Path, + mongo_user: str = None, + mongo_pwd: str = None, + ): + """ + Create and start a new Auth service. An unused port will be selected for the server. + + :param jars_dir: The path to the lib/jars dir of the KBase Jars repo + (https://github.com/kbase/jars), e.g /path_to_repo/lib/jars. + :param mongo_host: The address of the MongoDB server to use as the Auth service database, + e.g. localhost:27017. + :param mongo_db: The database in which to store Auth data. 
+ :param root_temp_dir: A temporary directory in which to store Auth data and log files. + The files will be stored inside a child directory that is unique per invocation. + :param mongo_user: if the MongoDB server requires authentication, the user name. + :param mongo_pwd: if the MongoDB server requires authentication, the user password. + """ + if not jars_dir or not os.access(jars_dir, os.X_OK): + raise TestException( + "jars_dir {} does not exist or is not executable.".format(jars_dir) + ) + if not mongo_host: + raise TestException("mongo_host must be provided") + if not mongo_db: + raise TestException("mongo_db must be provided") + if not root_temp_dir: + raise TestException("root_temp_dir is None") + + if bool(mongo_user) ^ bool(mongo_pwd): # xor + raise TestException( + "Neither or both of mongo_user and mongo_pwd is required" + ) + + jars_dir = jars_dir.resolve() + class_path = self._get_class_path(jars_dir) + + # make temp dirs + root_temp_dir = root_temp_dir.absolute() + os.makedirs(root_temp_dir, exist_ok=True) + self.temp_dir = Path( + tempfile.mkdtemp(prefix="AuthController-", dir=str(root_temp_dir)) + ) + + self.port = test_utils.find_free_port() + + template_dir = self.temp_dir.joinpath("templates") + self._install_templates(jars_dir, template_dir) + + command = [ + "java", + "-classpath", + class_path, + "-DAUTH2_TEST_MONGOHOST=" + mongo_host, + "-DAUTH2_TEST_MONGODB=" + mongo_db, + "-DAUTH2_TEST_TEMPLATE_DIR=" + str(template_dir), + _AUTH_CLASS, + str(self.port), + ] + if mongo_user: + command.insert(5, "-DAUTH2_TEST_MONGOPWD=" + mongo_pwd) + command.insert(5, "-DAUTH2_TEST_MONGOUSER=" + mongo_user) + + self._outfile = open(self.temp_dir.joinpath("auth.log"), "w") + + self._proc = subprocess.Popen( + command, stdout=self._outfile, stderr=subprocess.STDOUT + ) + + for count in range(_RETRY_COUNT): + err = None + time.sleep(1) # wait for server to start + try: + res = requests.get( + f"http://localhost:{self.port}", + headers={"accept": 
"application/json"}, + ) + if res.ok: + self.version = res.json()["version"] + break + err = TestException(res.text) + except requests.exceptions.ConnectionError as e: + err = TestException(e.args[0]) + err.__cause__ = e + if err: + raise err + self.startup_count = count + 1 + + def destroy(self, delete_temp_files: bool = True): + """ + Shut down the server and optionally delete any files generated. + + :param delete_temp_files: if true, delete all the temporary files generated as part of + running the server. + """ + if self._proc: + self._proc.terminate() + if self._outfile: + self._outfile.close() + if delete_temp_files and self.temp_dir: + shutil.rmtree(self.temp_dir) + + def _install_templates(self, jars_dir: Path, template_dir: Path): + with open(_JARS_FILE) as jf: + template_zip_file = jars_dir.joinpath(jf.readline().strip()) + with zipfile.ZipFile(template_zip_file) as z: + # should really check to see that the entries are safe, but it's our zipfile, so + # don't bother for now. 
+ z.extractall(template_dir) + + def _get_class_path(self, jars_dir: Path): + cp = [] + with open(_JARS_FILE) as jf: + jf.readline() # 1st line is template file + for line in jf: + if line.strip() and not line.startswith("#"): + p = jars_dir.joinpath(line.strip()) + if not p.is_file(): + raise TestException(f"Required jar does not exist: {p}") + cp.append(str(p)) + return ":".join(cp) diff --git a/test/tests_for_integration/authjars b/test/tests_for_integration/authjars new file mode 100644 index 000000000..c79b8d685 --- /dev/null +++ b/test/tests_for_integration/authjars @@ -0,0 +1,74 @@ +kbase/auth2/kbase-auth2templates-0.4.3.zip + +kbase/auth2/kbase-auth2-0.4.3.jar +kbase/auth2/kbase-auth2test-0.4.3.jar + +#lib +apache_commons/commons-codec-1.8.jar +apache_commons/commons-validator-1.5.1.jar +google/guava-18.0.jar +ini4j/ini4j-0.5.2.jar +jcommander/jcommander-1.48.jar +mongo/mongo-java-driver-3.8.2.jar +mustache/compiler-0.9.3.jar +nulab-inc/zxcvbn-1.2.2.jar + +#logging +kbase/common/kbase-common-0.0.22.jar +jna/jna-3.4.0.jar +logback/logback-core-1.1.2.jar +logback/logback-classic-1.1.2.jar +slf4j/slf4j-api-1.7.25.jar +syslog4j/syslog4j-0.9.46.jar + +#yauaa +yauaa/yauaa-1.3.jar +apache_commons/commons-lang3-3.5.jar +apache_commons/commons-collections4-4.1.jar +apache_commons/commons-logging-1.2.jar +apache_commons/commons-io-2.4.jar +kohsuke/args4j-2.33.jar +snakeyaml/snakeyaml-1.18.jar + +#jackson +jackson/jackson-annotations-2.5.4.jar +jackson/jackson-core-2.5.4.jar +jackson/jackson-databind-2.5.4.jar +jackson/jackson-jaxrs-base-2.5.4.jar +jackson/jackson-jaxrs-json-provider-2.5.4.jar +jackson/jackson-module-jaxb-annotations-2.5.4.jar + +#jersey +jersey/entity-filtering/jersey-entity-filtering-2.23.2.jar +jersey/entity-filtering/jersey-media-json-jackson-2.23.2.jar +jersey/mvc/jersey-mvc-2.23.2.jar +jersey/mvc/jersey-mvc-mustache-2.23.2.jar +jersey/jersey-client-2.23.2.jar +jersey/jersey-common-2.23.2.jar +jersey/jersey-container-servlet-2.23.2.jar 
+jersey/jersey-container-servlet-core-2.23.2.jar +jersey/jersey-guava-2.23.2.jar +jersey/jersey-media-jaxb-2.23.2.jar +jersey/jersey-server-2.23.2.jar + +#jerseydeps +annotation/javax.annotation-api-1.2.jar +asm/asm-debug-all-5.0.4.jar +inject/javax.inject-2.5.0-b05.jar +javassist/javassist-3.20.0-GA.jar +jaxb/jaxb-api-2.2.7.jar +jaxrs/javax.ws.rs-api-2.0.1.jar +osgi/org.osgi.core-4.2.0.jar +persistence/persistence-api-1.0.jar +servlet/javax.servlet-api-3.0.1.jar +validationapi/validation-api-1.1.0.Final.jar + +#jerseydep_hk2 +hk2/aopalliance-repackaged-2.5.0-b05.jar +hk2/hk2-api-2.5.0-b05.jar +hk2/hk2-locator-2.5.0-b05.jar +hk2/hk2-utils-2.5.0-b05.jar +hk2/osgi-resource-locator-1.0.1.jar + +#test +jetty/jetty-all-9.3.11.v20160721-uber.jar diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index a2c4fe7b2..dab793002 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -1,7 +1,10 @@ import json import os.path import uuid +import logging +import socket from configparser import ConfigParser +from contextlib import closing from datetime import datetime from typing import List, Dict @@ -384,3 +387,72 @@ def get_sample_job_params(method=None, wsid="123"): def assert_exception_correct(got: Exception, expected: Exception): assert got.args == expected.args assert type(got) == type(expected) + + +def find_free_port() -> int: + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +class TestException(Exception): + __test__ = False + + +def create_auth_user(auth_url, username, displayname): + ret = requests.post( + auth_url + "/testmode/api/V2/testmodeonly/user", + headers={"accept": "application/json"}, + json={"user": username, "display": displayname}, + ) + if not ret.ok: + ret.raise_for_status() + + +def create_auth_login_token(auth_url, username): + ret = requests.post( + auth_url + "/testmode/api/V2/testmodeonly/token", + headers={"accept": 
"application/json"}, + json={"user": username, "type": "Login"}, + ) + if not ret.ok: + ret.raise_for_status() + return ret.json()["token"] + + +def create_auth_role(auth_url, role, description): + ret = requests.post( + auth_url + "/testmode/api/V2/testmodeonly/customroles", + headers={"accept": "application/json"}, + json={"id": role, "desc": description}, + ) + if not ret.ok: + ret.raise_for_status() + + +def set_custom_roles(auth_url, user, roles): + ret = requests.put( + auth_url + "/testmode/api/V2/testmodeonly/userroles", + headers={"accept": "application/json"}, + json={"user": user, "customroles": roles}, + ) + if not ret.ok: + ret.raise_for_status() + + +def get_test_config(): + config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") + logging.info(f"Loading config from {config_file}") + + config_parser = ConfigParser() + config_parser.read(config_file) + + cfg = {} + + for nameval in config_parser.items("execution_engine2"): + cfg[nameval[0]] = nameval[1] + + mongo_in_docker = cfg.get("mongo-in-docker-compose", None) + if mongo_in_docker is not None: + cfg["mongo-host"] = cfg["mongo-in-docker-compose"] + return cfg From e9a7c04583cfa933b1a63898cd1c5f05f1765a6d Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Wed, 17 Feb 2021 10:50:07 -0800 Subject: [PATCH 011/109] Disable travis (#312) * Disable travis Redundant to GHA tests. 
* s/travis/Github Actions/ in PR template --- .github/pull_request_template.md | 4 ++-- .travis.yml | 36 -------------------------------- 2 files changed, 2 insertions(+), 38 deletions(-) delete mode 100644 .travis.yml diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 4b3eaba61..f15b16722 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -9,7 +9,7 @@ # Testing Instructions * Details for how to test the PR: -- [ ] Tests pass in travis and locally +- [ ] Tests pass in Github Actions and locally - [ ] Changes available by spinning up a local test suite and doing X # Dev Checklist: @@ -22,7 +22,7 @@ - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged and published in downstream modules -- [ ] I have run Black and Flake8 on changed Python Code manually or with git precommit (and the travis build passes) +- [ ] I have run Black and Flake8 on changed Python Code manually or with git precommit (and the Github Actions build passes) # Updating Version and Release Notes (if applicable) diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 2c3180e9d..000000000 --- a/.travis.yml +++ /dev/null @@ -1,36 +0,0 @@ -language: python -python: - - "3.7" - -dist: xenial -sudo: required - -services: - - docker - -env: - - KB_DEPLOYMENT_CONFIG=test/deploy.cfg - -before_install: - - sudo useradd kbase - - sed -i '/conda/d' ./requirements.txt - - (cd test/dockerfiles/condor && docker-compose up -d) - - cp test/env/test.travis.env test/env/test.env - - -install: - - cd /opt - - git clone https://github.com/kbase/jars - - cd - - - pip install -r requirements.txt - - pip install black flake8 - - flake8 ./lib ./test - - black --check ./lib ./test - - make setup-database - -script: - - make test-coverage - # - make integration_test Doesn't yet work in travis - 
-after_success: - - codecov From 7baad00437890138de5438d8ff584c1655b7ad99 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 18 Feb 2021 13:54:31 -0800 Subject: [PATCH 012/109] Start up ee2 service in api to db test (#313) * Start up ee2 service in api to db test Manually verified that the status() is printed out correctly. To run an API to DB test I need to start the service. The service expects a couple of files in specific places in the repo, so I had to check them in. * Use a var rather than hardcode a string * Fix black Exclusions don't seem to be working, pushing up to have other eyes on it * Make LGTM ignore log.py * remove wildcard in black config Added in when trying to figure out how to get black to ignore files, forgot to fix it * Add comments to explain some funky test things make it clear why we're updating the config in place and what the problem is with stopping the server --- .flake8 | 2 + lgtm.yml | 3 + lib/biokbase/README.md | 3 + lib/biokbase/log.py | 368 +++++++++++++++++++ lib/execution_engine2/README.md | 3 + lib/execution_engine2/authclient.py | 94 +++++ pyproject.toml | 2 + test/tests_for_integration/api_to_db_test.py | 100 ++++- test/utils_shared/test_utils.py | 37 +- 9 files changed, 597 insertions(+), 15 deletions(-) create mode 100644 lgtm.yml create mode 100644 lib/biokbase/README.md create mode 100644 lib/biokbase/log.py create mode 100644 lib/execution_engine2/README.md create mode 100644 lib/execution_engine2/authclient.py diff --git a/.flake8 b/.flake8 index 1ddfc3680..cd50db704 100644 --- a/.flake8 +++ b/.flake8 @@ -9,4 +9,6 @@ exclude = execution_engine2Impl.py, lib/installed_clients/, lib/execution_engine2/execution_engine2Impl.py, + lib/execution_engine2/authclient.py, + lib/biokbase/log.py, *Impl.py diff --git a/lgtm.yml b/lgtm.yml new file mode 100644 index 000000000..b064fdbb9 --- /dev/null +++ b/lgtm.yml @@ -0,0 +1,3 @@ +path_classifiers: + generated: + - lib/biokbase/log.py diff --git a/lib/biokbase/README.md 
b/lib/biokbase/README.md new file mode 100644 index 000000000..2caf66c73 --- /dev/null +++ b/lib/biokbase/README.md @@ -0,0 +1,3 @@ +log.py lives here: https://raw.githubusercontent.com/kbase/sdkbase2/python/log.py + +However, it's needed to run tests so it's checked into this repo. diff --git a/lib/biokbase/log.py b/lib/biokbase/log.py new file mode 100644 index 000000000..5626ac03f --- /dev/null +++ b/lib/biokbase/log.py @@ -0,0 +1,368 @@ +""" +NAME + log + +DESCRIPTION + A library for sending logging messages to syslog. + +METHODS + log(string subsystem, hashref constraints): Initializes log. You + should call this at the beginning of your program. Constraints are + optional. + + log_message(int level, string message): sends log message to syslog. + + * level: (0-9) The logging level for this message is compared to + the logging level that has been set in log. If it is <= + the set logging level, the message will be sent to syslog, + otherwise it will be ignored. Logging level is set to 6 + if control API cannot be reached and the user does + not set the log level. Log level can also be entered as + string (e.g. 'DEBUG') + + * message: This is the log message. + + get_log_level(): Returns the current log level as an integer. + + set_log_level(integer level) : Sets the log level. Only use this if you + wish to override the log levels that are defined by the control API. + Can also be entered as string (e.g. 
'DEBUG') + + * level : priority + + * 0 : EMERG - system is unusable + + * 1 : ALERT - component must be fixed immediately + + * 2 : CRIT - secondary component must be fixed immediately + + * 3 : ERR - non-urgent failure + + * 4 : WARNING - warning that an error will occur if no action + is taken + + * 5 : NOTICE - unusual but safe conditions + + * 6 : INFO - normal operational messages + + * 7 : DEBUG - lowest level of debug + + * 8 : DEBUG2 - second level of debug + + * 9 : DEBUG3 - highest level of debug + + set_log_msg_check_count(integer count): used to set the number the + messages that log will log before querying the control API for the + log level (default is 100 messages). + + set_log_msg_check_interval(integer seconds): used to set the interval, + in seconds, that will be allowed to pass before log will query the + control API for the log level (default is 300 seconds). + + update_api_log_level() : Checks the control API for the currently set + log level. + + use_api_log_level() : Removes the user-defined log level and tells log + to use the control API-defined log level. 
+""" + +import json as _json +import urllib.request as _urllib2 +import syslog as _syslog +import platform as _platform +import inspect as _inspect +import os as _os +import getpass as _getpass +import warnings as _warnings +from configparser import ConfigParser as _ConfigParser +import time + +MLOG_ENV_FILE = 'MLOG_CONFIG_FILE' +_GLOBAL = 'global' +MLOG_LOG_LEVEL = 'mlog_log_level' +MLOG_API_URL = 'mlog_api_url' +MLOG_LOG_FILE = 'mlog_log_file' + +DEFAULT_LOG_LEVEL = 6 +#MSG_CHECK_COUNT = 100 +#MSG_CHECK_INTERVAL = 300 # 300s = 5min +MSG_FACILITY = _syslog.LOG_LOCAL1 +EMERG_FACILITY = _syslog.LOG_LOCAL0 + +EMERG = 0 +ALERT = 1 +CRIT = 2 +ERR = 3 +WARNING = 4 +NOTICE = 5 +INFO = 6 +DEBUG = 7 +DEBUG2 = 8 +DEBUG3 = 9 +_MLOG_TEXT_TO_LEVEL = {'EMERG': EMERG, + 'ALERT': ALERT, + 'CRIT': CRIT, + 'ERR': ERR, + 'WARNING': WARNING, + 'NOTICE': NOTICE, + 'INFO': INFO, + 'DEBUG': DEBUG, + 'DEBUG2': DEBUG2, + 'DEBUG3': DEBUG3, + } +_MLOG_TO_SYSLOG = [_syslog.LOG_EMERG, _syslog.LOG_ALERT, _syslog.LOG_CRIT, + _syslog.LOG_ERR, _syslog.LOG_WARNING, _syslog.LOG_NOTICE, + _syslog.LOG_INFO, _syslog.LOG_DEBUG, _syslog.LOG_DEBUG, + _syslog.LOG_DEBUG] +#ALLOWED_LOG_LEVELS = set(_MLOG_TEXT_TO_LEVEL.values()) +_MLOG_LEVEL_TO_TEXT = {} +for k, v in _MLOG_TEXT_TO_LEVEL.items(): + _MLOG_LEVEL_TO_TEXT[v] = k +LOG_LEVEL_MIN = min(_MLOG_LEVEL_TO_TEXT.keys()) +LOG_LEVEL_MAX = max(_MLOG_LEVEL_TO_TEXT.keys()) +del k, v + + +class log(object): + """ + This class contains the methods necessary for sending log messages. 
+ """ + + def __init__(self, subsystem, constraints=None, config=None, logfile=None, + ip_address=False, authuser=False, module=False, + method=False, call_id=False, changecallback=None): + if not subsystem: + raise ValueError("Subsystem must be supplied") + + self.user = _getpass.getuser() + self.parentfile = _os.path.abspath(_inspect.getfile( + _inspect.stack()[1][0])) + self.ip_address = ip_address + self.authuser = authuser + self.module = module + self.method = method + self.call_id = call_id + noop = lambda: None + self._callback = changecallback or noop + self._subsystem = str(subsystem) + self._mlog_config_file = config + if not self._mlog_config_file: + self._mlog_config_file = _os.environ.get(MLOG_ENV_FILE, None) + if self._mlog_config_file: + self._mlog_config_file = str(self._mlog_config_file) + self._user_log_level = -1 + self._config_log_level = -1 + self._user_log_file = logfile + self._config_log_file = None + self._api_log_level = -1 + self._msgs_since_config_update = 0 + self._time_at_config_update = time.time() + self.msg_count = 0 + self._recheck_api_msg = 100 + self._recheck_api_time = 300 # 5 mins + self._log_constraints = {} if not constraints else constraints + + self._init = True + self.update_config() + self._init = False + + def _get_time_since_start(self): + time_diff = time.time() - self._time_at_config_update + return time_diff + + def get_log_level(self): + if(self._user_log_level != -1): + return self._user_log_level + elif(self._config_log_level != -1): + return self._config_log_level + elif(self._api_log_level != -1): + return self._api_log_level + else: + return DEFAULT_LOG_LEVEL + + def _get_config_items(self, cfg, section): + cfgitems = {} + if cfg.has_section(section): + for k, v in cfg.items(section): + cfgitems[k] = v + return cfgitems + + def update_config(self): + loglevel = self.get_log_level() + logfile = self.get_log_file() + + self._api_log_level = -1 + self._msgs_since_config_update = 0 + self._time_at_config_update = 
time.time() + + # Retrieving the control API defined log level + api_url = None + if self._mlog_config_file and _os.path.isfile(self._mlog_config_file): + cfg = _ConfigParser() + cfg.read(self._mlog_config_file) + cfgitems = self._get_config_items(cfg, _GLOBAL) + cfgitems.update(self._get_config_items(cfg, self._subsystem)) + if MLOG_LOG_LEVEL in cfgitems: + try: + self._config_log_level = int(cfgitems[MLOG_LOG_LEVEL]) + except: + _warnings.warn( + 'Cannot parse log level {} from file {} to int'.format( + cfgitems[MLOG_LOG_LEVEL], self._mlog_config_file) + + '. Keeping current log level.') + if MLOG_API_URL in cfgitems: + api_url = cfgitems[MLOG_API_URL] + if MLOG_LOG_FILE in cfgitems: + self._config_log_file = cfgitems[MLOG_LOG_FILE] + elif self._mlog_config_file: + _warnings.warn('Cannot read config file ' + self._mlog_config_file) + + if (api_url): + subsystem_api_url = api_url + "/" + self._subsystem + try: + data = _json.load(_urllib2.urlopen(subsystem_api_url, + timeout=5)) + except _urllib2.URLError as e: + code_ = None + if hasattr(e, 'code'): + code_ = ' ' + str(e.code) + _warnings.warn( + 'Could not connect to mlog api server at ' + + '{}:{} {}. 
Using default log level {}.'.format( + subsystem_api_url, code_, str(e.reason), + str(DEFAULT_LOG_LEVEL))) + else: + max_matching_level = -1 + for constraint_set in data['log_levels']: + level = constraint_set['level'] + constraints = constraint_set['constraints'] + if level <= max_matching_level: + continue + + matches = 1 + for constraint in constraints: + if constraint not in self._log_constraints: + matches = 0 + elif (self._log_constraints[constraint] != + constraints[constraint]): + matches = 0 + + if matches == 1: + max_matching_level = level + + self._api_log_level = max_matching_level + if ((self.get_log_level() != loglevel or + self.get_log_file() != logfile) and not self._init): + self._callback() + + def _resolve_log_level(self, level): + if(level in _MLOG_TEXT_TO_LEVEL): + level = _MLOG_TEXT_TO_LEVEL[level] + elif(level not in _MLOG_LEVEL_TO_TEXT): + raise ValueError('Illegal log level') + return level + + def set_log_level(self, level): + self._user_log_level = self._resolve_log_level(level) + self._callback() + + def get_log_file(self): + if self._user_log_file: + return self._user_log_file + if self._config_log_file: + return self._config_log_file + return None + + def set_log_file(self, filename): + self._user_log_file = filename + self._callback() + + def set_log_msg_check_count(self, count): + count = int(count) + if count < 0: + raise ValueError('Cannot check a negative number of messages') + self._recheck_api_msg = count + + def set_log_msg_check_interval(self, interval): + interval = int(interval) + if interval < 0: + raise ValueError('interval must be positive') + self._recheck_api_time = interval + + def clear_user_log_level(self): + self._user_log_level = -1 + self._callback() + + def _get_ident(self, level, user, parentfile, ip_address, authuser, module, + method, call_id): + infos = [self._subsystem, _MLOG_LEVEL_TO_TEXT[level], + repr(time.time()), user, parentfile, str(_os.getpid())] + if self.ip_address: + infos.append(str(ip_address) 
if ip_address else '-') + if self.authuser: + infos.append(str(authuser) if authuser else '-') + if self.module: + infos.append(str(module) if module else '-') + if self.method: + infos.append(str(method) if method else '-') + if self.call_id: + infos.append(str(call_id) if call_id else '-') + return "[" + "] [".join(infos) + "]" + + def _syslog(self, facility, level, ident, message): + _syslog.openlog(ident, facility) + if isinstance(message, str): + _syslog.syslog(_MLOG_TO_SYSLOG[level], message) + else: + try: + for m in message: + _syslog.syslog(_MLOG_TO_SYSLOG[level], m) + except TypeError: + _syslog.syslog(_MLOG_TO_SYSLOG[level], str(message)) + _syslog.closelog() + + def _log(self, ident, message): + ident = ' '.join([str(time.strftime( + "%Y-%m-%d %H:%M:%S", time.localtime())), + _platform.node(), ident + ': ']) + try: + with open(self.get_log_file(), 'a') as log: + if isinstance(message, str): + log.write(ident + message + '\n') + else: + try: + for m in message: + log.write(ident + m + '\n') + except TypeError: + log.write(ident + str(message) + '\n') + except Exception as e: + err = 'Could not write to log file ' + str(self.get_log_file()) + \ + ': ' + str(e) + '.' + _warnings.warn(err) + + def log_message(self, level, message, ip_address=None, authuser=None, + module=None, method=None, call_id=None): +# message = str(message) + level = self._resolve_log_level(level) + + self.msg_count += 1 + self._msgs_since_config_update += 1 + + if(self._msgs_since_config_update >= self._recheck_api_msg + or self._get_time_since_start() >= self._recheck_api_time): + self.update_config() + + ident = self._get_ident(level, self.user, self.parentfile, ip_address, + authuser, module, method, call_id) + # If this message is an emergency, send a copy to the emergency + # facility first. 
+ if(level == 0): + self._syslog(EMERG_FACILITY, level, ident, message) + + if(level <= self.get_log_level()): + self._syslog(MSG_FACILITY, level, ident, message) + if self.get_log_file(): + self._log(ident, message) + +if __name__ == '__main__': + pass diff --git a/lib/execution_engine2/README.md b/lib/execution_engine2/README.md new file mode 100644 index 000000000..84e6f898c --- /dev/null +++ b/lib/execution_engine2/README.md @@ -0,0 +1,3 @@ +authclient.py lives here: https://github.com/kbase/kb_sdk/blob/master/src/java/us/kbase/templates/authclient.py + +... but is checked in as it's needed for tests. \ No newline at end of file diff --git a/lib/execution_engine2/authclient.py b/lib/execution_engine2/authclient.py new file mode 100644 index 000000000..844f9b0c2 --- /dev/null +++ b/lib/execution_engine2/authclient.py @@ -0,0 +1,94 @@ +''' +Created on Aug 1, 2016 + +A very basic KBase auth client for the Python server. + +@author: gaprice@lbl.gov +''' +import time as _time +import requests as _requests +import threading as _threading +import hashlib + + +class TokenCache(object): + ''' A basic cache for tokens. 
''' + + _MAX_TIME_SEC = 5 * 60 # 5 min + + _lock = _threading.RLock() + + def __init__(self, maxsize=2000): + self._cache = {} + self._maxsize = maxsize + self._halfmax = maxsize / 2 # int division to round down + + def get_user(self, token): + token = hashlib.sha256(token.encode('utf-8')).hexdigest() + with self._lock: + usertime = self._cache.get(token) + if not usertime: + return None + + user, intime = usertime + if _time.time() - intime > self._MAX_TIME_SEC: + return None + return user + + def add_valid_token(self, token, user): + if not token: + raise ValueError('Must supply token') + if not user: + raise ValueError('Must supply user') + token = hashlib.sha256(token.encode('utf-8')).hexdigest() + with self._lock: + self._cache[token] = [user, _time.time()] + if len(self._cache) > self._maxsize: + sorted_items = sorted( + list(self._cache.items()), + key=(lambda v: v[1][1]) + ) + for i, (t, _) in enumerate(sorted_items): + if i <= self._halfmax: + del self._cache[t] + else: + break + + +class KBaseAuth(object): + ''' + A very basic KBase auth client for the Python server. 
+ ''' + + _LOGIN_URL = 'https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login' + + def __init__(self, auth_url=None): + ''' + Constructor + ''' + self._authurl = auth_url + if not self._authurl: + self._authurl = self._LOGIN_URL + self._cache = TokenCache() + + def get_user(self, token): + if not token: + raise ValueError('Must supply token') + user = self._cache.get_user(token) + if user: + return user + + d = {'token': token, 'fields': 'user_id'} + ret = _requests.post(self._authurl, data=d) + if not ret.ok: + try: + err = ret.json() + except Exception as e: + ret.raise_for_status() + raise ValueError('Error connecting to auth service: {} {}\n{}' + .format(ret.status_code, ret.reason, + err['error']['message'])) + + user = ret.json()['user_id'] + self._cache.add_valid_token(token, user) + return user diff --git a/pyproject.toml b/pyproject.toml index 757d99be5..234b8d86b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,8 @@ exclude = ''' | __pycache__ | lib/__pycache__ | lib/execution_engine2/execution_engine2Impl.py + | lib/execution_engine2/authclient.py + | lib/biokbase/log.py | lib/installed_clients/* ) ''' diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 61bc54f25..083993684 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -10,11 +10,26 @@ exposed to other containers. 
""" +import os +import tempfile +import time + +from configparser import ConfigParser +from threading import Thread from pathlib import Path import pymongo from pytest import fixture +from typing import Dict from tests_for_integration.auth_controller import AuthController -from utils_shared.test_utils import get_test_config +from utils_shared.test_utils import ( + get_full_test_config, + get_ee2_test_config, + EE2_CONFIG_SECTION, + KB_DEPLOY_ENV, + find_free_port, +) +from execution_engine2 import execution_engine2Server +from installed_clients.execution_engine2Client import execution_engine2 as ee2client KEEP_TEMP_FILES = True AUTH_DB = "api_to_db_test" @@ -26,8 +41,13 @@ @fixture(scope="module") -def config(): - yield get_test_config() +def config() -> Dict[str, str]: + yield get_ee2_test_config() + + +@fixture(scope="module") +def full_config() -> ConfigParser: + yield get_full_test_config() @fixture(scope="module") @@ -84,12 +104,76 @@ def auth_url(config, mongo_client): _clean_auth_db(mongo_client) -# TODO start the ee2 service -# TODO wipe the ee2 database between every test +def _update_config_and_create_config_file(full_config, auth_url): + """ + Updates the config in place with the correct auth url for the tests and + writes the updated config to a temporary file. + + Returns the file path. 
+ """ + # Don't call get_ee2_test_config here, we *want* to update the config object in place + # so any other tests that use the config fixture run against the test auth server if they + # access those keys + ee2c = full_config[EE2_CONFIG_SECTION] + ee2c["auth-service-url"] = auth_url + "/api/legacy/KBase/Sessions/Login" + ee2c["auth-service-url-v2"] = auth_url + "/api/v2/token" + ee2c["auth-url"] = auth_url + ee2c["auth-service-url-allow-insecure"] = "true" + deploy = tempfile.mkstemp(".cfg", "deploy-", dir=TEMP_DIR, text=True) + os.close(deploy[0]) -def test_is_admin(auth_url): - import requests + with open(deploy[1], "w") as handle: + full_config.write(handle) - print(requests.get(auth_url).text) + return deploy[1] + + +def _clear_ee2_db(mc: pymongo.MongoClient, config: Dict[str, str]): + ee2 = mc[config["mongo-database"]] + for name in ee2.list_collection_names(): + if not name.startswith("system."): + # don't drop collection since that drops indexes + ee2.get_collection(name).delete_many({}) + + +@fixture(scope="module") +def service(full_config, auth_url, mongo_client, config): + # also updates the config in place so it contains the correct auth urls for any other + # methods that use the config fixture + cfgpath = _update_config_and_create_config_file(full_config, auth_url) + _clear_ee2_db(mongo_client, config) + + # from this point on, calling the get_*_test_config methods will get the temp config file + os.environ[KB_DEPLOY_ENV] = cfgpath + portint = find_free_port() + Thread( + target=execution_engine2Server.start_server, + kwargs={"port": portint}, + daemon=True, + ).start() + time.sleep(0.05) + port = str(portint) + print("running ee2 service at localhost:" + port) + yield port + + # shutdown the server + # SampleServiceServer.stop_server() <-- this causes an error. + # See the server file for the full scoop, but in short, the stop method expects a _proc + # package variable to be set, but start doesn't always set it, and that causes an error. 
+ + if not KEEP_TEMP_FILES: + os.remove(cfgpath) + + +@fixture +def ee2_port(service, mongo_client, config): + _clear_ee2_db(mongo_client, config) + + yield service + + +def test_is_admin(ee2_port): + ee2cli = ee2client("http://localhost:" + ee2_port) + print(ee2cli.status()) # TODO add a test diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index dab793002..d23752074 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -17,6 +17,11 @@ from lib.execution_engine2.utils.CondorTuples import CondorResources, JobInfo +EE2_CONFIG_SECTION = "execution_engine2" +KB_DEPLOY_ENV = "KB_DEPLOYMENT_CONFIG" +DEFAULT_TEST_DEPLOY_CFG = "test/deploy.cfg" + + def bootstrap(): test_env_0 = "../test.env" test_env_1 = "test.env" @@ -440,19 +445,37 @@ def set_custom_roles(auth_url, user, roles): ret.raise_for_status() -def get_test_config(): - config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") +def get_full_test_config() -> ConfigParser: + f""" + Gets the full configuration for ee2, including all sections of the config file. + + If the {KB_DEPLOY_ENV} environment variable is set, loads the configuration from there. + Otherwise, the repo's {DEFAULT_TEST_DEPLOY_CFG} file is used. + """ + config_file = os.environ.get(KB_DEPLOY_ENV, DEFAULT_TEST_DEPLOY_CFG) logging.info(f"Loading config from {config_file}") config_parser = ConfigParser() config_parser.read(config_file) + if config_parser[EE2_CONFIG_SECTION].get("mongo-in-docker-compose"): + config_parser[EE2_CONFIG_SECTION]["mongo-host"] = config_parser[ + EE2_CONFIG_SECTION + ]["mongo-in-docker-compose"] + return config_parser - cfg = {} - for nameval in config_parser.items("execution_engine2"): +def get_ee2_test_config() -> Dict[str, str]: + f""" + Gets the configuration for the ee2 service, e.g. the {EE2_CONFIG_SECTION} section of the + deploy.cfg file. + + If the {KB_DEPLOY_ENV} environment variable is set, loads the configuration from there. 
+ Otherwise, the repo's {DEFAULT_TEST_DEPLOY_CFG} file is used. + """ + cp = get_full_test_config() + + cfg = {} + for nameval in cp.items(EE2_CONFIG_SECTION): cfg[nameval[0]] = nameval[1] - mongo_in_docker = cfg.get("mongo-in-docker-compose", None) - if mongo_in_docker is not None: - cfg["mongo-host"] = cfg["mongo-in-docker-compose"] return cfg From b5d82bc207fe687f157a43aa7d4a1f99112758e7 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 18 Feb 2021 15:21:47 -0800 Subject: [PATCH 013/109] Fix is_admin and get_admin_permission bugs (#315) * Fix is_admin and get_admin_permission bugs And add happy path integration tests for those functions. * Run black --- .../execution_engine2Impl.py | 4 +- test/tests_for_integration/api_to_db_test.py | 68 ++++++++++++++++--- 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index caecb561a..43c9f62a3 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -1650,7 +1650,7 @@ def is_admin(self, ctx): #BEGIN is_admin mr = SDKMethodRunner( self.config, - user_clients=self.get_cfg.get_user_clients(ctx), + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.check_is_admin() @@ -1676,7 +1676,7 @@ def get_admin_permission(self, ctx): #BEGIN get_admin_permission mr = SDKMethodRunner( self.config, - user_clients=self.get_cfg.get_user_clients(ctx), + user_clients=self.gen_cfg.get_user_clients(ctx), mongo_util=self.mongo_util ) returnVal = mr.get_admin_permission() diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 083993684..50880b036 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -27,8 +27,12 @@ EE2_CONFIG_SECTION, KB_DEPLOY_ENV, find_free_port, + create_auth_login_token, + create_auth_user, + 
create_auth_role, + set_custom_roles, ) -from execution_engine2 import execution_engine2Server +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from installed_clients.execution_engine2Client import execution_engine2 as ee2client KEEP_TEMP_FILES = True @@ -39,6 +43,13 @@ # may need to make this configurable JARS_DIR = Path("/opt/jars/lib/jars") +USER_READ_ADMIN = "readuser" +TOKEN_READ_ADMIN = None +USER_NO_ADMIN = "nouser" +TOKEN_NO_ADMIN = None +USER_WRITE_ADMIN = "writeuser" +TOKEN_WRITE_ADMIN = None + @fixture(scope="module") def config() -> Dict[str, str]: @@ -71,6 +82,25 @@ def _clean_auth_db(mongo_client): mongo_client.drop_database(AUTH_DB) +def _set_up_auth_users(auth_url): + create_auth_role(auth_url, ADMIN_READ_ROLE, "ee2 admin read doohickey") + create_auth_role(auth_url, ADMIN_WRITE_ROLE, "ee2 admin write thinger") + + global TOKEN_READ_ADMIN + create_auth_user(auth_url, USER_READ_ADMIN, "display1") + TOKEN_READ_ADMIN = create_auth_login_token(auth_url, USER_READ_ADMIN) + set_custom_roles(auth_url, USER_READ_ADMIN, [ADMIN_READ_ROLE]) + + global TOKEN_NO_ADMIN + create_auth_user(auth_url, USER_NO_ADMIN, "display2") + TOKEN_NO_ADMIN = create_auth_login_token(auth_url, USER_NO_ADMIN) + + global TOKEN_WRITE_ADMIN + create_auth_user(auth_url, USER_WRITE_ADMIN, "display3") + TOKEN_WRITE_ADMIN = create_auth_login_token(auth_url, USER_WRITE_ADMIN) + set_custom_roles(auth_url, USER_WRITE_ADMIN, [ADMIN_WRITE_ROLE]) + + @fixture(scope="module") def auth_url(config, mongo_client): # clean up from any previously failed test runs that left the db in place @@ -94,6 +124,8 @@ def auth_url(config, mongo_client): ) url = f"http://localhost:{auth.port}" + _set_up_auth_users(url) + yield url print(f"shutting down auth, KEEP_TEMP_FILES={KEEP_TEMP_FILES}") @@ -115,9 +147,9 @@ def _update_config_and_create_config_file(full_config, auth_url): # so any other tests that use the config fixture run against the test auth server if they # access 
those keys ee2c = full_config[EE2_CONFIG_SECTION] - ee2c["auth-service-url"] = auth_url + "/api/legacy/KBase/Sessions/Login" - ee2c["auth-service-url-v2"] = auth_url + "/api/v2/token" - ee2c["auth-url"] = auth_url + ee2c["auth-service-url"] = auth_url + "/testmode/api/legacy/KBase/Sessions/Login" + ee2c["auth-service-url-v2"] = auth_url + "/testmode/api/v2/token" + ee2c["auth-url"] = auth_url + "/testmode" ee2c["auth-service-url-allow-insecure"] = "true" deploy = tempfile.mkstemp(".cfg", "deploy-", dir=TEMP_DIR, text=True) @@ -146,6 +178,11 @@ def service(full_config, auth_url, mongo_client, config): # from this point on, calling the get_*_test_config methods will get the temp config file os.environ[KB_DEPLOY_ENV] = cfgpath + # The server creates the configuration, impl, and application *AT IMPORT TIME* so we have to + # import *after* setting the config path. + # This is terrible design. Awful. It definitely wasn't me that wrote it over Xmas in 2012 + from execution_engine2 import execution_engine2Server + portint = find_free_port() Thread( target=execution_engine2Server.start_server, @@ -173,7 +210,22 @@ def ee2_port(service, mongo_client, config): yield service -def test_is_admin(ee2_port): - ee2cli = ee2client("http://localhost:" + ee2_port) - print(ee2cli.status()) - # TODO add a test +def test_is_admin_success(ee2_port): + ee2cli_read = ee2client("http://localhost:" + ee2_port, token=TOKEN_READ_ADMIN) + ee2cli_no = ee2client("http://localhost:" + ee2_port, token=TOKEN_NO_ADMIN) + ee2cli_write = ee2client("http://localhost:" + ee2_port, token=TOKEN_WRITE_ADMIN) + + # note that if we ever need to have Java talk to ee2 these responses will break the SDK client + assert ee2cli_read.is_admin() is True + assert ee2cli_no.is_admin() is False + assert ee2cli_write.is_admin() is True + + +def test_get_admin_permission_success(ee2_port): + ee2cli_read = ee2client("http://localhost:" + ee2_port, token=TOKEN_READ_ADMIN) + ee2cli_no = ee2client("http://localhost:" + 
ee2_port, token=TOKEN_NO_ADMIN) + ee2cli_write = ee2client("http://localhost:" + ee2_port, token=TOKEN_WRITE_ADMIN) + + assert ee2cli_read.get_admin_permission() == {"permission": "r"} + assert ee2cli_no.get_admin_permission() == {"permission": "n"} + assert ee2cli_write.get_admin_permission() == {"permission": "w"} From eba99eee965a2c27af71b92f247a76d596abbdea Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 23 Feb 2021 16:57:45 -0800 Subject: [PATCH 014/109] Fix workspace client access bug & unit test (#317) * Fix workspace client access bug & unit test Fixes a bug where a SDKMR.get_workspace() method was expected but was removed in a previous PR. Creates a unit test for the run() method where the bug occurred. * Run black * Minor test cleanup * run black * typo * Use variables for repetitive literals * Switch tests from multiple asserts to getattr based and a list of fields to check * run black yet again * remove unused import * Remove stray lib in import path * Add comment re getattr functionality * clarify what the hacky equals test method does Rename the method to describe what it actually does and document to make it clear to future programmers it is not a replacement for an equals() method. * Fix choice wording whoops * Improve job equality checking import lines were causing the embedded classes to be counted as different classes. Still can't get a straight job1 == job2 equality to work.
* Remove lib from Job import Doesn't fix the job equality problem but removes a variable * Explain why the filthy job equality hack is required * man black is picky --- lib/execution_engine2/db/models/models.py | 9 + lib/execution_engine2/sdk/EE2Runjob.py | 70 ++--- lib/execution_engine2/sdk/SDKMethodRunner.py | 68 ++++- lib/execution_engine2/utils/CatalogUtils.py | 5 + test/tests_for_sdkmr/EE2Runjob_test.py | 250 ++++++++++++++++++ .../ee2_SDKMethodRunner_test.py | 34 ++- 6 files changed, 400 insertions(+), 36 deletions(-) create mode 100644 test/tests_for_sdkmr/EE2Runjob_test.py diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index f8eb128a7..2c2326138 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -111,6 +111,9 @@ class Meta(EmbeddedDocument): cell_id = StringField() status = StringField() + def __repr__(self): + return self.to_json() + class CondorResourceUsage(EmbeddedDocument): """ @@ -147,6 +150,9 @@ class JobRequirements(EmbeddedDocument): disk = IntField() estimate = EmbeddedDocumentField(Estimate) + def __repr__(self): + return self.to_json() + class JobInput(EmbeddedDocument): """ @@ -164,6 +170,9 @@ class JobInput(EmbeddedDocument): requirements = EmbeddedDocumentField(JobRequirements) narrative_cell_info = EmbeddedDocumentField(Meta, required=True) + def __repr__(self): + return self.to_json() + class JobOutput(EmbeddedDocument): """ diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 33c259f44..ee30e7d5b 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -9,7 +9,7 @@ from enum import Enum from typing import Optional, Dict, NamedTuple, Union, List -from lib.execution_engine2.db.models.models import ( +from execution_engine2.db.models.models import ( Job, JobInput, Meta, @@ -20,7 +20,7 @@ ) from lib.execution_engine2.sdk.EE2Constants import 
ConciergeParams from lib.execution_engine2.utils.CondorTuples import CondorResources -from lib.execution_engine2.utils.KafkaUtils import KafkaCreateJob, KafkaQueueChange +from execution_engine2.utils.KafkaUtils import KafkaCreateJob, KafkaQueueChange class JobPermissions(Enum): @@ -44,7 +44,7 @@ class EE2RunJob: def __init__(self, sdkmr): self.sdkmr = sdkmr # type: SDKMethodRunner self.override_clientgroup = os.environ.get("OVERRIDE_CLIENT_GROUP", None) - self.logger = self.sdkmr.logger + self.logger = self.sdkmr.get_logger() def _init_job_rec( self, @@ -100,15 +100,14 @@ def _init_job_rec( job.job_input = inputs self.logger.debug(job.job_input.to_mongo().to_dict()) - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - self.logger.debug(job.to_mongo().to_dict()) - job.save() + self.logger.debug(job.to_mongo().to_dict()) + job_id = self.sdkmr.save_job(job) - self.sdkmr.kafka_client.send_kafka_message( - message=KafkaCreateJob(job_id=str(job.id), user=user_id) + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaCreateJob(job_id=job_id, user=user_id) ) - return str(job.id) + return job_id def _get_module_git_commit(self, method, service_ver=None) -> Optional[str]: module_name = method.split(".")[0] @@ -118,8 +117,10 @@ def _get_module_git_commit(self, method, service_ver=None) -> Optional[str]: self.logger.debug(f"Getting commit for {module_name} {service_ver}") - module_version = self.sdkmr.catalog_utils.catalog.get_module_version( - {"module_name": module_name, "version": service_ver} + module_version = ( + self.sdkmr.get_catalog_utils() + .get_catalog() + .get_module_version({"module_name": module_name, "version": service_ver}) ) git_commit_hash = module_version.get("git_commit_hash") @@ -194,7 +195,9 @@ def _prepare_to_run(self, params, concierge_params=None) -> PreparedJobParams: self._check_ws_objects(source_objects=params.get("source_ws_objects")) method = params.get("method") # Normalize multiple formats into one format (csv vs json) - 
normalized_resources = self.sdkmr.catalog_utils.get_normalized_resources(method) + normalized_resources = self.sdkmr.get_catalog_utils().get_normalized_resources( + method + ) # These are for saving into job inputs. Maybe its best to pass this into condor as well? extracted_resources = self.sdkmr.get_condor().extract_resources( cgrr=normalized_resources @@ -202,16 +205,16 @@ def _prepare_to_run(self, params, concierge_params=None) -> PreparedJobParams: # insert initial job document into db job_id = self._init_job_rec( - self.sdkmr.user_id, params, extracted_resources, concierge_params + self.sdkmr.get_user_id(), params, extracted_resources, concierge_params ) params["job_id"] = job_id - params["user_id"] = self.sdkmr.user_id - params["token"] = self.sdkmr.token + params["user_id"] = self.sdkmr.get_user_id() + params["token"] = self.sdkmr.get_token() params["cg_resources_requirements"] = normalized_resources self.logger.debug( - f"User {self.sdkmr.user_id} attempting to run job {method} {params}" + f"User {self.sdkmr.get_user_id()} attempting to run job {method} {params}" ) return PreparedJobParams(params=params, job_id=job_id) @@ -249,8 +252,8 @@ def _run(self, params, concierge_params=None): ) self.update_job_to_queued(job_id=job_id, scheduler_id=condor_job_id) - self.sdkmr.slack_client.run_job_message( - job_id=job_id, scheduler_id=condor_job_id, username=self.sdkmr.user_id + self.sdkmr.get_slack_client().run_job_message( + job_id=job_id, scheduler_id=condor_job_id, username=self.sdkmr.get_user_id() ) return job_id @@ -374,23 +377,22 @@ def update_job_to_queued(self, job_id, scheduler_id): # TODO RETRY FOR RACE CONDITION OF RUN/CANCEL # TODO PASS QUEUE TIME IN FROM SCHEDULER ITSELF? # TODO PASS IN SCHEDULER TYPE? 
- with self.sdkmr.get_mongo_util().mongo_engine_connection(): - j = self.sdkmr.get_mongo_util().get_job(job_id=job_id) - previous_status = j.status - j.status = Status.queued.value - j.queued = time.time() - j.scheduler_id = scheduler_id - j.scheduler_type = "condor" - j.save() - - self.sdkmr.kafka_client.send_kafka_message( - message=KafkaQueueChange( - job_id=str(j.id), - new_status=j.status, - previous_status=previous_status, - scheduler_id=scheduler_id, - ) + j = self.sdkmr.get_mongo_util().get_job(job_id=job_id) + previous_status = j.status + j.status = Status.queued.value + j.queued = time.time() + j.scheduler_id = scheduler_id + j.scheduler_type = "condor" + self.sdkmr.save_job(j) + + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaQueueChange( + job_id=str(j.id), + new_status=j.status, + previous_status=previous_status, + scheduler_id=scheduler_id, ) + ) def get_job_params(self, job_id, as_admin=False): """ diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 97cef00d8..9610779a7 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -12,12 +12,13 @@ import time from datetime import datetime from enum import Enum +from logging import Logger import dateutil from installed_clients.authclient import KBaseAuth from lib.execution_engine2.db.MongoUtil import MongoUtil -from lib.execution_engine2.db.models.models import Job +from execution_engine2.db.models.models import Job from lib.execution_engine2.exceptions import AuthError from lib.execution_engine2.sdk import ( EE2Runjob, @@ -34,6 +35,7 @@ from lib.execution_engine2.utils.SlackUtils import SlackClient from execution_engine2.utils.clients import UserClientSet from execution_engine2.utils.arg_processing import parse_bool +from installed_clients.WorkspaceClient import Workspace class JobPermissions(Enum): @@ -140,6 +142,62 @@ def get_condor(self) -> Condor: self.condor = 
Condor(self.deployment_config_fp) return self.condor + # A note on getters: + # Getters are commonly described as unpythonic. However, accessing instance variables + # directly, rather than via getters, causes significant problems when mocking a class in + # that instance variables cannot be detected by create_autospec with spec_set=True, and thus + # cannot be mocked in a rigorous way. The danger of not using spec_set=True is that if a + # mocked class's API changes, the unit tests will still pass. Thus the choice is between + # unpythonic getters or false positives in unit tests, and we choose the former. + # For more details: https://www.seanh.cc/2017/03/17/the-problem-with-mocks/ + + def get_workspace(self) -> Workspace: + """ + Get the workspace client for this instance of SDKMR. + """ + return self.workspace + + def get_logger(self) -> Logger: + """ + Get the logger for this instance of SDKMR. + """ + # There's not really any way to meaningfully test this method without passing in the + # logger, which seems... overkill? + return self.logger + + def get_catalog_utils(self) -> CatalogUtils: + """ + Get the catalog utilities for this instance of SDKMR. + """ + # TODO Unit test this method once catalog_utils can be mocked. + return self.catalog_utils + + def get_kafka_client(self) -> KafkaClient: + """ + Get the Kafka client for this instance of SDKMR. + """ + # TODO Unit test this method once kafka_client can be mocked. + return self.kafka_client + + def get_slack_client(self) -> SlackClient: + """ + Get the Kafka client for this instance of SDKMR. + """ + # TODO Unit test this method once slack_client can be mocked. + return self.slack_client + + def get_user_id(self) -> str: + """ + Get the user id of the user for this instance of SDKMR. + """ + return self.user_id + + def get_token(self) -> str: + """ + Get the token of the user for this instance of SDKMR. 
+ """ + return self.token + # Permissions Decorators #TODO Verify these actually work #TODO add as_admin to these def allow_job_read(func): @@ -175,6 +233,14 @@ def check_as_concierge(self): "You are not the concierge user. This method is not for you" ) + def save_job(self, job: Job): + """ + Save a job record to the Mongo database. + """ + # The purpose of this method is to allow unit testing the various EE2*.py classes. + job.save() + return str(job.id) + # API ENDPOINTS # ENDPOINTS: Admin Related Endpoints diff --git a/lib/execution_engine2/utils/CatalogUtils.py b/lib/execution_engine2/utils/CatalogUtils.py index 214ac28b2..1f970f10e 100644 --- a/lib/execution_engine2/utils/CatalogUtils.py +++ b/lib/execution_engine2/utils/CatalogUtils.py @@ -8,6 +8,11 @@ class CatalogUtils: def __init__(self, url, admin_token): self.catalog = Catalog(url=url, token=admin_token) + def get_catalog(self): + """ Get the catalog client for this instance. """ + # TODO unit test this method after switching to dependency injection + return self.catalog + def get_normalized_resources(self, method) -> Dict: """ get client groups info from Catalog diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py new file mode 100644 index 000000000..aa453a92e --- /dev/null +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -0,0 +1,250 @@ +""" +Unit tests for the EE2Runjob class. +""" + +# Incomplete by a long way. Will add more unit tests as they come up. 
+ +from typing import List +from bson.objectid import ObjectId +from logging import Logger +from unittest.mock import create_autospec +from execution_engine2.db.models.models import Job, JobInput, JobRequirements, Meta +from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.CatalogUtils import CatalogUtils +from execution_engine2.utils.Condor import ( + Condor, + CondorResources, + SubmissionInfo, +) +from execution_engine2.utils.KafkaUtils import ( + KafkaClient, + KafkaQueueChange, + KafkaCreateJob, +) +from execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.db.MongoUtil import MongoUtil +from installed_clients.WorkspaceClient import Workspace +from installed_clients.CatalogClient import Catalog + + +def test_run_as_admin(): + """ + A basic unit test of the run() method with an administrative user. + + This test is a fairly minimal test of the run() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. + """ + + # Set up data variables + job_id = "603051cfaf2e3401b0500982" + git_commit = "git5678" + ws_obj1 = "1/2/3" + ws_obj2 = "4/5/6" + client_group = "grotesquememlong" + cpus = "4" + mem = "32M" + cluster = "cluster42" + method = "lolcats.lol_unto_death" + user = "someuser" + token = "tokentokentoken" + created_state = "created" + queued_state = "queued" + + # The amount of mocking required here implies the method should be broken up into smaller + # classes that are individually mockable. Or maybe it's just really complicated and this + # is the best we can do. Worth looking into at some point though. + + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. 
+ sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catutils = create_autospec(CatalogUtils, spec_set=True, instance=True) + condor = create_autospec(Condor, spec_set=True, instance=True) + kafka = create_autospec(KafkaClient, spec_set=True, instance=True) + logger = create_autospec(Logger, spec_set=True, instance=True) + mongo = create_autospec(MongoUtil, spec_set=True, instance=True) + slack = create_autospec(SlackClient, spec_set=True, instance=True) + ws = create_autospec(Workspace, spec_set=True, instance=True) + # Set up basic getter calls + catutils.get_catalog.return_value = catalog + sdkmr.get_catalog_utils.return_value = catutils + sdkmr.get_condor.return_value = condor + sdkmr.get_kafka_client.return_value = kafka + sdkmr.get_logger.return_value = logger + sdkmr.get_mongo_util.return_value = mongo + sdkmr.get_slack_client.return_value = slack + sdkmr.get_token.return_value = token + sdkmr.get_user_id.return_value = user + sdkmr.get_workspace.return_value = ws + + # Set up call returns. 
These calls are in the order they occur in the code + sdkmr.check_as_admin.return_value = True + sdkmr.save_job.return_value = job_id + ws.get_object_info3.return_value = {"paths": [[ws_obj1], [ws_obj2]]} + catalog_resources = { + "client_group": client_group, + "request_cpus": cpus, + "request_memory": mem, + } + catutils.get_normalized_resources.return_value = catalog_resources + condor.extract_resources.return_value = CondorResources( + cpus, "2600GB", mem, client_group + ) + catalog.get_module_version.return_value = {"git_commit_hash": git_commit} + condor.run_job.return_value = SubmissionInfo(cluster, {}, None) + retjob = Job() + retjob.id = ObjectId(job_id) + retjob.status = created_state + mongo.get_job.return_value = retjob + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = { + "method": method, + "source_ws_objects": [ws_obj1, ws_obj2], + } + assert rj.run(params, as_admin=True) == job_id + + # check mocks called as expected. The order here is the order that they're called in the code. 
+ sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) + ws.get_object_info3.assert_called_once_with( + {"objects": [{"ref": ws_obj1}, {"ref": ws_obj2}], "ignoreErrors": 1} + ) + catutils.get_normalized_resources.assert_called_once_with(method) + condor.extract_resources.assert_called_once_with(catalog_resources) + catalog.get_module_version.assert_called_once_with( + {"module_name": "lolcats", "version": "release"} + ) + + # initial job data save + expected_job = Job() + expected_job.user = user + expected_job.status = created_state + ji = JobInput() + ji.method = method + ji.service_ver = git_commit + ji.source_ws_objects = [ws_obj1, ws_obj2] + ji.parent_job_id = "None" + jr = JobRequirements() + jr.clientgroup = client_group + jr.cpu = cpus + jr.memory = "32" + jr.disk = "2600" + ji.requirements = jr + ji.narrative_cell_info = Meta() + expected_job.job_input = ji + assert len(sdkmr.save_job.call_args_list) == 2 + got_job = sdkmr.save_job.call_args_list[0][0][0] + assert_jobs_equal(got_job, expected_job) + + kafka.send_kafka_message.assert_any_call(KafkaCreateJob(user, job_id)) + condor.run_job.assert_called_once_with( + params={ + "method": method, + "source_ws_objects": [ws_obj1, ws_obj2], + "service_ver": git_commit, + "job_id": job_id, + "user_id": user, + "token": token, + "cg_resources_requirements": { + "client_group": client_group, + "request_cpus": cpus, + "request_memory": mem, + }, + }, + concierge_params=None, + ) + + # updated job data save + mongo.get_job.assert_called_once_with(job_id) + + # update to queued state + got_job = sdkmr.save_job.call_args_list[1][0][0] + expected_job = Job() + expected_job.id = ObjectId(job_id) + expected_job.status = queued_state + # no way to test this really without code refactoring + expected_job.queued = got_job.queued + + expected_job.scheduler_type = "condor" + expected_job.scheduler_id = cluster + assert_jobs_equal(got_job, expected_job) + + kafka.send_kafka_message.assert_called_with( # update to 
queued state + KafkaQueueChange( + job_id=job_id, + new_status=queued_state, + previous_status=created_state, + scheduler_id=cluster, + ) + ) + slack.run_job_message.assert_called_once_with(job_id, cluster, user) + + +def assert_jobs_equal(got_job: Job, expected_job: Job): + """ + Checks that the two jobs are equivalent, except that the 'updated' fields are checked that + they're within 1 second of each other. + """ + # Job inherits from Document which inherits from BaseDocument in MongoEngine. BD provides + # the __eq__ method for the hierarchy, which bases equality on the Jobs having equal id + # fields, or if no id is present, on identity. Therefore + # assert job1 == job2 + # will not work as a test mechanic. + # JobInput and its contained classes inherit from EmbeddedDocument which *does* have an + # __eq__ method that takes the class fields into account. + # Also note that all these classes use __slots__ so vars() and __dict__ are empty other + # than the class name. + # Hence we do this disgusting hack instead. Note it will need to be updated any time a + # job field is added. + + if not hasattr(got_job, "id"): + assert not hasattr(expected_job, "id") + else: + assert got_job.id == expected_job.id + + # The Job class fills the updated field with the output of time.time on instantiation + # so we can't do a straight equality + assert abs(got_job.updated - expected_job.updated) < 1 + + job_fields = [ + "user", + "authstrat", + "wsid", + "status", + "queued", + "estimating", + "running", + "finished", + "errormsg", + "msg", + "error", + "terminated_code", + "error_code", + "scheduler_type", + "scheduler_id", + "scheduler_estimator_id", + "job_input", + "job_output", + "condor_job_ads", + "child_jobs", + "batch_job", + ] + + _assert_field_subset_equal(got_job, expected_job, job_fields) + + +def _assert_field_subset_equal(obj1: object, obj2: object, fields: List[str]): + """ + Checks that field subsets from two objects are the same. 
+ + :param obj1: The first object + :param obj2: The second object + :param fields: The fields in the objects to compare for equality. Any fields in the object + not in this list are ignored and not included in the equality calculation. + :raises AttributeError: If the field is not present in one or both of the objects. + """ + for field in fields: + assert getattr(obj1, field) == getattr(obj2, field), field diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index 1134e1f64..a2fbaace6 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -8,7 +8,7 @@ from configparser import ConfigParser from datetime import datetime, timedelta, timezone from pprint import pprint -from unittest.mock import patch +from unittest.mock import patch, create_autospec from pytest import raises import bson @@ -17,12 +17,14 @@ from bson import ObjectId from mock import MagicMock +from execution_engine2.authorization.workspaceauth import WorkspaceAuth from lib.execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.db.models.models import Job, Status, TerminatedCode from lib.execution_engine2.exceptions import AuthError from lib.execution_engine2.exceptions import InvalidStatusTransitionException from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from execution_engine2.utils.clients import UserClientSet from execution_engine2.utils.clients import get_user_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper from test.utils_shared.test_utils import ( @@ -39,6 +41,8 @@ from lib.execution_engine2.sdk.EE2Runjob import EE2RunJob +from installed_clients.WorkspaceClient import Workspace + # TODO this isn't necessary with pytest, can just use regular old functions class 
ee2_SDKMethodRunner_test(unittest.TestCase): @@ -133,6 +137,34 @@ def _init_fail(self, cfg, user_clients, expected): SDKMethodRunner(cfg, user_clients) assert_exception_correct(e.value, expected) + def test_getters(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + cliset = UserClientSet("user", "token", ws, wsa) + sdkmr = SDKMethodRunner(self.cfg, cliset) + + assert sdkmr.get_workspace() is ws + assert sdkmr.get_user_id() == "user" + assert sdkmr.get_token() == "token" + + def test_save_job(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + cliset = UserClientSet("user", "token", ws, wsa) + sdkmr = SDKMethodRunner(self.cfg, cliset) + + # We cannot use spec_set=True here because the code must access the Job.id field, + # which is set dynamically. This means if the Job api changes, this test could pass + # when it should fail, but there doesn't seem to be a way around that other than + # completely rewriting how the code interfaces with MongoDB. + # For a discussion of spec_set see + # https://www.seanh.cc/2017/03/17/the-problem-with-mocks/ + j = create_autospec(Job, spec_set=False, instance=True) + j.id = bson.objectid.ObjectId("603051cfaf2e3401b0500982") + assert sdkmr.save_job(j) == "603051cfaf2e3401b0500982" + + j.save.assert_called_once_with() + # Status @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_cancel_job(self, condor): From e2c8086bd1f52b3ca488882c493aaaa9704626ad Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 25 Feb 2021 15:07:51 -0800 Subject: [PATCH 015/109] Fix date range job search bug (#319) * Fix date range job search bug and add unit tests Previously if the user was None it would be set to the user's name, but that behavior was mistakenly removed when making user_id and token required fields in SDKMR. Without that behavior, a user = None filter is added to the mongo query which matches no jobs. 
* run black * remove unused imports * remove unused import * Minor test cleanup & new test Since the second if block was altered added a test to exercise the positive path. * run black --- lib/execution_engine2/sdk/EE2StatusRange.py | 25 ++-- lib/execution_engine2/sdk/SDKMethodRunner.py | 36 ++++- test/tests_for_sdkmr/EE2StatusRange_test.py | 133 ++++++++++++++++++ .../ee2_SDKMethodRunner_test.py | 2 +- 4 files changed, 180 insertions(+), 16 deletions(-) create mode 100644 test/tests_for_sdkmr/EE2StatusRange_test.py diff --git a/lib/execution_engine2/sdk/EE2StatusRange.py b/lib/execution_engine2/sdk/EE2StatusRange.py index 9abc06e84..724dda322 100644 --- a/lib/execution_engine2/sdk/EE2StatusRange.py +++ b/lib/execution_engine2/sdk/EE2StatusRange.py @@ -7,10 +7,10 @@ from bson import ObjectId from execution_engine2.utils.arg_processing import parse_bool -from lib.execution_engine2.db.models.models import Job -from lib.execution_engine2.exceptions import AuthError +from execution_engine2.exceptions import AuthError +# TODO this class is duplicated all over the place, move to common file class JobPermissions(Enum): READ = "r" WRITE = "w" @@ -75,12 +75,14 @@ def check_jobs_date_range_for_user( if offset is None: offset = 0 + if user is None: + user = self.sdkmr.get_user_id() # Admins can view "ALL" or check_jobs for other users - if user != self.sdkmr.user_id: + elif user != self.sdkmr.get_user_id(): if not self.sdkmr.check_is_admin(): raise AuthError( "You are not authorized to view all records or records for others. 
" - + f"user={user} token={self.sdkmr.user_id}" + + f"user={user} token={self.sdkmr.get_user_id()}" ) dummy_ids = self._get_dummy_dates(creation_start_time, creation_end_time) @@ -116,17 +118,12 @@ def check_jobs_date_range_for_user( if user != "ALL": job_filter_temp["user"] = user - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - count = Job.objects.filter(**job_filter_temp).count() - jobs = ( - Job.objects[:limit] - .filter(**job_filter_temp) - .order_by(f"{sort_order}_id") - .skip(offset) - .only(*job_projection) - ) + count = self.sdkmr.get_job_counts(job_filter_temp) + jobs = self.sdkmr.get_jobs( + job_filter_temp, job_projection, sort_order, offset, limit + ) - self.sdkmr.logger.debug( + self.sdkmr.get_logger().debug( f"Searching for jobs with id_gt {dummy_ids.start} id_lt {dummy_ids.stop}" ) diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 9610779a7..b66d4e71a 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -233,14 +233,48 @@ def check_as_concierge(self): "You are not the concierge user. This method is not for you" ) + # The next few methods allow for unit testing the various EE2*.py classes. + # They could also be moved to the MongoUtil class, but there doesn't appear to be a need + # at this point since MongoEngine creates a global connection to MongoDB + # and makes it available to all the model objects. + def save_job(self, job: Job): """ Save a job record to the Mongo database. """ - # The purpose of this method is to allow unit testing the various EE2*.py classes. job.save() return str(job.id) + def get_job_counts(self, job_filter): + """ + Get the number of jobs matching a filter. + + job_filter - a dict of keys to filter terms in the MongoEngine filter language. 
+ """ + return Job.objects.filter(**job_filter).count() + + def get_jobs(self, job_filter, job_projection, sort_order, offset, limit): + """ + Get jobs from the database. + + job_filter - a dict of keys to filter terms in the MongoEngine filter language. + job_projection - a list of field names to include in the returned jobs. + sort_order - '+' to sort by job ID ascending, '-' descending. + offset - the number of jobs to skip before returning results. + limit - the maximum number of jobs to return. + """ + # TODO Instead of SKIP use ID GT LT + # https://www.codementor.io/arpitbhayani/fast-and-efficient-pagination-in-mongodb-9095flbqr + # ^ this one is important - the workspace was DOSed by a single open narrative at one + # point due to skip abuse, which is why it was removed + return ( + Job.objects[:limit] + .filter(**job_filter) + .order_by(f"{sort_order}_id") + .skip(offset) + .only(*job_projection) + ) + # API ENDPOINTS # ENDPOINTS: Admin Related Endpoints diff --git a/test/tests_for_sdkmr/EE2StatusRange_test.py b/test/tests_for_sdkmr/EE2StatusRange_test.py new file mode 100644 index 000000000..7d2e14db4 --- /dev/null +++ b/test/tests_for_sdkmr/EE2StatusRange_test.py @@ -0,0 +1,133 @@ +""" +Unit tests for the EE2StatusRange class. +""" + +from pytest import raises + +from logging import Logger +from unittest.mock import create_autospec, call +from bson.objectid import ObjectId + +from execution_engine2.exceptions import AuthError +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.sdk.EE2StatusRange import JobStatusRange +from execution_engine2.db.models.models import Job + +from utils_shared.test_utils import assert_exception_correct + +# Incomplete by a long way. Will add more unit tests as they come up. + +USER1 = "user1" + + +def test_run_minimal_no_user_in_input(): + """ + Tests a minimal run of the job lookup method as a standard user with no username passed into + the method. + The returned job has minimal fields. 
+ """ + _run_minimal(None) + + +def test_run_minimal_self_user_in_input(): + """ + Tests a minimal run of the job lookup method as a standard user with the user's own username + passed into the method. + The returned job has minimal fields. + """ + _run_minimal(USER1) + + +def _run_minimal(user): + # set up constants + expected_user = USER1 + job_count = 26 + objectid = "603051cfaf2e3401b0500982" + created_state = "created" + expected_job_filter = { + "id__gt": "000000230000000000000000", + "id__lt": "0000005c0000000000000000", + "user": expected_user, + } + + # set up mock return values. Ordered as per the call order in the EE2SR code. + sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) + logger = create_autospec(Logger, spec_set=True, instance=True) + sdkmr.get_logger.return_value = logger + sdkmr.get_user_id.return_value = expected_user + sdkmr.check_and_convert_time.side_effect = [35.6, 92.4] + sdkmr.get_job_counts.return_value = job_count + + j = Job() + j.id = ObjectId(objectid) + j.user = expected_user + j.updated = 1000000.0 + j.status = created_state + sdkmr.get_jobs.return_value = [j] + + # call the method + ee2sr = JobStatusRange(sdkmr) + ret = ee2sr.check_jobs_date_range_for_user("5/6/21", "7/6/21", user=user) + + assert ret == { + "count": 1, + "filter": expected_job_filter, + "jobs": [ + { + "_id": objectid, + "authstrat": "kbaseworkspace", + "batch_job": False, + "child_jobs": [], + # this comes from the ObjectID, which has an embedded date + "created": 1613779407000, + "job_id": objectid, + "status": created_state, + "updated": 1000000000, + "user": expected_user, + } + ], + "limit": 2000, + "projection": [], + "query_count": job_count, + "skip": 0, + "sort_order": "+", + "stats": { + "app_id": {None: 1}, + "clientgroup": {None: 1}, + "method": {None: 1}, + "status": {created_state: 1}, + "user": {expected_user: 1}, + "wsid": {None: 1}, + }, + } + + # check mocks called as expected. 
Ordered as per the call order in the EE2SR code + sdkmr.check_and_convert_time.assert_has_calls([call("5/6/21"), call("7/6/21")]) + sdkmr.get_job_counts.assert_called_once_with(expected_job_filter) + sdkmr.get_jobs.assert_called_once_with(expected_job_filter, [], "+", 0, 2000) + logger.debug.assert_called_once_with( + "Searching for jobs with id_gt 000000230000000000000000 id_lt 0000005c0000000000000000" + ) + + +def test_run_with_non_matching_user_and_not_admin(): + """ + Test that a user trying to see another user's jobs without admin privs fails as expected. + """ + sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) + sdkmr.get_user_id.return_value = "user1" + sdkmr.check_is_admin.return_value = False + + ee2sr = JobStatusRange(sdkmr) + with raises(Exception) as got: + ee2sr.check_jobs_date_range_for_user("5/6/21", "7/6/21", user="user2") + assert_exception_correct( + got.value, + AuthError( + "You are not authorized to view all records or records for others. " + + "user=user2 token=user1" + ), + ) + + sdkmr.get_user_id.assert_has_calls([call(), call()]) + sdkmr.check_is_admin.assert_called_once_with() diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index a2fbaace6..ba7d10592 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -20,7 +20,7 @@ from execution_engine2.authorization.workspaceauth import WorkspaceAuth from lib.execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.db.models.models import Job, Status, TerminatedCode -from lib.execution_engine2.exceptions import AuthError +from execution_engine2.exceptions import AuthError from lib.execution_engine2.exceptions import InvalidStatusTransitionException from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources From 
aecddbf3357eba07c2da92a1538387a2779e6ea1 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 1 Mar 2021 12:01:32 -0800 Subject: [PATCH 016/109] Pass in clients via dependency injection (#320) * Pass in clients via dependency injection Allows the clients to be mocked, and therefore makes it easier to unit test SDKMR without monkey patching. Also makes it easier to swap out client implementations as long as the new implementation has the same interface, especially at runtime for alternate configurations. * Run black and a little cleanup * Remove unused imports * more unused imports * Try and figure out when the tests aren't passing They pass locally, why not in GHA * sometimes black really gets on my nerves * Test fix attempt #1 It appears that the tests may be passing locally because the test that changes the environment is run last, while it's run just before the test that fails in github actions. * remove debugging code --- .../execution_engine2Impl.py | 80 +++++------ .../sdk/EE2Authentication.py | 4 +- lib/execution_engine2/sdk/EE2Logs.py | 22 ++- lib/execution_engine2/sdk/SDKMethodRunner.py | 61 ++++---- lib/execution_engine2/utils/CatalogUtils.py | 6 +- lib/execution_engine2/utils/clients.py | 131 ++++++++++++++---- test/tests_for_auth/ee2_admin_mode_test.py | 94 ++++++++----- test/tests_for_integration/api_to_db_test.py | 7 +- .../ee2_SDKMethodRunner_EE2Logs_test.py | 13 +- .../ee2_SDKMethodRunner_test.py | 51 +++++-- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 12 +- ...ee2_SDKMethodRunner_test_EE2Status_test.py | 7 +- .../ee2_SDKMethodRunner_test_utils.py | 14 +- test/tests_for_sdkmr/ee2_load_test.py | 62 ++++----- test/utils_shared/mock_utils.py | 92 ++++++++++++ 15 files changed, 426 insertions(+), 230 deletions(-) create mode 100644 test/utils_shared/mock_utils.py diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 43c9f62a3..fac90c96f 100644 ---
a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -1,12 +1,13 @@ # -*- coding: utf-8 -*- #BEGIN_HEADER +import os import time from cachetools import TTLCache -from lib.execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.utils.APIHelpers import GenerateFromConfig +from execution_engine2.utils.clients import get_client_set #END_HEADER @@ -59,8 +60,10 @@ def __init__(self, config): self.admin_permissions_cache = TTLCache( maxsize=self.ADMIN_ROLES_CACHE_SIZE, ttl=self.ADMIN_ROLES_CACHE_EXPIRE_TIME ) - self.mongo_util = MongoUtil(config) self.gen_cfg = GenerateFromConfig(config) + # move these into GFC? Since they're only generated once it doesn't seem necessary + configpath = os.environ["KB_DEPLOYMENT_CONFIG"] + self.clients = get_client_set(config, configpath) #END_CONSTRUCTOR pass @@ -226,10 +229,10 @@ def run_job(self, ctx, params): # return variables are: job_id #BEGIN run_job mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients = self.clients, job_permission_cache=self.job_permission_cache, - admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util + admin_permissions_cache=self.admin_permissions_cache ) job_id = mr.run_job(params) #END run_job @@ -298,10 +301,10 @@ def run_job_batch(self, ctx, params, batch_params): # return variables are: job_ids #BEGIN run_job_batch mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients = self.clients, job_permission_cache=self.job_permission_cache, - admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util + admin_permissions_cache=self.admin_permissions_cache ) job_ids = mr.run_job_batch(params, batch_params) #END run_job_batch @@ -327,10 +330,10 @@ def abandon_children(self, ctx, params): # return variables are: parent_and_child_ids #BEGIN 
abandon_children mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, - admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util + admin_permissions_cache=self.admin_permissions_cache, ) parent_and_child_ids = mr.abandon_children(parent_job_id=params['parent_job_id'], child_job_ids=params['child_job_ids'], @@ -410,9 +413,8 @@ def run_job_concierge(self, ctx, params, concierge_params): # return variables are: job_id #BEGIN run_job_concierge mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) job_id = mr.run_job_concierge(params=params,concierge_params=concierge_params) #END run_job_concierge @@ -480,11 +482,10 @@ def get_job_params(self, ctx, params): # return variables are: params #BEGIN get_job_params mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) params = mr.get_job_params(job_id=params['job_id'], as_admin=params.get('as_admin')) #END get_job_params @@ -509,11 +510,10 @@ def update_job_status(self, ctx, params): # return variables are: job_id #BEGIN update_job_status mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) job_id = mr.update_job_status(job_id=params['job_id'], status=params['status'], @@ -547,11 +547,10 @@ def add_job_logs(self, ctx, params, lines): # return variables are: results #BEGIN add_job_logs mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, 
admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) add_job_logs = mr.add_job_logs(job_id=params['job_id'], log_lines=lines, as_admin=params.get('as_admin')) @@ -598,11 +597,10 @@ def get_job_logs(self, ctx, params): raise ValueError("Please provide only one of skip_lines or offset") mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) returnVal = mr.view_job_logs( job_id=params["job_id"], @@ -639,11 +637,10 @@ def finish_job(self, ctx, params): # ctx is the context object #BEGIN finish_job mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) mr.finish_job( job_id=params["job_id"], @@ -668,11 +665,10 @@ def start_job(self, ctx, params): # ctx is the context object #BEGIN start_job mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) mr.start_job( params["job_id"], skip_estimation=params.get("skip_estimation", True), @@ -781,9 +777,8 @@ def check_job(self, ctx, params): # return variables are: job_state #BEGIN check_job mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) job_state = mr.check_job( params["job_id"], exclude_fields=params.get("exclude_fields", None), @@ -988,9 +983,8 @@ def check_job_batch(self, ctx, params): # return variables are: returnVal #BEGIN check_job_batch mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - 
mongo_util=self.mongo_util + clients=self.clients, ) returnVal = mr.check_job_batch( parent_job_id=params["job_id"], exclude_fields=params.get("exclude_fields", None), @@ -1107,9 +1101,8 @@ def check_jobs(self, ctx, params): # return variables are: returnVal #BEGIN check_jobs mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) returnVal = mr.check_jobs( params.get("job_ids"), @@ -1228,9 +1221,10 @@ def check_workspace_jobs(self, ctx, params): # ctx is the context object # return variables are: returnVal #BEGIN check_workspace_jobs - mr = SDKMethodRunner(self.config, - user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util) + mr = SDKMethodRunner( + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, + ) returnVal = mr.check_workspace_jobs( params.get("workspace_id"), exclude_fields=params.get("exclude_fields", None), @@ -1261,11 +1255,10 @@ def cancel_job(self, ctx, params): # ctx is the context object #BEGIN cancel_job mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) mr.cancel_job( @@ -1300,9 +1293,8 @@ def check_job_canceled(self, ctx, params): # return variables are: result #BEGIN check_job_canceled mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) result = mr.check_job_canceled(job_id=params["job_id"], as_admin=params.get('as_admin')) #END check_job_canceled @@ -1327,11 +1319,10 @@ def get_job_status(self, ctx, params): # return variables are: result #BEGIN get_job_status mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, 
admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) result = mr.get_job_status_field(job_id=params['job_id'], as_admin=params.get('as_admin')) #END get_job_status @@ -1455,9 +1446,8 @@ def check_jobs_date_range_for_user(self, ctx, params): # return variables are: returnVal #BEGIN check_jobs_date_range_for_user mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) returnVal = mr.check_jobs_date_range_for_user( creation_start_time=params.get("start_time"), @@ -1591,9 +1581,8 @@ def check_jobs_date_range_for_all(self, ctx, params): # return variables are: returnVal #BEGIN check_jobs_date_range_for_all mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) returnVal = mr.check_jobs_date_range_for_user( creation_start_time=params.get("start_time"), @@ -1626,9 +1615,8 @@ def handle_held_job(self, ctx, cluster_id): # return variables are: returnVal #BEGIN handle_held_job mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) returnVal = mr.handle_held_job(cluster_id=cluster_id) #END handle_held_job @@ -1649,9 +1637,8 @@ def is_admin(self, ctx): # return variables are: returnVal #BEGIN is_admin mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) returnVal = mr.check_is_admin() #END is_admin @@ -1675,9 +1662,8 @@ def get_admin_permission(self, ctx): # return variables are: returnVal #BEGIN get_admin_permission mr = SDKMethodRunner( - self.config, user_clients=self.gen_cfg.get_user_clients(ctx), - mongo_util=self.mongo_util + clients=self.clients, ) returnVal = mr.get_admin_permission() #END get_admin_permission diff --git a/lib/execution_engine2/sdk/EE2Authentication.py 
b/lib/execution_engine2/sdk/EE2Authentication.py index 973103a67..649d6ab1f 100644 --- a/lib/execution_engine2/sdk/EE2Authentication.py +++ b/lib/execution_engine2/sdk/EE2Authentication.py @@ -3,7 +3,6 @@ from cachetools import TTLCache from lib.execution_engine2.authorization.authstrategy import can_read_job, can_write_job -from lib.execution_engine2.authorization.roles import AdminAuthUtil from lib.execution_engine2.db.models.models import Job from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE @@ -31,8 +30,7 @@ def get_cache(cache, size, expire): return cache def _lookup_admin_permissions(self): - aau = AdminAuthUtil(self.sdkmr.auth_url, self.sdkmr.admin_roles) - p = aau.get_admin_role( + p = self.sdkmr.auth_admin.get_admin_role( token=self.sdkmr.token, read_role=ADMIN_READ_ROLE, write_role=ADMIN_WRITE_ROLE, diff --git a/lib/execution_engine2/sdk/EE2Logs.py b/lib/execution_engine2/sdk/EE2Logs.py index daca2347e..e9d19de9e 100644 --- a/lib/execution_engine2/sdk/EE2Logs.py +++ b/lib/execution_engine2/sdk/EE2Logs.py @@ -23,7 +23,6 @@ class AddLogResult(NamedTuple): class EE2Logs: def __init__(self, sdkmr): self.sdkmr = sdkmr - self.mongo_util = self.sdkmr.get_mongo_util() def _format_job_logs(self, record_position, log_lines): @@ -49,14 +48,13 @@ def _create_new_log(self, pk, log_lines: list): :param log_lines: The lines to add to this log :return: """ - with self.mongo_util.mongo_engine_connection(): - jl = JLModel() - jl.primary_key = pk - jl.original_line_count = 0 - jl.stored_line_count = 0 - jl.lines = self._format_job_logs(record_position=-1, log_lines=log_lines) - jl.original_line_count = jl.stored_line_count = len(log_lines) - jl.save() + jl = JLModel() + jl.primary_key = pk + jl.original_line_count = 0 + jl.stored_line_count = 0 + jl.lines = self._format_job_logs(record_position=-1, log_lines=log_lines) + jl.original_line_count = jl.stored_line_count = len(log_lines) + jl.save() return jl def _add_first_logs(self, log_lines, 
job_id): @@ -76,7 +74,7 @@ def _add_subsequent_logs(self, job_log, log_lines): record_position=job_log["stored_line_count"] - 1, log_lines=log_lines ) record_count = int(job_log["stored_line_count"]) + len(formatted_logs) - slc = self.mongo_util._push_job_logs( + slc = self.sdkmr.mongo_util._push_job_logs( formatted_logs, job_id=job_log["_id"], record_count=record_count ) return AddLogResult(success=True, stored_line_count=slc) @@ -109,7 +107,7 @@ def add_job_logs(self, job_id, log_lines, as_admin=False) -> AddLogResult: self.sdkmr.logger.debug(f"About to add logs for {job_id}") try: try: - job_log = self.mongo_util.get_job_log_pymongo(job_id) + job_log = self.sdkmr.mongo_util.get_job_log_pymongo(job_id) except RecordNotFoundException: return self._add_first_logs(log_lines=log_lines, job_id=job_id) return self._add_subsequent_logs(job_log, log_lines) @@ -145,7 +143,7 @@ def _get_job_logs(self, job_id, skip_lines, limit=None) -> Dict: :return: """ - log = self.mongo_util.get_job_log_pymongo(job_id) + log = self.sdkmr.mongo_util.get_job_log_pymongo(job_id) lines = [] last_line_number = 0 count = len(log.get("lines", [])) diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index b66d4e71a..bef390a4f 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -8,7 +8,6 @@ * Clients are only loaded if they are necessary """ -import os import time from datetime import datetime from enum import Enum @@ -16,7 +15,6 @@ import dateutil -from installed_clients.authclient import KBaseAuth from lib.execution_engine2.db.MongoUtil import MongoUtil from execution_engine2.db.models.models import Job from lib.execution_engine2.exceptions import AuthError @@ -30,12 +28,11 @@ from lib.execution_engine2.sdk.EE2Constants import KBASE_CONCIERGE_USERNAME from lib.execution_engine2.utils.CatalogUtils import CatalogUtils from lib.execution_engine2.utils.Condor import Condor -from 
lib.execution_engine2.utils.EE2Logger import get_logger +from lib.execution_engine2.utils.EE2Logger import get_logger as _get_logger from lib.execution_engine2.utils.KafkaUtils import KafkaClient from lib.execution_engine2.utils.SlackUtils import SlackClient -from execution_engine2.utils.clients import UserClientSet -from execution_engine2.utils.arg_processing import parse_bool from installed_clients.WorkspaceClient import Workspace +from execution_engine2.utils.clients import UserClientSet, ClientSet class JobPermissions(Enum): @@ -57,30 +54,25 @@ class SDKMethodRunner: def __init__( self, - config, user_clients: UserClientSet, + clients: ClientSet, job_permission_cache=None, admin_permissions_cache=None, - mongo_util=None, ): if not user_clients: raise ValueError("user_clients is required") - self.deployment_config_fp = os.environ["KB_DEPLOYMENT_CONFIG"] - self.config = config - self.mongo_util = mongo_util - self.condor = None + if not clients: + raise ValueError("clients is required") + self.mongo_util = clients.mongo_util + self.condor = clients.condor self.workspace = user_clients.workspace self.workspace_auth = user_clients.workspace_auth - self.admin_roles = config.get("admin_roles", ["EE2_ADMIN", "EE2_ADMIN_RO"]) - self.catalog_utils = CatalogUtils( - config["catalog-url"], config["catalog-token"] - ) - self.auth_url = config.get("auth-url") - self.auth = KBaseAuth(auth_url=config.get("auth-service-url")) + self.catalog_utils = clients.catalog_utils + self.auth = clients.auth + self.auth_admin = clients.auth_admin self.user_id = user_clients.user_id self.token = user_clients.token - self.debug = parse_bool(config.get("debug")) - self.logger = get_logger() + self.logger = _get_logger() self.job_permission_cache = EE2Authentication.EE2Auth.get_cache( cache=job_permission_cache, @@ -100,10 +92,8 @@ def __init__( self._ee2_logs = None self._ee2_status_range = None self._ee2_auth = None - self.kafka_client = KafkaClient(config.get("kafka-host")) - 
self.slack_client = SlackClient( - config.get("slack-token"), debug=self.debug, endpoint=config.get("ee2-url") - ) + self.kafka_client = clients.kafka_client + self.slack_client = clients.slack_client # Various Clients: TODO: Think about sending in just required clients, not entire SDKMR @@ -132,16 +122,6 @@ def get_jobs_status(self) -> EE2Status.JobsStatus: self._ee2_status = EE2Status.JobsStatus(self) return self._ee2_status - def get_mongo_util(self) -> MongoUtil: - if self.mongo_util is None: - self.mongo_util = MongoUtil(self.config) - return self.mongo_util - - def get_condor(self) -> Condor: - if self.condor is None: - self.condor = Condor(self.deployment_config_fp) - return self.condor - # A note on getters: # Getters are commonly described as unpythonic. However, accessing instance variables # directly, rather than via getters, causes significant problems when mocking a class in @@ -169,21 +149,18 @@ def get_catalog_utils(self) -> CatalogUtils: """ Get the catalog utilities for this instance of SDKMR. """ - # TODO Unit test this method once catalog_utils can be mocked. return self.catalog_utils def get_kafka_client(self) -> KafkaClient: """ Get the Kafka client for this instance of SDKMR. """ - # TODO Unit test this method once kafka_client can be mocked. return self.kafka_client def get_slack_client(self) -> SlackClient: """ Get the Kafka client for this instance of SDKMR. """ - # TODO Unit test this method once slack_client can be mocked. return self.slack_client def get_user_id(self) -> str: @@ -198,6 +175,18 @@ def get_token(self) -> str: """ return self.token + def get_mongo_util(self) -> MongoUtil: + """ + Get the mongo utilities for this instance of SDKMR. 
+ """ + return self.mongo_util + + def get_condor(self) -> Condor: + """ + Get the Condor interface for this instance of SDKMR + """ + return self.condor + # Permissions Decorators #TODO Verify these actually work #TODO add as_admin to these def allow_job_read(func): diff --git a/lib/execution_engine2/utils/CatalogUtils.py b/lib/execution_engine2/utils/CatalogUtils.py index 1f970f10e..ac58302f8 100644 --- a/lib/execution_engine2/utils/CatalogUtils.py +++ b/lib/execution_engine2/utils/CatalogUtils.py @@ -6,12 +6,12 @@ class CatalogUtils: def __init__(self, url, admin_token): - self.catalog = Catalog(url=url, token=admin_token) + self._catalog = Catalog(url=url, token=admin_token) def get_catalog(self): """ Get the catalog client for this instance. """ # TODO unit test this method after switching to dependency injection - return self.catalog + return self._catalog def get_normalized_resources(self, method) -> Dict: """ @@ -29,7 +29,7 @@ def get_normalized_resources(self, method) -> Dict: module_name, function_name = method.split(".") - group_config = self.catalog.list_client_group_configs( + group_config = self._catalog.list_client_group_configs( {"module_name": module_name, "function_name": function_name} ) diff --git a/lib/execution_engine2/utils/clients.py b/lib/execution_engine2/utils/clients.py index 97aa13fb3..279b0c695 100644 --- a/lib/execution_engine2/utils/clients.py +++ b/lib/execution_engine2/utils/clients.py @@ -7,11 +7,13 @@ from execution_engine2.authorization.roles import AdminAuthUtil from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.db.MongoUtil import MongoUtil from execution_engine2.utils.CatalogUtils import CatalogUtils from execution_engine2.utils.Condor import Condor from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient +from 
execution_engine2.utils.arg_processing import parse_bool from installed_clients.authclient import KBaseAuth from installed_clients.WorkspaceClient import Workspace @@ -87,33 +89,106 @@ class ClientSet: These are not user-specific and can be reused throughout the application. """ - def __init__(self, cfg: Dict[str, str], cfg_path: str, debug: bool = False): + def __init__( + self, + auth: KBaseAuth, + auth_admin: AdminAuthUtil, + condor: Condor, + catalog_utils: CatalogUtils, + kafka_client: KafkaClient, + mongo_util: MongoUtil, + slack_client: SlackClient, + ): """ - Initialize the client set from a configuration dictionary. - - cfg - the configuration dictionary - cfg_path - the path to the configuration file - debug - set clients that support it to debug mode - - Expected keys in config: - auth-url - the root URL of the kbase auth service - catalog-url - the URL of the catalog service - catalog-token - a token to use with the catalog service. Ideally a service token - kafka-host - the host string for a Kafka service - slack-token - a token for contacting Slack + Initialize the client set from the individual clients. """ - # TODO seems like it'd make sense to init Condor from a config dict like everything else - self.condor = Condor(cfg_path) - self.catalog_utils = CatalogUtils(cfg["catalog-url"], cfg["catalog-token"]) - auth_url = cfg["auth-url"] - self.auth = KBaseAuth(auth_url=auth_url + "/api/legacy/KBase/Sessions/Login") - # TODO using hardcoded roles for now to avoid possible bugs with mismatched cfg roles - # these should probably be configurable - self.auth_admin = AdminAuthUtil(auth_url, [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE]) - - # KafkaClient has a nice error message when the arg is None - self.kafka_client = KafkaClient(cfg.get("kafka-host")) - # SlackClient handles None arguments - self.slack_client = SlackClient( - cfg.get("slack-token"), debug=debug, endpoint=cfg.get("ee2-url") - ) + + # TODO check no clients are None. 
Make a general method somewhere + self.auth = auth + self.auth_admin = auth_admin + self.condor = condor + self.catalog_utils = catalog_utils + self.kafka_client = kafka_client + self.mongo_util = mongo_util + self.slack_client = slack_client + + +# the constructor allows for mix and match of mocks and real implementations as needed +# the method below handles all the client set up for going straight from a config + + +def get_clients( + cfg: Dict[str, str], cfg_path: str +) -> ( + KBaseAuth, + AdminAuthUtil, + Condor, + CatalogUtils, + KafkaClient, + MongoUtil, + SlackClient, +): + """ + Get the set of clients used in the EE2 application that are not user-specific and can be + reused from user to user. + + cfg - the configuration dictionary + cfg_path - the path to the configuration file + + Expected keys in config: + auth-url - the root URL of the kbase auth service + catalog-url - the URL of the catalog service + catalog-token - a token to use with the catalog service. Ideally a service token + kafka-host - the host string for a Kafka service + slack-token - a token for contacting Slack + """ + # Condor needs access to the entire deploy.cfg file, not just the ee2 section + condor = Condor(cfg_path) + # Do a check to ensure the urls and tokens actually work correctly? + # TODO check keys are present - make some general methods for dealing with this + catalog_utils = CatalogUtils(cfg["catalog-url"], cfg["catalog-token"]) + auth_url = cfg["auth-url"] + auth = KBaseAuth(auth_url=auth_url + "/api/legacy/KBase/Sessions/Login") + # TODO using hardcoded roles for now to avoid possible bugs with mismatched cfg roles + # these should probably be configurable. 
+ # See https://github.com/kbase/execution_engine2/issues/295 + auth_admin = AdminAuthUtil(auth_url, [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE]) + + # KafkaClient has a nice error message when the arg is None + kafka_client = KafkaClient(cfg.get("kafka-host")) + + debug = parse_bool(cfg.get("debug")) + # SlackClient handles None arguments + slack_client = SlackClient( + cfg.get("slack-token"), debug=debug, endpoint=cfg.get("ee2-url") + ) + # TODO check how MongoUtil handles a bad config + that error messages are understandable + mongo_util = MongoUtil(cfg) + return ( + auth, + auth_admin, + condor, + catalog_utils, + kafka_client, + mongo_util, + slack_client, + ) + + +def get_client_set(cfg: Dict[str, str], cfg_path: str) -> ClientSet: + """ + A helper method to create a ClientSet from a config dict rather than constructing and passing + in clients individually. + + cfg - the configuration dictionary + cfg_path - the path to the configuration file + + Expected keys in config: + auth-url - the root URL of the kbase auth service + catalog-url - the URL of the catalog service + catalog-token - a token to use with the catalog service. 
Ideally a service token + kafka-host - the host string for a Kafka service + slack-token - a token for contacting Slack + """ + + return ClientSet(*get_clients(cfg, cfg_path)) diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index b2c3edc6d..9a7afac08 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -11,26 +11,33 @@ from installed_clients.CatalogClient import Catalog from installed_clients.WorkspaceClient import Workspace -from lib.execution_engine2.authorization.roles import AdminAuthUtil -from lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.authorization.roles import AdminAuthUtil +from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE -from lib.execution_engine2.db.models.models import Status -from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.Condor import Condor -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo -from execution_engine2.utils.clients import get_user_client_set, UserClientSet +from execution_engine2.db.models.models import Status +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.utils.clients import ( + UserClientSet, + ClientSet, + get_client_set, + get_user_client_set, +) from test.utils_shared.test_utils import ( get_sample_job_params, get_sample_condor_info, ) +from test.utils_shared.mock_utils import get_client_mocks as _get_client_mocks + class EE2TestAdminMode(unittest.TestCase): @classmethod def setUpClass(cls): - config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") + cls.config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") 
config_parser = ConfigParser() - config_parser.read(config_file) + config_parser.read(cls.config_file) cls.cfg = {} @@ -45,10 +52,6 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) - ) - def setUp(self) -> None: """ Patch out Catalog and Condor @@ -87,11 +90,13 @@ def tearDown(self) -> None: self.condor_patch.stop() self.condor_patch2.start() - def getRunner(self, user_clients=None) -> SDKMethodRunner: + def getRunner(self, user_clients=None, clients=None) -> SDKMethodRunner: # Initialize these clients from None if not user_clients: user_clients = get_user_client_set(self.cfg, self.user_id, self.token) - runner = SDKMethodRunner(self.cfg, user_clients) # type : SDKMethodRunner + if not clients: + clients = get_client_set(self.cfg, self.config_file) + runner = SDKMethodRunner(user_clients, clients) # type : SDKMethodRunner runner.get_jobs_status() runner.get_runjob() runner.get_job_logs() @@ -109,24 +114,29 @@ def get_runner_with_condor(self) -> SDKMethodRunner: # TODO How do you test ADMIN_MODE without increasing too much coverage - def get_mocks( - self, user_id=None, token="fake_token" + def get_user_mocks( + self, user_id=None, token=None ) -> (UserClientSet, Workspace, WorkspaceAuth): user_id = user_id if user_id else self.user_id + token = token if token else self.token ws = create_autospec(Workspace, instance=True, spec_set=True) wsa = create_autospec(WorkspaceAuth, instance=True, spec_set=True) ucs = UserClientSet(user_id, token, ws, wsa) return ucs, ws, wsa + def get_client_mocks(self, *to_be_mocked): + return _get_client_mocks(self.cfg, self.config_file, *to_be_mocked) + @patch.object(Catalog, "get_module_version", return_value="module.version") - @patch.object(AdminAuthUtil, "_fetch_user_roles") - def test_regular_user(self, aau, catalog): + def test_regular_user(self, catalog): # Regular User lowly_user = "Access Denied: You are not an 
administrator" - user_client_set, _, ws_auth = self.get_mocks() + user_client_set, _, ws_auth = self.get_user_mocks() + clients_and_mocks = self.get_client_mocks(AdminAuthUtil) + aau = clients_and_mocks[AdminAuthUtil] + aau.get_admin_role.return_value = None ws_auth.can_write.return_value = True - runner = self.getRunner(user_client_set) - aau.return_value = ["RegularJoe"] + runner = self.getRunner(user_client_set, clients_and_mocks[ClientSet]) method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -134,6 +144,10 @@ def test_regular_user(self, aau, catalog): is_admin = runner.check_is_admin() self.assertFalse(is_admin) + aau.get_admin_role.assert_called_once_with( + self.token, ADMIN_READ_ROLE, ADMIN_WRITE_ROLE + ) + # Check Admin Status admin_type = runner.get_admin_permission() self.assertEqual(admin_type, {"permission": "n"}) @@ -196,16 +210,19 @@ def test_regular_user(self, aau, catalog): # TODO do the above with as_admin=True and assert failure each time - # Start the job and get it's status as an admin + # Start the job and get its status as an admin @patch.object(Catalog, "get_module_version", return_value="module.version") @patch.object(WorkspaceAuth, "can_write", return_value=True) - @patch.object(AdminAuthUtil, "_fetch_user_roles") - def test_admin_writer(self, aau, workspace, catalog): + def test_admin_writer(self, workspace, catalog): # Admin User with WRITE - runner = self.getRunner() - aau.return_value = [ADMIN_READ_ROLE] + clients_and_mocks = self.get_client_mocks(AdminAuthUtil) + clients = clients_and_mocks[ClientSet] + adminauth = clients_and_mocks[AdminAuthUtil] + + runner = self.getRunner(None, clients) + adminauth.get_admin_role.return_value = ADMIN_READ_ROLE method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -213,11 +230,15 @@ def test_admin_writer(self, aau, workspace, catalog): is_admin = runner.check_is_admin() 
self.assertTrue(is_admin) + adminauth.get_admin_role.assert_called_once_with( + self.token, ADMIN_READ_ROLE, ADMIN_WRITE_ROLE + ) + # Admin User with WRITE - runner = self.getRunner() + runner = self.getRunner(None, clients) # SET YOUR ADMIN STATUS HERE - aau.return_value = [ADMIN_WRITE_ROLE] + adminauth.get_admin_role.return_value = ADMIN_WRITE_ROLE method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -258,12 +279,13 @@ def test_no_user(self): ): runner.run_job(params=job_params_1, as_admin=False) - @patch.object(AdminAuthUtil, "_fetch_user_roles") - def test_admin_reader(self, aau): - # Admin User with WRITE + def test_admin_reader(self): + # Admin User with READ lowly_admin = r"Access Denied: You are a read-only admin. This function requires write access" - runner = self.getRunner() - aau.return_value = [ADMIN_READ_ROLE] + clients_and_mocks = self.get_client_mocks(AdminAuthUtil) + adminauth = clients_and_mocks[AdminAuthUtil] + runner = self.getRunner(None, clients_and_mocks[ClientSet]) + adminauth.get_admin_role.return_value = ADMIN_READ_ROLE method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -271,6 +293,10 @@ def test_admin_reader(self, aau): is_admin = runner.check_is_admin() self.assertTrue(is_admin) + adminauth.get_admin_role.assert_called_once_with( + self.token, ADMIN_READ_ROLE, ADMIN_WRITE_ROLE + ) + # Check Admin Status admin_type = runner.get_admin_permission() self.assertEqual(admin_type, {"permission": "r"}) diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 50880b036..6127488a9 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -35,7 +35,7 @@ from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from installed_clients.execution_engine2Client import execution_engine2 as ee2client 
-KEEP_TEMP_FILES = True +KEEP_TEMP_FILES = False AUTH_DB = "api_to_db_test" AUTH_MONGO_USER = "auth" TEMP_DIR = Path("test_temp_can_delete") @@ -174,8 +174,10 @@ def service(full_config, auth_url, mongo_client, config): # also updates the config in place so it contains the correct auth urls for any other # methods that use the config fixture cfgpath = _update_config_and_create_config_file(full_config, auth_url) + print(f"created test deploy at {cfgpath}") _clear_ee2_db(mongo_client, config) + prior_deploy = os.environ[KB_DEPLOY_ENV] # from this point on, calling the get_*_test_config methods will get the temp config file os.environ[KB_DEPLOY_ENV] = cfgpath # The server creates the configuration, impl, and application *AT IMPORT TIME* so we have to @@ -199,6 +201,9 @@ def service(full_config, auth_url, mongo_client, config): # See the server file for the full scoop, but in short, the stop method expects a _proc # package variable to be set, but start doesn't always set it, and that causes an error. 
+ # Tests are run in the same process to we need to be put the environment back the way it was + os.environ[KB_DEPLOY_ENV] = prior_deploy + if not KEEP_TEMP_FILES: os.remove(cfgpath) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py index 9a9b68d55..e2a0d0352 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py @@ -7,10 +7,10 @@ import requests_mock -from lib.execution_engine2.db.MongoUtil import MongoUtil -from lib.execution_engine2.db.models.models import Job, JobLog -from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from execution_engine2.utils.clients import get_user_client_set +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.models.models import Job, JobLog +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.utils_shared.test_utils import ( bootstrap, run_job_adapter, @@ -36,7 +36,8 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, deploy), ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -45,7 +46,7 @@ def setUpClass(cls): db=cls.cfg["mongo-database"], col=cls.cfg["mongo-jobs-collection"] ) - cls.test_helper = ee2_sdkmr_test_helper(cls.method_runner) + cls.test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: return copy.copy(self.__class__.method_runner) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index ba7d10592..716f0bc4e 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ 
b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -18,14 +18,18 @@ from mock import MagicMock from execution_engine2.authorization.workspaceauth import WorkspaceAuth -from lib.execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.utils.CatalogUtils import CatalogUtils +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient from lib.execution_engine2.db.models.models import Job, Status, TerminatedCode from execution_engine2.exceptions import AuthError from lib.execution_engine2.exceptions import InvalidStatusTransitionException from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources -from execution_engine2.utils.clients import UserClientSet -from execution_engine2.utils.clients import get_user_client_set +from execution_engine2.utils.clients import UserClientSet, ClientSet +from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper from test.utils_shared.test_utils import ( bootstrap, @@ -34,6 +38,7 @@ run_job_adapter, assert_exception_correct, ) +from test.utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS from tests_for_db.mongo_test_helper import MongoTestHelper logging.basicConfig(level=logging.INFO) @@ -48,11 +53,11 @@ class ee2_SDKMethodRunner_test(unittest.TestCase): @classmethod def setUpClass(cls): - config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") - logging.info(f"Loading config from {config_file}") + cls.config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") + logging.info(f"Loading config from {cls.config_file}") config_parser = ConfigParser() - config_parser.read(config_file) + config_parser.read(cls.config_file) 
cls.cfg = {} @@ -68,7 +73,8 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cls.config_file), ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -83,7 +89,7 @@ def setUpClass(cls): request_memory="100M", client_group="njs", ) - cls.sdkmr_test_helper = ee2_sdkmr_test_helper(mr=cls.method_runner) + cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: # Initialize these clients from None @@ -130,7 +136,14 @@ def create_job_rec(self): # self.assertNotEqual(git_commit_1, git_commit_2) def test_init_fail(self): - self._init_fail({}, None, ValueError("user_clients is required")) + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + user_clients = UserClientSet("user", "token", ws, wsa) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + clients = clients_and_mocks[ClientSet] + + self._init_fail(None, clients, ValueError("user_clients is required")) + self._init_fail(user_clients, None, ValueError("clients is required")) def _init_fail(self, cfg, user_clients, expected): with raises(Exception) as e: @@ -140,18 +153,26 @@ def _init_fail(self, cfg, user_clients, expected): def test_getters(self): ws = Workspace("https://fake.com") wsa = WorkspaceAuth("user", ws) - cliset = UserClientSet("user", "token", ws, wsa) - sdkmr = SDKMethodRunner(self.cfg, cliset) + user_clients = UserClientSet("user", "token", ws, wsa) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + + sdkmr = SDKMethodRunner(user_clients, clients_and_mocks[ClientSet]) assert sdkmr.get_workspace() is ws assert sdkmr.get_user_id() == "user" assert sdkmr.get_token() == "token" + assert sdkmr.get_kafka_client() is clients_and_mocks[KafkaClient] + assert sdkmr.get_mongo_util() is 
clients_and_mocks[MongoUtil] + assert sdkmr.get_slack_client() is clients_and_mocks[SlackClient] + assert sdkmr.get_catalog_utils() is clients_and_mocks[CatalogUtils] + assert sdkmr.get_condor() is clients_and_mocks[Condor] def test_save_job(self): ws = Workspace("https://fake.com") wsa = WorkspaceAuth("user", ws) cliset = UserClientSet("user", "token", ws, wsa) - sdkmr = SDKMethodRunner(self.cfg, cliset) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + sdkmr = SDKMethodRunner(cliset, clients_and_mocks[ClientSet]) # We cannot use spec_set=True here because the code must access the Job.id field, # which is set dynamically. This means if the Job api changes, this test could pass @@ -544,7 +565,9 @@ def test_finish_job(self, condor): runner = self.getRunner() runner._test_job_permissions = MagicMock(return_value=True) - runner.catalog_utils.catalog.log_exec_stats = MagicMock(return_value=True) + runner.catalog_utils.get_catalog().log_exec_stats = MagicMock( + return_value=True + ) # test missing job_id input with self.assertRaises(ValueError) as context1: @@ -702,8 +725,8 @@ def test_check_job_global_perm(self, rq_mock): # now test with a different user other_method_runner = SDKMethodRunner( - self.cfg, get_user_client_set(self.cfg, "some_other_user", "other_token"), + get_client_set(self.cfg, self.config_file), ) job_states = other_method_runner.get_jobs_status().check_workspace_jobs( self.ws_id diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 33fc9389d..8c6a9d7c4 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -13,7 +13,10 @@ from lib.execution_engine2.db.models.models import Job from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources 
-from execution_engine2.utils.clients import get_user_client_set +from execution_engine2.utils.clients import ( + get_client_set, + get_user_client_set, +) from test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -51,7 +54,8 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, config_file), ) cls.mongo_util = MongoUtil(cls.cfg) @@ -67,7 +71,7 @@ def setUpClass(cls): request_memory="100M", client_group="njs", ) - cls.sdkmr_test_helper = ee2_sdkmr_test_helper(mr=cls.method_runner) + cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: # Initialize these clients from None @@ -81,7 +85,7 @@ def create_job_rec(self): return self.sdkmr_test_helper.create_job_rec() def test_init_ok(self): - class_attri = ["config", "catalog_utils", "workspace", "mongo_util", "condor"] + class_attri = ["catalog_utils", "workspace", "mongo_util", "condor"] runner = self.getRunner() self.assertTrue(set(class_attri) <= set(runner.__dict__.keys())) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index 3ffc5e19f..09295267a 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -13,7 +13,7 @@ from lib.execution_engine2.db.models.models import Job from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources -from execution_engine2.utils.clients import get_user_client_set +from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper from test.utils_shared.test_utils import bootstrap, 
get_example_job @@ -46,7 +46,8 @@ def setUpClass(cls): cls.token = "token" cls.method_runner = SDKMethodRunner( - cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, config_file), ) cls.cr = CondorResources( request_cpus="1", @@ -60,7 +61,7 @@ def setUpClass(cls): "DiskUsage": "1", } cls.mongo_util = cls.method_runner.get_mongo_util() - cls.sdkmr_test_helper = ee2_sdkmr_test_helper(mr=cls.method_runner) + cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: # Initialize these clients from None diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_utils.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_utils.py index 5f5fdd094..9ec251f22 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_utils.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_utils.py @@ -5,14 +5,15 @@ class ee2_sdkmr_test_helper: - def __init__(self, mr: SDKMethodRunner, wsid: str = 9999): - self.user_id = mr.user_id + def __init__(self, user_id: str, wsid: str = 9999): + self.user_id = user_id self.ws_id = wsid - self.token = mr.token - self.method_runner = mr def create_job_rec(self): - """ Save a job, forgoing runjob.run""" + """ + Save a job, forgoing runjob.run + Requires a MongoEngine connection + """ job = Job() @@ -53,7 +54,6 @@ def create_job_rec(self): job.job_output = None job.scheduler_id = "123" - with self.method_runner.get_mongo_util().mongo_engine_connection(): - job.save() + job.save() return str(job.id) diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 0fa577495..d97e4512c 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -10,14 +10,14 @@ from unittest.mock import patch from execution_engine2.authorization.workspaceauth import WorkspaceAuth -from lib.execution_engine2.db.MongoUtil import MongoUtil -from 
lib.execution_engine2.db.models.models import Job, Status -from lib.execution_engine2.execution_engine2Impl import execution_engine2 -from lib.execution_engine2.sdk.EE2Status import JobsStatus -from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.Condor import Condor -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo -from execution_engine2.utils.clients import get_user_client_set +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.models.models import Job, Status +from execution_engine2.execution_engine2Impl import execution_engine2 +from execution_engine2.sdk.EE2Status import JobsStatus +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.utils_shared.test_utils import ( bootstrap, get_sample_job_params, @@ -33,9 +33,9 @@ class ee2_server_load_test(unittest.TestCase): @classmethod def setUpClass(cls): - deploy = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") - print("Deploy is", deploy) - config = read_config_into_dict(deploy) + cls.deploy = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") + print("Deploy is", cls.deploy) + config = read_config_into_dict(cls.deploy) cls.cfg = config cls.user_id = "wsadmin" cls.ws_id = 9999 @@ -43,9 +43,7 @@ def setUpClass(cls): cls.ctx = {"token": cls.token, "user_id": cls.user_id} cls.impl = execution_engine2(cls.cfg) - cls.method_runner = SDKMethodRunner( - cls.cfg, get_user_client_set(cls.cfg, cls.user_id, cls.token) - ) + cls.method_runner = cls._getRunner() cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -55,18 +53,19 @@ def setUpClass(cls): cls.thread_count = 5 - def getRunner(self) -> SDKMethodRunner: - # Initialize these clients from None - runner = 
copy.copy(self.__class__.method_runner) # type : SDKMethodRunner - runner._ee2_status = runner.get_jobs_status() # type: JobsStatus - runner._ee2_status._send_exec_stats_to_catalog = MagicMock(return_value=True) - runner._ee2_status.update_finished_job_with_usage = MagicMock(return_value=True) - runner.get_runjob() - runner._ee2_runjob._get_module_git_commit = MagicMock( - return_value="GitCommithash" + @classmethod + def _getRunner(cls) -> SDKMethodRunner: + runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cls.deploy), ) + # Initialize these clients from None + status = runner.get_jobs_status() # type: JobsStatus + status._send_exec_stats_to_catalog = MagicMock(return_value=True) + status.update_finished_job_with_usage = MagicMock(return_value=True) + runjob = runner.get_runjob() + runjob._get_module_git_commit = MagicMock(return_value="GitCommithash") runner.get_job_logs() - runner.get_condor() runner.condor = MagicMock(autospec=True) # runner.get_job_resource_info = MagicMock(return_val={}) @@ -81,8 +80,7 @@ def test_init_job_stress(self): with self.mongo_util.mongo_engine_connection(): ori_job_count = Job.objects.count() - runner = self.getRunner() - + runner = self.method_runner # set job method differently to distinguish method_1 = "app_1.a_method" method_2 = "app_1.b_method" @@ -141,7 +139,7 @@ def test_update_job_status_stress(self): """ with self.mongo_util.mongo_engine_connection(): ori_job_count = Job.objects.count() - runner = self.getRunner() + runner = self.method_runner job_params = get_sample_job_params() @@ -329,7 +327,7 @@ def test_update_job_status(self): """ with self.mongo_util.mongo_engine_connection(): ori_job_count = Job.objects.count() - runner = self.getRunner() + runner = self.method_runner job_params = get_sample_job_params() @@ -419,7 +417,7 @@ def test_check_jobs_stress(self): with self.mongo_util.mongo_engine_connection(): ori_job_count = Job.objects.count() - runner = 
self.getRunner() + runner = self.method_runner # set job method differently to distinguish method_1 = "a_method" @@ -474,7 +472,7 @@ def test_check_job_canceled_stress(self): with self.mongo_util.mongo_engine_connection(): ori_job_count = Job.objects.count() - runner = self.getRunner() + runner = self.method_runner job_params = get_sample_job_params() @@ -582,7 +580,7 @@ def test_get_job_logs_stress(self): with self.mongo_util.mongo_engine_connection(): ori_job_count = Job.objects.count() - runner = self.getRunner() + runner = self.method_runner # create job job_id = runner.get_runjob()._init_job_rec( @@ -643,7 +641,7 @@ def test_add_job_logs_stress(self): ori_job_count = Job.objects.count() print("original job count is", ori_job_count) - runner = self.getRunner() + runner = self.method_runner # create job job_id = runner.get_runjob()._init_job_rec( diff --git a/test/utils_shared/mock_utils.py b/test/utils_shared/mock_utils.py new file mode 100644 index 000000000..b4c8fc2ce --- /dev/null +++ b/test/utils_shared/mock_utils.py @@ -0,0 +1,92 @@ +from unittest.mock import create_autospec + +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.utils.CatalogUtils import CatalogUtils +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient + +from installed_clients.authclient import KBaseAuth + +from execution_engine2.authorization.roles import AdminAuthUtil +from execution_engine2.utils.Condor import Condor +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from execution_engine2.utils.clients import ClientSet + +ALL_CLIENTS = [ + KBaseAuth, + AdminAuthUtil, + Condor, + CatalogUtils, + KafkaClient, + MongoUtil, + SlackClient, +] + + +def get_client_mocks(config, config_path, *to_be_mocked): + """ + Create a client set containing a mix of mocks and real implementations as needed for + a test. 
+ + config is the config dict from the ee2 section of the deploy.cfg. + config_path is the path to the configfile. + to_be_mocked is the classes in the client set that should be mocked, e.g. KBaseAuth, etc. + + Returns a dict of the class to the the class's mock or implementation as specified in + the arguments. + """ + + # could make a dict of methods to call on a non-mock build but that seems like overkill + # and almost as much code as this + if KBaseAuth in to_be_mocked: + kbase_auth = create_autospec(KBaseAuth, instance=True, spec_set=True) + else: + kbase_auth = KBaseAuth( + auth_url=config["auth-url"] + "/api/legacy/KBase/Sessions/Login" + ) + + if AdminAuthUtil in to_be_mocked: + authadmin = create_autospec(AdminAuthUtil, instance=True, spec_set=True) + else: + authadmin = AdminAuthUtil( + config["auth-url"], [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE] + ) + + if Condor in to_be_mocked: + condor = create_autospec(Condor, instance=True, spec_set=True) + else: + condor = condor = Condor(config_path) + + if CatalogUtils in to_be_mocked: + catutil = create_autospec(CatalogUtils, instance=True, spec_set=True) + else: + catutil = CatalogUtils(config["catalog-url"], config["catalog-token"]) + + if KafkaClient in to_be_mocked: + kafka = create_autospec(KafkaClient, instance=True, spec_set=True) + else: + kafka = KafkaClient(config["kafka-host"]) + + if MongoUtil in to_be_mocked: + mongo = create_autospec(MongoUtil, instance=True, spec_set=True) + else: + mongo = MongoUtil(config) + + if SlackClient in to_be_mocked: + slack = create_autospec(SlackClient, instance=True, spec_set=True) + else: + slack = SlackClient( + config["slack-token"], debug=True, endpoint=config["ee2-url"] + ) + + cs = ClientSet(kbase_auth, authadmin, condor, catutil, kafka, mongo, slack) + return { + ClientSet: cs, + KBaseAuth: kbase_auth, + AdminAuthUtil: authadmin, + Condor: condor, + CatalogUtils: catutil, + KafkaClient: kafka, + MongoUtil: mongo, + SlackClient: slack, + } From 
67df90f8c3f121cc8bb60a87d589b4db31ed929b Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 1 Mar 2021 15:32:10 -0800 Subject: [PATCH 017/109] Add unit testing guidelines (#322) * Add unit testing guidelines * unit testing guildlines fixes/improvements * Misc corrections/improvements to unit testing guide * Add python tag to code blocks --- test/tests_for_integration/api_to_db_test.py | 2 +- unit_testing_guidelines.md | 181 +++++++++++++++++++ 2 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 unit_testing_guidelines.md diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 6127488a9..391a290ed 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -201,7 +201,7 @@ def service(full_config, auth_url, mongo_client, config): # See the server file for the full scoop, but in short, the stop method expects a _proc # package variable to be set, but start doesn't always set it, and that causes an error. - # Tests are run in the same process to we need to be put the environment back the way it was + # Tests are run in the same process so we need to be put the environment back the way it was os.environ[KB_DEPLOY_ENV] = prior_deploy if not KEEP_TEMP_FILES: diff --git a/unit_testing_guidelines.md b/unit_testing_guidelines.md new file mode 100644 index 000000000..0130dedb5 --- /dev/null +++ b/unit_testing_guidelines.md @@ -0,0 +1,181 @@ +# Unit and Integration Testing guidelines + +This document briefly covers testing philosophy with regard to integration and unit tests, +especially for the Python language and in the context of developing a KBase core service like +the Execution Engine. + +## Unit versus Integration tests + +Unit tests cover one module, class, or function, called a code unit from here on out. For example, +a unit test file might cover the contents of `my_module.py` or more granularly `my_module.MyClass`. 
+Code outside the code unit should be excluded from the tests. The exception is "value classes" +which are classes which primarily hold data and whose behavior is based on that data. Other +classes required by the unit under test should be mocked out as far as possible and practical. + +This makes unit tests fast and easy to understand, as only the isolated code unit needs to be +comprehended in order to grasp test failures. + +In contrast an integration test tests that two or more code units work well together. This can +range from anything between testing two code units' interactions to api-to-DB tests for a server. +Integration tests are typically much much slower, much more complex, take much more setup code, +and are harder to understand. Due to this, it is advisable to minimize the number of integration +tests to the least possible to ensure the various code units work together correctly, and write +unit tests to cover as much code as possible. In the author's experience, it is usually not +difficult to write unit tests with 100% coverage for the code unit (although keep in mind +that 100% test coverage does not necessarily indicate quality tests). + +## Mocking dependencies + +As previously described, a unit test should only cover a single unit of code. What this means +is that complex dependencies (e.g. not simple helper functions that may be called from the +class, not value classes, etc.) need to be mocked out. We do this via inversion of control, or +dependency injection. That is, if a code unit needs another code unit as a dependency, the +dependency should be *provided to* the code unit rather than *constructed by* the code unit.
+ +For example, consider a toy function that contacts the workspace service: + +```python +def get_object_name_from_id(url, token, ref): + ws = Workspace(url, token=token) + return ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][1] +``` + +Note that the same situation may arise in a class that needs to contact the workspace regularly and +constructs the client in its `__init__` method. + +This makes the function difficult to unit test, as if run as-is, it will contact the workspace +service. This means that to run the test the workspace service must be running and populated +with data, or a mock service must be running that can validate the call and return the expected +payload. + +Instead, we can rewrite the function (or class) with dependency injection: + +```python +def get_object_name_from_id(ws, ref): + return ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][1] +``` + +Now we can easily pass in a mock object for the `Workspace` dependency in a unit test: + +```python +def test_get_object_name_from_id_success(): + ws = create_autospec(Workspace, spec_set=True, instance=True) [1] + ws.get_object_info3.return_value = {'infos': [ [2] + [3, + 'my_name', + 'Some.Type-1.0', + '1/1/1T01:01:01+00:00', + 1, + 'someguy', + 8, + 'my_workspace', + '79054025255fb1a26e4bc422aef54eb4', 82, {}] + ]} + + assert get_object_name_from_id(ws, '8/3/1') == 'my_name' [3] + + ws.get_object_info3.assert_called_once_with({'objects': [{'ref': '8/3/1'}]}) [4] +``` + +In this test, we: +1. Create the mock object +2. Tell the mock object what to return if the `get_object_info3` method is called +3. Call the method with the mock object as an argument and `assert` that it returns the correct + result +4. Confirm that the mock was called correctly. + +No server, mock or otherwise, is required, nor is confusing and error-prone monkey patching.
+ +If step 4 is omitted, any code that is run prior to the mock being called is ignored +by the tests as long as the mock is called and an error doesn't occur. Confirming the correct +call is required to test that any code that, for example, mutates the input arguments before +calling the mock with said mutated arguments works correctly. + +For more information on the Python mock standard library, see +https://docs.python.org/3/library/unittest.mock.html. + +For an example of mocks used in real code, see +[this EE2 test](https://github.com/kbase/execution_engine2/blob/e2c8086bd1f52b3ca488882c493aaaa9704626ad/test/tests_for_sdkmr/EE2StatusRange_test.py). + +## More on Dependency Injection + +Dependency Injection (DI), as we've seen, makes unit tests much easier, or even possible. There's +another benefit as well: modularity. DI makes it much easier to swap out modules, even at runtime, +to provide alternate implementations of the functionality. Imagine an application that requires an +authorization module with a large number of parameters: + +```python +class Application: + + def __init__(self, + auth_url, + auth_client_id, + auth_client_secret, + auth_protocol, + auth_cache_time, + # more Application parameters go here + ): + self.auth = SomeCompaniesAuthImplementation( + auth_url, auth_client_id, auth_client_secret, auth_protocol, auth_cache_time) +``` + +If we wish to support `SomeOtherCompaniesAuthImplementation`, determined at runtime, we may need +another batch of parameters to support that implementation as well as a parameter to tell +`Application` which authorization implementation to use. + +An implementation based on DI might look like: + +```python +class Application: + + def __init__(self, auth_implementation): + self.auth = auth_implementation +``` + +Where the interface of `auth_implementation` can be merely documented (e.g.
ducktyping) or +more rigorously defined with an [abstract base class](https://docs.python.org/3/library/abc.html) +and [type hints](https://docs.python.org/3/library/typing.html). + +In this way, code that interprets a configuration at run time can build whichever version of +the authentication module that is required and pass it to `Application`. This makes `Application` +more modular, easier to test, easier to use, and simplifies the initialization. + +The drawback of DI is that it pushes the responsibility for building dependencies up the +software stack, making the user of the class have to write that code, although package authors +could provide helper methods. + +## `Mock()` versus `create_autospec` + +Those familiar with the python mock library will be aware of the `Mock` class. In the examples +above, we use `create_autospec` to create the mock rather than creating a mock class directly. +The way `create_autospec` is used, with `spec_set=True` and `instance=True`, creates a mock object +based off the interface of the class being mocked, and unlike a regular mock, will not allow +reading or writing of an attribute that does not exist on the class being mocked (as well as +avoiding [other problems](https://docs.python.org/3/library/unittest.mock.html#auto-speccing)). +This prevents test false positives if the interface of the class changes but the tests are not +updated - a standard `Mock()` will allow method returns to be set and will record method calls +for methods that do not exist, but in the example above, the tests would fail if, for example, +`get_object_info3` was removed from the `Workspace` class. + +The drawback of using `spec_set=True` is that autospeccing is unaware of any instance variables +(e.g. `self.foo = foo_arg` in a constructor, for example). 
The unittest documentation suggests +a number of approaches to get around this problem, but in the author's opinion the least +bad option is to create getters (and setters for mutable instance variables) for any instance +variables that need to be exposed in the class's public interface. + +## External services + +The rule of thumb is to not mock external services, but instead create a wrapper around the +external service, mock that, and test the wrapper with integration tests. In some cases this +is relatively simple, but other cases are much more difficult. + +If the service is easy to set up and run locally, an integration test with a live service +is likely the best choice. Databases like MongoDB often fit this category as it is quick to +download and run a binary or Docker image. + +If the service is more difficult to run locally, a mock server might be employed to mock the +service responses. This is dangerous because if the service API changes, the test results will +contain false positives. An example is using a mock server in the +[KBase auth2](https://github.com/kbase/auth2) repo to mock identity provider services, which +cannot be installed locally and cannot be incorporated into automated testing without +enormous difficulty. 
From 89e675991f30488b46a424acfea8fbdbf0a94fa0 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 2 Mar 2021 08:51:34 -0800 Subject: [PATCH 018/109] DRY up mock_utils.py (#323) * DRY up mock_utils.py * We mustn't use extraneous verbiage --- test/utils_shared/mock_utils.py | 101 ++++++++++++-------------------- 1 file changed, 36 insertions(+), 65 deletions(-) diff --git a/test/utils_shared/mock_utils.py b/test/utils_shared/mock_utils.py index b4c8fc2ce..343fadf28 100644 --- a/test/utils_shared/mock_utils.py +++ b/test/utils_shared/mock_utils.py @@ -12,15 +12,25 @@ from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from execution_engine2.utils.clients import ClientSet -ALL_CLIENTS = [ - KBaseAuth, - AdminAuthUtil, - Condor, - CatalogUtils, - KafkaClient, - MongoUtil, - SlackClient, -] +_CLASS_IMPLEMENTATION_BUILDERS = { + KBaseAuth: lambda config, cfgfile: KBaseAuth( + auth_url=config["auth-url"] + "/api/legacy/KBase/Sessions/Login" + ), + AdminAuthUtil: lambda config, cfgfile: AdminAuthUtil( + config["auth-url"], [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE] + ), + Condor: lambda config, cfgfile: Condor(cfgfile), + CatalogUtils: lambda config, cfgfile: CatalogUtils( + config["catalog-url"], config["catalog-token"] + ), + KafkaClient: lambda config, cfgfile: KafkaClient(config["kafka-host"]), + MongoUtil: lambda config, cfgfile: MongoUtil(config), + SlackClient: lambda config, cfgfile: SlackClient( + config["slack-token"], debug=True, endpoint=config["ee2-url"] + ), +} + +ALL_CLIENTS = _CLASS_IMPLEMENTATION_BUILDERS.keys() def get_client_mocks(config, config_path, *to_be_mocked): @@ -32,61 +42,22 @@ def get_client_mocks(config, config_path, *to_be_mocked): config_path is the path to the configfile. to_be_mocked is the classes in the client set that should be mocked, e.g. KBaseAuth, etc. 
- Returns a dict of the class to the the class's mock or implementation as specified in + Returns a dict of the class to the class's mock or implementation as specified in the arguments. """ - - # could make a dict of methods to call on a non-mock build but that seems like overkill - # and almost as much code as this - if KBaseAuth in to_be_mocked: - kbase_auth = create_autospec(KBaseAuth, instance=True, spec_set=True) - else: - kbase_auth = KBaseAuth( - auth_url=config["auth-url"] + "/api/legacy/KBase/Sessions/Login" - ) - - if AdminAuthUtil in to_be_mocked: - authadmin = create_autospec(AdminAuthUtil, instance=True, spec_set=True) - else: - authadmin = AdminAuthUtil( - config["auth-url"], [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE] - ) - - if Condor in to_be_mocked: - condor = create_autospec(Condor, instance=True, spec_set=True) - else: - condor = condor = Condor(config_path) - - if CatalogUtils in to_be_mocked: - catutil = create_autospec(CatalogUtils, instance=True, spec_set=True) - else: - catutil = CatalogUtils(config["catalog-url"], config["catalog-token"]) - - if KafkaClient in to_be_mocked: - kafka = create_autospec(KafkaClient, instance=True, spec_set=True) - else: - kafka = KafkaClient(config["kafka-host"]) - - if MongoUtil in to_be_mocked: - mongo = create_autospec(MongoUtil, instance=True, spec_set=True) - else: - mongo = MongoUtil(config) - - if SlackClient in to_be_mocked: - slack = create_autospec(SlackClient, instance=True, spec_set=True) - else: - slack = SlackClient( - config["slack-token"], debug=True, endpoint=config["ee2-url"] - ) - - cs = ClientSet(kbase_auth, authadmin, condor, catutil, kafka, mongo, slack) - return { - ClientSet: cs, - KBaseAuth: kbase_auth, - AdminAuthUtil: authadmin, - Condor: condor, - CatalogUtils: catutil, - KafkaClient: kafka, - MongoUtil: mongo, - SlackClient: slack, - } + ret = {} + for clazz in ALL_CLIENTS: + if clazz in to_be_mocked: + ret[clazz] = create_autospec(clazz, instance=True, spec_set=True) + else: + ret[clazz] 
= _CLASS_IMPLEMENTATION_BUILDERS[clazz](config, config_path) + ret[ClientSet] = ClientSet( + ret[KBaseAuth], + ret[AdminAuthUtil], + ret[Condor], + ret[CatalogUtils], + ret[KafkaClient], + ret[MongoUtil], + ret[SlackClient], + ) + return ret From 5d1ced69d0250c8139285772ccb909cc42008368 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Fri, 5 Mar 2021 08:46:32 -0800 Subject: [PATCH 019/109] DATAUP-389 Reduce Condor wrapper public API size (#324) * Clarify the Condor class public API A number of methods that are not used and don't appear to be necessary for use outside the Condor class have been made "private". * Remove Scheduler ABC and reduce Condor public interface Scheduler ABC not currently needed, we can extract an interface from Condor if it turns out we need to support multiple queues. Further reduced the public interface of Condor.py * Remove submit_file param from Condor.run_job Completely unused, so removing simplifies the interface * Correct testing comment * Allow DI based unit testing for Condor wrapper Allows passing in a htcondor module to the Condor wapper so it can be mocked via create_autospec. 
--- lib/execution_engine2/utils/Condor.py | 63 +++++++++---------- lib/execution_engine2/utils/Scheduler.py | 32 ---------- lib/execution_engine2/utils/Scheduler.pyi | 16 ----- test/tests_for_auth/ee2_admin_mode_test.py | 4 +- .../ee2_SDKMethodRunner_test.py | 2 +- test/tests_for_sdkmr/ee2_scheduler_test.py | 22 +++---- 6 files changed, 44 insertions(+), 95 deletions(-) delete mode 100644 lib/execution_engine2/utils/Scheduler.py delete mode 100644 lib/execution_engine2/utils/Scheduler.pyi diff --git a/lib/execution_engine2/utils/Condor.py b/lib/execution_engine2/utils/Condor.py index 5dca510ac..1d4570a79 100644 --- a/lib/execution_engine2/utils/Condor.py +++ b/lib/execution_engine2/utils/Condor.py @@ -21,10 +21,9 @@ SubmissionInfo, JobInfo, ) -from lib.execution_engine2.utils.Scheduler import Scheduler -class Condor(Scheduler): +class Condor: # TODO: Should these be outside of the class? REQUEST_CPUS = "request_cpus" REQUEST_MEMORY = "request_memory" @@ -43,7 +42,14 @@ class Condor(Scheduler): PYTHON_EXECUTABLE = "PYTHON_EXECUTABLE" DEFAULT_CLIENT_GROUP = "default_client_group" - def __init__(self, config_filepath): + def __init__(self, config_filepath, htc=htcondor): + """ + Create the condor wrapper. + + config_filepath - the path to the execution_engine2 configuration file. + htc - the htcondor module, or an alternate implementation or mock. 
+ """ + self.htcondor = htc self.config = ConfigParser() self.override_clientgroup = os.environ.get("OVERRIDE_CLIENT_GROUP", None) self.config.read(config_filepath) @@ -81,7 +87,7 @@ def __init__(self, config_filepath): ) self.logger = logging.getLogger("ee2") - def setup_environment_vars(self, params: Dict, client_group: str) -> str: + def _setup_environment_vars(self, params: Dict, client_group: str) -> str: # 7 day docker job timeout default, Catalog token used to get access to volume mounts dm = ( str(params["cg_resources_requirements"].get("debug_mode", "")).lower() @@ -153,7 +159,7 @@ def extract_resources(self, cgrr: Dict[str, str]) -> CondorResources: return cr - def extract_requirements( + def _extract_requirements( self, cgrr: Optional[dict] = None, client_group: Optional[str] = None ): """ @@ -235,7 +241,7 @@ def _extract_resources_and_requirements( sub["request_disk"] = resources.request_disk client_group = resources.client_group # Set requirements statement - requirements = self.extract_requirements(cgrr=cgrr, client_group=client_group) + requirements = self._extract_requirements(cgrr=cgrr, client_group=client_group) sub["requirements"] = " && ".join(requirements) sub["+KB_CLIENTGROUP"] = f'"{client_group}"' return (sub, client_group) @@ -269,7 +275,7 @@ def _add_resources_and_special_attributes( sub = dict() sub["JobBatchName"] = params.get("job_id") sub["arguments"] = f"{params['job_id']} {self.ee_endpoint}" - sub = self.add_job_labels(sub=sub, params=params) + sub = self._add_job_labels(sub=sub, params=params) # Extract special requirements (sub, client_group) = self._extract_resources_and_requirements( sub, params["cg_resources_requirements"] @@ -282,16 +288,18 @@ def _add_resources_and_special_attributes( client_group = concierge_params.client_group sub["+AccountingGroup"] = f'"{sub["+AccountingGroup"]}"' - sub["environment"] = self.setup_environment_vars( + sub["environment"] = self._setup_environment_vars( params, client_group=client_group ) 
return sub # TODO Copy stuff from Concierge Params into #AcctGroup/Clientgroup/JobPrio, CPu/MEMORY/DISK/ - def create_submit( + def _create_submit( self, params: Dict, concierge_params: ConciergeParams = None ) -> Dict: + # note some tests call this function directly and will need to be updated if the + # signature is changed self._check_for_missing_runjob_params(params) sub = self._add_resources_and_special_attributes(params, concierge_params) @@ -302,11 +310,8 @@ def create_submit( sub[item] = str(sub[item]) return sub - def concierge(self, sub, concierge_params): - pass - @staticmethod - def add_job_labels(sub: Dict, params: Dict[str, str]): + def _add_job_labels(sub: Dict, params: Dict[str, str]): sub["+KB_PARENT_JOB_ID"] = params.get("parent_job_id", "") sub["+KB_MODULE_NAME"] = params.get("method", "").split(".")[0] sub["+KB_FUNCTION_NAME"] = params.get("method", "").split(".")[-1] @@ -327,27 +332,20 @@ def add_job_labels(sub: Dict, params: Dict[str, str]): def run_job( self, params: Dict[str, str], - submit_file: Dict[str, str] = None, concierge_params: Dict[str, str] = None, ) -> SubmissionInfo: """ TODO: Add a retry TODO: Add list of required params :param params: Params to run the job, such as the username, job_id, token, client_group_and_requirements - :param submit_file: A optional completed Submit File :param concierge_params: Concierge Options for Submit Files :return: """ - if submit_file is None: - submit_file = self.create_submit(params, concierge_params) + submit = self._create_submit(params, concierge_params) - return self.run_submit(submit_file) - - def run_submit(self, submit: Dict[str, str]) -> SubmissionInfo: - - sub = htcondor.Submit(submit) + sub = self.htcondor.Submit(submit) try: - schedd = htcondor.Schedd() + schedd = self.htcondor.Schedd() self.logger.debug(schedd) self.logger.debug(submit) self.logger.debug(os.getuid()) @@ -364,7 +362,7 @@ def get_job_resource_info( if job_id is not None and cluster_id is not None: raise Exception("Use 
only batch name (job_id) or cluster_id, not both") - condor_stats = self.get_job_info(job_id=job_id, cluster_id=cluster_id) + condor_stats = self._get_job_info(job_id=job_id, cluster_id=cluster_id) # Don't leak token into the logs here job_info = condor_stats.info if job_info is None: @@ -397,9 +395,11 @@ def get_job_resource_info( return extracted_resources - def get_job_info( + def _get_job_info( self, job_id: Optional[str] = None, cluster_id: Optional[str] = None ) -> JobInfo: + # note some tests replace this function with a MagicMock and will need to be updated if + # the signature is changed if job_id is not None and cluster_id is not None: return JobInfo( @@ -416,7 +416,7 @@ def get_job_info( ) try: - job = htcondor.Schedd().query(constraint=constraint, limit=1) + job = self.htcondor.Schedd().query(constraint=constraint, limit=1) if len(job) == 0: job = [{}] return JobInfo(info=job[0], error=None) @@ -425,18 +425,15 @@ def get_job_info( raise e # return JobInfo(info=None, error=e) - def get_user_info(self, user_id, projection=None): - pass - def cancel_job(self, job_id: str) -> bool: """ :param job_id: :return: """ - return self.cancel_jobs([f"{job_id}"]) + return self._cancel_jobs([f"{job_id}"]) - def cancel_jobs(self, scheduler_ids: list): + def _cancel_jobs(self, scheduler_ids: list): """ Possible return structure like this [ @@ -457,8 +454,8 @@ def cancel_jobs(self, scheduler_ids: list): raise Exception("Please provide a list of condor ids to cancel") try: - cancel_jobs = htcondor.Schedd().act( - action=htcondor.JobAction.Remove, job_spec=scheduler_ids + cancel_jobs = self.htcondor.Schedd().act( + action=self.htcondor.JobAction.Remove, job_spec=scheduler_ids ) self.logger.info(f"Cancel job message for {scheduler_ids} is") self.logger.debug(f"{cancel_jobs}") diff --git a/lib/execution_engine2/utils/Scheduler.py b/lib/execution_engine2/utils/Scheduler.py deleted file mode 100644 index f945248e2..000000000 --- a/lib/execution_engine2/utils/Scheduler.py +++ 
/dev/null @@ -1,32 +0,0 @@ -from abc import ABC, abstractmethod - - -class Scheduler(ABC): - @abstractmethod - def run_job(self, params, submit_file=None): - raise NotImplementedError - - @abstractmethod - def create_submit(self, params): - raise NotImplementedError - - def validate_submit_file( - self, - ): - raise NotImplementedError - - @abstractmethod - def run_submit(self, submit): - raise NotImplementedError - - @abstractmethod - def get_job_info(self, job_id, cluster_id): - raise NotImplementedError - - @abstractmethod - def get_user_info(self, user_id, projection=None): - raise NotImplementedError - - @abstractmethod - def cancel_job(self, job_id): - raise NotImplementedError diff --git a/lib/execution_engine2/utils/Scheduler.pyi b/lib/execution_engine2/utils/Scheduler.pyi deleted file mode 100644 index f22afc81e..000000000 --- a/lib/execution_engine2/utils/Scheduler.pyi +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Dict, List -from abc import ABC - -class Scheduler(ABC): - def run_job( - self, params: Dict[str, str], submit_file: Dict[str, str] = None - ) -> str: ... - def create_submit(self, params: Dict[str, str]) -> str: ... - def validate_submit_file(self, submit_file_path) -> bool: ... - def cleanup_submit_file(self, submit_file_path) -> bool: ... - def run_submit(self, submit) -> str: ... - def get_job_info(self, job_id: str, cluster_id: str = None) -> Dict[str, str]: ... - def get_user_info( - self, user_id: str, projection: List[str] = None - ) -> Dict[str, str]: ... - def cancel_job(self, job_id: str) -> bool: ... 
diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index 9a7afac08..0b955310b 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -69,7 +69,7 @@ def setUp(self) -> None: ) self.condor_patch2 = patch.object( target=Condor, - attribute="get_job_info", + attribute="_get_job_info", return_value=get_sample_condor_info(), ) @@ -106,7 +106,7 @@ def getRunner(self, user_clients=None, clients=None) -> SDKMethodRunner: def get_runner_with_condor(self) -> SDKMethodRunner: runner = self.getRunner() condor = MagicMock(return_value={}) - condor.get_job_info = MagicMock(return_value="") + condor._get_job_info = MagicMock(return_value="") condor.get_job_resource_info = MagicMock(return_value="njs") runner.condor = condor diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index 716f0bc4e..80d62e005 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -635,7 +635,7 @@ def test_finish_job_with_error_message(self, condor): self.assertEqual(ori_job_count, new_count - 1) runner = self.getRunner() - condor.get_job_info = MagicMock(return_value={}) + condor._get_job_info = MagicMock(return_value={}) condor.get_job_resource_info = MagicMock(return_value={}) runner.condor = condor runner._send_exec_stats_to_catalog = MagicMock(return_value=True) diff --git a/test/tests_for_sdkmr/ee2_scheduler_test.py b/test/tests_for_sdkmr/ee2_scheduler_test.py index 66ec00622..79799f809 100644 --- a/test/tests_for_sdkmr/ee2_scheduler_test.py +++ b/test/tests_for_sdkmr/ee2_scheduler_test.py @@ -48,7 +48,7 @@ def test_empty_params(self): with self.assertRaisesRegex( Exception, "cg_resources_requirements not found in params" ): - c.create_submit(params) + c._create_submit(params) def test_create_submit_file(self): # Test with empty clientgroup @@ -56,7 +56,7 @@ def 
test_create_submit_file(self): c = self.condor params = self._create_sample_params(cgroups=["njs"]) - default_sub = c.create_submit(params) + default_sub = c._create_submit(params) sub = default_sub self.assertEqual(sub["executable"], c.initial_dir + "/" + c.executable) @@ -84,7 +84,7 @@ def test_create_submit_file(self): cgroups=["njs,request_cpus=8,request_memory=10GB,request_apples=5"] ) - njs_sub = c.create_submit(params) + njs_sub = c._create_submit(params) sub = njs_sub self.assertIn("njs", sub["requirements"]) @@ -113,7 +113,7 @@ def test_create_submit_file(self): params = self._create_sample_params(cgroups="") - empty_sub = c.create_submit(params) + empty_sub = c._create_submit(params) sub = empty_sub self.assertEqual(sub[Condor.REQUEST_CPUS], c.config["njs"][Condor.REQUEST_CPUS]) @@ -133,17 +133,17 @@ def test_create_submit_file(self): params = self._create_sample_params(cgroups=["{}"]) - empty_json_sub = c.create_submit(params) + empty_json_sub = c._create_submit(params) params = self._create_sample_params(cgroups=['{"client_group" : "njs"}']) - json_sub = c.create_submit(params) + json_sub = c._create_submit(params) params = self._create_sample_params( cgroups=['{"client_group" : "njs", "client_group_regex" : "false"}'] ) - json_sub_with_regex_disabled_njs = c.create_submit(params) + json_sub_with_regex_disabled_njs = c._create_submit(params) # json_sub_with_regex_disabled @@ -166,7 +166,7 @@ def test_create_submit_file(self): ) # json_sub_with_regex_disabled - c.create_submit(params) + c._create_submit(params) logging.info("Testing with real json, regex disabled, bigmem") @@ -174,7 +174,7 @@ def test_create_submit_file(self): cgroups=['{"client_group" : "bigmem", "client_group_regex" : "FaLsE"}'] ) - json_sub_with_regex_disabled_bigmem = c.create_submit(params) + json_sub_with_regex_disabled_bigmem = c._create_submit(params) self.assertIn( '(CLIENTGROUP == "bigmem', json_sub_with_regex_disabled_bigmem["requirements"], @@ -194,7 +194,7 @@ def 
test_create_submit_file_concierge(self): c = self.condor params = self._create_sample_params(cgroups=["njs"]) cp = self._get_concierge_params() - sub = c.create_submit(params=params, concierge_params=cp) + sub = c._create_submit(params=params, concierge_params=cp) # Concurrency limits removed self.assertNotIn("Concurrency_Limits", sub) self.assertEqual(sub["+AccountingGroup"], '"' + params["user_id"] + '"') @@ -205,7 +205,7 @@ def test_create_submit_file_concierge(self): cp.client_group = "LeConcierge" cp.account_group = "LeCat" - sub2 = c.create_submit(params=params, concierge_params=cp) + sub2 = c._create_submit(params=params, concierge_params=cp) self.assertEqual(sub2["+KB_CLIENTGROUP"], f'"{str(cp.client_group)}"') self.assertEqual(sub2["+AccountingGroup"], '"' + cp.account_group + '"') self.assertNotIn("Concurrency_Limits", sub2) From 84b2f461e7f241a3e6259f96fe2bae227a34c136 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Sun, 7 Mar 2021 21:46:39 -0800 Subject: [PATCH 020/109] Add argument / parameter checkers (#325) * Add more argument checkers Will be needed for upcoming PRs re job params * run black --- lib/execution_engine2/utils/arg_processing.py | 131 ++++++++++++++- test/tests_for_utils/arg_processing_test.py | 152 +++++++++++++++++- 2 files changed, 278 insertions(+), 5 deletions(-) diff --git a/lib/execution_engine2/utils/arg_processing.py b/lib/execution_engine2/utils/arg_processing.py index 252822838..ffc1c5e9f 100644 --- a/lib/execution_engine2/utils/arg_processing.py +++ b/lib/execution_engine2/utils/arg_processing.py @@ -3,7 +3,12 @@ normalizers. 
""" -from typing import Union +from typing import Optional, Iterable, TypeVar, Union +import datetime +import unicodedata +from execution_engine2.exceptions import IncorrectParamsException + +T = TypeVar("T") def parse_bool(putative_bool: Union[str, bool, int, float, None]) -> bool: @@ -12,7 +17,7 @@ def parse_bool(putative_bool: Union[str, bool, int, float, None]) -> bool: Strings containing 'true' or 'false', regardless of capitalization, are considered booleans. Strings containing ints or floats are parsed to floats before processing. - Raises ValueError if the value cannot be parsed. + Raises IncorrectParamsException if the value cannot be parsed. """ pb = putative_bool if pb is None: @@ -33,4 +38,124 @@ def parse_bool(putative_bool: Union[str, bool, int, float, None]) -> bool: if pb.lower() == "false": return False - raise ValueError(f"{pb} is not a boolean value") + raise IncorrectParamsException(f"{pb} is not a boolean value") + + +# The remaining methods are ported from +# https://github.com/kbase/sample_service/blob/master/lib/SampleService/core/arg_checkers.py +# with slight changes. +# Should probably make a package or see if there are equivalent 3rd party functions at some point. +# Although if you want to use custom exceptions as here that won't work + + +def not_falsy(item: T, item_name: str) -> T: + """ + Check if a value is falsy and throw and exception if so. + :param item: the item to check for falsiness. + :param item_name: the name of the item to include in any exception. + :raises ValueError: if the item is falsy. + :returns: the item. + """ + if not item: + raise ValueError(f"{item_name} cannot be a value that evaluates to false") + return item + + +def not_falsy_in_iterable( + iterable: Optional[Iterable[T]], name: str, allow_none: bool = False +) -> Optional[Iterable[T]]: + """ + Check that an iterable is not None and contains no falsy items. Empty iterables are accepted. + :param iterable: the iterable to check. 
+ :param name: the name of the iterable to be used in error messages. + :param allow_none: allow the iterable to be None - in this case return None. The contents of + the iterable may not be None. + :returns: the iterable. + :raises ValueError: if the iterable is None or contains falsy items. + """ + # probably need to allow for 0 as an option + if iterable is None: + if allow_none: + return None + raise ValueError(f"{name} cannot be None") + for i, item in enumerate(iterable): + not_falsy(item, f"Index {i} of iterable {name}") + return iterable + + +def _contains_control_characters(string: str) -> bool: + """ + Check if a string contains control characters, as denoted by the Unicode character category + starting with a C. + :param string: the string to check. + :returns: True if the string contains control characters, False otherwise. + """ + # make public if needed + # See https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python # noqa: E501 + for c in string: + if unicodedata.category(c)[0] == "C": + return True + return False + + +def _no_control_characters(string: str, name: str) -> str: + """ + Checks that a string contains no control characters and throws an exception if it does. + See :meth:`contains_control_characters` for more information. + :param string: The string to check. + :param name: the name of the string to include in any exception. + :raises IncorrectParamsException: if the string contains control characters. + :returns: the string. 
+ """ + # make public if needed + if _contains_control_characters(string): + raise IncorrectParamsException(name + " contains control characters") + return string + + +def check_string( + string: Optional[str], name: str, max_len: int = None, optional: bool = False +) -> Optional[str]: + """ + Check that a string meets a set of criteria: + - it is not None or whitespace only (unless the optional parameter is specified) + - it contains no control characters + - (optional) it is less than some specified maximum length + :param string: the string to test. + :param name: the name of the string to be used in error messages. + :param max_len: the maximum length of the string. + :param optional: True if no error should be thrown if the string is None. + :returns: the stripped string or None if the string was optional and None or whitespace only. + :raises IncorrectParamsException: if the string is None, whitespace only, too long, or + contains illegal characters. + """ + # See the IDMapping service if character classes are needed. + # Maybe package this stuff + if max_len is not None and max_len < 1: + raise ValueError("max_len must be > 0 if provided") + if not string or not string.strip(): + if optional: + return None + raise IncorrectParamsException("Missing input parameter: " + name) + string = string.strip() + _no_control_characters(string, name) + if max_len and len(string) > max_len: + raise IncorrectParamsException(f"{name} exceeds maximum length of {max_len}") + return string + + +def check_timestamp(timestamp: datetime.datetime, name: str): + """ + Check that a timestamp is not None and not naive. See + https://docs.python.org/3.8/library/datetime.html#aware-and-naive-objects + :param timestamp: the timestamp to check. + :param name: the name of the variable to use in thrown errors. + :returns: the timestamp. + :raises ValueError: if the check fails. 
+ """ + if not_falsy(timestamp, name).tzinfo is None: + # The docs say you should also check savetime.tzinfo.utcoffset(savetime) is not None, + # but initializing a datetime with a tzinfo subclass that returns None for that method + # causes the constructor to throw an error + raise ValueError(f"{name} cannot be a naive datetime") + return timestamp diff --git a/test/tests_for_utils/arg_processing_test.py b/test/tests_for_utils/arg_processing_test.py index 11164fad1..f3282ca05 100644 --- a/test/tests_for_utils/arg_processing_test.py +++ b/test/tests_for_utils/arg_processing_test.py @@ -1,6 +1,14 @@ from pytest import raises -from execution_engine2.utils.arg_processing import parse_bool +import datetime +from execution_engine2.utils.arg_processing import ( + parse_bool, + check_string, + not_falsy, + not_falsy_in_iterable, + check_timestamp, +) +from execution_engine2.exceptions import IncorrectParamsException from utils_shared.test_utils import assert_exception_correct @@ -54,4 +62,144 @@ def test_parse_bool_failure(): for tc in testcases: with raises(Exception) as e: parse_bool(tc) - assert_exception_correct(e.value, ValueError(f"{tc} is not a boolean value")) + assert_exception_correct( + e.value, IncorrectParamsException(f"{tc} is not a boolean value") + ) + + +def test_falsy_true(): + for t in ["a", 1, True, [1], {"a": 1}, {1}]: + assert not_falsy(t, "foo") is t + + +def test_falsy_fail(): + for f in ["", 0, False, [], dict(), {}]: + with raises(Exception) as got: + not_falsy(f, "my name") + assert_exception_correct( + got.value, ValueError("my name cannot be a value that evaluates to false") + ) + + +def test_falsy_in_iterable_true(): + for t in [[], [1, "a"], [True], [{"foo"}]]: + assert not_falsy_in_iterable(t, "foo") is t + + +def test_falsy_in_iterable_allow_none(): + assert not_falsy_in_iterable(None, "yay", allow_none=True) is None + + +def test_falsy_in_iterable_no_iterable(): + with raises(Exception) as got: + not_falsy_in_iterable(None, "whee") + 
assert_exception_correct(got.value, ValueError("whee cannot be None")) + + +def test_falsy_in_iterable_false_insides(): + for item, pos in [ + [["", "bar"], 0], + [["foo", 0], 1], + [[True, True, False, True], 2], + [[[]], 0], + [[dict()], 0], + [[{}], 0], + ]: + with raises(Exception) as got: + not_falsy_in_iterable(item, "my name") + assert_exception_correct( + got.value, + ValueError( + f"Index {pos} of iterable my name cannot be a value that evaluates to false" + ), + ) + + +def test_check_string(): + for string, expected in { + " foo": "foo", + " \t baɷr ": "baɷr", + "baᚠz \t ": "baᚠz", + "bat": "bat", + "a" * 1000: "a" * 1000, + }.items(): + assert check_string(string, "name") == expected + + +def test_check_string_bad_max_len(): + for max_len in [0, -1, -100]: + with raises(Exception) as got: + check_string("str", "var name", max_len=max_len) + assert_exception_correct( + got.value, ValueError("max_len must be > 0 if provided") + ) + + +def test_check_string_optional_true(): + for string in [None, " \t "]: + assert check_string(string, "name", optional=True) is None + + +def test_check_string_optional_false(): + for string in [None, " \t "]: + with raises(Exception) as got: + check_string(string, "var name") + assert_exception_correct( + got.value, IncorrectParamsException("Missing input parameter: var name") + ) + + +def test_check_string_control_characters(): + for string in ["foo \b bar", "foo\u200bbar", "foo\0bar", "foo\bbar"]: + with raises(Exception) as got: + check_string(string, "var name") + assert_exception_correct( + got.value, IncorrectParamsException("var name contains control characters") + ) + + +def test_check_string_max_len(): + for string, length in { + "123456789": 9, + "a": 1, + "a" * 100: 100, + "a" * 10000: 10000, + "a" * 10000: 1000000, + }.items(): + assert check_string(string, "name", max_len=length) == string + + +def test_check_string_long_fail(): + for string, length in {"123456789": 8, "ab": 1, "a" * 100: 99}.items(): + with 
raises(Exception) as got: + check_string(string, "var name", max_len=length) + assert_exception_correct( + got.value, + IncorrectParamsException(f"var name exceeds maximum length of {length}"), + ) + + +def _dt(timestamp): + return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc) + + +def test_check_timestamp(): + for t in [-1000000, -256, -1, 0, 1, 6, 100, 100000000000]: + assert check_timestamp(_dt(t), "name") == _dt(t) + + +def test_check_timestamp_fail_bad_args(): + _check_timestamp_fail( + None, "ts", ValueError("ts cannot be a value that evaluates to false") + ) + _check_timestamp_fail( + datetime.datetime.now(), + "tymestampz", + ValueError("tymestampz cannot be a naive datetime"), + ) + + +def _check_timestamp_fail(ts, name, expected): + with raises(Exception) as got: + check_timestamp(ts, name) + assert_exception_correct(got.value, expected) From 6001235f174ae003f9dfa70eb60c1dc55bd57e2b Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 8 Mar 2021 10:06:53 -0800 Subject: [PATCH 021/109] Add user info class to hold user information (#327) * Add user info class to hold user information Might add more fields later as needed, but there's a couple places where this info is showing up so a container makes sense * run black * typo, remove irrelevant comments --- lib/execution_engine2/utils/arg_processing.py | 4 +- lib/execution_engine2/utils/user_info.py | 36 +++++++++++ test/tests_for_utils/user_info_test.py | 61 +++++++++++++++++++ 3 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 lib/execution_engine2/utils/user_info.py create mode 100644 test/tests_for_utils/user_info_test.py diff --git a/lib/execution_engine2/utils/arg_processing.py b/lib/execution_engine2/utils/arg_processing.py index ffc1c5e9f..688964b69 100644 --- a/lib/execution_engine2/utils/arg_processing.py +++ b/lib/execution_engine2/utils/arg_processing.py @@ -50,7 +50,7 @@ def parse_bool(putative_bool: Union[str, bool, int, float, None]) -> bool: def
not_falsy(item: T, item_name: str) -> T: """ - Check if a value is falsy and throw and exception if so. + Check if a value is falsy and throw an exception if so. :param item: the item to check for falsiness. :param item_name: the name of the item to include in any exception. :raises ValueError: if the item is falsy. @@ -129,8 +129,6 @@ def check_string( :raises IncorrectParamsException: if the string is None, whitespace only, too long, or contains illegal characters. """ - # See the IDMapping service if character classes are needed. - # Maybe package this stuff if max_len is not None and max_len < 1: raise ValueError("max_len must be > 0 if provided") if not string or not string.strip(): diff --git a/lib/execution_engine2/utils/user_info.py b/lib/execution_engine2/utils/user_info.py new file mode 100644 index 000000000..1a7a4e10f --- /dev/null +++ b/lib/execution_engine2/utils/user_info.py @@ -0,0 +1,36 @@ +""" +User information classes and methods. +""" + +from execution_engine2.utils.arg_processing import check_string as _check_string + + +class UserCreds: + """ + Contains a user's username and token. + + Instance variables: + username - the users's username. + token - the user's token. + """ + + # TODO replace the creds in the clients.UserClientSet with this class + + def __init__(self, username: str, token: str): + """ + Create the creds. + + username - the user's username. + token - the user's token. It is expected that the client programmer verifies that the + token is indeed tied to the user. 
+ """ + self.username = _check_string(username, "username") + self.token = _check_string(token, "token") + + def __eq__(self, other): + if type(self) == type(other): + return (self.username, self.token) == (other.username, other.token) + return False + + def __hash__(self): + return hash((self.username, self.token)) diff --git a/test/tests_for_utils/user_info_test.py b/test/tests_for_utils/user_info_test.py new file mode 100644 index 000000000..42d7b54b7 --- /dev/null +++ b/test/tests_for_utils/user_info_test.py @@ -0,0 +1,61 @@ +from pytest import raises +from execution_engine2.utils.user_info import UserCreds +from execution_engine2.exceptions import IncorrectParamsException +from utils_shared.test_utils import assert_exception_correct + + +def test_user_creds_init_success(): + uc = UserCreds(" username ", " some token ") + assert uc.username == "username" + assert uc.token == "some token" + + +def test_user_creds_init_fail(): + _user_creds_init_fail( + None, "t", IncorrectParamsException("Missing input parameter: username") + ) + _user_creds_init_fail( + " \t ", "t", IncorrectParamsException("Missing input parameter: username") + ) + _user_creds_init_fail( + "u", None, IncorrectParamsException("Missing input parameter: token") + ) + _user_creds_init_fail( + "u", " \t ", IncorrectParamsException("Missing input parameter: token") + ) + + +def _user_creds_init_fail(username, token, expected): + with raises(Exception) as got: + UserCreds(username, token) + assert_exception_correct(got.value, expected) + + +def test_user_creds_eq(): + u1 = "u1" + u1a = "u1" + u2 = "u2" + t1 = "t1" + t1a = "t1" + t2 = "t2" + + assert UserCreds(u1, t1) == UserCreds(u1a, t1a) + assert UserCreds(u1, t1) != UserCreds(u1, t2) + assert UserCreds(u1, t1) != UserCreds(u2, t1) + assert UserCreds(u1, t1) != (u1, t1) + + +def test_user_creds_hash(): + # hashes will change from instance to instance of the python interpreter, and therefore + # tests can't be written that directly test the hash 
value. See + # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + u1 = "u1" + u1a = "u1" + u2 = "u2" + t1 = "t1" + t1a = "t1" + t2 = "t2" + + assert hash(UserCreds(u1, t1)) == hash(UserCreds(u1a, t1a)) + assert hash(UserCreds(u1, t1)) != hash(UserCreds(u1, t2)) + assert hash(UserCreds(u1, t1)) != hash(UserCreds(u2, t1)) From 4819033883fae0b3ad0555cdbc501bb05d4e1e5e Mon Sep 17 00:00:00 2001 From: Gavin Date: Mon, 8 Mar 2021 15:32:26 -0800 Subject: [PATCH 022/109] Reduce EE2status API Method was unused outside the class other than in a test where it was mocked out --- lib/execution_engine2/sdk/EE2Status.py | 5 +++-- test/tests_for_sdkmr/ee2_load_test.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index 96aeb70f6..902c22654 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -368,15 +368,16 @@ def finish_job( ) ) self._send_exec_stats_to_catalog(job_id=job_id) - self.update_finished_job_with_usage(job_id, as_admin=as_admin) + self._update_finished_job_with_usage(job_id, as_admin=as_admin) - def update_finished_job_with_usage(self, job_id, as_admin=None) -> Dict: + def _update_finished_job_with_usage(self, job_id, as_admin=None) -> Dict: """ # TODO Does this need a kafka message? 
:param job_id: :param as_admin: :return: """ + # note this method is replaced by a magic mock in some tests job = self.sdkmr.get_job_with_permission( job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=as_admin ) diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index d97e4512c..51047fdb6 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -62,7 +62,7 @@ def _getRunner(cls) -> SDKMethodRunner: # Initialize these clients from None status = runner.get_jobs_status() # type: JobsStatus status._send_exec_stats_to_catalog = MagicMock(return_value=True) - status.update_finished_job_with_usage = MagicMock(return_value=True) + status._update_finished_job_with_usage = MagicMock(return_value=True) runjob = runner.get_runjob() runjob._get_module_git_commit = MagicMock(return_value="GitCommithash") runner.get_job_logs() From 0b097de16303a9430f8ad521f07c4944af2e5cd7 Mon Sep 17 00:00:00 2001 From: Gavin Date: Mon, 8 Mar 2021 17:38:43 -0800 Subject: [PATCH 023/109] Fix finish_job bug, add tests EE2status attempting to access get_catalog_utils().catalog caused finish_job() to throw an exception and consequently the job runner to freak out: https://github.com/kbase/JobRunner/issues/43 Accessed the catalog directly and added a unit test that covers the code in question. 
--- lib/execution_engine2/sdk/EE2Status.py | 8 +- test/tests_for_sdkmr/EE2Status_test.py | 123 +++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 4 deletions(-) create mode 100644 test/tests_for_sdkmr/EE2Status_test.py diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index 902c22654..35468b7b7 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -355,9 +355,9 @@ def finish_job( ) ) else: - self.sdkmr.logger.debug("Finishing job with a success") + self.sdkmr.get_logger().debug("Finishing job with a success") self._finish_job_with_success(job_id=job_id, job_output=job_output) - self.sdkmr.kafka_client.send_kafka_message( + self.sdkmr.get_kafka_client().send_kafka_message( message=KafkaFinishJob( job_id=str(job_id), new_status=Status.completed.value, @@ -391,7 +391,7 @@ def _update_finished_job_with_usage(self, job_id, as_admin=None) -> Dict: ) condor = self.sdkmr.get_condor() resources = condor.get_job_resource_info(job_id=job_id) - self.sdkmr.logger.debug(f"Extracted the following condor job ads {resources}") + self.sdkmr.get_logger().debug(f"Extracted the following condor job ads {resources}") self.sdkmr.get_mongo_util().update_job_resources( job_id=job_id, resources=resources ) @@ -547,7 +547,7 @@ def _send_exec_stats_to_catalog(self, job_id): log_exec_stats_params["is_error"] = int(job.status == Status.error.value) log_exec_stats_params["job_id"] = job_id - self.sdkmr.catalog_utils.catalog.log_exec_stats(log_exec_stats_params) + self.sdkmr.get_catalog_utils().get_catalog().log_exec_stats(log_exec_stats_params) def abandon_children(self, parent_job_id, child_job_ids, as_admin=False) -> Dict: if not parent_job_id: diff --git a/test/tests_for_sdkmr/EE2Status_test.py b/test/tests_for_sdkmr/EE2Status_test.py new file mode 100644 index 000000000..764103581 --- /dev/null +++ b/test/tests_for_sdkmr/EE2Status_test.py @@ -0,0 +1,123 @@ +""" +Unit tests for the EE2Status 
class. +""" + +from pytest import raises + +from logging import Logger +from unittest.mock import create_autospec, call +from bson.objectid import ObjectId + +from execution_engine2.db.models.models import Job, Status, JobInput +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.sdk.EE2Status import JobsStatus, JobPermissions +from execution_engine2.db.MongoUtil import MongoUtil +from lib.execution_engine2.utils.KafkaUtils import KafkaClient, KafkaFinishJob +from lib.execution_engine2.utils.CatalogUtils import CatalogUtils +from lib.execution_engine2.utils.Condor import Condor +from installed_clients.CatalogClient import Catalog + +from utils_shared.test_utils import assert_exception_correct + + +def _finish_job_complete_minimal_get_test_job(job_id, sched, app_id, gitcommit, user): + job = Job() + job.id = ObjectId(job_id) + job.running = 123.0 + job.finished = 456.5 + job.status = Status.running.value + job.scheduler_id = sched + job_input = JobInput() + job.job_input = job_input + job_input.app_id = app_id + job_input.method = "module.method_id" + job_input.service_ver = gitcommit + job.user = user + return job + + +def test_finish_job_complete_minimal(): + """ + Tests a very simple case of completing a job successfully by the `finish_job` method. 
+ """ + # set up constants + job_id = "6046b539ce9c58ecf8c3e5f3" + job_output = { + 'version': '1.1', + 'id': job_id, + 'result': [{"foo": "bar"}] + } + user = "someuser" + app_id = "module/myapp" + gitcommit = "somecommit" + resources = { + "fake": "condor", + "resources": "in", + "here": "yo" + } + sched = "somescheduler" + + # set up mocks + sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) + logger = create_autospec(Logger, spec_set=True, instance=True) + mongo = create_autospec(MongoUtil, spec_set=True, instance=True) + kafka = create_autospec(KafkaClient, spec_set=True, instance=True) + catutil = create_autospec(CatalogUtils, spec_set=True, instance=True) + catalog = create_autospec(Catalog, spec_set=True, instance=True) + condor = create_autospec(Condor, spec_set=True, instance=True) + sdkmr.get_mongo_util.return_value = mongo + sdkmr.get_logger.return_value = logger + sdkmr.get_kafka_client.return_value = kafka + sdkmr.get_condor.return_value = condor + sdkmr.get_catalog_utils.return_value = catutil + catutil.get_catalog.return_value = catalog + + # set up return values for mocks. Ordered as per order of operations in code + job1 = _finish_job_complete_minimal_get_test_job(job_id, sched, app_id, gitcommit, user) + job2 = _finish_job_complete_minimal_get_test_job(job_id, sched, app_id, gitcommit, user) + job2.status = Status.completed.value + + sdkmr.get_job_with_permission.side_effect = [job1, job2] + mongo.get_job.return_value = job2 # gets the job 3x...? + condor.get_job_resource_info.return_value = resources + + # call the method + JobsStatus(sdkmr).finish_job(job_id, job_output=job_output) # no return + + # check mocks called as expected. 
Ordered as per order of operations in code + + sdkmr.get_job_with_permission.assert_has_calls([ + call(job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=False), + call(job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=False) + ]) + logger.debug.assert_has_calls([ + call("Finishing job with a success"), + # depending on stable dict ordering for this test to pass + call(f"Extracted the following condor job ads {resources}") + ]) + mongo.finish_job_with_success.assert_called_once_with(job_id, job_output) + kafka.send_kafka_message.assert_called_once_with( + KafkaFinishJob( + job_id=job_id, + new_status=Status.completed.value, + previous_status=Status.running.value, + scheduler_id=sched, + error_code=None, + error_message=None, + ) + ) + mongo.get_job.assert_called_once_with(job_id) + catalog.log_exec_stats.assert_called_once_with({ + "user_id": user, + "app_module_name": "module", + "app_id": app_id, + "func_module_name": "module", + "func_name": "method_id", + "git_commit_hash": gitcommit, + "creation_time": 1615246649.0, # from Job ObjectId + "exec_start_time": 123.0, + "finish_time": 456.5, + "is_error": 0, + "job_id": job_id + }) + mongo.update_job_resources.assert_called_once_with(job_id, resources) From e2af033e56b56bdd5f4191049b4c1607daadc6f3 Mon Sep 17 00:00:00 2001 From: Gavin Date: Mon, 8 Mar 2021 17:41:30 -0800 Subject: [PATCH 024/109] run black --- lib/execution_engine2/sdk/EE2Status.py | 8 ++- test/tests_for_sdkmr/EE2Status_test.py | 74 ++++++++++++++------------ 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index 35468b7b7..31b2bd4ae 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -391,7 +391,9 @@ def _update_finished_job_with_usage(self, job_id, as_admin=None) -> Dict: ) condor = self.sdkmr.get_condor() resources = condor.get_job_resource_info(job_id=job_id) - 
self.sdkmr.get_logger().debug(f"Extracted the following condor job ads {resources}") + self.sdkmr.get_logger().debug( + f"Extracted the following condor job ads {resources}" + ) self.sdkmr.get_mongo_util().update_job_resources( job_id=job_id, resources=resources ) @@ -547,7 +549,9 @@ def _send_exec_stats_to_catalog(self, job_id): log_exec_stats_params["is_error"] = int(job.status == Status.error.value) log_exec_stats_params["job_id"] = job_id - self.sdkmr.get_catalog_utils().get_catalog().log_exec_stats(log_exec_stats_params) + self.sdkmr.get_catalog_utils().get_catalog().log_exec_stats( + log_exec_stats_params + ) def abandon_children(self, parent_job_id, child_job_ids, as_admin=False) -> Dict: if not parent_job_id: diff --git a/test/tests_for_sdkmr/EE2Status_test.py b/test/tests_for_sdkmr/EE2Status_test.py index 764103581..65df2a557 100644 --- a/test/tests_for_sdkmr/EE2Status_test.py +++ b/test/tests_for_sdkmr/EE2Status_test.py @@ -42,19 +42,11 @@ def test_finish_job_complete_minimal(): """ # set up constants job_id = "6046b539ce9c58ecf8c3e5f3" - job_output = { - 'version': '1.1', - 'id': job_id, - 'result': [{"foo": "bar"}] - } + job_output = {"version": "1.1", "id": job_id, "result": [{"foo": "bar"}]} user = "someuser" app_id = "module/myapp" gitcommit = "somecommit" - resources = { - "fake": "condor", - "resources": "in", - "here": "yo" - } + resources = {"fake": "condor", "resources": "in", "here": "yo"} sched = "somescheduler" # set up mocks @@ -73,8 +65,12 @@ def test_finish_job_complete_minimal(): catutil.get_catalog.return_value = catalog # set up return values for mocks. 
Ordered as per order of operations in code - job1 = _finish_job_complete_minimal_get_test_job(job_id, sched, app_id, gitcommit, user) - job2 = _finish_job_complete_minimal_get_test_job(job_id, sched, app_id, gitcommit, user) + job1 = _finish_job_complete_minimal_get_test_job( + job_id, sched, app_id, gitcommit, user + ) + job2 = _finish_job_complete_minimal_get_test_job( + job_id, sched, app_id, gitcommit, user + ) job2.status = Status.completed.value sdkmr.get_job_with_permission.side_effect = [job1, job2] @@ -86,15 +82,23 @@ def test_finish_job_complete_minimal(): # check mocks called as expected. Ordered as per order of operations in code - sdkmr.get_job_with_permission.assert_has_calls([ - call(job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=False), - call(job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=False) - ]) - logger.debug.assert_has_calls([ - call("Finishing job with a success"), - # depending on stable dict ordering for this test to pass - call(f"Extracted the following condor job ads {resources}") - ]) + sdkmr.get_job_with_permission.assert_has_calls( + [ + call( + job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=False + ), + call( + job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=False + ), + ] + ) + logger.debug.assert_has_calls( + [ + call("Finishing job with a success"), + # depending on stable dict ordering for this test to pass + call(f"Extracted the following condor job ads {resources}"), + ] + ) mongo.finish_job_with_success.assert_called_once_with(job_id, job_output) kafka.send_kafka_message.assert_called_once_with( KafkaFinishJob( @@ -107,17 +111,19 @@ def test_finish_job_complete_minimal(): ) ) mongo.get_job.assert_called_once_with(job_id) - catalog.log_exec_stats.assert_called_once_with({ - "user_id": user, - "app_module_name": "module", - "app_id": app_id, - "func_module_name": "module", - "func_name": "method_id", - "git_commit_hash": gitcommit, - "creation_time": 
1615246649.0, # from Job ObjectId - "exec_start_time": 123.0, - "finish_time": 456.5, - "is_error": 0, - "job_id": job_id - }) + catalog.log_exec_stats.assert_called_once_with( + { + "user_id": user, + "app_module_name": "module", + "app_id": app_id, + "func_module_name": "module", + "func_name": "method_id", + "git_commit_hash": gitcommit, + "creation_time": 1615246649.0, # from Job ObjectId + "exec_start_time": 123.0, + "finish_time": 456.5, + "is_error": 0, + "job_id": job_id, + } + ) mongo.update_job_resources.assert_called_once_with(job_id, resources) From 7e1c03f32ff7159a20ab3a407676d046b334a03b Mon Sep 17 00:00:00 2001 From: Gavin Date: Mon, 8 Mar 2021 17:49:15 -0800 Subject: [PATCH 025/109] remove unused imports --- test/tests_for_sdkmr/EE2Status_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/tests_for_sdkmr/EE2Status_test.py b/test/tests_for_sdkmr/EE2Status_test.py index 65df2a557..191da652b 100644 --- a/test/tests_for_sdkmr/EE2Status_test.py +++ b/test/tests_for_sdkmr/EE2Status_test.py @@ -2,8 +2,6 @@ Unit tests for the EE2Status class. 
""" -from pytest import raises - from logging import Logger from unittest.mock import create_autospec, call from bson.objectid import ObjectId @@ -17,8 +15,6 @@ from lib.execution_engine2.utils.Condor import Condor from installed_clients.CatalogClient import Catalog -from utils_shared.test_utils import assert_exception_correct - def _finish_job_complete_minimal_get_test_job(job_id, sched, app_id, gitcommit, user): job = Job() From 34449d0fd7ce6629a73f2b63be06c899e6f6a051 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 15 Mar 2021 10:48:46 -0700 Subject: [PATCH 026/109] DATAUP-389 Add application info class (#329) * Add application info class Will be part of a job reqs class in the future * Run black --- .../utils/application_info.py | 106 ++++++++++ test/tests_for_utils/application_info_test.py | 193 ++++++++++++++++++ 2 files changed, 299 insertions(+) create mode 100644 lib/execution_engine2/utils/application_info.py create mode 100644 test/tests_for_utils/application_info_test.py diff --git a/lib/execution_engine2/utils/application_info.py b/lib/execution_engine2/utils/application_info.py new file mode 100644 index 000000000..cf76f1b7d --- /dev/null +++ b/lib/execution_engine2/utils/application_info.py @@ -0,0 +1,106 @@ +""" +Contains information about KBase applications. +""" + +from execution_engine2.utils.arg_processing import check_string as _check_string +from execution_engine2.exceptions import IncorrectParamsException + + +def _get2part_string(s, sep, name): + parts = s.split(sep) + if len(parts) != 2: + raise IncorrectParamsException(f"Expected exactly one '{sep}' in {name} '{s}'") + return parts[0].strip(), parts[1].strip() + + +class AppInfo: + """ + Information about a KBase app. + + Instance variables: + module - the app's module, e.g. kb_uploadmethods. + method - the SDK method the app will run, e.g. import_reads_from_staging + application_module - the module containing the application. 
Under normal conditions this + will be the same as 'module'. + application - the id of the application, e.g. import_fastq_interleaved_as_reads_from_staging. + This is the name of the folder in the 'ui/narrative/methods' folder in the app repo + contining the spec files for the app. + """ + + def __init__(self, method: str, app_id: str, strict=True): + """ + Create the application information. + + method - the method name, e.g. kb_uploadmethods.import_reads_from_staging + app_id - the app name, either fully qualified (e.g. + kb_uploadmethods/import_fastq_interleaved_as_reads_from_staging or unqualified (e.g. + import_fastq_interleaved_as_reads_from_staging). If fully qualified, the module name + of the app (kb_uploadmethds in this example) must match the module name for the method. + strict - whether the app_id should be processed strictly or not. Without strict=True, + 1) The application module name may be different from the method module name + 2) The application module may be separated from the application name with a '.' + rather than a '/'. + """ + # Implementation notes: as of this writing, there are app_ids in the ee2 database + # that have a . separator rather than a /, and, in some cases, test data where the + # module for the application and method is not the same, although that should never + # happen in practice. Hence we support non-strict mode to allow for those cases. + mod, meth = _get2part_string( + _check_string(method, "method ID"), ".", "method ID" + ) + self.module = _check_string(mod, "module portion of method ID") + self.method = _check_string(meth, "method portion of method ID") + app_id = _check_string(app_id, "application ID") + if "/" in app_id and "." in app_id: + raise IncorrectParamsException( + f"Application ID '{app_id}' has both '/' and '.' separators" + ) + if "/" in app_id: + mod, app = _get2part_string(app_id, "/", "application ID") + elif "." 
in app_id: + if strict: + raise IncorrectParamsException( + f"Application ID '{app_id}' contains a '.'" + ) + mod, app = _get2part_string(app_id, ".", "application ID") + else: + mod = self.module + app = app_id + if strict and mod != self.module: + raise IncorrectParamsException( + f"Application module '{mod}' must equal method module '{self.module}'" + ) + self.application_module = _check_string(mod, "module portion of application ID") + self.application = _check_string(app, "application portion of application ID") + + def get_method_id(self) -> str: + """ + Get the method id, e.g. module.method. + """ + return f"{self.module}.{self.method}" + + def get_application_id(self) -> str: + """ + Get the application id, e.g. module/application + """ + return f"{self.application_module}/{self.application}" + + def __eq__(self, other): + if type(self) == type(other): + return ( + self.module, + self.method, + self.application_module, + self.application, + ) == ( + other.module, + other.method, + other.application_module, + other.application, + ) + return False + + def __hash__(self): + return hash( + (self.module, self.method, self.application_module, self.application) + ) diff --git a/test/tests_for_utils/application_info_test.py b/test/tests_for_utils/application_info_test.py new file mode 100644 index 000000000..978bb2949 --- /dev/null +++ b/test/tests_for_utils/application_info_test.py @@ -0,0 +1,193 @@ +from pytest import raises +from execution_engine2.utils.application_info import AppInfo +from execution_engine2.exceptions import IncorrectParamsException +from utils_shared.test_utils import assert_exception_correct + + +def test_app_info_strict_init_success(): + ai = AppInfo(" \t mod . 
meth ", "mod/ appthing") + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module == "mod" + assert ai.application == "appthing" + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() == "mod/appthing" + + +def test_app_info_without_app_module_strict_init_success(): + ai = AppInfo(" \t mod . meth ", " appthing \t ") + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module == "mod" + assert ai.application == "appthing" + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() == "mod/appthing" + + +def test_app_info_init_success(): + ai = AppInfo(" \t mod . meth ", "mod2. appthing", strict=False) + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module == "mod2" + assert ai.application == "appthing" + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() == "mod2/appthing" + + +def test_app_info_init_fail(): + m = "m.n" + a = "m.b" + _app_info_init_fail( + None, a, False, IncorrectParamsException("Missing input parameter: method ID") + ) + _app_info_init_fail( + " \t ", + a, + False, + IncorrectParamsException("Missing input parameter: method ID"), + ) + _app_info_init_fail( + " method ", + a, + False, + IncorrectParamsException("Expected exactly one '.' in method ID 'method'"), + ) + _app_info_init_fail( + " mod.innermod.method ", + a, + False, + IncorrectParamsException( + "Expected exactly one '.' in method ID 'mod.innermod.method'" + ), + ) + _app_info_init_fail( + " . meth", + a, + False, + IncorrectParamsException( + "Missing input parameter: module portion of method ID" + ), + ) + _app_info_init_fail( + " mod . 
", + a, + False, + IncorrectParamsException( + "Missing input parameter: method portion of method ID" + ), + ) + + _app_info_init_fail( + m, + None, + False, + IncorrectParamsException("Missing input parameter: application ID"), + ) + _app_info_init_fail( + m, + " \t ", + False, + IncorrectParamsException("Missing input parameter: application ID"), + ) + _app_info_init_fail( + m, + "mod / meth.bak ", + False, + IncorrectParamsException( + "Application ID 'mod / meth.bak' has both '/' and '.' separators" + ), + ) + _app_info_init_fail( + m, + "mod / meth / bak ", + False, + IncorrectParamsException( + "Expected exactly one '/' in application ID 'mod / meth / bak'" + ), + ) + _app_info_init_fail( + m, + "mod.meth", + True, + IncorrectParamsException("Application ID 'mod.meth' contains a '.'"), + ) + _app_info_init_fail( + m, + "mod.meth.anothermeth", + False, + IncorrectParamsException( + "Expected exactly one '.' in application ID 'mod.meth.anothermeth'" + ), + ) + + _app_info_init_fail( + "mod.meth", + " mod2 /meth", + True, + IncorrectParamsException( + "Application module 'mod2' must equal method module 'mod'" + ), + ) + + _app_info_init_fail( + m, + "mod/", + False, + IncorrectParamsException( + "Missing input parameter: application portion of application ID" + ), + ) + _app_info_init_fail( + m, + "/meth", + False, + IncorrectParamsException( + "Missing input parameter: module portion of application ID" + ), + ) + _app_info_init_fail( + m, + "mod. 
", + False, + IncorrectParamsException( + "Missing input parameter: application portion of application ID" + ), + ) + _app_info_init_fail( + m, + " .meth", + False, + IncorrectParamsException( + "Missing input parameter: module portion of application ID" + ), + ) + + +def _app_info_init_fail(meth, app, strict, expected): + with raises(Exception) as got: + AppInfo(meth, app, strict) + assert_exception_correct(got.value, expected) + + +def test_equals(): + assert AppInfo("m.n", "m/p") == AppInfo("m.n", "m/p") + assert AppInfo("m.n", "p/p", False) == AppInfo("m.n", "p/p", False) + + assert AppInfo("m.n", "m/p", False) != AppInfo("n.n", "m/p", False) + assert AppInfo("m.n", "m/p") != AppInfo("m.x", "m/p") + assert AppInfo("m.n", "m/p", False) != AppInfo("m.n", "x/p", False) + assert AppInfo("m.n", "m/p") != AppInfo("m.n", "m/x") + assert AppInfo("m.n", "m/p") != ("m.n", "m/x") + + +def test_hashcode(): + # hashes will change from instance to instance of the python interpreter, and therefore + # tests can't be written that directly test the hash value. See + # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + assert hash(AppInfo("m.n", "m/p")) == hash(AppInfo("m.n", "m/p")) + assert hash(AppInfo("m.n", "p/p", False)) == hash(AppInfo("m.n", "p/p", False)) + + assert hash(AppInfo("m.n", "m/p", False)) != hash(AppInfo("n.n", "m/p", False)) + assert hash(AppInfo("m.n", "m/p")) != hash(AppInfo("m.x", "m/p")) + assert hash(AppInfo("m.n", "m/p", False)) != hash(AppInfo("m.n", "x/p", False)) + assert hash(AppInfo("m.n", "m/p")) != hash(AppInfo("m.n", "m/x")) From 93635779d55cad42272c091673f1bf932b8d26ff Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 16 Mar 2021 09:23:48 -0700 Subject: [PATCH 027/109] DATAUP-389: Add job requirements value class (#330) * Add job requirements class Hold information about resources and requirements for running a job. 
* run black --- Pipfile | 1 + Pipfile.lock | 9 +- .../sdk/job_submission_parameters.py | 98 ++++++++ requirements-dev.txt | 1 + requirements.txt | 1 + .../job_submission_parameters_test.py | 230 ++++++++++++++++++ 6 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 lib/execution_engine2/sdk/job_submission_parameters.py create mode 100644 test/tests_for_sdkmr/job_submission_parameters_test.py diff --git a/Pipfile b/Pipfile index 86fe313c5..dec351075 100644 --- a/Pipfile +++ b/Pipfile @@ -35,6 +35,7 @@ hyperframe = "==5.2.0" idna = "==2.8" importlib-metadata = "==2.0.0" iniconfig = "==1.1.1" +maps = "==5.1.1" memory-profiler = "==0.55.0" mock = "==3.0.5" mongoengine = "==0.18.2" diff --git a/Pipfile.lock b/Pipfile.lock index 081653316..70e4c074b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "57c0c06ffdcb9f25ba60e8aa673adf703113ce8deeff306215c3c25c700e76a7" + "sha256": "b3b343fb441fabbc794f1f8283b2f6737578c7fa482703fb9dce293bb6d91adb" }, "pipfile-spec": 6, "requires": { @@ -413,6 +413,13 @@ "index": "pypi", "version": "==0.2.0" }, + "maps": { + "hashes": [ + "sha256:a92131122b3f6a2acc008e6a4d341a8510da5a83da39b76ef7a49807e1b28de5" + ], + "index": "pypi", + "version": "==5.1.1" + }, "markupsafe": { "hashes": [ "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", diff --git a/lib/execution_engine2/sdk/job_submission_parameters.py b/lib/execution_engine2/sdk/job_submission_parameters.py new file mode 100644 index 000000000..e7dff15ae --- /dev/null +++ b/lib/execution_engine2/sdk/job_submission_parameters.py @@ -0,0 +1,98 @@ +""" +Parameters for submitting a job to a scheduler. 
+""" + +from maps import FrozenMap +from typing import Dict +from execution_engine2.utils.arg_processing import check_string as _check_string +from execution_engine2.exceptions import IncorrectParamsException + + +def _gt_zero(num: int, name: str) -> int: + if num is None or num < 1: + raise IncorrectParamsException(f"{name} must be at least 1") + return num + + +class JobRequirements: + """ + Requirements for running a job on a scheduler. + """ + + def __init__( + self, + cpus: int, + memory_MB: int, + disk_GB: int, + client_group: str, + client_group_regex: bool = False, + as_user: str = None, + ignore_concurrency_limits: bool = False, + scheduler_requirements: Dict[str, str] = None, + ): + """ + Create the job requirements. + + cpus - the number of CPUs required for the job. + memory_MB - the amount of memory, in MB, required for the job. + disk_GB - the amount of disk space, in GB, required for the job. + client_group - the client group in which the job will run. + client_group_regex - whether to treat the client group string as a regular expression + that can match multiple client groups. + as_user - run the job as an alternate user; take the user's username. + ignore_concurrency_limits - allow the user to run this job even if the user's maximum + job count has already been reached. + scheduler_requirements - arbitrary requirements for the scheduler passed as key/value + pairs. Requires knowledge of the scheduler API. 
+ """ + self.cpus = _gt_zero(cpus, "CPU count") + self.memory_MB = _gt_zero(memory_MB, "memory in MB") + self.disk_GB = _gt_zero(disk_GB, "disk space in GB") + self.client_group = _check_string(client_group, "client_group") + self.client_group_regex = client_group_regex + self.as_user = _check_string(as_user, "as_user", optional=True) + self.ignore_concurrency_limits = ignore_concurrency_limits + sr = scheduler_requirements if scheduler_requirements else {} + for key, value in sr.items(): + _check_string(key, "key in scheduler requirements structure") + _check_string( + value, f"value for key '{key}' in scheduler requirements structure" + ) + self.scheduler_requirements = FrozenMap(sr) + + def __eq__(self, other): + if type(self) == type(other): + return ( + self.cpus, + self.memory_MB, + self.disk_GB, + self.client_group, + self.client_group_regex, + self.as_user, + self.ignore_concurrency_limits, + self.scheduler_requirements, + ) == ( + other.cpus, + other.memory_MB, + other.disk_GB, + other.client_group, + other.client_group_regex, + other.as_user, + other.ignore_concurrency_limits, + other.scheduler_requirements, + ) + return False + + def __hash__(self): + return hash( + ( + self.cpus, + self.memory_MB, + self.disk_GB, + self.client_group, + self.client_group_regex, + self.as_user, + self.ignore_concurrency_limits, + self.scheduler_requirements, + ) + ) diff --git a/requirements-dev.txt b/requirements-dev.txt index 366b4859e..e71cab6ae 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -30,6 +30,7 @@ importlib-metadata==2.0.0 iniconfig==1.1.1 Jinja2==2.10.3 JSONRPCBase==0.2.0 +maps==5.1.1 MarkupSafe==1.1.1 memory-profiler==0.55.0 mock==3.0.5 diff --git a/requirements.txt b/requirements.txt index bf77a6d0f..339d37407 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,6 +29,7 @@ importlib-metadata==2.0.0 iniconfig==1.1.1 Jinja2==2.10.3 JSONRPCBase==0.2.0 +maps==5.1.1 MarkupSafe==1.1.1 memory-profiler==0.55.0 mock==3.0.5 diff --git 
a/test/tests_for_sdkmr/job_submission_parameters_test.py b/test/tests_for_sdkmr/job_submission_parameters_test.py new file mode 100644 index 000000000..4f7e0bd3f --- /dev/null +++ b/test/tests_for_sdkmr/job_submission_parameters_test.py @@ -0,0 +1,230 @@ +from pytest import raises +from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.exceptions import IncorrectParamsException +from utils_shared.test_utils import assert_exception_correct + + +def test_job_req_init_minimal(): + jr = JobRequirements(1, 1, 1, "njs") + + assert jr.cpus == 1 + assert jr.memory_MB == 1 + assert jr.disk_GB == 1 + assert jr.client_group == "njs" + assert jr.client_group_regex is False + assert jr.as_user is None + assert jr.ignore_concurrency_limits is False + assert jr.scheduler_requirements == {} + + +def test_job_req_init_maximal(): + jr = JobRequirements( + 6, + 7, + 8, + "bigmemlong", + True, + "someuser", + True, + {"proc": "x286", "maxmem": "640k"}, + ) + + assert jr.cpus == 6 + assert jr.memory_MB == 7 + assert jr.disk_GB == 8 + assert jr.client_group == "bigmemlong" + assert jr.client_group_regex is True + assert jr.as_user == "someuser" + assert jr.ignore_concurrency_limits is True + assert jr.scheduler_requirements == {"proc": "x286", "maxmem": "640k"} + + +def test_job_req_init_fail(): + n = None + _job_req_init_fail( + n, 1, 1, "f", n, n, IncorrectParamsException("CPU count must be at least 1") + ) + _job_req_init_fail( + 0, 1, 1, "f", n, n, IncorrectParamsException("CPU count must be at least 1") + ) + _job_req_init_fail( + 1, n, 1, "f", n, n, IncorrectParamsException("memory in MB must be at least 1") + ) + _job_req_init_fail( + 1, 0, 1, "f", n, n, IncorrectParamsException("memory in MB must be at least 1") + ) + _job_req_init_fail( + 1, + 1, + n, + "f", + n, + n, + IncorrectParamsException("disk space in GB must be at least 1"), + ) + _job_req_init_fail( + 1, + 1, + 0, + "f", + n, + n, + IncorrectParamsException("disk space in 
GB must be at least 1"), + ) + _job_req_init_fail( + 1, + 1, + 1, + n, + n, + n, + IncorrectParamsException("Missing input parameter: client_group"), + ) + _job_req_init_fail( + 1, + 1, + 1, + " \t ", + n, + n, + IncorrectParamsException("Missing input parameter: client_group"), + ) + # as_user is optional, so this is the only possible failure mode + _job_req_init_fail( + 1, + 1, + 1, + "f", + "user\tname", + n, + IncorrectParamsException("as_user contains control characters"), + ) + _job_req_init_fail( + 1, + 1, + 1, + "f", + n, + {n: "a"}, + IncorrectParamsException( + "Missing input parameter: key in scheduler requirements structure" + ), + ) + _job_req_init_fail( + 1, + 1, + 1, + "f", + n, + {" \t ": "a"}, + IncorrectParamsException( + "Missing input parameter: key in scheduler requirements structure" + ), + ) + _job_req_init_fail( + 1, + 1, + 1, + "f", + n, + {"a": n}, + IncorrectParamsException( + "Missing input parameter: value for key 'a' in scheduler requirements structure" + ), + ) + _job_req_init_fail( + 1, + 1, + 1, + "f", + n, + {"a": " \t "}, + IncorrectParamsException( + "Missing input parameter: value for key 'a' in scheduler requirements structure" + ), + ) + + +def _job_req_init_fail(cpus, mem, disk, cgroup, user, reqs, expected): + with raises(Exception) as got: + JobRequirements(cpus, mem, disk, cgroup, False, user, False, reqs) + assert_exception_correct(got.value, expected) + + +def test_equals(): + c1 = "cligroupf" + c1a = "cligroupf" + c2 = "cligroupg" + t = True + f = False + u1 = "user1" + u1a = "user1" + u2 = "user2" + r1 = {"a": "b"} + r1a = {"a": "b"} + r2 = {"a": "c"} + + assert JobRequirements(1, 1, 1, c1) == JobRequirements(1, 1, 1, c1a) + assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) == JobRequirements( + 1, 1, 1, c1a, t, u1a, f, r1a + ) + + assert JobRequirements(1, 1, 1, c1) != JobRequirements(2, 1, 1, c1a) + assert JobRequirements(1, 1, 1, c1) != JobRequirements(1, 2, 1, c1a) + assert JobRequirements(1, 1, 1, c1) != 
JobRequirements(1, 1, 2, c1a) + assert JobRequirements(1, 1, 1, c1) != JobRequirements(1, 1, 1, c2) + assert JobRequirements(1, 1, 1, c1) != (1, 1, 1, c1) + + assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != JobRequirements( + 1, 1, 1, c1a, f, u1a, f, r1a + ) + assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != JobRequirements( + 1, 1, 1, c1a, t, u2, f, r1a + ) + assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != JobRequirements( + 1, 1, 1, c1a, t, u1a, t, r1a + ) + assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != JobRequirements( + 1, 1, 1, c1a, t, u1a, f, r2 + ) + assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != (1, 1, 1, c1a, t, u1a, f, r1a) + + +def test_hash(): + # hashes will change from instance to instance of the python interpreter, and therefore + # tests can't be written that directly test the hash value. See + # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + c1 = "cligroupf" + c1a = "cligroupf" + c2 = "cligroupg" + t = True + f = False + u1 = "user1" + u1a = "user1" + u2 = "user2" + r1 = {"a": "b"} + r1a = {"a": "b"} + r2 = {"a": "c"} + + assert hash(JobRequirements(1, 1, 1, c1)) == hash(JobRequirements(1, 1, 1, c1a)) + assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) == hash( + JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a) + ) + + assert hash(JobRequirements(1, 1, 1, c1)) != hash(JobRequirements(2, 1, 1, c1a)) + assert hash(JobRequirements(1, 1, 1, c1)) != hash(JobRequirements(1, 2, 1, c1a)) + assert hash(JobRequirements(1, 1, 1, c1)) != hash(JobRequirements(1, 1, 2, c1a)) + assert hash(JobRequirements(1, 1, 1, c1)) != hash(JobRequirements(1, 1, 1, c2)) + + assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) != hash( + JobRequirements(1, 1, 1, c1a, f, u1a, f, r1a) + ) + assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) != hash( + JobRequirements(1, 1, 1, c1a, t, u2, f, r1a) + ) + assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) != hash( + JobRequirements(1, 1, 1, c1a, t, u1a, t, r1a) + ) + 
assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) != hash( + JobRequirements(1, 1, 1, c1a, t, u1a, f, r2) + ) From aee87994b519d78450cdb187de9e975c656788fd Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Wed, 17 Mar 2021 20:43:47 -0700 Subject: [PATCH 028/109] Fix bug processing catalog client groups in CSV format (#333) Details: https://github.com/kbase/execution_engine2/issues/332 In short, the CSV inputs shown in the catalog UI are already split into separate strings before being sent to the catalog UI, so the split is not necesssary. Only accepting the first entry in the list causes everything except the client group to be ignored. --- lib/execution_engine2/utils/CatalogUtils.py | 2 +- test/tests_for_sdkmr/ee2_scheduler_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/execution_engine2/utils/CatalogUtils.py b/lib/execution_engine2/utils/CatalogUtils.py index ac58302f8..a9e614889 100644 --- a/lib/execution_engine2/utils/CatalogUtils.py +++ b/lib/execution_engine2/utils/CatalogUtils.py @@ -57,7 +57,7 @@ def normalize_job_settings(resources_request: List): json_resources_request = ", ".join(resources_request) return json.loads(json_resources_request) # CSV Format - rr = resources_request[0].split(",") # type: list + rr = resources_request # type: list rv = {"client_group": rr.pop(0)} for item in rr: if "=" not in item: diff --git a/test/tests_for_sdkmr/ee2_scheduler_test.py b/test/tests_for_sdkmr/ee2_scheduler_test.py index 79799f809..dbd2038b6 100644 --- a/test/tests_for_sdkmr/ee2_scheduler_test.py +++ b/test/tests_for_sdkmr/ee2_scheduler_test.py @@ -81,7 +81,7 @@ def test_create_submit_file(self): logging.info("Testing with complex-empty clientgroup") params = self._create_sample_params( - cgroups=["njs,request_cpus=8,request_memory=10GB,request_apples=5"] + cgroups=["njs", "request_cpus=8", "request_memory=10GB", "request_apples=5"] ) njs_sub = c._create_submit(params) From 330194d1f1c09c7dc9598fb7d6afedf50feb253d Mon Sep 
17 00:00:00 2001 From: MrCreosote Date: Thu, 18 Mar 2021 09:30:54 -0700 Subject: [PATCH 029/109] Add release notes for CSV bugfix (#334) --- RELEASE_NOTES.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 3c2bddee2..29e30ad61 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,9 @@ # execution_engine2 (ee2) release notes ========================================= +## 0.0.5 + * Fix a bug that caused job requirements from the catalog in CSV format to be ignored other + than the client group + ## 0.0.4 * Fix up tests * Remove dependency on slack From 79a1c994d7c5304cebf8fa9d4a2f9b54591e8227 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 18 Mar 2021 12:44:40 -0700 Subject: [PATCH 030/109] Bump version, add development instructions... (#335) ... and add an interactive HTML version of the spec. --- KIDLspec.css | 65 +++++ Makefile | 4 + README.md | 24 ++ execution_engine2.html | 1 + kbase.yml | 4 +- .../execution_engine2Impl.py | 231 ++++++++++++------ 6 files changed, 256 insertions(+), 73 deletions(-) create mode 100644 KIDLspec.css create mode 100644 execution_engine2.html diff --git a/KIDLspec.css b/KIDLspec.css new file mode 100644 index 000000000..4d2a3e3af --- /dev/null +++ b/KIDLspec.css @@ -0,0 +1,65 @@ +html, body { + height: 100%; +} +html { + display: table; + margin: auto; +} +body { + background-color: white; + color: #000; + font-family: Menlo, Monaco, Consolas, "Courier New", monospace; + font-weight: normal; + font-size: 12px; + margin: 0; + padding: 20px; + display: table-cell; + vertical-align: middle; +} +span.space { + display: inline-block; + width: 7px; +} +span.tab { + display: inline-block; + width: 30px; +} +span.keyword { + font-weight: bold; + color: #008; +} +span.name { + color: #000; !important +} +span.deprecated { + text-decoration: line-through; +} +span.annotation { + color: #303030; +} +span.primitive { + font-weight: bold; + color: #066; +} +div.body { + 
background-color: #ffffff; + color: #3e4349; + padding: 0 30px; +} +div.comment { + color: #A0A0A0; +} +a { + color: #004b6b; + text-decoration: none; +} +a:hover { + color: #6d4100; + text-decoration: underline; +} +:target { + background-color: #ffa; +} +div.body p, div.body dd, div.body li { + line-height: 1.4em; +} diff --git a/Makefile b/Makefile index 0e50f5547..848d03d5f 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,10 @@ compile: --pysrvname $(SERVICE_CAPS).$(SERVICE_CAPS)Server \ --pyimplname $(SERVICE_CAPS).$(SERVICE_CAPS)Impl; + kb-sdk compile $(SPEC_FILE) \ + --out . \ + --html \ + build: chmod +x $(SCRIPTS_DIR)/entrypoint.sh diff --git a/README.md b/README.md index 264834207..3dd26c40f 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,30 @@ pre-commit uninstall * Use a remote ssh debugger with the correct path mappings * Right click on the file you'd like to run and select run test +## Develop + +* To add a bugfix or new feature: + * Create a new feature branch, branching from `develop`. Ask a repo owner for help if + necessary. + * If you're a repo owner you can push directly to this branch. If not, make pull requests to + the branch as necessary. + * Add: + * Feature / bugfix code + * Tests + * Documentation, if applicable + * Release notes, if applicable + * See the PR template in `worksflows/pull_request_template.md` for details + * Once the feature is complete, create a PR from the feature branch to `develop` and request a + review from person with EE2 knowledge via the Github interface and via Slack. + * When the PR is approved, squash and merge into `develop` and delete the feature branch. +* To create a new release: + * Increment the version as per [semantic versioning](https://semver.org/) in `kbase.yml`. + * Update the release notes to the correct version, if necessary. + * Run `make compile`. + * Go through the process above to get the changes into `develop`. + * Make a PR from `develop` to `main`. 
+ * Once the PR is approved, merge (no squash) to `main`. + * Tag the merge commit in GitHub with the semantic version from `kbase.yml`. # Help diff --git a/execution_engine2.html b/execution_engine2.html new file mode 100644 index 000000000..aa8b431be --- /dev/null +++ b/execution_engine2.html @@ -0,0 +1 @@ +execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y or X/Y/Z, where
*X is the workspace name or id,
*Y is the object name or id,
*Z is the version, which is optional.
*/
typedefstringwsref;

/*
*time - the time the call was started;
*method - service defined in standard JSON RPC way, typically it's
*module name from spec-file followed by '.' and name of funcdef
*from spec-file corresponding to running method (e.g.
*'KBaseTrees.construct_species_tree' from trees service);
*job_id - job id if method is asynchronous (optional field).
*/
typedefstructure{
stringmethod;
job_idjob_id;
}
MethodCall;

/*
*call_stack - upstream calls details including nested service calls and
*parent jobs where calls are listed in order from outer to inner.
*/
typedefstructure{
list<MethodCall>call_stack;
stringrun_id;
}
RpcContext;

/*
*method - service defined in standard JSON RPC way, typically it's
*module name from spec-file followed by '.' and name of funcdef
*from spec-file corresponding to running method (e.g.
*'KBaseTrees.construct_species_tree' from trees service);
*params - the parameters of the method that performed this call;
*
*Optional parameters:
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*rpc_context - context of current method call including nested call
*history
*remote_url - run remote service call instead of local command line
*execution.
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance.
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*mapping<string, string> meta - user defined metadata to associate with
*the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 id of the parent of a batch job. Batch jobs will add
*this id to the EE2 database under the field "parent_job_id"
*/
typedefstructure{
stringmethod;
list<UnspecifiedObject>params;
stringservice_ver;
RpcContextrpc_context;
stringremote_url;
list<wsref>source_ws_objects;
stringapp_id;
mapping<string,string>meta;
intwsid;
stringparent_job_id;
}
RunJobParams;

/*
*Start a new job (a long-running method of a service registered in the ServiceRegistry).
*Such a job runs the Docker image for this service in script mode.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

typedefstructure{
intwsid;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in MB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*account_group: str = None # Someone else's account
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*/
typedefstructure{
intrequest_cpu;
intrequest_memory_mb;
intrequest_disk_mb;
intjob_priority;
stringaccount_group;
list<string>requirements_list;
stringclient_group;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to a record that is normally not allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set to true, the job will be set to running status, skipping the estimation step.
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - the strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
 * Get the current status of a job.
 */
funcdef check_job(CheckJobParams params) returns (JobState job_state) authentication required;

/*
 * parent_jobstate - state of parent job
 * child_jobstates - states of child jobs
 * IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
 */
typedef structure {
    JobState parent_jobstate;
    list<JobState> child_jobstates;
} CheckJobBatchResults;

/*
 * Get the current status of a parent job, and its children, if it has any.
 */
funcdef check_job_batch(CheckJobParams params) returns (CheckJobBatchResults) authentication required;

/*
 * job_states - states of jobs
 * could be mapping<job_id, JobState> or list<JobState>
 */
typedef structure {
    list<JobState> job_states;
} CheckJobsResults;

/*
 * As in check_job, exclude_fields strings can be used to exclude fields.
 * See CheckJobParams for allowed strings.
 *
 * return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
 */
typedef structure {
    list<job_id> job_ids;
    list<string> exclude_fields;
    boolean return_list;
} CheckJobsParams;

/* Get the states of the jobs listed in params.job_ids. */
funcdef check_jobs(CheckJobsParams params) returns (CheckJobsResults) authentication required;

/*
 * Check status of all jobs in a given workspace. Only checks jobs that have been associated
 * with a workspace at their creation.
 *
 * return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
 */
typedef structure {
    string workspace_id;
    list<string> exclude_fields;
    boolean return_list;
    boolean as_admin;
} CheckWorkspaceJobsParams;

/* Get the states of the jobs associated with the workspace given in params.workspace_id. */
funcdef check_workspace_jobs(CheckWorkspaceJobsParams params) returns (CheckJobsResults) authentication required;

/*
 * cancel_and_sigterm
 *
 * Reasons for why the job was cancelled.
 * Current Default is `terminated_by_user 0` so as to not update narrative client
 *     terminated_by_user = 0
 *     terminated_by_admin = 1
 *     terminated_by_automation = 2
 *
 * job_id job_id
 * @optional terminated_code
 */
typedef structure {
    job_id job_id;
    int terminated_code;
    boolean as_admin;
} CancelJobParams;

/*
 * Cancels a job. This results in the status becoming "terminated" with terminated_code 0
 * by default.
 */
funcdef cancel_job(CancelJobParams params) returns () authentication required;

/*
 * job_id - id of job running method
 * finished - indicates whether job is done (including error/cancel cases) or not
 * canceled - whether the job is canceled or not.
 * ujs_url - url of UserAndJobState service used by job service
 */
typedef structure {
    job_id job_id;
    boolean finished;
    boolean canceled;
    string ujs_url;
    boolean as_admin;
} CheckJobCanceledResult;

/*
 * Check whether a job has been canceled. This method is lightweight compared to check_job.
 */
funcdef check_job_canceled(CancelJobParams params) returns (CheckJobCanceledResult result) authentication required;

/* Result of get_job_status: the status string of the requested job. */
typedef structure {
    string status;
} GetJobStatusResult;

/* Parameters for get_job_status: the id of the job to query and the as_admin flag. */
typedef structure {
    job_id job_id;
    boolean as_admin;
} GetJobStatusParams;

/*
 * Just returns the status string for a job of a given id.
 */
funcdef get_job_status(GetJobStatusParams params) returns (GetJobStatusResult result) authentication required;

/*
 * Projection Fields
 *     user = StringField(required=True)
 *     authstrat = StringField(
 *         required=True, default="kbaseworkspace", validation=valid_authstrat
 *     )
 *     wsid = IntField(required=False)
 *     status = StringField(required=True, validation=valid_status)
 *     updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
 *     estimating = DateTimeField(default=None)  # Time when job began estimating
 *     running = DateTimeField(default=None)  # Time when job started
 *     # Time when job finished, errored out, or was terminated by the user/admin
 *     finished = DateTimeField(default=None)
 *     errormsg = StringField()
 *     msg = StringField()
 *     error = DynamicField()
 *
 *     terminated_code = IntField(validation=valid_termination_code)
 *     error_code = IntField(validation=valid_errorcode)
 *     scheduler_type = StringField()
 *     scheduler_id = StringField()
 *     scheduler_estimator_id = StringField()
 *     job_input = EmbeddedDocumentField(JobInput, required=True)
 *     job_output = DynamicField()
 */

/*
 * Results of check_jobs_date_range methods.
 *
 * jobs - the jobs matching the query, up to `limit` jobs.
 * count - the number of jobs returned.
 * query_count - the number of jobs that matched the filters.
 * filter - DEPRECATED - this field may change in the future. The filters that were
 *     applied to the jobs.
 * skip - the number of jobs that were skipped prior to beginning to return jobs.
 * projection - the list of fields included in the returned job. By default all fields.
 * limit - the maximum number of jobs returned.
 * sort_order - the order in which the results were sorted by the job ID - + for
 *     ascending, - for descending.
 *
 * TODO: DOCUMENT THE RETURN OF STATS mapping
 */
typedef structure {
    list<JobState> jobs;
    int count;
    int query_count;
    mapping<string, string> filter;
    int skip;
    list<string> projection;
    int limit;
    string sort_order;
} CheckJobsDateRangeResults;

/*
 * Check job for all jobs in a given date/time range for all users (Admin function)
 * Notes on start_time and end_time:
 *     These fields are designated as floats but floats, ints, and strings are all
 *     accepted. Times are determined as follows:
 *     - if the field is a float or a string that contains a float and only a float,
 *       the field value is treated as seconds since the epoch.
 *     - if the field is an int or a string that contains an int and only an int,
 *       the field value is treated as milliseconds since the epoch.
 *     - if the field is a string not matching the criteria above, it is treated as
 *       a date and time. Nearly any unambiguous format can be parsed.
 *
 * float start_time - Filter based on job creation timestamp since epoch
 * float end_time - Filter based on job creation timestamp since epoch
 * list<string> projection - A list of fields to include in the projection, default ALL
 *     See "Projection Fields" above
 * list<string> filter - DEPRECATED: this field may change or be removed in the future.
 *     A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
 *     terminated_code = 1
 * int limit - The maximum number of records to return
 * string user - The user whose job records will be returned. Optional. Default is the
 *     current user.
 * int offset - the number of jobs to skip before returning records.
 * boolean ascending - true to sort by job ID ascending, false descending.
 * boolean as_admin - true to run the query as an admin; user must have admin EE2
 *     permissions. Required if setting `user` to something other than your own.
 *     TODO: this seems to have no effect
 * @optional projection
 * @optional filter
 * @optional limit
 * @optional user
 * @optional offset
 * @optional ascending
 */
typedef structure {
    float start_time;
    float end_time;
    list<string> projection;
    list<string> filter;
    int limit;
    string user;
    int offset;
    boolean ascending;
    boolean as_admin;
} CheckJobsDateRangeParams;

/* Date-range job query; see CheckJobsDateRangeParams for the accepted filters. */
funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params) returns (CheckJobsDateRangeResults) authentication required;

/* Date-range job query; see CheckJobsDateRangeParams for the accepted filters. */
funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params) returns (CheckJobsDateRangeResults) authentication required;

/* Result of handle_held_job; held_job is an unspecified object. */
typedef structure {
    UnspecifiedObject held_job;
} HeldJob;

/*
 * Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
 */
funcdef handle_held_job(string cluster_id) returns (HeldJob) authentication required;

/*
 * Check if current user has ee2 admin rights.
 */
funcdef is_admin() returns (boolean) authentication required;

/*
 * str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
 */
typedef structure {
    string permission;
} AdminRolesResults;

/*
 * Check if current user has ee2 admin rights.
 * If so, return the type of rights and their roles.
 */
funcdef get_admin_permission() returns (AdminRolesResults) authentication required;

/*
 * Get a list of clientgroups manually extracted from the config file.
 */
funcdef get_client_groups() returns (list<string> client_groups) authentication none;
};
\ No newline at end of file diff --git a/kbase.yml b/kbase.yml index ddfbbc858..0cee4a309 100644 --- a/kbase.yml +++ b/kbase.yml @@ -8,8 +8,8 @@ service-language: python module-version: - 0.0.1 + 0.0.5 owners: - [bsadkhin, tgu2, wjriehl] + [bsadkhin, tgu2, wjriehl, gaprice] diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index fac90c96f..02842b88a 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -28,9 +28,9 @@ class execution_engine2: # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa - VERSION = "0.0.1" - GIT_URL = "https://bio-boris@github.com/kbase/execution_engine2" - GIT_COMMIT_HASH = "78ab4aaa17181deb81e06cd077c31bf6929b009f" + VERSION = "0.0.5" + GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" + GIT_COMMIT_HASH = "330194d1f1c09c7dc9598fb7d6afedf50feb253d" #BEGIN_CLASS_HEADER MONGO_COLLECTION = "jobs" @@ -804,45 +804,45 @@ def check_job_batch(self, ctx, params): of list of String, parameter "as_admin" of type "boolean" (@range [0,1]) :returns: instance of type "CheckJobBatchResults" (parent_job - state - of parent job job_states - states of child jobs aggregate_states - - count of all available child job states, even if they are zero) -> - structure: parameter "parent_job" of type "JobState" (job_id - - string - id of the job user - string - user who started the job - wsid - int - optional id of the workspace where the job is bound - authstrat - string - what strategy used to authenticate the job - job_input - object - inputs to the job (from the run_job call) ## - TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in - milliseconds 
when the job was created finished - int - timestamp - since epoch in milliseconds when the job was finished status - - string - status of the job. one of the following: created - job - has been created in the service estimating - an estimation job is - running to estimate resources required for the main job, and which - queue should be used queued - job is queued to be run running - - job is running on a worker node completed - job was completed - successfully error - job is no longer running, but failed with an - error terminated - job is no longer running, terminated either due - to user cancellation, admin cancellation, or some automated task - error_code - int - internal reason why the job is an error. one of - the following: 0 - unknown 1 - job crashed 2 - job terminated by - automation 3 - job ran over time limit 4 - job was missing its - automated output document 5 - job authentication token expired - errormsg - string - message (e.g. stacktrace) accompanying an - errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. 
+ of parent job job_states - states of child jobs IDEA: ADD + aggregate_states - count of all available child job states, even + if they are zero) -> structure: parameter "parent_jobstate" of + type "JobState" (job_id - string - id of the job user - string - + user who started the job wsid - int - optional id of the workspace + where the job is bound authstrat - string - what strategy used to + authenticate the job job_input - object - inputs to the job (from + the run_job call) ## TODO - verify updated - int - timestamp + since epoch in milliseconds of the last time the status was + updated running - int - timestamp since epoch in milliseconds of + when it entered the running state created - int - timestamp since + epoch in milliseconds when the job was created finished - int - + timestamp since epoch in milliseconds when the job was finished + status - string - status of the job. one of the following: created + - job has been created in the service estimating - an estimation + job is running to estimate resources required for the main job, + and which queue should be used queued - job is queued to be run + running - job is running on a worker node completed - job was + completed successfully error - job is no longer running, but + failed with an error terminated - job is no longer running, + terminated either due to user cancellation, admin cancellation, or + some automated task error_code - int - internal reason why the job + is an error. one of the following: 0 - unknown 1 - job crashed 2 - + job terminated by automation 3 - job ran over time limit 4 - job + was missing its automated output document 5 - job authentication + token expired errormsg - string - message (e.g. 
stacktrace) + accompanying an errored job error - object - the JSON-RPC error + package that accompanies the error code and message + terminated_code - int - internal reason why a job was terminated, + one of: 0 - user cancellation 1 - admin cancellation 2 - + terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - service defined in standard JSON RPC way, + typically it's module name from spec-file followed by '.' and name + of funcdef from spec-file corresponding to running method (e.g. 'KBaseTrees.construct_species_tree' from trees service); params - the parameters of the method that performed this call; Optional parameters: service_ver - specific version of deployed service, @@ -891,8 +891,8 @@ def check_job_batch(self, ctx, params): structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "job_states" of - list of type "JobState" (job_id - string - id of the job user - + parameter "terminated_code" of Long, parameter "child_jobstates" + of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs @@ -976,8 +976,7 @@ def check_job_batch(self, ctx, params): structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter 
"errormsg" of String, - parameter "terminated_code" of Long, parameter "aggregate_states" - of unspecified object + parameter "terminated_code" of Long """ # ctx is the context object # return variables are: returnVal @@ -1338,14 +1337,30 @@ def check_jobs_date_range_for_user(self, ctx, params): """ :param params: instance of type "CheckJobsDateRangeParams" (Check job for all jobs in a given date/time range for all users (Admin - function) float start_time; # Filter based on creation timestamp - since epoch float end_time; # Filter based on creation timestamp - since epoch list projection; # A list of fields to include - in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, - such as error_code=1, wsid=1234, terminated_code = 1 int limit; # - The maximum number of records to return string user; # Optional. - Defaults off of your token @optional projection @optional filter + function) Notes on start_time and end_time: These fields are + designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: - if the field is a + float or a string that contains a float and only a float, the + field value is treated as seconds since the epoch. - if the field + is an int or a string that contains an int and only an int, the + field value is treated as milliseconds since the epoch. - if the + field is a string not matching the criteria above, it is treated + as a date and time. Nearly any unambigous format can be parsed. + float start_time - Filter based on job creation timestamp since + epoch float end_time - Filter based on job creation timestamp + since epoch list projection - A list of fields to include + in the projection, default ALL See "Projection Fields" above + list filter - DEPRECATED: this field may change or be + removed in the future. 
A list of simple filters to "AND" together, + such as error_code=1, wsid=1234, terminated_code = 1 int limit - + The maximum number of records to return string user - The user + whose job records will be returned. Optional. Default is the + current user. int offset - the number of jobs to skip before + returning records. boolean ascending - true to sort by job ID + ascending, false descending. boolean as_admin - true to run the + query as an admin; user must have admin EE2 permissions. Required + if setting `user` to something other than your own. TODO: this + seems to have no effect @optional projection @optional filter @optional limit @optional user @optional offset @optional ascending) -> structure: parameter "start_time" of Double, parameter "end_time" of Double, parameter "projection" of list of @@ -1353,9 +1368,34 @@ def check_jobs_date_range_for_user(self, ctx, params): Long, parameter "user" of String, parameter "offset" of Long, parameter "ascending" of type "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs could be mapping or list) -> - structure: parameter "job_states" of list of type "JobState" + :returns: instance of type "CheckJobsDateRangeResults" (Projection + Fields user = StringField(required=True) authstrat = StringField( + required=True, default="kbaseworkspace", + validation=valid_authstrat ) wsid = IntField(required=False) + status = StringField(required=True, validation=valid_status) + updated = DateTimeField(default=datetime.datetime.utcnow, + autonow=True) estimating = DateTimeField(default=None) # Time + when job began estimating running = DateTimeField(default=None) # + Time when job started # Time when job finished, errored out, or + was terminated by the user/admin finished = + DateTimeField(default=None) errormsg = StringField() msg = + StringField() error = DynamicField() terminated_code = + 
IntField(validation=valid_termination_code) error_code = + IntField(validation=valid_errorcode) scheduler_type = + StringField() scheduler_id = StringField() scheduler_estimator_id + = StringField() job_input = EmbeddedDocumentField(JobInput, + required=True) job_output = DynamicField() /* /* Results of + check_jobs_date_range methods. jobs - the jobs matching the query, + up to `limit` jobs. count - the number of jobs returned. + query_count - the number of jobs that matched the filters. filter + - DEPRECATED - this field may change in the future. The filters + that were applied to the jobs. skip - the number of jobs that were + skipped prior to beginning to return jobs. projection - the list + of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. sort_order - the + order in which the results were sorted by the job ID - + for + ascending, - for descending. TODO: DOCUMENT THE RETURN OF STATS + mapping) -> structure: parameter "jobs" of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the @@ -1440,7 +1480,11 @@ def check_jobs_date_range_for_user(self, ctx, params): structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + parameter "terminated_code" of Long, parameter "count" of Long, + parameter "query_count" of Long, parameter "filter" of mapping + from String to String, parameter "skip" of Long, parameter + "projection" of list of String, parameter "limit" of Long, + parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal @@ -1473,14 +1517,30 @@ def check_jobs_date_range_for_all(self, ctx, params): """ :param params: instance of 
type "CheckJobsDateRangeParams" (Check job for all jobs in a given date/time range for all users (Admin - function) float start_time; # Filter based on creation timestamp - since epoch float end_time; # Filter based on creation timestamp - since epoch list projection; # A list of fields to include - in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, - such as error_code=1, wsid=1234, terminated_code = 1 int limit; # - The maximum number of records to return string user; # Optional. - Defaults off of your token @optional projection @optional filter + function) Notes on start_time and end_time: These fields are + designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: - if the field is a + float or a string that contains a float and only a float, the + field value is treated as seconds since the epoch. - if the field + is an int or a string that contains an int and only an int, the + field value is treated as milliseconds since the epoch. - if the + field is a string not matching the criteria above, it is treated + as a date and time. Nearly any unambigous format can be parsed. + float start_time - Filter based on job creation timestamp since + epoch float end_time - Filter based on job creation timestamp + since epoch list projection - A list of fields to include + in the projection, default ALL See "Projection Fields" above + list filter - DEPRECATED: this field may change or be + removed in the future. A list of simple filters to "AND" together, + such as error_code=1, wsid=1234, terminated_code = 1 int limit - + The maximum number of records to return string user - The user + whose job records will be returned. Optional. Default is the + current user. int offset - the number of jobs to skip before + returning records. boolean ascending - true to sort by job ID + ascending, false descending. 
boolean as_admin - true to run the + query as an admin; user must have admin EE2 permissions. Required + if setting `user` to something other than your own. TODO: this + seems to have no effect @optional projection @optional filter @optional limit @optional user @optional offset @optional ascending) -> structure: parameter "start_time" of Double, parameter "end_time" of Double, parameter "projection" of list of @@ -1488,9 +1548,34 @@ def check_jobs_date_range_for_all(self, ctx, params): Long, parameter "user" of String, parameter "offset" of Long, parameter "ascending" of type "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs could be mapping or list) -> - structure: parameter "job_states" of list of type "JobState" + :returns: instance of type "CheckJobsDateRangeResults" (Projection + Fields user = StringField(required=True) authstrat = StringField( + required=True, default="kbaseworkspace", + validation=valid_authstrat ) wsid = IntField(required=False) + status = StringField(required=True, validation=valid_status) + updated = DateTimeField(default=datetime.datetime.utcnow, + autonow=True) estimating = DateTimeField(default=None) # Time + when job began estimating running = DateTimeField(default=None) # + Time when job started # Time when job finished, errored out, or + was terminated by the user/admin finished = + DateTimeField(default=None) errormsg = StringField() msg = + StringField() error = DynamicField() terminated_code = + IntField(validation=valid_termination_code) error_code = + IntField(validation=valid_errorcode) scheduler_type = + StringField() scheduler_id = StringField() scheduler_estimator_id + = StringField() job_input = EmbeddedDocumentField(JobInput, + required=True) job_output = DynamicField() /* /* Results of + check_jobs_date_range methods. jobs - the jobs matching the query, + up to `limit` jobs. count - the number of jobs returned. 
+ query_count - the number of jobs that matched the filters. filter + - DEPRECATED - this field may change in the future. The filters + that were applied to the jobs. skip - the number of jobs that were + skipped prior to beginning to return jobs. projection - the list + of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. sort_order - the + order in which the results were sorted by the job ID - + for + ascending, - for descending. TODO: DOCUMENT THE RETURN OF STATS + mapping) -> structure: parameter "jobs" of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the @@ -1575,7 +1660,11 @@ def check_jobs_date_range_for_all(self, ctx, params): structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + parameter "terminated_code" of Long, parameter "count" of Long, + parameter "query_count" of Long, parameter "filter" of mapping + from String to String, parameter "skip" of Long, parameter + "projection" of list of String, parameter "limit" of Long, + parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal @@ -1654,7 +1743,7 @@ def get_admin_permission(self, ctx): """ Check if current user has ee2 admin rights. 
If so, return the type of rights and their roles - :returns: instance of type "AdminRolesResults" (str permission; # One + :returns: instance of type "AdminRolesResults" (str permission - One of 'r|w|x' (('read' | 'write' | 'none'))) -> structure: parameter "permission" of String """ From 613a01f1ec3f855374f1fdbeebfd75f28b0301b1 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Fri, 19 Mar 2021 11:16:54 -0700 Subject: [PATCH 031/109] DATAUP-389 Add job submission parameters class (#331) * Add job submission parameters class * run black * Minor cleanup * Move debug mode to job requirements After a bit of cogitation it makes more sense there since it applies to how the job is run. In any case, it's stored in the catalog along with all the other params in the class, so it also makes sense practically. Also made the client group regex have a 'no preference' default value so the scheduler wrapper can decide on a sane default for the scheduler (for Condor the default is to use a regex, for example, which to me isn't intuitive, and I feel iffy about making the default True).
* run black * DRY up some code --- .../sdk/job_submission_parameters.py | 145 +++++++-- .../job_submission_parameters_test.py | 303 +++++++++++++++--- 2 files changed, 381 insertions(+), 67 deletions(-) diff --git a/lib/execution_engine2/sdk/job_submission_parameters.py b/lib/execution_engine2/sdk/job_submission_parameters.py index e7dff15ae..280fb36c6 100644 --- a/lib/execution_engine2/sdk/job_submission_parameters.py +++ b/lib/execution_engine2/sdk/job_submission_parameters.py @@ -3,12 +3,19 @@ """ from maps import FrozenMap -from typing import Dict -from execution_engine2.utils.arg_processing import check_string as _check_string +from typing import Dict, List, Union +from execution_engine2.utils.arg_processing import ( + check_string as _check_string, + not_falsy as _not_falsy, +) +from execution_engine2.utils.user_info import UserCreds +from execution_engine2.utils.application_info import AppInfo from execution_engine2.exceptions import IncorrectParamsException -def _gt_zero(num: int, name: str) -> int: +def _gt_zero(num: int, name: str, optional=False) -> Union[int, None]: + if num is None and optional: + return None if num is None or num < 1: raise IncorrectParamsException(f"{name} must be at least 1") return num @@ -25,10 +32,11 @@ def __init__( memory_MB: int, disk_GB: int, client_group: str, - client_group_regex: bool = False, + client_group_regex: Union[bool, None] = None, as_user: str = None, ignore_concurrency_limits: bool = False, scheduler_requirements: Dict[str, str] = None, + debug_mode: bool = False, ): """ Create the job requirements. @@ -38,12 +46,13 @@ def __init__( disk_GB - the amount of disk space, in GB, required for the job. client_group - the client group in which the job will run. client_group_regex - whether to treat the client group string as a regular expression - that can match multiple client groups. + that can match multiple client groups. Pass None for no preference. 
as_user - run the job as an alternate user; take the user's username. ignore_concurrency_limits - allow the user to run this job even if the user's maximum job count has already been reached. scheduler_requirements - arbitrary requirements for the scheduler passed as key/value pairs. Requires knowledge of the scheduler API. + debug_mode - whether to run the job in debug mode. """ self.cpus = _gt_zero(cpus, "CPU count") self.memory_MB = _gt_zero(memory_MB, "memory in MB") @@ -59,19 +68,24 @@ def __init__( value, f"value for key '{key}' in scheduler requirements structure" ) self.scheduler_requirements = FrozenMap(sr) + self.debug_mode = debug_mode + + def _params(self): + return ( + self.cpus, + self.memory_MB, + self.disk_GB, + self.client_group, + self.client_group_regex, + self.as_user, + self.ignore_concurrency_limits, + self.scheduler_requirements, + self.debug_mode, + ) def __eq__(self, other): if type(self) == type(other): - return ( - self.cpus, - self.memory_MB, - self.disk_GB, - self.client_group, - self.client_group_regex, - self.as_user, - self.ignore_concurrency_limits, - self.scheduler_requirements, - ) == ( + return self._params() == ( other.cpus, other.memory_MB, other.disk_GB, @@ -80,19 +94,98 @@ def __eq__(self, other): other.as_user, other.ignore_concurrency_limits, other.scheduler_requirements, + other.debug_mode, ) return False def __hash__(self): - return hash( - ( - self.cpus, - self.memory_MB, - self.disk_GB, - self.client_group, - self.client_group_regex, - self.as_user, - self.ignore_concurrency_limits, - self.scheduler_requirements, - ) + return hash(self._params()) + + +# move this function somewhere else? 
+def _is_valid_UPA(upa: str) -> (str, bool): + # returns an empty string if not a valid upa + if upa is None or not upa.strip(): + return "", False + parts = [p.strip() for p in upa.split("/")] + if not len(parts) == 3: + return "", False + for p in parts: + try: + int(p) + except ValueError: + return "", False + return "/".join(parts), True + + +class JobSubmissionParameters: + """ + Parameters for submitting a job to a job scheduler. + """ + + def __init__( + self, + job_id: str, + app_info: AppInfo, + job_reqs: JobRequirements, + user_creds: UserCreds, + parent_job_id: str = None, + wsid: int = None, + source_ws_objects: List[str] = None, + ): + """ + Create the parameters. + + job_id - the ID of the job. + app_info - information about the application to be run. + job_reqs - requirements for the job. + user_creds - user credentials. + parent_job_id - the ID of the parent job to this job, if any. + wsid - the ID of the workspace with which the job is associated, if any. + source_ws_objects - workspace objects that are part of the job input. 
+ """ + self.job_id = _check_string(job_id, "job_id") + self.app_info = _not_falsy(app_info, "app_info") + self.job_reqs = _not_falsy(job_reqs, "job_reqs") + self.user_creds = _not_falsy(user_creds, "user_creds") + self.parent_job_id = _check_string( + parent_job_id, "parent_job_id", optional=True ) + self.wsid = _gt_zero(wsid, "wsid", optional=True) + source_ws_objects = source_ws_objects if source_ws_objects else [] + for i, ref in enumerate(source_ws_objects): + upa, is_valid = _is_valid_UPA(ref) + if not is_valid: + raise IncorrectParamsException( + f"source_ws_objects index {i}, '{ref}', " + + "is not a valid Unique Permanent Address" + ) + source_ws_objects[i] = upa + self.source_ws_objects = tuple(source_ws_objects) + + def _params(self): + return ( + self.job_id, + self.app_info, + self.job_reqs, + self.user_creds, + self.parent_job_id, + self.wsid, + self.source_ws_objects, + ) + + def __eq__(self, other): + if type(self) == type(other): + return self._params() == ( + other.job_id, + other.app_info, + other.job_reqs, + other.user_creds, + other.parent_job_id, + other.wsid, + other.source_ws_objects, + ) + return False + + def __hash__(self): + return hash(self._params()) diff --git a/test/tests_for_sdkmr/job_submission_parameters_test.py b/test/tests_for_sdkmr/job_submission_parameters_test.py index 4f7e0bd3f..76f816e2f 100644 --- a/test/tests_for_sdkmr/job_submission_parameters_test.py +++ b/test/tests_for_sdkmr/job_submission_parameters_test.py @@ -1,5 +1,10 @@ from pytest import raises -from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.sdk.job_submission_parameters import ( + JobRequirements, + JobSubmissionParameters, +) +from execution_engine2.utils.user_info import UserCreds +from execution_engine2.utils.application_info import AppInfo from execution_engine2.exceptions import IncorrectParamsException from utils_shared.test_utils import assert_exception_correct @@ -11,10 +16,11 @@ def 
test_job_req_init_minimal(): assert jr.memory_MB == 1 assert jr.disk_GB == 1 assert jr.client_group == "njs" - assert jr.client_group_regex is False + assert jr.client_group_regex is None assert jr.as_user is None assert jr.ignore_concurrency_limits is False assert jr.scheduler_requirements == {} + assert jr.debug_mode is False def test_job_req_init_maximal(): @@ -27,6 +33,7 @@ def test_job_req_init_maximal(): "someuser", True, {"proc": "x286", "maxmem": "640k"}, + True, ) assert jr.cpus == 6 @@ -37,6 +44,7 @@ def test_job_req_init_maximal(): assert jr.as_user == "someuser" assert jr.ignore_concurrency_limits is True assert jr.scheduler_requirements == {"proc": "x286", "maxmem": "640k"} + assert jr.debug_mode is True def test_job_req_init_fail(): @@ -151,7 +159,7 @@ def _job_req_init_fail(cpus, mem, disk, cgroup, user, reqs, expected): assert_exception_correct(got.value, expected) -def test_equals(): +def test_job_req_equals(): c1 = "cligroupf" c1a = "cligroupf" c2 = "cligroupg" @@ -164,33 +172,27 @@ def test_equals(): r1a = {"a": "b"} r2 = {"a": "c"} - assert JobRequirements(1, 1, 1, c1) == JobRequirements(1, 1, 1, c1a) - assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) == JobRequirements( - 1, 1, 1, c1a, t, u1a, f, r1a - ) + jr_sm = JobRequirements(1, 1, 1, c1) + jr_lg = JobRequirements(1, 1, 1, c1, t, u1, f, r1, t) - assert JobRequirements(1, 1, 1, c1) != JobRequirements(2, 1, 1, c1a) - assert JobRequirements(1, 1, 1, c1) != JobRequirements(1, 2, 1, c1a) - assert JobRequirements(1, 1, 1, c1) != JobRequirements(1, 1, 2, c1a) - assert JobRequirements(1, 1, 1, c1) != JobRequirements(1, 1, 1, c2) - assert JobRequirements(1, 1, 1, c1) != (1, 1, 1, c1) + assert jr_sm == JobRequirements(1, 1, 1, c1a) + assert jr_lg == JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a, t) - assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != JobRequirements( - 1, 1, 1, c1a, f, u1a, f, r1a - ) - assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != JobRequirements( - 1, 1, 1, c1a, t, u2, f, 
r1a - ) - assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != JobRequirements( - 1, 1, 1, c1a, t, u1a, t, r1a - ) - assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != JobRequirements( - 1, 1, 1, c1a, t, u1a, f, r2 - ) - assert JobRequirements(1, 1, 1, c1, t, u1, f, r1) != (1, 1, 1, c1a, t, u1a, f, r1a) + assert jr_sm != JobRequirements(2, 1, 1, c1a) + assert jr_sm != JobRequirements(1, 2, 1, c1a) + assert jr_sm != JobRequirements(1, 1, 2, c1a) + assert jr_sm != JobRequirements(1, 1, 1, c2) + assert jr_sm != (1, 1, 1, c1) + + assert jr_lg != JobRequirements(1, 1, 1, c1a, f, u1a, f, r1a, t) + assert jr_lg != JobRequirements(1, 1, 1, c1a, t, u2, f, r1a, t) + assert jr_lg != JobRequirements(1, 1, 1, c1a, t, u1a, t, r1a, t) + assert jr_lg != JobRequirements(1, 1, 1, c1a, t, u1a, f, r2, t) + assert jr_lg != JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a, f) + assert jr_lg != (1, 1, 1, c1a, t, u1a, f, r1a, t) -def test_hash(): +def test_job_req_hash(): # hashes will change from instance to instance of the python interpreter, and therefore # tests can't be written that directly test the hash value. 
See # https://docs.python.org/3/reference/datamodel.html#object.__hash__ @@ -206,25 +208,244 @@ def test_hash(): r1a = {"a": "b"} r2 = {"a": "c"} - assert hash(JobRequirements(1, 1, 1, c1)) == hash(JobRequirements(1, 1, 1, c1a)) - assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) == hash( - JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a) + jr_sm = JobRequirements(1, 1, 1, c1) + jr_lg = JobRequirements(1, 1, 1, c1, t, u1, f, r1, t) + + assert hash(jr_sm) == hash(JobRequirements(1, 1, 1, c1a)) + assert hash(jr_lg) == hash(JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a, t)) + + assert hash(jr_sm) != hash(JobRequirements(2, 1, 1, c1a)) + assert hash(jr_sm) != hash(JobRequirements(1, 2, 1, c1a)) + assert hash(jr_sm) != hash(JobRequirements(1, 1, 2, c1a)) + assert hash(jr_sm) != hash(JobRequirements(1, 1, 1, c2)) + + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, f, u1a, f, r1a, t)) + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, t, u2, f, r1a, t)) + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, t, u1a, t, r1a, t)) + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, t, u1a, f, r2, t)) + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a, f)) + + +def test_job_sub_init_minimal(): + jsp = JobSubmissionParameters( + "jobid", + AppInfo("a.b", "a/x"), + JobRequirements(6, 7, 4, "cligroup"), + UserCreds("user", "tokeytoken"), ) - assert hash(JobRequirements(1, 1, 1, c1)) != hash(JobRequirements(2, 1, 1, c1a)) - assert hash(JobRequirements(1, 1, 1, c1)) != hash(JobRequirements(1, 2, 1, c1a)) - assert hash(JobRequirements(1, 1, 1, c1)) != hash(JobRequirements(1, 1, 2, c1a)) - assert hash(JobRequirements(1, 1, 1, c1)) != hash(JobRequirements(1, 1, 1, c2)) + assert jsp.job_id == "jobid" + assert jsp.app_info == AppInfo("a.b", "a/x") + assert jsp.job_reqs == JobRequirements(6, 7, 4, "cligroup") + assert jsp.user_creds == UserCreds("user", "tokeytoken") + assert jsp.parent_job_id is None + assert jsp.wsid is None + assert 
jsp.source_ws_objects == tuple() - assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) != hash( - JobRequirements(1, 1, 1, c1a, f, u1a, f, r1a) + +def test_job_sub_init_maximal(): + jsp = JobSubmissionParameters( + " jobid \t ", + AppInfo("a.b", "a/x"), + JobRequirements(6, 7, 4, "cligroup"), + UserCreds("user", "tokeytoken"), + " parentid \t ", + 1, + [" 1 /\t2 / 4", "6/7/8"], ) - assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) != hash( - JobRequirements(1, 1, 1, c1a, t, u2, f, r1a) + + assert jsp.job_id == "jobid" + assert jsp.app_info == AppInfo("a.b", "a/x") + assert jsp.job_reqs == JobRequirements(6, 7, 4, "cligroup") + assert jsp.user_creds == UserCreds("user", "tokeytoken") + assert jsp.parent_job_id == "parentid" + assert jsp.wsid == 1 + assert jsp.source_ws_objects == ("1/2/4", "6/7/8") + + +def test_job_sub_init_fail(): + n = None + j = "jobby job job" + a = AppInfo("a.b", "a/x") + r = JobRequirements(6, 7, 4, "cligroup") + u = UserCreds("user", "tokeytoken") + + _job_sub_init_fail( + n, a, r, u, n, n, n, IncorrectParamsException("Missing input parameter: job_id") ) - assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) != hash( - JobRequirements(1, 1, 1, c1a, t, u1a, t, r1a) + _job_sub_init_fail( + " \t ", + a, + r, + u, + n, + n, + n, + IncorrectParamsException("Missing input parameter: job_id"), ) - assert hash(JobRequirements(1, 1, 1, c1, t, u1, f, r1)) != hash( - JobRequirements(1, 1, 1, c1a, t, u1a, f, r2) + _job_sub_init_fail( + j, + n, + r, + u, + n, + n, + n, + ValueError("app_info cannot be a value that evaluates to false"), ) + _job_sub_init_fail( + j, + a, + n, + u, + n, + n, + n, + ValueError("job_reqs cannot be a value that evaluates to false"), + ) + _job_sub_init_fail( + j, + a, + r, + n, + n, + n, + n, + ValueError("user_creds cannot be a value that evaluates to false"), + ) + # the only way to get parent id to to fail is with a control char + _job_sub_init_fail( + j, + a, + r, + u, + "par\bent", + n, + n, + 
IncorrectParamsException("parent_job_id contains control characters"), + ) + _job_sub_init_fail( + j, a, r, u, n, 0, n, IncorrectParamsException("wsid must be at least 1") + ) + _job_sub_init_fail( + j, + a, + r, + u, + n, + n, + ["1/2/3", n], + IncorrectParamsException( + "source_ws_objects index 1, 'None', is not a valid Unique Permanent Address" + ), + ) + _job_sub_init_fail( + j, + a, + r, + u, + n, + n, + ["1/2/3", " \t "], + IncorrectParamsException( + "source_ws_objects index 1, ' \t ', is not a valid Unique Permanent Address" + ), + ) + for o in ["1/2", "1/2/", "/1/2", "1/2/3/4", "x/2/3", "1/x/3", "1/2/x"]: + _job_sub_init_fail( + j, + a, + r, + u, + n, + n, + [o], + IncorrectParamsException( + f"source_ws_objects index 0, '{o}', is not a valid Unique Permanent Address" + ), + ) + + +def _job_sub_init_fail(jobid, appinfo, jobreq, usercred, parentid, wsid, wso, expected): + with raises(Exception) as got: + JobSubmissionParameters(jobid, appinfo, jobreq, usercred, parentid, wsid, wso) + assert_exception_correct(got.value, expected) + + +def test_job_sub_equals(): + j1 = "jobby job job" + j1a = "jobby job job" + j2 = "jobby job job JOB" + a1 = AppInfo("a.b", "a/x") + a1a = AppInfo("a.b", "a/x") + a2 = AppInfo("a.b", "a/y") + r1 = JobRequirements(6, 7, 4, "cligroup") + r1a = JobRequirements(6, 7, 4, "cligroup") + r2 = JobRequirements(6, 7, 4, "cligroup2") + u1 = UserCreds("user", "tokeytoken") + u1a = UserCreds("user", "tokeytoken") + u2 = UserCreds("user", "tokeytoken2") + p1 = "I'm so miserable and you just don't care" + p1a = "I'm so miserable and you just don't care" + p2 = "Oh do shut up Portia" + w1 = ["1/2/3"] + w1a = ["1/2/3"] + w2 = ["1/2/4"] + + JSP = JobSubmissionParameters + jsp_sm = JSP(j1, a1, r1, u1) + jsp_lg = JSP(j1, a1, r1, u1, p1, 1, w1) + + assert jsp_sm == JSP(j1a, a1a, r1a, u1a) + assert jsp_lg == JSP(j1a, a1a, r1a, u1a, p1a, 1, w1a) + + assert jsp_sm != JSP(j2, a1a, r1a, u1a) + assert jsp_sm != JSP(j1a, a2, r1a, u1a) + assert jsp_sm != 
JSP(j1a, a1a, r2, u1a) + assert jsp_sm != JSP(j1a, a1a, r1a, u2) + assert jsp_sm != (j1a, a1a, r1a, u1a) + + assert jsp_lg != JSP(j1a, a1a, r1a, u1a, p2, 1, w1a) + assert jsp_lg != JSP(j1a, a1a, r1a, u1a, p1a, 2, w1a) + assert jsp_lg != JSP(j1a, a1a, r1a, u1a, p1a, 1, w2) + assert jsp_lg != (j1a, a1a, r1a, u1a, p1a, 1, w1a) + + +def test_job_sub_hash(): + # hashes will change from instance to instance of the python interpreter, and therefore + # tests can't be written that directly test the hash value. See + # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + j1 = "jobby job job" + j1a = "jobby job job" + j2 = "jobby job job JOB" + a1 = AppInfo("a.b", "a/x") + a1a = AppInfo("a.b", "a/x") + a2 = AppInfo("a.b", "a/y") + r1 = JobRequirements(6, 7, 4, "cligroup") + r1a = JobRequirements(6, 7, 4, "cligroup") + r2 = JobRequirements(6, 7, 4, "cligroup2") + u1 = UserCreds("user", "tokeytoken") + u1a = UserCreds("user", "tokeytoken") + u2 = UserCreds("user", "tokeytoken2") + p1 = "I'm so miserable and you just don't care" + p1a = "I'm so miserable and you just don't care" + p2 = "Oh do shut up Portia" + w1 = ["1/2/3"] + w1a = ["1/2/3"] + w2 = ["1/2/4"] + + JSP = JobSubmissionParameters + jsp_sm = JSP(j1, a1, r1, u1) + jsp_lg = JSP(j1, a1, r1, u1, p1, 1, w1) + + assert hash(jsp_sm) == hash(JSP(j1a, a1a, r1a, u1a)) + assert hash(jsp_lg) == hash(JSP(j1a, a1a, r1a, u1a, p1a, 1, w1a)) + + assert hash(jsp_sm) != hash(JSP(j2, a1a, r1a, u1a)) + assert hash(jsp_sm) != hash(JSP(j1a, a2, r1a, u1a)) + assert hash(jsp_sm) != hash(JSP(j1a, a1a, r2, u1a)) + assert hash(jsp_sm) != hash(JSP(j1a, a1a, r1a, u2)) + + assert hash(jsp_lg) != hash(JSP(j1a, a1a, r1a, u1a, p2, 1, w1a)) + assert hash(jsp_lg) != hash(JSP(j1a, a1a, r1a, u1a, p1a, 2, w1a)) + assert hash(jsp_lg) != hash(JSP(j1a, a1a, r1a, u1a, p1a, 1, w2)) From 013d2c0f9b81e17afa79dfa7bf90566cb4dd499b Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 23 Mar 2021 13:46:24 -0700 Subject: [PATCH 032/109] DATAUP-389: 
Add a job requirements checker (#336) * Add a job requirements checker To be used in the job resolver prior to creating the requirements. Also normalize booleans and document how EE2 interacts with catalog client group specifications. * run black * Make check_params return normalized values Makes it easier for functions to use those values later * run black * Rename as_user to bill_to_user Better description of what the field actually does Also fix a typo in the test deploy.cfg file --- README.md | 28 +++ .../sdk/job_submission_parameters.py | 79 ++++++- test/deploy.cfg | 2 +- .../job_submission_parameters_test.py | 194 +++++++++++++++++- 4 files changed, 287 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 3dd26c40f..5f433d8ec 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,34 @@ pre-commit uninstall * Once the PR is apporoved, merge (no squash) to `main`. * Tag the merge commit in GitHub with the semantic version from `kbase.yml`. +## KBase Catalog interactions + +### Client Groups + +EE2 understands client group specifications in JSON and CSV formats. Both formats have special +fields in common: +* `request_cpus` - the number of CPUs to request +* `request_memory` - the amount of memory, in MB, to request +* `request_disk` - the amount of memory, in GB, to request +* `client_group_regex` - treat the client group (see below) as a regular expression +* `debug_mode` - run the job in debug mode + +The client group is handled differently for JSON and CSV: +* The JSON format has the `clientgroup` field, which is optional. +* The CSV format must have the client group in the first 'column' of the CSV and is required. The + remainder of the 'columns' must be in `key=value` format. + +Any fields other than the above are sent on to the scheduler as key value pairs. 
+ +For example, to set the client group to `bigmem`, request 32 CPUs, 64GB of memory, and 1TB of disk, +the following would be entered in the catalog UI: +* CSV: `bigmem, request_cpus=32, request_memory=64000, request_disk=1000` +* JSON: `{"client_group": "bigmem", "request_cpus" : "32", "request_memory" : "64000", "request_disk" : "1000"}` + +Note that the representation of this data in the catalog API is idiosyncratic - both the JSON and +CSV data are split by commas into parts. EE2 will detect JSON entries and reconsitute them before +deserialization. + # Help Contact @Tianhao-Gu, @bio_boris, @briehl diff --git a/lib/execution_engine2/sdk/job_submission_parameters.py b/lib/execution_engine2/sdk/job_submission_parameters.py index 280fb36c6..519a35749 100644 --- a/lib/execution_engine2/sdk/job_submission_parameters.py +++ b/lib/execution_engine2/sdk/job_submission_parameters.py @@ -33,7 +33,7 @@ def __init__( disk_GB: int, client_group: str, client_group_regex: Union[bool, None] = None, - as_user: str = None, + bill_to_user: str = None, ignore_concurrency_limits: bool = False, scheduler_requirements: Dict[str, str] = None, debug_mode: bool = False, @@ -47,7 +47,7 @@ def __init__( client_group - the client group in which the job will run. client_group_regex - whether to treat the client group string as a regular expression that can match multiple client groups. Pass None for no preference. - as_user - run the job as an alternate user; take the user's username. + bill_to_user - bill the job to an alternate user; takes the user's username. ignore_concurrency_limits - allow the user to run this job even if the user's maximum job count has already been reached. 
scheduler_requirements - arbitrary requirements for the scheduler passed as key/value @@ -58,17 +58,76 @@ def __init__( self.memory_MB = _gt_zero(memory_MB, "memory in MB") self.disk_GB = _gt_zero(disk_GB, "disk space in GB") self.client_group = _check_string(client_group, "client_group") - self.client_group_regex = client_group_regex - self.as_user = _check_string(as_user, "as_user", optional=True) - self.ignore_concurrency_limits = ignore_concurrency_limits - sr = scheduler_requirements if scheduler_requirements else {} + self.client_group_regex = ( + None if client_group_regex is None else bool(client_group_regex) + ) + self.bill_to_user = _check_string(bill_to_user, "bill_to_user", optional=True) + self.ignore_concurrency_limits = bool(ignore_concurrency_limits) + self.scheduler_requirements = FrozenMap( + self._check_scheduler_requirements(scheduler_requirements) + ) + self.debug_mode = bool(debug_mode) + + @classmethod + def _check_scheduler_requirements(cls, schd_reqs): + sr = schd_reqs if schd_reqs else {} for key, value in sr.items(): _check_string(key, "key in scheduler requirements structure") _check_string( value, f"value for key '{key}' in scheduler requirements structure" ) - self.scheduler_requirements = FrozenMap(sr) - self.debug_mode = debug_mode + return sr + + @classmethod + def check_parameters( + cls, + cpus: int = None, + memory_MB: int = None, + disk_GB: int = None, + client_group: str = None, + client_group_regex: Union[bool, None] = None, + bill_to_user: str = None, + ignore_concurrency_limits: bool = False, + scheduler_requirements: Dict[str, str] = None, + debug_mode: bool = False, + ): + """ + Test that a set of parameters are legal and returns normalized parmeters. + All arguments are optional - parameters required for initializing the class may be missing. + + cpus - the number of CPUs required for the job. + memory_MB - the amount of memory, in MB, required for the job. 
+ disk_GB - the amount of disk space, in GB, required for the job. + client_group - the client group in which the job will run. + client_group_regex - whether to treat the client group string as a regular expression + that can match multiple client groups. + bill_to_user - bill the job to an alternate user; takes the user's username. + ignore_concurrency_limits - allow the user to run this job even if the user's maximum + job count has already been reached. + scheduler_requirements - arbitrary requirements for the scheduler passed as key/value + pairs. Requires knowledge of the scheduler API. + """ + # Could add a check_required_parameters bool if needed, but YAGNI for now. Any missing + # required paramaters will be looked up from the catalog or EE2 config file. + if cpus is not None: + _gt_zero(cpus, "CPU count") + if memory_MB is not None: + _gt_zero(memory_MB, "memory in MB") + if disk_GB is not None: + _gt_zero(disk_GB, "disk space in GB") + if client_group is not None: + client_group = _check_string(client_group, "client_group") + return ( + cpus, + memory_MB, + disk_GB, + client_group, + None if client_group_regex is None else bool(client_group_regex), + _check_string(bill_to_user, "bill_to_user", optional=True), + bool(ignore_concurrency_limits), + cls._check_scheduler_requirements(scheduler_requirements), + bool(debug_mode), + ) def _params(self): return ( @@ -77,7 +136,7 @@ def _params(self): self.disk_GB, self.client_group, self.client_group_regex, - self.as_user, + self.bill_to_user, self.ignore_concurrency_limits, self.scheduler_requirements, self.debug_mode, @@ -91,7 +150,7 @@ def __eq__(self, other): other.disk_GB, other.client_group, other.client_group_regex, - other.as_user, + other.bill_to_user, other.ignore_concurrency_limits, other.scheduler_requirements, other.debug_mode, diff --git a/test/deploy.cfg b/test/deploy.cfg index 17889078e..25bb1ac77 100644 --- a/test/deploy.cfg +++ b/test/deploy.cfg @@ -80,7 +80,7 @@ request_disk = 100GB [hpc] 
request_cpus = 4 request_memory = 2000M -request_disk = 100GBraiss +request_disk = 100GB #---------------------------------------------------------------------------------------# [DEFAULT] default_client_group = njs diff --git a/test/tests_for_sdkmr/job_submission_parameters_test.py b/test/tests_for_sdkmr/job_submission_parameters_test.py index 76f816e2f..678a18fe4 100644 --- a/test/tests_for_sdkmr/job_submission_parameters_test.py +++ b/test/tests_for_sdkmr/job_submission_parameters_test.py @@ -17,7 +17,7 @@ def test_job_req_init_minimal(): assert jr.disk_GB == 1 assert jr.client_group == "njs" assert jr.client_group_regex is None - assert jr.as_user is None + assert jr.bill_to_user is None assert jr.ignore_concurrency_limits is False assert jr.scheduler_requirements == {} assert jr.debug_mode is False @@ -28,9 +28,9 @@ def test_job_req_init_maximal(): 6, 7, 8, - "bigmemlong", + " bigmemlong \t ", True, - "someuser", + "\tsomeuser ", True, {"proc": "x286", "maxmem": "640k"}, True, @@ -41,12 +41,52 @@ def test_job_req_init_maximal(): assert jr.disk_GB == 8 assert jr.client_group == "bigmemlong" assert jr.client_group_regex is True - assert jr.as_user == "someuser" + assert jr.bill_to_user == "someuser" assert jr.ignore_concurrency_limits is True assert jr.scheduler_requirements == {"proc": "x286", "maxmem": "640k"} assert jr.debug_mode is True +def test_job_req_init_non_bools(): + for inp, expected in { + 1: True, + " ": True, + (1,): True, + 0: False, + "": False, + tuple(): False, + }.items(): + jr = JobRequirements( + 6, + 7, + 8, + "cg", + client_group_regex=inp, + ignore_concurrency_limits=inp, + debug_mode=inp, + ) + + assert jr.client_group_regex is expected + assert jr.ignore_concurrency_limits is expected + assert jr.debug_mode is expected + + +def test_job_req_init_None_for_bools(): + jr = JobRequirements( + 6, + 7, + 8, + "cg", + client_group_regex=None, + ignore_concurrency_limits=None, + debug_mode=None, + ) + + assert jr.client_group_regex is None + 
assert jr.ignore_concurrency_limits is False + assert jr.debug_mode is False + + def test_job_req_init_fail(): n = None _job_req_init_fail( @@ -105,7 +145,7 @@ def test_job_req_init_fail(): "f", "user\tname", n, - IncorrectParamsException("as_user contains control characters"), + IncorrectParamsException("bill_to_user contains control characters"), ) _job_req_init_fail( 1, @@ -159,6 +199,150 @@ def _job_req_init_fail(cpus, mem, disk, cgroup, user, reqs, expected): assert_exception_correct(got.value, expected) +def test_job_req_check_parameters_no_input(): + n = None + f = False + assert JobRequirements.check_parameters() == (n, n, n, n, n, n, f, {}, f) + assert JobRequirements.check_parameters(n, n, n, n, n, n, n, n, n) == ( + n, + n, + n, + n, + n, + n, + f, + {}, + f, + ) + + +def test_job_req_check_parameters_full_input(): + assert ( + JobRequirements.check_parameters( + 1, + 1, + 1, + " b ", + "x", + " user ", + 890, + {"proc": "x286", "maxmem": "640k"}, + [], + ) + == (1, 1, 1, "b", True, "user", True, {"proc": "x286", "maxmem": "640k"}, False) + ) + + +def test_job_req_check_parameters_whitespace_as_user(): + assert ( + JobRequirements.check_parameters( + 1, + 1, + 1, + " b ", + False, + " \t ", + 890, + {"proc": "x286", "maxmem": "640k"}, + [], + ) + == (1, 1, 1, "b", False, None, True, {"proc": "x286", "maxmem": "640k"}, False) + ) + + +def test_job_req_check_parameters_fail(): + n = None + _job_req_check_parameters_fail( + 0, 1, 1, "c", "u", n, IncorrectParamsException("CPU count must be at least 1") + ) + _job_req_check_parameters_fail( + 1, + 0, + 1, + "c", + "u", + n, + IncorrectParamsException("memory in MB must be at least 1"), + ) + _job_req_check_parameters_fail( + 1, + 1, + 0, + "c", + "u", + n, + IncorrectParamsException("disk space in GB must be at least 1"), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + " \t ", + "u", + n, + IncorrectParamsException("Missing input parameter: client_group"), + ) + _job_req_check_parameters_fail( + 1, + 
1, + 1, + "c", + " j\bi ", + n, + IncorrectParamsException("bill_to_user contains control characters"), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + "u", + {None: 1}, + IncorrectParamsException( + "Missing input parameter: key in scheduler requirements structure" + ), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + "u", + {"a": None}, + IncorrectParamsException( + "Missing input parameter: value for key 'a' in scheduler requirements structure" + ), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + "u", + {" \t ": 1}, + IncorrectParamsException( + "Missing input parameter: key in scheduler requirements structure" + ), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + "u", + {"b": " \t "}, + IncorrectParamsException( + "Missing input parameter: value for key 'b' in scheduler requirements structure" + ), + ) + + +def _job_req_check_parameters_fail(cpu, mem, disk, cgroup, user, reqs, expected): + with raises(Exception) as got: + JobRequirements(cpu, mem, disk, cgroup, True, user, True, reqs) + assert_exception_correct(got.value, expected) + + def test_job_req_equals(): c1 = "cligroupf" c1a = "cligroupf" From 990805ff66ba88c0703faa478804943595bfffa4 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 25 Mar 2021 12:19:20 -0700 Subject: [PATCH 033/109] DATAUP-389 - add a job requirements resolver, pt 1 (#338) * Add job requirements pt1 Probably 3 parts total. This part adds a normalization routine for job resource input from a user, the catalog, or the ee2 config file. It will be used in the resolver in later parts. 
* run black * Add comment * typo --- README.md | 4 +- .../utils/job_requirements_resolver.py | 166 ++++++++++ .../job_requirements_resolver_test.py | 304 ++++++++++++++++++ 3 files changed, 472 insertions(+), 2 deletions(-) create mode 100644 lib/execution_engine2/utils/job_requirements_resolver.py create mode 100644 test/tests_for_utils/job_requirements_resolver_test.py diff --git a/README.md b/README.md index 5f433d8ec..f7762d853 100644 --- a/README.md +++ b/README.md @@ -114,8 +114,8 @@ fields in common: * `request_cpus` - the number of CPUs to request * `request_memory` - the amount of memory, in MB, to request * `request_disk` - the amount of memory, in GB, to request -* `client_group_regex` - treat the client group (see below) as a regular expression -* `debug_mode` - run the job in debug mode +* `client_group_regex` - boolean - treat the client group (see below) as a regular expression +* `debug_mode` - boolean - run the job in debug mode The client group is handled differently for JSON and CSV: * The JSON format has the `clientgroup` field, which is optional. diff --git a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py new file mode 100644 index 000000000..837fe1252 --- /dev/null +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -0,0 +1,166 @@ +""" +Contains resolvers for job requirements. 
+""" + +from typing import Dict, Union + +from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.exceptions import IncorrectParamsException + +CLIENT_GROUP = "client_group" +REQUEST_CPUS = "request_cpus" +REQUEST_MEMORY = "request_memory" +REQUEST_DISK = "request_disk" +CLIENT_GROUP_REGEX = "client_group_regex" +DEBUG_MODE = "debug_mode" +_RESOURCES = set([CLIENT_GROUP, REQUEST_CPUS, REQUEST_MEMORY, REQUEST_DISK]) + + +def _check_raise(name, value, source): + raise IncorrectParamsException( + f"Found illegal {name} '{value}' in job requirements from {source}" + ) + + +def _check_clientgroup(clientgroup, source): + clientgroup = _string_request(clientgroup, "client group", source) + # this is a possible error mode from the catalog since it uses key=value pairs in CSV + # format + if "=" in clientgroup: + _check_raise("client group", clientgroup, source) + return clientgroup + + +def _string_request(putative_string, name, source): + if type(putative_string) != str: + _check_raise(name, putative_string, source) + return putative_string.strip() + + +def _int_request(putative_int, original, name, source): + if type(putative_int) == float: + _check_raise(f"{name} request", original, source) + try: + return int(putative_int) + except ValueError: + _check_raise(f"{name} request", original, source) + + +def _check_cpus(cpus, source): + return _int_request(cpus, cpus, "cpu", source) + + +def _check_memory(memory, source): + if type(memory) == int: + return memory + memory2 = _string_request(memory, "memory request", source) + if memory2.endswith("M"): + memory2 = memory2[:-1] + elif memory2.endswith("MB"): + memory2 = memory2[:-2] + return _int_request(memory2, memory, "memory", source) + + +def _check_disk(disk, source): + if type(disk) == int: + return disk + disk2 = _string_request(disk, "disk request", source) + if disk2.endswith("GB"): + disk2 = disk2[:-2] + return _int_request(disk2, disk, "disk", source) + + +def 
_bool_request(putative_bool, name, source): + if type(putative_bool) == bool or type(putative_bool) == int: + return bool(putative_bool) + pbs = _string_request(putative_bool, name, source).lower() + if pbs == "true": + return True + if pbs == "false": + return False + _check_raise(name, putative_bool, source) + + +def _check_client_group_regex(client_group_regex, source) -> bool: + return _bool_request(client_group_regex, "client group regex", source) + + +def _check_debug_mode(debug_mode, source) -> bool: + return _bool_request(debug_mode, "debug mode", source) + + +_KEY_CHECKERS = { + CLIENT_GROUP: _check_clientgroup, + REQUEST_CPUS: _check_cpus, + REQUEST_MEMORY: _check_memory, + REQUEST_DISK: _check_disk, + CLIENT_GROUP_REGEX: _check_client_group_regex, + DEBUG_MODE: _check_debug_mode, +} + + +class JobRequirementsResolver: + """ + Resolves requirements for a job (e.g. CPU, memory, etc.) given a method id and optional input + parameters. Order of precedence is: + 1) Parameters submitted by the client programmer + 2) Requirements in the KBase Catalog service + 3) Requirements from the EE2 configuration file (deploy.cfg). + """ + + @classmethod + def normalize_job_reqs( + cls, reqs: Dict[str, str], source: str, require_all_resources=False + ) -> Dict[str, Union[str, int]]: + f""" + Massage job requirements into a standard format. Does the following to specific keys of + the reqs argument: + + {CLIENT_GROUP}: ensures it does not contain an =. This error mode is more probable in + the KBase catalog UI. + {REQUEST_CPUS}: parses to an int + {REQUEST_MEMORY}: parses to an int, removing a trailing 'M' or 'MB' if necessary. + {REQUEST_DISK}: parses to an int, removing a trailing 'GB' if necessary. + {CLIENT_GROUP_REGEX}: parses to a boolean or None. The strings true and false are + parsed to booleans, case-insensitive. Ints are parsed directly to booleans. + {DEBUG_MODE}: parses to a boolean. The strings true and false are parsed to booleans, + case-insensitive. 
Ints are parsed directly to booleans. + + reqs - the job requirements + source - the source of the job requirements, e.g. catalog, user, etc. + require_all_resources - True to throw an error if all four keys resources keys + ({CLIENT_GROUP}, {REQUEST_CPUS}, {REQUEST_MEMORY}, {REQUEST_DISK}) aren't present + with valid values. + + Returns a new dictionary with the altered keys. If any key is not present no action is + taken for that key. + """ + # TODO could support more units and convert as necessary (see checker funcs at start + # of module). YAGNI for now. + if reqs is None: + reqs = {} + ret = {} + for key in [ + CLIENT_GROUP, + REQUEST_CPUS, + REQUEST_MEMORY, + REQUEST_DISK, + CLIENT_GROUP_REGEX, + DEBUG_MODE, + ]: + if not cls._has_value(reqs.get(key)): + if require_all_resources and key in _RESOURCES: + raise IncorrectParamsException( + f"Missing {key} key in job requirements from {source}" + ) + else: + ret[key] = _KEY_CHECKERS[key](reqs.get(key), source) + return ret + + @classmethod + def _has_value(cls, inc): + if inc is None: + return False + if type(inc) == str and not inc.strip(): + return False + return True diff --git a/test/tests_for_utils/job_requirements_resolver_test.py b/test/tests_for_utils/job_requirements_resolver_test.py new file mode 100644 index 000000000..8d793fb4f --- /dev/null +++ b/test/tests_for_utils/job_requirements_resolver_test.py @@ -0,0 +1,304 @@ +""" +Unit tests for the job requirements resolver. 
+""" + +from enum import Enum +from pytest import raises +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.exceptions import IncorrectParamsException +from utils_shared.test_utils import assert_exception_correct + + +def test_normalize_job_reqs_minimal(): + assert JobRequirementsResolver.normalize_job_reqs(None, "mysource") == {} + assert JobRequirementsResolver.normalize_job_reqs({}, "mysource") == {} + assert ( + JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": None, + "request_memory": None, + "request_disk": None, + "client_group": None, + "client_group_regex": None, + "debug_mode": None, + "expect_noop": " fooo ", + }, + "source", + ) + == {} + ) + assert ( + JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": " \t ", + "request_memory": " \t ", + "request_disk": " \t ", + "client_group": " \t ", + "client_group_regex": " \t ", + "debug_mode": " \t ", + "expect_noop": " fooo ", + }, + "source", + ) + == {} + ) + + +def test_normalize_job_reqs_minimal_require_all(): + assert JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": 1, + "request_memory": 1, + "request_disk": 1, + "client_group": "foo", + }, + "source", + True, + ) == { + "request_cpus": 1, + "request_memory": 1, + "request_disk": 1, + "client_group": "foo", + } + + +def test_normalize_job_reqs_maximal_ints(): + assert JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": 56, + "request_memory": 200, + "request_disk": 7000, + "client_group": " njs ", + "client_group_regex": 1, + "debug_mode": -1, + "expect_noop": 1, + }, + "mysource", + ) == { + "request_cpus": 56, + "request_memory": 200, + "request_disk": 7000, + "client_group": "njs", + "client_group_regex": True, + "debug_mode": True, + } + + +def test_normalize_job_reqs_maximal_strings(): + assert JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": " 56 ", + "request_memory": " 201 ", + "request_disk": " \t 7000 ", + 
"client_group": " njs ", + "client_group_regex": " False ", + "debug_mode": " true \t ", + "expect_noop": 1, + }, + "mysource", + ) == { + "request_cpus": 56, + "request_memory": 201, + "request_disk": 7000, + "client_group": "njs", + "client_group_regex": False, + "debug_mode": True, + } + + +def test_normalize_job_reqs_memory(): + for mem in [2000, "2000 ", " 2000M ", "2000MB"]: + assert JobRequirementsResolver.normalize_job_reqs( + {"request_memory": mem}, "s" + ) == {"request_memory": 2000} + + +def test_normalize_job_reqs_disk(): + for disk in [6000, "6000", " 6000GB "]: + assert JobRequirementsResolver.normalize_job_reqs( + {"request_disk": disk}, "s" + ) == {"request_disk": 6000} + + +def test_normalize_job_reqs_bools_true(): + for b in [True, 1, -1, 100, -100, " True ", " true"]: + assert JobRequirementsResolver.normalize_job_reqs( + {"client_group_regex": b, "debug_mode": b}, "s" + ) == {"client_group_regex": True, "debug_mode": True} + + +def test_normalize_job_reqs_bools_False(): + for b in [False, 0, " False ", " false"]: + assert JobRequirementsResolver.normalize_job_reqs( + {"client_group_regex": b, "debug_mode": b}, "s" + ) == {"client_group_regex": False, "debug_mode": False} + + +def test_normalize_job_reqs_fail_client_group(): + _normalize_job_reqs_fail( + {"client_group": []}, + "src", + False, + IncorrectParamsException( + "Found illegal client group '[]' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"client_group": "njs=true"}, + "src2", + False, + IncorrectParamsException( + "Found illegal client group 'njs=true' in job requirements from src2" + ), + ) + + +def test_normalize_job_reqs_fail_cpu(): + _normalize_job_reqs_fail( + {"request_cpus": 8.4}, + "src3", + False, + IncorrectParamsException( + "Found illegal cpu request '8.4' in job requirements from src3" + ), + ) + _normalize_job_reqs_fail( + {"request_cpus": "26M"}, + "src4", + False, + IncorrectParamsException( + "Found illegal cpu request '26M' in job 
requirements from src4" + ), + ) + + +def test_normalize_job_reqs_fail_mem(): + _normalize_job_reqs_fail( + {"request_memory": 3.2}, + "src5", + False, + IncorrectParamsException( + "Found illegal memory request '3.2' in job requirements from src5" + ), + ) + _normalize_job_reqs_fail( + {"request_memory": {}}, + "src5", + False, + IncorrectParamsException( + "Found illegal memory request '{}' in job requirements from src5" + ), + ) + _normalize_job_reqs_fail( + {"request_memory": "26G"}, + "src6", + False, + IncorrectParamsException( + "Found illegal memory request '26G' in job requirements from src6" + ), + ) + + +def test_normalize_job_reqs_fail_disk(): + _normalize_job_reqs_fail( + {"request_disk": 6.5}, + "src", + False, + IncorrectParamsException( + "Found illegal disk request '6.5' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"request_disk": set()}, + "src", + False, + IncorrectParamsException( + "Found illegal disk request 'set()' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"request_disk": "26M"}, + "src", + False, + IncorrectParamsException( + "Found illegal disk request '26M' in job requirements from src" + ), + ) + + +def test_normalize_job_reqs_fail_regex(): + _normalize_job_reqs_fail( + {"client_group_regex": 92.4}, + "src", + False, + IncorrectParamsException( + "Found illegal client group regex '92.4' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"client_group_regex": Enum}, + "src", + False, + IncorrectParamsException( + "Found illegal client group regex '<enum 'Enum'>' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"client_group_regex": "truthy"}, + "src", + False, + IncorrectParamsException( + "Found illegal client group regex 'truthy' in job requirements from src" + ), + ) + + +def test_normalize_job_reqs_fail_debug(): + _normalize_job_reqs_fail( + {"debug_mode": 9.5}, + "src", + False, + IncorrectParamsException( + "Found illegal debug mode '9.5' in job
requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"debug_mode": int}, + "src", + False, + IncorrectParamsException( + "Found illegal debug mode '<class 'int'>' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"debug_mode": " yep "}, + "src", + False, + IncorrectParamsException( + "Found illegal debug mode ' yep ' in job requirements from src" + ), + ) + + +def test_normalize_job_reqs_fail_require_all(): + reqs_all = { + "request_cpus": 56, + "request_memory": 200, + "request_disk": 7000, + "client_group": "njs", + } + for k in ["request_cpus", "request_memory", "request_disk", "client_group"]: + r = dict(reqs_all) + del r[k] + _normalize_job_reqs_fail( + r, + "mysrc", + True, + IncorrectParamsException(f"Missing {k} key in job requirements from mysrc"), + ) + + +def _normalize_job_reqs_fail(reqs, source, req_all_res, expected): + with raises(Exception) as got: + JobRequirementsResolver.normalize_job_reqs(reqs, source, req_all_res) + assert_exception_correct(got.value, expected) From be9400e76166c70d53d97acfe9adc63f92a4776a Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 25 Mar 2021 12:52:49 -0700 Subject: [PATCH 034/109] DATAUP-389 - add a job requirements resolver, pt 2 of 4 (#339) * Add fn(job_requirements) -> requirements type Determine a requirements type based on job requirements. This will be used to determine whether a user is allowed to run a job with non-default requirements based on their ee2 admin status. *Probably* for now anything other than standard will require write admin but billing is split out as billing a job to another user seems like a step above merely specifying how a job is run.
* run black --- .../utils/job_requirements_resolver.py | 77 +++++++++++++ .../job_requirements_resolver_test.py | 102 +++++++++++++++++- 2 files changed, 178 insertions(+), 1 deletion(-) diff --git a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py index 837fe1252..98b72951d 100644 --- a/lib/execution_engine2/utils/job_requirements_resolver.py +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -3,6 +3,7 @@ """ from typing import Dict, Union +from enum import Enum from execution_engine2.sdk.job_submission_parameters import JobRequirements from execution_engine2.exceptions import IncorrectParamsException @@ -16,6 +17,27 @@ _RESOURCES = set([CLIENT_GROUP, REQUEST_CPUS, REQUEST_MEMORY, REQUEST_DISK]) +class RequirementsType(Enum): + """ + A classification of the type of requirements requested by the user. + """ + + STANDARD = 1 + """ + No special requests. + """ + + PROCESSING = 2 + """ + The user requests special processing such as a CPU count, removal of concurrency limits, etc. + """ + + BILLING = 3 + """ + The user requests that they bill another user. + """ + + def _check_raise(name, value, source): raise IncorrectParamsException( f"Found illegal {name} '{value}' in job requirements from {source}" @@ -108,6 +130,61 @@ class JobRequirementsResolver: 3) Requirements from the EE2 configuration file (deploy.cfg). """ + @classmethod + def get_requirements_type( + self, + cpus: int = None, + memory_MB: int = None, + disk_GB: int = None, + client_group: str = None, + client_group_regex: Union[bool, None] = None, + bill_to_user: str = None, + ignore_concurrency_limits: bool = False, + scheduler_requirements: Dict[str, str] = None, + debug_mode: bool = False, + ) -> RequirementsType: + f""" + Determine what type of requirements are being requested. + + All parameters are optional. + + cpus - the number of CPUs required for the job. 
+ memory_MB - the amount of memory, in MB, required for the job. + disk_GB - the amount of disk space, in GB, required for the job. + client_group - the client group in which the job will run. + client_group_regex - whether to treat the client group string as a regular expression + that can match multiple client groups. Pass None for no preference. + bill_to_user - bill the job to an alternate user; takes the user's username. + ignore_concurrency_limits - allow the user to run this job even if the user's maximum + job count has already been reached. + scheduler_requirements - arbitrary requirements for the scheduler passed as key/value + pairs. Requires knowledge of the scheduler API. + debug_mode - whether to run the job in debug mode. + + Returns the type of requirements requested by the user: + {RequirementsType.STANDARD.name} - if no requirements are requested + {RequirementsType.PROCESSING.name} - if any requirements other than bill_to_user are + requested + {RequirementsType.BILLING.name} - if bill_to_user is requested + """ + args = JobRequirements.check_parameters( + cpus, + memory_MB, + disk_GB, + client_group, + client_group_regex, + bill_to_user, + ignore_concurrency_limits, + scheduler_requirements, + debug_mode, + ) + if args[5]: # bill_to_user + return RequirementsType.BILLING + if any(args) or args[4] is False: + # regex False means the user is asking for non default + return RequirementsType.PROCESSING + return RequirementsType.STANDARD + @classmethod def normalize_job_reqs( cls, reqs: Dict[str, str], source: str, require_all_resources=False diff --git a/test/tests_for_utils/job_requirements_resolver_test.py b/test/tests_for_utils/job_requirements_resolver_test.py index 8d793fb4f..93681c276 100644 --- a/test/tests_for_utils/job_requirements_resolver_test.py +++ b/test/tests_for_utils/job_requirements_resolver_test.py @@ -4,7 +4,10 @@ from enum import Enum from pytest import raises -from execution_engine2.utils.job_requirements_resolver import 
JobRequirementsResolver +from execution_engine2.utils.job_requirements_resolver import ( + JobRequirementsResolver, + RequirementsType, +) from execution_engine2.exceptions import IncorrectParamsException from utils_shared.test_utils import assert_exception_correct @@ -302,3 +305,100 @@ def _normalize_job_reqs_fail(reqs, source, req_all_res, expected): with raises(Exception) as got: JobRequirementsResolver.normalize_job_reqs(reqs, source, req_all_res) assert_exception_correct(got.value, expected) + + +def test_get_requirements_type_standard(): + grt = JobRequirementsResolver.get_requirements_type + assert grt() == RequirementsType.STANDARD + assert ( + grt(None, None, None, None, None, None, None, None, None) + == RequirementsType.STANDARD + ) + assert ( + grt(None, None, None, None, None, None, False, {}, False) + == RequirementsType.STANDARD + ) + + +def test_get_requirements_type_processing(): + grt = JobRequirementsResolver.get_requirements_type + assert grt(cpus=4) == RequirementsType.PROCESSING + assert grt(memory_MB=26) == RequirementsType.PROCESSING + assert grt(disk_GB=78) == RequirementsType.PROCESSING + assert grt(client_group="foo") == RequirementsType.PROCESSING + assert grt(client_group_regex=False) == RequirementsType.PROCESSING + assert grt(client_group_regex=True) == RequirementsType.PROCESSING + assert grt(ignore_concurrency_limits=True) == RequirementsType.PROCESSING + assert grt(scheduler_requirements={"a": "b"}) == RequirementsType.PROCESSING + assert grt(debug_mode=True) == RequirementsType.PROCESSING + + assert ( + grt( + cpus=4, + memory_MB=2, + disk_GB=8, + client_group="yay", + client_group_regex=True, + ignore_concurrency_limits=True, + debug_mode=True, + ) + == RequirementsType.PROCESSING + ) + + +def test_get_requirements_type_billing(): + grt = JobRequirementsResolver.get_requirements_type + assert grt(bill_to_user="foo") == RequirementsType.BILLING + + assert ( + grt( + cpus=4, + memory_MB=2, + disk_GB=8, + client_group="yay", + 
client_group_regex=True, + bill_to_user="can I buy you a drink?", + ignore_concurrency_limits=True, + debug_mode=True, + ) + == RequirementsType.BILLING + ) + + +def test_get_requirements_type_fail(): + # All the illegal requirements testing is delegated to a method outside the code + # unit under test, so we just do one test per input to be sure it's hooked up correctly + # and delegate more thorough testing to the unit tests for the called method. + n = None + _grtf = _get_requirements_type_fail + _grtf(0, n, n, n, n, IncorrectParamsException("CPU count must be at least 1")) + _grtf(n, 0, n, n, n, IncorrectParamsException("memory in MB must be at least 1")) + _grtf( + n, n, 0, n, n, IncorrectParamsException("disk space in GB must be at least 1") + ) + _grtf( + n, + n, + n, + " \t ", + n, + IncorrectParamsException("Missing input parameter: client_group"), + ) + _grtf( + n, + n, + n, + n, + " \bfoo ", + IncorrectParamsException("bill_to_user contains control characters"), + ) + # note there are no invalid values for client_group_regex, ignore_concurrentcy_limits, + # and debug_mode + + +def _get_requirements_type_fail(cpus, mem, disk, cg, btu, expected): + with raises(Exception) as got: + JobRequirementsResolver.get_requirements_type( + cpus, mem, disk, cg, False, btu, False, False + ) + assert_exception_correct(got.value, expected) From 4984ef0c8c60940663b0332508d2c834b4a3e8a2 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 29 Mar 2021 14:34:23 -0700 Subject: [PATCH 035/109] DATAUP-389 - add a job requirements resolver, pt 3 of 4 (#340) * Job resolver: add initialization & spec parsing * run black * typo --- lib/execution_engine2/sdk/EE2Constants.py | 4 + .../utils/job_requirements_resolver.py | 105 +++++++- .../job_requirements_resolver_test.py | 243 ++++++++++++++++++ 3 files changed, 351 insertions(+), 1 deletion(-) diff --git a/lib/execution_engine2/sdk/EE2Constants.py b/lib/execution_engine2/sdk/EE2Constants.py index 7d3f3cd0f..6871c22b3 100644 --- 
a/lib/execution_engine2/sdk/EE2Constants.py +++ b/lib/execution_engine2/sdk/EE2Constants.py @@ -9,6 +9,10 @@ KBASE_CONCIERGE_USERNAME = "kbaseconcierge" CONCIERGE_CLIENTGROUP = "kbase_concierge" +EE2_CONFIG_SECTION = "execution_engine2" +EE2_DEFAULT_SECTION = "DEFAULT" +EE2_DEFAULT_CLIENT_GROUP = "default_client_group" + # these also probably should be configurable. ADMIN_READ_ROLE = "EE2_ADMIN_RO" ADMIN_WRITE_ROLE = "EE2_ADMIN" diff --git a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py index 98b72951d..ca3c063f6 100644 --- a/lib/execution_engine2/utils/job_requirements_resolver.py +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -2,10 +2,23 @@ Contains resolvers for job requirements. """ -from typing import Dict, Union +from configparser import ConfigParser +from typing import Iterable, Dict, Union, Set from enum import Enum +from lib.installed_clients.CatalogClient import Catalog + +from execution_engine2.utils.arg_processing import ( + check_string as _check_string, + not_falsy as _not_falsy, +) + from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.sdk.EE2Constants import ( + EE2_CONFIG_SECTION, + EE2_DEFAULT_SECTION, + EE2_DEFAULT_CLIENT_GROUP, +) from execution_engine2.exceptions import IncorrectParamsException CLIENT_GROUP = "client_group" @@ -130,6 +143,96 @@ class JobRequirementsResolver: 3) Requirements from the EE2 configuration file (deploy.cfg). """ + def __init__( + self, + catalog: Catalog, + cfgfile: Iterable[str], + override_client_group: str = None, + ): + """ + Create the resolver. + + catalog - a catalog client pointing at the relevant KBase catalog service. + cfgfile - the configuration file as an open file object or other iterable. + override_client_group - if provided, this client group will be used for all jobs, ignoring + all other sources of client group information. 
+ """ + self._catalog = _not_falsy(catalog, "catalog") + self._override_client_group = _check_string( + override_client_group, "override_client_group", optional=True + ) + config = ConfigParser() + config.read_file(_not_falsy(cfgfile, "cfgfile")) + self._default_client_group = _check_string( + config.get( + section=EE2_DEFAULT_SECTION, + option=EE2_DEFAULT_CLIENT_GROUP, + fallback=None, + ), + f"value for {EE2_DEFAULT_SECTION}.{EE2_DEFAULT_CLIENT_GROUP} in deployment config file", + ) + self._clientgroup_default_configs = self._build_config(config) + if self._default_client_group not in self._clientgroup_default_configs: + raise ValueError( + "No deployment configuration entry for default " + + f"client group '{self._default_client_group}'" + ) + if ( + self._override_client_group + and self._override_client_group not in self._clientgroup_default_configs + ): + raise ValueError( + "No deployment configuration entry for override " + + f"client group '{self._override_client_group}'" + ) + + def _build_config(self, config): + ret = {} + for sec in config.sections(): + # if the default section is left as DEFAULT configparser shouldn't include it + # in the list, but just in case it changes... + if sec != EE2_CONFIG_SECTION and sec != EE2_DEFAULT_SECTION: + reqspec = {item[0]: item[1] for item in config.items(sec)} + reqspec[CLIENT_GROUP] = sec + ret[sec] = self.normalize_job_reqs( + reqspec, + f"section '{sec}' of the deployment configuration", + require_all_resources=True, + ) + return ret + + def get_override_client_group(self) -> Union[str, None]: + """ + Get the override client group, if any. This client group supercedes all others. + """ + return self._override_client_group + + def get_default_client_group(self) -> str: + """ + Get the default client group used if a client group is not provided by override, the user, + or the catalog. 
+ """ + return self._default_client_group + + def get_configured_client_groups(self) -> Set[str]: + """ + Get the client groups configured in the configuration file. + """ + return self._clientgroup_default_configs.keys() + + def get_configured_client_group_spec( + self, clientgroup: str + ) -> Dict[str, Union[int, str]]: + f""" + Get the client group specification in normalized format. Includes the {CLIENT_GROUP}, + {REQUEST_CPUS}, {REQUEST_MEMORY}, and {REQUEST_DISK} keys. May, but usually will not, + include the {DEBUG_MODE} and {CLIENT_GROUP_REGEX} keys. + """ + if clientgroup not in self._clientgroup_default_configs: + raise ValueError(f"Client group '{clientgroup}' is not configured") + # make a copy to prevent accidental mutation by the caller + return dict(self._clientgroup_default_configs[clientgroup]) + @classmethod def get_requirements_type( self, diff --git a/test/tests_for_utils/job_requirements_resolver_test.py b/test/tests_for_utils/job_requirements_resolver_test.py index 93681c276..230fb254c 100644 --- a/test/tests_for_utils/job_requirements_resolver_test.py +++ b/test/tests_for_utils/job_requirements_resolver_test.py @@ -3,12 +3,15 @@ """ from enum import Enum +from io import StringIO from pytest import raises +from unittest.mock import create_autospec from execution_engine2.utils.job_requirements_resolver import ( JobRequirementsResolver, RequirementsType, ) from execution_engine2.exceptions import IncorrectParamsException +from installed_clients.CatalogClient import Catalog from utils_shared.test_utils import assert_exception_correct @@ -402,3 +405,243 @@ def _get_requirements_type_fail(cpus, mem, disk, cg, btu, expected): cpus, mem, disk, cg, False, btu, False, False ) assert_exception_correct(got.value, expected) + + +def _get_simple_deploy_spec_file_obj(): + return StringIO( + """ + [execution_engine2] + request_cpus = 0 + request_memory = 2000M + request_disk = 100GB + + [DEFAULT] + default_client_group = cg2 + + [cg1] + request_cpus = 4 + 
request_memory = 2000M + request_disk = 100GB + + [cg2] + request_cpus = 8 + request_memory = 700 + request_disk = 32 + debug_mode = True + client_group_regex = false + """ + ) + + +# Note the constructor uses the normalization class method under the hood for normalizing +# the EE2 config file client groups. As such, we don't duplicate the testing of that method +# here other than some spot checks. If the constructor changes significantly more +# testing may be required. + + +def test_init(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(catalog, spec) + assert jrr.get_default_client_group() == "cg2" + assert jrr.get_override_client_group() is None + assert jrr.get_configured_client_groups() == set(["cg1", "cg2"]) + assert jrr.get_configured_client_group_spec("cg1") == { + "request_cpus": 4, + "request_memory": 2000, + "request_disk": 100, + "client_group": "cg1", + } + + assert jrr.get_configured_client_group_spec("cg2") == { + "request_cpus": 8, + "request_memory": 700, + "request_disk": 32, + "client_group": "cg2", + "debug_mode": True, + "client_group_regex": False, + } + + +def test_init_with_override(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + + spec = _get_simple_deploy_spec_file_obj() + jrr = JobRequirementsResolver(catalog, spec, " \t ") + assert jrr.get_override_client_group() is None + + spec = _get_simple_deploy_spec_file_obj() + jrr = JobRequirementsResolver(catalog, spec, "cg1") + assert jrr.get_override_client_group() == "cg1" + + +def test_init_fail_missing_input(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + _init_fail( + None, + _get_simple_deploy_spec_file_obj(), + None, + ValueError("catalog cannot be a value that evaluates to false"), + ) + _init_fail( + catalog, + None, + None, + ValueError("cfgfile cannot be a value that evaluates to false"), + ) + _init_fail( + catalog, + [], + None, + 
ValueError("cfgfile cannot be a value that evaluates to false"), + ) + + +def test_init_fail_no_override_in_config(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + + spec = _get_simple_deploy_spec_file_obj() + _init_fail( + catalog, + spec, + "cg3", + ValueError("No deployment configuration entry for override client group 'cg3'"), + ) + + +def test_init_fail_default_config_error(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + + shared_spec = """ + [njs] + request_cpus = 4 + request_memory = 2000M + request_disk = 100GB + """ + + _init_fail( + catalog, + StringIO(shared_spec), + None, + IncorrectParamsException( + "Missing input parameter: value for DEFAULT.default_client_group in deployment " + + "config file" + ), + ) + + spec = StringIO( + shared_spec + + """ + [DEFAULT] + foo = bar + """ + ) + _init_fail( + catalog, + spec, + None, + IncorrectParamsException( + "Missing input parameter: value for DEFAULT.default_client_group in deployment " + + "config file" + ), + ) + + spec = StringIO( + shared_spec + + """ + [DEFAULT] + default_client_group = njrs + """ + ) + _init_fail( + catalog, + spec, + None, + ValueError("No deployment configuration entry for default client group 'njrs'"), + ) + + +def test_init_fail_bad_config(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + + shared_spec = """ + [DEFAULT] + default_client_group = njs + """ + + spec = ( + shared_spec + + """ + [njs] + request_memory = 2000M + request_disk = 100GB + """ + ) + + _init_fail( + catalog, + StringIO(spec), + None, + IncorrectParamsException( + "Missing request_cpus key in job requirements from section 'njs' of the " + + "deployment configuration" + ), + ) + + spec = ( + shared_spec + + """ + [njs] + request_cpus = 4 + request_disk = 100GB + """ + ) + + _init_fail( + catalog, + StringIO(spec), + None, + IncorrectParamsException( + "Missing request_memory key in job requirements from section 'njs' of the " + + "deployment 
configuration" + ), + ) + + spec = ( + shared_spec + + """ + [njs] + request_cpus = 4 + request_memory = 2000M + """ + ) + + _init_fail( + catalog, + StringIO(spec), + None, + IncorrectParamsException( + "Missing request_disk key in job requirements from section 'njs' of the " + + "deployment configuration" + ), + ) + + +def _init_fail(catalog, spec, override, expected): + with raises(Exception) as got: + JobRequirementsResolver(catalog, spec, override) + assert_exception_correct(got.value, expected) + + +def test_get_configured_client_group_spec_fail(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + + jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + + with raises(Exception) as got: + jrr.get_configured_client_group_spec("cg4") + assert_exception_correct( + got.value, ValueError("Client group 'cg4' is not configured") + ) From 2b0be0a2083b04383d43430ba48a9d113006476f Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 30 Mar 2021 10:38:49 -0700 Subject: [PATCH 036/109] DATAUP-389 - add a job requirements resolver, pt 4 of 4 (#341) * Add job requirements resolution method also in JobRequirements.check_parameters, don't force a bool if the user doesn't specify one. Leave that to the constructor. * Run black * remove debug cruft * Fix CSV item splitting bug Fix a bug which would cause the resolver to fail if a csv item contained more than one '='. Also clarify what the comment about CSV format means. 
--- .../sdk/job_submission_parameters.py | 10 +- .../utils/job_requirements_resolver.py | 167 ++++++- .../job_submission_parameters_test.py | 13 +- .../job_requirements_resolver_test.py | 444 ++++++++++++++++++ 4 files changed, 622 insertions(+), 12 deletions(-) diff --git a/lib/execution_engine2/sdk/job_submission_parameters.py b/lib/execution_engine2/sdk/job_submission_parameters.py index 519a35749..9658bfe0f 100644 --- a/lib/execution_engine2/sdk/job_submission_parameters.py +++ b/lib/execution_engine2/sdk/job_submission_parameters.py @@ -87,9 +87,9 @@ def check_parameters( client_group: str = None, client_group_regex: Union[bool, None] = None, bill_to_user: str = None, - ignore_concurrency_limits: bool = False, + ignore_concurrency_limits: Union[bool, None] = None, scheduler_requirements: Dict[str, str] = None, - debug_mode: bool = False, + debug_mode: Union[bool, None] = None, ): """ Test that a set of parameters are legal and returns normalized parmeters. @@ -124,9 +124,11 @@ def check_parameters( client_group, None if client_group_regex is None else bool(client_group_regex), _check_string(bill_to_user, "bill_to_user", optional=True), - bool(ignore_concurrency_limits), + None + if ignore_concurrency_limits is None + else bool(ignore_concurrency_limits), cls._check_scheduler_requirements(scheduler_requirements), - bool(debug_mode), + None if debug_mode is None else bool(debug_mode), ) def _params(self): diff --git a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py index ca3c063f6..972d6ed59 100644 --- a/lib/execution_engine2/utils/job_requirements_resolver.py +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -2,6 +2,7 @@ Contains resolvers for job requirements. 
""" +import json from configparser import ConfigParser from typing import Iterable, Dict, Union, Set from enum import Enum @@ -28,6 +29,15 @@ CLIENT_GROUP_REGEX = "client_group_regex" DEBUG_MODE = "debug_mode" _RESOURCES = set([CLIENT_GROUP, REQUEST_CPUS, REQUEST_MEMORY, REQUEST_DISK]) +_ALL_SPECIAL_KEYS = _RESOURCES | set( + [CLIENT_GROUP_REGEX, DEBUG_MODE, "bill_to_user", "ignore_concurrency_limits"] +) + +_CLIENT_GROUPS = "client_groups" + + +def _remove_special_keys(inc_dict): + return {k: inc_dict[k] for k in set(inc_dict) - _ALL_SPECIAL_KEYS} class RequirementsType(Enum): @@ -290,7 +300,7 @@ def get_requirements_type( @classmethod def normalize_job_reqs( - cls, reqs: Dict[str, str], source: str, require_all_resources=False + cls, reqs: Dict[str, Union[str, int]], source: str, require_all_resources=False ) -> Dict[str, Union[str, int]]: f""" Massage job requirements into a standard format. Does the following to specific keys of @@ -344,3 +354,158 @@ def _has_value(cls, inc): if type(inc) == str and not inc.strip(): return False return True + + def resolve_requirements( + self, + method: str, + cpus: int = None, + memory_MB: int = None, + disk_GB: int = None, + client_group: str = None, + client_group_regex: Union[bool, None] = None, + bill_to_user: str = None, + ignore_concurrency_limits: bool = False, + scheduler_requirements: Dict[str, str] = None, + debug_mode: bool = None, + ) -> JobRequirements: + """ + Resolve jobs requirements for a method. + + All parameters are optional other than the method and supplying them will override + the catalog and ee2 settings for the job. + + method - the method to be run in module.method format. + cpus - the number of CPUs required for the job. + memory_MB - the amount of memory, in MB, required for the job. + disk_GB - the amount of disk space, in GB, required for the job. + client_group - the client group in which the job will run. 
+ client_group_regex - whether to treat the client group string as a regular expression + that can match multiple client groups. Pass None for no preference. + bill_to_user - bill the job to an alternate user; takes the user's username. + ignore_concurrency_limits - allow the user to run this job even if the user's maximum + job count has already been reached. + scheduler_requirements - arbitrary requirements for the scheduler passed as key/value + pairs. Requires knowledge of the scheduler API. + debug_mode - whether to run the job in debug mode. + + Returns the job requirements. + """ + + if method is None or len(method.split(".")) != 2: + raise IncorrectParamsException( + f"Unrecognized method: '{method}'. Please input module_name.function_name" + ) + module_name, function_name = [m.strip() for m in method.split(".")] + + args = JobRequirements.check_parameters( + cpus, + memory_MB, + disk_GB, + client_group, + client_group_regex, + bill_to_user, + ignore_concurrency_limits, + scheduler_requirements, + debug_mode, + ) + + # the catalog could contain arbitrary scheduler requirements so we can't skip the + # call even if all the arguments are provided + cat_reqs_all = self._get_catalog_reqs(module_name, function_name) + cat_reqs = self.normalize_job_reqs( + cat_reqs_all, + f"catalog method {module_name}.{function_name}", + ) + client_group = self._get_client_group( + args[3], cat_reqs.get(CLIENT_GROUP), module_name, function_name + ) + + # don't mutate the spec, make a copy + reqs = dict(self._clientgroup_default_configs[client_group]) + reqs.update(cat_reqs) + + scheduler_requirements = _remove_special_keys(cat_reqs_all) + # don't mutate args, check_parameters doesn't make a copy of the incoming args + scheduler_requirements.update(_remove_special_keys(dict(args[7]))) + + cgr = args[4] if (args[4] is not None) else reqs.pop(CLIENT_GROUP_REGEX, None) + dm = args[8] if (args[8] is not None) else reqs.pop(DEBUG_MODE, None) + + return JobRequirements( + args[0] or 
reqs[REQUEST_CPUS], + args[1] or reqs[REQUEST_MEMORY], + args[2] or reqs[REQUEST_DISK], + client_group, + client_group_regex=cgr, + bill_to_user=args[5], + ignore_concurrency_limits=args[6], + scheduler_requirements=scheduler_requirements, + debug_mode=dm, + ) + + def _get_client_group(self, user_cg, catalog_cg, module_name, function_name): + cg = next( + i + for i in [ + self._override_client_group, + user_cg, + catalog_cg, + self._default_client_group, + ] + if i is not None + ) + if cg not in self._clientgroup_default_configs: + if cg == catalog_cg: + raise IncorrectParamsException( + f"Catalog specified illegal client group '{cg}' for method " + + f"{module_name}.{function_name}" + ) + raise IncorrectParamsException(f"No such clientgroup: {cg}") + return cg + + def _get_catalog_reqs(self, module_name, function_name): + # could cache results for 30s or so to speed things up... YAGNI + group_config = self._catalog.list_client_group_configs( + {"module_name": module_name, "function_name": function_name} + ) + # If group_config is empty, that means there's no clientgroup entry in the catalog + # It'll return an empty list even for non-existent modules + if not group_config: + return {} + if len(group_config) > 1: + raise ValueError( + "Unexpected result from the Catalog service: more than one client group " + + f"configuration found for method {module_name}.{function_name}" + ) + + resources_request = group_config[0].get(_CLIENT_GROUPS, None) + + # No client group provided + if not resources_request: + return {} + # JSON + if "{" in resources_request[0]: + try: + rv = json.loads(", ".join(resources_request)) + except ValueError: + raise ValueError( + "Unable to parse JSON client group entry from catalog " + + f"for method {module_name}.{function_name}" + ) + return {k.strip(): rv[k] for k in rv} + # CSV Format + # This presents as CSV in the Catalog UI, e.g. 
+ # clientgroup, key1=value1, key2=value2 + # and so on + # The UI splits by comma before sending the data to the catalog, which is what we + # get when we pull the data + rv = {CLIENT_GROUP: resources_request.pop(0)} + for item in resources_request: + if "=" not in item: + raise ValueError( + f"Malformed requirement. Format is =. Item is '{item}' for " + + f"catalog method {module_name}.{function_name}" + ) + (key, value) = item.split("=", 1) + rv[key.strip()] = value.strip() + return rv diff --git a/test/tests_for_sdkmr/job_submission_parameters_test.py b/test/tests_for_sdkmr/job_submission_parameters_test.py index 678a18fe4..abb8bf1da 100644 --- a/test/tests_for_sdkmr/job_submission_parameters_test.py +++ b/test/tests_for_sdkmr/job_submission_parameters_test.py @@ -201,8 +201,7 @@ def _job_req_init_fail(cpus, mem, disk, cgroup, user, reqs, expected): def test_job_req_check_parameters_no_input(): n = None - f = False - assert JobRequirements.check_parameters() == (n, n, n, n, n, n, f, {}, f) + assert JobRequirements.check_parameters() == (n, n, n, n, n, n, n, {}, n) assert JobRequirements.check_parameters(n, n, n, n, n, n, n, n, n) == ( n, n, @@ -210,9 +209,9 @@ def test_job_req_check_parameters_no_input(): n, n, n, - f, + n, {}, - f, + n, ) @@ -240,13 +239,13 @@ def test_job_req_check_parameters_whitespace_as_user(): 1, 1, " b ", - False, + 0, " \t ", 890, {"proc": "x286", "maxmem": "640k"}, - [], + 1, ) - == (1, 1, 1, "b", False, None, True, {"proc": "x286", "maxmem": "640k"}, False) + == (1, 1, 1, "b", False, None, True, {"proc": "x286", "maxmem": "640k"}, True) ) diff --git a/test/tests_for_utils/job_requirements_resolver_test.py b/test/tests_for_utils/job_requirements_resolver_test.py index 230fb254c..c9a035fd3 100644 --- a/test/tests_for_utils/job_requirements_resolver_test.py +++ b/test/tests_for_utils/job_requirements_resolver_test.py @@ -6,6 +6,7 @@ from io import StringIO from pytest import raises from unittest.mock import create_autospec +from 
execution_engine2.sdk.job_submission_parameters import JobRequirements from execution_engine2.utils.job_requirements_resolver import ( JobRequirementsResolver, RequirementsType, @@ -645,3 +646,446 @@ def test_get_configured_client_group_spec_fail(): assert_exception_correct( got.value, ValueError("Client group 'cg4' is not configured") ) + + +# Note that resolve_requirements uses the normalization class method and an argument checking +# method under the hood. As such, we don't duplicate the testing of those methods +# here other than some spot checks. If the method changes significantly more +# testing may be required. + + +def test_resolve_requirements_from_spec(): + """ + Resolve requirements when no user input and no catalog record is available. + """ + _resolve_requirements_from_spec([]) + _resolve_requirements_from_spec([{}]) + _resolve_requirements_from_spec([{"client_groups": []}]) + + +def _resolve_requirements_from_spec(catalog_return): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = catalog_return + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(catalog, spec) + + assert jrr.resolve_requirements(" mod.meth ") == JobRequirements( + 8, + 700, + 32, + "cg2", + client_group_regex=False, + debug_mode=True, + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "mod", "function_name": "meth"} + ) + + +def test_resolve_requirements_from_spec_with_override(): + """ + Test that an override ignores client group information from all other sources. + """ + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [{"client_groups": ["cg2"]}] + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(catalog, spec, " cg1 ") + + assert jrr.resolve_requirements( + " module2. 
some_meth ", client_group="cg2" + ) == JobRequirements( + 4, + 2000, + 100, + "cg1", + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "module2", "function_name": "some_meth"} + ) + + +def test_resolve_requirements_from_catalog_full_CSV(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [ + { + "client_groups": [ + "cg1", + "request_cpus= 78", + " request_memory = 500MB", + "request_disk = 700GB", + "client_group_regex = False", + "debug_mode = true", + "foo=bar=whoop", # test that only one split occurs + "baz=bat", + ] + } + ] + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(catalog, spec) + + assert jrr.resolve_requirements(" module2. some_meth ") == JobRequirements( + 78, + 500, + 700, + "cg1", + False, + None, + False, + {"foo": "bar=whoop", "baz": "bat"}, + True, + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "module2", "function_name": "some_meth"} + ) + + +def test_resolve_requirements_from_catalog_partial_JSON(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [ + { + "client_groups": [ + '{"client_group": " cg1 "', + '" request_memory ": " 300M "', + '"exactlythesameshape": "asathingy"', + '"request_disk": 100000}', + ] + } + ] + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(catalog, spec) + + assert jrr.resolve_requirements(" module2. 
some_meth ") == JobRequirements( + 4, + 300, + 100000, + "cg1", + scheduler_requirements={"exactlythesameshape": "asathingy"}, + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "module2", "function_name": "some_meth"} + ) + + +def test_resolve_requirements_from_user_full(): + _resolve_requirements_from_user_full(True) + _resolve_requirements_from_user_full(False) + + +def _resolve_requirements_from_user_full(bool_val): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [ + { + "client_groups": [ + "cg2", + "request_cpus= 78", + " request_memory = 500MB", + "request_disk = 700GB", + "client_group_regex = False", + "debug_mode = true", + "foo=bar", + "baz=bat", + ] + } + ] + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(catalog, spec) + + assert jrr.resolve_requirements( + " module2. some_meth ", + 42, + 789, + 1, + "cg1", + bool_val, + "some_poor_sucker", + bool_val, + { + "foo": "Some of you may die", + "bar": "but that is a sacrifice I am willing to make", + }, + bool_val, + ) == JobRequirements( + 42, + 789, + 1, + "cg1", + bool_val, + "some_poor_sucker", + bool_val, + { + "foo": "Some of you may die", + "bar": "but that is a sacrifice I am willing to make", + "baz": "bat", + }, + bool_val, + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "module2", "function_name": "some_meth"} + ) + + +def test_resolve_requirements_from_user_partial(): + """ + Gets requirements from the user, catalog, and the ee2 deploy config. + + Also tests that special keys are removed from the scheduler requirements. 
+ """ + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [ + { + "client_groups": [ + "cg2", + "request_cpus= 78", + "request_disk = 700", + "client_group_regex = False", + "debug_mode = true", + "foo=bar", + "baz=bat", + ] + } + ] + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(catalog, spec) + + assert jrr.resolve_requirements( + " module2. some_meth ", + cpus=42, + client_group="cg1", + client_group_regex=True, + scheduler_requirements={ + "client_group": "foo", + "request_cpus": "78", + "request_memory": "800", + "request_disk": "700", + "client_group_regex": "False", + "debug_mode": "True", + "bill_to_user": "foo", + "ignore_concurrency_limits": "true", + "whee": "whoo", + }, + ) == JobRequirements( + 42, + 2000, + 700, + "cg1", + client_group_regex=True, + scheduler_requirements={"foo": "bar", "baz": "bat", "whee": "whoo"}, + debug_mode=True, + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "module2", "function_name": "some_meth"} + ) + + +def test_resolve_requirements_fail_illegal_inputs(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + + _resolve_requirements_fail( + jrr, + None, + {}, + IncorrectParamsException( + "Unrecognized method: 'None'. Please input module_name.function_name" + ), + ) + _resolve_requirements_fail( + jrr, + "method", + {}, + IncorrectParamsException( + "Unrecognized method: 'method'. Please input module_name.function_name" + ), + ) + _resolve_requirements_fail( + jrr, + "mod1.mod2.method", + {}, + IncorrectParamsException( + "Unrecognized method: 'mod1.mod2.method'. 
Please input module_name.function_name" + ), + ) + _resolve_requirements_fail( + jrr, + "m.m", + {"cpus": 0}, + IncorrectParamsException("CPU count must be at least 1"), + ) + _resolve_requirements_fail( + jrr, + "m.m", + {"memory_MB": 0}, + IncorrectParamsException("memory in MB must be at least 1"), + ) + _resolve_requirements_fail( + jrr, + "m.m", + {"disk_GB": 0}, + IncorrectParamsException("disk space in GB must be at least 1"), + ) + _resolve_requirements_fail( + jrr, + "m.m", + {"client_group": " \t "}, + IncorrectParamsException("Missing input parameter: client_group"), + ) + _resolve_requirements_fail( + jrr, + "m.m", + {"bill_to_user": "\b"}, + IncorrectParamsException("bill_to_user contains control characters"), + ) + _resolve_requirements_fail( + jrr, + "m.m", + {"scheduler_requirements": {"a": None}}, + IncorrectParamsException( + "Missing input parameter: value for key 'a' in scheduler requirements structure" + ), + ) + + +def test_resolve_requirements_fail_catalog_multiple_entries(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [{"client_groups": ["cg2"]}, {}] + + jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + "m.m", + {}, + ValueError( + "Unexpected result from the Catalog service: more than one client group " + + "configuration found for method m.m" + ), + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "m", "function_name": "m"} + ) + + +def test_resolve_requirements_fail_catalog_bad_JSON(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [ + {"client_groups": ['{"foo": "bar", "baz":}']} + ] + + jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + "m.m", + {}, + ValueError( + "Unable to parse JSON client group entry from catalog for method m.m" + 
), + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "m", "function_name": "m"} + ) + + +def test_resolve_requirements_fail_catalog_bad_CSV(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [ + {"client_groups": ["cg", "foo is bar"]} + ] + + jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + "m.m", + {}, + ValueError( + "Malformed requirement. Format is =. " + + "Item is 'foo is bar' for catalog method m.m" + ), + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "m", "function_name": "m"} + ) + + +def test_resolve_requirements_fail_catalog_normalize(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [ + {"client_groups": ["cg", "request_memory=72TB"]} + ] + + jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + " mod . meth ", + {}, + IncorrectParamsException( + "Found illegal memory request '72TB' in job requirements from catalog method mod.meth" + ), + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "mod", "function_name": "meth"} + ) + + +def test_resolve_requirements_fail_catalog_clientgroup(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [ + {"client_groups": ["cg", "request_memory=72"]} + ] + + jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + " mod . 
meth ", + {}, + IncorrectParamsException( + "Catalog specified illegal client group 'cg' for method mod.meth" + ), + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "mod", "function_name": "meth"} + ) + + +def test_resolve_requirements_fail_input_clientgroup(): + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [] + + jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + "m.m", + {"client_group": "cb4"}, + IncorrectParamsException("No such clientgroup: cb4"), + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "m", "function_name": "m"} + ) + + +def _resolve_requirements_fail(jrr, method, kwargs, expected): + with raises(Exception) as got: + jrr.resolve_requirements(method, **kwargs) + assert_exception_correct(got.value, expected) From d1729981ca20ece0c2d63b58728f94372920d273 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 30 Mar 2021 11:06:02 -0700 Subject: [PATCH 037/109] Make the user client group override the override (#342) After reviewing the code again, it's clear that any user (who is presumed to be a catalog admin) provided client group should override every other source, including the `OVERRIDE_CLIENT_GROUP` environmental variable. 
--- .../utils/job_requirements_resolver.py | 2 +- .../job_requirements_resolver_test.py | 28 +++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py index 972d6ed59..afabdf144 100644 --- a/lib/execution_engine2/utils/job_requirements_resolver.py +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -447,8 +447,8 @@ def _get_client_group(self, user_cg, catalog_cg, module_name, function_name): cg = next( i for i in [ - self._override_client_group, user_cg, + self._override_client_group, catalog_cg, self._default_client_group, ] diff --git a/test/tests_for_utils/job_requirements_resolver_test.py b/test/tests_for_utils/job_requirements_resolver_test.py index c9a035fd3..670a52ee5 100644 --- a/test/tests_for_utils/job_requirements_resolver_test.py +++ b/test/tests_for_utils/job_requirements_resolver_test.py @@ -687,7 +687,7 @@ def _resolve_requirements_from_spec(catalog_return): def test_resolve_requirements_from_spec_with_override(): """ - Test that an override ignores client group information from all other sources. + Test that an override ignores client group information from the catalog and deploy config. """ catalog = create_autospec(Catalog, spec_set=True, instance=True) catalog.list_client_group_configs.return_value = [{"client_groups": ["cg2"]}] @@ -696,8 +696,32 @@ def test_resolve_requirements_from_spec_with_override(): jrr = JobRequirementsResolver(catalog, spec, " cg1 ") + assert jrr.resolve_requirements(" module2. some_meth ") == JobRequirements( + 4, + 2000, + 100, + "cg1", + ) + + catalog.list_client_group_configs.assert_called_once_with( + {"module_name": "module2", "function_name": "some_meth"} + ) + + +def test_resolve_requirements_from_spec_with_override_and_user_client_group(): + """ + Test that a user providing a client group ignores client group information from all other + sources. 
+ """ + catalog = create_autospec(Catalog, spec_set=True, instance=True) + catalog.list_client_group_configs.return_value = [{"client_groups": ["cg2"]}] + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(catalog, spec, " cg2 ") + assert jrr.resolve_requirements( - " module2. some_meth ", client_group="cg2" + " module2. some_meth ", client_group=" cg1" ) == JobRequirements( 4, 2000, From b752135478675c5ba7a29641bf8a4bd08146219a Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Wed, 31 Mar 2021 13:54:52 -0700 Subject: [PATCH 038/109] DATAUP-389 - integrate the job requirements resolver pt 1 (#343) * Add catalog and job reqs resolver to client set Nothing does anything with them yet - will be tested more thoroughly when that happens. Adds a separate field for the catalog client, as the job reqs resolver will make CatalogUtils redundant. * run black * remove unused import --- .../execution_engine2Impl.py | 4 +- lib/execution_engine2/utils/clients.py | 62 +++++++++++---- test/tests_for_auth/ee2_admin_mode_test.py | 3 +- .../ee2_SDKMethodRunner_EE2Logs_test.py | 9 ++- .../ee2_SDKMethodRunner_test.py | 18 +++-- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 9 ++- ...ee2_SDKMethodRunner_test_EE2Status_test.py | 9 ++- test/tests_for_sdkmr/ee2_load_test.py | 9 ++- test/tests_for_utils/clients_test.py | 78 ++++++++++++++++++- test/utils_shared/mock_utils.py | 12 +++ 10 files changed, 172 insertions(+), 41 deletions(-) diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 02842b88a..4b3ba3f0b 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -63,7 +63,9 @@ def __init__(self, config): self.gen_cfg = GenerateFromConfig(config) # move these into GFC? 
Since they're only generated once it doesn't seem necessary configpath = os.environ["KB_DEPLOYMENT_CONFIG"] - self.clients = get_client_set(config, configpath) + override = os.environ.get("OVERRIDE_CLIENT_GROUP") + with open(configpath) as cf: + self.clients = get_client_set(config, configpath, cf, override) #END_CONSTRUCTOR pass diff --git a/lib/execution_engine2/utils/clients.py b/lib/execution_engine2/utils/clients.py index 279b0c695..ce634ed1d 100644 --- a/lib/execution_engine2/utils/clients.py +++ b/lib/execution_engine2/utils/clients.py @@ -3,19 +3,22 @@ # Note on testing - this class is not generally unit-testable, and is only tested fully in # integration tests. -from typing import Dict +from typing import Dict, Iterable from execution_engine2.authorization.roles import AdminAuthUtil from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.utils.arg_processing import not_falsy as _not_falsy from execution_engine2.utils.CatalogUtils import CatalogUtils from execution_engine2.utils.Condor import Condor from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient from execution_engine2.utils.arg_processing import parse_bool from installed_clients.authclient import KBaseAuth +from installed_clients.CatalogClient import Catalog from installed_clients.WorkspaceClient import Workspace @@ -94,7 +97,9 @@ def __init__( auth: KBaseAuth, auth_admin: AdminAuthUtil, condor: Condor, - catalog_utils: CatalogUtils, + catalog: Catalog, + requirements_resolver: JobRequirementsResolver, + catalog_utils: CatalogUtils, # TODO JRR remove after replaced by JRR kafka_client: KafkaClient, mongo_util: MongoUtil, slack_client: SlackClient, @@ -103,14 +108,17 @@ def 
__init__( Initialize the client set from the individual clients. """ - # TODO check no clients are None. Make a general method somewhere - self.auth = auth - self.auth_admin = auth_admin - self.condor = condor - self.catalog_utils = catalog_utils - self.kafka_client = kafka_client - self.mongo_util = mongo_util - self.slack_client = slack_client + self.auth = _not_falsy(auth, "auth") + self.auth_admin = _not_falsy(auth_admin, "auth_admin") + self.condor = _not_falsy(condor, "condor") + self.catalog = _not_falsy(catalog, "catalog") + self.requirements_resolver = _not_falsy( + requirements_resolver, "requirements_resolver" + ) + self.catalog_utils = _not_falsy(catalog_utils, "catalog_utils") + self.kafka_client = _not_falsy(kafka_client, "kafka_client") + self.mongo_util = _not_falsy(mongo_util, "mongo_util") + self.slack_client = _not_falsy(slack_client, "slack_client") # the constructor allows for mix and match of mocks and real implementations as needed @@ -118,11 +126,17 @@ def __init__( def get_clients( - cfg: Dict[str, str], cfg_path: str + # TODO JRR remove cfg_path when Condor no longer needs it + cfg: Dict[str, str], + cfg_path, + cfg_file: Iterable[str], + override_client_group: str = None, ) -> ( KBaseAuth, AdminAuthUtil, Condor, + Catalog, + JobRequirementsResolver, CatalogUtils, KafkaClient, MongoUtil, @@ -134,6 +148,9 @@ def get_clients( cfg - the configuration dictionary cfg_path - the path to the configuration file + cfg_file - the full configuration file as a file like object or iterable. + override_client_group - a client group name to override any client groups provided by + users or the catalog service. 
Expected keys in config: auth-url - the root URL of the kbase auth service @@ -143,9 +160,15 @@ def get_clients( slack-token - a token for contacting Slack """ # Condor needs access to the entire deploy.cfg file, not just the ee2 section - condor = Condor(cfg_path) + condor = Condor(cfg_path) # TODO JRR replace with cfg when JRR is used # Do a check to ensure the urls and tokens actually work correctly? # TODO check keys are present - make some general methods for dealing with this + # token is needed for running log_exec_stats in EE2Status + catalog = Catalog(cfg["catalog-url"], token=cfg["catalog-token"]) + # make a separate, hidden catalog instance + jrr = JobRequirementsResolver( + Catalog(cfg["catalog-url"]), cfg_file, override_client_group + ) catalog_utils = CatalogUtils(cfg["catalog-url"], cfg["catalog-token"]) auth_url = cfg["auth-url"] auth = KBaseAuth(auth_url=auth_url + "/api/legacy/KBase/Sessions/Login") @@ -168,6 +191,8 @@ def get_clients( auth, auth_admin, condor, + catalog, + jrr, catalog_utils, kafka_client, mongo_util, @@ -175,13 +200,22 @@ def get_clients( ) -def get_client_set(cfg: Dict[str, str], cfg_path: str) -> ClientSet: +def get_client_set( + cfg: Dict[str, str], + # TODO JRR remove cfg_path when Condor no longer needs it + cfg_path: str, + cfg_file: Iterable[str], + override_client_group: str = None, +) -> ClientSet: """ A helper method to create a ClientSet from a config dict rather than constructing and passing in clients individually. cfg - the configuration dictionary cfg_path - the path to the configuration file + cfg_file - the full configuration file as a file like object or iterable. + override_client_group - a client group name to override any client groups provided by + users or the catalog service. 
Expected keys in config: auth-url - the root URL of the kbase auth service @@ -191,4 +225,4 @@ def get_client_set(cfg: Dict[str, str], cfg_path: str) -> ClientSet: slack-token - a token for contacting Slack """ - return ClientSet(*get_clients(cfg, cfg_path)) + return ClientSet(*get_clients(cfg, cfg_path, cfg_file, override_client_group)) diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index 0b955310b..8c9788058 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -95,7 +95,8 @@ def getRunner(self, user_clients=None, clients=None) -> SDKMethodRunner: if not user_clients: user_clients = get_user_client_set(self.cfg, self.user_id, self.token) if not clients: - clients = get_client_set(self.cfg, self.config_file) + with open(self.config_file) as cf: + clients = get_client_set(self.cfg, self.config_file, cf) runner = SDKMethodRunner(user_clients, clients) # type : SDKMethodRunner runner.get_jobs_status() runner.get_runjob() diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py index e2a0d0352..7a4647ad7 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py @@ -35,10 +35,11 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, deploy), - ) + with open(deploy) as cf: + cls.method_runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, deploy, cf), + ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index 80d62e005..b8225ff52 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ 
b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -72,10 +72,11 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, cls.config_file), - ) + with open(cls.config_file) as cf: + cls.method_runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cls.config_file, cf), + ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -724,10 +725,11 @@ def test_check_job_global_perm(self, rq_mock): self.assertEqual(job_states[job_id]["status"], "created") # now test with a different user - other_method_runner = SDKMethodRunner( - get_user_client_set(self.cfg, "some_other_user", "other_token"), - get_client_set(self.cfg, self.config_file), - ) + with open(self.config_file) as cf: + other_method_runner = SDKMethodRunner( + get_user_client_set(self.cfg, "some_other_user", "other_token"), + get_client_set(self.cfg, self.config_file, cf), + ) job_states = other_method_runner.get_jobs_status().check_workspace_jobs( self.ws_id ) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 8c6a9d7c4..c20c0ea2b 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -53,10 +53,11 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, config_file), - ) + with open(config_file) as cf: + cls.method_runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, config_file, cf), + ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) diff --git 
a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index 09295267a..2ca2ecb3f 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -45,10 +45,11 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, config_file), - ) + with open(config_file) as cf: + cls.method_runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, config_file, cf), + ) cls.cr = CondorResources( request_cpus="1", request_disk="1GB", diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 51047fdb6..aa24fede1 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -55,10 +55,11 @@ def setUpClass(cls): @classmethod def _getRunner(cls) -> SDKMethodRunner: - runner = SDKMethodRunner( - get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, cls.deploy), - ) + with open(cls.deploy) as cf: + runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cls.deploy, cf), + ) # Initialize these clients from None status = runner.get_jobs_status() # type: JobsStatus status._send_exec_stats_to_catalog = MagicMock(return_value=True) diff --git a/test/tests_for_utils/clients_test.py b/test/tests_for_utils/clients_test.py index 3e109e73e..bc9a68111 100644 --- a/test/tests_for_utils/clients_test.py +++ b/test/tests_for_utils/clients_test.py @@ -5,10 +5,26 @@ from unittest.mock import create_autospec from execution_engine2.authorization.workspaceauth import WorkspaceAuth -from execution_engine2.utils.clients import UserClientSet, get_user_client_set +from execution_engine2.utils.clients import ( + UserClientSet, 
+ get_user_client_set, + ClientSet, +) from utils_shared.test_utils import assert_exception_correct +from utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS from installed_clients.WorkspaceClient import Workspace +from execution_engine2.authorization.roles import AdminAuthUtil +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.utils.CatalogUtils import CatalogUtils +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient + +from installed_clients.authclient import KBaseAuth +from installed_clients.CatalogClient import Catalog + def test_get_user_client_set_fail(): ws_err = "missing workspace-url in configuration" @@ -68,3 +84,63 @@ def user_client_set_init_fail(user, token, ws_client, ws_auth, expected): with raises(Exception) as e: UserClientSet(user, token, ws_client, ws_auth) assert_exception_correct(e.value, expected) + + +def test_client_set_init_fail(): + mocks = get_client_mocks(None, None, *ALL_CLIENTS) + a = mocks[KBaseAuth] + aa = mocks[AdminAuthUtil] + c = mocks[Condor] + ca = mocks[Catalog] + j = mocks[JobRequirementsResolver] + cu = mocks[CatalogUtils] + k = mocks[KafkaClient] + m = mocks[MongoUtil] + s = mocks[SlackClient] + n = None + + e = ValueError("auth cannot be a value that evaluates to false") + _client_set_init_fail(n, aa, c, ca, j, cu, k, m, s, e) + e = ValueError("auth_admin cannot be a value that evaluates to false") + _client_set_init_fail(a, n, c, ca, j, cu, k, m, s, e) + e = ValueError("condor cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, n, ca, j, cu, k, m, s, e) + e = ValueError("catalog cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, n, j, cu, k, m, s, e) + e = ValueError("requirements_resolver cannot be a value that evaluates to false") + 
_client_set_init_fail(a, aa, c, ca, n, cu, k, m, s, e) + e = ValueError("catalog_utils cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, j, n, k, m, s, e) + e = ValueError("kafka_client cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, j, cu, n, m, s, e) + e = ValueError("mongo_util cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, j, cu, k, n, s, e) + e = ValueError("slack_client cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, j, cu, k, m, n, e) + + +def _client_set_init_fail( + auth: KBaseAuth, + auth_admin: AdminAuthUtil, + condor: Condor, + catalog: Catalog, + requirements_resolver: JobRequirementsResolver, + catalog_utils: CatalogUtils, + kafka_client: KafkaClient, + mongo_util: MongoUtil, + slack_client: SlackClient, + expected: Exception, +): + with raises(Exception) as got: + ClientSet( + auth, + auth_admin, + condor, + catalog, + requirements_resolver, + catalog_utils, + kafka_client, + mongo_util, + slack_client, + ) + assert_exception_correct(got.value, expected) diff --git a/test/utils_shared/mock_utils.py b/test/utils_shared/mock_utils.py index 343fadf28..2e67220b7 100644 --- a/test/utils_shared/mock_utils.py +++ b/test/utils_shared/mock_utils.py @@ -2,16 +2,24 @@ from execution_engine2.db.MongoUtil import MongoUtil from execution_engine2.utils.CatalogUtils import CatalogUtils +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient from installed_clients.authclient import KBaseAuth +from installed_clients.CatalogClient import Catalog from execution_engine2.authorization.roles import AdminAuthUtil from execution_engine2.utils.Condor import Condor from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from execution_engine2.utils.clients import ClientSet 
+ +def _build_job_reqs(config, cfgfile): + with open(cfgfile) as cf: + return JobRequirementsResolver(Catalog(config["catalog-url"]), cf) + + _CLASS_IMPLEMENTATION_BUILDERS = { KBaseAuth: lambda config, cfgfile: KBaseAuth( auth_url=config["auth-url"] + "/api/legacy/KBase/Sessions/Login" @@ -20,6 +28,8 @@ config["auth-url"], [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE] ), Condor: lambda config, cfgfile: Condor(cfgfile), + Catalog: lambda config, cfgfile: Catalog(config["catalog-url"]), + JobRequirementsResolver: _build_job_reqs, CatalogUtils: lambda config, cfgfile: CatalogUtils( config["catalog-url"], config["catalog-token"] ), @@ -55,6 +65,8 @@ def get_client_mocks(config, config_path, *to_be_mocked): ret[KBaseAuth], ret[AdminAuthUtil], ret[Condor], + ret[Catalog], + ret[JobRequirementsResolver], ret[CatalogUtils], ret[KafkaClient], ret[MongoUtil], From 3164fcd938bb087ececd58f60ba0a92932a972fa Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 5 Apr 2021 15:37:36 -0700 Subject: [PATCH 039/109] DATAUP-389 - add getters for new clients & missing getters (#344) * Add direct getter for catalog Slightly cleaner, plus CatalogUtils will be made redundant by the job requirements resolver class. 
* run black * Add getter for job requirements resolver * run black * Add getter for workspace auth and use in EE2Runjob, where workspace auth will need to be mocked soon * Remove redundant patch --- lib/execution_engine2/sdk/EE2Runjob.py | 10 +++----- lib/execution_engine2/sdk/EE2Status.py | 4 +-- lib/execution_engine2/sdk/SDKMethodRunner.py | 25 ++++++++++++++++++- test/tests_for_auth/ee2_admin_mode_test.py | 21 ++++++++-------- test/tests_for_sdkmr/EE2Runjob_test.py | 2 +- test/tests_for_sdkmr/EE2Status_test.py | 4 +-- .../ee2_SDKMethodRunner_test.py | 8 ++++++ test/tests_for_sdkmr/ee2_load_test.py | 6 ++--- 8 files changed, 52 insertions(+), 28 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index ee30e7d5b..6e4b1cfe7 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -117,10 +117,8 @@ def _get_module_git_commit(self, method, service_ver=None) -> Optional[str]: self.logger.debug(f"Getting commit for {module_name} {service_ver}") - module_version = ( - self.sdkmr.get_catalog_utils() - .get_catalog() - .get_module_version({"module_name": module_name, "version": service_ver}) + module_version = self.sdkmr.get_catalog().get_module_version( + {"module_name": module_name, "version": service_ver} ) git_commit_hash = module_version.get("git_commit_hash") @@ -144,7 +142,7 @@ def _check_ws_objects(self, source_objects) -> None: def _check_workspace_permissions(self, wsid): if wsid: - if not self.sdkmr.workspace_auth.can_write(wsid): + if not self.sdkmr.get_workspace_auth().can_write(wsid): self.logger.debug( f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {wsid}." 
) @@ -153,7 +151,7 @@ def _check_workspace_permissions(self, wsid): ) def _check_workspace_permissions_list(self, wsids): - perms = self.sdkmr.workspace_auth.can_write_list(wsids) + perms = self.sdkmr.get_workspace_auth().can_write_list(wsids) bad_ws = [key for key in perms.keys() if perms[key] is False] if bad_ws: self.logger.debug( diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index 31b2bd4ae..9c6663348 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -549,9 +549,7 @@ def _send_exec_stats_to_catalog(self, job_id): log_exec_stats_params["is_error"] = int(job.status == Status.error.value) log_exec_stats_params["job_id"] = job_id - self.sdkmr.get_catalog_utils().get_catalog().log_exec_stats( - log_exec_stats_params - ) + self.sdkmr.get_catalog().log_exec_stats(log_exec_stats_params) def abandon_children(self, parent_job_id, child_job_ids, as_admin=False) -> Dict: if not parent_job_id: diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index bef390a4f..acde98f5b 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -28,11 +28,14 @@ from lib.execution_engine2.sdk.EE2Constants import KBASE_CONCIERGE_USERNAME from lib.execution_engine2.utils.CatalogUtils import CatalogUtils from lib.execution_engine2.utils.Condor import Condor +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.utils.clients import UserClientSet, ClientSet from lib.execution_engine2.utils.EE2Logger import get_logger as _get_logger from lib.execution_engine2.utils.KafkaUtils import KafkaClient from lib.execution_engine2.utils.SlackUtils import SlackClient +from installed_clients.CatalogClient import Catalog from installed_clients.WorkspaceClient import Workspace -from 
execution_engine2.utils.clients import UserClientSet, ClientSet class JobPermissions(Enum): @@ -65,6 +68,8 @@ def __init__( raise ValueError("clients is required") self.mongo_util = clients.mongo_util self.condor = clients.condor + self.catalog = clients.catalog + self.job_requirements_resolver = clients.requirements_resolver self.workspace = user_clients.workspace self.workspace_auth = user_clients.workspace_auth self.catalog_utils = clients.catalog_utils @@ -137,6 +142,12 @@ def get_workspace(self) -> Workspace: """ return self.workspace + def get_workspace_auth(self) -> WorkspaceAuth: + """ + Get the workspace authorization client for this instance of SDKMR. + """ + return self.workspace_auth + def get_logger(self) -> Logger: """ Get the logger for this instance of SDKMR. @@ -145,6 +156,18 @@ def get_logger(self) -> Logger: # logger, which seems... overkill? return self.logger + def get_catalog(self) -> Catalog: + """ + Get the catalog client for this instance of SDKMR. + """ + return self.catalog + + def get_job_requirements_resolver(self) -> JobRequirementsResolver: + """ + Get the job requirements resolver for this instance of SDKMR. + """ + return self.job_requirements_resolver + def get_catalog_utils(self) -> CatalogUtils: """ Get the catalog utilities for this instance of SDKMR. 
diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index 8c9788058..4e2c5e9e3 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -54,15 +54,9 @@ def setUpClass(cls): def setUp(self) -> None: """ - Patch out Catalog and Condor + Patch out Condor :return: """ - self.catalog_patch = patch( - "lib.installed_clients.CatalogClient.Catalog.get_module_version" - ) - self.catalog = self.catalog_patch.start() - self.catalog.return_value = {"git_commit_hash": "moduleversiongoeshere"} - si = SubmissionInfo(clusterid="123", submit={}, error=None) self.condor_patch = patch.object( target=Condor, attribute="run_job", return_value=si @@ -86,7 +80,6 @@ def setUp(self) -> None: # self.good_job_id_user2 = setup_runner.run_job(params=job_params_1,as_admin=False) def tearDown(self) -> None: - self.catalog_patch.stop() self.condor_patch.stop() self.condor_patch2.start() @@ -128,7 +121,11 @@ def get_user_mocks( def get_client_mocks(self, *to_be_mocked): return _get_client_mocks(self.cfg, self.config_file, *to_be_mocked) - @patch.object(Catalog, "get_module_version", return_value="module.version") + @patch.object( + Catalog, + "get_module_version", + return_value={"git_commit_hash": "moduleversiongoeshere"}, + ) def test_regular_user(self, catalog): # Regular User lowly_user = "Access Denied: You are not an administrator" @@ -213,7 +210,11 @@ def test_regular_user(self, catalog): # Start the job and get its status as an admin - @patch.object(Catalog, "get_module_version", return_value="module.version") + @patch.object( + Catalog, + "get_module_version", + return_value={"git_commit_hash": "moduleversiongoeshere"}, + ) @patch.object(WorkspaceAuth, "can_write", return_value=True) def test_admin_writer(self, workspace, catalog): # Admin User with WRITE diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index aa453a92e..321fbf80d 100644 --- 
a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -68,7 +68,7 @@ def test_run_as_admin(): slack = create_autospec(SlackClient, spec_set=True, instance=True) ws = create_autospec(Workspace, spec_set=True, instance=True) # Set up basic getter calls - catutils.get_catalog.return_value = catalog + sdkmr.get_catalog.return_value = catalog sdkmr.get_catalog_utils.return_value = catutils sdkmr.get_condor.return_value = condor sdkmr.get_kafka_client.return_value = kafka diff --git a/test/tests_for_sdkmr/EE2Status_test.py b/test/tests_for_sdkmr/EE2Status_test.py index 191da652b..9555c6220 100644 --- a/test/tests_for_sdkmr/EE2Status_test.py +++ b/test/tests_for_sdkmr/EE2Status_test.py @@ -50,15 +50,13 @@ def test_finish_job_complete_minimal(): logger = create_autospec(Logger, spec_set=True, instance=True) mongo = create_autospec(MongoUtil, spec_set=True, instance=True) kafka = create_autospec(KafkaClient, spec_set=True, instance=True) - catutil = create_autospec(CatalogUtils, spec_set=True, instance=True) catalog = create_autospec(Catalog, spec_set=True, instance=True) condor = create_autospec(Condor, spec_set=True, instance=True) sdkmr.get_mongo_util.return_value = mongo sdkmr.get_logger.return_value = logger sdkmr.get_kafka_client.return_value = kafka sdkmr.get_condor.return_value = condor - sdkmr.get_catalog_utils.return_value = catutil - catutil.get_catalog.return_value = catalog + sdkmr.get_catalog.return_value = catalog # set up return values for mocks. 
Ordered as per order of operations in code job1 = _finish_job_complete_minimal_get_test_job( diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index b8225ff52..d965cfb83 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -23,6 +23,7 @@ from execution_engine2.utils.Condor import Condor from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver from lib.execution_engine2.db.models.models import Job, Status, TerminatedCode from execution_engine2.exceptions import AuthError from lib.execution_engine2.exceptions import InvalidStatusTransitionException @@ -46,6 +47,7 @@ from lib.execution_engine2.sdk.EE2Runjob import EE2RunJob +from installed_clients.CatalogClient import Catalog from installed_clients.WorkspaceClient import Workspace @@ -160,6 +162,7 @@ def test_getters(self): sdkmr = SDKMethodRunner(user_clients, clients_and_mocks[ClientSet]) assert sdkmr.get_workspace() is ws + assert sdkmr.get_workspace_auth() is wsa assert sdkmr.get_user_id() == "user" assert sdkmr.get_token() == "token" assert sdkmr.get_kafka_client() is clients_and_mocks[KafkaClient] @@ -167,6 +170,11 @@ def test_getters(self): assert sdkmr.get_slack_client() is clients_and_mocks[SlackClient] assert sdkmr.get_catalog_utils() is clients_and_mocks[CatalogUtils] assert sdkmr.get_condor() is clients_and_mocks[Condor] + assert sdkmr.get_catalog() is clients_and_mocks[Catalog] + assert ( + sdkmr.get_job_requirements_resolver() + is clients_and_mocks[JobRequirementsResolver] + ) def test_save_job(self): ws = Workspace("https://fake.com") diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index aa24fede1..31cc4d0d9 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ 
b/test/tests_for_sdkmr/ee2_load_test.py @@ -243,10 +243,8 @@ def update_states(index, job_ids_queued, job_ids_running, job_ids_finish): @patch.object(Condor, "run_job", return_value=si) @patch.object(WorkspaceAuth, "can_write", return_value=True) - @patch( - "lib.installed_clients.CatalogClient.Catalog.get_module_version", autospec=True - ) - @patch("lib.installed_clients.CatalogClient.Catalog.log_exec_stats", autospec=True) + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + @patch("installed_clients.CatalogClient.Catalog.log_exec_stats", autospec=True) def test_run_job_stress(self, ccles, cc, workspace, condor): """ testing running 3 different jobs in multiple theads. From 1d3353ac6cdd8d1fadc64938fc877bc159631b20 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 5 Apr 2021 16:12:10 -0700 Subject: [PATCH 040/109] DATAUP-389 - remove unused fields (#345) * remove sdkmr.is_admin field Doesn't actually do anything * remove sdkmr.is_admin mocks they don't mock anything that actually exists * remove mock for non-existent field --- lib/execution_engine2/sdk/SDKMethodRunner.py | 2 -- test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py | 8 -------- 2 files changed, 10 deletions(-) diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index acde98f5b..3164bfa82 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -90,8 +90,6 @@ def __init__( expire=self.JOB_PERMISSION_CACHE_EXPIRE_TIME, ) - self.is_admin = False - # self.roles = self.roles_cache.get_roles(user_id,token) or list() self._ee2_runjob = None self._ee2_status = None self._ee2_logs = None diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index d965cfb83..9a05286c5 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -263,8 +263,6 @@ def 
test_cancel_job2(self, rq_mock, condor_mock): runner = self.getRunner() runner.workspace_auth = MagicMock() runner.auth.get_user = MagicMock(return_value=user_name) - runner.is_admin = True - runner._is_admin = MagicMock(return_value=True) runner.workspace_auth.can_read = MagicMock(return_value=True) runner.get_permissions_for_workspace = MagicMock(return_value=True) @@ -946,7 +944,6 @@ def test_check_jobs_date_range(self, condor_mock): runner.workspace_auth = MagicMock() runner.auth.get_user = MagicMock(return_value=user_name) - runner.is_admin = True runner.check_is_admin = MagicMock(return_value=True) runner.workspace_auth.can_read = MagicMock(return_value=True) @@ -959,8 +956,6 @@ def test_check_jobs_date_range(self, condor_mock): # fixed_rj = RunJob(runner) # fixed_rj._get_module_git_commit = MagicMock(return_value='hash_goes_here') - runner._get_module_git_commit = MagicMock(return_value="git_commit_goes_here") - runner.get_condor = MagicMock(return_value=condor_mock) # ctx = {"user_id": self.user_id, "wsid": self.ws_id, "token": self.token} job = get_example_job().to_mongo().to_dict() @@ -1060,7 +1055,6 @@ def test_check_jobs_date_range(self, condor_mock): # runner.get_permissions_for_workspace = MagicMock( # return_value=SDKMethodRunner.WorkspacePermissions.ADMINISTRATOR # ) - runner.is_admin = MagicMock(return_value=True) print( "Test case 1. Retrieving Jobs from last_week and tomorrow_max (yesterday and now jobs) " @@ -1129,7 +1123,6 @@ def test_check_jobs_date_range(self, condor_mock): ) print("Test case 2B. Same as above but with FAKE user (NO ADMIN) ") - runner.is_admin = False runner.check_is_admin = MagicMock(return_value=False) with self.assertRaisesRegex( AuthError, @@ -1143,7 +1136,6 @@ def test_check_jobs_date_range(self, condor_mock): print("Exception raised is", error) print("Test case 2C. 
Same as above but with FAKE_TEST_USER + ADMIN) ") - runner.is_admin = True runner.check_is_admin = MagicMock(return_value=True) job_state = runner.check_jobs_date_range_for_user( creation_end_time=str(tomorrow), From 4865f1061982cf33b058e03dd2287131f2067bbe Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 5 Apr 2021 22:26:21 -0700 Subject: [PATCH 041/109] Add sdkmr.save_and_return_job (#346) Needed for mocking the case where a Job object has the save() method called on it and is then continued to be used. The save() method has a side effect of modifying the Job object to add an `.id` field, and so to mock that the method that calls save() on the job must return the updated job. --- lib/execution_engine2/sdk/SDKMethodRunner.py | 11 +++++++++-- test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py | 12 ++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 3164bfa82..11d91163b 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -248,13 +248,20 @@ def check_as_concierge(self): # at this point since MongoEngine creates a global connection to MongoDB # and makes it available to all the model objects. - def save_job(self, job: Job): + def save_job(self, job: Job) -> str: """ - Save a job record to the Mongo database. + Save a job record to the Mongo database and return the job's ID as a string. """ job.save() return str(job.id) + def save_and_return_job(self, job: Job) -> Job: + """ + Save a job record to the Mongo database and return the updated job. + """ + job.save() + return job + def get_job_counts(self, job_filter): """ Get the number of jobs matching a filter. 
diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index 9a05286c5..ec020de82 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -195,6 +195,18 @@ def test_save_job(self): j.save.assert_called_once_with() + def test_save_and_return_job(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + cliset = UserClientSet("user", "token", ws, wsa) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + sdkmr = SDKMethodRunner(cliset, clients_and_mocks[ClientSet]) + + j = create_autospec(Job, spec_set=True, instance=True) + assert sdkmr.save_and_return_job(j) == j + + j.save.assert_called_once_with() + # Status @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_cancel_job(self, condor): From 0ffee2512eee26f019684e65d9296872348675bb Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 6 Apr 2021 08:29:53 -0700 Subject: [PATCH 042/109] DATAUP-389 - remove some MongoEngine context managers (#347) * Remove mongo engine context from to be altered files Not needed as per https://github.com/kbase/execution_engine2/issues/287 Removing in a commit prior to integrating the new job requirements resolution code to make that change easier to read * run black --- lib/execution_engine2/sdk/EE2Runjob.py | 22 +- .../ee2_SDKMethodRunner_test.py | 1210 ++++++++--------- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 227 ++-- test/tests_for_sdkmr/ee2_load_test.py | 925 ++++++------- 4 files changed, 1160 insertions(+), 1224 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 6e4b1cfe7..8faa0db48 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -289,15 +289,14 @@ def _create_parent_job(self, wsid, meta): job_input.narrative_cell_info.cell_id = meta.get("cell_id") 
job_input.narrative_cell_info.status = meta.get("status") - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - j = Job( - job_input=job_input, - batch_job=True, - status=Status.created.value, - wsid=wsid, - user=self.sdkmr.user_id, - ) - j.save() + j = Job( + job_input=job_input, + batch_job=True, + status=Status.created.value, + wsid=wsid, + user=self.sdkmr.user_id, + ) + j.save() # TODO Do we need a new kafka call? self.sdkmr.kafka_client.send_kafka_message( @@ -319,9 +318,8 @@ def _run_batch(self, parent_job: Job, params): self._abort_child_jobs(child_jobs) raise e - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - parent_job.child_jobs = child_jobs - parent_job.save() + parent_job.child_jobs = child_jobs + parent_job.save() return child_jobs diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index ec020de82..c38861df3 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -214,12 +214,11 @@ def test_cancel_job(self, condor): sdk = self.getRunner() sdk.condor = condor - with sdk.get_mongo_util().mongo_engine_connection(): - job = get_example_job() - job.user = self.user_id - job.wsid = self.ws_id - job.save() - job_id = job.id + job = get_example_job() + job.user = self.user_id + job.wsid = self.ws_id + job.save() + job_id = job.id logging.info( f"Created job in wsid={job.wsid} status={job.status} scheduler={job.scheduler_id}. About to cancel {job_id}" @@ -236,11 +235,10 @@ def test_cancel_job(self, condor): TerminatedCode.terminated_by_user, ) - with sdk.get_mongo_util().mongo_engine_connection(): - job = get_example_job() - job.user = self.user_id - job.wsid = self.ws_id - job_id = job.save().id + job = get_example_job() + job.user = self.user_id + job.wsid = self.ws_id + job_id = job.save().id logging.info( f"Created job {job_id} in {job.wsid} status {job.status}. 
About to cancel" @@ -371,50 +369,49 @@ def test_check_job_canceled(self, mongo_util): # # runner = self.getRunner() - with self.mongo_util.mongo_engine_connection(): - job_id = self.create_job_rec() - - call_count = 0 - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertFalse(rv["finished"]) - call_count += 1 - # estimating - runner.update_job_status(job_id=job_id, status=Status.estimating.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertFalse(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.queued.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertFalse(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.running.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertFalse(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.completed.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertTrue(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.error.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertTrue(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.terminated.value) - rv = runner.check_job_canceled(job_id) - self.assertTrue(rv["canceled"]) - self.assertTrue(rv["finished"]) - call_count += 1 + job_id = self.create_job_rec() + + call_count = 0 + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertFalse(rv["finished"]) + call_count += 1 + # estimating + runner.update_job_status(job_id=job_id, status=Status.estimating.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertFalse(rv["finished"]) + call_count += 1 + + 
runner.update_job_status(job_id=job_id, status=Status.queued.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertFalse(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, status=Status.running.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertFalse(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, status=Status.completed.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertTrue(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, status=Status.error.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertTrue(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, status=Status.terminated.value) + rv = runner.check_job_canceled(job_id) + self.assertTrue(rv["canceled"]) + self.assertTrue(rv["finished"]) + call_count += 1 @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -573,85 +570,79 @@ def test_run_job_and_add_log(self, rq_mock, condor_mock): @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_finish_job(self, condor): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) - runner = self.getRunner() - runner._test_job_permissions = MagicMock(return_value=True) - runner.catalog_utils.get_catalog().log_exec_stats = MagicMock( - 
return_value=True - ) + runner = self.getRunner() + runner._test_job_permissions = MagicMock(return_value=True) + runner.catalog_utils.get_catalog().log_exec_stats = MagicMock(return_value=True) - # test missing job_id input - with self.assertRaises(ValueError) as context1: - logging.info("Finish Job Case 0 Raises Error") - runner.finish_job(job_id=None) - self.assertEqual("Please provide a valid job id", str(context1.exception)) - - # test finish job with invalid status (This was removed) - # with self.assertRaises(ValueError) as context2: - # logging.info("Finish Job Case 1 Raises Error") - # runner.finish_job(job_id=job_id) - # self.assertIn("Unexpected job status", str(context2.exception)) - - # update job status to running - - runner.start_job(job_id=job_id, skip_estimation=True) - - # self.mongo_util.update_job_status(job_id=job_id, status=Status.running.value) - # job.running = datetime.datetime.utcnow() - # job.save() - - # test finish job without error - job_output = dict() - job_output["version"] = "1" - job_output["id"] = "5d54bdcb9b402d15271b3208" # A valid objectid - job_output["result"] = {"output": "output"} - logging.info("Case2 : Finish a running job") - - print(f"About to finish job {job_id}. 
The job status is currently") - print(runner.get_job_status_field(job_id)) - try: - runner.finish_job(job_id=job_id, job_output=job_output) - except: - pass - print("Job is now finished, status is") - print(runner.get_job_status_field(job_id)) - self.assertEqual( - {"status": "completed"}, runner.get_job_status_field(job_id) - ) + # test missing job_id input + with self.assertRaises(ValueError) as context1: + logging.info("Finish Job Case 0 Raises Error") + runner.finish_job(job_id=None) + self.assertEqual("Please provide a valid job id", str(context1.exception)) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, Status.completed.value) - self.assertFalse(job.errormsg) - self.assertTrue(job.finished) - # if job_output not a dict# - # job_output2 = job.job_output.to_mongo().to_dict() - job_output2 = job.job_output - self.assertEqual(job_output2["version"], "1") - self.assertEqual(str(job_output2["id"]), job_output["id"]) - - # update finished status to running - with self.assertRaises(InvalidStatusTransitionException): - self.mongo_util.update_job_status( - job_id=job_id, status=Status.running.value - ) + # test finish job with invalid status (This was removed) + # with self.assertRaises(ValueError) as context2: + # logging.info("Finish Job Case 1 Raises Error") + # runner.finish_job(job_id=job_id) + # self.assertIn("Unexpected job status", str(context2.exception)) + + # update job status to running + + runner.start_job(job_id=job_id, skip_estimation=True) + + # self.mongo_util.update_job_status(job_id=job_id, status=Status.running.value) + # job.running = datetime.datetime.utcnow() + # job.save() + + # test finish job without error + job_output = dict() + job_output["version"] = "1" + job_output["id"] = "5d54bdcb9b402d15271b3208" # A valid objectid + job_output["result"] = {"output": "output"} + logging.info("Case2 : Finish a running job") + + print(f"About to finish job {job_id}. 
The job status is currently") + print(runner.get_job_status_field(job_id)) + try: + runner.finish_job(job_id=job_id, job_output=job_output) + except: + pass + print("Job is now finished, status is") + print(runner.get_job_status_field(job_id)) + self.assertEqual({"status": "completed"}, runner.get_job_status_field(job_id)) + + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, Status.completed.value) + self.assertFalse(job.errormsg) + self.assertTrue(job.finished) + # if job_output not a dict# + # job_output2 = job.job_output.to_mongo().to_dict() + job_output2 = job.job_output + self.assertEqual(job_output2["version"], "1") + self.assertEqual(str(job_output2["id"]), job_output["id"]) + + # update finished status to running + with self.assertRaises(InvalidStatusTransitionException): + self.mongo_util.update_job_status( + job_id=job_id, status=Status.running.value + ) @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_finish_job_with_error_message(self, condor): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - job = self.mongo_util.get_job(job_id=job_id) - new_count = Job.objects.count() - self.assertEqual(ori_job_count, new_count - 1) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + job = self.mongo_util.get_job(job_id=job_id) + new_count = Job.objects.count() + self.assertEqual(ori_job_count, new_count - 1) runner = self.getRunner() condor._get_job_info = MagicMock(return_value={}) @@ -679,10 +670,8 @@ def test_finish_job_with_error_message(self, condor): self.assertIsNone(job.error) self.assertTrue(job.finished) - with self.mongo_util.mongo_engine_connection(): - job_id = runner.update_job_status( - job_id, "running" - ) # put job back to running status + # put job back to running status + job_id = runner.update_job_status(job_id, "running") error = { "message": "error message", @@ -714,45 +703,44 @@ def 
test_check_job_global_perm(self, rq_mock): user_roles=[], ) ) - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) - - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - self.assertFalse(job.running) - self.assertFalse(job.estimating) - - # test check_job - runner = self.getRunner() - job_state = runner.check_job(job_id) - json.dumps(job_state) # make sure it's JSON serializable - self.assertTrue(validate_job_state(job_state)) - self.assertEqual(job_state["status"], "created") - self.assertEqual(job_state["wsid"], self.ws_id) - - self.assertAlmostEqual( - job_state["created"] / 1000.0, job_state["updated"] / 1000.0, places=-1 - ) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - # test globally - job_states = runner.get_jobs_status().check_workspace_jobs(self.ws_id) - self.assertTrue(job_id in job_states) - self.assertEqual(job_states[job_id]["status"], "created") + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + self.assertFalse(job.running) + self.assertFalse(job.estimating) - # now test with a different user - with open(self.config_file) as cf: - other_method_runner = SDKMethodRunner( - get_user_client_set(self.cfg, "some_other_user", "other_token"), - get_client_set(self.cfg, self.config_file, cf), - ) - job_states = other_method_runner.get_jobs_status().check_workspace_jobs( - self.ws_id + # test check_job + runner = self.getRunner() + job_state = runner.check_job(job_id) + json.dumps(job_state) # make sure it's JSON serializable + self.assertTrue(validate_job_state(job_state)) + self.assertEqual(job_state["status"], "created") + self.assertEqual(job_state["wsid"], self.ws_id) + + self.assertAlmostEqual( + 
job_state["created"] / 1000.0, job_state["updated"] / 1000.0, places=-1 + ) + + # test globally + job_states = runner.get_jobs_status().check_workspace_jobs(self.ws_id) + self.assertTrue(job_id in job_states) + self.assertEqual(job_states[job_id]["status"], "created") + + # now test with a different user + with open(self.config_file) as cf: + other_method_runner = SDKMethodRunner( + get_user_client_set(self.cfg, "some_other_user", "other_token"), + get_client_set(self.cfg, self.config_file, cf), ) - self.assertTrue(job_id in job_states) - self.assertEqual(job_states[job_id]["status"], "created") + job_states = other_method_runner.get_jobs_status().check_workspace_jobs( + self.ws_id + ) + self.assertTrue(job_id in job_states) + self.assertEqual(job_states[job_id]["status"], "created") @requests_mock.Mocker() def test_check_job_ok(self, rq_mock): @@ -763,165 +751,144 @@ def test_check_job_ok(self, rq_mock): ) ) - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - job_id_1 = self.create_job_rec() - job_id_fake = str(bson.objectid.ObjectId()) - print(f"Saved job_id {job_id}") - print(f"Saved job_id_1 {job_id_1}") - print(f"Created fake {job_id_fake}") - - new_count = Job.objects.count() - self.assertEqual(ori_job_count, new_count - 2) - - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - self.assertFalse(job.running) - self.assertFalse(job.estimating) - - runner = self.getRunner() - runner._test_job_permissions = MagicMock(return_value=True) - - # test missing job_id input - with self.assertRaises(ValueError) as context: - runner.check_job(None) - self.assertEqual("Please provide valid job_id", str(context.exception)) - - # test check_job in a regular way - job_state = runner.check_job(job_id) - json.dumps(job_state) # make sure it's JSON serializable - self.assertTrue(validate_job_state(job_state)) - 
self.assertEqual(job_state["status"], "created") - self.assertEqual(job_state["wsid"], self.ws_id) - # Test both - job_state1 = runner.check_job(job_id_1) - self.assertEqual(job_state1["status"], "created") - - print(f'Job status of {job_id}={job_state["status"]}') - print(f'Job status of {job_id_1}={job_state1["status"]}') - - # test check_job with exclude_fields - job_state_exclude = runner.check_job(job_id, exclude_fields=["status"]) - self.assertFalse("status" in job_state_exclude.keys()) - self.assertEqual(job_state_exclude["wsid"], self.ws_id) - - # test check_job with exclude_fields - job_state_exclude2 = runner.check_job(job_id, exclude_fields=["status"]) - self.assertFalse("status" in job_state_exclude2.keys()) - self.assertEqual(job_state_exclude2["wsid"], self.ws_id) - - # test check_jobs - job_states_rl_0 = runner.check_jobs( - [job_id, job_id_1, job_id_fake], return_list=0 - ) - logging.info( - json.dumps(job_states_rl_0) - ) # make sure it's JSON serializable - self.assertEqual(len(job_states_rl_0.keys()), 3) - self.assertEqual(list(job_states_rl_0.keys())[0], job_id) - self.assertEqual(list(job_states_rl_0.keys())[1], job_id_1) - self.assertEqual(list(job_states_rl_0.keys())[2], job_id_fake) - self.assertTrue(validate_job_state(job_states_rl_0[job_id])) - self.assertTrue(job_id in job_states_rl_0) - self.assertEqual(job_states_rl_0[job_id]["status"], "created") - self.assertEqual(job_states_rl_0[job_id]["wsid"], self.ws_id) - - # test check_jobs return list - job_states_rl_1 = runner.check_jobs( - [job_id, job_id_1, job_id_fake], return_list=1 - )["job_states"] - json.dumps(job_states_rl_1) # make sure it's JSON serializable - self.assertEqual(len(job_states_rl_1), 3) - self.assertEqual(job_states_rl_1[0]["job_id"], job_id) - self.assertEqual(job_states_rl_1[1]["job_id"], job_id_1) - self.assertEqual(job_states_rl_1[2], []) - self.assertTrue(isinstance(job_states_rl_1, list)) - print(type(job_states_rl_1)) - self.assertCountEqual(job_states_rl_1, 
list(job_states_rl_0.values())) - - job_states_list_rl_t = runner.check_jobs( - [job_id, job_id_1], return_list="True" - )["job_states"] - json.dumps(job_states_list_rl_t) # make sure it's JSON serializable - self.assertEqual(job_states_list_rl_t[0]["job_id"], job_id) - self.assertEqual(job_states_list_rl_t[1]["job_id"], job_id_1) - self.assertTrue(isinstance(job_states_list_rl_t, list)) - self.assertCountEqual( - job_states_list_rl_t, list(job_states_rl_0.values())[:2] - ) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + job_id_1 = self.create_job_rec() + job_id_fake = str(bson.objectid.ObjectId()) + print(f"Saved job_id {job_id}") + print(f"Saved job_id_1 {job_id_1}") + print(f"Created fake {job_id_fake}") - # test check_jobs with exclude_fields - job_states_rl0_exclude_wsid = runner.check_jobs( - [job_id], exclude_fields=["wsid"], return_list=0 - ) - self.assertTrue(job_id in job_states_rl0_exclude_wsid) - self.assertFalse("wsid" in job_states_rl0_exclude_wsid[job_id].keys()) - self.assertEqual(job_states_rl0_exclude_wsid[job_id]["status"], "created") - - # test check_workspace_jobs - job_states_from_workspace_check = ( - runner.get_jobs_status().check_workspace_jobs( - self.ws_id, return_list="False" - ) - ) - for job_id_from_wsid in job_states_from_workspace_check: - self.assertTrue(job_states_from_workspace_check[job_id_from_wsid]) - print("Job States are") - for job_key in job_states_from_workspace_check: - if job_key in job_states_rl_1: - print( - job_key, - job_states_from_workspace_check[job_key]["status"], - runner.check_job(job_id=job_key)["status"], - job_states_rl_0[job], - ) + new_count = Job.objects.count() + self.assertEqual(ori_job_count, new_count - 2) - json.dumps( - job_states_from_workspace_check - ) # make sure it's JSON serializable - self.assertTrue(job_id in job_states_from_workspace_check) - self.assertEqual( - job_states_from_workspace_check[job_id]["status"], "created" - ) - self.assertEqual( - 
job_states_from_workspace_check[job_id]["wsid"], self.ws_id - ) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + self.assertFalse(job.running) + self.assertFalse(job.estimating) - self.assertTrue(job_id_1 in job_states_from_workspace_check) - self.assertEqual( - job_states_from_workspace_check[job_id_1]["status"], "created" - ) - self.assertEqual( - job_states_from_workspace_check[job_id_1]["wsid"], self.ws_id - ) + runner = self.getRunner() + runner._test_job_permissions = MagicMock(return_value=True) - # test check_workspace_jobs with exclude_fields - job_states_with_exclude_wsid = ( - runner.get_jobs_status().check_workspace_jobs( - self.ws_id, exclude_fields=["wsid"], return_list=False + # test missing job_id input + with self.assertRaises(ValueError) as context: + runner.check_job(None) + self.assertEqual("Please provide valid job_id", str(context.exception)) + + # test check_job in a regular way + job_state = runner.check_job(job_id) + json.dumps(job_state) # make sure it's JSON serializable + self.assertTrue(validate_job_state(job_state)) + self.assertEqual(job_state["status"], "created") + self.assertEqual(job_state["wsid"], self.ws_id) + # Test both + job_state1 = runner.check_job(job_id_1) + self.assertEqual(job_state1["status"], "created") + + print(f'Job status of {job_id}={job_state["status"]}') + print(f'Job status of {job_id_1}={job_state1["status"]}') + + # test check_job with exclude_fields + job_state_exclude = runner.check_job(job_id, exclude_fields=["status"]) + self.assertFalse("status" in job_state_exclude.keys()) + self.assertEqual(job_state_exclude["wsid"], self.ws_id) + + # test check_job with exclude_fields + job_state_exclude2 = runner.check_job(job_id, exclude_fields=["status"]) + self.assertFalse("status" in job_state_exclude2.keys()) + self.assertEqual(job_state_exclude2["wsid"], self.ws_id) + + # test check_jobs + job_states_rl_0 = runner.check_jobs( + [job_id, 
job_id_1, job_id_fake], return_list=0 + ) + logging.info(json.dumps(job_states_rl_0)) # make sure it's JSON serializable + self.assertEqual(len(job_states_rl_0.keys()), 3) + self.assertEqual(list(job_states_rl_0.keys())[0], job_id) + self.assertEqual(list(job_states_rl_0.keys())[1], job_id_1) + self.assertEqual(list(job_states_rl_0.keys())[2], job_id_fake) + self.assertTrue(validate_job_state(job_states_rl_0[job_id])) + self.assertTrue(job_id in job_states_rl_0) + self.assertEqual(job_states_rl_0[job_id]["status"], "created") + self.assertEqual(job_states_rl_0[job_id]["wsid"], self.ws_id) + + # test check_jobs return list + job_states_rl_1 = runner.check_jobs( + [job_id, job_id_1, job_id_fake], return_list=1 + )["job_states"] + json.dumps(job_states_rl_1) # make sure it's JSON serializable + self.assertEqual(len(job_states_rl_1), 3) + self.assertEqual(job_states_rl_1[0]["job_id"], job_id) + self.assertEqual(job_states_rl_1[1]["job_id"], job_id_1) + self.assertEqual(job_states_rl_1[2], []) + self.assertTrue(isinstance(job_states_rl_1, list)) + print(type(job_states_rl_1)) + self.assertCountEqual(job_states_rl_1, list(job_states_rl_0.values())) + + job_states_list_rl_t = runner.check_jobs( + [job_id, job_id_1], return_list="True" + )["job_states"] + json.dumps(job_states_list_rl_t) # make sure it's JSON serializable + self.assertEqual(job_states_list_rl_t[0]["job_id"], job_id) + self.assertEqual(job_states_list_rl_t[1]["job_id"], job_id_1) + self.assertTrue(isinstance(job_states_list_rl_t, list)) + self.assertCountEqual(job_states_list_rl_t, list(job_states_rl_0.values())[:2]) + + # test check_jobs with exclude_fields + job_states_rl0_exclude_wsid = runner.check_jobs( + [job_id], exclude_fields=["wsid"], return_list=0 + ) + self.assertTrue(job_id in job_states_rl0_exclude_wsid) + self.assertFalse("wsid" in job_states_rl0_exclude_wsid[job_id].keys()) + self.assertEqual(job_states_rl0_exclude_wsid[job_id]["status"], "created") + + # test check_workspace_jobs + 
job_states_from_workspace_check = runner.get_jobs_status().check_workspace_jobs( + self.ws_id, return_list="False" + ) + for job_id_from_wsid in job_states_from_workspace_check: + self.assertTrue(job_states_from_workspace_check[job_id_from_wsid]) + print("Job States are") + for job_key in job_states_from_workspace_check: + if job_key in job_states_rl_1: + print( + job_key, + job_states_from_workspace_check[job_key]["status"], + runner.check_job(job_id=job_key)["status"], + job_states_rl_0[job], ) - ) - logging.info( - json.dumps(job_states_with_exclude_wsid) - ) # make sure it's JSON serializable - self.assertTrue(job_id in job_states_with_exclude_wsid) - self.assertFalse("wsid" in job_states_with_exclude_wsid[job_id].keys()) - self.assertEqual(job_states_with_exclude_wsid[job_id]["status"], "created") - self.assertTrue(job_id_1 in job_states_with_exclude_wsid) - self.assertFalse("wsid" in job_states_with_exclude_wsid[job_id_1].keys()) - self.assertEqual( - job_states_with_exclude_wsid[job_id_1]["status"], "created" - ) + json.dumps(job_states_from_workspace_check) # make sure it's JSON serializable + self.assertTrue(job_id in job_states_from_workspace_check) + self.assertEqual(job_states_from_workspace_check[job_id]["status"], "created") + self.assertEqual(job_states_from_workspace_check[job_id]["wsid"], self.ws_id) - with self.assertRaises(PermissionError) as e: - runner.get_jobs_status().check_workspace_jobs(1234) - self.assertIn( - f"User {self.user_id} does not have permission to read jobs in workspace {1234}", - str(e.exception), - ) + self.assertTrue(job_id_1 in job_states_from_workspace_check) + self.assertEqual(job_states_from_workspace_check[job_id_1]["status"], "created") + self.assertEqual(job_states_from_workspace_check[job_id_1]["wsid"], self.ws_id) + + # test check_workspace_jobs with exclude_fields + job_states_with_exclude_wsid = runner.get_jobs_status().check_workspace_jobs( + self.ws_id, exclude_fields=["wsid"], return_list=False + ) + + 
logging.info( + json.dumps(job_states_with_exclude_wsid) + ) # make sure it's JSON serializable + self.assertTrue(job_id in job_states_with_exclude_wsid) + self.assertFalse("wsid" in job_states_with_exclude_wsid[job_id].keys()) + self.assertEqual(job_states_with_exclude_wsid[job_id]["status"], "created") + self.assertTrue(job_id_1 in job_states_with_exclude_wsid) + self.assertFalse("wsid" in job_states_with_exclude_wsid[job_id_1].keys()) + self.assertEqual(job_states_with_exclude_wsid[job_id_1]["status"], "created") + + with self.assertRaises(PermissionError) as e: + runner.get_jobs_status().check_workspace_jobs(1234) + self.assertIn( + f"User {self.user_id} does not have permission to read jobs in workspace {1234}", + str(e.exception), + ) @staticmethod def create_job_from_job(job, new_job_id): @@ -936,16 +903,15 @@ def create_job_from_job(job, new_job_id): return j def replace_job_id(self, job1, new_id): - with self.mongo_util.mongo_engine_connection(): - job2 = self.create_job_from_job(job1, new_id) - job2.save() - print( - "Saved job with id", - job2.id, - job2.id.generation_time, - job2.id.generation_time.timestamp(), - ) - job1.delete() + job2 = self.create_job_from_job(job1, new_id) + job2.save() + print( + "Saved job with id", + job2.id, + job2.id.generation_time, + job2.id.generation_time.timestamp(), + ) + job1.delete() # flake8: noqa: C901 @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -1009,340 +975,334 @@ def test_check_jobs_date_range(self, condor_mock): print(f"Tomorrow: {tomorrow} ts: {tomorrow.timestamp()}") print(f"Day after: {day_after} ts: {day_after.timestamp()}") - with self.mongo_util.mongo_engine_connection(): - # Last Month - job = Job.objects.with_id(job_id1) # type : Job - new_id_last_month = ObjectId.from_datetime(last_month) - print(last_month, new_id_last_month, new_id_last_month.generation_time) - - print("About to replace job id") - print(job) - print(new_id_last_month) - self.replace_job_id(job, 
new_id_last_month) - new_job_ids.append(str(new_id_last_month)) - - # Last week - job = Job.objects.with_id(job_id2) # type : Job - new_id_last_week = ObjectId.from_datetime(last_week) - self.replace_job_id(job, new_id_last_week) - new_job_ids.append(str(new_id_last_week)) - - # Yesterday - job = Job.objects.with_id(job_id3) # type : Job - new_id_yesterday = ObjectId.from_datetime(yesterday) - self.replace_job_id(job, new_id_yesterday) - new_job_ids.append(str(new_id_yesterday)) - - # Now - job = Job.objects.with_id(job_id4) # type : Job - new_id_now = ObjectId.from_datetime(now) - self.replace_job_id(job, new_id_now) - new_job_ids.append(str(new_id_now)) - - # Tomorrow - job = Job.objects.with_id(job_id5) # type : Job - new_id_tomorrow = ObjectId.from_datetime(tomorrow) - self.replace_job_id(job, new_id_tomorrow) - new_job_ids.append(str(new_id_tomorrow)) - - # Day After - job = Job.objects.with_id(job_id6) # type : Job - new_id_day_after = ObjectId.from_datetime(day_after) - self.replace_job_id(job, new_id_day_after) - new_job_ids.append(str(new_id_day_after)) + # Last Month + job = Job.objects.with_id(job_id1) # type : Job + new_id_last_month = ObjectId.from_datetime(last_month) + print(last_month, new_id_last_month, new_id_last_month.generation_time) + + print("About to replace job id") + print(job) + print(new_id_last_month) + self.replace_job_id(job, new_id_last_month) + new_job_ids.append(str(new_id_last_month)) + + # Last week + job = Job.objects.with_id(job_id2) # type : Job + new_id_last_week = ObjectId.from_datetime(last_week) + self.replace_job_id(job, new_id_last_week) + new_job_ids.append(str(new_id_last_week)) + + # Yesterday + job = Job.objects.with_id(job_id3) # type : Job + new_id_yesterday = ObjectId.from_datetime(yesterday) + self.replace_job_id(job, new_id_yesterday) + new_job_ids.append(str(new_id_yesterday)) + + # Now + job = Job.objects.with_id(job_id4) # type : Job + new_id_now = ObjectId.from_datetime(now) + self.replace_job_id(job, 
new_id_now) + new_job_ids.append(str(new_id_now)) + + # Tomorrow + job = Job.objects.with_id(job_id5) # type : Job + new_id_tomorrow = ObjectId.from_datetime(tomorrow) + self.replace_job_id(job, new_id_tomorrow) + new_job_ids.append(str(new_id_tomorrow)) + + # Day After + job = Job.objects.with_id(job_id6) # type : Job + new_id_day_after = ObjectId.from_datetime(day_after) + self.replace_job_id(job, new_id_day_after) + new_job_ids.append(str(new_id_day_after)) # JOB ID GETS GENERATED HERE - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) - - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - self.false = self.assertFalse(job.running) - self.assertFalse(job.estimating) - - runner.check_permission_for_job = MagicMock(return_value=True) - # runner.get_permissions_for_workspace = MagicMock( - # return_value=SDKMethodRunner.WorkspacePermissions.ADMINISTRATOR - # ) - - print( - "Test case 1. Retrieving Jobs from last_week and tomorrow_max (yesterday and now jobs) " - ) - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=last_week.timestamp(), # test timestamp input - user="ALL", - ) - count = 0 - for js in job_state["jobs"]: - job_id = js["job_id"] - print("Job is id", job_id) - if job_id in new_job_ids: - count += 1 - self.assertIn(js["status"], ["created", "queued"]) - print(js["created"]) - print(type(js["created"])) - date = SDKMethodRunner.check_and_convert_time(js["created"]) - ts = date - print( - f"Creation date {date}, LastWeek:{last_week}, Tomorrow{tomorrow})" - ) - print(ts, last_week.timestamp()) - self.assertTrue(float(ts) >= last_week.timestamp()) - print(ts, tomorrow.timestamp()) - self.assertTrue(float(ts) <= tomorrow.timestamp()) - self.assertEqual(2, count) - - print( - "Test case 2A. 
Retrieving Jobs from last_month and tomorrow_max (last_month, last_week, yesterday and now jobs) " - ) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str( - tomorrow.timestamp() - ), # test timestamp string input - creation_start_time=last_month_and_1_hour, # test datetime input - user="ALL", - ) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + self.false = self.assertFalse(job.running) + self.assertFalse(job.estimating) - count = 0 - for js in job_state["jobs"]: - job_id = js["job_id"] - print("Job is id", job_id) - if job_id in new_job_ids: - count += 1 - self.assertIn(js["status"], ["created", "queued"]) - ts = SDKMethodRunner.check_and_convert_time(js["created"]) - print(f"Timestamp: {ts}") - self.assertTrue(ts > last_month_and_1_hour.timestamp()) - self.assertTrue(ts < tomorrow.timestamp()) - self.assertEqual(4, count) - - print("Found all of the jobs", len(new_job_ids)) - - with self.assertRaises(Exception) as context: - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(yesterday), - creation_start_time=str(tomorrow), - user="ALL", - ) - self.assertEqual( - "The start date cannot be greater than the end date.", - str(context.exception), - ) + runner.check_permission_for_job = MagicMock(return_value=True) + # runner.get_permissions_for_workspace = MagicMock( + # return_value=SDKMethodRunner.WorkspacePermissions.ADMINISTRATOR + # ) - print("Test case 2B. 
Same as above but with FAKE user (NO ADMIN) ") - runner.check_is_admin = MagicMock(return_value=False) - with self.assertRaisesRegex( - AuthError, - "You are not authorized to view all records or records for others.", - ) as error: - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="FAKE", + print( + "Test case 1. Retrieving Jobs from last_week and tomorrow_max (yesterday and now jobs) " + ) + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=last_week.timestamp(), # test timestamp input + user="ALL", + ) + count = 0 + for js in job_state["jobs"]: + job_id = js["job_id"] + print("Job is id", job_id) + if job_id in new_job_ids: + count += 1 + self.assertIn(js["status"], ["created", "queued"]) + print(js["created"]) + print(type(js["created"])) + date = SDKMethodRunner.check_and_convert_time(js["created"]) + ts = date + print( + f"Creation date {date}, LastWeek:{last_week}, Tomorrow{tomorrow})" ) - print("Exception raised is", error) + print(ts, last_week.timestamp()) + self.assertTrue(float(ts) >= last_week.timestamp()) + print(ts, tomorrow.timestamp()) + self.assertTrue(float(ts) <= tomorrow.timestamp()) + self.assertEqual(2, count) - print("Test case 2C. Same as above but with FAKE_TEST_USER + ADMIN) ") - runner.check_is_admin = MagicMock(return_value=True) + print( + "Test case 2A. 
Retrieving Jobs from last_month and tomorrow_max (last_month, last_week, yesterday and now jobs) " + ) + + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow.timestamp()), # test timestamp string input + creation_start_time=last_month_and_1_hour, # test datetime input + user="ALL", + ) + + count = 0 + for js in job_state["jobs"]: + job_id = js["job_id"] + print("Job is id", job_id) + if job_id in new_job_ids: + count += 1 + self.assertIn(js["status"], ["created", "queued"]) + ts = SDKMethodRunner.check_and_convert_time(js["created"]) + print(f"Timestamp: {ts}") + self.assertTrue(ts > last_month_and_1_hour.timestamp()) + self.assertTrue(ts < tomorrow.timestamp()) + self.assertEqual(4, count) + + print("Found all of the jobs", len(new_job_ids)) + + with self.assertRaises(Exception) as context: job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user=user_name, + creation_end_time=str(yesterday), + creation_start_time=str(tomorrow), + user="ALL", + ) + self.assertEqual( + "The start date cannot be greater than the end date.", + str(context.exception), ) - count = 0 - for js in job_state["jobs"]: - job_id = js["job_id"] - print("Job is id", job_id) - if job_id in new_job_ids: - count += 1 - self.assertIn(js["status"], ["created", "queued"]) - ts = SDKMethodRunner.check_and_convert_time(js["created"]) - print(f"Timestamp: {ts}") - self.assertTrue(ts > last_month_and_1_hour.timestamp()) - self.assertTrue(ts < tomorrow.timestamp()) - - # May need to change this if other db entries get added - self.assertEqual(4, count) - - print("Found all of the jobs", len(new_job_ids)) - - print("Test case 3. 
Assert Raises error") - - with self.assertRaises(Exception) as context: - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(yesterday), - creation_start_time=str(tomorrow), - user="ALL", - ) - self.assertEqual( - "The start date cannot be greater than the end date.", - str(context.exception), - ) - - print("Test case 4, find the original job") + print("Test case 2B. Same as above but with FAKE user (NO ADMIN) ") + runner.check_is_admin = MagicMock(return_value=False) + with self.assertRaisesRegex( + AuthError, + "You are not authorized to view all records or records for others.", + ) as error: job_state = runner.check_jobs_date_range_for_user( creation_end_time=str(tomorrow), creation_start_time=str(last_month_and_1_hour), - user=user_name, + user="FAKE", ) - self.assertTrue(len(job_state["jobs"][0].keys()) > 0) - print(f"Checking {job_id}") + print("Exception raised is", error) - found = False - for job in job_state["jobs"]: - if job_id == job["job_id"]: - found = True + print("Test case 2C. 
Same as above but with FAKE_TEST_USER + ADMIN) ") + runner.check_is_admin = MagicMock(return_value=True) + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user=user_name, + ) - if found is False: - raise Exception("Didn't find the original job") + count = 0 + for js in job_state["jobs"]: + job_id = js["job_id"] + print("Job is id", job_id) + if job_id in new_job_ids: + count += 1 + self.assertIn(js["status"], ["created", "queued"]) + ts = SDKMethodRunner.check_and_convert_time(js["created"]) + print(f"Timestamp: {ts}") + self.assertTrue(ts > last_month_and_1_hour.timestamp()) + self.assertTrue(ts < tomorrow.timestamp()) - print(job_state) + # May need to change this if other db entries get added + self.assertEqual(4, count) - print("Test 5, find the original job, but with projections") - job_states = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user=user_name, - job_projection=["wsid"], - ) - job_state_with_proj = None - for job in job_states["jobs"]: - if job_id == job["job_id"]: - job_state_with_proj = job - - example_job_stat = { - "_id": "5d892ede9ea3d7d3b824dbff", - "authstrat": "kbaseworkspace", - "wsid": 9999, - "updated": "2019-09-23 20:45:19.468032", - "job_id": "5d892ede9ea3d7d3b824dbff", - "created": "2019-09-23 20:45:18+00:00", - } - - required_headers = list(example_job_stat.keys()) - required_headers.append("wsid") - - for member in required_headers: - self.assertIn(member, job_state_with_proj) - self.assertNotIn("status", job_state_with_proj) - - print("Test 6a, find the original job, but with projections and filters") + print("Found all of the jobs", len(new_job_ids)) + + print("Test case 3. 
Assert Raises error") + + with self.assertRaises(Exception) as context: job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), + creation_end_time=str(yesterday), + creation_start_time=str(tomorrow), user="ALL", - job_projection=["wsid", "status"], - job_filter={"wsid": 9999}, + ) + self.assertEqual( + "The start date cannot be greater than the end date.", + str(context.exception), ) - for record in job_state["jobs"]: + print("Test case 4, find the original job") + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user=user_name, + ) + self.assertTrue(len(job_state["jobs"][0].keys()) > 0) + print(f"Checking {job_id}") - print(record) - if record["wsid"] != 9999: - raise Exception("Only records with wsid 9999 should be allowed") - self.assertIn("wsid", record) - self.assertIn("status", record) - self.assertNotIn("service_ver", record) - print("job state is", "len is", len(job_state["jobs"])) + found = False + for job in job_state["jobs"]: + if job_id == job["job_id"]: + found = True - self.assertTrue(len(job_state["jobs"]) >= 1) + if found is False: + raise Exception("Didn't find the original job") - print("Test 6b, find the original job, but with projections and filters") - job_state2 = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="ALL", - job_projection=["wsid", "status"], - job_filter=["wsid=123"], - ) + print(job_state) - for record in job_state2["jobs"]: + print("Test 5, find the original job, but with projections") + job_states = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user=user_name, + job_projection=["wsid"], + ) + job_state_with_proj = None + for job in job_states["jobs"]: + if job_id == job["job_id"]: + 
job_state_with_proj = job + + example_job_stat = { + "_id": "5d892ede9ea3d7d3b824dbff", + "authstrat": "kbaseworkspace", + "wsid": 9999, + "updated": "2019-09-23 20:45:19.468032", + "job_id": "5d892ede9ea3d7d3b824dbff", + "created": "2019-09-23 20:45:18+00:00", + } - if record["wsid"] != 123: - print(record) - print("ID IS", record["wsid"]) - raise Exception("Only records with wsid 123 should be allowed") - self.assertIn("wsid", record) - self.assertIn("status", record) - self.assertNotIn("service_ver", record) + required_headers = list(example_job_stat.keys()) + required_headers.append("wsid") - print(len(job_state2["jobs"])) - self.assertTrue(len(job_state2["jobs"]) > 0) + for member in required_headers: + self.assertIn(member, job_state_with_proj) + self.assertNotIn("status", job_state_with_proj) - print( - "Test 7, find same jobs as test 2 or 3, but also filter, project, and limit" - ) - job_state_limit = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="ALL", - job_projection=["wsid", "status"], - job_filter=["wsid=123"], - limit=2, - ) + print("Test 6a, find the original job, but with projections and filters") + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + job_filter={"wsid": 9999}, + ) - self.assertTrue(len(job_state_limit["jobs"]) > 0) + for record in job_state["jobs"]: + + print(record) + if record["wsid"] != 9999: + raise Exception("Only records with wsid 9999 should be allowed") + self.assertIn("wsid", record) + self.assertIn("status", record) + self.assertNotIn("service_ver", record) + print("job state is", "len is", len(job_state["jobs"])) + + self.assertTrue(len(job_state["jobs"]) >= 1) + + print("Test 6b, find the original job, but with projections and filters") + job_state2 = runner.check_jobs_date_range_for_user( + 
creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + job_filter=["wsid=123"], + ) - print( - "Test 8, ascending and descending (maybe should verify jobs count > 2)" - ) - job_state_limit_asc = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="ALL", - job_projection=["wsid", "status"], - ascending="True", - ) + for record in job_state2["jobs"]: - epoch = datetime.utcfromtimestamp(0) + if record["wsid"] != 123: + print(record) + print("ID IS", record["wsid"]) + raise Exception("Only records with wsid 123 should be allowed") + self.assertIn("wsid", record) + self.assertIn("status", record) + self.assertNotIn("service_ver", record) - job_id_temp = str(ObjectId.from_datetime(epoch)) - for item in job_state_limit_asc["jobs"]: - job_id = item["job_id"] - if ObjectId(job_id) > ObjectId(job_id_temp): - job_id_temp = job_id - else: - raise Exception( - "Not ascending" - + "JobIdPrev" - + str(job_id_temp) - + "JobIdNext" - + str(job_id) - ) + print(len(job_state2["jobs"])) + self.assertTrue(len(job_state2["jobs"]) > 0) - job_state_limit_desc = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="ALL", - job_projection=["wsid", "status"], - ascending="False", - ) + print( + "Test 7, find same jobs as test 2 or 3, but also filter, project, and limit" + ) + job_state_limit = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + job_filter=["wsid=123"], + limit=2, + ) - # TimeDelta Over 9999 days - job_id_temp = str(ObjectId.from_datetime(now + timedelta(days=9999))) + self.assertTrue(len(job_state_limit["jobs"]) > 0) - for item in job_state_limit_desc["jobs"]: - job_id = item["job_id"] - if ObjectId(job_id) < 
ObjectId(job_id_temp): - job_id_temp = job_id - else: - raise Exception( - "Not Descending" - + "JobIdPrev:" - + str(job_id_temp) - + "JobIdNext:" - + str(job_id) - ) + print("Test 8, ascending and descending (maybe should verify jobs count > 2)") + job_state_limit_asc = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + ascending="True", + ) + + epoch = datetime.utcfromtimestamp(0) + + job_id_temp = str(ObjectId.from_datetime(epoch)) + for item in job_state_limit_asc["jobs"]: + job_id = item["job_id"] + if ObjectId(job_id) > ObjectId(job_id_temp): + job_id_temp = job_id + else: + raise Exception( + "Not ascending" + + "JobIdPrev" + + str(job_id_temp) + + "JobIdNext" + + str(job_id) + ) + + job_state_limit_desc = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + ascending="False", + ) + + # TimeDelta Over 9999 days + job_id_temp = str(ObjectId.from_datetime(now + timedelta(days=9999))) + + for item in job_state_limit_desc["jobs"]: + job_id = item["job_id"] + if ObjectId(job_id) < ObjectId(job_id_temp): + job_id_temp = job_id + else: + raise Exception( + "Not Descending" + + "JobIdPrev:" + + str(job_id_temp) + + "JobIdNext:" + + str(job_id) + ) - for key in job_state_limit_desc.keys(): - print(key) - print(job_state_limit_desc[key]) + for key in job_state_limit_desc.keys(): + print(key) + print(job_state_limit_desc[key]) # TODO TEST _finish_job_with_success, TEST finish_job_with_error diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index c20c0ea2b..9c66069e0 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -91,140 +91,137 @@ 
def test_init_ok(self): self.assertTrue(set(class_attri) <= set(runner.__dict__.keys())) def test_init_job_rec(self): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.getRunner() - - job_params = { - "wsid": self.ws_id, - "method": "MEGAHIT.run_megahit", - "app_id": "MEGAHIT/run_megahit", - "service_ver": "2.2.1", - "params": [ - { - "workspace_name": "wjriehl:1475006266615", - "read_library_refs": ["18836/5/1"], - "output_contigset_name": "rhodo_contigs", - "recipe": "auto", - "assembler": None, - "pipeline": None, - "min_contig_len": None, - } - ], - "source_ws_objects": ["a/b/c", "e/d"], - "parent_job_id": "9998", - "meta": {"tag": "dev", "token_id": "12345"}, - } - - job_id = runner.get_runjob()._init_job_rec(self.user_id, job_params) - - self.assertEqual(ori_job_count, Job.objects.count() - 1) - - job = Job.objects.get(id=job_id) - - self.assertEqual(job.user, self.user_id) - self.assertEqual(job.authstrat, "kbaseworkspace") - self.assertEqual(job.wsid, self.ws_id) - - job_input = job.job_input - - self.assertEqual(job_input.wsid, self.ws_id) - self.assertEqual(job_input.method, "MEGAHIT.run_megahit") - self.assertEqual(job_input.app_id, "MEGAHIT/run_megahit") - # TODO this is an integration test - # self.assertEqual(job_input.service_ver, "2.2.1") - self.assertEqual( - job_input.service_ver, "048baf3c2b76cb923b3b4c52008ed77dbe20292d" - ) + ori_job_count = Job.objects.count() + runner = self.getRunner() + + job_params = { + "wsid": self.ws_id, + "method": "MEGAHIT.run_megahit", + "app_id": "MEGAHIT/run_megahit", + "service_ver": "2.2.1", + "params": [ + { + "workspace_name": "wjriehl:1475006266615", + "read_library_refs": ["18836/5/1"], + "output_contigset_name": "rhodo_contigs", + "recipe": "auto", + "assembler": None, + "pipeline": None, + "min_contig_len": None, + } + ], + "source_ws_objects": ["a/b/c", "e/d"], + "parent_job_id": "9998", + "meta": {"tag": "dev", "token_id": "12345"}, + } + + job_id = 
runner.get_runjob()._init_job_rec(self.user_id, job_params) + + self.assertEqual(ori_job_count, Job.objects.count() - 1) + + job = Job.objects.get(id=job_id) + + self.assertEqual(job.user, self.user_id) + self.assertEqual(job.authstrat, "kbaseworkspace") + self.assertEqual(job.wsid, self.ws_id) + + job_input = job.job_input + + self.assertEqual(job_input.wsid, self.ws_id) + self.assertEqual(job_input.method, "MEGAHIT.run_megahit") + self.assertEqual(job_input.app_id, "MEGAHIT/run_megahit") + # TODO this is an integration test + # self.assertEqual(job_input.service_ver, "2.2.1") + self.assertEqual( + job_input.service_ver, "048baf3c2b76cb923b3b4c52008ed77dbe20292d" + ) - self.assertCountEqual(job_input.source_ws_objects, ["a/b/c", "e/d"]) - self.assertEqual(job_input.parent_job_id, "9998") + self.assertCountEqual(job_input.source_ws_objects, ["a/b/c", "e/d"]) + self.assertEqual(job_input.parent_job_id, "9998") - narrative_cell_info = job_input.narrative_cell_info - self.assertEqual(narrative_cell_info.tag, "dev") - self.assertEqual(narrative_cell_info.token_id, "12345") - self.assertFalse(narrative_cell_info.status) + narrative_cell_info = job_input.narrative_cell_info + self.assertEqual(narrative_cell_info.tag, "dev") + self.assertEqual(narrative_cell_info.token_id, "12345") + self.assertFalse(narrative_cell_info.status) - self.assertFalse(job.job_output) + self.assertFalse(job.job_output) - self.mongo_util.get_job(job_id=job_id).delete() - self.assertEqual(ori_job_count, Job.objects.count()) + self.mongo_util.get_job(job_id=job_id).delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_get_job_params(self): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) - - runner = self.getRunner() - runner._test_job_permissions = MagicMock(return_value=True) - params = runner.get_job_params(job_id) - - expected_params_keys = [ - 
"wsid", - "method", - "params", - "service_ver", - "app_id", - "source_ws_objects", - "parent_job_id", - ] - self.assertCountEqual(params.keys(), expected_params_keys) - self.assertEqual(params["wsid"], self.ws_id) - self.assertEqual(params["method"], "MEGAHIT.run_megahit") - self.assertEqual(params["app_id"], "MEGAHIT/run_megahit") - self.assertEqual(params["service_ver"], "2.2.1") - self.assertCountEqual(params["source_ws_objects"], ["a/b/c", "e/d"]) - self.assertEqual(params["parent_job_id"], "9998") - - self.mongo_util.get_job(job_id=job_id).delete() - self.assertEqual(ori_job_count, Job.objects.count()) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) + + runner = self.getRunner() + runner._test_job_permissions = MagicMock(return_value=True) + params = runner.get_job_params(job_id) + + expected_params_keys = [ + "wsid", + "method", + "params", + "service_ver", + "app_id", + "source_ws_objects", + "parent_job_id", + ] + self.assertCountEqual(params.keys(), expected_params_keys) + self.assertEqual(params["wsid"], self.ws_id) + self.assertEqual(params["method"], "MEGAHIT.run_megahit") + self.assertEqual(params["app_id"], "MEGAHIT/run_megahit") + self.assertEqual(params["service_ver"], "2.2.1") + self.assertCountEqual(params["source_ws_objects"], ["a/b/c", "e/d"]) + self.assertEqual(params["parent_job_id"], "9998") + + self.mongo_util.get_job(job_id=job_id).delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_start_job(self): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - 
self.assertFalse(job.running) - self.assertFalse(job.estimating) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + self.assertFalse(job.running) + self.assertFalse(job.estimating) - runner = self.getRunner() - runner._test_job_permissions = MagicMock(return_value=True) + runner = self.getRunner() + runner._test_job_permissions = MagicMock(return_value=True) - # test missing job_id input - with self.assertRaises(ValueError) as context: - runner.start_job(None) - self.assertEqual("Please provide valid job_id", str(context.exception)) + # test missing job_id input + with self.assertRaises(ValueError) as context: + runner.start_job(None) + self.assertEqual("Please provide valid job_id", str(context.exception)) - # start a created job, set job to estimation status - runner.start_job(job_id, skip_estimation=False) + # start a created job, set job to estimation status + runner.start_job(job_id, skip_estimation=False) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "estimating") - self.assertFalse(job.running) - self.assertTrue(job.estimating) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "estimating") + self.assertFalse(job.running) + self.assertTrue(job.estimating) - # start a estimating job, set job to running status - runner.start_job(job_id, skip_estimation=False) + # start a estimating job, set job to running status + runner.start_job(job_id, skip_estimation=False) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "running") - self.assertTrue(job.running) - self.assertTrue(job.estimating) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "running") + self.assertTrue(job.running) + self.assertTrue(job.estimating) - # test start a job with invalid status - with self.assertRaises(ValueError) as context: - runner.start_job(job_id) - self.assertIn("Unexpected job status", 
str(context.exception)) + # test start a job with invalid status + with self.assertRaises(ValueError) as context: + runner.start_job(job_id) + self.assertIn("Unexpected job status", str(context.exception)) - self.mongo_util.get_job(job_id=job_id).delete() - self.assertEqual(ori_job_count, Job.objects.count()) + self.mongo_util.get_job(job_id=job_id).delete() + self.assertEqual(ori_job_count, Job.objects.count()) @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 31cc4d0d9..9192cb2c2 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -79,162 +79,160 @@ def test_init_job_stress(self): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.method_runner - # set job method differently to distinguish - method_1 = "app_1.a_method" - method_2 = "app_1.b_method" - job_params_1 = get_sample_job_params(method=method_1) - job_params_2 = get_sample_job_params(method=method_2) - - threads = list() - job_ids = list() - que = queue.Queue() - - # execute _init_job_rec for 2 different jobs in threads - for index in range(thread_count): - x = threading.Thread( - target=que.put( - runner.get_runjob()._init_job_rec(self.user_id, job_params_1) - ) + ori_job_count = Job.objects.count() + runner = self.method_runner + # set job method differently to distinguish + method_1 = "app_1.a_method" + method_2 = "app_1.b_method" + job_params_1 = get_sample_job_params(method=method_1) + job_params_2 = get_sample_job_params(method=method_2) + + threads = list() + job_ids = list() + que = queue.Queue() + + # execute _init_job_rec for 2 different jobs in threads + for index in range(thread_count): + x = threading.Thread( + target=que.put( + runner.get_runjob()._init_job_rec(self.user_id, job_params_1) ) - 
threads.append(x) - x.start() - y = threading.Thread( - target=que.put( - runner.get_runjob()._init_job_rec(self.user_id, job_params_2) - ) + ) + threads.append(x) + x.start() + y = threading.Thread( + target=que.put( + runner.get_runjob()._init_job_rec(self.user_id, job_params_2) ) - threads.append(y) - y.start() + ) + threads.append(y) + y.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - while not que.empty(): - job_ids.append(que.get()) + while not que.empty(): + job_ids.append(que.get()) - jobs = self.mongo_util.get_jobs(job_ids=job_ids) # testing get jobs + jobs = self.mongo_util.get_jobs(job_ids=job_ids) # testing get jobs - methods = [job.job_input.method for job in jobs] # examing methods returned - self.assertEqual(len(methods), thread_count * 2) - self.assertEqual(methods.count(method_1), thread_count) - self.assertEqual(methods.count(method_2), thread_count) + methods = [job.job_input.method for job in jobs] # examing methods returned + self.assertEqual(len(methods), thread_count * 2) + self.assertEqual(methods.count(method_1), thread_count) + self.assertEqual(methods.count(method_2), thread_count) - self.assertEqual( - len(set(job_ids)), thread_count * 2 - ) # testing identicalness of job_ids returned - self.assertEqual(len(job_ids), len(set(job_ids))) + self.assertEqual( + len(set(job_ids)), thread_count * 2 + ) # testing identicalness of job_ids returned + self.assertEqual(len(job_ids), len(set(job_ids))) - self.assertEqual( - ori_job_count, Job.objects.count() - thread_count * 2 - ) # testing job numbers created + self.assertEqual( + ori_job_count, Job.objects.count() - thread_count * 2 + ) # testing job numbers created - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_update_job_status_stress(self): """ testing update jobs into different status in multiple threads """ - 
with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.method_runner + ori_job_count = Job.objects.count() + runner = self.method_runner - job_params = get_sample_job_params() + job_params = get_sample_job_params() - thread_count = self.thread_count # threads to test + thread_count = self.thread_count # threads to test - job_ids_queued = list() # jobs to be set into 'queued' status - job_ids_running = list() # jobs to be set into 'running' status - job_ids_completed = list() # jobs to be set into 'completed' status + job_ids_queued = list() # jobs to be set into 'queued' status + job_ids_running = list() # jobs to be set into 'running' status + job_ids_completed = list() # jobs to be set into 'completed' status - # initializing jobs to be tested - for index in range(thread_count): - job_ids_queued.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) - job_ids_running.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) - job_ids_completed.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) + # initializing jobs to be tested + for index in range(thread_count): + job_ids_queued.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) + job_ids_running.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) + job_ids_completed.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) - # examing newly created job status - queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) - for job in queued_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNone(job_rec.get("queued")) - self.assertEqual(job_rec.get("status"), "created") - - running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) - for job in running_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNone(job_rec.get("running")) - self.assertEqual(job_rec.get("status"), "created") - - finish_jobs = 
self.mongo_util.get_jobs(job_ids=job_ids_completed) - for job in finish_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNone(job_rec.get("finished")) - self.assertEqual(job_rec.get("status"), "created") - - threads = list() - - def update_states(index, job_ids_queued, job_ids_running, job_ids_finish): - """ - update jobs status in one thread - """ - runner.get_runjob().update_job_to_queued( - job_ids_queued[index], "scheduler_id" - ) - runner.get_jobs_status().start_job(job_ids_running[index]) - runner.get_jobs_status().start_job(job_ids_finish[index]) - job_output = { - "version": "11", - "result": {"result": 1}, - "id": "5d54bdcb9b402d15271b3208", - } - runner.finish_job(job_id=job_ids_finish[index], job_output=job_output) - - for index in range(thread_count): - x = threading.Thread( - target=update_states( - index, job_ids_queued, job_ids_running, job_ids_completed - ) + # examing newly created job status + queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) + for job in queued_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNone(job_rec.get("queued")) + self.assertEqual(job_rec.get("status"), "created") + + running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) + for job in running_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNone(job_rec.get("running")) + self.assertEqual(job_rec.get("status"), "created") + + finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) + for job in finish_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNone(job_rec.get("finished")) + self.assertEqual(job_rec.get("status"), "created") + + threads = list() + + def update_states(index, job_ids_queued, job_ids_running, job_ids_finish): + """ + update jobs status in one thread + """ + runner.get_runjob().update_job_to_queued( + job_ids_queued[index], "scheduler_id" + ) + runner.get_jobs_status().start_job(job_ids_running[index]) + runner.get_jobs_status().start_job(job_ids_finish[index]) + job_output = { + 
"version": "11", + "result": {"result": 1}, + "id": "5d54bdcb9b402d15271b3208", + } + runner.finish_job(job_id=job_ids_finish[index], job_output=job_output) + + for index in range(thread_count): + x = threading.Thread( + target=update_states( + index, job_ids_queued, job_ids_running, job_ids_completed ) - threads.append(x) - x.start() - - for index, thread in enumerate(threads): - thread.join() - - # examing updateed job status - queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) - for job in queued_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNotNone(job_rec.get("queued")) - self.assertEqual(job_rec.get("status"), "queued") - - running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) - for job in running_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNotNone(job_rec.get("running")) - self.assertEqual(job_rec.get("status"), "running") - - finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) - for job in finish_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNotNone(job_rec.get("finished")) - self.assertEqual(job_rec.get("status"), "completed") - - jobs = self.mongo_util.get_jobs( - job_ids=(job_ids_queued + job_ids_running + job_ids_completed) ) - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + threads.append(x) + x.start() + + for index, thread in enumerate(threads): + thread.join() + + # examing updateed job status + queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) + for job in queued_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNotNone(job_rec.get("queued")) + self.assertEqual(job_rec.get("status"), "queued") + + running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) + for job in running_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNotNone(job_rec.get("running")) + self.assertEqual(job_rec.get("status"), "running") + + finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) + for job in finish_jobs: + job_rec = 
job.to_mongo().to_dict() + self.assertIsNotNone(job_rec.get("finished")) + self.assertEqual(job_rec.get("status"), "completed") + + jobs = self.mongo_util.get_jobs( + job_ids=(job_ids_queued + job_ids_running + job_ids_completed) + ) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) # @patch.object(Catalog, "get_module_version", return_value="module.version") # @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -253,159 +251,155 @@ def test_run_job_stress(self, ccles, cc, workspace, condor): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() + ori_job_count = Job.objects.count() - # set job method differently to distinguish - method_1 = "app1.a_method" - method_2 = "app2.b_method" - method_3 = "app3.c_method" + # set job method differently to distinguish + method_1 = "app1.a_method" + method_2 = "app2.b_method" + method_3 = "app3.c_method" - job_params_1 = get_sample_job_params(method=method_1) - job_params_2 = get_sample_job_params(method=method_2) - job_params_3 = get_sample_job_params(method=method_3) + job_params_1 = get_sample_job_params(method=method_1) + job_params_2 = get_sample_job_params(method=method_2) + job_params_3 = get_sample_job_params(method=method_3) - threads = list() - job_ids = list() - que = queue.Queue() + threads = list() + job_ids = list() + que = queue.Queue() - # execute run_job for 3 different jobs in threads - for index in range(thread_count): - x = threading.Thread( - target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_1)) - ) - threads.append(x) - x.start() + # execute run_job for 3 different jobs in threads + for index in range(thread_count): + x = threading.Thread( + target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_1)) + ) + threads.append(x) + x.start() - y = threading.Thread( - target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_2)) - ) - threads.append(y) - 
y.start() + y = threading.Thread( + target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_2)) + ) + threads.append(y) + y.start() - z = threading.Thread( - target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_3)) - ) - threads.append(z) - z.start() + z = threading.Thread( + target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_3)) + ) + threads.append(z) + z.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - while not que.empty(): - job_ids.append(que.get()[0]) + while not que.empty(): + job_ids.append(que.get()[0]) - jobs = self.mongo_util.get_jobs(job_ids=job_ids) # testing get jobs + jobs = self.mongo_util.get_jobs(job_ids=job_ids) # testing get jobs - methods = [job.job_input.method for job in jobs] # examing methods returned - self.assertEqual(len(methods), thread_count * 3) - self.assertEqual(methods.count(method_1), thread_count) - self.assertEqual(methods.count(method_2), thread_count) - self.assertEqual(methods.count(method_3), thread_count) + methods = [job.job_input.method for job in jobs] # examing methods returned + self.assertEqual(len(methods), thread_count * 3) + self.assertEqual(methods.count(method_1), thread_count) + self.assertEqual(methods.count(method_2), thread_count) + self.assertEqual(methods.count(method_3), thread_count) - status = [ - job.status for job in jobs - ] # all jobs should eventually be put to 'queued' status - self.assertCountEqual(status, [Status.queued.value] * thread_count * 3) + status = [ + job.status for job in jobs + ] # all jobs should eventually be put to 'queued' status + self.assertCountEqual(status, [Status.queued.value] * thread_count * 3) - self.assertEqual( - len(set(job_ids)), thread_count * 3 - ) # testing identicalness of job_ids returned - self.assertEqual(len(job_ids), len(set(job_ids))) + self.assertEqual( + len(set(job_ids)), thread_count * 3 + ) # testing identicalness of job_ids returned + 
self.assertEqual(len(job_ids), len(set(job_ids))) - self.assertEqual( - ori_job_count, Job.objects.count() - thread_count * 3 - ) # testing job numbers created + self.assertEqual( + ori_job_count, Job.objects.count() - thread_count * 3 + ) # testing job numbers created - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_update_job_status(self): """ testing update jobs into different status in multiple threads """ - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.method_runner + ori_job_count = Job.objects.count() + runner = self.method_runner - job_params = get_sample_job_params() + job_params = get_sample_job_params() - thread_count = self.thread_count # threads to test + thread_count = self.thread_count # threads to test - job_ids_queued = list() # jobs to be set into 'queued' status - job_ids_running = list() # jobs to be set into 'running' status - job_ids_completed = list() # jobs to be set into 'completed' status + job_ids_queued = list() # jobs to be set into 'queued' status + job_ids_running = list() # jobs to be set into 'running' status + job_ids_completed = list() # jobs to be set into 'completed' status - # initializing jobs to be tested - for index in range(thread_count): - job_ids_queued.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) - job_ids_running.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) - job_ids_completed.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) + # initializing jobs to be tested + for index in range(thread_count): + job_ids_queued.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) + job_ids_running.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) + job_ids_completed.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) - # examing newly 
created job status - init_jobs = self.mongo_util.get_jobs( - job_ids=job_ids_queued + job_ids_running + job_ids_completed + # examing newly created job status + init_jobs = self.mongo_util.get_jobs( + job_ids=job_ids_queued + job_ids_running + job_ids_completed + ) + for job in init_jobs: + self.assertEqual(job.to_mongo().to_dict().get("status"), "created") + + threads = list() + + def update_states(index, job_ids_queued, job_ids_running, job_ids_completed): + """ + update jobs status in one thread + """ + self.impl.update_job_status( + ctx=self.ctx, + params={"job_id": job_ids_queued[index], "status": "queued"}, + ) + self.impl.update_job_status( + ctx=self.ctx, + params={"job_id": job_ids_running[index], "status": "running"}, + ) + self.impl.update_job_status( + ctx=self.ctx, + params={"job_id": job_ids_completed[index], "status": "completed"}, ) - for job in init_jobs: - self.assertEqual(job.to_mongo().to_dict().get("status"), "created") - - threads = list() - - def update_states( - index, job_ids_queued, job_ids_running, job_ids_completed - ): - """ - update jobs status in one thread - """ - self.impl.update_job_status( - ctx=self.ctx, - params={"job_id": job_ids_queued[index], "status": "queued"}, - ) - self.impl.update_job_status( - ctx=self.ctx, - params={"job_id": job_ids_running[index], "status": "running"}, - ) - self.impl.update_job_status( - ctx=self.ctx, - params={"job_id": job_ids_completed[index], "status": "completed"}, - ) - for index in range(thread_count): - x = threading.Thread( - target=update_states( - index, job_ids_queued, job_ids_running, job_ids_completed - ) + for index in range(thread_count): + x = threading.Thread( + target=update_states( + index, job_ids_queued, job_ids_running, job_ids_completed ) - threads.append(x) - x.start() + ) + threads.append(x) + x.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - # examing updateed job status - queued_jobs = 
self.mongo_util.get_jobs(job_ids=job_ids_queued) - for job in queued_jobs: - self.assertEqual(job.to_mongo().to_dict().get("status"), "queued") + # examing updateed job status + queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) + for job in queued_jobs: + self.assertEqual(job.to_mongo().to_dict().get("status"), "queued") - running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) - for job in running_jobs: - self.assertEqual(job.to_mongo().to_dict().get("status"), "running") + running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) + for job in running_jobs: + self.assertEqual(job.to_mongo().to_dict().get("status"), "running") - finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) - for job in finish_jobs: - self.assertEqual(job.to_mongo().to_dict().get("status"), "completed") + finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) + for job in finish_jobs: + self.assertEqual(job.to_mongo().to_dict().get("status"), "completed") - jobs = self.mongo_util.get_jobs( - job_ids=(job_ids_queued + job_ids_running + job_ids_completed) - ) - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs = self.mongo_util.get_jobs( + job_ids=(job_ids_queued + job_ids_running + job_ids_completed) + ) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_check_jobs_stress(self): """ @@ -414,53 +408,52 @@ def test_check_jobs_stress(self): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.method_runner - - # set job method differently to distinguish - method_1 = "a_method" - method_2 = "b_method" - job_params_1 = get_sample_job_params(method=method_1) - job_params_2 = get_sample_job_params(method=method_2) - - # create jobs - job_id_1 = runner.get_runjob()._init_job_rec(self.user_id, job_params_1) - job_id_2 = runner.get_runjob()._init_job_rec(self.user_id, job_params_2) - - threads = 
list() - job_status = list() - que = queue.Queue() - - # execute check_jobs in multiple threads - for index in range(thread_count): - x = threading.Thread( - target=que.put( - self.impl.check_jobs( - ctx=self.ctx, params={"job_ids": [job_id_1, job_id_2]} - ) + ori_job_count = Job.objects.count() + runner = self.method_runner + + # set job method differently to distinguish + method_1 = "a_method" + method_2 = "b_method" + job_params_1 = get_sample_job_params(method=method_1) + job_params_2 = get_sample_job_params(method=method_2) + + # create jobs + job_id_1 = runner.get_runjob()._init_job_rec(self.user_id, job_params_1) + job_id_2 = runner.get_runjob()._init_job_rec(self.user_id, job_params_2) + + threads = list() + job_status = list() + que = queue.Queue() + + # execute check_jobs in multiple threads + for index in range(thread_count): + x = threading.Thread( + target=que.put( + self.impl.check_jobs( + ctx=self.ctx, params={"job_ids": [job_id_1, job_id_2]} ) ) - threads.append(x) - x.start() + ) + threads.append(x) + x.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - while not que.empty(): - job_status.append(que.get()) + while not que.empty(): + job_status.append(que.get()) - # exam returned job status - for job_status in job_status: - job_status = job_status[0]["job_states"] - job_ids = [js["job_id"] for js in job_status] - job_methods = [js["job_input"]["method"] for js in job_status] - self.assertCountEqual(job_ids, [job_id_1, job_id_2]) - self.assertCountEqual(job_methods, [method_1, method_2]) + # exam returned job status + for job_status in job_status: + job_status = job_status[0]["job_states"] + job_ids = [js["job_id"] for js in job_status] + job_methods = [js["job_input"]["method"] for js in job_status] + self.assertCountEqual(job_ids, [job_id_1, job_id_2]) + self.assertCountEqual(job_methods, [method_1, method_2]) - jobs = self.mongo_util.get_jobs(job_ids=[job_id_1, job_id_2]) 
- jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs = self.mongo_util.get_jobs(job_ids=[job_id_1, job_id_2]) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_check_job_canceled_stress(self): """ @@ -469,106 +462,101 @@ def test_check_job_canceled_stress(self): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.method_runner + ori_job_count = Job.objects.count() + runner = self.method_runner - job_params = get_sample_job_params() + job_params = get_sample_job_params() - # create jobs - job_id_running = runner.get_runjob()._init_job_rec(self.user_id, job_params) - job_id_terminated = runner.get_runjob()._init_job_rec( - self.user_id, job_params - ) - job_id_completed = runner.get_runjob()._init_job_rec( - self.user_id, job_params - ) + # create jobs + job_id_running = runner.get_runjob()._init_job_rec(self.user_id, job_params) + job_id_terminated = runner.get_runjob()._init_job_rec(self.user_id, job_params) + job_id_completed = runner.get_runjob()._init_job_rec(self.user_id, job_params) - self.impl.update_job_status( - ctx=self.ctx, params={"job_id": job_id_running, "status": "running"} - ) - self.impl.update_job_status( - ctx=self.ctx, - params={"job_id": job_id_terminated, "status": "terminated"}, - ) - self.impl.update_job_status( - ctx=self.ctx, params={"job_id": job_id_completed, "status": "completed"} - ) + self.impl.update_job_status( + ctx=self.ctx, params={"job_id": job_id_running, "status": "running"} + ) + self.impl.update_job_status( + ctx=self.ctx, + params={"job_id": job_id_terminated, "status": "terminated"}, + ) + self.impl.update_job_status( + ctx=self.ctx, params={"job_id": job_id_completed, "status": "completed"} + ) - threads = list() - job_canceled_status = list() - que = queue.Queue() - - # execute check_job_canceled in multiple threads - for index in range(thread_count): - x = 
threading.Thread( - target=que.put( - self.impl.check_job_canceled( - ctx=self.ctx, params={"job_id": job_id_running} - ) + threads = list() + job_canceled_status = list() + que = queue.Queue() + + # execute check_job_canceled in multiple threads + for index in range(thread_count): + x = threading.Thread( + target=que.put( + self.impl.check_job_canceled( + ctx=self.ctx, params={"job_id": job_id_running} ) ) - threads.append(x) - x.start() - - y = threading.Thread( - target=que.put( - self.impl.check_job_canceled( - ctx=self.ctx, params={"job_id": job_id_terminated} - ) + ) + threads.append(x) + x.start() + + y = threading.Thread( + target=que.put( + self.impl.check_job_canceled( + ctx=self.ctx, params={"job_id": job_id_terminated} ) ) - threads.append(y) - y.start() - - z = threading.Thread( - target=que.put( - self.impl.check_job_canceled( - ctx=self.ctx, params={"job_id": job_id_completed} - ) + ) + threads.append(y) + y.start() + + z = threading.Thread( + target=que.put( + self.impl.check_job_canceled( + ctx=self.ctx, params={"job_id": job_id_completed} ) ) - threads.append(z) - z.start() - - for index, thread in enumerate(threads): - thread.join() - - while not que.empty(): - job_canceled_status.append(que.get()) - - # exam correct job ids returned - job_ids_returned = [ - jcs_return[0]["job_id"] for jcs_return in job_canceled_status - ] - self.assertEqual( - len(job_ids_returned), thread_count * 3 - ) # exam total job number returned - self.assertEqual(job_ids_returned.count(job_id_running), thread_count) - self.assertEqual(job_ids_returned.count(job_id_terminated), thread_count) - self.assertEqual(job_ids_returned.count(job_id_completed), thread_count) - - # exam returned job canceled status - for job_canceled_status_return in job_canceled_status: - job_canceled_status_return = job_canceled_status_return[0] - if job_canceled_status_return["job_id"] == job_id_running: - self.assertFalse(job_canceled_status_return["canceled"]) - 
self.assertFalse(job_canceled_status_return["finished"]) - if job_canceled_status_return["job_id"] == job_id_terminated: - self.assertTrue(job_canceled_status_return["canceled"]) - self.assertTrue(job_canceled_status_return["finished"]) - if job_canceled_status_return["job_id"] == job_id_completed: - self.assertFalse(job_canceled_status_return["canceled"]) - self.assertTrue(job_canceled_status_return["finished"]) - - jobs = self.mongo_util.get_jobs( - job_ids=[job_id_running, job_id_terminated, job_id_completed] ) + threads.append(z) + z.start() + + for index, thread in enumerate(threads): + thread.join() + + while not que.empty(): + job_canceled_status.append(que.get()) + + # exam correct job ids returned + job_ids_returned = [ + jcs_return[0]["job_id"] for jcs_return in job_canceled_status + ] + self.assertEqual( + len(job_ids_returned), thread_count * 3 + ) # exam total job number returned + self.assertEqual(job_ids_returned.count(job_id_running), thread_count) + self.assertEqual(job_ids_returned.count(job_id_terminated), thread_count) + self.assertEqual(job_ids_returned.count(job_id_completed), thread_count) + + # exam returned job canceled status + for job_canceled_status_return in job_canceled_status: + job_canceled_status_return = job_canceled_status_return[0] + if job_canceled_status_return["job_id"] == job_id_running: + self.assertFalse(job_canceled_status_return["canceled"]) + self.assertFalse(job_canceled_status_return["finished"]) + if job_canceled_status_return["job_id"] == job_id_terminated: + self.assertTrue(job_canceled_status_return["canceled"]) + self.assertTrue(job_canceled_status_return["finished"]) + if job_canceled_status_return["job_id"] == job_id_completed: + self.assertFalse(job_canceled_status_return["canceled"]) + self.assertTrue(job_canceled_status_return["finished"]) + + jobs = self.mongo_util.get_jobs( + job_ids=[job_id_running, job_id_terminated, job_id_completed] + ) - for job in jobs: - job.delete() + for job in jobs: + job.delete() 
- self.assertEqual(ori_job_count, Job.objects.count()) + self.assertEqual(ori_job_count, Job.objects.count()) def test_get_job_logs_stress(self): """ @@ -577,57 +565,54 @@ def test_get_job_logs_stress(self): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.method_runner + ori_job_count = Job.objects.count() + runner = self.method_runner - # create job - job_id = runner.get_runjob()._init_job_rec( - self.user_id, get_sample_job_params() - ) + # create job + job_id = runner.get_runjob()._init_job_rec( + self.user_id, get_sample_job_params() + ) - # add one line to job - ts = time.time() - job_line = [{"line": "hello ee2", "is_error": 1, "ts": ts}] - self.impl.add_job_logs( - ctx=self.ctx, params={"job_id": job_id}, lines=job_line - ) + # add one line to job + ts = time.time() + job_line = [{"line": "hello ee2", "is_error": 1, "ts": ts}] + self.impl.add_job_logs(ctx=self.ctx, params={"job_id": job_id}, lines=job_line) - threads = list() - job_lines = list() - que = queue.Queue() + threads = list() + job_lines = list() + que = queue.Queue() - # execute get_job_logs in multiple threads - for index in range(thread_count): - x = threading.Thread( - target=que.put( - self.impl.get_job_logs(ctx=self.ctx, params={"job_id": job_id}) - ) + # execute get_job_logs in multiple threads + for index in range(thread_count): + x = threading.Thread( + target=que.put( + self.impl.get_job_logs(ctx=self.ctx, params={"job_id": job_id}) ) - threads.append(x) - x.start() + ) + threads.append(x) + x.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - while not que.empty(): - job_lines.append(que.get()) + while not que.empty(): + job_lines.append(que.get()) - self.assertEqual( - len(job_lines), thread_count - ) # exam total number of job lines returned + self.assertEqual( + len(job_lines), thread_count + ) # exam 
total number of job lines returned - # exam each get_job_logs result - for job_line in job_lines: - job_line = job_line[0]["lines"][0] - self.assertEqual(job_line["line"], "hello ee2") - self.assertEqual(job_line["linepos"], 0) - self.assertEqual(job_line["is_error"], 1) - self.assertEqual(job_line["ts"], int(ts * 1000)) + # exam each get_job_logs result + for job_line in job_lines: + job_line = job_line[0]["lines"][0] + self.assertEqual(job_line["line"], "hello ee2") + self.assertEqual(job_line["linepos"], 0) + self.assertEqual(job_line["is_error"], 1) + self.assertEqual(job_line["ts"], int(ts * 1000)) - jobs = self.mongo_util.get_jobs(job_ids=[job_id]) - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs = self.mongo_util.get_jobs(job_ids=[job_id]) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_add_job_logs_stress(self): """ @@ -636,61 +621,57 @@ def test_add_job_logs_stress(self): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): + ori_job_count = Job.objects.count() + print("original job count is", ori_job_count) + runner = self.method_runner - ori_job_count = Job.objects.count() - print("original job count is", ori_job_count) - runner = self.method_runner - - # create job - job_id = runner.get_runjob()._init_job_rec( - self.user_id, get_sample_job_params() - ) + # create job + job_id = runner.get_runjob()._init_job_rec( + self.user_id, get_sample_job_params() + ) - # job line to be added - ts = time.time() - job_line = [{"line": "hello ee2", "is_error": 1, "ts": ts}] - - threads = list() - que = queue.Queue() - # execute add_job_logs in multiple threads - print("Number of threads are", thread_count) - for index in range(thread_count): - x = threading.Thread( - target=que.put( - self.impl.add_job_logs( - ctx=self.ctx, params={"job_id": job_id}, lines=job_line - ) + # job line to be added + ts = time.time() + job_line = [{"line": "hello ee2", "is_error": 
1, "ts": ts}] + + threads = list() + que = queue.Queue() + # execute add_job_logs in multiple threads + print("Number of threads are", thread_count) + for index in range(thread_count): + x = threading.Thread( + target=que.put( + self.impl.add_job_logs( + ctx=self.ctx, params={"job_id": job_id}, lines=job_line ) ) - threads.append(x) - x.start() + ) + threads.append(x) + x.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - job_lines = self.impl.get_job_logs(ctx=self.ctx, params={"job_id": job_id})[ - 0 - ] + job_lines = self.impl.get_job_logs(ctx=self.ctx, params={"job_id": job_id})[0] - self.assertEqual( - job_lines["last_line_number"], thread_count - 1 - ) # exam total number of job lines created by add_job_logs + self.assertEqual( + job_lines["last_line_number"], thread_count - 1 + ) # exam total number of job lines created by add_job_logs - # exam each line created by add_job_logs - lines = job_lines["lines"] - self.assertEqual(len(lines), thread_count) - line_pos = list() - for line in lines: - self.assertEqual(line["line"], "hello ee2") - self.assertEqual(line["is_error"], 1) - self.assertEqual(line["ts"], int(ts * 1000)) - line_pos.append(line["linepos"]) - self.assertCountEqual(line_pos, list(range(0, thread_count))) + # exam each line created by add_job_logs + lines = job_lines["lines"] + self.assertEqual(len(lines), thread_count) + line_pos = list() + for line in lines: + self.assertEqual(line["line"], "hello ee2") + self.assertEqual(line["is_error"], 1) + self.assertEqual(line["ts"], int(ts * 1000)) + line_pos.append(line["linepos"]) + self.assertCountEqual(line_pos, list(range(0, thread_count))) - jobs = self.mongo_util.get_jobs(job_ids=[job_id]) + jobs = self.mongo_util.get_jobs(job_ids=[job_id]) - for job in jobs: - job.delete() + for job in jobs: + job.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + self.assertEqual(ori_job_count, Job.objects.count()) From 
37c075d4c88bb1dafd9fdb41335a47bd113ea514 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 6 Apr 2021 16:02:08 -0700 Subject: [PATCH 043/109] DATAUP-389 - refactor RunJob unit tests (#348) * Refactor runjob test Factors out some mocking that will be in common with other tests once the job requirements resolution code will be integrated. Should make the upcoming changes easier to understand. * run black --- test/tests_for_sdkmr/EE2Runjob_test.py | 262 ++++++++++++++----------- 1 file changed, 151 insertions(+), 111 deletions(-) diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 321fbf80d..3fc0c5f7f 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -4,134 +4,125 @@ # Incomplete by a long way. Will add more unit tests as they come up. -from typing import List +from typing import List, Dict, Any from bson.objectid import ObjectId from logging import Logger from unittest.mock import create_autospec +from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.models.models import Job, JobInput, JobRequirements, Meta from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions -from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.utils.CatalogUtils import CatalogUtils -from execution_engine2.utils.Condor import ( - Condor, - CondorResources, - SubmissionInfo, +from execution_engine2.utils.Condor import Condor, SubmissionInfo, CondorResources +from execution_engine2.sdk.job_submission_parameters import ( + JobRequirements as ResolvedRequirements, ) +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.utils.KafkaUtils import ( KafkaClient, KafkaQueueChange, KafkaCreateJob, ) +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver from execution_engine2.utils.SlackUtils import SlackClient from execution_engine2.db.MongoUtil 
import MongoUtil from installed_clients.WorkspaceClient import Workspace from installed_clients.CatalogClient import Catalog +from utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS +# common variables +_JOB_ID = "603051cfaf2e3401b0500982" +_GIT_COMMIT = "git5678" +_WS_REF_1 = "1/2/3" +_WS_REF_2 = "4/5/6" +_CLUSTER = "cluster42" +_METHOD = "lolcats.lol_unto_death" +_APP = "lolcats/itsmypartyilllolifiwantto" +_USER = "someuser" +_TOKEN = "tokentokentoken" +_CREATED_STATE = "created" +_QUEUED_STATE = "queued" -def test_run_as_admin(): - """ - A basic unit test of the run() method with an administrative user. - This test is a fairly minimal test of the run() method. It does not exercise all the - potential code paths or provide all the possible run inputs, such as job parameters, cell - metadata, etc. +def _set_up_mocks(user: str, token: str) -> Dict[Any, Any]: """ - - # Set up data variables - job_id = "603051cfaf2e3401b0500982" - git_commit = "git5678" - ws_obj1 = "1/2/3" - ws_obj2 = "4/5/6" - client_group = "grotesquememlong" - cpus = "4" - mem = "32M" - cluster = "cluster42" - method = "lolcats.lol_unto_death" - user = "someuser" - token = "tokentokentoken" - created_state = "created" - queued_state = "queued" + Returns a dictionary of the class that is mocked to the mock of the class, and initializes + the SDKMR getters to return the mocks. + """ + # Can't seem to find a mypy annotation for a class, so Any it is # The amount of mocking required here implies the method should be broken up into smaller # classes that are individually mockable. Or maybe it's just really complicated and this # is the best we can do. Worth looking into at some point though. - - # We intentionally do not check the logger methods as there are a lot of them and this is - # already a very large test. This may be something to be added later when needed. 
+ mocks = get_client_mocks(None, None, *ALL_CLIENTS) sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catutils = create_autospec(CatalogUtils, spec_set=True, instance=True) - condor = create_autospec(Condor, spec_set=True, instance=True) - kafka = create_autospec(KafkaClient, spec_set=True, instance=True) - logger = create_autospec(Logger, spec_set=True, instance=True) - mongo = create_autospec(MongoUtil, spec_set=True, instance=True) - slack = create_autospec(SlackClient, spec_set=True, instance=True) - ws = create_autospec(Workspace, spec_set=True, instance=True) + mocks[SDKMethodRunner] = sdkmr + mocks[Logger] = create_autospec(Logger, spec_set=True, instance=True) + mocks[Workspace] = create_autospec(Workspace, spec_set=True, instance=True) + mocks[WorkspaceAuth] = create_autospec(WorkspaceAuth, spec_set=True, instance=True) # Set up basic getter calls - sdkmr.get_catalog.return_value = catalog - sdkmr.get_catalog_utils.return_value = catutils - sdkmr.get_condor.return_value = condor - sdkmr.get_kafka_client.return_value = kafka - sdkmr.get_logger.return_value = logger - sdkmr.get_mongo_util.return_value = mongo - sdkmr.get_slack_client.return_value = slack + sdkmr.get_catalog.return_value = mocks[Catalog] + sdkmr.get_catalog_utils.return_value = mocks[CatalogUtils] + sdkmr.get_condor.return_value = mocks[Condor] + sdkmr.get_kafka_client.return_value = mocks[KafkaClient] + sdkmr.get_logger.return_value = mocks[Logger] + sdkmr.get_mongo_util.return_value = mocks[MongoUtil] + sdkmr.get_job_requirements_resolver.return_value = mocks[JobRequirementsResolver] + sdkmr.get_slack_client.return_value = mocks[SlackClient] sdkmr.get_token.return_value = token sdkmr.get_user_id.return_value = user - sdkmr.get_workspace.return_value = ws + sdkmr.get_workspace.return_value = mocks[Workspace] + sdkmr.get_workspace_auth.return_value = mocks[WorkspaceAuth] - # Set up call returns. 
These calls are in the order they occur in the code - sdkmr.check_as_admin.return_value = True - sdkmr.save_job.return_value = job_id - ws.get_object_info3.return_value = {"paths": [[ws_obj1], [ws_obj2]]} - catalog_resources = { - "client_group": client_group, - "request_cpus": cpus, - "request_memory": mem, + return mocks + + +def _set_up_common_return_values(mocks): + """ + Set up return values on mocks that are the same for several tests. + """ + mocks[Workspace].get_object_info3.return_value = { + "paths": [[_WS_REF_1], [_WS_REF_2]] } - catutils.get_normalized_resources.return_value = catalog_resources - condor.extract_resources.return_value = CondorResources( - cpus, "2600GB", mem, client_group - ) - catalog.get_module_version.return_value = {"git_commit_hash": git_commit} - condor.run_job.return_value = SubmissionInfo(cluster, {}, None) + mocks[SDKMethodRunner].save_job.return_value = _JOB_ID + mocks[Catalog].get_module_version.return_value = {"git_commit_hash": _GIT_COMMIT} + mocks[Condor].run_job.return_value = SubmissionInfo(_CLUSTER, {}, None) retjob = Job() - retjob.id = ObjectId(job_id) - retjob.status = created_state - mongo.get_job.return_value = retjob + retjob.id = ObjectId(_JOB_ID) + retjob.status = _CREATED_STATE + mocks[MongoUtil].get_job.return_value = retjob - # set up the class to be tested and run the method - rj = EE2RunJob(sdkmr) - params = { - "method": method, - "source_ws_objects": [ws_obj1, ws_obj2], - } - assert rj.run(params, as_admin=True) == job_id - # check mocks called as expected. The order here is the order that they're called in the code. - sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) - ws.get_object_info3.assert_called_once_with( - {"objects": [{"ref": ws_obj1}, {"ref": ws_obj2}], "ignoreErrors": 1} +def _check_common_mock_calls(mocks, reqs, creqs, wsid): + """ + Check that mocks are called as expected when those calls are similar or the same for + several tests. 
+ """ + sdkmr = mocks[SDKMethodRunner] + kafka = mocks[KafkaClient] + mocks[Workspace].get_object_info3.assert_called_once_with( + {"objects": [{"ref": _WS_REF_1}, {"ref": _WS_REF_2}], "ignoreErrors": 1} ) - catutils.get_normalized_resources.assert_called_once_with(method) - condor.extract_resources.assert_called_once_with(catalog_resources) - catalog.get_module_version.assert_called_once_with( + mocks[Catalog].get_module_version.assert_called_once_with( {"module_name": "lolcats", "version": "release"} ) # initial job data save expected_job = Job() - expected_job.user = user - expected_job.status = created_state + expected_job.user = _USER + expected_job.status = _CREATED_STATE + expected_job.wsid = wsid ji = JobInput() - ji.method = method - ji.service_ver = git_commit - ji.source_ws_objects = [ws_obj1, ws_obj2] + ji.method = _METHOD + ji.app_id = _APP + ji.wsid = wsid + ji.service_ver = _GIT_COMMIT + ji.source_ws_objects = [_WS_REF_1, _WS_REF_2] ji.parent_job_id = "None" jr = JobRequirements() - jr.clientgroup = client_group - jr.cpu = cpus - jr.memory = "32" - jr.disk = "2600" + jr.clientgroup = creqs.client_group + jr.cpu = creqs.request_cpus + jr.memory = creqs.request_memory + jr.disk = creqs.request_disk ji.requirements = jr ji.narrative_cell_info = Meta() expected_job.job_input = ji @@ -139,48 +130,97 @@ def test_run_as_admin(): got_job = sdkmr.save_job.call_args_list[0][0][0] assert_jobs_equal(got_job, expected_job) - kafka.send_kafka_message.assert_any_call(KafkaCreateJob(user, job_id)) - condor.run_job.assert_called_once_with( - params={ - "method": method, - "source_ws_objects": [ws_obj1, ws_obj2], - "service_ver": git_commit, - "job_id": job_id, - "user_id": user, - "token": token, - "cg_resources_requirements": { - "client_group": client_group, - "request_cpus": cpus, - "request_memory": mem, - }, - }, - concierge_params=None, + kafka.send_kafka_message.assert_any_call(KafkaCreateJob(_USER, _JOB_ID)) + params_expected = { + "method": _METHOD, + 
"app_id": _APP, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + "service_ver": _GIT_COMMIT, + "job_id": _JOB_ID, + "user_id": _USER, + "token": _TOKEN, + "cg_resources_requirements": reqs, + } + mocks[Condor].run_job.assert_called_once_with( + params=params_expected, concierge_params=None ) # updated job data save - mongo.get_job.assert_called_once_with(job_id) + mocks[MongoUtil].get_job.assert_called_once_with(_JOB_ID) # update to queued state got_job = sdkmr.save_job.call_args_list[1][0][0] expected_job = Job() - expected_job.id = ObjectId(job_id) - expected_job.status = queued_state + expected_job.id = ObjectId(_JOB_ID) + expected_job.status = _QUEUED_STATE # no way to test this really without code refactoring expected_job.queued = got_job.queued expected_job.scheduler_type = "condor" - expected_job.scheduler_id = cluster + expected_job.scheduler_id = _CLUSTER assert_jobs_equal(got_job, expected_job) kafka.send_kafka_message.assert_called_with( # update to queued state KafkaQueueChange( - job_id=job_id, - new_status=queued_state, - previous_status=created_state, - scheduler_id=cluster, + job_id=_JOB_ID, + new_status=_QUEUED_STATE, + previous_status=_CREATED_STATE, + scheduler_id=_CLUSTER, ) ) - slack.run_job_message.assert_called_once_with(job_id, cluster, user) + mocks[SlackClient].run_job_message.assert_called_once_with(_JOB_ID, _CLUSTER, _USER) + + +def test_run_as_admin(): + """ + A basic unit test of the run() method with an administrative user. + + This test is a fairly minimal test of the run() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. 
+ """ + + # Set up data variables + client_group = "grotesquememlong" + cpus = 4 + mem = "32M" + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + catutils = mocks[CatalogUtils] + condor = mocks[Condor] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. These calls are in the order they occur in the code + sdkmr.check_as_admin.return_value = True + catalog_resources = { + "client_group": client_group, + "request_cpus": cpus, + "request_memory": mem, + } + catutils.get_normalized_resources.return_value = catalog_resources + condor.extract_resources.return_value = CondorResources( + cpus, "2600GB", mem, client_group + ) + _set_up_common_return_values(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": _APP, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + } + assert rj.run(params, as_admin=True) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. 
+ sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) + catutils.get_normalized_resources.assert_called_once_with(_METHOD) + condor.extract_resources.assert_called_once_with(catalog_resources) + expected_condor_resources = CondorResources(4, "2600", "32", client_group) + _check_common_mock_calls(mocks, catalog_resources, expected_condor_resources, None) def assert_jobs_equal(got_job: Job, expected_job: Job): From 34629099c81546a110830aef5d485f28ef8a0303 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 6 Apr 2021 16:47:05 -0700 Subject: [PATCH 044/109] DATAUP-389 - check that source_ws_objects is a list (#349) * Check that source_ws_objects is a list Mypy should really be used for that check, but it turns out this class will be the first thing to encounter s_ws_o, so in the interests of utility sticking the check in there. * run black --- lib/execution_engine2/sdk/job_submission_parameters.py | 2 ++ test/tests_for_sdkmr/job_submission_parameters_test.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/lib/execution_engine2/sdk/job_submission_parameters.py b/lib/execution_engine2/sdk/job_submission_parameters.py index 9658bfe0f..2a720920e 100644 --- a/lib/execution_engine2/sdk/job_submission_parameters.py +++ b/lib/execution_engine2/sdk/job_submission_parameters.py @@ -214,6 +214,8 @@ def __init__( ) self.wsid = _gt_zero(wsid, "wsid", optional=True) source_ws_objects = source_ws_objects if source_ws_objects else [] + if type(source_ws_objects) != list: + raise IncorrectParamsException("source_ws_objects must be a list") for i, ref in enumerate(source_ws_objects): upa, is_valid = _is_valid_UPA(ref) if not is_valid: diff --git a/test/tests_for_sdkmr/job_submission_parameters_test.py b/test/tests_for_sdkmr/job_submission_parameters_test.py index abb8bf1da..33ab3e16b 100644 --- a/test/tests_for_sdkmr/job_submission_parameters_test.py +++ b/test/tests_for_sdkmr/job_submission_parameters_test.py @@ -522,6 +522,16 @@ def 
test_job_sub_init_fail(): "source_ws_objects index 1, 'None', is not a valid Unique Permanent Address" ), ) + _job_sub_init_fail( + j, + a, + r, + u, + n, + n, + {"1/2/3": "5/6/7"}, + IncorrectParamsException("source_ws_objects must be a list"), + ) _job_sub_init_fail( j, a, From efe0b91fa88e3479f76487eaa7322b6d066a1858 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 8 Apr 2021 16:26:46 -0700 Subject: [PATCH 045/109] DATAUP-389 - Integrate job reqs resolver (#350) * Integrate the job requirement resolver Required a fairly large refactor of the run_job pathway in EE2RunJob and Condor. CatalogUtils is no longer necessary as a result. Also corrected the spec - in multiple places in the code memory is treated as MB and disk as GB, and that's how it's defined in the deploy file. Quite a bit of argument checking has been moved prior to making any DB changes - in particular several checks are run prior to creating the parent job in run_batch. Condor.run_job now simply translates a job spec into a submission dict for htcondor, making it quite a bit simpler. ee2_scheduler_test has been made redundant by Condor_test, but is left in to show the tests pass and the changes to said tests to get them to pass. It will be deleted in the next commit.
* run black * Minor cleanup * typo, clarify TODO * Add some docs & typing --- execution_engine2.spec | 17 +- .../execution_engine2Impl.py | 2 +- lib/execution_engine2/sdk/EE2Constants.py | 16 +- lib/execution_engine2/sdk/EE2Runjob.py | 238 ++++-- lib/execution_engine2/sdk/SDKMethodRunner.py | 8 - lib/execution_engine2/utils/CatalogUtils.py | 71 -- lib/execution_engine2/utils/Condor.py | 284 ++----- lib/execution_engine2/utils/CondorTuples.py | 7 - lib/execution_engine2/utils/clients.py | 16 +- test/tests_for_auth/ee2_admin_mode_test.py | 10 +- test/tests_for_sdkmr/EE2Runjob_test.py | 710 ++++++++++++++++-- test/tests_for_sdkmr/EE2Status_test.py | 1 - .../ee2_SDKMethodRunner_EE2Logs_test.py | 2 +- .../ee2_SDKMethodRunner_test.py | 40 +- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 17 +- ...ee2_SDKMethodRunner_test_EE2Status_test.py | 14 +- test/tests_for_sdkmr/ee2_load_test.py | 28 +- test/tests_for_sdkmr/ee2_scheduler_test.py | 204 +++-- test/tests_for_utils/Condor_test.py | 212 ++++++ test/tests_for_utils/clients_test.py | 22 +- test/utils_shared/mock_utils.py | 7 +- test/utils_shared/test_utils.py | 19 +- 22 files changed, 1263 insertions(+), 682 deletions(-) delete mode 100644 lib/execution_engine2/utils/CatalogUtils.py create mode 100644 test/tests_for_utils/Condor_test.py diff --git a/execution_engine2.spec b/execution_engine2.spec index 63cdc60e5..47fdf681f 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -167,21 +167,32 @@ /* EE2Constants Concierge Params are request_cpus: int request_memory: int in MB - request_disk: int in MB + request_disk: int in GB job_priority: int = None range from -20 to +20, with higher values meaning better priority. + Note: job_priority is currently not implemented. account_group: str = None # Someone elses account + ignore_concurrency_limits: ignore any limits on simultaneous job runs. + Default 1 (True). 
requirements_list: list = None ['machine=worker102','color=red'] client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup + client_group_regex: Whether to treat the client group string, whether provided here, + from the catalog, or as a default, as a regular expression when matching + clientgroups. Default True for HTC, but the default depends on the scheduler. + Omit to use the default. + debug_mode: Whether to run the job in debug mode. Default 0 (False). */ typedef structure { int request_cpu; - int request_memory_mb; - int request_disk_mb; + int request_memory; + int request_disk; int job_priority; string account_group; + boolean ignore_concurrency_limits; list requirements_list; string client_group; + boolean client_group_regex; + boolean debug_mode; } ConciergeParams; diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 4b3ba3f0b..49a5b9e7f 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -65,7 +65,7 @@ def __init__(self, config): configpath = os.environ["KB_DEPLOYMENT_CONFIG"] override = os.environ.get("OVERRIDE_CLIENT_GROUP") with open(configpath) as cf: - self.clients = get_client_set(config, configpath, cf, override) + self.clients = get_client_set(config, cf, override) #END_CONSTRUCTOR pass diff --git a/lib/execution_engine2/sdk/EE2Constants.py b/lib/execution_engine2/sdk/EE2Constants.py index 6871c22b3..932d7fa27 100644 --- a/lib/execution_engine2/sdk/EE2Constants.py +++ b/lib/execution_engine2/sdk/EE2Constants.py @@ -1,5 +1,4 @@ -from dataclasses import dataclass -from typing import Optional, NamedTuple +from typing import NamedTuple # May want to make this configurable. Hardcoded for now as we want concierge data to be owned # by this user. 
@@ -23,16 +22,3 @@ class JobError(NamedTuple): message: str code: int error: str - - -@dataclass() -class ConciergeParams: - """ Set requested params. If you don't specify CG, its automatically set for you""" - - request_cpus: int - request_memory: int - request_disk: int - job_priority: int = None - account_group: str = None - requirements_list: list = None - client_group: Optional[str] = CONCIERGE_CLIENTGROUP diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 8faa0db48..571f35a59 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -7,7 +7,7 @@ import os import time from enum import Enum -from typing import Optional, Dict, NamedTuple, Union, List +from typing import Optional, Dict, NamedTuple, Union, List, Any from execution_engine2.db.models.models import ( Job, @@ -18,9 +18,31 @@ ErrorCode, TerminatedCode, ) -from lib.execution_engine2.sdk.EE2Constants import ConciergeParams -from lib.execution_engine2.utils.CondorTuples import CondorResources +from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, + JobRequirements as ResolvedRequirements, + AppInfo, + UserCreds, +) +from execution_engine2.utils.job_requirements_resolver import ( + REQUEST_CPUS, + REQUEST_DISK, + REQUEST_MEMORY, + CLIENT_GROUP, + CLIENT_GROUP_REGEX, + DEBUG_MODE, +) from execution_engine2.utils.KafkaUtils import KafkaCreateJob, KafkaQueueChange +from execution_engine2.exceptions import IncorrectParamsException + + +_JOB_REQUIREMENTS = "job_reqs" +_REQUIREMENTS_LIST = "requirements_list" +_METHOD = "method" +_APP_ID = "app_id" +_PARENT_JOB_ID = "parent_job_id" +_WORKSPACE_ID = "wsid" +_SOURCE_WS_OBJECTS = "source_ws_objects" class JobPermissions(Enum): @@ -50,28 +72,26 @@ def _init_job_rec( self, user_id: str, params: Dict, - resources: CondorResources = None, - concierge_params: ConciergeParams = None, ) -> str: job = Job() inputs = JobInput() job.user = user_id 
job.authstrat = "kbaseworkspace" - job.wsid = params.get("wsid") + job.wsid = params.get(_WORKSPACE_ID) job.status = "created" # Inputs inputs.wsid = job.wsid - inputs.method = params.get("method") + inputs.method = params.get(_METHOD) inputs.params = params.get("params") params["service_ver"] = self._get_module_git_commit( - params.get("method"), params.get("service_ver") + params.get(_METHOD), params.get("service_ver") ) inputs.service_ver = params.get("service_ver") - inputs.app_id = params.get("app_id") - inputs.source_ws_objects = params.get("source_ws_objects") - inputs.parent_job_id = str(params.get("parent_job_id")) + inputs.app_id = params.get(_APP_ID) + inputs.source_ws_objects = params.get(_SOURCE_WS_OBJECTS) + inputs.parent_job_id = str(params.get(_PARENT_JOB_ID)) inputs.narrative_cell_info = Meta() meta = params.get("meta") @@ -79,23 +99,12 @@ def _init_job_rec( for meta_attr in ["run_id", "token_id", "tag", "cell_id", "status"]: inputs.narrative_cell_info[meta_attr] = meta.get(meta_attr) - if resources: - # TODO Should probably do some type checking on these before its passed in - jr = JobRequirements() - if concierge_params: - jr.cpu = concierge_params.request_cpus - jr.memory = concierge_params.request_memory - jr.disk = concierge_params.request_disk - jr.clientgroup = concierge_params.client_group - else: - jr.clientgroup = resources.client_group - if self.override_clientgroup: - jr.clientgroup = self.override_clientgroup - jr.cpu = resources.request_cpus - jr.memory = resources.request_memory[:-1] # Memory always in mb - jr.disk = resources.request_disk[:-2] # Space always in gb - - inputs.requirements = jr + jr = JobRequirements() + jr.cpu = params[_JOB_REQUIREMENTS].cpus + jr.memory = params[_JOB_REQUIREMENTS].memory_MB + jr.disk = params[_JOB_REQUIREMENTS].disk_GB + jr.clientgroup = params[_JOB_REQUIREMENTS].client_group + inputs.requirements = jr job.job_input = inputs self.logger.debug(job.job_input.to_mongo().to_dict()) @@ -184,50 +193,33 @@ 
def _finish_created_job( error=f"{exception}", ) - def _prepare_to_run(self, params, concierge_params=None) -> PreparedJobParams: + def _prepare_to_run(self, params, concierge_params=None) -> JobSubmissionParameters: """ - Creates a job record, grabs info about the objects, - checks the catalog resource requirements, and submits to condor + Creates a job record and creates the job submission params """ - # perform sanity checks before creating job - self._check_ws_objects(source_objects=params.get("source_ws_objects")) - method = params.get("method") - # Normalize multiple formats into one format (csv vs json) - normalized_resources = self.sdkmr.get_catalog_utils().get_normalized_resources( - method - ) - # These are for saving into job inputs. Maybe its best to pass this into condor as well? - extracted_resources = self.sdkmr.get_condor().extract_resources( - cgrr=normalized_resources - ) # type: CondorResources - # insert initial job document into db - - job_id = self._init_job_rec( - self.sdkmr.get_user_id(), params, extracted_resources, concierge_params - ) - params["job_id"] = job_id - params["user_id"] = self.sdkmr.get_user_id() - params["token"] = self.sdkmr.get_token() - params["cg_resources_requirements"] = normalized_resources + job_id = self._init_job_rec(self.sdkmr.get_user_id(), params) self.logger.debug( - f"User {self.sdkmr.get_user_id()} attempting to run job {method} {params}" + f"User {self.sdkmr.get_user_id()} attempting to run job {params[_METHOD]} {params}" ) - return PreparedJobParams(params=params, job_id=job_id) - - def _run(self, params, concierge_params=None): - prepared = self._prepare_to_run( - params=params, concierge_params=concierge_params + return JobSubmissionParameters( + job_id, + AppInfo(params[_METHOD], params[_APP_ID]), + params[_JOB_REQUIREMENTS], + UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), + parent_job_id=params.get(_PARENT_JOB_ID), + wsid=params.get(_WORKSPACE_ID), + 
source_ws_objects=params.get(_SOURCE_WS_OBJECTS), ) - params = prepared.params - job_id = prepared.job_id + + def _run(self, params): + job_params = self._prepare_to_run(params=params) + job_id = job_params.job_id try: - submission_info = self.sdkmr.get_condor().run_job( - params=params, concierge_params=concierge_params - ) + submission_info = self.sdkmr.get_condor().run_job(params=job_params) condor_job_id = submission_info.clusterid self.logger.debug(f"Submitted job id and got '{condor_job_id}'") except Exception as e: @@ -294,12 +286,12 @@ def _create_parent_job(self, wsid, meta): batch_job=True, status=Status.created.value, wsid=wsid, - user=self.sdkmr.user_id, + user=self.sdkmr.get_user_id(), ) - j.save() + j = self.sdkmr.save_and_return_job(j) # TODO Do we need a new kafka call? - self.sdkmr.kafka_client.send_kafka_message( + self.sdkmr.get_kafka_client().send_kafka_message( message=KafkaCreateJob(job_id=str(j.id), user=j.user) ) return j @@ -307,8 +299,8 @@ def _create_parent_job(self, wsid, meta): def _run_batch(self, parent_job: Job, params): child_jobs = [] for job_param in params: - if "parent_job_id" not in job_param: - job_param["parent_job_id"] = str(parent_job.id) + if _PARENT_JOB_ID not in job_param: + job_param[_PARENT_JOB_ID] = str(parent_job.id) try: child_jobs.append(str(self._run(params=job_param))) except Exception as e: @@ -319,7 +311,7 @@ def _run_batch(self, parent_job: Job, params): raise e parent_job.child_jobs = child_jobs - parent_job.save() + self.sdkmr.save_job(parent_job) return child_jobs @@ -332,19 +324,61 @@ def run_batch( :param as_admin: Allows you to run jobs in other people's workspaces :return: A list of condor job ids or a failure notification """ - wsid = batch_params.get("wsid") + if type(params) != list: + raise IncorrectParamsException("params must be a list") + wsid = batch_params.get(_WORKSPACE_ID) meta = batch_params.get("meta") if as_admin: self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) else: # Make 
sure you aren't running a job in someone elses workspace self._check_workspace_permissions(wsid) - wsids = [job_input.get("wsid", wsid) for job_input in params] + # this is very odd. Why check the parent wsid again if there's no wsid in the job? + # also, what if the parent wsid is None? + # also also, why not just put all the wsids in one list and make one ws call? + wsids = [job_input.get(_WORKSPACE_ID, wsid) for job_input in params] self._check_workspace_permissions_list(wsids) + self._add_job_requirements(params) + self._check_job_arguments(params) + parent_job = self._create_parent_job(wsid=wsid, meta=meta) children_jobs = self._run_batch(parent_job=parent_job, params=params) - return {"parent_job_id": str(parent_job.id), "child_job_ids": children_jobs} + return {_PARENT_JOB_ID: str(parent_job.id), "child_job_ids": children_jobs} + + # modifies the jobs in place + def _add_job_requirements(self, jobs: List[Dict[str, Any]]): + f""" + Adds the job requirements, generated from the job requirements resolver, + to the provided RunJobParams dicts. Expects the required field {_METHOD} in the param + dicts. Adds the {_JOB_REQUIREMENTS} field to the param dicts, which holds the value of the + job requirements object. + """ + # could add a cache in the job requirements resolver to avoid making the same + # catalog call over and over if all the jobs have the same method + jrr = self.sdkmr.get_job_requirements_resolver() + for j in jobs: + # TODO JRR check if requesting any job requirements & if is admin + # TODO JRR actually process the requirements once added to the spec + j[_JOB_REQUIREMENTS] = jrr.resolve_requirements(j.get(_METHOD)) + + def _check_job_arguments(self, jobs): + # perform sanity checks before creating job or parent job + for j in jobs: + # Could make an argument checker method, or a class that doesn't require a job id. + # Seems like more code & work for no real benefit though. 
+ # Just create the class for checks, don't use yet + JobSubmissionParameters( + "fakejobid", + AppInfo(j.get(_METHOD), j.get(_APP_ID)), + j[_JOB_REQUIREMENTS], + UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), + wsid=j.get(_WORKSPACE_ID), + source_ws_objects=j.get(_SOURCE_WS_OBJECTS), + ) + # This is also an opportunity for caching + # although most likely jobs aren't operating on the same object + self._check_ws_objects(source_objects=j.get(_SOURCE_WS_OBJECTS)) def run( self, params=None, as_admin=False, concierge_params: Dict = None @@ -359,15 +393,57 @@ def run( if as_admin: self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) else: - self._check_workspace_permissions(params.get("wsid")) + self._check_workspace_permissions(params.get(_WORKSPACE_ID)) if concierge_params: - cp = ConciergeParams(**concierge_params) self.sdkmr.check_as_concierge() + # we don't check requirements type because the concierge can do what they like + params[_JOB_REQUIREMENTS] = self._get_job_reqs_from_concierge_params( + params.get(_METHOD), concierge_params + ) else: - cp = None - - return self._run(params=params, concierge_params=cp) + self._add_job_requirements([params]) + self._check_job_arguments([params]) + + return self._run(params=params) + + def _get_job_reqs_from_concierge_params( + self, method: str, concierge_params: Dict[str, Any] + ) -> ResolvedRequirements: + jrr = self.sdkmr.get_job_requirements_resolver() + norm = jrr.normalize_job_reqs(concierge_params, "concierge parameters") + rl = concierge_params.get(_REQUIREMENTS_LIST) + schd_reqs = {} + if rl: + if type(rl) != list: + raise IncorrectParamsException(f"{_REQUIREMENTS_LIST} must be a list") + for s in rl: + if type(s) != str or "=" not in s: + raise IncorrectParamsException( + f"Found illegal requirement in {_REQUIREMENTS_LIST}: {s}" + ) + key, val = s.split("=") + schd_reqs[key.strip()] = val.strip() + + return jrr.resolve_requirements( + method, + cpus=norm.get(REQUEST_CPUS), + 
memory_MB=norm.get(REQUEST_MEMORY), + disk_GB=norm.get(REQUEST_DISK), + client_group=norm.get(CLIENT_GROUP), + client_group_regex=norm.get(CLIENT_GROUP_REGEX), + # error messaging here is for 'bill_to_user' vs 'account_group' but almost impossible + # to screw up so YAGNI + # Note that this is never confirmed to be a real user. May want to fix that, but + # since it's admin only... YAGNI + bill_to_user=concierge_params.get("account_group"), + # default is to ignore concurrency limits for concierge + ignore_concurrency_limits=bool( + concierge_params.get("ignore_concurrency_limits", 1) + ), + scheduler_requirements=schd_reqs, + debug_mode=norm.get(DEBUG_MODE), + ) def update_job_to_queued(self, job_id, scheduler_id): # TODO RETRY FOR RACE CONDITION OF RUN/CANCEL @@ -405,12 +481,12 @@ def get_job_params(self, job_id, as_admin=False): job_input = job.job_input - job_params["method"] = job_input.method + job_params[_METHOD] = job_input.method job_params["params"] = job_input.params job_params["service_ver"] = job_input.service_ver - job_params["app_id"] = job_input.app_id - job_params["wsid"] = job_input.wsid - job_params["parent_job_id"] = job_input.parent_job_id - job_params["source_ws_objects"] = job_input.source_ws_objects + job_params[_APP_ID] = job_input.app_id + job_params[_WORKSPACE_ID] = job_input.wsid + job_params[_PARENT_JOB_ID] = job_input.parent_job_id + job_params[_SOURCE_WS_OBJECTS] = job_input.source_ws_objects return job_params diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 11d91163b..9cf9f5437 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -26,7 +26,6 @@ EE2Logs, ) from lib.execution_engine2.sdk.EE2Constants import KBASE_CONCIERGE_USERNAME -from lib.execution_engine2.utils.CatalogUtils import CatalogUtils from lib.execution_engine2.utils.Condor import Condor from execution_engine2.authorization.workspaceauth import 
WorkspaceAuth from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver @@ -72,7 +71,6 @@ def __init__( self.job_requirements_resolver = clients.requirements_resolver self.workspace = user_clients.workspace self.workspace_auth = user_clients.workspace_auth - self.catalog_utils = clients.catalog_utils self.auth = clients.auth self.auth_admin = clients.auth_admin self.user_id = user_clients.user_id @@ -166,12 +164,6 @@ def get_job_requirements_resolver(self) -> JobRequirementsResolver: """ return self.job_requirements_resolver - def get_catalog_utils(self) -> CatalogUtils: - """ - Get the catalog utilities for this instance of SDKMR. - """ - return self.catalog_utils - def get_kafka_client(self) -> KafkaClient: """ Get the Kafka client for this instance of SDKMR. diff --git a/lib/execution_engine2/utils/CatalogUtils.py b/lib/execution_engine2/utils/CatalogUtils.py deleted file mode 100644 index a9e614889..000000000 --- a/lib/execution_engine2/utils/CatalogUtils.py +++ /dev/null @@ -1,71 +0,0 @@ -import json -from typing import List, Dict - -from lib.installed_clients.CatalogClient import Catalog - - -class CatalogUtils: - def __init__(self, url, admin_token): - self._catalog = Catalog(url=url, token=admin_token) - - def get_catalog(self): - """ Get the catalog client for this instance. """ - # TODO unit test this method after switching to dependency injection - return self._catalog - - def get_normalized_resources(self, method) -> Dict: - """ - get client groups info from Catalog - """ - if method is None: - raise ValueError("Please input module_name.function_name") - - if method is not None and "." not in method: - raise ValueError( - "unrecognized method: {}. 
Please input module_name.function_name".format( - method - ) - ) - - module_name, function_name = method.split(".") - - group_config = self._catalog.list_client_group_configs( - {"module_name": module_name, "function_name": function_name} - ) - - job_settings = [] - if group_config and len(group_config) > 0: - job_settings = group_config[0].get("client_groups") - - normalize = self.normalize_job_settings(job_settings) - - return normalize - - @staticmethod - def normalize_job_settings(resources_request: List): - """ - Ensure that the client_groups are processed as a dictionary and has at least one value - :param resources_request: either an empty string, a json object, or cg,key1=value,key2=value - :return: - """ - - # No client group provided - if len(resources_request) == 0: - return {} - # JSON - if "{" in resources_request[0]: - json_resources_request = ", ".join(resources_request) - return json.loads(json_resources_request) - # CSV Format - rr = resources_request # type: list - rv = {"client_group": rr.pop(0)} - for item in rr: - if "=" not in item: - raise Exception( - f"Malformed requirement. Format is = . 
Item is {item}" - ) - (key, value) = item.split("=") - rv[key] = value - # - # print("Going to return", rv) - return rv diff --git a/lib/execution_engine2/utils/Condor.py b/lib/execution_engine2/utils/Condor.py index 1d4570a79..0881a45b3 100644 --- a/lib/execution_engine2/utils/Condor.py +++ b/lib/execution_engine2/utils/Condor.py @@ -6,31 +6,24 @@ import os import pathlib import pwd -from configparser import ConfigParser -from typing import Dict, Optional, Any, Tuple +from typing import Dict, Optional, Any import htcondor -from lib.execution_engine2.exceptions import ( - MissingCondorRequirementsException, - MissingRunJobParamsException, +from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, + JobRequirements, ) -from lib.execution_engine2.sdk.EE2Runjob import ConciergeParams from lib.execution_engine2.utils.CondorTuples import ( - CondorResources, SubmissionInfo, JobInfo, ) +from execution_engine2.utils.arg_processing import not_falsy as _not_falsy class Condor: # TODO: Should these be outside of the class? - REQUEST_CPUS = "request_cpus" - REQUEST_MEMORY = "request_memory" - REQUEST_DISK = "request_disk" CG = "+CLIENTGROUP" - EE2 = "execution_engine2" - ENDPOINT = "kbase-endpoint" EXTERNAL_URL = "external-url" EXECUTABLE = "executable" CATALOG_TOKEN = "catalog-token" @@ -40,71 +33,48 @@ class Condor: LEAVE_JOB_IN_QUEUE = "leavejobinqueue" TRANSFER_INPUT_FILES = "transfer_input_files" PYTHON_EXECUTABLE = "PYTHON_EXECUTABLE" - DEFAULT_CLIENT_GROUP = "default_client_group" - def __init__(self, config_filepath, htc=htcondor): + def __init__(self, config: Dict[str, str], htc=htcondor): """ Create the condor wrapper. - config_filepath - the path to the execution_engine2 configuration file. + config - the execution_engine2 configuration. htc - the htcondor module, or an alternate implementation or mock. """ + # TODO some nicer error messages for the required keys vs. 
just KeyError self.htcondor = htc - self.config = ConfigParser() - self.override_clientgroup = os.environ.get("OVERRIDE_CLIENT_GROUP", None) - self.config.read(config_filepath) - self.ee_endpoint = self.config.get(section=self.EE2, option=self.EXTERNAL_URL) - self.python_executable = self.config.get( - section=self.EE2, - option=self.PYTHON_EXECUTABLE, - fallback="/miniconda/bin/python", + self.ee_endpoint = config[self.EXTERNAL_URL] + self.python_executable = config.get( + self.PYTHON_EXECUTABLE, "/miniconda/bin/python" ) - self.initial_dir = self.config.get( - section=self.EE2, option=self.INITIAL_DIR, fallback="/condor_shared" - ) - executable = self.config.get(section=self.EE2, option=self.EXECUTABLE) - if not pathlib.Path(executable).exists() and not pathlib.Path( - self.initial_dir + "/" + executable + self.initial_dir = config.get(self.INITIAL_DIR, "/condor_shared") + self.executable = config[self.EXECUTABLE] + if not pathlib.Path(self.executable).exists() and not pathlib.Path( + self.initial_dir + "/" + self.executable ): - raise FileNotFoundError(executable) - self.executable = executable - self.catalog_token = self.config.get( - section=self.EE2, option=self.CATALOG_TOKEN - ) - self.docker_timeout = self.config.get( - section=self.EE2, option=self.DOCKER_TIMEOUT, fallback="604801" - ) - self.pool_user = self.config.get( - section=self.EE2, option=self.POOL_USER, fallback="condor_pool" - ) - self.leave_job_in_queue = self.config.get( - section=self.EE2, option=self.LEAVE_JOB_IN_QUEUE, fallback="True" - ) - self.transfer_input_files = self.config.get( - section=self.EE2, - option=self.TRANSFER_INPUT_FILES, - fallback="/condor_shared/JobRunner.tgz", + raise FileNotFoundError(self.executable) + self.catalog_token = config[self.CATALOG_TOKEN] + self.docker_timeout = config.get(self.DOCKER_TIMEOUT, "604801") + self.pool_user = config.get(self.POOL_USER, "condor_pool") + self.leave_job_in_queue = config.get(self.LEAVE_JOB_IN_QUEUE, "True") + 
self.transfer_input_files = config.get( + self.TRANSFER_INPUT_FILES, "/condor_shared/JobRunner.tgz" ) self.logger = logging.getLogger("ee2") - def _setup_environment_vars(self, params: Dict, client_group: str) -> str: + def _setup_environment_vars(self, params: JobSubmissionParameters) -> str: # 7 day docker job timeout default, Catalog token used to get access to volume mounts - dm = ( - str(params["cg_resources_requirements"].get("debug_mode", "")).lower() - == "true" - ) - environment_vars = { "DOCKER_JOB_TIMEOUT": self.docker_timeout, "KB_ADMIN_AUTH_TOKEN": self.catalog_token, - "KB_AUTH_TOKEN": params.get("token"), - "CLIENTGROUP": client_group, - "JOB_ID": params.get("job_id"), + "KB_AUTH_TOKEN": params.user_creds.token, + "CLIENTGROUP": params.job_reqs.client_group, + "JOB_ID": params.job_id, # "WORKDIR": f"{config.get('WORKDIR')}/{params.get('USER')}/{params.get('JOB_ID')}", "CONDOR_ID": "$(Cluster).$(Process)", "PYTHON_EXECUTABLE": self.python_executable, - "DEBUG_MODE": str(dm), - "PARENT_JOB_ID": params.get("parent_job_id", ""), + "DEBUG_MODE": str(params.job_reqs.debug_mode), + "PARENT_JOB_ID": params.parent_job_id or "", } environment = "" @@ -113,89 +83,6 @@ def _setup_environment_vars(self, params: Dict, client_group: str) -> str: return f'"{environment}"' - @staticmethod - def _check_for_missing_runjob_params(params: Dict[str, str]) -> None: - """ - Check for missing runjob parameters - :param params: Params saved when the job was created - """ - for item in ("token", "user_id", "job_id", "cg_resources_requirements"): - if item not in params: - raise MissingRunJobParamsException(f"{item} not found in params") - - def extract_resources(self, cgrr: Dict[str, str]) -> CondorResources: - """ - # TODO Validate MB/GB from both config and catalog. 
- Checks to see if request_cpus/memory/disk is available - If not, it sets them based on defaults from the config - :param cgrr: - :return: - """ - self.logger.debug(f"About to extract from {cgrr}") - - client_group = cgrr.get("client_group", "") - if client_group is None or client_group == "": - client_group = self.config.get( - section="DEFAULT", option=self.DEFAULT_CLIENT_GROUP - ) - - if client_group not in self.config.sections(): - raise ValueError(f"{client_group} not found in {self.config.sections()}") - - # TODO Validate that they are a resource followed by a unit - for key in [self.REQUEST_DISK, self.REQUEST_CPUS, self.REQUEST_MEMORY]: - if key not in cgrr or cgrr[key] in ["", None]: - cgrr[key] = self.config.get(section=client_group, option=key) - - if self.override_clientgroup: - client_group = self.override_clientgroup - - cr = CondorResources( - str(cgrr.get(self.REQUEST_CPUS)), - str(cgrr.get(self.REQUEST_DISK)), - str(cgrr.get(self.REQUEST_MEMORY)), - client_group, - ) - - return cr - - def _extract_requirements( - self, cgrr: Optional[dict] = None, client_group: Optional[str] = None - ): - """ - - :param cgrr: Client Groups and Resource Requirements - :param client_group: Client Group - :return: A list of condor submit file requirements in (key == value) format - """ - if cgrr is None or client_group is None: - raise MissingCondorRequirementsException( - "Please provide normalized cgrr and client_group" - ) - - requirements_statement = [] - - # Default to using a regex - if str(cgrr.get("client_group_regex", True)).lower() == "true": - requirements_statement.append(f'regexp("{client_group}",CLIENTGROUP)') - else: - requirements_statement.append(f'(CLIENTGROUP == "{client_group}")') - - restricted_requirements = [ - "client_group", - "client_group_regex", - self.REQUEST_MEMORY, - self.REQUEST_DISK, - self.REQUEST_CPUS, - "debug_mode", - ] - - for key, value in cgrr.items(): - if key.lower() not in restricted_requirements: - 
requirements_statement.append(f'({key} == "{value}")') - - return requirements_statement - @staticmethod def _add_hardcoded_attributes(sub, job_id): sub["universe"] = "vanilla" @@ -232,78 +119,54 @@ def _add_configurable_attributes(self, sub): return sub def _extract_resources_and_requirements( - self, sub: Dict[str, Any], cgrr: Dict[str, str] - ) -> Tuple[Dict[str, Any], str]: + self, sub: Dict[str, Any], job_reqs: JobRequirements + ) -> Dict[str, Any]: # Extract minimum condor resource requirements and client_group - resources = self.extract_resources(cgrr) - sub["request_cpus"] = resources.request_cpus - sub["request_memory"] = resources.request_memory - sub["request_disk"] = resources.request_disk - client_group = resources.client_group + sub["request_cpus"] = job_reqs.cpus + sub["request_memory"] = f"{job_reqs.memory_MB}MB" + sub["request_disk"] = f"{job_reqs.disk_GB}GB" # Set requirements statement - requirements = self._extract_requirements(cgrr=cgrr, client_group=client_group) - sub["requirements"] = " && ".join(requirements) - sub["+KB_CLIENTGROUP"] = f'"{client_group}"' - return (sub, client_group) - - @staticmethod - def _modify_with_concierge(sub, concierge_params): - # Remove Concurrency Limits for this Job - del sub["Concurrency_Limits"] - # Override Clientgroup - sub["+KB_CLIENTGROUP"] = f'"{concierge_params.client_group}"' - if concierge_params.account_group: - sub["+AccountingGroup"] = concierge_params.account_group - # Override Resource Requirements - sub["request_cpus"] = concierge_params.request_cpus - sub["request_memory"] = concierge_params.request_memory - sub["request_disk"] = concierge_params.request_disk - # Build up requirements w/ custom requirements - sub["requirements"] = f'(CLIENTGROUP == "{concierge_params.client_group}")' - requirements = [] - if concierge_params.requirements_list: - for item in concierge_params.requirements_list: - key, value = item.split("=") - requirements.append(f'({key} == "{value}")') - sub["requirements"] += 
" && ".join(requirements) - + sub["requirements"] = self._create_requirements_statement(job_reqs) + sub["+KB_CLIENTGROUP"] = f'"{job_reqs.client_group}"' return sub + def _create_requirements_statement(self, job_reqs: JobRequirements) -> str: + reqs = [] + if job_reqs.client_group_regex is not False: + # Default is True, so a value of None means True + reqs = [f'regexp("{job_reqs.client_group}",CLIENTGROUP)'] + else: + reqs = [f'(CLIENTGROUP == "{job_reqs.client_group}")'] + for key, value in job_reqs.scheduler_requirements.items(): + reqs.append(f'({key} == "{value}")') + return " && ".join(reqs) + def _add_resources_and_special_attributes( - self, params: Dict, concierge_params: ConciergeParams = None - ) -> Dict: + self, params: JobSubmissionParameters + ) -> Dict[str, str]: sub = dict() - sub["JobBatchName"] = params.get("job_id") - sub["arguments"] = f"{params['job_id']} {self.ee_endpoint}" + sub["JobBatchName"] = params.job_id + sub["arguments"] = f"{params.job_id} {self.ee_endpoint}" sub = self._add_job_labels(sub=sub, params=params) # Extract special requirements - (sub, client_group) = self._extract_resources_and_requirements( - sub, params["cg_resources_requirements"] - ) + sub = self._extract_resources_and_requirements(sub, params.job_reqs) - sub["+AccountingGroup"] = params.get("user_id") - sub["Concurrency_Limits"] = params.get("user_id") - if concierge_params: - sub = self._modify_with_concierge(sub, concierge_params) - client_group = concierge_params.client_group - sub["+AccountingGroup"] = f'"{sub["+AccountingGroup"]}"' + btu = params.job_reqs.bill_to_user + user = btu if btu else params.user_creds.username + if not params.job_reqs.ignore_concurrency_limits: + sub["Concurrency_Limits"] = user + sub["+AccountingGroup"] = f'"{user}"' - sub["environment"] = self._setup_environment_vars( - params, client_group=client_group - ) + sub["environment"] = self._setup_environment_vars(params) return sub - # TODO Copy stuff from Concierge Params into 
#AcctGroup/Clientgroup/JobPrio, CPu/MEMORY/DISK/ - def _create_submit( - self, params: Dict, concierge_params: ConciergeParams = None - ) -> Dict: + def _create_submit(self, params: JobSubmissionParameters) -> Dict[str, str]: # note some tests call this function directly and will need to be updated if the # signature is changed - self._check_for_missing_runjob_params(params) - sub = self._add_resources_and_special_attributes(params, concierge_params) - sub = self._add_hardcoded_attributes(sub=sub, job_id=params["job_id"]) + sub = self._add_resources_and_special_attributes(params) + sub = self._add_hardcoded_attributes(sub=sub, job_id=params.job_id) sub = self._add_configurable_attributes(sub) # Ensure all values are a string for item in sub.keys(): @@ -311,14 +174,14 @@ def _create_submit( return sub @staticmethod - def _add_job_labels(sub: Dict, params: Dict[str, str]): - sub["+KB_PARENT_JOB_ID"] = params.get("parent_job_id", "") - sub["+KB_MODULE_NAME"] = params.get("method", "").split(".")[0] - sub["+KB_FUNCTION_NAME"] = params.get("method", "").split(".")[-1] - sub["+KB_APP_ID"] = params.get("app_id", "") - sub["+KB_APP_MODULE_NAME"] = params.get("app_id", "").split("/")[0] - sub["+KB_WSID"] = params.get("wsid", "") - sub["+KB_SOURCE_WS_OBJECTS"] = ",".join(params.get("source_ws_objects", list())) + def _add_job_labels(sub: Dict, params: JobSubmissionParameters): + sub["+KB_PARENT_JOB_ID"] = params.parent_job_id or "" + sub["+KB_MODULE_NAME"] = params.app_info.module + sub["+KB_FUNCTION_NAME"] = params.app_info.method + sub["+KB_APP_ID"] = params.app_info.get_application_id() + sub["+KB_APP_MODULE_NAME"] = params.app_info.application_module + sub["+KB_WSID"] = params.wsid or "" + sub["+KB_SOURCE_WS_OBJECTS"] = ",".join(params.source_ws_objects) # Ensure double quoted user inputs for key in sub.keys(): @@ -329,19 +192,14 @@ def _add_job_labels(sub: Dict, params: Dict[str, str]): return sub - def run_job( - self, - params: Dict[str, str], - concierge_params: 
Dict[str, str] = None, - ) -> SubmissionInfo: + def run_job(self, params: JobSubmissionParameters) -> SubmissionInfo: """ TODO: Add a retry TODO: Add list of required params - :param params: Params to run the job, such as the username, job_id, token, client_group_and_requirements - :param concierge_params: Concierge Options for Submit Files + :param params: Params to run the job. :return: """ - submit = self._create_submit(params, concierge_params) + submit = self._create_submit(_not_falsy(params, "params")) sub = self.htcondor.Submit(submit) try: diff --git a/lib/execution_engine2/utils/CondorTuples.py b/lib/execution_engine2/utils/CondorTuples.py index bec435de1..048568084 100644 --- a/lib/execution_engine2/utils/CondorTuples.py +++ b/lib/execution_engine2/utils/CondorTuples.py @@ -15,13 +15,6 @@ class SubmissionInfo(NamedTuple): error: Optional[Exception] -class CondorResources(NamedTuple): - request_cpus: str - request_disk: str - request_memory: str - client_group: str - - class JobStatusCodes(enum.Enum): UNEXPANDED = 0 IDLE = 1 diff --git a/lib/execution_engine2/utils/clients.py b/lib/execution_engine2/utils/clients.py index ce634ed1d..e5a4c928e 100644 --- a/lib/execution_engine2/utils/clients.py +++ b/lib/execution_engine2/utils/clients.py @@ -9,7 +9,6 @@ from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.MongoUtil import MongoUtil from execution_engine2.utils.arg_processing import not_falsy as _not_falsy -from execution_engine2.utils.CatalogUtils import CatalogUtils from execution_engine2.utils.Condor import Condor from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver @@ -99,7 +98,6 @@ def __init__( condor: Condor, catalog: Catalog, requirements_resolver: JobRequirementsResolver, - catalog_utils: CatalogUtils, # TODO JRR remove after replaced by JRR kafka_client: KafkaClient, mongo_util: MongoUtil, 
slack_client: SlackClient, @@ -115,7 +113,6 @@ def __init__( self.requirements_resolver = _not_falsy( requirements_resolver, "requirements_resolver" ) - self.catalog_utils = _not_falsy(catalog_utils, "catalog_utils") self.kafka_client = _not_falsy(kafka_client, "kafka_client") self.mongo_util = _not_falsy(mongo_util, "mongo_util") self.slack_client = _not_falsy(slack_client, "slack_client") @@ -126,9 +123,7 @@ def __init__( def get_clients( - # TODO JRR remove cfg_path when Condor no longer needs it cfg: Dict[str, str], - cfg_path, cfg_file: Iterable[str], override_client_group: str = None, ) -> ( @@ -137,7 +132,6 @@ def get_clients( Condor, Catalog, JobRequirementsResolver, - CatalogUtils, KafkaClient, MongoUtil, SlackClient, @@ -147,7 +141,6 @@ def get_clients( reused from user to user. cfg - the configuration dictionary - cfg_path - the path to the configuration file cfg_file - the full configuration file as a file like object or iterable. override_client_group - a client group name to override any client groups provided by users or the catalog service. @@ -160,7 +153,7 @@ def get_clients( slack-token - a token for contacting Slack """ # Condor needs access to the entire deploy.cfg file, not just the ee2 section - condor = Condor(cfg_path) # TODO JRR replace with cfg when JRR is used + condor = Condor(cfg) # Do a check to ensure the urls and tokens actually work correctly? 
# TODO check keys are present - make some general methods for dealing with this # token is needed for running log_exec_stats in EE2Status @@ -169,7 +162,6 @@ def get_clients( jrr = JobRequirementsResolver( Catalog(cfg["catalog-url"]), cfg_file, override_client_group ) - catalog_utils = CatalogUtils(cfg["catalog-url"], cfg["catalog-token"]) auth_url = cfg["auth-url"] auth = KBaseAuth(auth_url=auth_url + "/api/legacy/KBase/Sessions/Login") # TODO using hardcoded roles for now to avoid possible bugs with mismatched cfg roles @@ -193,7 +185,6 @@ def get_clients( condor, catalog, jrr, - catalog_utils, kafka_client, mongo_util, slack_client, @@ -202,8 +193,6 @@ def get_clients( def get_client_set( cfg: Dict[str, str], - # TODO JRR remove cfg_path when Condor no longer needs it - cfg_path: str, cfg_file: Iterable[str], override_client_group: str = None, ) -> ClientSet: @@ -212,7 +201,6 @@ def get_client_set( in clients individually. cfg - the configuration dictionary - cfg_path - the path to the configuration file cfg_file - the full configuration file as a file like object or iterable. override_client_group - a client group name to override any client groups provided by users or the catalog service. 
@@ -225,4 +213,4 @@ def get_client_set( slack-token - a token for contacting Slack """ - return ClientSet(*get_clients(cfg, cfg_path, cfg_file, override_client_group)) + return ClientSet(*get_clients(cfg, cfg_file, override_client_group)) diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index 4e2c5e9e3..93d569476 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -89,7 +89,7 @@ def getRunner(self, user_clients=None, clients=None) -> SDKMethodRunner: user_clients = get_user_client_set(self.cfg, self.user_id, self.token) if not clients: with open(self.config_file) as cf: - clients = get_client_set(self.cfg, self.config_file, cf) + clients = get_client_set(self.cfg, cf) runner = SDKMethodRunner(user_clients, clients) # type : SDKMethodRunner runner.get_jobs_status() runner.get_runjob() @@ -136,7 +136,9 @@ def test_regular_user(self, catalog): ws_auth.can_write.return_value = True runner = self.getRunner(user_client_set, clients_and_mocks[ClientSet]) method_1 = "module_name.function_name" - job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) + job_params_1 = get_sample_job_params( + method=method_1, wsid=self.ws_id, app_id="module_name/foo" + ) # Check Admin Status is_admin = runner.check_is_admin() @@ -243,7 +245,9 @@ def test_admin_writer(self, workspace, catalog): adminauth.get_admin_role.return_value = ADMIN_WRITE_ROLE method_1 = "module_name.function_name" - job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) + job_params_1 = get_sample_job_params( + method=method_1, wsid=self.ws_id, app_id="module_name/foo" + ) # Check Admin Status is_admin = runner.check_is_admin() diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 3fc0c5f7f..d7c2530c2 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -4,19 +4,23 @@ # Incomplete by a long 
way. Will add more unit tests as they come up. +from pytest import raises from typing import List, Dict, Any from bson.objectid import ObjectId from logging import Logger -from unittest.mock import create_autospec +from unittest.mock import create_autospec, call from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.models.models import Job, JobInput, JobRequirements, Meta +from execution_engine2.exceptions import IncorrectParamsException from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions -from execution_engine2.utils.CatalogUtils import CatalogUtils -from execution_engine2.utils.Condor import Condor, SubmissionInfo, CondorResources from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, JobRequirements as ResolvedRequirements, + AppInfo, + UserCreds, ) from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.Condor import Condor, SubmissionInfo from execution_engine2.utils.KafkaUtils import ( KafkaClient, KafkaQueueChange, @@ -28,6 +32,7 @@ from installed_clients.WorkspaceClient import Workspace from installed_clients.CatalogClient import Catalog from utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS +from utils_shared.test_utils import assert_exception_correct # common variables _JOB_ID = "603051cfaf2e3401b0500982" @@ -39,9 +44,23 @@ _APP = "lolcats/itsmypartyilllolifiwantto" _USER = "someuser" _TOKEN = "tokentokentoken" +_OTHER_USER = "some_sucker" _CREATED_STATE = "created" _QUEUED_STATE = "queued" +# batch common variables +_BATCH = "batch" +_GIT_COMMIT_1 = "commit1" +_GIT_COMMIT_2 = "commit2" +_JOB_ID_1 = "603051cfaf2e3401b0500985" +_JOB_ID_2 = "603051cfaf2e3401b0500986" +_METHOD_1 = "module1.method1" +_APP_1 = "module1/app1" +_METHOD_2 = "module2.method2" +_APP_2 = "module2/app2" +_CLUSTER_1 = "cluster1" +_CLUSTER_2 = "cluster2" + def _set_up_mocks(user: str, token: str) -> Dict[Any, Any]: """ @@ -61,7 +80,6 @@ def 
_set_up_mocks(user: str, token: str) -> Dict[Any, Any]: mocks[WorkspaceAuth] = create_autospec(WorkspaceAuth, spec_set=True, instance=True) # Set up basic getter calls sdkmr.get_catalog.return_value = mocks[Catalog] - sdkmr.get_catalog_utils.return_value = mocks[CatalogUtils] sdkmr.get_condor.return_value = mocks[Condor] sdkmr.get_kafka_client.return_value = mocks[KafkaClient] sdkmr.get_logger.return_value = mocks[Logger] @@ -76,6 +94,50 @@ def _set_up_mocks(user: str, token: str) -> Dict[Any, Any]: return mocks +def _create_job( + reqs: ResolvedRequirements, + user=_USER, + method=_METHOD, + app=_APP, + state=_CREATED_STATE, + git_commit=_GIT_COMMIT, + parent_job_id="None", + source_ws_objects=None, + wsid=None, +): + job = Job() + job.user = user + job.status = state + job.wsid = wsid + ji = JobInput() + ji.method = method + ji.app_id = app + ji.wsid = wsid + ji.service_ver = git_commit + ji.source_ws_objects = source_ws_objects + ji.parent_job_id = parent_job_id + jr = JobRequirements() + jr.clientgroup = reqs.client_group + jr.cpu = reqs.cpus + jr.memory = reqs.memory_MB + jr.disk = reqs.disk_GB + ji.requirements = jr + ji.narrative_cell_info = Meta() + job.job_input = ji + return job + + +def _check_queued_job_save(got_job, job_id, cluster): + expected_job = Job() + expected_job.id = ObjectId(job_id) + expected_job.status = _QUEUED_STATE + # no way to test this really without code refactoring + expected_job.queued = got_job.queued + expected_job.scheduler_type = "condor" + expected_job.scheduler_id = cluster + assert_jobs_equal(got_job, expected_job) + + def _set_up_common_return_values(mocks): """ Set up return values on mocks that are the same for several tests. 
@@ -83,8 +145,8 @@ def _set_up_common_return_values(mocks): mocks[Workspace].get_object_info3.return_value = { "paths": [[_WS_REF_1], [_WS_REF_2]] } - mocks[SDKMethodRunner].save_job.return_value = _JOB_ID mocks[Catalog].get_module_version.return_value = {"git_commit_hash": _GIT_COMMIT} + mocks[SDKMethodRunner].save_job.return_value = _JOB_ID mocks[Condor].run_job.return_value = SubmissionInfo(_CLUSTER, {}, None) retjob = Job() retjob.id = ObjectId(_JOB_ID) @@ -92,7 +154,7 @@ def _set_up_common_return_values(mocks): mocks[MongoUtil].get_job.return_value = retjob -def _check_common_mock_calls(mocks, reqs, creqs, wsid): +def _check_common_mock_calls(mocks, reqs, wsid): """ Check that mocks are called as expected when those calls are similar or the same for several tests. @@ -107,58 +169,30 @@ def _check_common_mock_calls(mocks, reqs, creqs, wsid): ) # initial job data save - expected_job = Job() - expected_job.user = _USER - expected_job.status = _CREATED_STATE - expected_job.wsid = wsid - ji = JobInput() - ji.method = _METHOD - ji.app_id = _APP - ji.wsid = wsid - ji.service_ver = _GIT_COMMIT - ji.source_ws_objects = [_WS_REF_1, _WS_REF_2] - ji.parent_job_id = "None" - jr = JobRequirements() - jr.clientgroup = creqs.client_group - jr.cpu = creqs.request_cpus - jr.memory = creqs.request_memory - jr.disk = creqs.request_disk - ji.requirements = jr - ji.narrative_cell_info = Meta() - expected_job.job_input = ji + expected_job = _create_job( + reqs, wsid=wsid, source_ws_objects=[_WS_REF_1, _WS_REF_2] + ) assert len(sdkmr.save_job.call_args_list) == 2 got_job = sdkmr.save_job.call_args_list[0][0][0] assert_jobs_equal(got_job, expected_job) kafka.send_kafka_message.assert_any_call(KafkaCreateJob(_USER, _JOB_ID)) - params_expected = { - "method": _METHOD, - "app_id": _APP, - "source_ws_objects": [_WS_REF_1, _WS_REF_2], - "service_ver": _GIT_COMMIT, - "job_id": _JOB_ID, - "user_id": _USER, - "token": _TOKEN, - "cg_resources_requirements": reqs, - } - 
mocks[Condor].run_job.assert_called_once_with( - params=params_expected, concierge_params=None + jsp_expected = JobSubmissionParameters( + _JOB_ID, + AppInfo(_METHOD, _APP), + reqs, + UserCreds(_USER, _TOKEN), + wsid=wsid, + source_ws_objects=[_WS_REF_1, _WS_REF_2], ) + mocks[Condor].run_job.assert_called_once_with(params=jsp_expected) # updated job data save mocks[MongoUtil].get_job.assert_called_once_with(_JOB_ID) # update to queued state got_job = sdkmr.save_job.call_args_list[1][0][0] - expected_job = Job() - expected_job.id = ObjectId(_JOB_ID) - expected_job.status = _QUEUED_STATE - # no way to test this really without code refactoring - expected_job.queued = got_job.queued - - expected_job.scheduler_type = "condor" - expected_job.scheduler_id = _CLUSTER - assert_jobs_equal(got_job, expected_job) + _check_queued_job_save(got_job, _JOB_ID, _CLUSTER) kafka.send_kafka_message.assert_called_with( # update to queued state KafkaQueueChange( @@ -183,27 +217,172 @@ def test_run_as_admin(): # Set up data variables client_group = "grotesquememlong" cpus = 4 - mem = "32M" + mem = 32 + disk = 2600 # set up mocks mocks = _set_up_mocks(_USER, _TOKEN) sdkmr = mocks[SDKMethodRunner] - catutils = mocks[CatalogUtils] - condor = mocks[Condor] + jrr = mocks[JobRequirementsResolver] # We intentionally do not check the logger methods as there are a lot of them and this is # already a very large test. This may be something to be added later when needed. # Set up call returns. 
These calls are in the order they occur in the code - sdkmr.check_as_admin.return_value = True - catalog_resources = { + reqs = ResolvedRequirements( + cpus=cpus, memory_MB=mem, disk_GB=disk, client_group=client_group + ) + jrr.resolve_requirements.return_value = reqs + _set_up_common_return_values(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": _APP, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + } + assert rj.run(params, as_admin=True) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. + sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) + jrr.resolve_requirements.assert_called_once_with(_METHOD) + _check_common_mock_calls(mocks, reqs, None) + + +def test_run_as_concierge_with_wsid(): + """ + A unit test of the run() method with a concierge - but not admin - user. + """ + + # Set up data variables + client_group = "tinymem" + cpus = 4 + mem = 32 + disk = 2600 + wsid = 78 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + wsauth = mocks[WorkspaceAuth] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. 
These calls are in the order they occur in the code + wsauth.can_write.return_value = True + jrr.normalize_job_reqs.return_value = { + "request_cpus": cpus, + "request_memory": mem, + "request_disk": disk, "client_group": client_group, + "client_group_regex": False, + "debug_mode": True, + } + reqs = ResolvedRequirements( + cpus=cpus, + memory_MB=mem, + disk_GB=disk, + client_group=client_group, + client_group_regex=False, + ignore_concurrency_limits=False, + bill_to_user=_OTHER_USER, + scheduler_requirements={"foo": "bar", "baz": "bat"}, + debug_mode=True, + ) + jrr.resolve_requirements.return_value = reqs + _set_up_common_return_values(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": _APP, + "wsid": wsid, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + } + conc_params = { "request_cpus": cpus, "request_memory": mem, + "request_disk": disk, + "client_group": client_group, + "client_group_regex": 0, + "ignore_concurrency_limits": 0, + "account_group": _OTHER_USER, + "requirements_list": [" foo = bar ", "baz=bat"], + "debug_mode": 1, } - catutils.get_normalized_resources.return_value = catalog_resources - condor.extract_resources.return_value = CondorResources( - cpus, "2600GB", mem, client_group + assert rj.run(params, concierge_params=conc_params) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. 
+ sdkmr.check_as_concierge.assert_called_once_with() + wsauth.can_write.assert_called_once_with(wsid) + jrr.normalize_job_reqs.assert_called_once_with(conc_params, "concierge parameters") + + jrr.resolve_requirements.assert_called_once_with( + _METHOD, + cpus=cpus, + memory_MB=mem, + disk_GB=disk, + client_group=client_group, + client_group_regex=False, + ignore_concurrency_limits=False, + bill_to_user=_OTHER_USER, + scheduler_requirements={"foo": "bar", "baz": "bat"}, + debug_mode=True, ) + _check_common_mock_calls(mocks, reqs, wsid) + + +def test_run_as_concierge_empty_as_admin(): + """ + A unit test of the run() method with an effectively empty concierge dict and admin privs. + The fake key should be ignored but is required to make the concierge params truthy and + trigger the pathway. + """ + _run_as_concierge_empty_as_admin({"fake": "foo"}) + + +def test_run_as_concierge_sched_reqs_None_as_admin(): + """ + A unit test of the run() method with an concierge dict containing None for the scheduler + requirements and admin privs. + """ + _run_as_concierge_empty_as_admin({"requirements_list": None}) + + +def test_run_as_concierge_sched_reqs_empty_list_as_admin(): + """ + A unit test of the run() method with an concierge dict containing an empty list for the + scheduler requirements and admin privs. + """ + _run_as_concierge_empty_as_admin({"requirements_list": []}) + + +def _run_as_concierge_empty_as_admin(concierge_params): + + # Set up data variables + client_group = "somegroup" + cpus = 1 + mem = 1 + disk = 1 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. 
These calls are in the order they occur in the code + jrr.normalize_job_reqs.return_value = {} + reqs = ResolvedRequirements( + cpus=cpus, + memory_MB=mem, + disk_GB=disk, + client_group=client_group, + ) + jrr.resolve_requirements.return_value = reqs _set_up_common_return_values(mocks) # set up the class to be tested and run the method @@ -213,14 +392,427 @@ def test_run_as_admin(): "app_id": _APP, "source_ws_objects": [_WS_REF_1, _WS_REF_2], } - assert rj.run(params, as_admin=True) == _JOB_ID + assert rj.run(params, concierge_params=concierge_params, as_admin=True) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. + sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) + sdkmr.check_as_concierge.assert_called_once_with() + jrr.normalize_job_reqs.assert_called_once_with( + concierge_params, "concierge parameters" + ) + + jrr.resolve_requirements.assert_called_once_with( + _METHOD, + cpus=None, + memory_MB=None, + disk_GB=None, + client_group=None, + client_group_regex=None, + ignore_concurrency_limits=True, + bill_to_user=None, + scheduler_requirements={}, + debug_mode=None, + ) + _check_common_mock_calls(mocks, reqs, None) + + +def test_run_fail_concierge_params(): + """ + Test that submitting invalid concierge params causes the job to fail. Note that most + error checking happens in the mocked out job requirements resolver, so we only check for + errors that EE2RunJob is responsible for handling. 
+ """ + _run_fail_concierge_params( + {"requirements_list": {"a", "b"}}, + IncorrectParamsException("requirements_list must be a list"), + ) + for err in [None, "", 42, "foo:bar"]: + _run_fail_concierge_params( + {"requirements_list": [err]}, + IncorrectParamsException( + f"Found illegal requirement in requirements_list: {err}" + ), + ) + + +def _run_fail_concierge_params(concierge_params, expected): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": _APP, + } + with raises(Exception) as got: + rj.run(params, concierge_params=concierge_params) + assert_exception_correct(got.value, expected) + + +def test_run_and_run_batch_fail_illegal_arguments(): + """ + Test that illegal arguments cause the job to fail. Note that not all arguments are + checked - this test checks arguments that are checked in the _check_job_arguments() + method. Furthermore, most argument checking occurs in the job submission parameters + class and its respective composed classes, and we don't reproduce all the error conditions + possible - just enough to ensure the error checking occurs. If major changes are made to + the error checking code then more tests may need to be written. + + Tests both the run() and run_batch() methods. 
+ """ + _run_and_run_batch_fail_illegal_arguments( + {}, IncorrectParamsException("Missing input parameter: method ID") + ) + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar"}, + IncorrectParamsException("Missing input parameter: application ID"), + ) + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar", "app_id": "foo/baz", "wsid": 0}, + IncorrectParamsException("wsid must be at least 1"), + ) + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar", "app_id": "foo/baz", "source_ws_objects": {"a": "b"}}, + IncorrectParamsException("source_ws_objects must be a list"), + ) + + +def _run_and_run_batch_fail_illegal_arguments(params, expected): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + jrr.resolve_requirements.return_value = ResolvedRequirements(1, 1, 1, "cg") + _run_and_run_batch_fail(sdkmr, params, expected) + + +def test_run_and_run_batch_fail_workspace_objects_check(): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + jrr.resolve_requirements.return_value = ResolvedRequirements(1, 1, 1, "cg") + mocks[Workspace].get_object_info3.return_value = { + "paths": ["1/2/3", None, "21/34/55"] + } + + params = { + "method": "foo.bar", + "app_id": "foo/baz", + "source_ws_objects": ["1/2/3", "5/8/13", "21/34/55"], + } + _run_and_run_batch_fail( + sdkmr, params, ValueError("Some workspace object is inaccessible") + ) + + +def _run_and_run_batch_fail(sdkmr, params, expected): + rj = EE2RunJob(sdkmr) + with raises(Exception) as got: + rj.run(params, as_admin=True) + assert_exception_correct(got.value, expected) + + with raises(Exception) as got: + rj.run_batch([params], {}, as_admin=True) + assert_exception_correct(got.value, expected) + + +def _set_up_common_return_values_batch(mocks): + """ + Set up return values on mocks that are the same for several tests. 
+ """ + reqs1 = ResolvedRequirements( + cpus=1, + memory_MB=2, + disk_GB=3, + client_group="cg1", + ) + reqs2 = ResolvedRequirements( + cpus=10, + memory_MB=20, + disk_GB=30, + client_group="cg2", + ) + mocks[JobRequirementsResolver].resolve_requirements.side_effect = [reqs1, reqs2] + mocks[Workspace].get_object_info3.return_value = { + "paths": [[_WS_REF_1], [_WS_REF_2]] + } + returned_parent_job = Job() + returned_parent_job.id = ObjectId(_JOB_ID) + returned_parent_job.user = _USER + mocks[SDKMethodRunner].save_and_return_job.return_value = returned_parent_job + mocks[Catalog].get_module_version.side_effect = [ + {"git_commit_hash": _GIT_COMMIT_1}, + {"git_commit_hash": _GIT_COMMIT_2}, + ] + # create job1, update job1, create job2, update job2, update parent job + mocks[SDKMethodRunner].save_job.side_effect = [ + _JOB_ID_1, + None, + _JOB_ID_2, + None, + None, + ] + mocks[Condor].run_job.side_effect = [ + SubmissionInfo(_CLUSTER_1, {}, None), + SubmissionInfo(_CLUSTER_2, {}, None), + ] + retjob_1 = Job() + retjob_1.id = ObjectId(_JOB_ID_1) + retjob_1.status = _CREATED_STATE + retjob_2 = Job() + retjob_2.id = ObjectId(_JOB_ID_2) + retjob_2.status = _CREATED_STATE + mocks[MongoUtil].get_job.side_effect = [retjob_1, retjob_2] + return reqs1, reqs2 + + +def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): + """ + Check that mocks are called as expected when those calls are similar or the same for + several tests. 
+ """ + sdkmr = mocks[SDKMethodRunner] + mocks[JobRequirementsResolver].resolve_requirements.assert_has_calls( + [ + call(_METHOD_1), + call(_METHOD_2), + ] + ) + mocks[Workspace].get_object_info3.assert_called_once_with( + {"objects": [{"ref": _WS_REF_1}, {"ref": _WS_REF_2}], "ignoreErrors": 1} + ) + + # parent job initial save + expected_parent_job = Job() + job_input = JobInput() + job_input.service_ver = _BATCH + job_input.app_id = _BATCH + job_input.method = _BATCH + job_input.narrative_cell_info = Meta() + expected_parent_job.job_input = job_input + expected_parent_job.batch_job = True + expected_parent_job.status = _CREATED_STATE + expected_parent_job.wsid = parent_wsid + expected_parent_job.user = _USER + assert len(sdkmr.save_and_return_job.call_args_list) == 1 + got_parent_job = sdkmr.save_and_return_job.call_args_list[0][0][0] + assert_jobs_equal(got_parent_job, expected_parent_job) + + mocks[Catalog].get_module_version.assert_has_calls( + [ + call({"module_name": "module1", "version": "release"}), + call({"module_name": "module2", "version": "release"}), + ] + ) + + assert len(sdkmr.save_job.call_args_list) == 5 + + # initial child jobs data save + expected_job_1 = _create_job( + reqs1, + method=_METHOD_1, + app=_APP_1, + git_commit=_GIT_COMMIT_1, + source_ws_objects=[_WS_REF_1, _WS_REF_2], + parent_job_id=_JOB_ID, + ) + got_job_1 = sdkmr.save_job.call_args_list[0][0][0] + assert_jobs_equal(got_job_1, expected_job_1) + + expected_job_2 = _create_job( + reqs2, + method=_METHOD_2, + app=_APP_2, + git_commit=_GIT_COMMIT_2, + wsid=wsid, + parent_job_id=_JOB_ID, + ) + # index 2 because job 1 is updated with save_job before this job is created + got_job_2 = sdkmr.save_job.call_args_list[2][0][0] + assert_jobs_equal(got_job_2, expected_job_2) + + jsp_expected_1 = JobSubmissionParameters( + _JOB_ID_1, + AppInfo(_METHOD_1, _APP_1), + reqs1, + UserCreds(_USER, _TOKEN), + parent_job_id=_JOB_ID, + source_ws_objects=[_WS_REF_1, _WS_REF_2], + ) + jsp_expected_2 = 
JobSubmissionParameters( + _JOB_ID_2, + AppInfo(_METHOD_2, _APP_2), + reqs2, + UserCreds(_USER, _TOKEN), + parent_job_id=_JOB_ID, + wsid=wsid, + ) + mocks[Condor].run_job.assert_has_calls( + [call(params=jsp_expected_1), call(params=jsp_expected_2)] + ) + + # updated job data save + mocks[MongoUtil].get_job.assert_has_calls([call(_JOB_ID_1), call(_JOB_ID_2)]) + + # update to queued state + got_queued_job_1 = sdkmr.save_job.call_args_list[1][0][0] + got_queued_job_2 = sdkmr.save_job.call_args_list[3][0][0] + _check_queued_job_save(got_queued_job_1, _JOB_ID_1, _CLUSTER_1) + _check_queued_job_save(got_queued_job_2, _JOB_ID_2, _CLUSTER_2) + + mocks[KafkaClient].send_kafka_message.assert_has_calls( + [ + call(KafkaCreateJob(job_id=_JOB_ID, user=_USER)), # parent job + call(KafkaCreateJob(job_id=_JOB_ID_1, user=_USER)), + call( + KafkaQueueChange( + job_id=_JOB_ID_1, + new_status=_QUEUED_STATE, + previous_status=_CREATED_STATE, + scheduler_id=_CLUSTER_1, + ) + ), + call(KafkaCreateJob(job_id=_JOB_ID_2, user=_USER)), + call( + KafkaQueueChange( + job_id=_JOB_ID_2, + new_status=_QUEUED_STATE, + previous_status=_CREATED_STATE, + scheduler_id=_CLUSTER_2, + ) + ), + ] + ) + + mocks[SlackClient].run_job_message.assert_has_calls( + [ + call(job_id=_JOB_ID_1, scheduler_id=_CLUSTER_1, username=_USER), + call(job_id=_JOB_ID_2, scheduler_id=_CLUSTER_2, username=_USER), + ] + ) + + final_expected_parent_job = Job() + final_expected_parent_job.id = ObjectId(_JOB_ID) + final_expected_parent_job.user = _USER + final_expected_parent_job.child_jobs = [_JOB_ID_1, _JOB_ID_2] + final_got_parent_job = sdkmr.save_job.call_args_list[4][0][0] + assert_jobs_equal(final_got_parent_job, final_expected_parent_job) + + +def test_run_batch_with_parent_job_wsid(): + """ + A basic unit test of the run_batch() method, providing a workspace ID for the parent job. + + This test is a fairly minimal test of the run_batch() method. 
It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. + """ + # set up variables + parent_wsid = 89 + wsid = 32 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. These calls are in the order they occur in the code + + mocks[WorkspaceAuth].can_write.return_value = True + mocks[WorkspaceAuth].can_write_list.return_value = {wsid: True} + reqs1, reqs2 = _set_up_common_return_values_batch(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = [ + { + "method": _METHOD_1, + "app_id": _APP_1, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + }, + { + "method": _METHOD_2, + "app_id": _APP_2, + "wsid": wsid, + }, + ] + assert rj.run_batch(params, {"wsid": parent_wsid}) == { + "parent_job_id": _JOB_ID, + "child_job_ids": [_JOB_ID_1, _JOB_ID_2], + } + + # check mocks called as expected. The order here is the order that they're called in the code. + mocks[WorkspaceAuth].can_write.assert_called_once_with(parent_wsid) + # this seems like a bug. See comments in the run_batch method + mocks[WorkspaceAuth].can_write_list.assert_called_once_with([parent_wsid, wsid]) + _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid) + + +def test_run_batch_as_admin(): + """ + A basic unit test of the run_batch() method with an administrative user. + + This test is a fairly minimal test of the run_batch() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. 
+ """ + # set up variables + wsid = 32 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. These calls are in the order they occur in the code + reqs1, reqs2 = _set_up_common_return_values_batch(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = [ + { + "method": _METHOD_1, + "app_id": _APP_1, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + }, + { + "method": _METHOD_2, + "app_id": _APP_2, + "wsid": wsid, + }, + ] + assert rj.run_batch(params, {}, as_admin=True) == { + "parent_job_id": _JOB_ID, + "child_job_ids": [_JOB_ID_1, _JOB_ID_2], + } # check mocks called as expected. The order here is the order that they're called in the code. sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) - catutils.get_normalized_resources.assert_called_once_with(_METHOD) - condor.extract_resources.assert_called_once_with(catalog_resources) - expected_condor_resources = CondorResources(4, "2600", "32", client_group) - _check_common_mock_calls(mocks, catalog_resources, expected_condor_resources, None) + _check_common_mock_calls_batch(mocks, reqs1, reqs2, None, wsid) + + +def test_run_batch_fail_params_not_list(): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + + rj = EE2RunJob(sdkmr) + for params in [ + None, + {}, + { + 1, + }, + "a", + 8, + ]: + with raises(Exception) as got: + rj.run_batch(params, {}, as_admin=True) + assert_exception_correct( + got.value, IncorrectParamsException("params must be a list") + ) def assert_jobs_equal(got_job: Job, expected_job: Job): diff --git a/test/tests_for_sdkmr/EE2Status_test.py b/test/tests_for_sdkmr/EE2Status_test.py index 9555c6220..b1cdc311c 100644 --- a/test/tests_for_sdkmr/EE2Status_test.py +++ 
b/test/tests_for_sdkmr/EE2Status_test.py @@ -11,7 +11,6 @@ from execution_engine2.sdk.EE2Status import JobsStatus, JobPermissions from execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.utils.KafkaUtils import KafkaClient, KafkaFinishJob -from lib.execution_engine2.utils.CatalogUtils import CatalogUtils from lib.execution_engine2.utils.Condor import Condor from installed_clients.CatalogClient import Catalog diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py index 7a4647ad7..279cf0438 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py @@ -38,7 +38,7 @@ def setUpClass(cls): with open(deploy) as cf: cls.method_runner = SDKMethodRunner( get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, deploy, cf), + get_client_set(cls.cfg, cf), ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index c38861df3..1da33045d 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -19,7 +19,6 @@ from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.MongoUtil import MongoUtil -from execution_engine2.utils.CatalogUtils import CatalogUtils from execution_engine2.utils.Condor import Condor from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient @@ -28,7 +27,8 @@ from execution_engine2.exceptions import AuthError from lib.execution_engine2.exceptions import InvalidStatusTransitionException from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from 
execution_engine2.sdk.job_submission_parameters import JobRequirements +from lib.execution_engine2.utils.CondorTuples import SubmissionInfo from execution_engine2.utils.clients import UserClientSet, ClientSet from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper @@ -77,7 +77,7 @@ def setUpClass(cls): with open(cls.config_file) as cf: cls.method_runner = SDKMethodRunner( get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, cls.config_file, cf), + get_client_set(cls.cfg, cf), ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -86,12 +86,6 @@ def setUpClass(cls): db=cls.cfg["mongo-database"], col=cls.cfg["mongo-jobs-collection"] ) - cls.cr = CondorResources( - request_cpus="1", - request_disk="1GB", - request_memory="100M", - client_group="njs", - ) cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: @@ -168,7 +162,6 @@ def test_getters(self): assert sdkmr.get_kafka_client() is clients_and_mocks[KafkaClient] assert sdkmr.get_mongo_util() is clients_and_mocks[MongoUtil] assert sdkmr.get_slack_client() is clients_and_mocks[SlackClient] - assert sdkmr.get_catalog_utils() is clients_and_mocks[CatalogUtils] assert sdkmr.get_condor() is clients_and_mocks[Condor] assert sdkmr.get_catalog() is clients_and_mocks[Catalog] assert ( @@ -283,22 +276,18 @@ def test_cancel_job2(self, rq_mock, condor_mock): runner.get_condor = MagicMock(return_value=condor_mock) fixed_rj = EE2RunJob(runner) fixed_rj._get_module_git_commit = MagicMock(return_value="hash_goes_here") - fixed_rj.sdkmr.catalog_utils.list_client_group_configs = MagicMock( - return_value="cg goes her" - ) runner.get_runjob = MagicMock(return_value=fixed_rj) # ctx = {"user_id": self.user_id, "wsid": self.ws_id, "token": self.token} job = get_example_job().to_mongo().to_dict() - job["method"] = 
job["job_input"]["app_id"] + job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) print("About to run job with params") pprint(job) job_id0 = runner.run_job(params=job) @@ -430,13 +419,12 @@ def test_run_job_and_add_log(self, rq_mock, condor_mock): ) runner.get_condor = MagicMock(return_value=condor_mock) job = get_example_job(user=self.user_id, wsid=self.ws_id).to_mongo().to_dict() - job["method"] = job["job_input"]["app_id"] + job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) job_id = runner.run_job(params=job) logging.info(f"Job id is {job_id} ") @@ -580,7 +568,7 @@ def test_finish_job(self, condor): runner = self.getRunner() runner._test_job_permissions = MagicMock(return_value=True) - runner.catalog_utils.get_catalog().log_exec_stats = MagicMock(return_value=True) + runner.get_catalog().log_exec_stats = MagicMock(return_value=True) # test missing job_id input with self.assertRaises(ValueError) as context1: @@ -734,7 +722,7 @@ def test_check_job_global_perm(self, rq_mock): with open(self.config_file) as cf: other_method_runner = SDKMethodRunner( get_user_client_set(self.cfg, "some_other_user", "other_token"), - get_client_set(self.cfg, self.config_file, cf), + get_client_set(self.cfg, cf), ) job_states = other_method_runner.get_jobs_status().check_workspace_jobs( self.ws_id @@ -920,7 +908,18 @@ def test_check_jobs_date_range(self, condor_mock): runner = self.getRunner() + # TODO redo this test with dependency injection & autospec vs. 
monkey patching + resolver = create_autospec( + JobRequirementsResolver, spec_set=True, instance=True + ) runner.workspace_auth = MagicMock() + runner.get_job_requirements_resolver = MagicMock(return_value=resolver) + resolver.resolve_requirements.return_value = JobRequirements( + cpus=1, + memory_MB=100, + disk_GB=1, + client_group="njs", + ) runner.auth.get_user = MagicMock(return_value=user_name) runner.check_is_admin = MagicMock(return_value=True) @@ -937,13 +936,12 @@ def test_check_jobs_date_range(self, condor_mock): runner.get_condor = MagicMock(return_value=condor_mock) # ctx = {"user_id": self.user_id, "wsid": self.ws_id, "token": self.token} job = get_example_job().to_mongo().to_dict() - job["method"] = job["job_input"]["app_id"] + job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) job_id1 = runner.run_job(params=job) job_id2 = runner.run_job(params=job) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 9c66069e0..db9ffaafd 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -12,11 +12,12 @@ from lib.execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.db.models.models import Job from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from lib.execution_engine2.utils.CondorTuples import SubmissionInfo from execution_engine2.utils.clients import ( get_client_set, get_user_client_set, ) +from execution_engine2.sdk.job_submission_parameters import JobRequirements from 
test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -56,7 +57,7 @@ def setUpClass(cls): with open(config_file) as cf: cls.method_runner = SDKMethodRunner( get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, config_file, cf), + get_client_set(cls.cfg, cf), ) cls.mongo_util = MongoUtil(cls.cfg) @@ -66,12 +67,6 @@ def setUpClass(cls): db=cls.cfg["mongo-database"], col=cls.cfg["mongo-jobs-collection"] ) - cls.cr = CondorResources( - request_cpus="1", - request_disk="1GB", - request_memory="100M", - client_group="njs", - ) cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: @@ -86,7 +81,7 @@ def create_job_rec(self): return self.sdkmr_test_helper.create_job_rec() def test_init_ok(self): - class_attri = ["catalog_utils", "workspace", "mongo_util", "condor"] + class_attri = ["workspace", "mongo_util", "condor"] runner = self.getRunner() self.assertTrue(set(class_attri) <= set(runner.__dict__.keys())) @@ -110,6 +105,7 @@ def test_init_job_rec(self): "min_contig_len": None, } ], + "job_reqs": JobRequirements(1, 1, 1, "njs"), "source_ws_objects": ["a/b/c", "e/d"], "parent_job_id": "9998", "meta": {"tag": "dev", "token_id": "12345"}, @@ -237,7 +233,6 @@ def test_run_job(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) job_id = runner.run_job(params=job) print(f"Job id is {job_id} ") @@ -259,7 +254,6 @@ def test_run_job_batch(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) @@ -291,7 +285,6 @@ def test_run_job_fail(self, rq_mock, condor_mock): si = 
SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) with self.assertRaises(expected_exception=RuntimeError): runner.run_job(params=job) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index 2ca2ecb3f..5985ba92d 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -12,7 +12,7 @@ from lib.execution_engine2.db.models.models import Job from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from lib.execution_engine2.utils.CondorTuples import SubmissionInfo from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper from test.utils_shared.test_utils import bootstrap, get_example_job @@ -48,14 +48,8 @@ def setUpClass(cls): with open(config_file) as cf: cls.method_runner = SDKMethodRunner( get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, config_file, cf), + get_client_set(cls.cfg, cf), ) - cls.cr = CondorResources( - request_cpus="1", - request_disk="1GB", - request_memory="100M", - client_group="njs", - ) cls.fake_used_resources = { "RemoteUserCpu": "1", "DiskUsage_RAW": "1", @@ -96,7 +90,6 @@ def test_run_job_and_handle_held(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) condor_mock.get_job_resource_info = MagicMock( return_value=self.fake_used_resources ) @@ -184,7 +177,6 @@ def test_cancel_job_batch(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", 
submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) @@ -213,7 +205,6 @@ def test_abandon_children(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) @@ -250,7 +241,6 @@ def test_check_job_batch(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 9192cb2c2..3a98b233f 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -15,6 +15,7 @@ from execution_engine2.execution_engine2Impl import execution_engine2 from execution_engine2.sdk.EE2Status import JobsStatus from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.sdk.job_submission_parameters import JobRequirements from execution_engine2.utils.Condor import Condor from execution_engine2.utils.CondorTuples import SubmissionInfo from execution_engine2.utils.clients import get_user_client_set, get_client_set @@ -58,7 +59,7 @@ def _getRunner(cls) -> SDKMethodRunner: with open(cls.deploy) as cf: runner = SDKMethodRunner( get_user_client_set(cls.cfg, cls.user_id, cls.token), - get_client_set(cls.cfg, cls.deploy, cf), + get_client_set(cls.cfg, cf), ) # Initialize these clients from None status = runner.get_jobs_status() # type: JobsStatus @@ -85,7 +86,9 @@ def 
test_init_job_stress(self): method_1 = "app_1.a_method" method_2 = "app_1.b_method" job_params_1 = get_sample_job_params(method=method_1) + job_params_1["job_reqs"] = JobRequirements(1, 1, 1, "njs") job_params_2 = get_sample_job_params(method=method_2) + job_params_2["job_reqs"] = JobRequirements(1, 1, 1, "njs") threads = list() job_ids = list() @@ -141,6 +144,7 @@ def test_update_job_status_stress(self): runner = self.method_runner job_params = get_sample_job_params() + job_params["job_reqs"] = JobRequirements(1, 1, 1, "njs") thread_count = self.thread_count # threads to test @@ -258,9 +262,9 @@ def test_run_job_stress(self, ccles, cc, workspace, condor): method_2 = "app2.b_method" method_3 = "app3.c_method" - job_params_1 = get_sample_job_params(method=method_1) - job_params_2 = get_sample_job_params(method=method_2) - job_params_3 = get_sample_job_params(method=method_3) + job_params_1 = get_sample_job_params(method=method_1, app_id="app1/a") + job_params_2 = get_sample_job_params(method=method_2, app_id="app2/b") + job_params_3 = get_sample_job_params(method=method_3, app_id="app3/c") threads = list() job_ids = list() @@ -325,6 +329,7 @@ def test_update_job_status(self): runner = self.method_runner job_params = get_sample_job_params() + job_params["job_reqs"] = JobRequirements(1, 1, 1, "njs") thread_count = self.thread_count # threads to test @@ -415,7 +420,9 @@ def test_check_jobs_stress(self): method_1 = "a_method" method_2 = "b_method" job_params_1 = get_sample_job_params(method=method_1) + job_params_1["job_reqs"] = JobRequirements(1, 1, 1, "njs") job_params_2 = get_sample_job_params(method=method_2) + job_params_2["job_reqs"] = JobRequirements(1, 1, 1, "njs") # create jobs job_id_1 = runner.get_runjob()._init_job_rec(self.user_id, job_params_1) @@ -466,6 +473,7 @@ def test_check_job_canceled_stress(self): runner = self.method_runner job_params = get_sample_job_params() + job_params["job_reqs"] = JobRequirements(1, 1, 1, "njs") # create jobs job_id_running 
= runner.get_runjob()._init_job_rec(self.user_id, job_params) @@ -569,9 +577,9 @@ def test_get_job_logs_stress(self): runner = self.method_runner # create job - job_id = runner.get_runjob()._init_job_rec( - self.user_id, get_sample_job_params() - ) + params = get_sample_job_params() + params["job_reqs"] = JobRequirements(1, 1, 1, "njs") + job_id = runner.get_runjob()._init_job_rec(self.user_id, params) # add one line to job ts = time.time() @@ -626,9 +634,9 @@ def test_add_job_logs_stress(self): runner = self.method_runner # create job - job_id = runner.get_runjob()._init_job_rec( - self.user_id, get_sample_job_params() - ) + params = get_sample_job_params() + params["job_reqs"] = JobRequirements(1, 1, 1, "njs") + job_id = runner.get_runjob()._init_job_rec(self.user_id, params) # job line to be added ts = time.time() diff --git a/test/tests_for_sdkmr/ee2_scheduler_test.py b/test/tests_for_sdkmr/ee2_scheduler_test.py index dbd2038b6..f56e284bf 100644 --- a/test/tests_for_sdkmr/ee2_scheduler_test.py +++ b/test/tests_for_sdkmr/ee2_scheduler_test.py @@ -1,12 +1,25 @@ # -*- coding: utf-8 -*- + +""" +Tests for the Condor scheduler. 
+""" + import logging -import os import unittest -from lib.execution_engine2.sdk.EE2Runjob import ConciergeParams -from lib.execution_engine2.utils.CatalogUtils import CatalogUtils from lib.execution_engine2.utils.Condor import Condor -from test.utils_shared.test_utils import bootstrap +from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, + JobRequirements, +) +from execution_engine2.utils.job_requirements_resolver import ( + REQUEST_CPUS, + REQUEST_DISK, + REQUEST_MEMORY, +) +from execution_engine2.utils.application_info import AppInfo +from execution_engine2.utils.user_info import UserCreds +from test.utils_shared.test_utils import bootstrap, get_ee2_test_config logging.basicConfig(level=logging.INFO) @@ -16,63 +29,51 @@ class ExecutionEngine2SchedulerTest(unittest.TestCase): @classmethod def setUpClass(cls): - deploy = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") - cls.condor = Condor(deploy) + cls.condor = Condor(get_ee2_test_config()) cls.job_id = "1234" cls.user = "kbase" - cls.catalog_utils = CatalogUtils( - url="https://ci.kbase.us/services/Catalog", admin_token="123" - ) - - @classmethod - def tearDownClass(cls): - if hasattr(cls, "wsName"): - cls.wsClient.delete_workspace({"workspace": cls.wsName}) - print("Test workspace was deleted") - - def _create_sample_params(self, cgroups): - params = dict() - params["job_id"] = self.job_id - params["user_id"] = "kbase" - params["token"] = "test_token" - rr = CatalogUtils.normalize_job_settings(cgroups) - - print(rr) - params["cg_resources_requirements"] = rr - - return params - def test_empty_params(self): - c = self.condor - params = {"job_id": "test_job_id", "user_id": "test", "token": "test_token"} - with self.assertRaisesRegex( - Exception, "cg_resources_requirements not found in params" - ): - c._create_submit(params) + def _create_sample_params(self, request_x, scheduler_requirements=None): + sr = scheduler_requirements if scheduler_requirements else {} + 
return JobSubmissionParameters( + self.job_id, + AppInfo("fake.fake", "fake/app"), + JobRequirements( + request_x.get("request_cpus", 101), + request_x.get("request_memory", 102), + request_x.get("request_disk", 103), + request_x.get("client_group", "defaultcg"), + request_x.get("client_group_regex", True), + bill_to_user=request_x.get("bill_to_user"), + ignore_concurrency_limits=request_x.get( + "ignore_concurrency_limits", False + ), + scheduler_requirements=sr, + ), + UserCreds(self.user, "test_token"), + ) def test_create_submit_file(self): # Test with empty clientgroup logging.info("Testing with njs clientgroup") c = self.condor - params = self._create_sample_params(cgroups=["njs"]) + params = self._create_sample_params({"client_group": "njs"}) default_sub = c._create_submit(params) sub = default_sub self.assertEqual(sub["executable"], c.initial_dir + "/" + c.executable) - self.assertEqual(sub["arguments"], f"{params['job_id']} {c.ee_endpoint}") + self.assertEqual(sub["arguments"], f"{self.job_id} {c.ee_endpoint}") self.assertEqual(sub["universe"], "vanilla") - self.assertEqual(sub["+AccountingGroup"], '"' + params["user_id"] + '"') - self.assertEqual(sub["Concurrency_Limits"], params["user_id"]) + self.assertEqual(sub["+AccountingGroup"], f'"{self.user}"') + self.assertEqual(sub["Concurrency_Limits"], self.user) self.assertEqual(sub["+Owner"], '"condor_pool"') self.assertEqual(sub["ShouldTransferFiles"], "YES") self.assertEqual(sub["When_To_Transfer_Output"], "ON_EXIT_OR_EVICT") - self.assertEqual(sub[Condor.REQUEST_CPUS], c.config["njs"][Condor.REQUEST_CPUS]) - self.assertEqual( - sub[Condor.REQUEST_MEMORY], c.config["njs"][Condor.REQUEST_MEMORY] - ) - self.assertEqual(sub[Condor.REQUEST_DISK], c.config["njs"][Condor.REQUEST_DISK]) + self.assertEqual(sub[REQUEST_CPUS], "101") + self.assertEqual(sub[REQUEST_MEMORY], "102MB") + self.assertEqual(sub[REQUEST_DISK], "103GB") # TODO Test this variable somehow # environment = sub["environment"].split(" ") @@ -81,7 
+82,8 @@ def test_create_submit_file(self): logging.info("Testing with complex-empty clientgroup") params = self._create_sample_params( - cgroups=["njs", "request_cpus=8", "request_memory=10GB", "request_apples=5"] + {"client_group": "njs", "request_cpus": 8, "request_memory": 10}, + {"request_apples": "5"}, ) njs_sub = c._create_submit(params) @@ -93,9 +95,9 @@ def test_create_submit_file(self): self.assertIn('request_apples == "5"', sub["requirements"]) - self.assertEqual(sub[Condor.REQUEST_CPUS], "8") - self.assertEqual(sub[Condor.REQUEST_MEMORY], "10GB") - self.assertEqual(sub[Condor.REQUEST_DISK], c.config["njs"][Condor.REQUEST_DISK]) + self.assertEqual(sub[REQUEST_CPUS], "8") + self.assertEqual(sub[REQUEST_MEMORY], "10MB") + self.assertEqual(sub[REQUEST_DISK], "103GB") logging.info("Testing with regex disabled in old format (no effect)") @@ -111,16 +113,14 @@ def test_create_submit_file(self): logging.info("Testing with empty clientgroup defaulting to njs") - params = self._create_sample_params(cgroups="") + params = self._create_sample_params({}) empty_sub = c._create_submit(params) sub = empty_sub - self.assertEqual(sub[Condor.REQUEST_CPUS], c.config["njs"][Condor.REQUEST_CPUS]) - self.assertEqual( - sub[Condor.REQUEST_MEMORY], c.config["njs"][Condor.REQUEST_MEMORY] - ) - self.assertEqual(sub[Condor.REQUEST_DISK], c.config["njs"][Condor.REQUEST_DISK]) + self.assertEqual(sub[REQUEST_CPUS], "101") + self.assertEqual(sub[REQUEST_MEMORY], "102MB") + self.assertEqual(sub[REQUEST_DISK], "103GB") # logging.info("Testing with empty dict (raises typeerror)") # @@ -129,85 +129,49 @@ def test_create_submit_file(self): # print(params) # empty_json_sub = c.create_submit(params) - logging.info("Testing with empty dict as a string ") - - params = self._create_sample_params(cgroups=["{}"]) - - empty_json_sub = c._create_submit(params) - - params = self._create_sample_params(cgroups=['{"client_group" : "njs"}']) - - json_sub = c._create_submit(params) + 
logging.info("Testing with regex disabled, bigmem") params = self._create_sample_params( - cgroups=['{"client_group" : "njs", "client_group_regex" : "false"}'] + {"client_group": "bigmem", "client_group_regex": False} ) - json_sub_with_regex_disabled_njs = c._create_submit(params) - - # json_sub_with_regex_disabled - - logging.info("Testing with real valid json ") - for sub in [empty_json_sub, json_sub, json_sub_with_regex_disabled_njs]: - self.assertEqual( - sub[Condor.REQUEST_CPUS], c.config["njs"][Condor.REQUEST_CPUS] - ) - self.assertEqual( - sub[Condor.REQUEST_MEMORY], c.config["njs"][Condor.REQUEST_MEMORY] - ) - self.assertEqual( - sub[Condor.REQUEST_DISK], c.config["njs"][Condor.REQUEST_DISK] - ) - - with self.assertRaises(ValueError): - logging.info("Testing with real json invalid cgroup {bigmemzlong} ") - params = self._create_sample_params( - cgroups='{"client_group" : "bigmemzlong", "client_group_regex" : "FaLsE"}' - ) - - # json_sub_with_regex_disabled - c._create_submit(params) - - logging.info("Testing with real json, regex disabled, bigmem") - - params = self._create_sample_params( - cgroups=['{"client_group" : "bigmem", "client_group_regex" : "FaLsE"}'] - ) - - json_sub_with_regex_disabled_bigmem = c._create_submit(params) + sub_with_regex_disabled_bigmem = c._create_submit(params) self.assertIn( '(CLIENTGROUP == "bigmem', - json_sub_with_regex_disabled_bigmem["requirements"], + sub_with_regex_disabled_bigmem["requirements"], ) - def _get_concierge_params(self, cg=None): - cp = {} - cp["request_cpus"] = 100 - cp["request_memory"] = 200 - cp["request_disk"] = 1000 - if cg: - cp["client_group"] = cg - return ConciergeParams(**cp) - - def test_create_submit_file_concierge(self): + def test_create_submit_file_without_concurrency_limits_and_bill_to_user(self): logging.info("Testing with concierge clientgroup") c = self.condor - params = self._create_sample_params(cgroups=["njs"]) - cp = self._get_concierge_params() - sub = c._create_submit(params=params, 
concierge_params=cp) + params = self._create_sample_params( + { + "client_group": "njs", + "request_cpus": 100, + "request_memory": 200, + "request_disk": 1000, + "ignore_concurrency_limits": True, + } + ) + sub = c._create_submit(params=params) # Concurrency limits removed self.assertNotIn("Concurrency_Limits", sub) - self.assertEqual(sub["+AccountingGroup"], '"' + params["user_id"] + '"') - self.assertEqual(sub[Condor.REQUEST_CPUS], str(cp.request_cpus)) - self.assertEqual(sub[Condor.REQUEST_MEMORY], str(cp.request_memory)) - self.assertEqual(sub[Condor.REQUEST_DISK], str(cp.request_disk)) - self.assertEqual(sub["+KB_CLIENTGROUP"], f'"{str(cp.client_group)}"') - - cp.client_group = "LeConcierge" - cp.account_group = "LeCat" - sub2 = c._create_submit(params=params, concierge_params=cp) - self.assertEqual(sub2["+KB_CLIENTGROUP"], f'"{str(cp.client_group)}"') - self.assertEqual(sub2["+AccountingGroup"], '"' + cp.account_group + '"') + self.assertEqual(sub["+AccountingGroup"], f'"{self.user}"') + self.assertEqual(sub[REQUEST_CPUS], "100") + self.assertEqual(sub[REQUEST_MEMORY], "200MB") + self.assertEqual(sub[REQUEST_DISK], "1000GB") + self.assertEqual(sub["+KB_CLIENTGROUP"], '"njs"') + + params = self._create_sample_params( + { + "client_group": "LeConcierge", + "bill_to_user": "LeCat", + "ignore_concurrency_limits": True, + } + ) + sub2 = c._create_submit(params=params) + self.assertEqual(sub2["+KB_CLIENTGROUP"], '"LeConcierge"') + self.assertEqual(sub2["+AccountingGroup"], '"LeCat"') self.assertNotIn("Concurrency_Limits", sub2) # submission_info = c.run_submit(sub2) diff --git a/test/tests_for_utils/Condor_test.py b/test/tests_for_utils/Condor_test.py new file mode 100644 index 000000000..9bc099e43 --- /dev/null +++ b/test/tests_for_utils/Condor_test.py @@ -0,0 +1,212 @@ +""" +Unit tests for the Condor wrapper. 
+""" + +# TODO Add tests for get_job_resource_info and cancel_job + +import htcondor +from unittest.mock import create_autospec + +from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, + JobRequirements, +) +from execution_engine2.utils.application_info import AppInfo +from execution_engine2.utils.user_info import UserCreds +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.CondorTuples import SubmissionInfo + +# Note the executable existence code in the constructor appears to be buggy and will never +# throw an error. If it checks for existence of initial-dir/executable as well as just executable +# that makes testing a bit ungainly as executable will have to exist in the current directory. +# TODO fix the executable existence bug in the Condor constructor + + +def _mock_htc(): + htc = create_autospec(htcondor, spec_set=True) + sub = create_autospec(htcondor.Submit, spec_set=True, instance=True) + htc.Submit.return_value = sub + schedd = create_autospec(htcondor.Schedd, spec_set=True, instance=True) + htc.Schedd.return_value = schedd + txn = create_autospec(htcondor.Transaction, spec_set=True, instance=True) + # mock context manager ops + schedd.transaction.return_value = txn + txn.__enter__.return_value = txn + return htc, sub, schedd, txn + + +def _get_common_sub(job_id): + return { + "universe": "vanilla", + "ShouldTransferFiles": "YES", + "on_exit_hold": "ExitCode =!= 0", + "JobLeaseDuration": "43200", + "MaxJobRetirementTime": "43200", + "Periodic_Hold": "( RemoteWallClockTime > 604800 )", + "log": "runner_logs/$(Cluster).$(Process).log", + "error": f"runner_logs/{job_id}.err", + "output": f"runner_logs/{job_id}.out", + "transfer_output_remaps": f'"runner_logs/{job_id}.err=cluster_logs/{job_id}.err;' + + f'runner_logs/{job_id}.out=cluster_logs/{job_id}.out"', + "When_To_Transfer_Output": "ON_EXIT_OR_EVICT", + "getenv": "false", + } + + +def _check_calls(htc, schedd, sub, txn, expected_sub): + 
htc.Submit.assert_called_once_with(expected_sub) + htc.Schedd.assert_called_once_with() + schedd.transaction.assert_called_once_with() + sub.queue.assert_called_once_with(txn, 1) + + +def test_run_job_minimal(): + htc, sub, schedd, txn = _mock_htc() + c = Condor( + { + "external-url": "https://fake.com", + "executable": "file.exe", + "catalog-token": "cattoken", + }, + htc=htc, + ) + sub.queue.return_value = 123 + + subinfo = c.run_job( + JobSubmissionParameters( + "jobbyjob", + AppInfo("foo.bar", "foo/whoo"), + JobRequirements(2, 3, 4, "cg"), + UserCreds("user1", "token"), + ) + ) + # presumably sub being part of the submission info is a bug. I assume that it's intended + # to be the submission dictionary. However, that contains admin tokens and SubmissionInfo + # gets logged so maybe it's better this way. + assert subinfo == SubmissionInfo("123", sub, None) + + expected_sub = _get_common_sub("jobbyjob") + expected_sub.update( + { + "JobBatchName": "jobbyjob", + "arguments": "jobbyjob https://fake.com", + "+KB_PARENT_JOB_ID": "", + "+KB_MODULE_NAME": '"foo"', + "+KB_FUNCTION_NAME": '"bar"', + "+KB_APP_ID": '"foo/whoo"', + "+KB_APP_MODULE_NAME": '"foo"', + "+KB_WSID": "", + "+KB_SOURCE_WS_OBJECTS": "", + "request_cpus": "2", + "request_memory": "3MB", + "request_disk": "4GB", + "requirements": 'regexp("cg",CLIENTGROUP)', + "+KB_CLIENTGROUP": '"cg"', + "Concurrency_Limits": "user1", + "+AccountingGroup": '"user1"', + "environment": ( + '"DOCKER_JOB_TIMEOUT=604801 KB_ADMIN_AUTH_TOKEN=cattoken KB_AUTH_TOKEN=token ' + + "CLIENTGROUP=cg JOB_ID=jobbyjob CONDOR_ID=$(Cluster).$(Process) " + + 'PYTHON_EXECUTABLE=/miniconda/bin/python DEBUG_MODE=False PARENT_JOB_ID= "' + ), + "leavejobinqueue": "True", + "initial_dir": "/condor_shared", + "+Owner": '"condor_pool"', + "executable": "/condor_shared/file.exe", + "transfer_input_files": "/condor_shared/JobRunner.tgz", + } + ) + _check_calls(htc, schedd, sub, txn, expected_sub) + + +def 
test_run_job_maximal_with_concurrency_limits(): + """ + Tests with all constructor arguments and method arguments with concurrency limits. + """ + _run_job_maximal(True, {}) + + +def test_run_job_maximal_without_concurrency_limits(): + """ + Tests with all constructor arguments and method arguments without concurrency limits. + """ + _run_job_maximal(False, {"Concurrency_Limits": "sucker"}) + + +def _run_job_maximal(ignore_concurrency_limits, update): + htc, sub, schedd, txn = _mock_htc() + c = Condor( + { + "external-url": "https://fake2.com", + "executable": "somefile.exe", + "catalog-token": "catsupertoken", + "PYTHON_EXECUTABLE": "python1.3", + "initialdir": "/somedir", + "docker_timeout": 42, + "pool_user": "thosedamnkidsnextdoor", + "leavejobinqueue": "False", + "transfer_input_files": "alan_alda_nude.tiff", + }, + htc=htc, + ) + + sub.queue.return_value = 789 + + subinfo = c.run_job( + JobSubmissionParameters( + "a_job_id", + AppInfo("kb_quast.run_quast_app", "kb_quast/run_QUAST_app"), + JobRequirements( + 6, + 28, + 496, + "clientclientclient", + client_group_regex=False, + bill_to_user="sucker", + ignore_concurrency_limits=ignore_concurrency_limits, + scheduler_requirements={"a": "b", "c": "d"}, + debug_mode=True, + ), + UserCreds("user2", "suparsekrit"), + parent_job_id="old_n_gross", + wsid=89, + source_ws_objects=["1/2/3", "4/5/7"], + ) + ) + # presumably sub being part of the submission info is a bug. I assume that it's intended + # to be the submission dictionary. However, that contains admin tokens and SubmissionInfo + # gets logged so maybe it's better this way. 
+ assert subinfo == SubmissionInfo("789", sub, None) + + expected_sub = _get_common_sub("a_job_id") + expected_sub.update(update) + expected_sub.update( + { + "JobBatchName": "a_job_id", + "arguments": "a_job_id https://fake2.com", + "+KB_PARENT_JOB_ID": '"old_n_gross"', + "+KB_MODULE_NAME": '"kb_quast"', + "+KB_FUNCTION_NAME": '"run_quast_app"', + "+KB_APP_ID": '"kb_quast/run_QUAST_app"', + "+KB_APP_MODULE_NAME": '"kb_quast"', + "+KB_WSID": '"89"', + "+KB_SOURCE_WS_OBJECTS": '"1/2/3,4/5/7"', + "request_cpus": "6", + "request_memory": "28MB", + "request_disk": "496GB", + "requirements": '(CLIENTGROUP == "clientclientclient") && (a == "b") && (c == "d")', + "+KB_CLIENTGROUP": '"clientclientclient"', + "+AccountingGroup": '"sucker"', + "environment": ( + '"DOCKER_JOB_TIMEOUT=42 KB_ADMIN_AUTH_TOKEN=catsupertoken KB_AUTH_TOKEN=suparsekrit ' + + "CLIENTGROUP=clientclientclient JOB_ID=a_job_id CONDOR_ID=$(Cluster).$(Process) " + + 'PYTHON_EXECUTABLE=python1.3 DEBUG_MODE=True PARENT_JOB_ID=old_n_gross "' + ), + "leavejobinqueue": "False", + "initial_dir": "/somedir", + "+Owner": '"thosedamnkidsnextdoor"', + "executable": "/somedir/somefile.exe", + "transfer_input_files": "alan_alda_nude.tiff", + } + ) + _check_calls(htc, schedd, sub, txn, expected_sub) diff --git a/test/tests_for_utils/clients_test.py b/test/tests_for_utils/clients_test.py index bc9a68111..92bd39e01 100644 --- a/test/tests_for_utils/clients_test.py +++ b/test/tests_for_utils/clients_test.py @@ -16,7 +16,6 @@ from execution_engine2.authorization.roles import AdminAuthUtil from execution_engine2.db.MongoUtil import MongoUtil -from execution_engine2.utils.CatalogUtils import CatalogUtils from execution_engine2.utils.Condor import Condor from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver from execution_engine2.utils.KafkaUtils import KafkaClient @@ -93,30 +92,27 @@ def test_client_set_init_fail(): c = mocks[Condor] ca = mocks[Catalog] j = mocks[JobRequirementsResolver] - cu 
= mocks[CatalogUtils] k = mocks[KafkaClient] m = mocks[MongoUtil] s = mocks[SlackClient] n = None e = ValueError("auth cannot be a value that evaluates to false") - _client_set_init_fail(n, aa, c, ca, j, cu, k, m, s, e) + _client_set_init_fail(n, aa, c, ca, j, k, m, s, e) e = ValueError("auth_admin cannot be a value that evaluates to false") - _client_set_init_fail(a, n, c, ca, j, cu, k, m, s, e) + _client_set_init_fail(a, n, c, ca, j, k, m, s, e) e = ValueError("condor cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, n, ca, j, cu, k, m, s, e) + _client_set_init_fail(a, aa, n, ca, j, k, m, s, e) e = ValueError("catalog cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, n, j, cu, k, m, s, e) + _client_set_init_fail(a, aa, c, n, j, k, m, s, e) e = ValueError("requirements_resolver cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, n, cu, k, m, s, e) - e = ValueError("catalog_utils cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, j, n, k, m, s, e) + _client_set_init_fail(a, aa, c, ca, n, k, m, s, e) e = ValueError("kafka_client cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, j, cu, n, m, s, e) + _client_set_init_fail(a, aa, c, ca, j, n, m, s, e) e = ValueError("mongo_util cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, j, cu, k, n, s, e) + _client_set_init_fail(a, aa, c, ca, j, k, n, s, e) e = ValueError("slack_client cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, j, cu, k, m, n, e) + _client_set_init_fail(a, aa, c, ca, j, k, m, n, e) def _client_set_init_fail( @@ -125,7 +121,6 @@ def _client_set_init_fail( condor: Condor, catalog: Catalog, requirements_resolver: JobRequirementsResolver, - catalog_utils: CatalogUtils, kafka_client: KafkaClient, mongo_util: MongoUtil, slack_client: SlackClient, @@ -138,7 +133,6 @@ def _client_set_init_fail( condor, 
catalog, requirements_resolver, - catalog_utils, kafka_client, mongo_util, slack_client, diff --git a/test/utils_shared/mock_utils.py b/test/utils_shared/mock_utils.py index 2e67220b7..2acdcae78 100644 --- a/test/utils_shared/mock_utils.py +++ b/test/utils_shared/mock_utils.py @@ -1,7 +1,6 @@ from unittest.mock import create_autospec from execution_engine2.db.MongoUtil import MongoUtil -from execution_engine2.utils.CatalogUtils import CatalogUtils from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient @@ -27,12 +26,9 @@ def _build_job_reqs(config, cfgfile): AdminAuthUtil: lambda config, cfgfile: AdminAuthUtil( config["auth-url"], [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE] ), - Condor: lambda config, cfgfile: Condor(cfgfile), + Condor: lambda config, cfgfile: Condor(config), Catalog: lambda config, cfgfile: Catalog(config["catalog-url"]), JobRequirementsResolver: _build_job_reqs, - CatalogUtils: lambda config, cfgfile: CatalogUtils( - config["catalog-url"], config["catalog-token"] - ), KafkaClient: lambda config, cfgfile: KafkaClient(config["kafka-host"]), MongoUtil: lambda config, cfgfile: MongoUtil(config), SlackClient: lambda config, cfgfile: SlackClient( @@ -67,7 +63,6 @@ def get_client_mocks(config, config_path, *to_be_mocked): ret[Condor], ret[Catalog], ret[JobRequirementsResolver], - ret[CatalogUtils], ret[KafkaClient], ret[MongoUtil], ret[SlackClient], diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index d23752074..e37bb7063 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -14,7 +14,7 @@ from lib.execution_engine2.db.models.models import Job, JobInput, Meta from lib.execution_engine2.db.models.models import Status from lib.execution_engine2.exceptions import MalformedTimestampException -from lib.execution_engine2.utils.CondorTuples import 
CondorResources, JobInfo +from lib.execution_engine2.utils.CondorTuples import JobInfo EE2_CONFIG_SECTION = "execution_engine2" @@ -47,7 +47,7 @@ def get_example_job_as_dict( .to_mongo() .to_dict() ) - job["method"] = job["job_input"]["app_id"] + job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] return job @@ -65,11 +65,11 @@ def get_example_job( job_input = JobInput() job_input.wsid = j.wsid - job_input.method = "method" + job_input.method = "module.method" job_input.requested_release = "requested_release" job_input.params = {} job_input.service_ver = "dev" - job_input.app_id = "super_module.super_function" + job_input.app_id = "module/super_function" m = Meta() m.cell_id = "ApplePie" @@ -93,7 +93,7 @@ def get_example_job_as_dict_for_runjob( user=user, wsid=wsid, authstrat=authstrat, scheduler_id=scheduler_id ) job_dict = job.to_mongo().to_dict() - job_dict["method"] = job["job_input"]["app_id"] + job_dict["method"] = job["job_input"]["method"] job_dict["app_id"] = job["job_input"]["app_id"] job_dict["service_ver"] = job["job_input"]["service_ver"] return job_dict @@ -361,14 +361,13 @@ def get_sample_condor_info(job=None, error=None): return JobInfo(info=job, error=error) -def get_sample_job_params(method=None, wsid="123"): - if not method: - method = "default_method" - +def get_sample_job_params( + method="MEGAHIT.default_method", wsid=123, app_id="MEGAHIT/run_megahit" +): job_params = { "wsid": wsid, "method": method, - "app_id": "MEGAHIT/run_megahit", + "app_id": app_id, "service_ver": "2.2.1", "params": [ { From afeba1a05a2e464d4fddca058fbb62fac167d44b Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 8 Apr 2021 19:04:29 -0700 Subject: [PATCH 046/109] Remove ee2_scheduler_tests (#352) Made completely redundant by Condor_test. 
--- test/tests_for_sdkmr/ee2_scheduler_test.py | 196 --------------------- 1 file changed, 196 deletions(-) delete mode 100644 test/tests_for_sdkmr/ee2_scheduler_test.py diff --git a/test/tests_for_sdkmr/ee2_scheduler_test.py b/test/tests_for_sdkmr/ee2_scheduler_test.py deleted file mode 100644 index f56e284bf..000000000 --- a/test/tests_for_sdkmr/ee2_scheduler_test.py +++ /dev/null @@ -1,196 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests for the Condor scheduler. -""" - -import logging -import unittest - -from lib.execution_engine2.utils.Condor import Condor -from execution_engine2.sdk.job_submission_parameters import ( - JobSubmissionParameters, - JobRequirements, -) -from execution_engine2.utils.job_requirements_resolver import ( - REQUEST_CPUS, - REQUEST_DISK, - REQUEST_MEMORY, -) -from execution_engine2.utils.application_info import AppInfo -from execution_engine2.utils.user_info import UserCreds -from test.utils_shared.test_utils import bootstrap, get_ee2_test_config - -logging.basicConfig(level=logging.INFO) - -bootstrap() - - -class ExecutionEngine2SchedulerTest(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.condor = Condor(get_ee2_test_config()) - cls.job_id = "1234" - cls.user = "kbase" - - def _create_sample_params(self, request_x, scheduler_requirements=None): - sr = scheduler_requirements if scheduler_requirements else {} - return JobSubmissionParameters( - self.job_id, - AppInfo("fake.fake", "fake/app"), - JobRequirements( - request_x.get("request_cpus", 101), - request_x.get("request_memory", 102), - request_x.get("request_disk", 103), - request_x.get("client_group", "defaultcg"), - request_x.get("client_group_regex", True), - bill_to_user=request_x.get("bill_to_user"), - ignore_concurrency_limits=request_x.get( - "ignore_concurrency_limits", False - ), - scheduler_requirements=sr, - ), - UserCreds(self.user, "test_token"), - ) - - def test_create_submit_file(self): - # Test with empty clientgroup - logging.info("Testing with njs 
clientgroup") - c = self.condor - params = self._create_sample_params({"client_group": "njs"}) - - default_sub = c._create_submit(params) - - sub = default_sub - self.assertEqual(sub["executable"], c.initial_dir + "/" + c.executable) - self.assertEqual(sub["arguments"], f"{self.job_id} {c.ee_endpoint}") - self.assertEqual(sub["universe"], "vanilla") - self.assertEqual(sub["+AccountingGroup"], f'"{self.user}"') - self.assertEqual(sub["Concurrency_Limits"], self.user) - self.assertEqual(sub["+Owner"], '"condor_pool"') - self.assertEqual(sub["ShouldTransferFiles"], "YES") - self.assertEqual(sub["When_To_Transfer_Output"], "ON_EXIT_OR_EVICT") - - self.assertEqual(sub[REQUEST_CPUS], "101") - self.assertEqual(sub[REQUEST_MEMORY], "102MB") - self.assertEqual(sub[REQUEST_DISK], "103GB") - - # TODO Test this variable somehow - # environment = sub["environment"].split(" ") - - # Test with filled out clientgroup - logging.info("Testing with complex-empty clientgroup") - - params = self._create_sample_params( - {"client_group": "njs", "request_cpus": 8, "request_memory": 10}, - {"request_apples": "5"}, - ) - - njs_sub = c._create_submit(params) - sub = njs_sub - - self.assertIn("njs", sub["requirements"]) - - self.assertIn('regexp("njs",CLIENTGROUP)', sub["requirements"]) - - self.assertIn('request_apples == "5"', sub["requirements"]) - - self.assertEqual(sub[REQUEST_CPUS], "8") - self.assertEqual(sub[REQUEST_MEMORY], "10MB") - self.assertEqual(sub[REQUEST_DISK], "103GB") - - logging.info("Testing with regex disabled in old format (no effect)") - - # with self.assertRaisesRegex( - # ValueError, "Illegal argument! 
Old format does not support this option" - # ): - # params = self._create_sample_params( - # cgroups=["njs,request_cpus=8,request_memory=10GB,request_apples=5,client_group_regex=False"] - # ) - # c.create_submit(params) # pragma: no cover - - # Test with json version of clientgroup - - logging.info("Testing with empty clientgroup defaulting to njs") - - params = self._create_sample_params({}) - - empty_sub = c._create_submit(params) - sub = empty_sub - - self.assertEqual(sub[REQUEST_CPUS], "101") - self.assertEqual(sub[REQUEST_MEMORY], "102MB") - self.assertEqual(sub[REQUEST_DISK], "103GB") - - # logging.info("Testing with empty dict (raises typeerror)") - # - # with self.assertRaises(TypeError): - # params = self._create_sample_params(cgroups={}) - # print(params) - # empty_json_sub = c.create_submit(params) - - logging.info("Testing with regex disabled, bigmem") - - params = self._create_sample_params( - {"client_group": "bigmem", "client_group_regex": False} - ) - - sub_with_regex_disabled_bigmem = c._create_submit(params) - self.assertIn( - '(CLIENTGROUP == "bigmem', - sub_with_regex_disabled_bigmem["requirements"], - ) - - def test_create_submit_file_without_concurrency_limits_and_bill_to_user(self): - logging.info("Testing with concierge clientgroup") - c = self.condor - params = self._create_sample_params( - { - "client_group": "njs", - "request_cpus": 100, - "request_memory": 200, - "request_disk": 1000, - "ignore_concurrency_limits": True, - } - ) - sub = c._create_submit(params=params) - # Concurrency limits removed - self.assertNotIn("Concurrency_Limits", sub) - self.assertEqual(sub["+AccountingGroup"], f'"{self.user}"') - self.assertEqual(sub[REQUEST_CPUS], "100") - self.assertEqual(sub[REQUEST_MEMORY], "200MB") - self.assertEqual(sub[REQUEST_DISK], "1000GB") - self.assertEqual(sub["+KB_CLIENTGROUP"], '"njs"') - - params = self._create_sample_params( - { - "client_group": "LeConcierge", - "bill_to_user": "LeCat", - "ignore_concurrency_limits": True, - } 
- ) - sub2 = c._create_submit(params=params) - self.assertEqual(sub2["+KB_CLIENTGROUP"], '"LeConcierge"') - self.assertEqual(sub2["+AccountingGroup"], '"LeCat"') - self.assertNotIn("Concurrency_Limits", sub2) - - # submission_info = c.run_submit(sub2) - # - # self.assertIsNotNone(submission_info.clusterid) - # self.assertIsNotNone(submission_info.submit) - # self.assertIsNone(submission_info.error) - - # - # def test_extract(self): - # logging.info("Testing with concierge clientgroup") - # c = self.condor - # params = self._create_sample_params(cgroups=["njs"]) - # cp = self._get_concierge_params() - # sub = c.create_submit(params=params, concierge_params=cp) - # submission_info = c.run_submit(sub) - # print(submission_info) - # - # - # def test_get_usage(self): - # job_id = '732' - # print(self.condor.get_job_resource_info(cluster_id=job_id)) From 6a76cb4a59121aaa3decd0ba17731c4fe45d0300 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 8 Apr 2021 19:43:21 -0700 Subject: [PATCH 047/109] don't allow users to set a job's parent job (#353) Not necessary and allows setting a job's parent to an arbitrary job, which is not what we want --- lib/execution_engine2/sdk/EE2Runjob.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 571f35a59..fe15f0e69 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -299,8 +299,7 @@ def _create_parent_job(self, wsid, meta): def _run_batch(self, parent_job: Job, params): child_jobs = [] for job_param in params: - if _PARENT_JOB_ID not in job_param: - job_param[_PARENT_JOB_ID] = str(parent_job.id) + job_param[_PARENT_JOB_ID] = str(parent_job.id) try: child_jobs.append(str(self._run(params=job_param))) except Exception as e: From 60464f2ab5878585a8b35e1f1f9df84c218ceb5a Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 22 Apr 2021 11:59:50 -0700 Subject: [PATCH 048/109] DATAUP-389 - 
Add workspace instance to integration tests (#354) * Add workspace instance to integration tests Includes a basic test to check the workspace is running correctly which will be deleted when real tests are added. * run black * switch some constants to local vars not needed outside of local scope * shorten ws controller init method Still pretty long, but easier to digest * clean up the types db prior to running tests --- test/tests_for_integration/api_to_db_test.py | 103 +++++-- .../workspace_controller.py | 269 ++++++++++++++++++ test/tests_for_integration/wsjars | 40 +++ 3 files changed, 389 insertions(+), 23 deletions(-) create mode 100644 test/tests_for_integration/workspace_controller.py create mode 100644 test/tests_for_integration/wsjars diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 391a290ed..cfb7ef243 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -5,9 +5,11 @@ NOTE 2: These tests were set up quickly in order to debug a problem with administration related calls. As such, the auth server was set up to run in test mode locally. If more integrations -(e.g. the workspace) are needed, they will need to be added either locally or as docker containers. -If the latter, the test auth integration will likely need to be converted to a docker container or -exposed to other containers. +are needed, they will need to be added either locally or as docker containers. +If the latter, the test auth and workspace integrations will likely need to be converted to +docker containers or exposed to other containers. + +NOTE 3: Posting to Slack always fails silently.
""" import os @@ -21,6 +23,7 @@ from pytest import fixture from typing import Dict from tests_for_integration.auth_controller import AuthController +from tests_for_integration.workspace_controller import WorkspaceController from utils_shared.test_utils import ( get_full_test_config, get_ee2_test_config, @@ -34,10 +37,9 @@ ) from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from installed_clients.execution_engine2Client import execution_engine2 as ee2client +from installed_clients.WorkspaceClient import Workspace KEEP_TEMP_FILES = False -AUTH_DB = "api_to_db_test" -AUTH_MONGO_USER = "auth" TEMP_DIR = Path("test_temp_can_delete") # may need to make this configurable @@ -73,13 +75,17 @@ def mongo_client(config): mc.close() -def _clean_auth_db(mongo_client): +def _clean_db(mongo_client, db, db_user): try: - mongo_client[AUTH_DB].command("dropUser", AUTH_MONGO_USER) + mongo_client[db].command("dropUser", db_user) except pymongo.errors.OperationFailure as e: - if f"User '{AUTH_MONGO_USER}@{AUTH_DB}' not found" not in e.args[0]: + if f"User '{db_user}@{db}' not found" not in e.args[0]: raise # otherwise ignore and continue, user is already toast - mongo_client.drop_database(AUTH_DB) + mongo_client.drop_database(db) + + +def _create_db_user(mongo_client, db, db_user, password): + mongo_client[db].command("createUser", db_user, pwd=password, roles=["readWrite"]) def _set_up_auth_users(auth_url): @@ -103,19 +109,20 @@ def _set_up_auth_users(auth_url): @fixture(scope="module") def auth_url(config, mongo_client): + auth_db = "api_to_db_auth_test" + auth_mongo_user = "auth" # clean up from any previously failed test runs that left the db in place - _clean_auth_db(mongo_client) + _clean_db(mongo_client, auth_db, auth_mongo_user) # make a user for the auth db - mongo_client[AUTH_DB].command( - "createUser", AUTH_MONGO_USER, pwd="authpwd", roles=["readWrite"] - ) + _create_db_user(mongo_client, auth_db, auth_mongo_user, "authpwd") + auth = 
AuthController( JARS_DIR, config["mongo-host"], - AUTH_DB, + auth_db, TEMP_DIR, - mongo_user=AUTH_MONGO_USER, + mongo_user=auth_mongo_user, mongo_pwd="authpwd", ) print( @@ -133,10 +140,48 @@ def auth_url(config, mongo_client): # Because the tests are run with mongo in a persistent docker container via docker-compose, # we need to clean up after ourselves. - _clean_auth_db(mongo_client) + _clean_db(mongo_client, auth_db, auth_mongo_user) + + +@fixture(scope="module") +def ws_controller(config, mongo_client, auth_url): + ws_db = "api_to_db_ws_test" + ws_types_db = "api_to_db_ws_types_test" + ws_mongo_user = "workspace" + # clean up from any previously failed test runs that left the db in place + _clean_db(mongo_client, ws_db, ws_mongo_user) + _clean_db(mongo_client, ws_types_db, ws_mongo_user) + + # make a user for the ws dbs + _create_db_user(mongo_client, ws_db, ws_mongo_user, "wspwd") + _create_db_user(mongo_client, ws_types_db, ws_mongo_user, "wspwd") + + ws = WorkspaceController( + JARS_DIR, + config["mongo-host"], + ws_db, + ws_types_db, + auth_url + "/testmode/", + TEMP_DIR, + mongo_user=ws_mongo_user, + mongo_pwd="wspwd", + ) + print( + f"Started KBase Workspace {ws.version} on port {ws.port} " + + f"in dir {ws.temp_dir} in {ws.startup_count}s" + ) + yield ws + + print(f"shutting down workspace, KEEP_TEMP_FILES={KEEP_TEMP_FILES}") + ws.destroy(not KEEP_TEMP_FILES) + + # Because the tests are run with mongo in a persistent docker container via docker-compose, + # we need to clean up after ourselves. + _clean_db(mongo_client, ws_db, ws_mongo_user) + _clean_db(mongo_client, ws_types_db, ws_mongo_user) -def _update_config_and_create_config_file(full_config, auth_url): +def _update_config_and_create_config_file(full_config, auth_url, ws_controller): """ Updates the config in place with the correct auth url for the tests and writes the updated config to a temporary file. 
@@ -151,6 +196,7 @@ def _update_config_and_create_config_file(full_config, auth_url): ee2c["auth-service-url-v2"] = auth_url + "/testmode/api/v2/token" ee2c["auth-url"] = auth_url + "/testmode" ee2c["auth-service-url-allow-insecure"] = "true" + ee2c["workspace-url"] = f"http://localhost:{ws_controller.port}" deploy = tempfile.mkstemp(".cfg", "deploy-", dir=TEMP_DIR, text=True) os.close(deploy[0]) @@ -161,21 +207,26 @@ def _update_config_and_create_config_file(full_config, auth_url): return deploy[1] -def _clear_ee2_db(mc: pymongo.MongoClient, config: Dict[str, str]): +def _clear_dbs( + mc: pymongo.MongoClient, config: Dict[str, str], ws_controller: WorkspaceController +): ee2 = mc[config["mongo-database"]] for name in ee2.list_collection_names(): if not name.startswith("system."): # don't drop collection since that drops indexes ee2.get_collection(name).delete_many({}) + ws_controller.clear_db() @fixture(scope="module") -def service(full_config, auth_url, mongo_client, config): +def service(full_config, auth_url, mongo_client, config, ws_controller): # also updates the config in place so it contains the correct auth urls for any other # methods that use the config fixture - cfgpath = _update_config_and_create_config_file(full_config, auth_url) + cfgpath = _update_config_and_create_config_file( + full_config, auth_url, ws_controller + ) print(f"created test deploy at {cfgpath}") - _clear_ee2_db(mongo_client, config) + _clear_dbs(mongo_client, config, ws_controller) prior_deploy = os.environ[KB_DEPLOY_ENV] # from this point on, calling the get_*_test_config methods will get the temp config file @@ -209,8 +260,8 @@ def service(full_config, auth_url, mongo_client, config): @fixture -def ee2_port(service, mongo_client, config): - _clear_ee2_db(mongo_client, config) +def ee2_port(service, mongo_client, config, ws_controller): + _clear_dbs(mongo_client, config, ws_controller) yield service @@ -234,3 +285,9 @@ def test_get_admin_permission_success(ee2_port): assert 
ee2cli_read.get_admin_permission() == {"permission": "r"} assert ee2cli_no.get_admin_permission() == {"permission": "n"} assert ee2cli_write.get_admin_permission() == {"permission": "w"} + + +def test_temporary_check_ws(ee2_port, ws_controller): + wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) + ws = wsc.create_workspace({"workspace": "foo"}) + assert ws[1] == "foo" diff --git a/test/tests_for_integration/workspace_controller.py b/test/tests_for_integration/workspace_controller.py new file mode 100644 index 000000000..92fa59733 --- /dev/null +++ b/test/tests_for_integration/workspace_controller.py @@ -0,0 +1,269 @@ +""" +Q&D Utility to run a Workspace server for the purposes of testing. + +Initializes a GridFS backend and does not support handles, bytestreams or samples. +""" + +import os as _os +import shutil as _shutil +import subprocess as _subprocess +import tempfile as _tempfile +import time as _time +from pathlib import Path as _Path +from pymongo.mongo_client import MongoClient + +import requests as _requests + +from configparser import ConfigParser as _ConfigParser +from installed_clients.WorkspaceClient import Workspace as _Workspace +from installed_clients.baseclient import ServerError as _ServerError + +from utils_shared.test_utils import TestException as _TestException +from utils_shared import test_utils as _test_utils + +_WS_CLASS = "us.kbase.workspace.WorkspaceServer" +_JARS_FILE = _Path(__file__).resolve().parent.joinpath("wsjars") + + +class WorkspaceController: + """ + The main Workspace controller class. The Workspace will allow users with the KBase Auth + service WS_READ_ADMIN role to use read-only administration methods and WS_FULL_ADMIN role + to use all administration methods. + + Attributes: + version - the version of the Workspace service + port - the port for the Workspace service. + temp_dir - the location of the Workspace data and logs. + """ + + # TODO This code likely belongs somewhere else. 
Not quite sure where though, maybe in WS repo. + # TODO This code is similar to the auth controller code, DRY it up? + + def __init__( + self, + jars_dir: _Path, + mongo_host: str, + mongo_db: str, + mongo_type_db: str, + auth_url: str, + root_temp_dir: _Path, + mongo_user: str = None, + mongo_pwd: str = None, + ): + """ + Create and start a new Workspace service. An unused port will be selected for the server. + + :param jars_dir: The path to the lib/jars dir of the KBase Jars repo + (https://github.com/kbase/jars), e.g /path_to_repo/lib/jars. + :param mongo_host: The address for the MongoDB host. + :param mongo_db: The database in which to store Workspace data. + :param mongo_type_db: The database in which to store Workspace type specifications. + :param auth_url: The root url of an instance of the KBase auth service. + :param root_temp_dir: A temporary directory in which to store Workspace data and log files. + The files will be stored inside a child directory that is unique per invocation. + :param mongo_user: The username for the Mongo account, if provided. The user is expected + to be a user in the provided databases with readWrite permission. + :param mongo_pwd: The password for the Mongo account, if provided. 
+ """ + self._check_params( + jars_dir, + mongo_host, + mongo_db, + mongo_type_db, + auth_url, + root_temp_dir, + mongo_user, + mongo_pwd, + ) + + self._db = mongo_db + jars_dir = jars_dir.resolve() + class_path = self._get_class_path(jars_dir) + + # make temp dirs + root_temp_dir = root_temp_dir.absolute() + _os.makedirs(root_temp_dir, exist_ok=True) + self.temp_dir = _Path( + _tempfile.mkdtemp(prefix="WorkspaceController-", dir=str(root_temp_dir)) + ) + ws_temp_dir = self.temp_dir.joinpath("temp_files") + _os.makedirs(ws_temp_dir) + + configfile = self._create_deploy_cfg( + self.temp_dir, + ws_temp_dir, + mongo_host, + mongo_db, + mongo_type_db, + auth_url, + mongo_user, + mongo_pwd, + ) + newenv = _os.environ.copy() + newenv["KB_DEPLOYMENT_CONFIG"] = configfile + + self.port = _test_utils.find_free_port() + + command = ["java", "-classpath", class_path, _WS_CLASS, str(self.port)] + + self._wslog = self.temp_dir / "ws.log" + self._outfile = open(self._wslog, "w") + + self._proc = _subprocess.Popen( + command, stdout=self._outfile, stderr=_subprocess.STDOUT, env=newenv + ) + + self.version, self.startup_count = self._wait_for_service() + self._mongo_client = self._get_mongo_client( + mongo_host, mongo_db, mongo_user, mongo_pwd + ) + + def _check_params( + self, + jars_dir: _Path, + mongo_host: str, + mongo_db: str, + mongo_type_db: str, + auth_url: str, + root_temp_dir: _Path, + mongo_user: str, + mongo_pwd: str, + ): + if not jars_dir or not _os.access(jars_dir, _os.X_OK): + raise _TestException( + "jars_dir {} does not exist or is not executable.".format(jars_dir) + ) + if not mongo_host: + raise _TestException("mongo_controller must be provided") + if not mongo_db: + raise _TestException("mongo_db must be provided") + if not mongo_type_db: + raise _TestException("mongo_type_db must be provided") + if not auth_url: + raise _TestException("auth_url must be provided") + if not root_temp_dir: + raise _TestException("root_temp_dir is None") + if bool(mongo_user) ^ 
bool(mongo_pwd): # xor + raise _TestException( + "Neither or both of mongo_user and mongo_pwd is required" + ) + + def _get_class_path(self, jars_dir: _Path): + cp = [] + with open(_JARS_FILE) as jf: + for line in jf: + if line.strip() and not line.startswith("#"): + p = jars_dir.joinpath(line.strip()) + if not p.is_file(): + raise _TestException(f"Required jar does not exist: {p}") + cp.append(str(p)) + return ":".join(cp) + + def _create_deploy_cfg( + self, + temp_dir, + ws_temp_dir, + mongo_host, + mongo_db, + mongo_type_db, + auth_url, + mongo_user, + mongo_pwd, + ): + cp = _ConfigParser() + cp["Workspace"] = { + "mongodb-host": mongo_host, + "mongodb-database": mongo_db, + "mongodb-type-database": mongo_type_db, + "backend-type": "GridFS", + "auth-service-url": auth_url + "/api/legacy/KBase", + "auth-service-url-allow-insecure": "true", + # TODO WS trailing slash should not be necessary + # see https://github.com/kbase/workspace_deluxe/issues/350 + "auth2-service-url": auth_url + "/", + "temp-dir": str(ws_temp_dir), + "ignore-handle-service": "true", + "auth2-ws-admin-read-only-roles": "WS_READ_ADMIN", + "auth2-ws-admin-full-roles": "WS_FULL_ADMIN", + } + if mongo_user: + cp["Workspace"]["mongodb-user"] = mongo_user + cp["Workspace"]["mongodb-pwd"] = mongo_pwd + f = temp_dir / "test.cfg" + with open(f, "w") as inifile: + cp.write(inifile) + return f + + def _wait_for_service(self): + ws = _Workspace(f"http://localhost:{self.port}") + for count in range(40): + err = None + _time.sleep(1) # wait for server to start + try: + version = ws.ver() + break + except (_ServerError, _requests.exceptions.ConnectionError) as se: + err = _TestException(se.args[0]) + err.__cause__ = se + if err: + print("Error starting workspace service. 
Dumping logs and throwing error") + self._print_ws_logs() + raise err + return version, count + 1 + + def _get_mongo_client(self, mongo_host, mongo_db, mongo_user, mongo_pwd): + if mongo_user: + mongo_client = MongoClient( + mongo_host, username=mongo_user, password=mongo_pwd, authSource=mongo_db + ) + else: + mongo_client = MongoClient(mongo_host) + # check that the client is correctly connected. See + # https://api.mongodb.com/python/3.7.0/api/pymongo/mongo_client.html + # #pymongo.mongo_client.MongoClient + mongo_client.admin.command("ismaster") + return mongo_client + + def get_url(self): + """ + Get the url for the running workspace instance. + """ + return f"http://localhost:{self.port}" + + def clear_db(self): + """ + Remove all data, but not indexes, from the database. Do not remove any installed types. + """ + db = self._mongo_client[self._db] + for name in db.list_collection_names(): + if not name.startswith("system."): + # don't drop collection since that drops indexes + db.get_collection(name).delete_many({}) + + def destroy(self, delete_temp_files: bool = True, dump_logs_to_stdout: bool = True): + """ + Shut down the server and optionally delete any files generated. + + :param delete_temp_files: if true, delete all the temporary files generated as part of + running the server. + :param dump_logs_to_stdout: Write the contents of the workspace log file to stdout. + This is useful in the context of 3rd party CI services, where the log file is not + necessarily accessible. 
+ """ + if self._proc: + self._proc.terminate() + self._print_ws_logs(dump_logs_to_stdout=dump_logs_to_stdout) + if delete_temp_files and self.temp_dir: + _shutil.rmtree(self.temp_dir) + if self._mongo_client: + self._mongo_client.close() + + # closes logfile + def _print_ws_logs(self, dump_logs_to_stdout=True): + if self._outfile: + self._outfile.close() + if dump_logs_to_stdout: + with open(self._wslog) as f: + for line in f: + print(line) diff --git a/test/tests_for_integration/wsjars b/test/tests_for_integration/wsjars new file mode 100644 index 000000000..c2d134ddc --- /dev/null +++ b/test/tests_for_integration/wsjars @@ -0,0 +1,40 @@ +kbase/workspace/WorkspaceService-0.11.2.jar + +# server code +kbase/common/kbase-common-0.0.24.jar +ini4j/ini4j-0.5.2.jar +jetty/jetty-all-7.0.0.jar +jna/jna-3.4.0.jar +servlet/servlet-api-2.5.jar +syslog4j/syslog4j-0.9.46.jar +joda/joda-time-2.2.jar +annotation/javax.annotation-api-1.3.2.jar + +junit/junit-4.12.jar +hamcrest/hamcrest-core-1.3.jar +kbase/auth/kbase-auth-0.4.4.jar +jackson/jackson-annotations-2.2.3.jar +jackson/jackson-core-2.2.3.jar +jackson/jackson-databind-2.2.3.jar + +# shock client +kbase/shock/shock-client-0.0.16.jar +apache_commons/commons-logging-1.1.1.jar +apache_commons/http/httpclient-4.3.1.jar +apache_commons/http/httpcore-4.3.jar +apache_commons/http/httpmime-4.3.1.jar + +kbase/kidl/kbase-kidl-parser-1409261812-7863aef.jar +apache_commons/commons-codec-1.8.jar +apache_commons/commons-io-2.4.jar +apache_commons/commons-lang3-3.1.jar +mongo/mongo-java-driver-3.8.2.jar +bson4jackson/bson4jackson-2.2.0-2.2.0.jar +slf4j/slf4j-api-1.7.7.jar +logback/logback-core-1.1.2.jar +logback/logback-classic-1.1.2.jar +google/guava-14.0.1.jar +kafka/kafka-clients-2.1.0.jar +kbase/handle/AbstractHandleClient-1.0.0.jar + +# leaving out S3 libs as this is a test instance of the WS and will save all data in GFS \ No newline at end of file From 4aac7dd38ce260daee8ca9865e23ac511fd16037 Mon Sep 17 00:00:00 2001 From: 
MrCreosote Date: Mon, 26 Apr 2021 09:04:18 -0700 Subject: [PATCH 049/109] DATAUP-389 - add run_job integration test (#360) * Add run_job integration test Adds an integration test for the run_job method. Still need to check the kafka messages that get sent. It's not a complete integration test as the catalog and htcondor are too much work to add to the test suite at the moment. * Add notes about why Kafka integration is not tested * run black * Add source_ws_objcets to integration test Required adding types to the WS * run black * Add params and meta to run_job integration test * mix n match catalog and deploy.cfg job reqs in integration test * Add service ver to run job integratino test args * Add parent job id to run_job integration test * Add run_job unhappy path tests * run black * Fix unused import --- test/tests_for_integration/api_to_db_test.py | 339 ++++++++++++++++++- test/utils_shared/test_utils.py | 10 + 2 files changed, 335 insertions(+), 14 deletions(-) diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index cfb7ef243..dc46bb4ce 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -1,7 +1,7 @@ """ Integration tests that cover the entire codebase from API to database. -NOTE 1: These tests are designed to only be runnable after running docker-compose up +NOTE 1: These tests are designed to only be runnable after running docker-compose up. NOTE 2: These tests were set up quickly in order to debug a problem with administration related calls. As such, the auth server was set up to run in test mode locally. If more integrations @@ -9,19 +9,36 @@ If the latter, the test auth and workspace integrations will likely need to be converted to docker containers or exposed to other containers. -NOTE 3: Posting to Slack always fails silently. 
+NOTE 3: Although this is supposed to be an integration test, the catalog service and htcondor +are still mocked out as bringing them up would take a large amount of effort. Someday... + +NOTE 4: Kafka notes + a) Currently nothing listens to the kafka feed. + b) When running the tests, the kafka producer logs that kafka cannot be reached. However, + this error is silent otherwise. + c) I wasn't able to contact the docker kafka service with the kafka-python client either. + d) As such, Kafka is not tested. Once tests are added, at least one test should check that + something sensible happens if a kafka message cannot be sent. + +NOTE 5: EE2 posting to Slack always fails silently in tests. Currently slack calls are not tested. """ +# TODO add more integration tests, these are not necessarily exhaustive + import os import tempfile import time +import htcondor +from bson import ObjectId from configparser import ConfigParser from threading import Thread from pathlib import Path import pymongo -from pytest import fixture +from pytest import fixture, raises from typing import Dict +from unittest.mock import patch, create_autospec, ANY + from tests_for_integration.auth_controller import AuthController from tests_for_integration.workspace_controller import WorkspaceController from utils_shared.test_utils import ( @@ -34,11 +51,17 @@ create_auth_user, create_auth_role, set_custom_roles, + assert_close_to_now, + assert_exception_correct, ) from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from installed_clients.baseclient import ServerError from installed_clients.execution_engine2Client import execution_engine2 as ee2client from installed_clients.WorkspaceClient import Workspace +# in the future remove this +from tests_for_utils.Condor_test import _get_common_sub + KEEP_TEMP_FILES = False TEMP_DIR = Path("test_temp_can_delete") @@ -52,6 +75,22 @@ USER_WRITE_ADMIN = "writeuser" TOKEN_WRITE_ADMIN = None +USER_WS_READ_ADMIN = "wsreadadmin" 
+TOKEN_WS_READ_ADMIN = None +USER_WS_FULL_ADMIN = "wsfulladmin" +TOKEN_WS_FULL_ADMIN = None +WS_READ_ADMIN = "WS_READ_ADMIN" +WS_FULL_ADMIN = "WS_FULL_ADMIN" + +CAT_GET_MODULE_VERSION = "installed_clients.CatalogClient.Catalog.get_module_version" +CAT_LIST_CLIENT_GROUPS = ( + "installed_clients.CatalogClient.Catalog.list_client_group_configs" +) + +# from test/deploy.cfg +MONGO_EE2_DB = "ee2" +MONGO_EE2_JOBS_COL = "ee2_jobs" + @fixture(scope="module") def config() -> Dict[str, str]: @@ -88,23 +127,41 @@ def _create_db_user(mongo_client, db, db_user, password): mongo_client[db].command("createUser", db_user, pwd=password, roles=["readWrite"]) +def _set_up_auth_user(auth_url, user, display, roles=None): + create_auth_user(auth_url, user, display) + if roles: + set_custom_roles(auth_url, user, roles) + return create_auth_login_token(auth_url, user) + + def _set_up_auth_users(auth_url): create_auth_role(auth_url, ADMIN_READ_ROLE, "ee2 admin read doohickey") create_auth_role(auth_url, ADMIN_WRITE_ROLE, "ee2 admin write thinger") + create_auth_role(auth_url, WS_READ_ADMIN, "wsr") + create_auth_role(auth_url, WS_FULL_ADMIN, "wsf") global TOKEN_READ_ADMIN - create_auth_user(auth_url, USER_READ_ADMIN, "display1") - TOKEN_READ_ADMIN = create_auth_login_token(auth_url, USER_READ_ADMIN) - set_custom_roles(auth_url, USER_READ_ADMIN, [ADMIN_READ_ROLE]) + TOKEN_READ_ADMIN = _set_up_auth_user( + auth_url, USER_READ_ADMIN, "display1", [ADMIN_READ_ROLE] + ) global TOKEN_NO_ADMIN - create_auth_user(auth_url, USER_NO_ADMIN, "display2") - TOKEN_NO_ADMIN = create_auth_login_token(auth_url, USER_NO_ADMIN) + TOKEN_NO_ADMIN = _set_up_auth_user(auth_url, USER_NO_ADMIN, "display2") global TOKEN_WRITE_ADMIN - create_auth_user(auth_url, USER_WRITE_ADMIN, "display3") - TOKEN_WRITE_ADMIN = create_auth_login_token(auth_url, USER_WRITE_ADMIN) - set_custom_roles(auth_url, USER_WRITE_ADMIN, [ADMIN_WRITE_ROLE]) + TOKEN_WRITE_ADMIN = _set_up_auth_user( + auth_url, USER_WRITE_ADMIN, "display3", 
[ADMIN_WRITE_ROLE] + ) + + global TOKEN_WS_READ_ADMIN + TOKEN_WS_READ_ADMIN = _set_up_auth_user( + auth_url, USER_WS_READ_ADMIN, "wsra", [WS_READ_ADMIN] + ) + + global TOKEN_WS_FULL_ADMIN + TOKEN_WS_FULL_ADMIN = _set_up_auth_user( + auth_url, USER_WS_FULL_ADMIN, "wsrf", [WS_FULL_ADMIN] + ) @fixture(scope="module") @@ -143,6 +200,27 @@ def auth_url(config, mongo_client): _clean_db(mongo_client, auth_db, auth_mongo_user) +def _add_ws_types(ws_controller): + wsc = Workspace(f"http://localhost:{ws_controller.port}", token=TOKEN_WS_FULL_ADMIN) + wsc.request_module_ownership("Trivial") + wsc.administer({"command": "approveModRequest", "module": "Trivial"}) + wsc.register_typespec( + { + "spec": """ + module Trivial { + /* @optional dontusethisfieldorifyoudomakesureitsastring */ + typedef structure { + string dontusethisfieldorifyoudomakesureitsastring; + } Object; + }; + """, + "dryrun": 0, + "new_types": ["Object"], + } + ) + wsc.release_module("Trivial") + + @fixture(scope="module") def ws_controller(config, mongo_client, auth_url): ws_db = "api_to_db_ws_test" @@ -170,6 +248,8 @@ def ws_controller(config, mongo_client, auth_url): f"Started KBase Workspace {ws.version} on port {ws.port} " + f"in dir {ws.temp_dir} in {ws.startup_count}s" ) + _add_ws_types(ws) + yield ws print(f"shutting down workspace, KEEP_TEMP_FILES={KEEP_TEMP_FILES}") @@ -287,7 +367,238 @@ def test_get_admin_permission_success(ee2_port): assert ee2cli_write.get_admin_permission() == {"permission": "w"} -def test_temporary_check_ws(ee2_port, ws_controller): +######## run_job tests ######## + + +def _get_htc_mocks(): + sub = create_autospec(htcondor.Submit, spec_set=True, instance=True) + schedd = create_autospec(htcondor.Schedd, spec_set=True, instance=True) + txn = create_autospec(htcondor.Transaction, spec_set=True, instance=True) + return sub, schedd, txn + + +def _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn): + sub_init.return_value = sub + schedd_init.return_value = schedd + # mock 
context manager ops + schedd.transaction.return_value = txn + txn.__enter__.return_value = txn + return sub, schedd, txn + + +def _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub): + sub_init.assert_called_once_with(expected_sub) + schedd_init.assert_called_once_with() + schedd.transaction.assert_called_once_with() + sub.queue.assert_called_once_with(txn, 1) + + +def test_run_job(ee2_port, ws_controller, mongo_client): + """ + A test of the run_job method. + """ + # Set up workspace and objects + wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) + wsc.create_workspace({"workspace": "foo"}) + wsc.save_objects( + { + "id": 1, + "objects": [ + {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, + {"name": "two", "type": "Trivial.Object-1.0", "data": {}}, + ], + } + ) + + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? 
+ with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.return_value = 123 + list_cgroups.return_value = [ + {"client_groups": ['{"request_cpus":8,"request_memory":5}']} + ] + get_mod_ver.return_value = {"git_commit_hash": "somehash"} + + # run the method + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) + job_id = ee2.run_job( + { + "method": "mod.meth", + "app_id": "mod/app", + "wsid": 1, + "source_ws_objects": ["1/1/1", "1/2/1"], + "params": [{"foo": "bar"}, 42], + "service_ver": "beta", + "parent_job_id": "totallywrongid", + "meta": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + "status": "totally wasted bro", + "thiskey": "getssilentlydropped", + }, + } + ) + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_called_once_with( + ANY, {"module_name": "mod", "version": "beta"} + ) + list_cgroups.assert_called_once_with( + ANY, {"module_name": "mod", "function_name": "meth"} + ) + + expected_sub = _get_common_sub(job_id) + expected_sub.update( + { + "JobBatchName": job_id, + "arguments": f"{job_id} https://ci.kbase.us/services/ee2", + "+KB_PARENT_JOB_ID": '"totallywrongid"', + "+KB_MODULE_NAME": '"mod"', + "+KB_FUNCTION_NAME": '"meth"', + "+KB_APP_ID": '"mod/app"', + "+KB_APP_MODULE_NAME": '"mod"', + "+KB_WSID": '"1"', + "+KB_SOURCE_WS_OBJECTS": '"1/1/1,1/2/1"', + "request_cpus": "8", + "request_memory": "5MB", + "request_disk": "30GB", + "requirements": 'regexp("njs",CLIENTGROUP)', + "+KB_CLIENTGROUP": '"njs"', + "Concurrency_Limits": f"{USER_NO_ADMIN}", 
+ "+AccountingGroup": f'"{USER_NO_ADMIN}"', + "environment": ( + '"DOCKER_JOB_TIMEOUT=604805 KB_ADMIN_AUTH_TOKEN=test_auth_token ' + + f"KB_AUTH_TOKEN={TOKEN_NO_ADMIN} CLIENTGROUP=njs JOB_ID={job_id} " + + "CONDOR_ID=$(Cluster).$(Process) PYTHON_EXECUTABLE=/miniconda/bin/python " + + 'DEBUG_MODE=False PARENT_JOB_ID=totallywrongid "' + ), + "leavejobinqueue": "true", + "initial_dir": "../scripts/", + "+Owner": '"condor_pool"', + "executable": "../scripts//../scripts/execute_runner.sh", + "transfer_input_files": "../scripts/JobRunner.tgz", + } + ) + + _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub) + + # check the mongo record is correct + job = mongo_client[MONGO_EE2_DB][MONGO_EE2_JOBS_COL].find_one( + {"_id": ObjectId(job_id)} + ) + assert_close_to_now(job.pop("updated")) + assert_close_to_now(job.pop("queued")) + expected_job = { + "_id": ObjectId(job_id), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": 1, + "status": "queued", + "job_input": { + "wsid": 1, + "method": "mod.meth", + "params": [{"foo": "bar"}, 42], + "service_ver": "somehash", + "app_id": "mod/app", + "source_ws_objects": ["1/1/1", "1/2/1"], + "parent_job_id": "totallywrongid", + "requirements": { + "clientgroup": "njs", + "cpu": 8, + "memory": 5, + "disk": 30, + }, + "narrative_cell_info": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + "status": "totally wasted bro", + }, + }, + "child_jobs": [], + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + } + assert job == expected_job + + +def test_run_job_fail_no_workspace_access(ee2_port): + params = {"method": "mod.meth", "app_id": "mod/app", "wsid": 1} + # this error could probably use some cleanup + err = ( + "('An error occurred while fetching user permissions from the Workspace', " + + "ServerError('No workspace with id 1 exists'))" + ) + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_bad_method(ee2_port): + 
params = {"method": "mod.meth.moke", "app_id": "mod/app"} + # TODO the Server.py file is quoting strings for some reason it seems + # see https://github.com/kbase/sample_service/blob/master/lib/SampleService/SampleServiceServer.py#L119-L127 + err = "\"Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name\"" + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_bad_app(ee2_port): + params = {"method": "mod.meth", "app_id": "mod.app"} + # TODO the Server.py file is quoting strings for some reason it seems + # see https://github.com/kbase/sample_service/blob/master/lib/SampleService/SampleServiceServer.py#L119-L127 + err = "\"Application ID 'mod.app' contains a '.'\"" + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_bad_upa(ee2_port): + params = { + "method": "mod.meth", + "app_id": "mod/app", + "source_ws_objects": ["ws/obj/1"], + } + # TODO the Server.py file is quoting strings for some reason it seems + # see https://github.com/kbase/sample_service/blob/master/lib/SampleService/SampleServiceServer.py#L119-L127 + err = "\"source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address\"" + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_no_such_object(ee2_port, ws_controller): + # Set up workspace and objects wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) - ws = wsc.create_workspace({"workspace": "foo"}) - assert ws[1] == "foo" + wsc.create_workspace({"workspace": "foo"}) + wsc.save_objects( + { + "id": 1, + "objects": [ + {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, + ], + } + ) + params = {"method": "mod.meth", "app_id": "mod/app", "source_ws_objects": ["1/2/1"]} + # TODO the Server.py file is quoting strings for some reason it seems + # see https://github.com/kbase/sample_service/blob/master/lib/SampleService/SampleServiceServer.py#L119-L127 + err = "'Some workspace object is inaccessible'" + 
_run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def _run_job_fail(ee2_port, token, params, expected, throw_exception=False): + client = ee2client(f"http://localhost:{ee2_port}", token=token) + if throw_exception: + client.run_job(params) + else: + with raises(ServerError) as got: + client.run_job(params) + assert_exception_correct(got.value, ServerError("name", 1, expected)) diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index e37bb7063..47d5d75e6 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -3,6 +3,7 @@ import uuid import logging import socket +import time from configparser import ConfigParser from contextlib import closing from datetime import datetime @@ -393,6 +394,15 @@ def assert_exception_correct(got: Exception, expected: Exception): assert type(got) == type(expected) +def assert_close_to_now(time_): + """ + Checks that a timestamp in seconds since the epoch is within a second of the current time. + """ + now_ms = time.time() + assert now_ms + 1 > time_ + assert now_ms - 1 < time_ + + def find_free_port() -> int: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: s.bind(("", 0)) From 859a136a8f6ec1a9bdc0c2e0a92cbc4727cbc962 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 26 Apr 2021 15:16:43 -0700 Subject: [PATCH 050/109] DATAUP-389 - Update RunJobParams spec to match code (#361) * Update RunJobParams spec to match code remote_url and rpc_context are unused. meta only supports a small number of keys meta.status is unused and so was removed from the db model. 
* Make updated black pass Became more stringent on a few things * Fix UPA definition, re-add typing for mongo connection --- execution_engine2.html | 2 +- execution_engine2.spec | 72 +- lib/execution_engine2/db/MongoUtil.py | 9 +- lib/execution_engine2/db/models/models.py | 1 - .../execution_engine2Impl.py | 832 ++++++++---------- lib/execution_engine2/sdk/EE2Logs.py | 2 +- lib/execution_engine2/sdk/EE2Runjob.py | 3 +- lib/execution_engine2/sdk/SDKMethodRunner.py | 38 +- test/test_clients/authclient.py | 2 +- test/tests_for_db/ee2_MongoUtil_test.py | 2 +- test/tests_for_integration/api_to_db_test.py | 2 - ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 1 - 12 files changed, 413 insertions(+), 553 deletions(-) diff --git a/execution_engine2.html b/execution_engine2.html index aa8b431be..a5b6f050e 100644 --- a/execution_engine2.html +++ b/execution_engine2.html @@ -1 +1 @@ -execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y or X/Y/Z, where
*X is the workspace name or id,
*Y is the object name or id,
*Z is the version, which is optional.
*/
typedefstringwsref;

/*
*Describes a single call; RpcContext holds a list of these as its call_stack.
*time - the time the call was started;
*NOTE(review): time is documented here but is not declared in the
*structure below - confirm whether the field was dropped on purpose.
*method - service defined in standard JSON RPC way, typically it's
*module name from spec-file followed by '.' and name of funcdef
*from spec-file corresponding to running method (e.g.
*'KBaseTrees.construct_species_tree' from trees service);
*job_id - job id if method is asynchronous (optional field).
*/
typedefstructure{
stringmethod;
job_idjob_id;
}
MethodCall;

/*
*call_stack - upstream calls details including nested service calls and
*parent jobs where calls are listed in order from outer to inner.
*run_id - declared below but not described in the original comment; TODO document.
*/
typedefstructure{
list<MethodCall>call_stack;
stringrun_id;
}
RpcContext;

/*
*method - service defined in standard JSON RPC way, typically it's
*module name from spec-file followed by '.' and name of funcdef
*from spec-file corresponding to running method (e.g.
*'KBaseTrees.construct_species_tree' from trees service);
*params - the parameters of the method that performed this call;
*
*Optional parameters:
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*rpc_context - context of current method call including nested call
*history
*remote_url - run remote service call instead of local command line
*execution.
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance.
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*mapping<string, string> meta - user defined metadata to associate with
*the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 id of the parent of a batch job. Batch jobs will add
*this id to the EE2 database under the field "parent_job_id"
*/
typedefstructure{
stringmethod;
list<UnspecifiedObject>params;
stringservice_ver;
RpcContextrpc_context;
stringremote_url;
list<wsref>source_ws_objects;
stringapp_id;
mapping<string,string>meta;
intwsid;
stringparent_job_id;
}
RunJobParams;

/*
*Start a new job (a long-running method of a service registered in the ServiceRegistry).
*Such a job runs the Docker image for this service in script mode.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

typedefstructure{
intwsid;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in MB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*account_group: str = None # Someone else's account
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*
*NOTE(review): the names above follow EE2Constants; the structure below declares
*request_cpu, request_memory_mb and request_disk_mb instead of request_cpus,
*request_memory and request_disk - confirm which spelling the service expects.
*/
typedefstructure{
intrequest_cpu;
intrequest_memory_mb;
intrequest_disk_mb;
intjob_priority;
stringaccount_group;
list<string>requirements_list;
stringclient_group;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*lines - the log lines returned by get_job_logs.
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*count - declared below but not described in the original comment; presumably the
*number of entries in lines - TODO confirm.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to a record that would normally not be allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*as_admin - declared below but not described in the original comment; TODO document
*
*NOTE(review): error is documented above but no error field is declared in the
*structure below - confirm whether the field should be added or the line removed.
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set true, job will set to running status skipping estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
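*created - int - timestamp since epoch in milliseconds when the job was created
*queued - int - declared in the structure but previously undocumented; presumably the timestamp since epoch in milliseconds when the job was queued - TODO confirm
*estimating - int - declared in the structure but previously undocumented; presumably the timestamp since epoch in milliseconds when the job began estimating - TODO confirm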
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
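*error - object - the JSON-RPC error package that accompanies the error code and message
*(NOTE(review): no error field is declared in the JobState structure - confirm whether it is returned)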
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_jobstate - state of parent job
*child_jobstates - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated"; terminated_code defaults to 0 (terminated by user) when it is not supplied.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*
*
*
*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdefcheck_jobs_date_range_for_user(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

funcdefcheck_jobs_date_range_for_all(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

typedefstructure{
UnspecifiedObjectheld_job;
}
HeldJob;

/*
*Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
*/
funcdefhandle_held_job(stringcluster_id)returns(HeldJob)authenticationrequired;

/*
*Check if current user has ee2 admin rights.
*/
funcdefis_admin()returns(boolean)authenticationrequired;

/*
*str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
*NOTE(review): the letter codes end in 'x' while the spelled-out values end in
*'none' - confirm which value is returned for users without admin rights.
*/
typedefstructure{
stringpermission;
}
AdminRolesResults;

/*
*Check if current user has ee2 admin rights.
*If so, return the type of rights and their roles
*/
funcdefget_admin_permission()returns(AdminRolesResults)authenticationrequired;

/*
*Get a list of clientgroups manually extracted from the config file
*/
funcdefget_client_groups()returns(list<string>client_groups)authenticationnone;
};
\ No newline at end of file +execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*params - the parameters to pass to the method.
*
*Optional parameters:
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*run_job_batch ignores this parameter when starting a job batch.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
}
RunJobParams;

/*
*Start a new job (a long-running method of a service registered in the ServiceRegistry).
*Such a job runs the Docker image for this service in script mode.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

typedefstructure{
intwsid;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*
*NOTE(review): the first entry is listed as request_cpus, but the structure below
*declares request_cpu - confirm which spelling the service expects.
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*lines - the log lines returned by get_job_logs.
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*count - declared below but not described in the original comment; presumably the
*number of entries in lines - TODO confirm.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to a record that would normally not be allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*as_admin - declared below but not described in the original comment; TODO document
*
*NOTE(review): error is documented above but no error field is declared in the
*structure below - confirm whether the field should be added or the line removed.
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set true, job will set to running status skipping estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
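*created - int - timestamp since epoch in milliseconds when the job was created
*queued - int - declared in the structure but previously undocumented; presumably the timestamp since epoch in milliseconds when the job was queued - TODO confirm
*estimating - int - declared in the structure but previously undocumented; presumably the timestamp since epoch in milliseconds when the job began estimating - TODO confirm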
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
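*error - object - the JSON-RPC error package that accompanies the error code and message
*(NOTE(review): no error field is declared in the JobState structure - confirm whether it is returned)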
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_jobstate - state of parent job
*child_jobstates - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated"; terminated_code defaults to 0 (terminated by user) when it is not supplied.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
 * Projection Fields
 *     user = StringField(required=True)
 *     authstrat = StringField(
 *         required=True, default="kbaseworkspace", validation=valid_authstrat
 *     )
 *     wsid = IntField(required=False)
 *     status = StringField(required=True, validation=valid_status)
 *     updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
 *     estimating = DateTimeField(default=None)  # Time when job began estimating
 *     running = DateTimeField(default=None)  # Time when job started
 *     # Time when job finished, errored out, or was terminated by the user/admin
 *     finished = DateTimeField(default=None)
 *     errormsg = StringField()
 *     msg = StringField()
 *     error = DynamicField()
 *
 *     terminated_code = IntField(validation=valid_termination_code)
 *     error_code = IntField(validation=valid_errorcode)
 *     scheduler_type = StringField()
 *     scheduler_id = StringField()
 *     scheduler_estimator_id = StringField()
 *     job_input = EmbeddedDocumentField(JobInput, required=True)
 *     job_output = DynamicField()
 */

/*
 * Results of check_jobs_date_range methods.
 *
 * jobs - the jobs matching the query, up to `limit` jobs.
 * count - the number of jobs returned.
 * query_count - the number of jobs that matched the filters.
 * filter - DEPRECATED - this field may change in the future. The filters that were
 *     applied to the jobs.
 * skip - the number of jobs that were skipped prior to beginning to return jobs.
 * projection - the list of fields included in the returned job. By default all fields.
 * limit - the maximum number of jobs returned.
 * sort_order - the order in which the results were sorted by the job ID - + for
 *     ascending, - for descending.
 *
 * TODO: DOCUMENT THE RETURN OF STATS mapping
 */
typedef structure {
    list<JobState> jobs;
    int count;
    int query_count;
    mapping<string, string> filter;
    int skip;
    list<string> projection;
    int limit;
    string sort_order;
} CheckJobsDateRangeResults;

/*
 * Check job for all jobs in a given date/time range for all users (Admin function)
 *
 * Notes on start_time and end_time:
 *     These fields are designated as floats but floats, ints, and strings are all
 *     accepted. Times are determined as follows:
 *     - if the field is a float or a string that contains a float and only a float,
 *         the field value is treated as seconds since the epoch.
 *     - if the field is an int or a string that contains an int and only an int,
 *         the field value is treated as milliseconds since the epoch.
 *     - if the field is a string not matching the criteria above, it is treated as
 *         a date and time. Nearly any unambiguous format can be parsed.
 *
 * float start_time - Filter based on job creation timestamp since epoch
 * float end_time - Filter based on job creation timestamp since epoch
 * list<string> projection - A list of fields to include in the projection, default ALL
 *     See "Projection Fields" above
 * list<string> filter - DEPRECATED: this field may change or be removed in the future.
 *     A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
 *     terminated_code = 1
 * int limit - The maximum number of records to return
 * string user - The user whose job records will be returned. Optional. Default is the
 *     current user.
 * int offset - the number of jobs to skip before returning records.
 * boolean ascending - true to sort by job ID ascending, false descending.
 * boolean as_admin - true to run the query as an admin; user must have admin EE2
 *     permissions. Required if setting `user` to something other than your own.
 *     TODO: this seems to have no effect
 *
 * @optional projection
 * @optional filter
 * @optional limit
 * @optional user
 * @optional offset
 * @optional ascending
 */
typedef structure {
    float start_time;
    float end_time;
    list<string> projection;
    list<string> filter;
    int limit;
    string user;
    int offset;
    boolean ascending;
    boolean as_admin;
} CheckJobsDateRangeParams;

/*
 * Check jobs in a given date/time range for a single user. The `user` parameter is
 * optional and defaults to the current user; see CheckJobsDateRangeParams for the
 * remaining filtering, paging and sorting parameters.
 */
funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params) returns (CheckJobsDateRangeResults) authentication required;

/*
 * Check jobs in a given date/time range for all users (Admin function).
 * See CheckJobsDateRangeParams for the filtering, paging and sorting parameters.
 */
funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params) returns (CheckJobsDateRangeResults) authentication required;

/*
 * The result of handle_held_job.
 *
 * held_job - an unspecified object; its structure is not defined in this spec.
 */
typedef structure {
    UnspecifiedObject held_job;
} HeldJob;

/*
 * Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
 */
funcdef handle_held_job(string cluster_id) returns (HeldJob) authentication required;

/*
 * Check if current user has ee2 admin rights.
 */
funcdef is_admin() returns (boolean) authentication required;

/*
 * str permission - One of 'r|w|x' (i.e. 'read' | 'write' | 'none', respectively)
 */
typedef structure {
    string permission;
} AdminRolesResults;

/*
 * Check if current user has ee2 admin rights.
 * If so, return the type of rights and their roles
 */
funcdef get_admin_permission() returns (AdminRolesResults) authentication required;

/*
 * Get a list of clientgroups manually extracted from the config file.
 * This method does not require authentication.
 */
funcdef get_client_groups() returns (list<string> client_groups) authentication none;
};
\ No newline at end of file diff --git a/execution_engine2.spec b/execution_engine2.spec index 47fdf681f..fe94b7c47 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -66,73 +66,59 @@ /*================================================================================*/ /* Running long running methods through Docker images of services from Registry */ /*================================================================================*/ - /* A workspace object reference of the form X/Y or X/Y/Z, where - X is the workspace name or id, - Y is the object name or id, - Z is the version, which is optional. + /* A workspace object reference of the form X/Y/Z, where + X is the workspace id, + Y is the object id, + Z is the version. */ typedef string wsref; - /* - time - the time the call was started; - method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); - job_id - job id if method is asynchronous (optional field). - */ - typedef structure { - timestamp time; - string method; - job_id job_id; - } MethodCall; - - /* - call_stack - upstream calls details including nested service calls and - parent jobs where calls are listed in order from outer to inner. + /* Narrative metadata for a job. All fields are optional. + run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID. + token_id - the ID of the token used to run the method. + tag - the release tag, e.g. dev/beta/release. + cell_id - the ID of the narrative cell from which the job was run. */ typedef structure { - list call_stack; string run_id; - } RpcContext; + string token_id; + string tag; + string cell_id; + } Meta; /* - method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. 
- 'KBaseTrees.construct_species_tree' from trees service); - params - the parameters of the method that performed this call; + method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' + app_id - the id of the Narrative application (UI) running this job (e.g. + repo/name) + params - the parameters to pass to the method. Optional parameters: service_ver - specific version of deployed service, last version is used if this parameter is not defined - rpc_context - context of current method call including nested call - history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that will serve as a source of data when running the SDK method. These references will - be added to the autogenerated provenance. - app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) - mapping meta - user defined metadata to associate with - the job. + be added to the autogenerated provenance. Must be in UPA format (e.g. + 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an optional workspace id to associate with the job. This is passed to the workspace service, which will share the job based on the permissions of the workspace rather than owner of the job - parent_job_id - EE2 id of the parent of a batch job. Batch jobs will add - this id to the EE2 database under the field "parent_job_id" + parent_job_id - EE2 job id for the parent of the current job. + For run_job and run_job_concierge, this value can be specified to denote + the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and the parent job + record is not altered. + run_job_batch ignores this parameter when starting a job batch. 
*/ typedef structure { string method; + string app_id; list params; string service_ver; - RpcContext rpc_context; - string remote_url; list source_ws_objects; - string app_id; - mapping meta; + Meta meta; int wsid; string parent_job_id; } RunJobParams; diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py index 61fe0137b..d00c38d44 100644 --- a/lib/execution_engine2/db/MongoUtil.py +++ b/lib/execution_engine2/db/MongoUtil.py @@ -3,6 +3,7 @@ import time import traceback from contextlib import contextmanager +from typing import Dict from bson.objectid import ObjectId from mongoengine import connect, connection @@ -17,7 +18,7 @@ class MongoUtil: - def __init__(self, config: dict): + def __init__(self, config: Dict): self.config = config self.mongo_host = config["mongo-host"] self.mongo_port = int(config["mongo-port"]) @@ -41,7 +42,7 @@ def _get_pymongo_client(self): authMechanism=self.mongo_authmechanism, ) - def _get_mongoengine_client(self): + def _get_mongoengine_client(self) -> connection: return connect( db=self.mongo_database, host=self.mongo_host, @@ -50,7 +51,7 @@ def _get_mongoengine_client(self): password=self.mongo_pass, authentication_source=self.mongo_database, authentication_mechanism=self.mongo_authmechanism, - ) # type: connection + ) def _start_local_service(self): try: @@ -442,7 +443,7 @@ def insert_one(self, doc): return rec.inserted_id def _push_job_logs(self, log_lines: JobLog, job_id: str, record_count: int): - """ append a list of job logs, and update the record count """ + """append a list of job logs, and update the record count""" update_filter = {"_id": ObjectId(job_id)} push_op = {"lines": {"$each": log_lines}} diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index 2c2326138..70a5af1ad 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -109,7 +109,6 @@ class Meta(EmbeddedDocument): token_id = 
StringField() tag = StringField() cell_id = StringField() - status = StringField() def __repr__(self): return self.to_json() diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 49a5b9e7f..19c9a3c9e 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -30,7 +30,7 @@ class execution_engine2: ######################################### noqa VERSION = "0.0.5" GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" - GIT_COMMIT_HASH = "330194d1f1c09c7dc9598fb7d6afedf50feb253d" + GIT_COMMIT_HASH = "462a6110e5837e67e574e6db169a79a4d67fa8b4" #BEGIN_CLASS_HEADER MONGO_COLLECTION = "jobs" @@ -180,50 +180,38 @@ def run_job(self, ctx, params): """ Start a new job (long running method of service registered in ServiceRegistery). Such job runs Docker image for this service in script mode. - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + :param params: instance of type "RunJobParams" (method - the SDK + method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. 
These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String :returns: instance of type "job_id" (A job id.) """ @@ -248,50 +236,38 @@ def run_job(self, ctx, params): def run_job_batch(self, ctx, params, batch_params): """ - :param params: instance of list of type "RunJobParams" (method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + :param params: instance of list of type "RunJobParams" (method - the + SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String :param batch_params: instance of type "BatchParams" -> structure: parameter "wsid" of Long @@ -351,64 +327,62 @@ def abandon_children(self, ctx, params): def run_job_concierge(self, ctx, params, concierge_params): """ - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + :param params: instance of type "RunJobParams" (method - the SDK + method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String :param concierge_params: instance of type "ConciergeParams" (EE2Constants Concierge Params are request_cpus: int - request_memory: int in MB request_disk: int in MB job_priority: + request_memory: int in MB request_disk: int in GB job_priority: int = None range from -20 to +20, with higher values meaning - better priority. account_group: str = None # Someone elses account - requirements_list: list = None ['machine=worker102','color=red'] - client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can - leave default or specify a clientgroup) -> structure: parameter - "request_cpu" of Long, parameter "request_memory_mb" of Long, - parameter "request_disk_mb" of Long, parameter "job_priority" of - Long, parameter "account_group" of String, parameter - "requirements_list" of list of String, parameter "client_group" of - String + better priority. Note: job_priority is currently not implemented. + account_group: str = None # Someone elses account + ignore_concurrency_limits: ignore any limits on simultaneous job + runs. Default 1 (True). 
requirements_list: list = None + ['machine=worker102','color=red'] client_group: Optional[str] = + CONCIERGE_CLIENTGROUP # You can leave default or specify a + clientgroup client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. debug_mode: Whether to run the job in debug mode. Default + 0 (False).) -> structure: parameter "request_cpu" of Long, + parameter "request_memory" of Long, parameter "request_disk" of + Long, parameter "job_priority" of Long, parameter "account_group" + of String, parameter "ignore_concurrency_limits" of type "boolean" + (@range [0,1]), parameter "requirements_list" of list of String, + parameter "client_group" of String, parameter "client_group_regex" + of type "boolean" (@range [0,1]), parameter "debug_mode" of type + "boolean" (@range [0,1]) :returns: instance of type "job_id" (A job id.) """ # ctx is the context object @@ -434,50 +408,38 @@ def get_job_params(self, ctx, params): necessary for job execution @optional as_admin) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "RunJobParams" (method - service defined - in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + :returns: instance of type "RunJobParams" (method - the SDK method to + run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String """ # ctx is the context object @@ -722,49 +684,37 @@ def check_job(self, ctx, params): parameter "job_id" of type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id + of the Narrative application (UI) running this job (e.g. + repo/name) params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -842,49 +792,37 @@ def check_job_batch(self, ctx, params): parameter "job_id" of type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id + of the Narrative application (UI) running this job (e.g. + repo/name) params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -927,49 +865,37 @@ def check_job_batch(self, ctx, params): parameter "job_id" of type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id + of the Narrative application (UI) running this job (e.g. + repo/name) params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -1045,49 +971,37 @@ def check_jobs(self, ctx, params): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -1166,49 +1080,37 @@ def check_workspace_jobs(self, ctx, params): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -1431,49 +1333,37 @@ def check_jobs_date_range_for_user(self, ctx, params): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -1611,49 +1501,37 @@ def check_jobs_date_range_for_all(self, ctx, params): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y/Z, where X is the + workspace id, Y is the object id, Z is the version.), parameter + "meta" of type "Meta" (Narrative metadata for a job. All fields + are optional. run_id - the Narrative-assigned ID of the job run. + 1:1 with a job ID. token_id - the ID of the token used to run the + method. tag - the release tag, e.g. dev/beta/release. cell_id - + the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter diff --git a/lib/execution_engine2/sdk/EE2Logs.py b/lib/execution_engine2/sdk/EE2Logs.py index e9d19de9e..d96acb0b2 100644 --- a/lib/execution_engine2/sdk/EE2Logs.py +++ b/lib/execution_engine2/sdk/EE2Logs.py @@ -69,7 +69,7 @@ def _add_first_logs(self, log_lines, job_id): return AddLogResult(success=True, stored_line_count=log.stored_line_count) def _add_subsequent_logs(self, job_log, log_lines): - """ Add logs to an existing log entry """ + """Add logs to an existing log entry""" formatted_logs = self._format_job_logs( record_position=job_log["stored_line_count"] - 1, log_lines=log_lines ) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index fe15f0e69..35e81b950 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -96,7 +96,7 @@ def _init_job_rec( meta = params.get("meta") if meta: - for meta_attr in ["run_id", "token_id", "tag", "cell_id", "status"]: + for meta_attr in ["run_id", 
"token_id", "tag", "cell_id"]: inputs.narrative_cell_info[meta_attr] = meta.get(meta_attr) jr = JobRequirements() @@ -279,7 +279,6 @@ def _create_parent_job(self, wsid, meta): job_input.narrative_cell_info.token_id = meta.get("token_id") job_input.narrative_cell_info.tag = meta.get("tag") job_input.narrative_cell_info.cell_id = meta.get("cell_id") - job_input.narrative_cell_info.status = meta.get("status") j = Job( job_input=job_input, diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 9cf9f5437..41d45e5fb 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -288,7 +288,7 @@ def get_jobs(self, job_filter, job_projection, sort_order, offset, limit): # ENDPOINTS: Admin Related Endpoints def check_is_admin(self): - """ Authorization Required Read """ + """Authorization Required Read""" # Check whether if at minimum, a read only admin" try: return self.check_as_admin(requested_perm=JobPermissions.READ) @@ -300,66 +300,66 @@ def get_admin_permission(self): # ENDPOINTS: Running jobs and getting job input params def run_job(self, params, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_runjob().run(params=params, as_admin=as_admin) def run_job_batch(self, params, batch_params, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_runjob().run_batch( params=params, batch_params=batch_params, as_admin=as_admin ) def run_job_concierge(self, params, concierge_params): - """ Authorization Required : Be the kbaseconcierge user """ + """Authorization Required : Be the kbaseconcierge user""" return self.get_runjob().run(params=params, concierge_params=concierge_params) def get_job_params(self, job_id, as_admin=False): - """ Authorization Required: Read """ + """Authorization Required: Read""" return 
self.get_runjob().get_job_params(job_id=job_id, as_admin=as_admin) # ENDPOINTS: Adding and retrieving Logs def add_job_logs(self, job_id, log_lines, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_job_logs().add_job_logs( job_id=job_id, log_lines=log_lines, as_admin=as_admin ) def view_job_logs(self, job_id, skip_lines=None, as_admin=False, limit=None): - """ Authorization Required Read """ + """Authorization Required Read""" return self.get_job_logs().view_job_logs( job_id=job_id, skip_lines=skip_lines, as_admin=as_admin, limit=limit ) # Endpoints: Changing a job's status def start_job(self, job_id, skip_estimation=True, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_jobs_status().start_job( job_id=job_id, skip_estimation=skip_estimation, as_admin=as_admin ) # Endpoints: Changing a job's status def abandon_children(self, parent_job_id, child_job_ids, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_jobs_status().abandon_children( parent_job_id=parent_job_id, child_job_ids=child_job_ids, as_admin=as_admin ) def update_job_status(self, job_id, status, as_admin=False): # TODO: Make this an ADMIN ONLY function? Why would anyone need to call this who is not an admin? 
- """ Authorization Required: Read/Write """ + """Authorization Required: Read/Write""" return self.get_jobs_status().force_update_job_status( job_id=job_id, status=status, as_admin=as_admin ) def cancel_job(self, job_id, terminated_code=None, as_admin=False): # TODO: Cancel Child Jobs as well - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_jobs_status().cancel_job( job_id=job_id, terminated_code=terminated_code, as_admin=as_admin ) def handle_held_job(self, cluster_id): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" if self.check_as_admin(requested_perm=JobPermissions.WRITE): return self.get_jobs_status().handle_held_job( cluster_id=cluster_id, as_admin=True @@ -374,7 +374,7 @@ def finish_job( job_output=None, as_admin=False, ): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_jobs_status().finish_job( job_id=job_id, @@ -388,7 +388,7 @@ def finish_job( # Endpoints: Checking a job's status def check_job(self, job_id, exclude_fields=None, as_admin=False): - """ Authorization Required: Read """ + """Authorization Required: Read""" check_permission = True if as_admin is True: @@ -402,13 +402,13 @@ def check_job(self, job_id, exclude_fields=None, as_admin=False): ) def check_job_canceled(self, job_id, as_admin=False): - """ Authorization Required: Read """ + """Authorization Required: Read""" return self.get_jobs_status().check_job_canceled( job_id=job_id, as_admin=as_admin ) def get_job_status_field(self, job_id, as_admin=False): - """ Authorization Required: Read """ + """Authorization Required: Read""" return self.get_jobs_status().get_job_status(job_id=job_id, as_admin=as_admin) def check_job_batch( @@ -418,7 +418,7 @@ def check_job_batch( exclude_fields=None, as_admin=False, ): - """ Authorization Required: Read """ + """Authorization Required: Read""" if as_admin is True: 
self.check_as_admin(requested_perm=JobPermissions.READ) @@ -454,7 +454,7 @@ def check_jobs( return_list=1, as_admin=False, ): - """ Authorization Required: Read """ + """Authorization Required: Read""" if as_admin: self.check_as_admin(requested_perm=JobPermissions.READ) check_permission = False @@ -478,7 +478,7 @@ def check_jobs_date_range_for_user( ascending=None, as_admin=False, ): - """ Authorization Required: Read """ + """Authorization Required: Read""" if as_admin: self.check_as_admin(requested_perm=JobPermissions.READ) diff --git a/test/test_clients/authclient.py b/test/test_clients/authclient.py index 2087f463f..950b5760f 100644 --- a/test/test_clients/authclient.py +++ b/test/test_clients/authclient.py @@ -12,7 +12,7 @@ class TokenCache(object): - """ A basic cache for tokens. """ + """A basic cache for tokens.""" _MAX_TIME_SEC = 5 * 60 # 5 min diff --git a/test/tests_for_db/ee2_MongoUtil_test.py b/test/tests_for_db/ee2_MongoUtil_test.py index eb558df01..2ddbfbf23 100644 --- a/test/tests_for_db/ee2_MongoUtil_test.py +++ b/test/tests_for_db/ee2_MongoUtil_test.py @@ -58,7 +58,7 @@ def test_init_ok(self): self.assertTrue(set(class_attri) <= set(mongo_util.__dict__.keys())) def test_get_by_cluster(self): - """ Get a job by its condor scheduler_id""" + """Get a job by its condor scheduler_id""" mongo_util = self.getMongoUtil() with mongo_util.mongo_engine_connection(): job = get_example_job() diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index dc46bb4ce..f90a85719 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -445,7 +445,6 @@ def test_run_job(ee2_port, ws_controller, mongo_client): "token_id": "tid", "tag": "yourit", "cell_id": "cid", - "status": "totally wasted bro", "thiskey": "getssilentlydropped", }, } @@ -526,7 +525,6 @@ def test_run_job(ee2_port, ws_controller, mongo_client): "token_id": "tid", "tag": "yourit", "cell_id": "cid", - 
"status": "totally wasted bro", }, }, "child_jobs": [], diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index db9ffaafd..1d9823246 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -138,7 +138,6 @@ def test_init_job_rec(self): narrative_cell_info = job_input.narrative_cell_info self.assertEqual(narrative_cell_info.tag, "dev") self.assertEqual(narrative_cell_info.token_id, "12345") - self.assertFalse(narrative_cell_info.status) self.assertFalse(job.job_output) From edc9307d13483637ef8de590304edc8dc342a89d Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 27 Apr 2021 08:37:43 -0700 Subject: [PATCH 051/109] DATAUP-389 - throw an error if a parent job id is specified in batch runs (#362) * throw an error if a parent job id is specified in batch runs * run black * Update spec --- execution_engine2.html | 2 +- execution_engine2.spec | 3 +- .../execution_engine2Impl.py | 539 +++++++++--------- lib/execution_engine2/sdk/EE2Runjob.py | 23 +- test/tests_for_sdkmr/EE2Runjob_test.py | 37 +- 5 files changed, 322 insertions(+), 282 deletions(-) diff --git a/execution_engine2.html b/execution_engine2.html index a5b6f050e..d73d9ae5e 100644 --- a/execution_engine2.html +++ b/execution_engine2.html @@ -1 +1 @@ -execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 servicve as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*params - the parameters to pass to the method.
*
*Optional parameters:
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*run_job_batch ignores this parameter when starting a job batch.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
}
RunJobParams;

/*
*Start a new job (long running method of service registered in ServiceRegistery).
*Such job runs Docker image for this service in script mode.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

typedefstructure{
intwsid;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone elses account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to record normally not allowed..
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set true, job will set to running status skipping estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_job - state of parent job
*job_states - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and it's children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with termination_code 0.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/*
*
*
*/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambigous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdefcheck_jobs_date_range_for_user(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

funcdefcheck_jobs_date_range_for_all(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

typedefstructure{
UnspecifiedObjectheld_job;
}
HeldJob;

/*
*Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
*/
funcdefhandle_held_job(stringcluster_id)returns(HeldJob)authenticationrequired;

/*
*Check if current user has ee2 admin rights.
*/
funcdefis_admin()returns(boolean)authenticationrequired;

/*
*str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
*/
typedefstructure{
stringpermission;
}
AdminRolesResults;

/*
*Check if current user has ee2 admin rights.
*If so, return the type of rights and their roles
*/
funcdefget_admin_permission()returns(AdminRolesResults)authenticationrequired;

/*
*Get a list of clientgroups manually extracted from the config file
*/
funcdefget_client_groups()returns(list<string>client_groups)authenticationnone;
};
\ No newline at end of file +execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*params - the parameters to pass to the method.
*
*Optional parameters:
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*Submitting a job with a parent ID to run_job_batch will cause an error to be
*returned.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
}
RunJobParams;

/*
*Start a new job (long running method of service registered in ServiceRegistry).
*Such job runs Docker image for this service in script mode.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

typedefstructure{
intwsid;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to record normally not allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set to true, the job will be set to running status, skipping the estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_job - state of parent job
*job_states - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with terminated_code 0.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/*
*
*
*/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdefcheck_jobs_date_range_for_user(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

funcdefcheck_jobs_date_range_for_all(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

typedefstructure{
UnspecifiedObjectheld_job;
}
HeldJob;

/*
*Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
*/
funcdefhandle_held_job(stringcluster_id)returns(HeldJob)authenticationrequired;

/*
*Check if current user has ee2 admin rights.
*/
funcdefis_admin()returns(boolean)authenticationrequired;

/*
*str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
*/
typedefstructure{
stringpermission;
}
AdminRolesResults;

/*
*Check if current user has ee2 admin rights.
*If so, return the type of rights and their roles
*/
funcdefget_admin_permission()returns(AdminRolesResults)authenticationrequired;

/*
*Get a list of clientgroups manually extracted from the config file
*/
funcdefget_client_groups()returns(list<string>client_groups)authenticationnone;
};
\ No newline at end of file diff --git a/execution_engine2.spec b/execution_engine2.spec index fe94b7c47..35848f4dd 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -109,7 +109,8 @@ the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. - run_job_batch ignores this parameter when starting a job batch. + Submitting a job with a parent ID to run_job_batch will cause an error to be + returned. */ typedef structure { diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 19c9a3c9e..2efb08ba5 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -30,7 +30,7 @@ class execution_engine2: ######################################### noqa VERSION = "0.0.5" GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" - GIT_COMMIT_HASH = "462a6110e5837e67e574e6db169a79a4d67fa8b4" + GIT_COMMIT_HASH = "ba016db2ffabc0fa48f79559816cf0f115c00feb" #BEGIN_CLASS_HEADER MONGO_COLLECTION = "jobs" @@ -198,21 +198,22 @@ def run_job(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. 
token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String :returns: instance of type "job_id" (A job id.) """ # ctx is the context object @@ -254,21 +255,22 @@ def run_job_batch(self, ctx, params, batch_params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) 
-> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String :param batch_params: instance of type "BatchParams" -> structure: parameter "wsid" of Long :returns: instance of type "BatchSubmission" -> structure: parameter @@ -345,21 +347,22 @@ def run_job_concierge(self, ctx, params, concierge_params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) 
-> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String :param concierge_params: instance of type "ConciergeParams" (EE2Constants Concierge Params are request_cpus: int request_memory: int in MB request_disk: int in GB job_priority: @@ -426,21 +429,22 @@ def get_job_params(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. 
dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String """ # ctx is the context object # return variables are: params @@ -701,29 +705,29 @@ def check_job(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) 
-> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 
1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "created" of Long, parameter + "queued" of Long, parameter "estimating" of Long, parameter + "running" of Long, parameter "finished" of Long, parameter + "updated" of Long, parameter "error" of type "JsonRpcError" (Error + block of JSON RPC response) -> structure: parameter "name" of + String, parameter "code" of Long, parameter "message" of String, + parameter "error" of String, parameter "error_code" of Long, + parameter "errormsg" of String, parameter "terminated_code" of Long """ # ctx is the context object # return variables are: job_state @@ -809,66 +813,67 @@ def check_job_batch(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) 
-> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "child_jobstates" - of list of type "JobState" (job_id - string - id of the job user - - string - user who started the job wsid - int - optional id of the - workspace where the job is bound authstrat - string - what - strategy used to authenticate the job job_input - object - inputs - to the job (from the run_job call) ## TODO - verify updated - int - - timestamp since epoch in milliseconds of the last time the - status was updated running - int - timestamp since epoch in - milliseconds of when it entered the running state created - int - - timestamp since epoch in milliseconds when the job was created - finished - int - timestamp since epoch in milliseconds when the - job was finished status - string - status of the job. 
one of the - following: created - job has been created in the service - estimating - an estimation job is running to estimate resources - required for the main job, and which queue should be used queued - - job is queued to be run running - job is running on a worker node - completed - job was completed successfully error - job is no - longer running, but failed with an error terminated - job is no - longer running, terminated either due to user cancellation, admin - cancellation, or some automated task error_code - int - internal - reason why the job is an error. one of the following: 0 - unknown - 1 - job crashed 2 - job terminated by automation 3 - job ran over - time limit 4 - job was missing its automated output document 5 - - job authentication token expired errormsg - string - message (e.g. - stacktrace) accompanying an errored job error - object - the - JSON-RPC error package that accompanies the error code and message - terminated_code - int - internal reason why a job was terminated, - one of: 0 - user cancellation 1 - admin cancellation 2 - - terminated by some automatic process @optional error @optional - error_code @optional errormsg @optional terminated_code @optional - estimating @optional running @optional finished) -> structure: - parameter "job_id" of type "job_id" (A job id.), parameter "user" - of String, parameter "authstrat" of String, parameter "wsid" of - Long, parameter "status" of String, parameter "job_input" of type - "RunJobParams" (method - the SDK method to run in module.method - format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id - of the Narrative application (UI) running this job (e.g. - repo/name) params - the parameters to pass to the method. Optional + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) 
-> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "created" of Long, parameter + "queued" of Long, parameter "estimating" of Long, parameter + "running" of Long, parameter "finished" of Long, parameter + "updated" of Long, parameter "error" of type "JsonRpcError" (Error + block of JSON RPC response) -> structure: parameter "name" of + String, parameter "code" of Long, parameter "message" of String, + parameter "error" of String, parameter "error_code" of Long, + parameter "errormsg" of String, parameter "terminated_code" of + Long, parameter "child_jobstates" of list of type "JobState" + (job_id - string - id of the job user - string - user who started + the job wsid - int - optional id of the workspace where the job is + bound authstrat - string - what strategy used to authenticate the + job job_input - object - inputs to the job (from the run_job call) + ## TODO - verify updated - int - timestamp since epoch in + milliseconds of the last time the status was updated running - int + - timestamp since epoch in milliseconds of when it entered the + running state created - int - timestamp since epoch in + milliseconds when the job was 
created finished - int - timestamp + since epoch in milliseconds when the job was finished status - + string - status of the job. one of the following: created - job + has been created in the service estimating - an estimation job is + running to estimate resources required for the main job, and which + queue should be used queued - job is queued to be run running - + job is running on a worker node completed - job was completed + successfully error - job is no longer running, but failed with an + error terminated - job is no longer running, terminated either due + to user cancellation, admin cancellation, or some automated task + error_code - int - internal reason why the job is an error. one of + the following: 0 - unknown 1 - job crashed 2 - job terminated by + automation 3 - job ran over time limit 4 - job was missing its + automated output document 5 - job authentication token expired + errormsg - string - message (e.g. stacktrace) accompanying an + errored job error - object - the JSON-RPC error package that + accompanies the error code and message terminated_code - int - + internal reason why a job was terminated, one of: 0 - user + cancellation 1 - admin cancellation 2 - terminated by some + automatic process @optional error @optional error_code @optional + errormsg @optional terminated_code @optional estimating @optional + running @optional finished) -> structure: parameter "job_id" of + type "job_id" (A job id.), parameter "user" of String, parameter + "authstrat" of String, parameter "wsid" of Long, parameter + "status" of String, parameter "job_input" of type "RunJobParams" + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve @@ -882,29 +887,29 @@ def check_job_batch(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + the parent job record is not altered. 
Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "created" of Long, parameter + "queued" of Long, parameter "estimating" of Long, parameter + "running" of Long, parameter "finished" of Long, parameter + "updated" of Long, parameter "error" of type "JsonRpcError" (Error + block of JSON RPC response) -> structure: parameter "name" of + String, parameter "code" of Long, parameter "message" of String, + parameter "error" of String, parameter "error_code" of Long, + parameter "errormsg" of String, parameter "terminated_code" of Long """ # ctx is the context object # return variables are: returnVal @@ -988,29 +993,29 @@ def check_jobs(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) 
-> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 
1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "created" of Long, parameter + "queued" of Long, parameter "estimating" of Long, parameter + "running" of Long, parameter "finished" of Long, parameter + "updated" of Long, parameter "error" of type "JsonRpcError" (Error + block of JSON RPC response) -> structure: parameter "name" of + String, parameter "code" of Long, parameter "message" of String, + parameter "error" of String, parameter "error_code" of Long, + parameter "errormsg" of String, parameter "terminated_code" of Long """ # ctx is the context object # return variables are: returnVal @@ -1097,29 +1102,29 @@ def check_workspace_jobs(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) 
-> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "created" of Long, parameter + "queued" of Long, parameter "estimating" of Long, parameter + "running" of Long, parameter "finished" of Long, parameter + "updated" of Long, parameter "error" of type "JsonRpcError" (Error + block of JSON RPC response) -> structure: parameter "name" of + String, parameter "code" of Long, parameter "message" of String, + parameter "error" of String, parameter "error_code" of Long, + parameter "errormsg" of String, parameter "terminated_code" of Long """ # ctx is the context object # return variables are: returnVal @@ -1350,33 +1355,33 @@ def check_jobs_date_range_for_user(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) 
-> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "count" of Long, - parameter "query_count" of Long, parameter "filter" of mapping - from String to String, parameter "skip" of Long, parameter - "projection" of list of String, parameter "limit" of Long, - parameter "sort_order" of String + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "created" of Long, parameter + "queued" of Long, parameter "estimating" of Long, parameter + "running" of Long, parameter "finished" of Long, parameter + "updated" of Long, parameter "error" of type "JsonRpcError" (Error + block of JSON RPC response) -> structure: parameter "name" of + String, parameter "code" of Long, parameter "message" of String, + parameter "error" of String, parameter "error_code" of Long, + parameter "errormsg" of String, parameter "terminated_code" of + Long, parameter "count" of Long, parameter "query_count" of Long, + parameter "filter" of mapping from String to String, parameter + "skip" of Long, parameter "projection" of list of String, + parameter "limit" of Long, parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal @@ -1518,33 +1523,33 @@ def check_jobs_date_range_for_all(self, ctx, params): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y/Z, where X is the - workspace id, Y is the object id, Z is the version.), parameter - "meta" of type "Meta" (Narrative metadata for a job. All fields - are optional. run_id - the Narrative-assigned ID of the job run. - 1:1 with a job ID. token_id - the ID of the token used to run the - method. tag - the release tag, e.g. dev/beta/release. 
cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "count" of Long, - parameter "query_count" of Long, parameter "filter" of mapping - from String to String, parameter "skip" of Long, parameter - "projection" of list of String, parameter "limit" of Long, - parameter "sort_order" of String + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "created" of Long, parameter + "queued" of Long, parameter "estimating" of Long, parameter + "running" of Long, parameter "finished" of Long, parameter + "updated" of Long, parameter "error" of type "JsonRpcError" (Error + block of JSON RPC response) -> structure: parameter "name" of + String, parameter "code" of Long, parameter "message" of String, + parameter "error" of String, parameter "error_code" of Long, + parameter "errormsg" of String, parameter "terminated_code" of + Long, parameter "count" of Long, parameter "query_count" of Long, + parameter "filter" of mapping from String to String, parameter + "skip" of Long, parameter "projection" of list of String, + parameter "limit" of Long, parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 35e81b950..8a63b88ed 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -338,7 +338,7 @@ def run_batch( self._check_workspace_permissions_list(wsids) self._add_job_requirements(params) - self._check_job_arguments(params) + self._check_job_arguments(params, has_parent_job=True) parent_job = self._create_parent_job(wsid=wsid, meta=meta) children_jobs = self._run_batch(parent_job=parent_job, params=params) @@ -360,23 +360,28 @@ def _add_job_requirements(self, jobs: List[Dict[str, Any]]): # TODO JRR actually process the requirements once added to the spec j[_JOB_REQUIREMENTS] = jrr.resolve_requirements(j.get(_METHOD)) - def _check_job_arguments(self, jobs): - # perform sanity checks before creating job or parent job - for j in jobs: + def _check_job_arguments(self, jobs, has_parent_job=False): + # perform sanity checks before 
creating any jobs, including the parent job for batch jobs + for i, job in enumerate(jobs): # Could make an argument checker method, or a class that doesn't require a job id. # Seems like more code & work for no real benefit though. # Just create the class for checks, don't use yet JobSubmissionParameters( "fakejobid", - AppInfo(j.get(_METHOD), j.get(_APP_ID)), - j[_JOB_REQUIREMENTS], + AppInfo(job.get(_METHOD), job.get(_APP_ID)), + job[_JOB_REQUIREMENTS], UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), - wsid=j.get(_WORKSPACE_ID), - source_ws_objects=j.get(_SOURCE_WS_OBJECTS), + wsid=job.get(_WORKSPACE_ID), + source_ws_objects=job.get(_SOURCE_WS_OBJECTS), ) + if has_parent_job and job.get(_PARENT_JOB_ID): + pre = f"Job #{i + 1}: b" if len(jobs) > 1 else "B" + raise IncorrectParamsException( + f"{pre}atch jobs may not specify a parent job ID" + ) # This is also an opportunity for caching # although most likely jobs aren't operating on the same object - self._check_ws_objects(source_objects=j.get(_SOURCE_WS_OBJECTS)) + self._check_ws_objects(source_objects=job.get(_SOURCE_WS_OBJECTS)) def run( self, params=None, as_admin=False, concierge_params: Dict = None diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index d7c2530c2..0bb8f199c 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -808,13 +808,42 @@ def test_run_batch_fail_params_not_list(): "a", 8, ]: - with raises(Exception) as got: - rj.run_batch(params, {}, as_admin=True) - assert_exception_correct( - got.value, IncorrectParamsException("params must be a list") + _run_batch_fail( + rj, params, {}, True, IncorrectParamsException("params must be a list") ) +def test_run_batch_fail_parent_id_included(): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + rj = EE2RunJob(sdkmr) + + _run_batch_fail( + rj, + [{"method": "foo.bar", "app_id": "foo/bat", "parent_job_id": "a"}], + {}, + True, + 
IncorrectParamsException("Batch jobs may not specify a parent job ID"), + ) + + _run_batch_fail( + rj, + [ + {"method": "foo.bar", "app_id": "foo/bat"}, + {"method": "foo.bar", "app_id": "foo/bat", "parent_job_id": "a"}, + ], + {}, + True, + IncorrectParamsException("Job #2: batch jobs may not specify a parent job ID"), + ) + + +def _run_batch_fail(run_job, params, batch_params, as_admin, expected): + with raises(Exception) as got: + run_job.run_batch(params, batch_params, as_admin=as_admin) + assert_exception_correct(got.value, expected) + + def assert_jobs_equal(got_job: Job, expected_job: Job): """ Checks that the two jobs are equivalent, except that the 'updated' fields are checked that From 55172786fb43f4e9be405221fab305a7e09ebb0d Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 27 Apr 2021 09:19:44 -0700 Subject: [PATCH 052/109] Factor out helper methods for run_job integration test (#363) Will be reused in concierge test, maybe batch test --- test/tests_for_integration/api_to_db_test.py | 203 ++++++++++--------- 1 file changed, 108 insertions(+), 95 deletions(-) diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index f90a85719..9ed28b197 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -393,12 +393,8 @@ def _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub): sub.queue.assert_called_once_with(txn, 1) -def test_run_job(ee2_port, ws_controller, mongo_client): - """ - A test of the run_job method. 
- """ - # Set up workspace and objects - wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) +def _set_up_workspace_objects(ws_controller, token): + wsc = Workspace(ws_controller.get_url(), token=token) wsc.create_workspace({"workspace": "foo"}) wsc.save_objects( { @@ -410,6 +406,108 @@ def test_run_job(ee2_port, ws_controller, mongo_client): } ) + +def _get_run_job_param_set(): + return { + "method": "mod.meth", + "app_id": "mod/app", + "wsid": 1, + "source_ws_objects": ["1/1/1", "1/2/1"], + "params": [{"foo": "bar"}, 42], + "service_ver": "beta", + "parent_job_id": "totallywrongid", + "meta": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + "thiskey": "getssilentlydropped", + }, + } + + +def _get_condor_sub_for_rj_param_set(job_id, user, token, cpu, mem): + expected_sub = _get_common_sub(job_id) + expected_sub.update( + { + "JobBatchName": job_id, + "arguments": f"{job_id} https://ci.kbase.us/services/ee2", + "+KB_PARENT_JOB_ID": '"totallywrongid"', + "+KB_MODULE_NAME": '"mod"', + "+KB_FUNCTION_NAME": '"meth"', + "+KB_APP_ID": '"mod/app"', + "+KB_APP_MODULE_NAME": '"mod"', + "+KB_WSID": '"1"', + "+KB_SOURCE_WS_OBJECTS": '"1/1/1,1/2/1"', + "request_cpus": f"{cpu}", + "request_memory": f"{mem}MB", + "request_disk": "30GB", + "requirements": 'regexp("njs",CLIENTGROUP)', + "+KB_CLIENTGROUP": '"njs"', + "Concurrency_Limits": f"{user}", + "+AccountingGroup": f'"{user}"', + "environment": ( + '"DOCKER_JOB_TIMEOUT=604805 KB_ADMIN_AUTH_TOKEN=test_auth_token ' + + f"KB_AUTH_TOKEN={token} CLIENTGROUP=njs JOB_ID={job_id} " + + "CONDOR_ID=$(Cluster).$(Process) PYTHON_EXECUTABLE=/miniconda/bin/python " + + 'DEBUG_MODE=False PARENT_JOB_ID=totallywrongid "' + ), + "leavejobinqueue": "true", + "initial_dir": "../scripts/", + "+Owner": '"condor_pool"', + "executable": "../scripts//../scripts/execute_runner.sh", + "transfer_input_files": "../scripts/JobRunner.tgz", + } + ) + return expected_sub + + +def _check_mongo_job(mongo_client, 
job_id, user, cpu, mem, githash): + job = mongo_client[MONGO_EE2_DB][MONGO_EE2_JOBS_COL].find_one( + {"_id": ObjectId(job_id)} + ) + assert_close_to_now(job.pop("updated")) + assert_close_to_now(job.pop("queued")) + expected_job = { + "_id": ObjectId(job_id), + "user": user, + "authstrat": "kbaseworkspace", + "wsid": 1, + "status": "queued", + "job_input": { + "wsid": 1, + "method": "mod.meth", + "params": [{"foo": "bar"}, 42], + "service_ver": githash, + "app_id": "mod/app", + "source_ws_objects": ["1/1/1", "1/2/1"], + "parent_job_id": "totallywrongid", + "requirements": { + "clientgroup": "njs", + "cpu": cpu, + "memory": mem, + "disk": 30, + }, + "narrative_cell_info": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + }, + }, + "child_jobs": [], + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + } + assert job == expected_job + + +def test_run_job(ee2_port, ws_controller, mongo_client): + """ + A test of the run_job method. + """ + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) # need to get the mock objects first so spec_set can do its magic before we mock out # the classes in the context manager sub, schedd, txn = _get_htc_mocks() @@ -431,24 +529,7 @@ def test_run_job(ee2_port, ws_controller, mongo_client): # run the method ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) - job_id = ee2.run_job( - { - "method": "mod.meth", - "app_id": "mod/app", - "wsid": 1, - "source_ws_objects": ["1/1/1", "1/2/1"], - "params": [{"foo": "bar"}, 42], - "service_ver": "beta", - "parent_job_id": "totallywrongid", - "meta": { - "run_id": "rid", - "token_id": "tid", - "tag": "yourit", - "cell_id": "cid", - "thiskey": "getssilentlydropped", - }, - } - ) + job_id = ee2.run_job(_get_run_job_param_set()) # check that mocks were called correctly # Since these are class methods, the first argument is self, which we ignore @@ -459,80 +540,12 @@ def test_run_job(ee2_port, ws_controller, 
mongo_client): ANY, {"module_name": "mod", "function_name": "meth"} ) - expected_sub = _get_common_sub(job_id) - expected_sub.update( - { - "JobBatchName": job_id, - "arguments": f"{job_id} https://ci.kbase.us/services/ee2", - "+KB_PARENT_JOB_ID": '"totallywrongid"', - "+KB_MODULE_NAME": '"mod"', - "+KB_FUNCTION_NAME": '"meth"', - "+KB_APP_ID": '"mod/app"', - "+KB_APP_MODULE_NAME": '"mod"', - "+KB_WSID": '"1"', - "+KB_SOURCE_WS_OBJECTS": '"1/1/1,1/2/1"', - "request_cpus": "8", - "request_memory": "5MB", - "request_disk": "30GB", - "requirements": 'regexp("njs",CLIENTGROUP)', - "+KB_CLIENTGROUP": '"njs"', - "Concurrency_Limits": f"{USER_NO_ADMIN}", - "+AccountingGroup": f'"{USER_NO_ADMIN}"', - "environment": ( - '"DOCKER_JOB_TIMEOUT=604805 KB_ADMIN_AUTH_TOKEN=test_auth_token ' - + f"KB_AUTH_TOKEN={TOKEN_NO_ADMIN} CLIENTGROUP=njs JOB_ID={job_id} " - + "CONDOR_ID=$(Cluster).$(Process) PYTHON_EXECUTABLE=/miniconda/bin/python " - + 'DEBUG_MODE=False PARENT_JOB_ID=totallywrongid "' - ), - "leavejobinqueue": "true", - "initial_dir": "../scripts/", - "+Owner": '"condor_pool"', - "executable": "../scripts//../scripts/execute_runner.sh", - "transfer_input_files": "../scripts/JobRunner.tgz", - } + expected_sub = _get_condor_sub_for_rj_param_set( + job_id, USER_NO_ADMIN, TOKEN_NO_ADMIN, 8, 5 ) - _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub) - # check the mongo record is correct - job = mongo_client[MONGO_EE2_DB][MONGO_EE2_JOBS_COL].find_one( - {"_id": ObjectId(job_id)} - ) - assert_close_to_now(job.pop("updated")) - assert_close_to_now(job.pop("queued")) - expected_job = { - "_id": ObjectId(job_id), - "user": USER_NO_ADMIN, - "authstrat": "kbaseworkspace", - "wsid": 1, - "status": "queued", - "job_input": { - "wsid": 1, - "method": "mod.meth", - "params": [{"foo": "bar"}, 42], - "service_ver": "somehash", - "app_id": "mod/app", - "source_ws_objects": ["1/1/1", "1/2/1"], - "parent_job_id": "totallywrongid", - "requirements": { - "clientgroup": "njs", - 
"cpu": 8, - "memory": 5, - "disk": 30, - }, - "narrative_cell_info": { - "run_id": "rid", - "token_id": "tid", - "tag": "yourit", - "cell_id": "cid", - }, - }, - "child_jobs": [], - "batch_job": False, - "scheduler_id": "123", - "scheduler_type": "condor", - } - assert job == expected_job + _check_mongo_job(mongo_client, job_id, USER_NO_ADMIN, 8, 5, "somehash") def test_run_job_fail_no_workspace_access(ee2_port): From c98e8c528a73c022e9b77f247e94b03da4d9bc47 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 27 Apr 2021 09:58:06 -0700 Subject: [PATCH 053/109] DATAUP-389 - fix a couple of concierge related bugs (#364) * Fix default clientgroup bug for concierge run When integrating the requirements normalizer, missed that if no client group is supplied when running run_job_concierge, a default client group should be used * Rename default concierge queue `concierge` is what's actually used in KBase prod and it's what's in the `deploy.cfg` file. --- lib/execution_engine2/sdk/EE2Constants.py | 2 +- lib/execution_engine2/sdk/EE2Runjob.py | 3 ++- test/tests_for_sdkmr/EE2Runjob_test.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Constants.py b/lib/execution_engine2/sdk/EE2Constants.py index 932d7fa27..a821ee33f 100644 --- a/lib/execution_engine2/sdk/EE2Constants.py +++ b/lib/execution_engine2/sdk/EE2Constants.py @@ -6,7 +6,7 @@ # specify an auth2 role that allows users to replace their token with the kbaseconcierge token # when running jobs. Needs more thought. 
KBASE_CONCIERGE_USERNAME = "kbaseconcierge" -CONCIERGE_CLIENTGROUP = "kbase_concierge" +CONCIERGE_CLIENTGROUP = "concierge" EE2_CONFIG_SECTION = "execution_engine2" EE2_DEFAULT_SECTION = "DEFAULT" diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 8a63b88ed..54570e3ce 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -24,6 +24,7 @@ AppInfo, UserCreds, ) +from execution_engine2.sdk.EE2Constants import CONCIERGE_CLIENTGROUP from execution_engine2.utils.job_requirements_resolver import ( REQUEST_CPUS, REQUEST_DISK, @@ -433,7 +434,7 @@ def _get_job_reqs_from_concierge_params( cpus=norm.get(REQUEST_CPUS), memory_MB=norm.get(REQUEST_MEMORY), disk_GB=norm.get(REQUEST_DISK), - client_group=norm.get(CLIENT_GROUP), + client_group=norm.get(CLIENT_GROUP) or CONCIERGE_CLIENTGROUP, client_group_regex=norm.get(CLIENT_GROUP_REGEX), # error messaging here is for 'bill_to_user' vs 'account_group' but almost impossible # to screw up so YAGNI diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 0bb8f199c..71390f8c6 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -362,7 +362,7 @@ def test_run_as_concierge_sched_reqs_empty_list_as_admin(): def _run_as_concierge_empty_as_admin(concierge_params): # Set up data variables - client_group = "somegroup" + client_group = "concierge" # hardcoded default for run_as_concierge cpus = 1 mem = 1 disk = 1 @@ -406,7 +406,7 @@ def _run_as_concierge_empty_as_admin(concierge_params): cpus=None, memory_MB=None, disk_GB=None, - client_group=None, + client_group=client_group, client_group_regex=None, ignore_concurrency_limits=True, bill_to_user=None, From 48599f5097f53e62a6db0bbe1a99363320a85d00 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 27 Apr 2021 10:47:29 -0700 Subject: [PATCH 054/109] DATAUP-389 - fix Server.py quoting error messages (#365) * Fix 
server.py quoting errors Bug was introduced a year or two ago * run black --- .../execution_engine2Server.py | 7 ++++++- test/tests_for_integration/api_to_db_test.py | 18 ++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/lib/execution_engine2/execution_engine2Server.py b/lib/execution_engine2/execution_engine2Server.py index e160802d9..5e3df02ef 100644 --- a/lib/execution_engine2/execution_engine2Server.py +++ b/lib/execution_engine2/execution_engine2Server.py @@ -122,7 +122,12 @@ def _call_method(self, ctx, request): newerr = JSONServerError() newerr.trace = traceback.format_exc() if len(e.args) == 1: - newerr.data = repr(e.args[0]) + # THIS WAS CHANGED INTENTIONALLY - if you recompile please restore. + # repr adds single quotes around string arguments which is not what we want. + if type(e.args[0]) == str: + newerr.data = e.args[0] + else: + newerr.data = repr(e.args[0]) else: newerr.data = repr(e.args) raise newerr diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 9ed28b197..b9ea9c13e 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -560,17 +560,13 @@ def test_run_job_fail_no_workspace_access(ee2_port): def test_run_job_fail_bad_method(ee2_port): params = {"method": "mod.meth.moke", "app_id": "mod/app"} - # TODO the Server.py file is quoting strings for some reason it seems - # see https://github.com/kbase/sample_service/blob/master/lib/SampleService/SampleServiceServer.py#L119-L127 - err = "\"Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name\"" + err = "Unrecognized method: 'mod.meth.moke'. 
Please input module_name.function_name" _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) def test_run_job_fail_bad_app(ee2_port): params = {"method": "mod.meth", "app_id": "mod.app"} - # TODO the Server.py file is quoting strings for some reason it seems - # see https://github.com/kbase/sample_service/blob/master/lib/SampleService/SampleServiceServer.py#L119-L127 - err = "\"Application ID 'mod.app' contains a '.'\"" + err = "Application ID 'mod.app' contains a '.'" _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) @@ -580,9 +576,9 @@ def test_run_job_fail_bad_upa(ee2_port): "app_id": "mod/app", "source_ws_objects": ["ws/obj/1"], } - # TODO the Server.py file is quoting strings for some reason it seems - # see https://github.com/kbase/sample_service/blob/master/lib/SampleService/SampleServiceServer.py#L119-L127 - err = "\"source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address\"" + err = ( + "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" + ) _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) @@ -599,9 +595,7 @@ def test_run_job_fail_no_such_object(ee2_port, ws_controller): } ) params = {"method": "mod.meth", "app_id": "mod/app", "source_ws_objects": ["1/2/1"]} - # TODO the Server.py file is quoting strings for some reason it seems - # see https://github.com/kbase/sample_service/blob/master/lib/SampleService/SampleServiceServer.py#L119-L127 - err = "'Some workspace object is inaccessible'" + err = "Some workspace object is inaccessible" _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) From 2b242f7df8b0733bd34fc86719808645284e47e8 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 28 Apr 2021 12:53:29 -0500 Subject: [PATCH 055/109] DATAUP-423 Removed extra logging (#368) * Removed extra logging * Removed extra logging Co-authored-by: Boris Sadkhin --- lib/execution_engine2/db/MongoUtil.py | 2 -- lib/execution_engine2/sdk/EE2Runjob.py | 7 ------- lib/execution_engine2/utils/Condor.py | 10 ++-------- 
3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py index d00c38d44..1499cf267 100644 --- a/lib/execution_engine2/db/MongoUtil.py +++ b/lib/execution_engine2/db/MongoUtil.py @@ -373,8 +373,6 @@ def update_job_status(self, job_id, status, msg=None, error_message=None): f"Cannot change already finished/terminated/errored job. {j.status} to {status}" ) - self.logger.debug(f"job status is {j.status}. going to update to {status}") - # A job in status running can only be terminated/error/finished if j.status == Status.running.value: if status not in [ diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 54570e3ce..37c71c47b 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -108,9 +108,6 @@ def _init_job_rec( inputs.requirements = jr job.job_input = inputs - self.logger.debug(job.job_input.to_mongo().to_dict()) - - self.logger.debug(job.to_mongo().to_dict()) job_id = self.sdkmr.save_job(job) self.sdkmr.get_kafka_client().send_kafka_message( @@ -238,10 +235,6 @@ def _run(self, params): self._finish_created_job(job_id=job_id, exception=RuntimeError(error_msg)) raise RuntimeError(error_msg) - self.logger.debug( - f"Attempting to update job to queued {job_id} {condor_job_id} {submission_info}" - ) - self.update_job_to_queued(job_id=job_id, scheduler_id=condor_job_id) self.sdkmr.get_slack_client().run_job_message( job_id=job_id, scheduler_id=condor_job_id, username=self.sdkmr.get_user_id() diff --git a/lib/execution_engine2/utils/Condor.py b/lib/execution_engine2/utils/Condor.py index 0881a45b3..31be2e60f 100644 --- a/lib/execution_engine2/utils/Condor.py +++ b/lib/execution_engine2/utils/Condor.py @@ -197,18 +197,14 @@ def run_job(self, params: JobSubmissionParameters) -> SubmissionInfo: TODO: Add a retry TODO: Add list of required params :param params: Params to run the job. 
- :return: + :return: ClusterID, Submit File, and Info about Errors """ + # Contains sensitive information to be sent to condor submit = self._create_submit(_not_falsy(params, "params")) sub = self.htcondor.Submit(submit) try: schedd = self.htcondor.Schedd() - self.logger.debug(schedd) - self.logger.debug(submit) - self.logger.debug(os.getuid()) - self.logger.debug(pwd.getpwuid(os.getuid()).pw_name) - self.logger.debug(submit) with schedd.transaction() as txn: return SubmissionInfo(str(sub.queue(txn, 1)), sub, None) except Exception as e: @@ -315,8 +311,6 @@ def _cancel_jobs(self, scheduler_ids: list): cancel_jobs = self.htcondor.Schedd().act( action=self.htcondor.JobAction.Remove, job_spec=scheduler_ids ) - self.logger.info(f"Cancel job message for {scheduler_ids} is") - self.logger.debug(f"{cancel_jobs}") return cancel_jobs except Exception: self.logger.error( From b4b1f66371ea3c4a1c994613bd7060d2cd0880dd Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Wed, 28 Apr 2021 12:37:27 -0700 Subject: [PATCH 056/109] DATAUP-389 - Add happy path integration tests for run_job_concierge (#366) * Add run_job_concierge happy path integration tests * run black * Add var names to _run_job_concierge calls +readabilty --- lib/execution_engine2/utils/Condor.py | 4 +- test/deploy.cfg | 5 + test/tests_for_integration/api_to_db_test.py | 176 +++++++++++++++++-- 3 files changed, 173 insertions(+), 12 deletions(-) diff --git a/lib/execution_engine2/utils/Condor.py b/lib/execution_engine2/utils/Condor.py index 31be2e60f..e53536d9c 100644 --- a/lib/execution_engine2/utils/Condor.py +++ b/lib/execution_engine2/utils/Condor.py @@ -137,8 +137,8 @@ def _create_requirements_statement(self, job_reqs: JobRequirements) -> str: reqs = [f'regexp("{job_reqs.client_group}",CLIENTGROUP)'] else: reqs = [f'(CLIENTGROUP == "{job_reqs.client_group}")'] - for key, value in job_reqs.scheduler_requirements.items(): - reqs.append(f'({key} == "{value}")') + for key in sorted(job_reqs.scheduler_requirements): + 
reqs.append(f'({key} == "{job_reqs.scheduler_requirements[key]}")') return " && ".join(reqs) def _add_resources_and_special_attributes( diff --git a/test/deploy.cfg b/test/deploy.cfg index 25bb1ac77..0bfb9bc55 100644 --- a/test/deploy.cfg +++ b/test/deploy.cfg @@ -51,6 +51,11 @@ transfer_input_files = ../scripts/JobRunner.tgz # Log Level and sending DEBUG=true to the jobs, which means containers do not get cleaned up debug = false +#---------------------------------------------------------------------------------------# +[concierge] +request_cpus = 4 +request_memory = 23000M +request_disk = 100GB #---------------------------------------------------------------------------------------# [njs] request_cpus = 4 diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index b9ea9c13e..7f36ce648 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -75,6 +75,9 @@ USER_WRITE_ADMIN = "writeuser" TOKEN_WRITE_ADMIN = None +USER_KBASE_CONCIERGE = "kbaseconcierge" +TOKEN_KBASE_CONCIERGE = None + USER_WS_READ_ADMIN = "wsreadadmin" TOKEN_WS_READ_ADMIN = None USER_WS_FULL_ADMIN = "wsfulladmin" @@ -153,6 +156,11 @@ def _set_up_auth_users(auth_url): auth_url, USER_WRITE_ADMIN, "display3", [ADMIN_WRITE_ROLE] ) + global TOKEN_KBASE_CONCIERGE + TOKEN_KBASE_CONCIERGE = _set_up_auth_user( + auth_url, USER_KBASE_CONCIERGE, "concierge" + ) + global TOKEN_WS_READ_ADMIN TOKEN_WS_READ_ADMIN = _set_up_auth_user( auth_url, USER_WS_READ_ADMIN, "wsra", [WS_READ_ADMIN] @@ -426,7 +434,7 @@ def _get_run_job_param_set(): } -def _get_condor_sub_for_rj_param_set(job_id, user, token, cpu, mem): +def _get_condor_sub_for_rj_param_set(job_id, user, token, clientgroup, cpu, mem, disk): expected_sub = _get_common_sub(job_id) expected_sub.update( { @@ -441,14 +449,14 @@ def _get_condor_sub_for_rj_param_set(job_id, user, token, cpu, mem): "+KB_SOURCE_WS_OBJECTS": '"1/1/1,1/2/1"', "request_cpus": f"{cpu}", 
"request_memory": f"{mem}MB", - "request_disk": "30GB", - "requirements": 'regexp("njs",CLIENTGROUP)', - "+KB_CLIENTGROUP": '"njs"', + "request_disk": f"{disk}GB", + "requirements": f'regexp("{clientgroup}",CLIENTGROUP)', + "+KB_CLIENTGROUP": f'"{clientgroup}"', "Concurrency_Limits": f"{user}", "+AccountingGroup": f'"{user}"', "environment": ( '"DOCKER_JOB_TIMEOUT=604805 KB_ADMIN_AUTH_TOKEN=test_auth_token ' - + f"KB_AUTH_TOKEN={token} CLIENTGROUP=njs JOB_ID={job_id} " + + f"KB_AUTH_TOKEN={token} CLIENTGROUP={clientgroup} JOB_ID={job_id} " + "CONDOR_ID=$(Cluster).$(Process) PYTHON_EXECUTABLE=/miniconda/bin/python " + 'DEBUG_MODE=False PARENT_JOB_ID=totallywrongid "' ), @@ -462,7 +470,7 @@ def _get_condor_sub_for_rj_param_set(job_id, user, token, cpu, mem): return expected_sub -def _check_mongo_job(mongo_client, job_id, user, cpu, mem, githash): +def _check_mongo_job(mongo_client, job_id, user, clientgroup, cpu, mem, disk, githash): job = mongo_client[MONGO_EE2_DB][MONGO_EE2_JOBS_COL].find_one( {"_id": ObjectId(job_id)} ) @@ -483,10 +491,10 @@ def _check_mongo_job(mongo_client, job_id, user, cpu, mem, githash): "source_ws_objects": ["1/1/1", "1/2/1"], "parent_job_id": "totallywrongid", "requirements": { - "clientgroup": "njs", + "clientgroup": clientgroup, "cpu": cpu, "memory": mem, - "disk": 30, + "disk": disk, }, "narrative_cell_info": { "run_id": "rid", @@ -541,11 +549,13 @@ def test_run_job(ee2_port, ws_controller, mongo_client): ) expected_sub = _get_condor_sub_for_rj_param_set( - job_id, USER_NO_ADMIN, TOKEN_NO_ADMIN, 8, 5 + job_id, USER_NO_ADMIN, TOKEN_NO_ADMIN, "njs", 8, 5, 30 ) _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub) - _check_mongo_job(mongo_client, job_id, USER_NO_ADMIN, 8, 5, "somehash") + _check_mongo_job( + mongo_client, job_id, USER_NO_ADMIN, "njs", 8, 5, 30, "somehash" + ) def test_run_job_fail_no_workspace_access(ee2_port): @@ -607,3 +617,149 @@ def _run_job_fail(ee2_port, token, params, expected, 
throw_exception=False): with raises(ServerError) as got: client.run_job(params) assert_exception_correct(got.value, ServerError("name", 1, expected)) + + +######## run_job_concierge tests ######## + + +def test_run_job_concierge_minimal(ee2_port, ws_controller, mongo_client): + def modify_sub(sub): + del sub["Concurrency_Limits"] + + _run_job_concierge( + ee2_port=ee2_port, + ws_controller=ws_controller, + mongo_client=mongo_client, + # if the concierge dict is empty, regular old run_job gets run + conc_params={"trigger": "concierge"}, # contents are ignored + modify_sub=modify_sub, + clientgroup="concierge", + cpu=4, + mem=23000, + disk=100, + ) + + +def test_run_job_concierge_mixed(ee2_port, ws_controller, mongo_client): + """ + Gets cpu from the input, memory from deploy.cfg, and disk from the catalog. + """ + + def modify_sub(sub): + del sub["Concurrency_Limits"] + + _run_job_concierge( + ee2_port=ee2_port, + ws_controller=ws_controller, + mongo_client=mongo_client, + conc_params={"client_group": "extreme", "request_cpus": 76}, + modify_sub=modify_sub, + clientgroup="extreme", + cpu=76, + mem=250000, + disk=7, + catalog_return=[{"client_groups": ['{"request_cpus":8,"request_disk":7}']}], + ) + + +def test_run_job_concierge_maximal(ee2_port, ws_controller, mongo_client): + def modify_sub(sub): + sub[ + "requirements" + ] = '(CLIENTGROUP == "bigmem") && (baz == "bat") && (foo == "bar")' + sub["Concurrency_Limits"] = "some_sucker" + sub["+AccountingGroup"] = '"some_sucker"' + sub["environment"] = sub["environment"].replace( + "DEBUG_MODE=False", "DEBUG_MODE=True" + ) + + _run_job_concierge( + ee2_port=ee2_port, + ws_controller=ws_controller, + mongo_client=mongo_client, + conc_params={ + "client_group": "bigmem", + "request_cpus": 42, + "request_memory": 56, + "request_disk": 89, + "client_group_regex": False, + "account_group": "some_sucker", + "ignore_concurrency_limits": False, + "requirements_list": ["foo=bar", "baz=bat"], + "debug_mode": "true", + }, + 
modify_sub=modify_sub, + clientgroup="bigmem", + cpu=42, + mem=56, + disk=89, + ) + + +def _run_job_concierge( + ee2_port, + ws_controller, + mongo_client, + conc_params, + modify_sub, + clientgroup, + cpu, + mem, + disk, + catalog_return=None, +): + _set_up_workspace_objects(ws_controller, TOKEN_KBASE_CONCIERGE) + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? + with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.return_value = 123 + list_cgroups.return_value = catalog_return or [] + get_mod_ver.return_value = {"git_commit_hash": "somehash"} + + # run the method + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_KBASE_CONCIERGE) + # if the concierge dict is empty, regular old run_job gets run + job_id = ee2.run_job_concierge(_get_run_job_param_set(), conc_params) + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_called_once_with( + ANY, {"module_name": "mod", "version": "beta"} + ) + list_cgroups.assert_called_once_with( + ANY, {"module_name": "mod", "function_name": "meth"} + ) + + expected_sub = _get_condor_sub_for_rj_param_set( + job_id, + USER_KBASE_CONCIERGE, + TOKEN_KBASE_CONCIERGE, + clientgroup, + cpu, + mem, + disk, + ) + modify_sub(expected_sub) + + _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub) + + _check_mongo_job( + mongo_client, + job_id, + USER_KBASE_CONCIERGE, + clientgroup, + cpu, + 
mem, + disk, + "somehash", + ) From f1f547d751eb1c8aea0515d029531c356613a53e Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Wed, 28 Apr 2021 13:33:07 -0700 Subject: [PATCH 057/109] DATAUP-389 - Fix bug in job requirements resolver (#369) * Fix bug in requirements resolver Need to check for both value and type errors when parting ints * remove unused imports --- lib/execution_engine2/utils/Condor.py | 2 -- lib/execution_engine2/utils/job_requirements_resolver.py | 2 +- test/tests_for_utils/job_requirements_resolver_test.py | 8 ++++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/execution_engine2/utils/Condor.py b/lib/execution_engine2/utils/Condor.py index e53536d9c..6be7905f8 100644 --- a/lib/execution_engine2/utils/Condor.py +++ b/lib/execution_engine2/utils/Condor.py @@ -3,9 +3,7 @@ Functions to call condor to manage jobs and extract resource requirements """ import logging -import os import pathlib -import pwd from typing import Dict, Optional, Any import htcondor diff --git a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py index afabdf144..a6c346256 100644 --- a/lib/execution_engine2/utils/job_requirements_resolver.py +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -87,7 +87,7 @@ def _int_request(putative_int, original, name, source): _check_raise(f"{name} request", original, source) try: return int(putative_int) - except ValueError: + except (ValueError, TypeError): _check_raise(f"{name} request", original, source) diff --git a/test/tests_for_utils/job_requirements_resolver_test.py b/test/tests_for_utils/job_requirements_resolver_test.py index 670a52ee5..9deb59c56 100644 --- a/test/tests_for_utils/job_requirements_resolver_test.py +++ b/test/tests_for_utils/job_requirements_resolver_test.py @@ -177,6 +177,14 @@ def test_normalize_job_reqs_fail_cpu(): "Found illegal cpu request '26M' in job requirements from src4" ), ) + _normalize_job_reqs_fail( + 
{"request_cpus": ["26M"]}, + "src4.5", + False, + IncorrectParamsException( + "Found illegal cpu request '['26M']' in job requirements from src4.5" + ), + ) def test_normalize_job_reqs_fail_mem(): From 7e06858d9231a7bc7e2f88ee4e8210f54e4731b3 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 29 Apr 2021 09:52:07 -0500 Subject: [PATCH 058/109] Update deploy.docker.cfg.templ (#371) Fixes ``` execution_engine2.exceptions.IncorrectParamsException: Found illegal disk request '100GBraiss' in job requirements from section 'hpc' of the deployment configuration [2021-04-29 01:36:23 +0000] [45] [INFO] Worker exiting (pid: 45) [2021-04-29 01:36:24 +0000] [43] [ERROR] Exception in worker process Traceback (most recent call last): File "/kb/module/lib/execution_engine2/utils/job_requirements_resolver.py", line 89, in _int_request return int(putative_int) ValueError: invalid literal for int() with base 10: '100GBraiss' ``` --- build/templates/deploy.docker.cfg.templ | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/templates/deploy.docker.cfg.templ b/build/templates/deploy.docker.cfg.templ index c237b344a..e314aee23 100644 --- a/build/templates/deploy.docker.cfg.templ +++ b/build/templates/deploy.docker.cfg.templ @@ -76,7 +76,7 @@ request_disk = 100GB [hpc] request_cpus = 4 request_memory = 2000M -request_disk = 100GBraiss +request_disk = 100GB #---------------------------------------------------------------------------------------# [DEFAULT] default_client_group = njs From adf286384681883deee01c11e57d42c9a4e6aebd Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 29 Apr 2021 08:50:42 -0700 Subject: [PATCH 059/109] DATAUP-389 - Add unhappy path integration tests for run_job_concierge (#370) * Add run_job_concierge unhappy tests * run black * DRY up some api to db test params * oopsie --- test/tests_for_integration/api_to_db_test.py | 183 +++++++++++++++++-- 1 file changed, 172 insertions(+), 11 deletions(-) diff --git 
a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 7f36ce648..09ae0094e 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -94,6 +94,9 @@ MONGO_EE2_DB = "ee2" MONGO_EE2_JOBS_COL = "ee2_jobs" +_MOD = "mod.meth" +_APP = "mod/app" + @fixture(scope="module") def config() -> Dict[str, str]: @@ -417,8 +420,8 @@ def _set_up_workspace_objects(ws_controller, token): def _get_run_job_param_set(): return { - "method": "mod.meth", - "app_id": "mod/app", + "method": _MOD, + "app_id": _APP, "wsid": 1, "source_ws_objects": ["1/1/1", "1/2/1"], "params": [{"foo": "bar"}, 42], @@ -443,7 +446,7 @@ def _get_condor_sub_for_rj_param_set(job_id, user, token, clientgroup, cpu, mem, "+KB_PARENT_JOB_ID": '"totallywrongid"', "+KB_MODULE_NAME": '"mod"', "+KB_FUNCTION_NAME": '"meth"', - "+KB_APP_ID": '"mod/app"', + "+KB_APP_ID": f'"{_APP}"', "+KB_APP_MODULE_NAME": '"mod"', "+KB_WSID": '"1"', "+KB_SOURCE_WS_OBJECTS": '"1/1/1,1/2/1"', @@ -484,10 +487,10 @@ def _check_mongo_job(mongo_client, job_id, user, clientgroup, cpu, mem, disk, gi "status": "queued", "job_input": { "wsid": 1, - "method": "mod.meth", + "method": _MOD, "params": [{"foo": "bar"}, 42], "service_ver": githash, - "app_id": "mod/app", + "app_id": _APP, "source_ws_objects": ["1/1/1", "1/2/1"], "parent_job_id": "totallywrongid", "requirements": { @@ -559,7 +562,7 @@ def test_run_job(ee2_port, ws_controller, mongo_client): def test_run_job_fail_no_workspace_access(ee2_port): - params = {"method": "mod.meth", "app_id": "mod/app", "wsid": 1} + params = {"method": _MOD, "app_id": _APP, "wsid": 1} # this error could probably use some cleanup err = ( "('An error occurred while fetching user permissions from the Workspace', " @@ -569,21 +572,21 @@ def test_run_job_fail_no_workspace_access(ee2_port): def test_run_job_fail_bad_method(ee2_port): - params = {"method": "mod.meth.moke", "app_id": "mod/app"} + params = {"method": 
"mod.meth.moke", "app_id": _APP} err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) def test_run_job_fail_bad_app(ee2_port): - params = {"method": "mod.meth", "app_id": "mod.app"} + params = {"method": _MOD, "app_id": "mod.app"} err = "Application ID 'mod.app' contains a '.'" _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) def test_run_job_fail_bad_upa(ee2_port): params = { - "method": "mod.meth", - "app_id": "mod/app", + "method": _MOD, + "app_id": _APP, "source_ws_objects": ["ws/obj/1"], } err = ( @@ -604,7 +607,7 @@ def test_run_job_fail_no_such_object(ee2_port, ws_controller): ], } ) - params = {"method": "mod.meth", "app_id": "mod/app", "source_ws_objects": ["1/2/1"]} + params = {"method": _MOD, "app_id": _APP, "source_ws_objects": ["1/2/1"]} err = "Some workspace object is inaccessible" _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) @@ -763,3 +766,161 @@ def _run_job_concierge( disk, "somehash", ) + + +def test_run_job_concierge_fail_no_workspace_access(ee2_port): + params = {"method": _MOD, "app_id": _APP, "wsid": 1} + # this error could probably use some cleanup + err = ( + "('An error occurred while fetching user permissions from the Workspace', " + + "ServerError('No workspace with id 1 exists'))" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + + +def test_run_job_concierge_fail_not_concierge(ee2_port): + params = {"method": _MOD, "app_id": _APP} + err = "You are not the concierge user. This method is not for you" + _run_job_concierge_fail(ee2_port, TOKEN_NO_ADMIN, params, {"a": "b"}, err) + + +def test_run_job_concierge_fail_bad_method(ee2_port): + params = {"method": "mod.meth.moke", "app_id": _APP} + err = "Unrecognized method: 'mod.meth.moke'. 
Please input module_name.function_name" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + + +def test_run_job_concierge_fail_reqs_list_not_list(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"requirements_list": {"a": "b"}} + err = "requirements_list must be a list" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_reqs_list_bad_req(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"requirements_list": ["a=b", "touchmymonkey"]} + err = "Found illegal requirement in requirements_list: touchmymonkey" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_cpu(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"request_cpus": [2]} + err = ( + "Found illegal cpu request '[2]' in job requirements from concierge parameters" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_mem(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"request_memory": "-3"} + err = "memory in MB must be at least 1" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_disk(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"request_disk": 4.5} + err = ( + "Found illegal disk request '4.5' in job requirements from concierge parameters" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_clientgroup(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"client_group": "fakefakefake"} + err = "No such clientgroup: fakefakefake" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def 
test_run_job_concierge_fail_bad_clientgroup_regex(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"client_group_regex": "now I have 2 problems"} + err = ( + "Found illegal client group regex 'now I have 2 problems' in job requirements " + + "from concierge parameters" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_catalog_data(ee2_port): + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [{"client_groups": ['{"request_cpus":-8}']}] + + params = {"method": _MOD, "app_id": _APP} + conc_params = {"request_memory": 9} + # TODO this is not a useful error for the user. Need to change the job reqs resolver + # However, getting this wrong in the catalog is not super likely so not urgent + err = "CPU count must be at least 1" + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err + ) + + +def test_run_job_concierge_fail_bad_reqs_item(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"requirements_list": ["a=b", "=c"]} + # this error isn't the greatest but as I understand it the concierge endpoint is going + # to become redundant so don't worry about it for now + err = "Missing input parameter: key in scheduler requirements structure" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_debug_mode(ee2_port): + params = {"method": _MOD, "app_id": _APP} + conc_params = {"debug_mode": "debug debug debug"} + err = ( + "Found illegal debug mode 'debug debug debug' in job requirements from " + + "concierge parameters" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_app(ee2_port): + params = {"method": _MOD, "app_id": "mod.app"} + err = "Application ID 'mod.app' contains a '.'" + 
_run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + + +def test_run_job_concierge_fail_bad_upa(ee2_port): + params = { + "method": _MOD, + "app_id": _APP, + "source_ws_objects": ["ws/obj/1"], + } + err = ( + "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + + +def test_run_job_concierge_fail_no_such_object(ee2_port, ws_controller): + # Set up workspace and objects + wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) + wsc.create_workspace({"workspace": "foo"}) + wsc.save_objects( + { + "id": 1, + "objects": [ + {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, + ], + } + ) + params = {"method": _MOD, "app_id": _APP, "source_ws_objects": ["1/2/1"]} + err = "Some workspace object is inaccessible" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + + +def _run_job_concierge_fail( + ee2_port, token, params, conc_params, expected, throw_exception=False +): + client = ee2client(f"http://localhost:{ee2_port}", token=token) + if throw_exception: + client.run_job_concierge(params, conc_params) + else: + with raises(ServerError) as got: + client.run_job_concierge(params, conc_params) + assert_exception_correct(got.value, ServerError("name", 1, expected)) From dae866d7ed3967c93012c9fdafc6ab15b380daa6 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 29 Apr 2021 09:50:08 -0700 Subject: [PATCH 060/109] Make test pass in docker container (#374) The tests key off `KB_DEPLOYMENT_CONFIG` to find the `deploy.cfg` file, which in the container points to `/kb/module/deploy.cfg`. It appears that that file is generated from the template in `build/templates`, which hadn't been updated with the `concierge` section, so the tests failed.
--- build/templates/deploy.docker.cfg.templ | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build/templates/deploy.docker.cfg.templ b/build/templates/deploy.docker.cfg.templ index e314aee23..849e88219 100644 --- a/build/templates/deploy.docker.cfg.templ +++ b/build/templates/deploy.docker.cfg.templ @@ -47,6 +47,11 @@ transfer_input_files = ../scripts/JobRunner.tgz # Log Level and sending DEBUG=true to the jobs, which means containers do not get cleaned up debug = false +#---------------------------------------------------------------------------------------# +[concierge] +request_cpus = 4 +request_memory = 23000M +request_disk = 100GB #---------------------------------------------------------------------------------------# [njs] request_cpus = 4 From f5f0b595a4f77d17cfd1ca8369afeab8762b9528 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 30 Apr 2021 11:08:23 -0500 Subject: [PATCH 061/109] SCT-3026 Upgrade mongo-engine (#372) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 339d37407..dcf5ca800 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,7 +33,7 @@ maps==5.1.1 MarkupSafe==1.1.1 memory-profiler==0.55.0 mock==3.0.5 -mongoengine==0.18.2 +mongoengine==0.23.0 multidict==4.5.2 nose==1.3.7 packaging==20.4 From be5b132d439a83d30cffda36a48a8bdf86024542 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Fri, 30 Apr 2021 10:03:13 -0700 Subject: [PATCH 062/109] Update mongoengine in pipfile to match reqs.txt (#375) --- Pipfile | 2 +- Pipfile.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Pipfile b/Pipfile index dec351075..d289deab1 100644 --- a/Pipfile +++ b/Pipfile @@ -38,7 +38,7 @@ iniconfig = "==1.1.1" maps = "==5.1.1" memory-profiler = "==0.55.0" mock = "==3.0.5" -mongoengine = "==0.18.2" +mongoengine = "==0.23.0" multidict = "==4.5.2" nose = "==1.3.7" pluggy = "==0.13.1" diff --git a/Pipfile.lock b/Pipfile.lock index 70e4c074b..7fc1df1ba 
100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "b3b343fb441fabbc794f1f8283b2f6737578c7fa482703fb9dce293bb6d91adb" + "sha256": "5583d70e51897cfd1749f9b1dacbe8441656f04501f77e8ff5119ffbdd7b2dbd" }, "pipfile-spec": 6, "requires": { @@ -495,11 +495,11 @@ }, "mongoengine": { "hashes": [ - "sha256:9301ca84ada9377a200a50541f9be7d5308081bf2112049d00e1dd163f80b940", - "sha256:fa3e73c966fca2b814cc1103ac4f55bcca7aae05028b112ef0cc8b321ee4a2f7" + "sha256:136d93af442d867e1d52f90c3d3066112ad91578239d0bf5b8131b3ea15b3d75", + "sha256:4b5a4aa317a138f09df956deaac1d62c1618af24a27c1826a6893709007d9047" ], "index": "pypi", - "version": "==0.18.2" + "version": "==0.23.0" }, "multidict": { "hashes": [ From 3d4cb0a890b1554d0f0decee746b1553b469a020 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Fri, 30 Apr 2021 12:49:07 -0700 Subject: [PATCH 063/109] DATAUP-389 - Add integration tests for run_job_batch (#373) * Add happy path tests for run_job_batch Also update the ee2 client * Add unhappy path run_job_batch integration tests * DRY up some parameters * run black * improve readability of function calls * run black --- dependencies.json | 4 + .../execution_engine2Client.py | 980 +++++++++++------- test/tests_for_integration/api_to_db_test.py | 359 ++++++- 3 files changed, 960 insertions(+), 383 deletions(-) diff --git a/dependencies.json b/dependencies.json index 4553e5758..257a87658 100644 --- a/dependencies.json +++ b/dependencies.json @@ -1,4 +1,8 @@ [ { + "module_name" : "execution_engine2", + "type" : "core", + "file_path" : "./execution_engine2.spec" +}, { "module_name" : "FakeObjectsForTests", "type" : "sdk", "version_tag" : "release" diff --git a/lib/installed_clients/execution_engine2Client.py b/lib/installed_clients/execution_engine2Client.py index 77b02084c..3e1d64f1e 100644 --- a/lib/installed_clients/execution_engine2Client.py +++ b/lib/installed_clients/execution_engine2Client.py @@ -102,50 +102,38 @@ def run_job(self, params, 
context=None): """ Start a new job (long running method of service registered in ServiceRegistery). Such job runs Docker image for this service in script mode. - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + :param params: instance of type "RunJobParams" (method - the SDK + method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. 
Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. 
wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) -> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String :returns: instance of type "job_id" (A job id.) """ @@ -153,66 +141,126 @@ def run_job(self, params, context=None): "execution_engine2.run_job", [params], self._service_ver, context ) + def run_job_batch(self, params, batch_params, context=None): + """ + :param params: instance of list of type "RunJobParams" (method - the + SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional + parameters: service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) -> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) 
-> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, + parameter "wsid" of Long, parameter "parent_job_id" of String + :param batch_params: instance of type "BatchParams" -> structure: + parameter "wsid" of Long + :returns: instance of type "BatchSubmission" -> structure: parameter + "parent_job_id" of type "job_id" (A job id.), parameter + "child_job_ids" of list of type "job_id" (A job id.) + """ + return self._client.call_method( + "execution_engine2.run_job_batch", + [params, batch_params], + self._service_ver, + context, + ) + + def abandon_children(self, params, context=None): + """ + :param params: instance of type "AbandonChildren" -> structure: + parameter "parent_job_id" of type "job_id" (A job id.), parameter + "child_job_ids" of list of type "job_id" (A job id.), parameter + "as_admin" of type "boolean" (@range [0,1]) + :returns: instance of type "BatchSubmission" -> structure: parameter + "parent_job_id" of type "job_id" (A job id.), parameter + "child_job_ids" of list of type "job_id" (A job id.) + """ + return self._client.call_method( + "execution_engine2.abandon_children", [params], self._service_ver, context + ) + def run_job_concierge(self, params, concierge_params, context=None): """ - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + :param params: instance of type "RunJobParams" (method - the SDK + method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String :param concierge_params: instance of type "ConciergeParams" (EE2Constants Concierge Params are request_cpus: int - request_memory: int in MB request_disk: int in MB job_priority: + request_memory: int in MB request_disk: int in GB job_priority: int = None range from -20 to +20, with higher values meaning - better priority. account_group: str = None # Someone elses account - requirements_list: list = None ['machine=worker102','color=red'] - client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can - leave default or specify a clientgroup) -> structure: parameter - "request_cpu" of Long, parameter "request_memory_mb" of Long, - parameter "request_disk_mb" of Long, parameter "job_priority" of - Long, parameter "account_group" of String, parameter - "requirements_list" of list of String, parameter "client_group" of - String + better priority. Note: job_priority is currently not implemented. + account_group: str = None # Someone elses account + ignore_concurrency_limits: ignore any limits on simultaneous job + runs. Default 1 (True). 
requirements_list: list = None + ['machine=worker102','color=red'] client_group: Optional[str] = + CONCIERGE_CLIENTGROUP # You can leave default or specify a + clientgroup client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. debug_mode: Whether to run the job in debug mode. Default + 0 (False).) -> structure: parameter "request_cpu" of Long, + parameter "request_memory" of Long, parameter "request_disk" of + Long, parameter "job_priority" of Long, parameter "account_group" + of String, parameter "ignore_concurrency_limits" of type "boolean" + (@range [0,1]), parameter "requirements_list" of list of String, + parameter "client_group" of String, parameter "client_group_regex" + of type "boolean" (@range [0,1]), parameter "debug_mode" of type + "boolean" (@range [0,1]) :returns: instance of type "job_id" (A job id.) """ return self._client.call_method( @@ -228,50 +276,38 @@ def get_job_params(self, params, context=None): necessary for job execution @optional as_admin) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "RunJobParams" (method - service defined - in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + :returns: instance of type "RunJobParams" (method - the SDK method to + run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String """ return self._client.call_method( @@ -417,49 +453,37 @@ def check_job(self, params, context=None): parameter "job_id" of type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id + of the Narrative application (UI) running this job (e.g. + repo/name) params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -474,6 +498,171 @@ def check_job(self, params, context=None): "execution_engine2.check_job", [params], self._service_ver, context ) + def check_job_batch(self, params, context=None): + """ + get current status of a parent job, and it's children, if it has any. + :param params: instance of type "CheckJobParams" (exclude_fields: + exclude certain fields to return. default None. 
exclude_fields + strings can be one of fields defined in + execution_engine2.db.models.models.Job) -> structure: parameter + "job_id" of type "job_id" (A job id.), parameter "exclude_fields" + of list of String, parameter "as_admin" of type "boolean" (@range + [0,1]) + :returns: instance of type "CheckJobBatchResults" (parent_job - state + of parent job job_states - states of child jobs IDEA: ADD + aggregate_states - count of all available child job states, even + if they are zero) -> structure: parameter "parent_jobstate" of + type "JobState" (job_id - string - id of the job user - string - + user who started the job wsid - int - optional id of the workspace + where the job is bound authstrat - string - what strategy used to + authenticate the job job_input - object - inputs to the job (from + the run_job call) ## TODO - verify updated - int - timestamp + since epoch in milliseconds of the last time the status was + updated running - int - timestamp since epoch in milliseconds of + when it entered the running state created - int - timestamp since + epoch in milliseconds when the job was created finished - int - + timestamp since epoch in milliseconds when the job was finished + status - string - status of the job. one of the following: created + - job has been created in the service estimating - an estimation + job is running to estimate resources required for the main job, + and which queue should be used queued - job is queued to be run + running - job is running on a worker node completed - job was + completed successfully error - job is no longer running, but + failed with an error terminated - job is no longer running, + terminated either due to user cancellation, admin cancellation, or + some automated task error_code - int - internal reason why the job + is an error. 
one of the following: 0 - unknown 1 - job crashed 2 - + job terminated by automation 3 - job ran over time limit 4 - job + was missing its automated output document 5 - job authentication + token expired errormsg - string - message (e.g. stacktrace) + accompanying an errored job error - object - the JSON-RPC error + package that accompanies the error code and message + terminated_code - int - internal reason why a job was terminated, + one of: 0 - user cancellation 1 - admin cancellation 2 - + terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id + of the Narrative application (UI) running this job (e.g. + repo/name) params - the parameters to pass to the method. Optional + parameters: service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. 
+ Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) -> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, + parameter "wsid" of Long, parameter "parent_job_id" of String, + parameter "created" of Long, parameter "queued" of Long, parameter + "estimating" of Long, parameter "running" of Long, parameter + "finished" of Long, parameter "updated" of Long, parameter "error" + of type "JsonRpcError" (Error block of JSON RPC response) -> + structure: parameter "name" of String, parameter "code" of Long, + parameter "message" of String, parameter "error" of String, + parameter "error_code" of Long, parameter "errormsg" of String, + parameter "terminated_code" of Long, parameter "child_jobstates" + of list of type "JobState" (job_id - string - id of the job user - + string - user who started the job wsid - int - optional id of the + workspace where the job is bound authstrat - string - what + strategy used to authenticate the job job_input - object - inputs + to the job (from the run_job call) ## TODO - verify updated - int + - timestamp since epoch in milliseconds of the last time the + status was 
updated running - int - timestamp since epoch in + milliseconds of when it entered the running state created - int - + timestamp since epoch in milliseconds when the job was created + finished - int - timestamp since epoch in milliseconds when the + job was finished status - string - status of the job. one of the + following: created - job has been created in the service + estimating - an estimation job is running to estimate resources + required for the main job, and which queue should be used queued - + job is queued to be run running - job is running on a worker node + completed - job was completed successfully error - job is no + longer running, but failed with an error terminated - job is no + longer running, terminated either due to user cancellation, admin + cancellation, or some automated task error_code - int - internal + reason why the job is an error. one of the following: 0 - unknown + 1 - job crashed 2 - job terminated by automation 3 - job ran over + time limit 4 - job was missing its automated output document 5 - + job authentication token expired errormsg - string - message (e.g. + stacktrace) accompanying an errored job error - object - the + JSON-RPC error package that accompanies the error code and message + terminated_code - int - internal reason why a job was terminated, + one of: 0 - user cancellation 1 - admin cancellation 2 - + terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id + of the Narrative application (UI) running this job (e.g. 
+ repo/name) params - the parameters to pass to the method. Optional + parameters: service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) -> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) 
-> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, + parameter "wsid" of Long, parameter "parent_job_id" of String, + parameter "created" of Long, parameter "queued" of Long, parameter + "estimating" of Long, parameter "running" of Long, parameter + "finished" of Long, parameter "updated" of Long, parameter "error" + of type "JsonRpcError" (Error block of JSON RPC response) -> + structure: parameter "name" of String, parameter "code" of Long, + parameter "message" of String, parameter "error" of String, + parameter "error_code" of Long, parameter "errormsg" of String, + parameter "terminated_code" of Long + """ + return self._client.call_method( + "execution_engine2.check_job_batch", [params], self._service_ver, context + ) + def check_jobs(self, params, context=None): """ :param params: instance of type "CheckJobsParams" (As in check_job, @@ -519,49 +708,37 @@ def check_jobs(self, params, context=None): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -622,49 +799,37 @@ def check_workspace_jobs(self, params, context=None): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. 
Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -740,14 +905,30 @@ def check_jobs_date_range_for_user(self, params, context=None): """ :param params: instance of type "CheckJobsDateRangeParams" (Check job for all jobs in a given date/time range for all users (Admin - function) float start_time; # Filter based on creation timestamp - since epoch float end_time; # Filter based on creation timestamp - since epoch list projection; # A list of fields to include - in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, - such as error_code=1, wsid=1234, terminated_code = 1 int limit; # - The maximum number of records to return string user; # Optional. - Defaults off of your token @optional projection @optional filter + function) Notes on start_time and end_time: These fields are + designated as floats but floats, ints, and strings are all + accepted. 
Times are determined as follows: - if the field is a + float or a string that contains a float and only a float, the + field value is treated as seconds since the epoch. - if the field + is an int or a string that contains an int and only an int, the + field value is treated as milliseconds since the epoch. - if the + field is a string not matching the criteria above, it is treated + as a date and time. Nearly any unambigous format can be parsed. + float start_time - Filter based on job creation timestamp since + epoch float end_time - Filter based on job creation timestamp + since epoch list projection - A list of fields to include + in the projection, default ALL See "Projection Fields" above + list filter - DEPRECATED: this field may change or be + removed in the future. A list of simple filters to "AND" together, + such as error_code=1, wsid=1234, terminated_code = 1 int limit - + The maximum number of records to return string user - The user + whose job records will be returned. Optional. Default is the + current user. int offset - the number of jobs to skip before + returning records. boolean ascending - true to sort by job ID + ascending, false descending. boolean as_admin - true to run the + query as an admin; user must have admin EE2 permissions. Required + if setting `user` to something other than your own. 
TODO: this + seems to have no effect @optional projection @optional filter @optional limit @optional user @optional offset @optional ascending) -> structure: parameter "start_time" of Double, parameter "end_time" of Double, parameter "projection" of list of @@ -755,9 +936,34 @@ def check_jobs_date_range_for_user(self, params, context=None): Long, parameter "user" of String, parameter "offset" of Long, parameter "ascending" of type "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs could be mapping or list) -> - structure: parameter "job_states" of list of type "JobState" + :returns: instance of type "CheckJobsDateRangeResults" (Projection + Fields user = StringField(required=True) authstrat = StringField( + required=True, default="kbaseworkspace", + validation=valid_authstrat ) wsid = IntField(required=False) + status = StringField(required=True, validation=valid_status) + updated = DateTimeField(default=datetime.datetime.utcnow, + autonow=True) estimating = DateTimeField(default=None) # Time + when job began estimating running = DateTimeField(default=None) # + Time when job started # Time when job finished, errored out, or + was terminated by the user/admin finished = + DateTimeField(default=None) errormsg = StringField() msg = + StringField() error = DynamicField() terminated_code = + IntField(validation=valid_termination_code) error_code = + IntField(validation=valid_errorcode) scheduler_type = + StringField() scheduler_id = StringField() scheduler_estimator_id + = StringField() job_input = EmbeddedDocumentField(JobInput, + required=True) job_output = DynamicField() /* /* Results of + check_jobs_date_range methods. jobs - the jobs matching the query, + up to `limit` jobs. count - the number of jobs returned. + query_count - the number of jobs that matched the filters. filter + - DEPRECATED - this field may change in the future. 
The filters + that were applied to the jobs. skip - the number of jobs that were + skipped prior to beginning to return jobs. projection - the list + of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. sort_order - the + order in which the results were sorted by the job ID - + for + ascending, - for descending. TODO: DOCUMENT THE RETURN OF STATS + mapping) -> structure: parameter "jobs" of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the @@ -791,49 +997,37 @@ def check_jobs_date_range_for_user(self, params, context=None): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. 
app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -842,7 +1036,11 @@ def check_jobs_date_range_for_user(self, params, context=None): structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + parameter "terminated_code" of Long, parameter "count" of Long, + parameter "query_count" of Long, parameter "filter" of mapping + from String to String, parameter "skip" of Long, parameter + "projection" of list of String, parameter "limit" of Long, + parameter "sort_order" of String """ return self._client.call_method( "execution_engine2.check_jobs_date_range_for_user", @@ -855,14 +1053,30 @@ def check_jobs_date_range_for_all(self, params, context=None): """ :param params: instance of type "CheckJobsDateRangeParams" (Check job for all jobs in a given date/time range for all users (Admin - function) float start_time; # Filter based on creation timestamp - since 
epoch float end_time; # Filter based on creation timestamp - since epoch list projection; # A list of fields to include - in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, - such as error_code=1, wsid=1234, terminated_code = 1 int limit; # - The maximum number of records to return string user; # Optional. - Defaults off of your token @optional projection @optional filter + function) Notes on start_time and end_time: These fields are + designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: - if the field is a + float or a string that contains a float and only a float, the + field value is treated as seconds since the epoch. - if the field + is an int or a string that contains an int and only an int, the + field value is treated as milliseconds since the epoch. - if the + field is a string not matching the criteria above, it is treated + as a date and time. Nearly any unambiguous format can be parsed. + float start_time - Filter based on job creation timestamp since + epoch float end_time - Filter based on job creation timestamp + since epoch list projection - A list of fields to include + in the projection, default ALL See "Projection Fields" above + list filter - DEPRECATED: this field may change or be + removed in the future. A list of simple filters to "AND" together, + such as error_code=1, wsid=1234, terminated_code = 1 int limit - + The maximum number of records to return string user - The user + whose job records will be returned. Optional. Default is the + current user. int offset - the number of jobs to skip before + returning records. boolean ascending - true to sort by job ID + ascending, false descending. boolean as_admin - true to run the + query as an admin; user must have admin EE2 permissions. Required + if setting `user` to something other than your own. 
TODO: this + seems to have no effect @optional projection @optional filter @optional limit @optional user @optional offset @optional ascending) -> structure: parameter "start_time" of Double, parameter "end_time" of Double, parameter "projection" of list of @@ -870,9 +1084,34 @@ def check_jobs_date_range_for_all(self, params, context=None): Long, parameter "user" of String, parameter "offset" of Long, parameter "ascending" of type "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs could be mapping or list) -> - structure: parameter "job_states" of list of type "JobState" + :returns: instance of type "CheckJobsDateRangeResults" (Projection + Fields user = StringField(required=True) authstrat = StringField( + required=True, default="kbaseworkspace", + validation=valid_authstrat ) wsid = IntField(required=False) + status = StringField(required=True, validation=valid_status) + updated = DateTimeField(default=datetime.datetime.utcnow, + autonow=True) estimating = DateTimeField(default=None) # Time + when job began estimating running = DateTimeField(default=None) # + Time when job started # Time when job finished, errored out, or + was terminated by the user/admin finished = + DateTimeField(default=None) errormsg = StringField() msg = + StringField() error = DynamicField() terminated_code = + IntField(validation=valid_termination_code) error_code = + IntField(validation=valid_errorcode) scheduler_type = + StringField() scheduler_id = StringField() scheduler_estimator_id + = StringField() job_input = EmbeddedDocumentField(JobInput, + required=True) job_output = DynamicField() /* /* Results of + check_jobs_date_range methods. jobs - the jobs matching the query, + up to `limit` jobs. count - the number of jobs returned. + query_count - the number of jobs that matched the filters. filter + - DEPRECATED - this field may change in the future. 
The filters + that were applied to the jobs. skip - the number of jobs that were + skipped prior to beginning to return jobs. projection - the list + of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. sort_order - the + order in which the results were sorted by the job ID - + for + ascending, - for descending. TODO: DOCUMENT THE RETURN OF STATS + mapping) -> structure: parameter "jobs" of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the @@ -906,49 +1145,37 @@ def check_jobs_date_range_for_all(self, params, context=None): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + params - the parameters to pass to the method. Optional parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. 
app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. run_job_batch ignores this + parameter when starting a job batch.) 
-> structure: parameter + "method" of String, parameter "app_id" of String, parameter + "params" of list of unspecified object, parameter "service_ver" of + String, parameter "source_ws_objects" of list of type "wsref" (A + workspace object reference of the form X/Y or X/Y/Z, where X is + the workspace id, Y is the object id, Z is the version.), + parameter "meta" of type "Meta" (Narrative metadata for a job. All + fields are optional. run_id - the Narrative-assigned ID of the job + run. 1:1 with a job ID. token_id - the ID of the token used to run + the method. tag - the release tag, e.g. dev/beta/release. cell_id + - the ID of the narrative cell from which the job was run.) -> + structure: parameter "run_id" of String, parameter "token_id" of + String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter "parent_job_id" of String, parameter "created" of Long, parameter "queued" of Long, parameter "estimating" of Long, parameter "running" of Long, parameter @@ -957,7 +1184,11 @@ def check_jobs_date_range_for_all(self, params, context=None): structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + parameter "terminated_code" of Long, parameter "count" of Long, + parameter "query_count" of Long, parameter "filter" of mapping + from String to String, parameter "skip" of Long, parameter + "projection" of list of String, parameter "limit" of Long, + parameter "sort_order" of String """ return self._client.call_method( "execution_engine2.check_jobs_date_range_for_all", @@ -993,7 +1224,7 @@ def get_admin_permission(self, context=None): """ Check if current user has ee2 admin rights. 
If so, return the type of rights and their roles - :returns: instance of type "AdminRolesResults" (str permission; # One + :returns: instance of type "AdminRolesResults" (str permission - One of 'r|w|x' (('read' | 'write' | 'none'))) -> structure: parameter "permission" of String """ @@ -1003,6 +1234,7 @@ def get_admin_permission(self, context=None): def get_client_groups(self, context=None): """ + Get a list of clientgroups manually extracted from the config file :returns: instance of list of String """ return self._client.call_method( diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 09ae0094e..918c22654 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -37,7 +37,7 @@ import pymongo from pytest import fixture, raises from typing import Dict -from unittest.mock import patch, create_autospec, ANY +from unittest.mock import patch, create_autospec, ANY, call from tests_for_integration.auth_controller import AuthController from tests_for_integration.workspace_controller import WorkspaceController @@ -404,12 +404,12 @@ def _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub): sub.queue.assert_called_once_with(txn, 1) -def _set_up_workspace_objects(ws_controller, token): +def _set_up_workspace_objects(ws_controller, token, ws_name="foo"): wsc = Workspace(ws_controller.get_url(), token=token) - wsc.create_workspace({"workspace": "foo"}) + wsid = wsc.create_workspace({"workspace": ws_name})[0] wsc.save_objects( { - "id": 1, + "id": wsid, "objects": [ {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, {"name": "two", "type": "Trivial.Object-1.0", "data": {}}, @@ -437,13 +437,15 @@ def _get_run_job_param_set(): } -def _get_condor_sub_for_rj_param_set(job_id, user, token, clientgroup, cpu, mem, disk): +def _get_condor_sub_for_rj_param_set( + job_id, user, token, clientgroup, cpu, mem, disk, parent_job_id="totallywrongid" +): 
expected_sub = _get_common_sub(job_id) expected_sub.update( { "JobBatchName": job_id, "arguments": f"{job_id} https://ci.kbase.us/services/ee2", - "+KB_PARENT_JOB_ID": '"totallywrongid"', + "+KB_PARENT_JOB_ID": f'"{parent_job_id}"', "+KB_MODULE_NAME": '"mod"', "+KB_FUNCTION_NAME": '"meth"', "+KB_APP_ID": f'"{_APP}"', @@ -461,7 +463,7 @@ def _get_condor_sub_for_rj_param_set(job_id, user, token, clientgroup, cpu, mem, '"DOCKER_JOB_TIMEOUT=604805 KB_ADMIN_AUTH_TOKEN=test_auth_token ' + f"KB_AUTH_TOKEN={token} CLIENTGROUP={clientgroup} JOB_ID={job_id} " + "CONDOR_ID=$(Cluster).$(Process) PYTHON_EXECUTABLE=/miniconda/bin/python " - + 'DEBUG_MODE=False PARENT_JOB_ID=totallywrongid "' + + f'DEBUG_MODE=False PARENT_JOB_ID={parent_job_id} "' ), "leavejobinqueue": "true", "initial_dir": "../scripts/", @@ -473,12 +475,19 @@ def _get_condor_sub_for_rj_param_set(job_id, user, token, clientgroup, cpu, mem, return expected_sub -def _check_mongo_job(mongo_client, job_id, user, clientgroup, cpu, mem, disk, githash): +def _get_mongo_job(mongo_client, job_id, has_queued=True): + # also checks and removes the queued and updated times job = mongo_client[MONGO_EE2_DB][MONGO_EE2_JOBS_COL].find_one( {"_id": ObjectId(job_id)} ) assert_close_to_now(job.pop("updated")) - assert_close_to_now(job.pop("queued")) + if has_queued: + assert_close_to_now(job.pop("queued")) + return job + + +def _check_mongo_job(mongo_client, job_id, user, clientgroup, cpu, mem, disk, githash): + job = _get_mongo_job(mongo_client, job_id) expected_job = { "_id": ObjectId(job_id), "user": user, @@ -924,3 +933,335 @@ def _run_job_concierge_fail( with raises(ServerError) as got: client.run_job_concierge(params, conc_params) assert_exception_correct(got.value, ServerError("name", 1, expected)) + + +######## run_job_batch tests ######## + + +def test_run_job_batch(ee2_port, ws_controller, mongo_client): + """ + A test of the run_job method. 
+ """ + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "foo") # ws 1 + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "bar") # ws 2 + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? + with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.side_effect = [123, 456] + list_cgroups.side_effect = [ + [{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + [{"client_groups": ['{"client_group":"bigmem"}']}], + ] + get_mod_ver.side_effect = [ + {"git_commit_hash": "somehash"}, + {"git_commit_hash": "somehash2"}, + ] + + # run the method + job1_params = { + "method": _MOD, + "app_id": _APP, + "source_ws_objects": ["1/1/1", "1/2/1"], + "params": [{"foo": "bar"}, 42], + "service_ver": "beta", + "meta": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + "thiskey": "getssilentlydropped", + }, + } + job2_params = { + "method": "mod2.meth2", + "app_id": "mod2/app2", + "wsid": 1, + "params": [{"baz": "bat"}, 3.14], + } + job_batch_params = { + "wsid": 2, + "meta": { + "run_id": "rid2", + "token_id": "tid2", + "tag": "yourit2", + "cell_id": "cid2", + "thiskey": "getssilentlydropped2", + }, + } + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) + ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) + parent_job_id = ret["parent_job_id"] + job_id_1, job_id_2 = ret["child_job_ids"] + + # check that mocks were called correctly + # Since these are class 
methods, the first argument is self, which we ignore + get_mod_ver.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "version": "beta"}), + call(ANY, {"module_name": "mod2", "version": "release"}), + ] + ) + list_cgroups.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "function_name": "meth"}), + call(ANY, {"module_name": "mod2", "function_name": "meth2"}), + ] + ) + + job1 = _get_mongo_job(mongo_client, job_id_1) + job2 = _get_mongo_job(mongo_client, job_id_2) + + expected_job1 = { + "_id": ObjectId(job_id_1), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "job_input": { + "method": _MOD, + "params": [{"foo": "bar"}, 42], + "service_ver": "somehash", + "app_id": _APP, + "source_ws_objects": ["1/1/1", "1/2/1"], + "parent_job_id": parent_job_id, + "requirements": { + "clientgroup": "njs", + "cpu": 8, + "memory": 5, + "disk": 30, + }, + "narrative_cell_info": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + }, + }, + "child_jobs": [], + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + } + assert job1 == expected_job1 + + expected_job2 = { + "_id": ObjectId(job_id_2), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": 1, + "status": "queued", + "job_input": { + "wsid": 1, + "method": "mod2.meth2", + "params": [{"baz": "bat"}, 3.14], + "service_ver": "somehash2", + "app_id": "mod2/app2", + "source_ws_objects": [], + "parent_job_id": parent_job_id, + "requirements": { + "clientgroup": "bigmem", + "cpu": 4, + "memory": 2000, + "disk": 100, + }, + "narrative_cell_info": {}, + }, + "child_jobs": [], + "batch_job": False, + "scheduler_id": "456", + "scheduler_type": "condor", + } + assert job2 == expected_job2 + + parent_job = _get_mongo_job(mongo_client, parent_job_id, has_queued=False) + expected_parent_job = { + "_id": ObjectId(parent_job_id), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": 2, + "status": "created", + 
"job_input": { + "method": "batch", + "service_ver": "batch", + "app_id": "batch", + "source_ws_objects": [], + "narrative_cell_info": { + "run_id": "rid2", + "token_id": "tid2", + "tag": "yourit2", + "cell_id": "cid2", + }, + }, + "child_jobs": [job_id_1, job_id_2], + "batch_job": True, + } + assert parent_job == expected_parent_job + + expected_sub_1 = _get_condor_sub_for_rj_param_set( + job_id_1, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="njs", + cpu=8, + mem=5, + disk=30, + parent_job_id=parent_job_id, + ) + expected_sub_1["+KB_WSID"] = "" + expected_sub_2 = _get_condor_sub_for_rj_param_set( + job_id_2, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="bigmem", + cpu=4, + mem=2000, + disk=100, + parent_job_id=parent_job_id, + ) + expected_sub_2.update( + { + "+KB_MODULE_NAME": '"mod2"', + "+KB_FUNCTION_NAME": '"meth2"', + "+KB_APP_ID": '"mod2/app2"', + "+KB_APP_MODULE_NAME": '"mod2"', + "+KB_SOURCE_WS_OBJECTS": "", + } + ) + + assert sub_init.call_args_list == [call(expected_sub_1), call(expected_sub_2)] + # The line above and the line below should be completely equivalent IIUC, but the line + # below fails for reasons I don't understand. The error output shows the actual calls + # for the line below having 2 extra calls that appear to be the sub.queue calls + # below. Stumped, so going with what works and moving on. + # sub_init.assert_has_calls([call(expected_sub_1), call(expected_sub_2)]) + schedd_init.call_args_list = [call(), call()] + # same deal here. Output includes stuff like `call().transaction()` so + # it appears the sub calls are being picked up, which is weird. 
+ # schedd_init.assert_has_calls([call(), call()]) + schedd.transaction.call_args_list = [call(), call()] + # and again + # schedd.transaction.assert_has_calls([call(), call()]) + sub.queue.assert_has_calls([call(txn, 1), call(txn, 1)]) + + +def test_run_job_batch_fail_no_workspace_access_for_batch(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [{"method": _MOD, "app_id": _APP}] + # this error could probably use some cleanup + err = ( + "('An error occurred while fetching user permissions from the Workspace', " + + "ServerError('No workspace with id 2 exists'))" + ) + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 2}, err) + + +def test_run_job_batch_fail_no_workspace_access_for_job(ee2_port): + params = [ + {"method": _MOD, "app_id": _APP}, + {"method": _MOD, "app_id": _APP, "wsid": 1}, + ] + # this error could probably use some cleanup + err = ( + "('An error occurred while fetching user permissions from the Workspace', " + + "ServerError('No workspace with id 1 exists'))" + ) + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_catalog_data(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [{"client_groups": ['{"request_cpus":-8}']}] + + params = [{"method": _MOD, "app_id": _APP}] + # TODO this is not a useful error for the user. 
Need to change the job reqs resolver + # However, getting this wrong in the catalog is not super likely so not urgent + err = "CPU count must be at least 1" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_method(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + {"method": _MOD, "app_id": _APP}, + {"method": "mod.meth.moke", "app_id": _APP}, + ] + err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" + # TODO this test surfaced a bug - if a batch wsid is not supplied and any job does not have + # a wsid an error occurs + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_app(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [{"method": _MOD, "app_id": "mod.app"}] + err = "Application ID 'mod.app' contains a '.'" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_upa(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + { + "method": _MOD, + "app_id": _APP, + "source_ws_objects": ["ws/obj/1"], + } + ] + err = ( + "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" + ) + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_parent_id(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + + params = [{"method": _MOD, "app_id": _APP, "parent_job_id": "ae"}] + err = "Batch jobs may not specify a parent job ID" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + params = [ + {"method": _MOD, "app_id": _APP}, + {"method": _MOD, "app_id": _APP, "parent_job_id": "ae"}, + ] + err = "Job #2: batch jobs may not specify a parent job ID" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, 
{"wsid": 1}, err) + + +def test_run_job_batch_fail_no_such_object(ee2_port, ws_controller): + # Set up workspace and objects + wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) + wsc.create_workspace({"workspace": "foo"}) + wsc.save_objects( + { + "id": 1, + "objects": [ + {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, + ], + } + ) + params = [{"method": _MOD, "app_id": _APP, "source_ws_objects": ["1/2/1"]}] + err = "Some workspace object is inaccessible" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def _run_job_batch_fail( + ee2_port, token, params, batch_params, expected, throw_exception=False +): + client = ee2client(f"http://localhost:{ee2_port}", token=token) + if throw_exception: + client.run_job_batch(params, batch_params) + else: + with raises(ServerError) as got: + client.run_job_batch(params, batch_params) + assert_exception_correct(got.value, ServerError("name", 1, expected)) From 98d90bad3cd1f311b88320628abb79a896de89fa Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 3 May 2021 12:56:09 -0700 Subject: [PATCH 064/109] DATAUP-389 - Make tests pass when CI is down (#377) * Make ee2_admin_mode_test.py pass when CI down One test apparently deliberately contacts the auth service. For now just providing a toggle to skip it, but auth should probably be mocked out and/or the test moved to an integration test or removed altogether. 
* Make SDK_EE2Runjob_test run when CI is down * Make ee2_load_test pass when CI is down * Make api_to_db_test.py pass when CI is down That catalog call is tricky because even totally fake input gives a valid return * run black * Make mock names more readable --- test/tests_for_auth/ee2_admin_mode_test.py | 42 ++++++++----- test/tests_for_integration/api_to_db_test.py | 60 +++++++++++++++---- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 13 +++- test/tests_for_sdkmr/ee2_load_test.py | 11 +++- test/utils_shared/mock_utils.py | 25 ++++---- 5 files changed, 109 insertions(+), 42 deletions(-) diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index 93d569476..315d2bbdb 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -32,6 +32,11 @@ from test.utils_shared.mock_utils import get_client_mocks as _get_client_mocks +# Cause any tests that contact external services (e.g. KBASE CI auth) as part of the test to +# pass automatically. 
+SKIP_TESTS_WITH_EXTERNALITIES = False + + class EE2TestAdminMode(unittest.TestCase): @classmethod def setUpClass(cls): @@ -121,17 +126,18 @@ def get_user_mocks( def get_client_mocks(self, *to_be_mocked): return _get_client_mocks(self.cfg, self.config_file, *to_be_mocked) - @patch.object( - Catalog, - "get_module_version", - return_value={"git_commit_hash": "moduleversiongoeshere"}, - ) - def test_regular_user(self, catalog): + def test_regular_user(self): # Regular User lowly_user = "Access Denied: You are not an administrator" user_client_set, _, ws_auth = self.get_user_mocks() - clients_and_mocks = self.get_client_mocks(AdminAuthUtil) + clients_and_mocks = self.get_client_mocks(AdminAuthUtil, Catalog) aau = clients_and_mocks[AdminAuthUtil] + catalog = clients_and_mocks[Catalog] + # TODO check catalog called as expected + catalog.get_module_version.return_value = { + "git_commit_hash": "moduleversiongoeshere" + } + catalog.list_client_group_configs.return_value = [] aau.get_admin_role.return_value = None ws_auth.can_write.return_value = True runner = self.getRunner(user_client_set, clients_and_mocks[ClientSet]) @@ -212,18 +218,19 @@ def test_regular_user(self, catalog): # Start the job and get its status as an admin - @patch.object( - Catalog, - "get_module_version", - return_value={"git_commit_hash": "moduleversiongoeshere"}, - ) @patch.object(WorkspaceAuth, "can_write", return_value=True) - def test_admin_writer(self, workspace, catalog): + def test_admin_writer(self, workspace): # Admin User with WRITE - clients_and_mocks = self.get_client_mocks(AdminAuthUtil) + clients_and_mocks = self.get_client_mocks(AdminAuthUtil, Catalog) clients = clients_and_mocks[ClientSet] adminauth = clients_and_mocks[AdminAuthUtil] + catalog = clients_and_mocks[Catalog] + # TODO check catalog called as expected + catalog.get_module_version.return_value = { + "git_commit_hash": "moduleversiongoeshere" + } + catalog.list_client_group_configs.return_value = [] runner = 
self.getRunner(None, clients) adminauth.get_admin_role.return_value = ADMIN_READ_ROLE @@ -274,7 +281,12 @@ def test_admin_writer(self, workspace, catalog): # These tests should throw the most errors def test_no_user(self): - # No Token + if SKIP_TESTS_WITH_EXTERNALITIES: + return + # Passes a fake token to the auth server, guaranteed to fail. + # Auth is *not mocked*, hits the real auth service. Will fail if CI is down. + # Not sure of the value of this test - if a client actually passes a bad token to the + # server it'll get caught in the Server.py file before the Impl file is reached. runner = self.getRunner() method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 918c22654..31ca2adb1 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -589,7 +589,9 @@ def test_run_job_fail_bad_method(ee2_port): def test_run_job_fail_bad_app(ee2_port): params = {"method": _MOD, "app_id": "mod.app"} err = "Application ID 'mod.app' contains a '.'" - _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) def test_run_job_fail_bad_upa(ee2_port): @@ -601,7 +603,9 @@ def test_run_job_fail_bad_upa(ee2_port): err = ( "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" ) - _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) def test_run_job_fail_no_such_object(ee2_port, ws_controller): @@ -618,7 +622,9 @@ def test_run_job_fail_no_such_object(ee2_port, ws_controller): ) params = {"method": _MOD, 
"app_id": _APP, "source_ws_objects": ["1/2/1"]} err = "Some workspace object is inaccessible" - _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) def _run_job_fail(ee2_port, token, params, expected, throw_exception=False): @@ -842,7 +848,11 @@ def test_run_job_concierge_fail_bad_clientgroup(ee2_port): params = {"method": _MOD, "app_id": _APP} conc_params = {"client_group": "fakefakefake"} err = "No such clientgroup: fakefakefake" - _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err + ) def test_run_job_concierge_fail_bad_clientgroup_regex(ee2_port): @@ -891,7 +901,11 @@ def test_run_job_concierge_fail_bad_debug_mode(ee2_port): def test_run_job_concierge_fail_bad_app(ee2_port): params = {"method": _MOD, "app_id": "mod.app"} err = "Application ID 'mod.app' contains a '.'" - _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err + ) def test_run_job_concierge_fail_bad_upa(ee2_port): @@ -903,7 +917,11 @@ def test_run_job_concierge_fail_bad_upa(ee2_port): err = ( "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" ) - _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err + ) def 
test_run_job_concierge_fail_no_such_object(ee2_port, ws_controller): @@ -920,7 +938,11 @@ def test_run_job_concierge_fail_no_such_object(ee2_port, ws_controller): ) params = {"method": _MOD, "app_id": _APP, "source_ws_objects": ["1/2/1"]} err = "Some workspace object is inaccessible" - _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err + ) def _run_job_concierge_fail( @@ -1198,14 +1220,18 @@ def test_run_job_batch_fail_bad_method(ee2_port, ws_controller): err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" # TODO this test surfaced a bug - if a batch wsid is not supplied and any job does not have # a wsid an error occurs - _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) def test_run_job_batch_fail_bad_app(ee2_port, ws_controller): _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) params = [{"method": _MOD, "app_id": "mod.app"}] err = "Application ID 'mod.app' contains a '.'" - _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) def test_run_job_batch_fail_bad_upa(ee2_port, ws_controller): @@ -1220,7 +1246,9 @@ def test_run_job_batch_fail_bad_upa(ee2_port, ws_controller): err = ( "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" ) - _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + with patch(CAT_LIST_CLIENT_GROUPS, 
spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) def test_run_job_batch_fail_parent_id(ee2_port, ws_controller): @@ -1228,14 +1256,18 @@ def test_run_job_batch_fail_parent_id(ee2_port, ws_controller): params = [{"method": _MOD, "app_id": _APP, "parent_job_id": "ae"}] err = "Batch jobs may not specify a parent job ID" - _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) params = [ {"method": _MOD, "app_id": _APP}, {"method": _MOD, "app_id": _APP, "parent_job_id": "ae"}, ] err = "Job #2: batch jobs may not specify a parent job ID" - _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) def test_run_job_batch_fail_no_such_object(ee2_port, ws_controller): @@ -1252,7 +1284,9 @@ def test_run_job_batch_fail_no_such_object(ee2_port, ws_controller): ) params = [{"method": _MOD, "app_id": _APP, "source_ws_objects": ["1/2/1"]}] err = "Some workspace object is inaccessible" - _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) def _run_job_batch_fail( diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 1d9823246..e71df36c8 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ 
b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -18,6 +18,8 @@ get_user_client_set, ) from execution_engine2.sdk.job_submission_parameters import JobRequirements +from installed_clients.CatalogClient import Catalog + from test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -85,7 +87,8 @@ def test_init_ok(self): runner = self.getRunner() self.assertTrue(set(class_attri) <= set(runner.__dict__.keys())) - def test_init_job_rec(self): + @patch.object(Catalog, "get_module_version") + def test_init_job_rec(self, get_mod_ver): ori_job_count = Job.objects.count() runner = self.getRunner() @@ -111,8 +114,16 @@ def test_init_job_rec(self): "meta": {"tag": "dev", "token_id": "12345"}, } + get_mod_ver.return_value = { + "git_commit_hash": "048baf3c2b76cb923b3b4c52008ed77dbe20292d" + } + job_id = runner.get_runjob()._init_job_rec(self.user_id, job_params) + get_mod_ver.assert_called_once_with( + {"module_name": "MEGAHIT", "version": "2.2.1"} + ) + self.assertEqual(ori_job_count, Job.objects.count() - 1) job = Job.objects.get(id=job_id) diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 3a98b233f..95bf8c740 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -245,13 +245,20 @@ def update_states(index, job_ids_queued, job_ids_running, job_ids_finish): @patch.object(Condor, "run_job", return_value=si) @patch.object(WorkspaceAuth, "can_write", return_value=True) + @patch( + "installed_clients.CatalogClient.Catalog.list_client_group_configs", + autospec=True, + ) @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) @patch("installed_clients.CatalogClient.Catalog.log_exec_stats", autospec=True) - def test_run_job_stress(self, ccles, cc, workspace, condor): + def test_run_job_stress( + self, cc_log_stats, cc_get_mod_ver, cc_list_cli_configs, workspace, condor + ): """ testing running 3 different jobs in multiple theads. 
""" - cc.return_value = {"git_commit_hash": "moduleversiongoeshere"} + cc_get_mod_ver.return_value = {"git_commit_hash": "moduleversiongoeshere"} + cc_list_cli_configs.return_value = [] thread_count = self.thread_count # threads to test diff --git a/test/utils_shared/mock_utils.py b/test/utils_shared/mock_utils.py index 2acdcae78..a3d41c0a3 100644 --- a/test/utils_shared/mock_utils.py +++ b/test/utils_shared/mock_utils.py @@ -14,29 +14,29 @@ from execution_engine2.utils.clients import ClientSet -def _build_job_reqs(config, cfgfile): +def _build_job_reqs(config, cfgfile, impls): with open(cfgfile) as cf: - return JobRequirementsResolver(Catalog(config["catalog-url"]), cf) + return JobRequirementsResolver(impls[Catalog], cf) _CLASS_IMPLEMENTATION_BUILDERS = { - KBaseAuth: lambda config, cfgfile: KBaseAuth( + KBaseAuth: lambda config, cfgfile, impls: KBaseAuth( auth_url=config["auth-url"] + "/api/legacy/KBase/Sessions/Login" ), - AdminAuthUtil: lambda config, cfgfile: AdminAuthUtil( + AdminAuthUtil: lambda config, cfgfile, impls: AdminAuthUtil( config["auth-url"], [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE] ), - Condor: lambda config, cfgfile: Condor(config), - Catalog: lambda config, cfgfile: Catalog(config["catalog-url"]), + Condor: lambda config, cfgfile, impls: Condor(config), + Catalog: lambda config, cfgfile, impls: Catalog(config["catalog-url"]), JobRequirementsResolver: _build_job_reqs, - KafkaClient: lambda config, cfgfile: KafkaClient(config["kafka-host"]), - MongoUtil: lambda config, cfgfile: MongoUtil(config), - SlackClient: lambda config, cfgfile: SlackClient( + KafkaClient: lambda config, cfgfile, impls: KafkaClient(config["kafka-host"]), + MongoUtil: lambda config, cfgfile, impls: MongoUtil(config), + SlackClient: lambda config, cfgfile, impls: SlackClient( config["slack-token"], debug=True, endpoint=config["ee2-url"] ), } -ALL_CLIENTS = _CLASS_IMPLEMENTATION_BUILDERS.keys() +ALL_CLIENTS = sorted(_CLASS_IMPLEMENTATION_BUILDERS.keys(), key=lambda x: x.__name__) 
def get_client_mocks(config, config_path, *to_be_mocked): @@ -56,7 +56,10 @@ def get_client_mocks(config, config_path, *to_be_mocked): if clazz in to_be_mocked: ret[clazz] = create_autospec(clazz, instance=True, spec_set=True) else: - ret[clazz] = _CLASS_IMPLEMENTATION_BUILDERS[clazz](config, config_path) + # this is a hack - only one client depends on another (JRR -> Cat) + # so we rely on the ALL_CLIENTS sort to make sure the dependency is built before the + # dependent module. If things become more complicated we'll need a dependency graph. + ret[clazz] = _CLASS_IMPLEMENTATION_BUILDERS[clazz](config, config_path, ret) ret[ClientSet] = ClientSet( ret[KBaseAuth], ret[AdminAuthUtil], From 9f2091141f2e91c3c612fbde8f380eca48a437e9 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 4 May 2021 11:08:51 -0700 Subject: [PATCH 065/109] DATAUP-389 - relax app_id requirements (#380) * Make app_id optional and relax requirements There are some KBase processes (KBParallel and Narrative downloads, perhaps more) where it doesn't make sense to provide an app_id but app_id is currently a required field. This PR: - makes app_id an optional field - allows a . rather than a / as the app ID separator as Narrative downloads use the method ID as the app ID - allows just the module name as the app ID as KBParallel sends that Based on an analysis of the production EE2 DB, there aren't any other regular cases of bad app IDs. In the future, as processes stop submitting incorrect app_ids, we may be able to tighten the restrictions on app IDs again. * Since app_id is no longer required, remove from tests ... 
where it's a red herring * run black * minor typo --- lib/execution_engine2/db/models/models.py | 2 +- lib/execution_engine2/sdk/EE2Runjob.py | 2 +- lib/execution_engine2/utils/Condor.py | 4 +- .../utils/application_info.py | 120 +++++++++----- test/tests_for_integration/api_to_db_test.py | 146 +++++++++++------- test/tests_for_sdkmr/EE2Runjob_test.py | 39 +++-- test/tests_for_utils/Condor_test.py | 6 +- test/tests_for_utils/application_info_test.py | 89 +++++++---- 8 files changed, 256 insertions(+), 152 deletions(-) diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index 70a5af1ad..696dc5d62 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -163,7 +163,7 @@ class JobInput(EmbeddedDocument): requested_release = StringField() params = DynamicField() service_ver = StringField(required=True) - app_id = StringField(required=True) + app_id = StringField() source_ws_objects = ListField() parent_job_id = StringField() requirements = EmbeddedDocumentField(JobRequirements) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 37c71c47b..32b35d01f 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -204,7 +204,7 @@ def _prepare_to_run(self, params, concierge_params=None) -> JobSubmissionParamet return JobSubmissionParameters( job_id, - AppInfo(params[_METHOD], params[_APP_ID]), + AppInfo(params[_METHOD], params.get(_APP_ID)), params[_JOB_REQUIREMENTS], UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), parent_job_id=params.get(_PARENT_JOB_ID), diff --git a/lib/execution_engine2/utils/Condor.py b/lib/execution_engine2/utils/Condor.py index 6be7905f8..882509b8d 100644 --- a/lib/execution_engine2/utils/Condor.py +++ b/lib/execution_engine2/utils/Condor.py @@ -176,8 +176,8 @@ def _add_job_labels(sub: Dict, params: JobSubmissionParameters): sub["+KB_PARENT_JOB_ID"] = 
params.parent_job_id or "" sub["+KB_MODULE_NAME"] = params.app_info.module sub["+KB_FUNCTION_NAME"] = params.app_info.method - sub["+KB_APP_ID"] = params.app_info.get_application_id() - sub["+KB_APP_MODULE_NAME"] = params.app_info.application_module + sub["+KB_APP_ID"] = params.app_info.get_application_id() or "" + sub["+KB_APP_MODULE_NAME"] = params.app_info.application_module or "" sub["+KB_WSID"] = params.wsid or "" sub["+KB_SOURCE_WS_OBJECTS"] = ",".join(params.source_ws_objects) diff --git a/lib/execution_engine2/utils/application_info.py b/lib/execution_engine2/utils/application_info.py index cf76f1b7d..5ebc6d02d 100644 --- a/lib/execution_engine2/utils/application_info.py +++ b/lib/execution_engine2/utils/application_info.py @@ -2,15 +2,19 @@ Contains information about KBase applications. """ +from typing import Union from execution_engine2.utils.arg_processing import check_string as _check_string from execution_engine2.exceptions import IncorrectParamsException -def _get2part_string(s, sep, name): +def _get2part_string(s, sep, name, err_pt1, err_pt2, desired_sep=None): + desired_sep = desired_sep if desired_sep else sep parts = s.split(sep) if len(parts) != 2: - raise IncorrectParamsException(f"Expected exactly one '{sep}' in {name} '{s}'") - return parts[0].strip(), parts[1].strip() + raise IncorrectParamsException( + f"Expected exactly one '{desired_sep}' in {name} '{s}'" + ) + return _check_string(parts[0], err_pt1), _check_string(parts[1], err_pt2) class AppInfo: @@ -21,57 +25,79 @@ class AppInfo: module - the app's module, e.g. kb_uploadmethods. method - the SDK method the app will run, e.g. import_reads_from_staging application_module - the module containing the application. Under normal conditions this - will be the same as 'module'. + will be the same as 'module', if not None. Always supplied if 'application' is not None. application - the id of the application, e.g. import_fastq_interleaved_as_reads_from_staging. 
This is the name of the folder in the 'ui/narrative/methods' folder in the app repo - contining the spec files for the app. + contining the spec files for the app. May be None. """ - def __init__(self, method: str, app_id: str, strict=True): + def __init__(self, method: str, app_id: Union[str, None] = None, strict=True): """ Create the application information. method - the method name, e.g. kb_uploadmethods.import_reads_from_staging - app_id - the app name, either fully qualified (e.g. - kb_uploadmethods/import_fastq_interleaved_as_reads_from_staging or unqualified (e.g. - import_fastq_interleaved_as_reads_from_staging). If fully qualified, the module name - of the app (kb_uploadmethds in this example) must match the module name for the method. + app_id - the app name in the module/app_name format (e.g. + kb_uploadmethods/import_fastq_interleaved_as_reads_from_staging). For historical + reasons, this class will also accept only the module name or the module.app_name + format. In both cases the module name must match that given for the method argument. + Optional. strict - whether the app_id should be processed strictly or not. Without strict=True, - 1) The application module name may be different from the method module name - 2) The application module may be separated from the application name with a '.' - rather than a '/'. + the application module name may be different from the method module name. """ - # Implementation notes: as of this writing, there are app_ids in the ee2 database - # that have a . separator rather than a /, and, in some cases, test data where the - # module for the application and method is not the same, although that should never - # happen in practice. Hence we support non-strict mode to allow for those cases. - mod, meth = _get2part_string( - _check_string(method, "method ID"), ".", "method ID" + # Implementation notes: as of this writing, there are KBase processes that + # submit app_ids to ee2 that: + # 1) have a . 
separator rather than a / + # - Narrative downloads are a known place where this happens, although + # there are many other jobs in the database with this pattern, so there may be + # unknown processes submitting jobs like this. In most cases, this is just the + # process using the method for the app_id (and note that is often inaccurate). + # 2) consist only of a module ID with no separator + # - KBParallel does this. That may be the only source or there may be other sources + # as well, unknown. + # There are also some records in the database where the module for the application and + # method is not the same - to the best of our knowledge this was one off test data and + # shouldn't be expected to happen in practice. + # As such: + # 1) The only requirement for the app ID is that, if provided, it starts with the module + # given in the method argument. That must be followed by either nothing, or + # a '.' or '/' separator containing an arbitrary string. + # 2) We provide a 'strict' argument to disable even that check, which should be used for + # data loaded from the database. + self.module, self.method = _get2part_string( + _check_string(method, "method ID"), + ".", + "method ID", + "module portion of method ID", + "method portion of method ID", ) - self.module = _check_string(mod, "module portion of method ID") - self.method = _check_string(meth, "method portion of method ID") - app_id = _check_string(app_id, "application ID") - if "/" in app_id and "." in app_id: - raise IncorrectParamsException( - f"Application ID '{app_id}' has both '/' and '.' separators" - ) - if "/" in app_id: - mod, app = _get2part_string(app_id, "/", "application ID") - elif "." in app_id: - if strict: + app_id = _check_string(app_id, "application ID", optional=True) + app = None + sep = None + mod = None + if app_id: + err1 = "module portion of application ID" + err2 = "application portion of application ID" + if "/" in app_id and "." 
in app_id: raise IncorrectParamsException( - f"Application ID '{app_id}' contains a '.'" + f"Application ID '{app_id}' has both '/' and '.' separators" ) - mod, app = _get2part_string(app_id, ".", "application ID") - else: - mod = self.module - app = app_id - if strict and mod != self.module: + if "/" in app_id: + mod, app = _get2part_string(app_id, "/", "application ID", err1, err2) + sep = "/" + elif "." in app_id: + mod, app = _get2part_string( + app_id, ".", "application ID", err1, err2, "/" + ) + sep = "." + else: + mod = app_id + if strict and mod and mod != self.module: raise IncorrectParamsException( f"Application module '{mod}' must equal method module '{self.module}'" ) - self.application_module = _check_string(mod, "module portion of application ID") - self.application = _check_string(app, "application portion of application ID") + self.application = app + self.application_module = mod + self._sep = sep def get_method_id(self) -> str: """ @@ -81,9 +107,13 @@ def get_method_id(self) -> str: def get_application_id(self) -> str: """ - Get the application id, e.g. module/application + Get the application id, e.g. 
module/application, if present """ - return f"{self.application_module}/{self.application}" + if not self.application_module: + return None + if self.application: + return f"{self.application_module}{self._sep}{self.application}" + return self.application_module def __eq__(self, other): if type(self) == type(other): @@ -92,15 +122,23 @@ def __eq__(self, other): self.method, self.application_module, self.application, + self._sep, ) == ( other.module, other.method, other.application_module, other.application, + other._sep, ) return False def __hash__(self): return hash( - (self.module, self.method, self.application_module, self.application) + ( + self.module, + self.method, + self.application_module, + self.application, + self._sep, + ) ) diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 31ca2adb1..29a9cf8dc 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -418,10 +418,10 @@ def _set_up_workspace_objects(ws_controller, token, ws_name="foo"): ) -def _get_run_job_param_set(): +def _get_run_job_param_set(app_id=_APP): return { "method": _MOD, - "app_id": _APP, + "app_id": app_id, "wsid": 1, "source_ws_objects": ["1/1/1", "1/2/1"], "params": [{"foo": "bar"}, 42], @@ -438,7 +438,16 @@ def _get_run_job_param_set(): def _get_condor_sub_for_rj_param_set( - job_id, user, token, clientgroup, cpu, mem, disk, parent_job_id="totallywrongid" + job_id, + user, + token, + clientgroup, + cpu, + mem, + disk, + parent_job_id="totallywrongid", + app_id=_APP, + app_module="mod", ): expected_sub = _get_common_sub(job_id) expected_sub.update( @@ -448,8 +457,8 @@ def _get_condor_sub_for_rj_param_set( "+KB_PARENT_JOB_ID": f'"{parent_job_id}"', "+KB_MODULE_NAME": '"mod"', "+KB_FUNCTION_NAME": '"meth"', - "+KB_APP_ID": f'"{_APP}"', - "+KB_APP_MODULE_NAME": '"mod"', + "+KB_APP_ID": f'"{app_id}"' if app_id else "", + "+KB_APP_MODULE_NAME": f'"{app_module}"' if app_module 
else "", "+KB_WSID": '"1"', "+KB_SOURCE_WS_OBJECTS": '"1/1/1,1/2/1"', "request_cpus": f"{cpu}", @@ -486,7 +495,9 @@ def _get_mongo_job(mongo_client, job_id, has_queued=True): return job -def _check_mongo_job(mongo_client, job_id, user, clientgroup, cpu, mem, disk, githash): +def _check_mongo_job( + mongo_client, job_id, user, app_id, clientgroup, cpu, mem, disk, githash +): job = _get_mongo_job(mongo_client, job_id) expected_job = { "_id": ObjectId(job_id), @@ -499,7 +510,6 @@ def _check_mongo_job(mongo_client, job_id, user, clientgroup, cpu, mem, disk, gi "method": _MOD, "params": [{"foo": "bar"}, 42], "service_ver": githash, - "app_id": _APP, "source_ws_objects": ["1/1/1", "1/2/1"], "parent_job_id": "totallywrongid", "requirements": { @@ -520,13 +530,20 @@ def _check_mongo_job(mongo_client, job_id, user, clientgroup, cpu, mem, disk, gi "scheduler_id": "123", "scheduler_type": "condor", } + if app_id: + expected_job["job_input"]["app_id"] = app_id assert job == expected_job -def test_run_job(ee2_port, ws_controller, mongo_client): - """ - A test of the run_job method. 
- """ +def test_run_job_no_app_id(ee2_port, ws_controller, mongo_client): + _run_job(ee2_port, ws_controller, mongo_client, None, None) + + +def test_run_job_with_app_id(ee2_port, ws_controller, mongo_client): + _run_job(ee2_port, ws_controller, mongo_client, "mod/app", "mod") + + +def _run_job(ee2_port, ws_controller, mongo_client, app_id, app_mod): _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) # need to get the mock objects first so spec_set can do its magic before we mock out # the classes in the context manager @@ -549,7 +566,8 @@ def test_run_job(ee2_port, ws_controller, mongo_client): # run the method ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) - job_id = ee2.run_job(_get_run_job_param_set()) + params = _get_run_job_param_set(app_id) + job_id = ee2.run_job(params) # check that mocks were called correctly # Since these are class methods, the first argument is self, which we ignore @@ -561,17 +579,33 @@ def test_run_job(ee2_port, ws_controller, mongo_client): ) expected_sub = _get_condor_sub_for_rj_param_set( - job_id, USER_NO_ADMIN, TOKEN_NO_ADMIN, "njs", 8, 5, 30 + job_id, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="njs", + cpu=8, + mem=5, + disk=30, + app_id=app_id, + app_module=app_mod, ) _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub) _check_mongo_job( - mongo_client, job_id, USER_NO_ADMIN, "njs", 8, 5, 30, "somehash" + mongo_client, + job_id, + USER_NO_ADMIN, + app_id, + clientgroup="njs", + cpu=8, + mem=5, + disk=30, + githash="somehash", ) def test_run_job_fail_no_workspace_access(ee2_port): - params = {"method": _MOD, "app_id": _APP, "wsid": 1} + params = {"method": _MOD, "wsid": 1} # this error could probably use some cleanup err = ( "('An error occurred while fetching user permissions from the Workspace', " @@ -581,14 +615,14 @@ def test_run_job_fail_no_workspace_access(ee2_port): def test_run_job_fail_bad_method(ee2_port): - params = {"method": "mod.meth.moke", "app_id": _APP} + 
params = {"method": "mod.meth.moke"} err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) def test_run_job_fail_bad_app(ee2_port): - params = {"method": _MOD, "app_id": "mod.app"} - err = "Application ID 'mod.app' contains a '.'" + params = {"method": _MOD, "app_id": "mod.ap\bp"} + err = "application ID contains control characters" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [] _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) @@ -597,7 +631,6 @@ def test_run_job_fail_bad_app(ee2_port): def test_run_job_fail_bad_upa(ee2_port): params = { "method": _MOD, - "app_id": _APP, "source_ws_objects": ["ws/obj/1"], } err = ( @@ -620,7 +653,7 @@ def test_run_job_fail_no_such_object(ee2_port, ws_controller): ], } ) - params = {"method": _MOD, "app_id": _APP, "source_ws_objects": ["1/2/1"]} + params = {"method": _MOD, "source_ws_objects": ["1/2/1"]} err = "Some workspace object is inaccessible" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [] @@ -775,16 +808,17 @@ def _run_job_concierge( mongo_client, job_id, USER_KBASE_CONCIERGE, - clientgroup, - cpu, - mem, - disk, - "somehash", + app_id="mod/app", + clientgroup=clientgroup, + cpu=cpu, + mem=mem, + disk=disk, + githash="somehash", ) def test_run_job_concierge_fail_no_workspace_access(ee2_port): - params = {"method": _MOD, "app_id": _APP, "wsid": 1} + params = {"method": _MOD, "wsid": 1} # this error could probably use some cleanup err = ( "('An error occurred while fetching user permissions from the Workspace', " @@ -794,33 +828,33 @@ def test_run_job_concierge_fail_no_workspace_access(ee2_port): def test_run_job_concierge_fail_not_concierge(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} err = "You are not the concierge user. 
This method is not for you" _run_job_concierge_fail(ee2_port, TOKEN_NO_ADMIN, params, {"a": "b"}, err) def test_run_job_concierge_fail_bad_method(ee2_port): - params = {"method": "mod.meth.moke", "app_id": _APP} + params = {"method": "mod.meth.moke"} err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) def test_run_job_concierge_fail_reqs_list_not_list(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"requirements_list": {"a": "b"}} err = "requirements_list must be a list" _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) def test_run_job_concierge_fail_reqs_list_bad_req(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"requirements_list": ["a=b", "touchmymonkey"]} err = "Found illegal requirement in requirements_list: touchmymonkey" _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) def test_run_job_concierge_fail_bad_cpu(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"request_cpus": [2]} err = ( "Found illegal cpu request '[2]' in job requirements from concierge parameters" @@ -829,14 +863,14 @@ def test_run_job_concierge_fail_bad_cpu(ee2_port): def test_run_job_concierge_fail_bad_mem(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"request_memory": "-3"} err = "memory in MB must be at least 1" _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) def test_run_job_concierge_fail_bad_disk(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"request_disk": 4.5} err = ( "Found illegal disk request '4.5' in job requirements from concierge parameters" @@ -845,7 +879,7 @@ def test_run_job_concierge_fail_bad_disk(ee2_port): 
def test_run_job_concierge_fail_bad_clientgroup(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"client_group": "fakefakefake"} err = "No such clientgroup: fakefakefake" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: @@ -856,7 +890,7 @@ def test_run_job_concierge_fail_bad_clientgroup(ee2_port): def test_run_job_concierge_fail_bad_clientgroup_regex(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"client_group_regex": "now I have 2 problems"} err = ( "Found illegal client group regex 'now I have 2 problems' in job requirements " @@ -869,7 +903,7 @@ def test_run_job_concierge_fail_bad_catalog_data(ee2_port): with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [{"client_groups": ['{"request_cpus":-8}']}] - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"request_memory": 9} # TODO this is not a useful error for the user. 
Need to change the job reqs resolver # However, getting this wrong in the catalog is not super likely so not urgent @@ -880,7 +914,7 @@ def test_run_job_concierge_fail_bad_catalog_data(ee2_port): def test_run_job_concierge_fail_bad_reqs_item(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"requirements_list": ["a=b", "=c"]} # this error isn't the greatest but as I understand it the concierge endpoint is going # to become redundant so don't worry about it for now @@ -889,7 +923,7 @@ def test_run_job_concierge_fail_bad_reqs_item(ee2_port): def test_run_job_concierge_fail_bad_debug_mode(ee2_port): - params = {"method": _MOD, "app_id": _APP} + params = {"method": _MOD} conc_params = {"debug_mode": "debug debug debug"} err = ( "Found illegal debug mode 'debug debug debug' in job requirements from " @@ -899,8 +933,8 @@ def test_run_job_concierge_fail_bad_debug_mode(ee2_port): def test_run_job_concierge_fail_bad_app(ee2_port): - params = {"method": _MOD, "app_id": "mod.app"} - err = "Application ID 'mod.app' contains a '.'" + params = {"method": _MOD, "app_id": "mo\bd.app"} + err = "application ID contains control characters" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [] _run_job_concierge_fail( @@ -911,7 +945,6 @@ def test_run_job_concierge_fail_bad_app(ee2_port): def test_run_job_concierge_fail_bad_upa(ee2_port): params = { "method": _MOD, - "app_id": _APP, "source_ws_objects": ["ws/obj/1"], } err = ( @@ -936,7 +969,7 @@ def test_run_job_concierge_fail_no_such_object(ee2_port, ws_controller): ], } ) - params = {"method": _MOD, "app_id": _APP, "source_ws_objects": ["1/2/1"]} + params = {"method": _MOD, "source_ws_objects": ["1/2/1"]} err = "Some workspace object is inaccessible" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [] @@ -992,7 +1025,6 @@ def test_run_job_batch(ee2_port, 
ws_controller, mongo_client): # run the method job1_params = { "method": _MOD, - "app_id": _APP, "source_ws_objects": ["1/1/1", "1/2/1"], "params": [{"foo": "bar"}, 42], "service_ver": "beta", @@ -1052,7 +1084,6 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "method": _MOD, "params": [{"foo": "bar"}, 42], "service_ver": "somehash", - "app_id": _APP, "source_ws_objects": ["1/1/1", "1/2/1"], "parent_job_id": parent_job_id, "requirements": { @@ -1137,6 +1168,8 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): mem=5, disk=30, parent_job_id=parent_job_id, + app_id=None, + app_module=None, ) expected_sub_1["+KB_WSID"] = "" expected_sub_2 = _get_condor_sub_for_rj_param_set( @@ -1177,7 +1210,7 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): def test_run_job_batch_fail_no_workspace_access_for_batch(ee2_port, ws_controller): _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) - params = [{"method": _MOD, "app_id": _APP}] + params = [{"method": _MOD}] # this error could probably use some cleanup err = ( "('An error occurred while fetching user permissions from the Workspace', " @@ -1188,8 +1221,8 @@ def test_run_job_batch_fail_no_workspace_access_for_batch(ee2_port, ws_controlle def test_run_job_batch_fail_no_workspace_access_for_job(ee2_port): params = [ - {"method": _MOD, "app_id": _APP}, - {"method": _MOD, "app_id": _APP, "wsid": 1}, + {"method": _MOD}, + {"method": _MOD, "wsid": 1}, ] # this error could probably use some cleanup err = ( @@ -1204,7 +1237,7 @@ def test_run_job_batch_fail_bad_catalog_data(ee2_port, ws_controller): with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [{"client_groups": ['{"request_cpus":-8}']}] - params = [{"method": _MOD, "app_id": _APP}] + params = [{"method": _MOD}] # TODO this is not a useful error for the user. 
Need to change the job reqs resolver # However, getting this wrong in the catalog is not super likely so not urgent err = "CPU count must be at least 1" @@ -1214,8 +1247,8 @@ def test_run_job_batch_fail_bad_catalog_data(ee2_port, ws_controller): def test_run_job_batch_fail_bad_method(ee2_port, ws_controller): _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) params = [ - {"method": _MOD, "app_id": _APP}, - {"method": "mod.meth.moke", "app_id": _APP}, + {"method": _MOD}, + {"method": "mod.meth.moke"}, ] err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" # TODO this test surfaced a bug - if a batch wsid is not supplied and any job does not have @@ -1227,8 +1260,8 @@ def test_run_job_batch_fail_bad_method(ee2_port, ws_controller): def test_run_job_batch_fail_bad_app(ee2_port, ws_controller): _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) - params = [{"method": _MOD, "app_id": "mod.app"}] - err = "Application ID 'mod.app' contains a '.'" + params = [{"method": _MOD, "app_id": "mod.\bapp"}] + err = "application ID contains control characters" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [] _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) @@ -1239,7 +1272,6 @@ def test_run_job_batch_fail_bad_upa(ee2_port, ws_controller): params = [ { "method": _MOD, - "app_id": _APP, "source_ws_objects": ["ws/obj/1"], } ] @@ -1254,15 +1286,15 @@ def test_run_job_batch_fail_bad_upa(ee2_port, ws_controller): def test_run_job_batch_fail_parent_id(ee2_port, ws_controller): _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) - params = [{"method": _MOD, "app_id": _APP, "parent_job_id": "ae"}] + params = [{"method": _MOD, "parent_job_id": "ae"}] err = "Batch jobs may not specify a parent job ID" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [] _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, 
params, {"wsid": 1}, err) params = [ - {"method": _MOD, "app_id": _APP}, - {"method": _MOD, "app_id": _APP, "parent_job_id": "ae"}, + {"method": _MOD}, + {"method": _MOD, "parent_job_id": "ae"}, ] err = "Job #2: batch jobs may not specify a parent job ID" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: @@ -1282,7 +1314,7 @@ def test_run_job_batch_fail_no_such_object(ee2_port, ws_controller): ], } ) - params = [{"method": _MOD, "app_id": _APP, "source_ws_objects": ["1/2/1"]}] + params = [{"method": _MOD, "source_ws_objects": ["1/2/1"]}] err = "Some workspace object is inaccessible" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [] diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 71390f8c6..2444ed3cf 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -154,7 +154,7 @@ def _set_up_common_return_values(mocks): mocks[MongoUtil].get_job.return_value = retjob -def _check_common_mock_calls(mocks, reqs, wsid): +def _check_common_mock_calls(mocks, reqs, wsid, app=_APP): """ Check that mocks are called as expected when those calls are similar or the same for several tests. 
@@ -170,7 +170,7 @@ def _check_common_mock_calls(mocks, reqs, wsid): # initial job data save expected_job = _create_job( - reqs, wsid=wsid, source_ws_objects=[_WS_REF_1, _WS_REF_2] + reqs, app=app, wsid=wsid, source_ws_objects=[_WS_REF_1, _WS_REF_2] ) assert len(sdkmr.save_job.call_args_list) == 2 got_job = sdkmr.save_job.call_args_list[0][0][0] @@ -179,7 +179,7 @@ def _check_common_mock_calls(mocks, reqs, wsid): kafka.send_kafka_message.assert_any_call(KafkaCreateJob(_USER, _JOB_ID)) jsp_expected = JobSubmissionParameters( _JOB_ID, - AppInfo(_METHOD, _APP), + AppInfo(_METHOD, app), reqs, UserCreds(_USER, _TOKEN), wsid=wsid, @@ -212,6 +212,8 @@ def test_run_as_admin(): This test is a fairly minimal test of the run() method. It does not exercise all the potential code paths or provide all the possible run inputs, such as job parameters, cell metadata, etc. + + Does not include an app_id. """ # Set up data variables @@ -238,7 +240,6 @@ def test_run_as_admin(): rj = EE2RunJob(sdkmr) params = { "method": _METHOD, - "app_id": _APP, "source_ws_objects": [_WS_REF_1, _WS_REF_2], } assert rj.run(params, as_admin=True) == _JOB_ID @@ -246,12 +247,14 @@ def test_run_as_admin(): # check mocks called as expected. The order here is the order that they're called in the code. sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) jrr.resolve_requirements.assert_called_once_with(_METHOD) - _check_common_mock_calls(mocks, reqs, None) + _check_common_mock_calls(mocks, reqs, None, None) def test_run_as_concierge_with_wsid(): """ A unit test of the run() method with a concierge - but not admin - user. + + Includes an app ID. """ # Set up data variables @@ -339,16 +342,22 @@ def test_run_as_concierge_empty_as_admin(): A unit test of the run() method with an effectively empty concierge dict and admin privs. The fake key should be ignored but is required to make the concierge params truthy and trigger the pathway. 
+ + Also provides a module only app ID, as some KBase processes provide these. """ - _run_as_concierge_empty_as_admin({"fake": "foo"}) + _run_as_concierge_empty_as_admin({"fake": "foo"}, "lolcats") def test_run_as_concierge_sched_reqs_None_as_admin(): """ A unit test of the run() method with an concierge dict containing None for the scheduler requirements and admin privs. + + Also provides an app ID with a . instead of a / """ - _run_as_concierge_empty_as_admin({"requirements_list": None}) + _run_as_concierge_empty_as_admin( + {"requirements_list": None}, "lolcats.itsmypartyilllolifiwantto" + ) def test_run_as_concierge_sched_reqs_empty_list_as_admin(): @@ -356,10 +365,10 @@ def test_run_as_concierge_sched_reqs_empty_list_as_admin(): A unit test of the run() method with an concierge dict containing an empty list for the scheduler requirements and admin privs. """ - _run_as_concierge_empty_as_admin({"requirements_list": []}) + _run_as_concierge_empty_as_admin({"requirements_list": []}, _APP) -def _run_as_concierge_empty_as_admin(concierge_params): +def _run_as_concierge_empty_as_admin(concierge_params, app): # Set up data variables client_group = "concierge" # hardcoded default for run_as_concierge @@ -389,7 +398,7 @@ def _run_as_concierge_empty_as_admin(concierge_params): rj = EE2RunJob(sdkmr) params = { "method": _METHOD, - "app_id": _APP, + "app_id": app, "source_ws_objects": [_WS_REF_1, _WS_REF_2], } assert rj.run(params, concierge_params=concierge_params, as_admin=True) == _JOB_ID @@ -413,7 +422,7 @@ def _run_as_concierge_empty_as_admin(concierge_params): scheduler_requirements={}, debug_mode=None, ) - _check_common_mock_calls(mocks, reqs, None) + _check_common_mock_calls(mocks, reqs, None, app) def test_run_fail_concierge_params(): @@ -466,15 +475,11 @@ class and its respective composed classes, and we don't reproduce all the error {}, IncorrectParamsException("Missing input parameter: method ID") ) _run_and_run_batch_fail_illegal_arguments( - {"method": 
"foo.bar"}, - IncorrectParamsException("Missing input parameter: application ID"), - ) - _run_and_run_batch_fail_illegal_arguments( - {"method": "foo.bar", "app_id": "foo/baz", "wsid": 0}, + {"method": "foo.bar", "wsid": 0}, IncorrectParamsException("wsid must be at least 1"), ) _run_and_run_batch_fail_illegal_arguments( - {"method": "foo.bar", "app_id": "foo/baz", "source_ws_objects": {"a": "b"}}, + {"method": "foo.bar", "source_ws_objects": {"a": "b"}}, IncorrectParamsException("source_ws_objects must be a list"), ) diff --git a/test/tests_for_utils/Condor_test.py b/test/tests_for_utils/Condor_test.py index 9bc099e43..d3c32d09e 100644 --- a/test/tests_for_utils/Condor_test.py +++ b/test/tests_for_utils/Condor_test.py @@ -75,7 +75,7 @@ def test_run_job_minimal(): subinfo = c.run_job( JobSubmissionParameters( "jobbyjob", - AppInfo("foo.bar", "foo/whoo"), + AppInfo("foo.bar"), JobRequirements(2, 3, 4, "cg"), UserCreds("user1", "token"), ) @@ -93,8 +93,8 @@ def test_run_job_minimal(): "+KB_PARENT_JOB_ID": "", "+KB_MODULE_NAME": '"foo"', "+KB_FUNCTION_NAME": '"bar"', - "+KB_APP_ID": '"foo/whoo"', - "+KB_APP_MODULE_NAME": '"foo"', + "+KB_APP_ID": "", + "+KB_APP_MODULE_NAME": "", "+KB_WSID": "", "+KB_SOURCE_WS_OBJECTS": "", "request_cpus": "2", diff --git a/test/tests_for_utils/application_info_test.py b/test/tests_for_utils/application_info_test.py index 978bb2949..364cecd5d 100644 --- a/test/tests_for_utils/application_info_test.py +++ b/test/tests_for_utils/application_info_test.py @@ -4,7 +4,28 @@ from utils_shared.test_utils import assert_exception_correct -def test_app_info_strict_init_success(): +def test_app_info_init_success_minimal_strict(): + ai = AppInfo(" \t mod . 
meth ") + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module is None + assert ai.application is None + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() is None + + +def test_app_info_init_success_no_app_id_strict(): + for appid in [None, " \t "]: + ai = AppInfo(" \t mod . meth ", appid) + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module is None + assert ai.application is None + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() is None + + +def test_app_info_init_success_strict_full(): ai = AppInfo(" \t mod . meth ", "mod/ appthing") assert ai.module == "mod" assert ai.method == "meth" @@ -14,18 +35,28 @@ def test_app_info_strict_init_success(): assert ai.get_application_id() == "mod/appthing" -def test_app_info_without_app_module_strict_init_success(): - ai = AppInfo(" \t mod . meth ", " appthing \t ") +def test_app_info_init_success_strict_full_dot_separator(): + ai = AppInfo(" \t mod . meth ", "mod . appthing") assert ai.module == "mod" assert ai.method == "meth" assert ai.application_module == "mod" assert ai.application == "appthing" assert ai.get_method_id() == "mod.meth" - assert ai.get_application_id() == "mod/appthing" + assert ai.get_application_id() == "mod.appthing" + + +def test_app_info_init_success_strict_with_app_module_only(): + ai = AppInfo(" \t mod . meth ", " mod \t ") + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module == "mod" + assert ai.application is None + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() == "mod" -def test_app_info_init_success(): - ai = AppInfo(" \t mod . meth ", "mod2. appthing", strict=False) +def test_app_info_init_success_non_strict(): + ai = AppInfo(" \t mod . 
meth ", "mod2/ appthing", strict=False) assert ai.module == "mod" assert ai.method == "meth" assert ai.application_module == "mod2" @@ -36,25 +67,27 @@ def test_app_info_init_success(): def test_app_info_init_fail(): m = "m.n" - a = "m.b" _app_info_init_fail( - None, a, False, IncorrectParamsException("Missing input parameter: method ID") + None, + None, + False, + IncorrectParamsException("Missing input parameter: method ID"), ) _app_info_init_fail( " \t ", - a, + None, False, IncorrectParamsException("Missing input parameter: method ID"), ) _app_info_init_fail( " method ", - a, + None, False, IncorrectParamsException("Expected exactly one '.' in method ID 'method'"), ) _app_info_init_fail( " mod.innermod.method ", - a, + None, False, IncorrectParamsException( "Expected exactly one '.' in method ID 'mod.innermod.method'" @@ -62,7 +95,7 @@ def test_app_info_init_fail(): ) _app_info_init_fail( " . meth", - a, + None, False, IncorrectParamsException( "Missing input parameter: module portion of method ID" @@ -70,24 +103,17 @@ def test_app_info_init_fail(): ) _app_info_init_fail( " mod . ", - a, + None, False, IncorrectParamsException( "Missing input parameter: method portion of method ID" ), ) - - _app_info_init_fail( - m, - None, - False, - IncorrectParamsException("Missing input parameter: application ID"), - ) _app_info_init_fail( m, - " \t ", + "mod / me\tth ", False, - IncorrectParamsException("Missing input parameter: application ID"), + IncorrectParamsException("application ID contains control characters"), ) _app_info_init_fail( m, @@ -105,21 +131,14 @@ def test_app_info_init_fail(): "Expected exactly one '/' in application ID 'mod / meth / bak'" ), ) - _app_info_init_fail( - m, - "mod.meth", - True, - IncorrectParamsException("Application ID 'mod.meth' contains a '.'"), - ) _app_info_init_fail( m, "mod.meth.anothermeth", False, IncorrectParamsException( - "Expected exactly one '.' 
in application ID 'mod.meth.anothermeth'" + "Expected exactly one '/' in application ID 'mod.meth.anothermeth'" ), ) - _app_info_init_fail( "mod.meth", " mod2 /meth", @@ -170,11 +189,16 @@ def _app_info_init_fail(meth, app, strict, expected): def test_equals(): + assert AppInfo("m.n") == AppInfo("m.n") + assert AppInfo("m.n", "m") == AppInfo("m.n", "m") assert AppInfo("m.n", "m/p") == AppInfo("m.n", "m/p") + assert AppInfo("m.n", "m.p") == AppInfo("m.n", "m.p") assert AppInfo("m.n", "p/p", False) == AppInfo("m.n", "p/p", False) + assert AppInfo("m.n", "p.p", False) == AppInfo("m.n", "p.p", False) assert AppInfo("m.n", "m/p", False) != AppInfo("n.n", "m/p", False) assert AppInfo("m.n", "m/p") != AppInfo("m.x", "m/p") + assert AppInfo("m.n", "m/p") != AppInfo("m.n", "m.p") assert AppInfo("m.n", "m/p", False) != AppInfo("m.n", "x/p", False) assert AppInfo("m.n", "m/p") != AppInfo("m.n", "m/x") assert AppInfo("m.n", "m/p") != ("m.n", "m/x") @@ -184,10 +208,15 @@ def test_hashcode(): # hashes will change from instance to instance of the python interpreter, and therefore # tests can't be written that directly test the hash value. 
See # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + assert hash(AppInfo("m.n")) == hash(AppInfo("m.n")) + assert hash(AppInfo("m.n", "m")) == hash(AppInfo("m.n", "m")) assert hash(AppInfo("m.n", "m/p")) == hash(AppInfo("m.n", "m/p")) + assert hash(AppInfo("m.n", "m.p")) == hash(AppInfo("m.n", "m.p")) assert hash(AppInfo("m.n", "p/p", False)) == hash(AppInfo("m.n", "p/p", False)) + assert hash(AppInfo("m.n", "p.p", False)) == hash(AppInfo("m.n", "p.p", False)) assert hash(AppInfo("m.n", "m/p", False)) != hash(AppInfo("n.n", "m/p", False)) assert hash(AppInfo("m.n", "m/p")) != hash(AppInfo("m.x", "m/p")) + assert hash(AppInfo("m.n", "m/p")) != hash(AppInfo("m.n", "m.p")) assert hash(AppInfo("m.n", "m/p", False)) != hash(AppInfo("m.n", "x/p", False)) assert hash(AppInfo("m.n", "m/p")) != hash(AppInfo("m.n", "m/x")) From 3a415258183d8611ed3cb0f07068af707190fcc6 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Tue, 4 May 2021 18:40:21 -0700 Subject: [PATCH 066/109] DATAUP-389 - specify job requirements in run_job[_batch] (#381) * Allow specifying job requirements at EE2RunJob layer Next up: process `as_admin` flag in the Server.py class and add integration tests. 
* run black * minor clarification * DRY up dict creation in EE2Runjob_test * run black --- lib/execution_engine2/sdk/EE2Runjob.py | 126 +++- .../utils/job_requirements_resolver.py | 4 +- test/tests_for_integration/api_to_db_test.py | 4 +- test/tests_for_sdkmr/EE2Runjob_test.py | 543 +++++++++++++++--- .../ee2_SDKMethodRunner_test.py | 6 +- 5 files changed, 590 insertions(+), 93 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 32b35d01f..af9912e5a 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -31,13 +31,18 @@ REQUEST_MEMORY, CLIENT_GROUP, CLIENT_GROUP_REGEX, + BILL_TO_USER, + IGNORE_CONCURRENCY_LIMITS, DEBUG_MODE, ) +from execution_engine2.utils.job_requirements_resolver import RequirementsType from execution_engine2.utils.KafkaUtils import KafkaCreateJob, KafkaQueueChange -from execution_engine2.exceptions import IncorrectParamsException +from execution_engine2.exceptions import IncorrectParamsException, AuthError _JOB_REQUIREMENTS = "job_reqs" +_JOB_REQUIREMENTS_INCOMING = "job_requirements" +_SCHEDULER_REQUIREMENTS = "scheduler_requirements" _REQUIREMENTS_LIST = "requirements_list" _METHOD = "method" _APP_ID = "app_id" @@ -331,7 +336,7 @@ def run_batch( wsids = [job_input.get(_WORKSPACE_ID, wsid) for job_input in params] self._check_workspace_permissions_list(wsids) - self._add_job_requirements(params) + self._add_job_requirements(params, bool(as_admin)) # as_admin checked above self._check_job_arguments(params, has_parent_job=True) parent_job = self._create_parent_job(wsid=wsid, meta=meta) @@ -339,20 +344,95 @@ def run_batch( return {_PARENT_JOB_ID: str(parent_job.id), "child_job_ids": children_jobs} # modifies the jobs in place - def _add_job_requirements(self, jobs: List[Dict[str, Any]]): + def _add_job_requirements(self, jobs: List[Dict[str, Any]], is_write_admin: bool): f""" Adds the job requirements, generated from the job requirements 
resolver, to the provided RunJobParams dicts. Expects the required field {_METHOD} in the param - dicts. Adds the {_JOB_REQUIREMENTS} field to the param dicts, which holds the value of the - job requirements object. + dicts. Looks in the {_JOB_REQUIREMENTS_INCOMING} key for a dictionary containing the + optional keys {REQUEST_CPUS}, {REQUEST_MEMORY}, {REQUEST_DISK}, {CLIENT_GROUP}, + {CLIENT_GROUP_REGEX}, {BILL_TO_USER}, {IGNORE_CONCURRENCY_LIMITS}, + {_SCHEDULER_REQUIREMENTS}, and {DEBUG_MODE}. Adds the {_JOB_REQUIREMENTS} field to the + param dicts, which holds the job requirements object. """ # could add a cache in the job requirements resolver to avoid making the same # catalog call over and over if all the jobs have the same method jrr = self.sdkmr.get_job_requirements_resolver() - for j in jobs: - # TODO JRR check if requesting any job requirements & if is admin - # TODO JRR actually process the requirements once added to the spec - j[_JOB_REQUIREMENTS] = jrr.resolve_requirements(j.get(_METHOD)) + for i, job in enumerate(jobs): + # TODO I feel like a class for just handling error formatting would be useful + # but too much work for a minor benefit + pre = f"Job #{i + 1}: " if len(jobs) > 1 else "" + job_reqs = job.get(_JOB_REQUIREMENTS_INCOMING) or {} + if not isinstance(job_reqs, dict): + raise IncorrectParamsException( + f"{pre}{_JOB_REQUIREMENTS_INCOMING} must be a mapping" + ) + try: + norm = jrr.normalize_job_reqs(job_reqs, "input job") + except IncorrectParamsException as e: + self._rethrow_incorrect_params_with_error_prefix(e, pre) + self._check_job_requirements_vs_admin( + jrr, norm, job_reqs, is_write_admin, pre + ) + + try: + job[_JOB_REQUIREMENTS] = jrr.resolve_requirements( + job.get(_METHOD), + cpus=norm.get(REQUEST_CPUS), + memory_MB=norm.get(REQUEST_MEMORY), + disk_GB=norm.get(REQUEST_DISK), + client_group=norm.get(CLIENT_GROUP), + client_group_regex=norm.get(CLIENT_GROUP_REGEX), + bill_to_user=job_reqs.get(BILL_TO_USER), + 
ignore_concurrency_limits=bool( + job_reqs.get(IGNORE_CONCURRENCY_LIMITS) + ), + scheduler_requirements=job_reqs.get(_SCHEDULER_REQUIREMENTS), + debug_mode=norm.get(DEBUG_MODE), + ) + except IncorrectParamsException as e: + self._rethrow_incorrect_params_with_error_prefix(e, pre) + + def _check_job_requirements_vs_admin( + self, jrr, norm, job_reqs, is_write_admin, err_prefix + ): + # just a helper method for _add_job_requirements to make that method a bit shorter. + # treat it as part of that method + try: + perm_type = jrr.get_requirements_type( + cpus=norm.get(REQUEST_CPUS), + memory_MB=norm.get(REQUEST_MEMORY), + disk_GB=norm.get(REQUEST_DISK), + client_group=norm.get(CLIENT_GROUP), + client_group_regex=norm.get(CLIENT_GROUP_REGEX), + # Note that this is never confirmed to be a real user. May want to fix that, but + # since it's admin only... YAGNI + bill_to_user=self._check_is_string( + job_reqs.get(BILL_TO_USER), "bill_to_user" + ), + ignore_concurrency_limits=bool(job_reqs.get(IGNORE_CONCURRENCY_LIMITS)), + scheduler_requirements=job_reqs.get(_SCHEDULER_REQUIREMENTS), + debug_mode=norm.get(DEBUG_MODE), + ) + except IncorrectParamsException as e: + self._rethrow_incorrect_params_with_error_prefix(e, err_prefix) + if perm_type != RequirementsType.STANDARD and not is_write_admin: + raise AuthError( + f"{err_prefix}In order to specify job requirements you must be a full admin" + ) + + def _check_is_string(self, putative_str, name): + if not putative_str: + return None + if type(putative_str) != str: + raise IncorrectParamsException(f"{name} must be a string") + return putative_str + + def _rethrow_incorrect_params_with_error_prefix( + self, error: IncorrectParamsException, error_prefix: str + ): + if not error_prefix: + raise error + raise IncorrectParamsException(f"{error_prefix}{error.args[0]}") from error def _check_job_arguments(self, jobs, has_parent_job=False): # perform sanity checks before creating any jobs, including the parent job for batch jobs @@ 
-360,18 +440,21 @@ def _check_job_arguments(self, jobs, has_parent_job=False): # Could make an argument checker method, or a class that doesn't require a job id. # Seems like more code & work for no real benefit though. # Just create the class for checks, don't use yet - JobSubmissionParameters( - "fakejobid", - AppInfo(job.get(_METHOD), job.get(_APP_ID)), - job[_JOB_REQUIREMENTS], - UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), - wsid=job.get(_WORKSPACE_ID), - source_ws_objects=job.get(_SOURCE_WS_OBJECTS), - ) + pre = f"Job #{i + 1}: " if len(jobs) > 1 else "" + try: + JobSubmissionParameters( + "fakejobid", + AppInfo(job.get(_METHOD), job.get(_APP_ID)), + job[_JOB_REQUIREMENTS], + UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), + wsid=job.get(_WORKSPACE_ID), + source_ws_objects=job.get(_SOURCE_WS_OBJECTS), + ) + except IncorrectParamsException as e: + self._rethrow_incorrect_params_with_error_prefix(e, pre) if has_parent_job and job.get(_PARENT_JOB_ID): - pre = f"Job #{i + 1}: b" if len(jobs) > 1 else "B" raise IncorrectParamsException( - f"{pre}atch jobs may not specify a parent job ID" + f"{pre}batch jobs may not specify a parent job ID" ) # This is also an opportunity for caching # although most likely jobs aren't operating on the same object @@ -399,7 +482,8 @@ def run( params.get(_METHOD), concierge_params ) else: - self._add_job_requirements([params]) + # as_admin checked above + self._add_job_requirements([params], bool(as_admin)) self._check_job_arguments([params]) return self._run(params=params) @@ -436,7 +520,7 @@ def _get_job_reqs_from_concierge_params( bill_to_user=concierge_params.get("account_group"), # default is to ignore concurrency limits for concierge ignore_concurrency_limits=bool( - concierge_params.get("ignore_concurrency_limits", 1) + concierge_params.get(IGNORE_CONCURRENCY_LIMITS, 1) ), scheduler_requirements=schd_reqs, debug_mode=norm.get(DEBUG_MODE), diff --git 
a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py index a6c346256..c92dc2648 100644 --- a/lib/execution_engine2/utils/job_requirements_resolver.py +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -27,10 +27,12 @@ REQUEST_MEMORY = "request_memory" REQUEST_DISK = "request_disk" CLIENT_GROUP_REGEX = "client_group_regex" +BILL_TO_USER = "bill_to_user" +IGNORE_CONCURRENCY_LIMITS = "ignore_concurrency_limits" DEBUG_MODE = "debug_mode" _RESOURCES = set([CLIENT_GROUP, REQUEST_CPUS, REQUEST_MEMORY, REQUEST_DISK]) _ALL_SPECIAL_KEYS = _RESOURCES | set( - [CLIENT_GROUP_REGEX, DEBUG_MODE, "bill_to_user", "ignore_concurrency_limits"] + [CLIENT_GROUP_REGEX, DEBUG_MODE, BILL_TO_USER, IGNORE_CONCURRENCY_LIMITS] ) _CLIENT_GROUPS = "client_groups" diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 29a9cf8dc..b3e4b4c91 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -1250,7 +1250,7 @@ def test_run_job_batch_fail_bad_method(ee2_port, ws_controller): {"method": _MOD}, {"method": "mod.meth.moke"}, ] - err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" + err = "Job #2: Unrecognized method: 'mod.meth.moke'. 
Please input module_name.function_name" # TODO this test surfaced a bug - if a batch wsid is not supplied and any job does not have # a wsid an error occurs with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: @@ -1287,7 +1287,7 @@ def test_run_job_batch_fail_parent_id(ee2_port, ws_controller): _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) params = [{"method": _MOD, "parent_job_id": "ae"}] - err = "Batch jobs may not specify a parent job ID" + err = "batch jobs may not specify a parent job ID" with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: list_cgroups.return_value = [] _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 2444ed3cf..2a9729205 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -11,7 +11,7 @@ from unittest.mock import create_autospec, call from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.models.models import Job, JobInput, JobRequirements, Meta -from execution_engine2.exceptions import IncorrectParamsException +from execution_engine2.exceptions import IncorrectParamsException, AuthError from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions from execution_engine2.sdk.job_submission_parameters import ( JobSubmissionParameters, @@ -26,7 +26,10 @@ KafkaQueueChange, KafkaCreateJob, ) -from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.utils.job_requirements_resolver import ( + JobRequirementsResolver, + RequirementsType, +) from execution_engine2.utils.SlackUtils import SlackClient from execution_engine2.db.MongoUtil import MongoUtil from installed_clients.WorkspaceClient import Workspace @@ -62,6 +65,19 @@ _CLUSTER_2 = "cluster2" +_EMPTY_JOB_REQUIREMENTS = { + "cpus": None, + "memory_MB": None, 
+ "disk_GB": None, + "client_group": None, + "client_group_regex": None, + "bill_to_user": None, + "ignore_concurrency_limits": False, + "scheduler_requirements": None, + "debug_mode": None, +} + + def _set_up_mocks(user: str, token: str) -> Dict[Any, Any]: """ Returns a dictionary of the class that is mocked to the mock of the class, and initializes @@ -205,9 +221,94 @@ def _check_common_mock_calls(mocks, reqs, wsid, app=_APP): mocks[SlackClient].run_job_message.assert_called_once_with(_JOB_ID, _CLUSTER, _USER) -def test_run_as_admin(): +def _create_reqs_dict( + cpu, + mem, + disk, + clientgroup, + client_group_regex=None, + ignore_concurrency_limits=None, + debug_mode=None, + merge_with=None, + internal_representation=False, +): + # the bill to user and scheduler requirements keys are different for the concierge endpoint + # so we don't include them. If needed use the merge_with parameter. + if internal_representation: + ret = { + "cpus": cpu, + "memory_MB": mem, + "disk_GB": disk, + } + else: + ret = { + "request_cpus": cpu, + "request_memory": mem, + "request_disk": disk, + } + ret.update( + { + "client_group": clientgroup, + "client_group_regex": client_group_regex, + "ignore_concurrency_limits": ignore_concurrency_limits, + "debug_mode": debug_mode, + } + ) + if merge_with: + ret.update(merge_with) + return ret + + +def test_run_job(): + """ + A basic unit test of the run() method. + + This test is a fairly minimal test of the run() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. + """ + + # Set up data variables + client_group = "myfirstclientgroup" + cpus = 1 + mem = 1 + disk = 1 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. 
This may be something to be added later when needed. + + # Set up call returns. These calls are in the order they occur in the code + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.return_value = RequirementsType.STANDARD + reqs = ResolvedRequirements( + cpus=cpus, memory_MB=mem, disk_GB=disk, client_group=client_group + ) + jrr.resolve_requirements.return_value = reqs + _set_up_common_return_values(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": _APP, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + } + assert rj.run(params) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. + jrr.normalize_job_reqs.assert_called_once_with({}, "input job") + jrr.get_requirements_type.assert_called_once_with(**_EMPTY_JOB_REQUIREMENTS) + jrr.resolve_requirements.assert_called_once_with(_METHOD, **_EMPTY_JOB_REQUIREMENTS) + _check_common_mock_calls(mocks, reqs, None, _APP) + + +def test_run_job_as_admin_with_job_requirements(): """ - A basic unit test of the run() method with an administrative user. + A basic unit test of the run() method with an administrative user and job requirements. This test is a fairly minimal test of the run() method. It does not exercise all the potential code paths or provide all the possible run inputs, such as job parameters, cell @@ -230,27 +331,59 @@ def test_run_as_admin(): # already a very large test. This may be something to be added later when needed. # Set up call returns. 
These calls are in the order they occur in the code - reqs = ResolvedRequirements( - cpus=cpus, memory_MB=mem, disk_GB=disk, client_group=client_group + jrr.normalize_job_reqs.return_value = _create_reqs_dict( + cpus, mem, disk, client_group, client_group_regex=True, debug_mode=True + ) + jrr.get_requirements_type.return_value = RequirementsType.BILLING + req_args = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=True, + ignore_concurrency_limits=True, + debug_mode=True, + merge_with={ + "bill_to_user": _OTHER_USER, + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + }, + internal_representation=True, ) + reqs = ResolvedRequirements(**req_args) jrr.resolve_requirements.return_value = reqs _set_up_common_return_values(mocks) # set up the class to be tested and run the method rj = EE2RunJob(sdkmr) + inc_reqs = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=1, + ignore_concurrency_limits="righty ho, luv", + debug_mode="true", + merge_with={ + "bill_to_user": _OTHER_USER, + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + }, + ) params = { "method": _METHOD, "source_ws_objects": [_WS_REF_1, _WS_REF_2], + "job_requirements": inc_reqs, } assert rj.run(params, as_admin=True) == _JOB_ID # check mocks called as expected. The order here is the order that they're called in the code. sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) - jrr.resolve_requirements.assert_called_once_with(_METHOD) + jrr.normalize_job_reqs.assert_called_once_with(inc_reqs, "input job") + jrr.get_requirements_type.assert_called_once_with(**req_args) + jrr.resolve_requirements.assert_called_once_with(_METHOD, **req_args) _check_common_mock_calls(mocks, reqs, None, None) -def test_run_as_concierge_with_wsid(): +def test_run_job_as_concierge_with_wsid(): """ A unit test of the run() method with a concierge - but not admin - user. 
@@ -274,14 +407,9 @@ def test_run_as_concierge_with_wsid(): # Set up call returns. These calls are in the order they occur in the code wsauth.can_write.return_value = True - jrr.normalize_job_reqs.return_value = { - "request_cpus": cpus, - "request_memory": mem, - "request_disk": disk, - "client_group": client_group, - "client_group_regex": False, - "debug_mode": True, - } + jrr.normalize_job_reqs.return_value = _create_reqs_dict( + cpus, mem, disk, client_group, client_group_regex=False, debug_mode=True + ) reqs = ResolvedRequirements( cpus=cpus, memory_MB=mem, @@ -304,17 +432,19 @@ def test_run_as_concierge_with_wsid(): "wsid": wsid, "source_ws_objects": [_WS_REF_1, _WS_REF_2], } - conc_params = { - "request_cpus": cpus, - "request_memory": mem, - "request_disk": disk, - "client_group": client_group, - "client_group_regex": 0, - "ignore_concurrency_limits": 0, - "account_group": _OTHER_USER, - "requirements_list": [" foo = bar ", "baz=bat"], - "debug_mode": 1, - } + conc_params = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=0, + ignore_concurrency_limits=0, + debug_mode=1, + merge_with={ + "account_group": _OTHER_USER, + "requirements_list": [" foo = bar ", "baz=bat"], + }, + ) assert rj.run(params, concierge_params=conc_params) == _JOB_ID # check mocks called as expected. The order here is the order that they're called in the code. @@ -337,7 +467,7 @@ def test_run_as_concierge_with_wsid(): _check_common_mock_calls(mocks, reqs, wsid) -def test_run_as_concierge_empty_as_admin(): +def test_run_job_as_concierge_empty_as_admin(): """ A unit test of the run() method with an effectively empty concierge dict and admin privs. 
The fake key should be ignored but is required to make the concierge params truthy and @@ -348,7 +478,7 @@ def test_run_as_concierge_empty_as_admin(): _run_as_concierge_empty_as_admin({"fake": "foo"}, "lolcats") -def test_run_as_concierge_sched_reqs_None_as_admin(): +def test_run_job_as_concierge_sched_reqs_None_as_admin(): """ A unit test of the run() method with an concierge dict containing None for the scheduler requirements and admin privs. @@ -360,7 +490,7 @@ def test_run_as_concierge_sched_reqs_None_as_admin(): ) -def test_run_as_concierge_sched_reqs_empty_list_as_admin(): +def test_run_job_as_concierge_sched_reqs_empty_list_as_admin(): """ A unit test of the run() method with an concierge dict containing an empty list for the scheduler requirements and admin privs. @@ -425,7 +555,7 @@ def _run_as_concierge_empty_as_admin(concierge_params, app): _check_common_mock_calls(mocks, reqs, None, app) -def test_run_fail_concierge_params(): +def test_run_job_concierge_fail_bad_params(): """ Test that submitting invalid concierge params causes the job to fail. Note that most error checking happens in the mocked out job requirements resolver, so we only check for @@ -460,7 +590,7 @@ def _run_fail_concierge_params(concierge_params, expected): assert_exception_correct(got.value, expected) -def test_run_and_run_batch_fail_illegal_arguments(): +def test_run_job_and_run_job_batch_fail_illegal_arguments(): """ Test that illegal arguments cause the job to fail. 
Note that not all arguments are checked - this test checks arguments that are checked in the _check_job_arguments() @@ -482,17 +612,85 @@ class and its respective composed classes, and we don't reproduce all the error {"method": "foo.bar", "source_ws_objects": {"a": "b"}}, IncorrectParamsException("source_ws_objects must be a list"), ) + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar", "job_requirements": ["10 bob", "a pickled egg"]}, + IncorrectParamsException("job_requirements must be a mapping"), + ) + _run_and_run_batch_fail_illegal_arguments( + { + "method": "foo.bar", + "job_requirements": { + "bill_to_user": { + "Bill": "$3.78", + "Boris": "$2.95", + "AJ": "one BILIIOOOON dollars", + "Sumin": "$1,469,890.42", + } + }, + }, + IncorrectParamsException("bill_to_user must be a string"), + ) def _run_and_run_batch_fail_illegal_arguments(params, expected): mocks = _set_up_mocks(_USER, _TOKEN) - sdkmr = mocks[SDKMethodRunner] jrr = mocks[JobRequirementsResolver] jrr.resolve_requirements.return_value = ResolvedRequirements(1, 1, 1, "cg") - _run_and_run_batch_fail(sdkmr, params, expected) + _run_and_run_batch_fail(mocks[SDKMethodRunner], params, expected) + + +def test_run_job_and_run_job_batch_fail_arg_normalization(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + e = "Found illegal request_cpus 'like 10 I guess? IDK' in job requirements from input job" + jrr.normalize_job_reqs.side_effect = IncorrectParamsException(e) + _run_and_run_batch_fail( + mocks[SDKMethodRunner], + { + "method": "foo.bar", + "job_requirements": {"request_cpus": "like 10 I guess? 
IDK"}, + }, + IncorrectParamsException(e), + ) + + +def test_run_job_and_run_job_batch_fail_get_requirements_type(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + e = "bill_to_user contains control characters" + jrr.get_requirements_type.side_effect = IncorrectParamsException(e) + _run_and_run_batch_fail( + mocks[SDKMethodRunner], + {"method": "foo.bar", "job_requirements": {"bill_to_user": "ding\bding"}}, + IncorrectParamsException(e), + ) + + +def test_run_job_and_run_job_batch_fail_not_admin_with_job_reqs(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.return_value = RequirementsType.PROCESSING + _run_and_run_batch_fail( + mocks[SDKMethodRunner], + {"method": "foo.bar", "job_requirements": {"ignore_concurrency_limits": 1}}, + AuthError("In order to specify job requirements you must be a full admin"), + as_admin=False, + ) + + +def test_run_job_and_run_job_batch_fail_resolve_requirements(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.return_value = RequirementsType.STANDARD + e = "Unrecognized method: 'None'. 
Please input module_name.function_name" + jrr.resolve_requirements.side_effect = IncorrectParamsException(e) + _run_and_run_batch_fail(mocks[SDKMethodRunner], {}, IncorrectParamsException(e)) -def test_run_and_run_batch_fail_workspace_objects_check(): +def test_run_job_and_run_job_batch_fail_workspace_objects_check(): mocks = _set_up_mocks(_USER, _TOKEN) sdkmr = mocks[SDKMethodRunner] jrr = mocks[JobRequirementsResolver] @@ -511,34 +709,19 @@ def test_run_and_run_batch_fail_workspace_objects_check(): ) -def _run_and_run_batch_fail(sdkmr, params, expected): +def _run_and_run_batch_fail(sdkmr, params, expected, as_admin=True): rj = EE2RunJob(sdkmr) with raises(Exception) as got: - rj.run(params, as_admin=True) + rj.run(params, as_admin=as_admin) assert_exception_correct(got.value, expected) - with raises(Exception) as got: - rj.run_batch([params], {}, as_admin=True) - assert_exception_correct(got.value, expected) + _run_batch_fail(rj, [params], {}, as_admin, expected) def _set_up_common_return_values_batch(mocks): """ Set up return values on mocks that are the same for several tests. """ - reqs1 = ResolvedRequirements( - cpus=1, - memory_MB=2, - disk_GB=3, - client_group="cg1", - ) - reqs2 = ResolvedRequirements( - cpus=10, - memory_MB=20, - disk_GB=30, - client_group="cg2", - ) - mocks[JobRequirementsResolver].resolve_requirements.side_effect = [reqs1, reqs2] mocks[Workspace].get_object_info3.return_value = { "paths": [[_WS_REF_1], [_WS_REF_2]] } @@ -569,7 +752,6 @@ def _set_up_common_return_values_batch(mocks): retjob_2.id = ObjectId(_JOB_ID_2) retjob_2.status = _CREATED_STATE mocks[MongoUtil].get_job.side_effect = [retjob_1, retjob_2] - return reqs1, reqs2 def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): @@ -578,12 +760,6 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): several tests. 
""" sdkmr = mocks[SDKMethodRunner] - mocks[JobRequirementsResolver].resolve_requirements.assert_has_calls( - [ - call(_METHOD_1), - call(_METHOD_2), - ] - ) mocks[Workspace].get_object_info3.assert_called_once_with( {"objects": [{"ref": _WS_REF_1}, {"ref": _WS_REF_2}], "ignoreErrors": 1} ) @@ -705,7 +881,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): assert_jobs_equal(final_got_parent_job, final_expected_parent_job) -def test_run_batch_with_parent_job_wsid(): +def test_run_job_batch_with_parent_job_wsid(): """ A basic unit test of the run_batch() method, providing a workspace ID for the parent job. @@ -720,14 +896,34 @@ def test_run_batch_with_parent_job_wsid(): # set up mocks mocks = _set_up_mocks(_USER, _TOKEN) sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] # We intentionally do not check the logger methods as there are a lot of them and this is # already a very large test. This may be something to be added later when needed. # Set up call returns. These calls are in the order they occur in the code - mocks[WorkspaceAuth].can_write.return_value = True mocks[WorkspaceAuth].can_write_list.return_value = {wsid: True} - reqs1, reqs2 = _set_up_common_return_values_batch(mocks) + + jrr.normalize_job_reqs.side_effect = [{}, {}] + jrr.get_requirements_type.side_effect = [ + RequirementsType.STANDARD, + RequirementsType.STANDARD, + ] + reqs1 = ResolvedRequirements( + cpus=1, + memory_MB=2, + disk_GB=3, + client_group="cg1", + ) + reqs2 = ResolvedRequirements( + cpus=10, + memory_MB=20, + disk_GB=30, + client_group="cg2", + ) + jrr.resolve_requirements.side_effect = [reqs1, reqs2] + + _set_up_common_return_values_batch(mocks) # set up the class to be tested and run the method rj = EE2RunJob(sdkmr) @@ -752,12 +948,26 @@ def test_run_batch_with_parent_job_wsid(): mocks[WorkspaceAuth].can_write.assert_called_once_with(parent_wsid) # this seems like a bug. 
See comments in the run_batch method mocks[WorkspaceAuth].can_write_list.assert_called_once_with([parent_wsid, wsid]) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.assert_has_calls( + [call({}, "input job"), call({}, "input job")] + ) + jrr.get_requirements_type.assert_has_calls( + [call(**_EMPTY_JOB_REQUIREMENTS), call(**_EMPTY_JOB_REQUIREMENTS)] + ) + jrr.resolve_requirements.assert_has_calls( + [ + call(_METHOD_1, **_EMPTY_JOB_REQUIREMENTS), + call(_METHOD_2, **_EMPTY_JOB_REQUIREMENTS), + ] + ) _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid) -def test_run_batch_as_admin(): +def test_run_job_batch_as_admin_with_job_requirements(): """ - A basic unit test of the run_batch() method with an administrative user. + A basic unit test of the run_batch() method with an administrative user and supplied job + requirements. This test is a fairly minimal test of the run_batch() method. It does not exercise all the potential code paths or provide all the possible run inputs, such as job parameters, cell @@ -765,18 +975,66 @@ def test_run_batch_as_admin(): """ # set up variables wsid = 32 + cpus = 89 + mem = 3 + disk = 10000 + client_group = "verylargeclientgroup" # set up mocks mocks = _set_up_mocks(_USER, _TOKEN) sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] # We intentionally do not check the logger methods as there are a lot of them and this is # already a very large test. This may be something to be added later when needed. # Set up call returns. 
These calls are in the order they occur in the code - reqs1, reqs2 = _set_up_common_return_values_batch(mocks) + jrr.normalize_job_reqs.side_effect = [ + {}, + _create_reqs_dict( + cpus, mem, disk, client_group, client_group_regex=True, debug_mode=True + ), + ] + jrr.get_requirements_type.side_effect = [ + RequirementsType.STANDARD, + RequirementsType.BILLING, + ] + req_args = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=True, + ignore_concurrency_limits=True, + debug_mode=True, + merge_with={ + "bill_to_user": _OTHER_USER, + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + }, + internal_representation=True, + ) + reqs1 = ResolvedRequirements( + cpus=1, memory_MB=1, disk_GB=1, client_group="verysmallclientgroup" + ) + reqs2 = ResolvedRequirements(**req_args) + jrr.resolve_requirements.side_effect = [reqs1, reqs2] + + _set_up_common_return_values_batch(mocks) # set up the class to be tested and run the method rj = EE2RunJob(sdkmr) + inc_reqs = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=1, + ignore_concurrency_limits="righty ho, luv", + debug_mode="true", + merge_with={ + "bill_to_user": _OTHER_USER, + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + }, + ) params = [ { "method": _METHOD_1, @@ -787,6 +1045,7 @@ def test_run_batch_as_admin(): "method": _METHOD_2, "app_id": _APP_2, "wsid": wsid, + "job_requirements": inc_reqs, }, ] assert rj.run_batch(params, {}, as_admin=True) == { @@ -796,6 +1055,15 @@ def test_run_batch_as_admin(): # check mocks called as expected. The order here is the order that they're called in the code. 
sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) + jrr.normalize_job_reqs.assert_has_calls( + [call({}, "input job"), call(inc_reqs, "input job")] + ) + jrr.get_requirements_type.assert_has_calls( + [call(**_EMPTY_JOB_REQUIREMENTS), call(**req_args)] + ) + jrr.resolve_requirements.assert_has_calls( + [call(_METHOD_1, **_EMPTY_JOB_REQUIREMENTS), call(_METHOD_2, **req_args)] + ) _check_common_mock_calls_batch(mocks, reqs1, reqs2, None, wsid) @@ -818,7 +1086,146 @@ def test_run_batch_fail_params_not_list(): ) -def test_run_batch_fail_parent_id_included(): +# Note the next few tests are specifically testing that errors for multiple jobs have the +# correct job number + + +def test_run_job_batch_fail_illegal_arguments(): + """ + Test that illegal arguments cause the job to fail. Note that not all arguments are + checked - this test checks arguments that are checked in the _check_job_arguments() + method. Furthermore, most argument checking occurs in the job submission parameters + class and its respective composed classes, and we don't reproduce all the error conditions + possible - just enough to ensure the error checking occurs. If major changes are made to + the error checking code then more tests may need to be written. 
+ + """ + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.resolve_requirements.return_value = ResolvedRequirements(1, 1, 1, "cg") + rj = EE2RunJob(mocks[SDKMethodRunner]) + job = {"method": "foo.bar"} + + _run_batch_fail( + rj, + [job, job, {}], + {}, + True, + IncorrectParamsException("Job #3: Missing input parameter: method ID"), + ) + _run_batch_fail( + rj, + [job, {"method": "foo.bar", "wsid": 0}], + {}, + True, + IncorrectParamsException("Job #2: wsid must be at least 1"), + ) + _run_batch_fail( + rj, + [{"method": "foo.bar", "source_ws_objects": {"a": "b"}}, job], + {}, + True, + IncorrectParamsException("Job #1: source_ws_objects must be a list"), + ) + _run_batch_fail( + rj, + [job, {"method": "foo.bar", "job_requirements": ["10 bob", "a pickled egg"]}], + {}, + True, + IncorrectParamsException("Job #2: job_requirements must be a mapping"), + ) + _run_batch_fail( + rj, + [{"method": "foo.bar", "job_requirements": {"bill_to_user": 1}}, job], + {}, + True, + IncorrectParamsException("Job #1: bill_to_user must be a string"), + ) + + +def test_run_job_batch_fail_arg_normalization(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + e = "Found illegal request_cpus 'like 10 I guess? IDK' in job requirements from input job" + jrr.normalize_job_reqs.side_effect = [{}, IncorrectParamsException(e)] + _run_batch_fail( + EE2RunJob(mocks[SDKMethodRunner]), + [ + {"method": "foo.bar"}, + { + "method": "foo.bar", + "job_requirements": {"request_cpus": "like 10 I guess? 
IDK"}, + }, + ], + {}, + True, + IncorrectParamsException("Job #2: " + e), + ) + + +def test_run_job_batch_fail_get_requirements_type(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + e = "bill_to_user contains control characters" + jrr.get_requirements_type.side_effect = [ + RequirementsType.STANDARD, + RequirementsType.STANDARD, + IncorrectParamsException(e), + ] + _run_batch_fail( + EE2RunJob(mocks[SDKMethodRunner]), + [ + {"method": "foo.bar"}, + {"method": "foo.bar"}, + {"method": "foo.bar", "job_requirements": {"bill_to_user": "ding\bding"}}, + ], + {}, + False, + IncorrectParamsException("Job #3: " + e), + ) + + +def test_run_job_batch_fail_not_admin_with_job_reqs(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.side_effect = [ + RequirementsType.PROCESSING, + RequirementsType.STANDARD, + ] + _run_batch_fail( + EE2RunJob(mocks[SDKMethodRunner]), + [ + {"method": "foo.bar", "job_requirements": {"ignore_concurrency_limits": 1}}, + {"method": "foo.bar"}, + ], + {}, + False, + AuthError( + "Job #1: In order to specify job requirements you must be a full admin" + ), + ) + + +def test_run_job_batch_fail_resolve_requirements(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.return_value = RequirementsType.STANDARD + e = "Unrecognized method: 'None'. 
Please input module_name.function_name" + jr = ResolvedRequirements(cpus=4, memory_MB=4, disk_GB=4, client_group="cg") + jrr.resolve_requirements.side_effect = [jr, IncorrectParamsException(e)] + _run_batch_fail( + EE2RunJob(mocks[SDKMethodRunner]), + [{}, {"method": "foo.bar"}], + {}, + False, + IncorrectParamsException("Job #2: " + e), + ) + + +def test_run_job_batch_fail_parent_id_included(): mocks = _set_up_mocks(_USER, _TOKEN) sdkmr = mocks[SDKMethodRunner] rj = EE2RunJob(sdkmr) @@ -828,7 +1235,7 @@ def test_run_batch_fail_parent_id_included(): [{"method": "foo.bar", "app_id": "foo/bat", "parent_job_id": "a"}], {}, True, - IncorrectParamsException("Batch jobs may not specify a parent job ID"), + IncorrectParamsException("batch jobs may not specify a parent job ID"), ) _run_batch_fail( diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index 1da33045d..111badf4e 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -22,7 +22,10 @@ from execution_engine2.utils.Condor import Condor from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient -from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.utils.job_requirements_resolver import ( + JobRequirementsResolver, + RequirementsType, +) from lib.execution_engine2.db.models.models import Job, Status, TerminatedCode from execution_engine2.exceptions import AuthError from lib.execution_engine2.exceptions import InvalidStatusTransitionException @@ -914,6 +917,7 @@ def test_check_jobs_date_range(self, condor_mock): ) runner.workspace_auth = MagicMock() runner.get_job_requirements_resolver = MagicMock(return_value=resolver) + resolver.get_requirements_type.return_value = RequirementsType.STANDARD resolver.resolve_requirements.return_value = JobRequirements( cpus=1, memory_MB=100, From 
0d14664dffc2be98c0808c397fbc85501a194bde Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Thu, 6 May 2021 15:22:34 -0700 Subject: [PATCH 067/109] DATAUP-389 - add as_admin to run_job and run_job_batch API, update spec (#382) * Add most integration tests for job reqs Still need a batch test with requirements * Add run_batch_jobs integration test with job reqs * Update release notes, minor clarifications * run black * Add read admin failure tests Ensures that ee2 read admins can't run jobs requesting special resources * run black --- RELEASE_NOTES.md | 2 + execution_engine2.html | 2 +- execution_engine2.spec | 61 +- .../execution_engine2Impl.py | 645 ++++++++++++++---- test/tests_for_integration/api_to_db_test.py | 450 +++++++++++- 5 files changed, 993 insertions(+), 167 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 29e30ad61..be8674d6a 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -3,6 +3,8 @@ ## 0.0.5 * Fix a bug that caused job requirements from the catalog in CSV format to be ignored other than the client group + * Full EE2 admins can now submit job requirements when running jobs via run_job_batch and + run_job. See the SDK spec for details. ## 0.0.4 * Fix up tests diff --git a/execution_engine2.html b/execution_engine2.html index d73d9ae5e..b1456fbff 100644 --- a/execution_engine2.html +++ b/execution_engine2.html @@ -1 +1 @@ -execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*params - the parameters to pass to the method.
*
*Optional parameters:
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*Submitting a job with a parent ID to run_job_batch will cause an error to be
*returned.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
}
RunJobParams;

/*
*Start a new job (a long-running method of a service registered in the ServiceRegistry).
*Such a job runs the Docker image for this service in script mode.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

typedefstructure{
intwsid;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optional as_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optional ts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@success Whether or not the add operation was successful
*@line_number the line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to record normally not allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set true, job will set to running status skipping estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_jobstate - state of parent job
*child_jobstates - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with terminated_code 0.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/*
*
*
*/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdefcheck_jobs_date_range_for_user(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

funcdefcheck_jobs_date_range_for_all(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

typedefstructure{
UnspecifiedObjectheld_job;
}
HeldJob;

/*
*Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
*/
funcdefhandle_held_job(stringcluster_id)returns(HeldJob)authenticationrequired;

/*
*Check if current user has ee2 admin rights.
*/
funcdefis_admin()returns(boolean)authenticationrequired;

/*
*str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
*/
typedefstructure{
stringpermission;
}
AdminRolesResults;

/*
*Check if current user has ee2 admin rights.
*If so, return the type of rights and their roles
*/
funcdefget_admin_permission()returns(AdminRolesResults)authenticationrequired;

/*
*Get a list of clientgroups manually extracted from the config file
*/
funcdefget_client_groups()returns(list<string>client_groups)authenticationnone;
};
\ No newline at end of file +execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*Job requirements for a job. All fields are optional. To submit job requirements,
*the user must have full EE2 admin permissions. Ignored for the run concierge endpoint.
*
*request_cpus: the number of CPUs to request for the job.
*request_memory: the amount of memory, in MB, to request for the job.
*request_disk: the amount of disk space, in GB, to request for the job.
*client_group: the name of the client group on which to run the job.
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*bill_to_user: the job will be counted against the provided user's fair share quota.
*ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false.
*scheduler_requirements: arbitrary key-value pairs to be provided to the job
*scheduler. Requires knowledge of the scheduler interface.
*debug_mode: Whether to run the job in debug mode. Default false.
*/
typedefstructure{
intrequest_cpus;
intrequst_memory;
intrequest_disk;
stringclient_group;
booleanclient_group_regex;
stringbill_to_user;
booleanignore_concurrency_limits;
mapping<string,string>scheduler_requirements;
booleandebug_mode;
}
JobRequirements;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*params - the parameters to pass to the method.
*
*Optional parameters:
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*Submitting a job with a parent ID to run_job_batch will cause an error to be
*returned.
*job_requirements: the requirements for the job. The user must have full EE2
*administration rights to use this parameter. Note that the job_requirements
*are not returned along with the rest of the job parameters when querying the EE2
*API - they are only considered when submitting a job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*Note that this field is not included in returned data when querying EE2.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
JobRequirementsjob_requirements;
booleanas_admin;
}
RunJobParams;

/*
*Start a new job (long running method registered in the Catalog).
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*Additional parameters for a batch job.
*wsid: the workspace with which to associate the parent job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*/
typedefstructure{
intwsid;
booleanas_admin;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

/*
*Run a batch job, consisting of a parent job and one or more child jobs.
*Note that the as_admin parameters in the list of child jobs are ignored -
*only the as_admin parameter in the batch_params is considered.
*/
funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to record normally not allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set true, job will set to running status skipping estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_jobstate - state of parent job
*child_jobstates - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with terminated_code 0.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/*
*
*
*/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

/*
 * Query jobs within a date/time range.
 * Takes a CheckJobsDateRangeParams structure and returns a
 * CheckJobsDateRangeResults structure. Authentication is required.
 * NOTE(review): presumably restricted to the jobs of a single user (the `user`
 * field of the params, defaulting to the current user) - confirm against the
 * implementation.
 */
funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params)
    returns (CheckJobsDateRangeResults) authentication required;

/*
 * Query jobs within a date/time range.
 * Takes a CheckJobsDateRangeParams structure and returns a
 * CheckJobsDateRangeResults structure. Authentication is required.
 * NOTE(review): presumably the all-users (admin) variant of the date range
 * query - confirm against the implementation.
 */
funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params)
    returns (CheckJobsDateRangeResults) authentication required;

/*
 * Result of handle_held_job.
 *
 * held_job - an unspecified object. NOTE(review): presumably describes the held
 * job that was handled; its contents are not documented here - confirm against
 * the implementation.
 */
typedef structure {
    UnspecifiedObject held_job;
} HeldJob;

/*
 * Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
 *
 * string cluster_id - NOTE(review): presumably the CONDOR cluster ID of the held
 * job - confirm against the implementation.
 * Returns a HeldJob structure. Authentication is required.
 */
funcdef handle_held_job(string cluster_id) returns (HeldJob) authentication required;

/*
 * Check if current user has ee2 admin rights.
 * Takes no parameters and returns a boolean. Authentication is required.
 */
funcdef is_admin() returns (boolean) authentication required;

/*
 * str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
 */
typedef structure {
    string permission;
} AdminRolesResults;

/*
 * Check if current user has ee2 admin rights.
 * If so, return the type of rights and their roles
 * Returns an AdminRolesResults structure. Authentication is required.
 */
funcdef get_admin_permission() returns (AdminRolesResults) authentication required;

/*
 * Get a list of clientgroups manually extracted from the config file
 * Returns the client groups as a list of strings. No authentication is needed.
 */
funcdef get_client_groups() returns (list<string> client_groups) authentication none;
};
\ No newline at end of file diff --git a/execution_engine2.spec b/execution_engine2.spec index 35848f4dd..36941ddd9 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -86,6 +86,36 @@ string cell_id; } Meta; + /* Job requirements for a job. All fields are optional. To submit job requirements, + the user must have full EE2 admin permissions. Ignored for the run concierge endpoint. + + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the job. + request_disk: the amount of disk space, in GB, to request for the job. + client_group: the name of the client group on which to run the job. + client_group_regex: Whether to treat the client group string, whether provided here, + from the catalog, or as a default, as a regular expression when matching + clientgroups. Default True for HTC, but the default depends on the scheduler. + Omit to use the default. + bill_to_user: the job will be counted against the provided user's fair share quota. + ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided to the job + scheduler. Requires knowledge of the scheduler interface. + debug_mode: Whether to run the job in debug mode. Default false. + + */ + typedef structure { + int request_cpus; + int requst_memory; + int request_disk; + string client_group; + boolean client_group_regex; + string bill_to_user; + boolean ignore_concurrency_limits; + mapping scheduler_requirements; + boolean debug_mode; + } JobRequirements; + /* method - the SDK method to run in module.method format, e.g. 'KBaseTrees.construct_species_tree' @@ -111,6 +141,14 @@ record is not altered. Submitting a job with a parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have full EE2 + administration rights to use this parameter. 
Note that the job_requirements + are not returned along with the rest of the job parameters when querying the EE2 + API - they are only considered when submitting a job. + as_admin: run the job with full EE2 permissions, meaning that any supplied workspace + IDs are not checked for accessibility and job_requirements may be supplied. The + user must have full EE2 administration rights. + Note that this field is not included in returned data when querying EE2. */ typedef structure { @@ -122,16 +160,24 @@ Meta meta; int wsid; string parent_job_id; + JobRequirements job_requirements; + boolean as_admin; } RunJobParams; /* - Start a new job (long running method of service registered in ServiceRegistery). - Such job runs Docker image for this service in script mode. + Start a new job. */ funcdef run_job(RunJobParams params) returns (job_id job_id) authentication required; + /* Additional parameters for a batch job. + wsid: the workspace with which to associate the parent job. + as_admin: run the job with full EE2 permissions, meaning that any supplied workspace + IDs are not checked for accessibility and job_requirements may be supplied. The + user must have full EE2 administration rights. + */ typedef structure { int wsid; + boolean as_admin; } BatchParams; typedef structure { @@ -145,10 +191,15 @@ boolean as_admin; } AbandonChildren; + /* Run a batch job, consisting of a parent job and one or more child jobs. + Note that the as_admin parameters in the list of child jobs are ignored - + only the as_admin parameter in the batch_params is considered. 
+ */ + funcdef run_job_batch(list params, BatchParams batch_params) + returns (BatchSubmission job_ids) authentication required; - funcdef run_job_batch(list params, BatchParams batch_params) returns (BatchSubmission job_ids) authentication required; - - funcdef abandon_children(AbandonChildren params) returns (BatchSubmission parent_and_child_ids) authentication required; + funcdef abandon_children(AbandonChildren params) + returns (BatchSubmission parent_and_child_ids) authentication required; /* EE2Constants Concierge Params are diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 2efb08ba5..d9c3f249e 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -9,7 +9,7 @@ from execution_engine2.utils.APIHelpers import GenerateFromConfig from execution_engine2.utils.clients import get_client_set - +_AS_ADMIN = "as_admin" #END_HEADER @@ -30,7 +30,7 @@ class execution_engine2: ######################################### noqa VERSION = "0.0.5" GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" - GIT_COMMIT_HASH = "ba016db2ffabc0fa48f79559816cf0f115c00feb" + GIT_COMMIT_HASH = "c5468aee40dbe0a3f557e38ee78d963b592f0a5d" #BEGIN_CLASS_HEADER MONGO_COLLECTION = "jobs" @@ -178,8 +178,7 @@ def status(self, ctx): def run_job(self, ctx, params): """ - Start a new job (long running method of service registered in ServiceRegistery). - Such job runs Docker image for this service in script mode. + Start a new job (long running method registered in the Catalog). :param params: instance of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id of the @@ -199,7 +198,16 @@ def run_job(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. 
Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -213,7 +221,33 @@ def run_job(self, ctx, params): which the job was run.) -> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. 
Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) :returns: instance of type "job_id" (A job id.) """ # ctx is the context object @@ -223,9 +257,9 @@ def run_job(self, ctx, params): user_clients=self.gen_cfg.get_user_clients(ctx), clients = self.clients, job_permission_cache=self.job_permission_cache, - admin_permissions_cache=self.admin_permissions_cache + admin_permissions_cache=self.admin_permissions_cache, ) - job_id = mr.run_job(params) + job_id = mr.run_job(params, as_admin=bool(params.get(_AS_ADMIN))) #END run_job # At some point might do deeper type checking... @@ -237,6 +271,9 @@ def run_job(self, ctx, params): def run_job_batch(self, ctx, params, batch_params): """ + Run a batch job, consisting of a parent job and one or more child jobs. + Note that the as_admin parameters in the list of child jobs are ignored - + only the as_admin parameter in the batch_params is considered. :param params: instance of list of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. 
'KBaseTrees.construct_species_tree' app_id - the id of the @@ -256,7 +293,16 @@ def run_job_batch(self, ctx, params, batch_params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -270,9 +316,41 @@ def run_job_batch(self, ctx, params, batch_params): which the job was run.) -> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String - :param batch_params: instance of type "BatchParams" -> structure: - parameter "wsid" of Long + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. 
+ request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) + :param batch_params: instance of type "BatchParams" (Additional + parameters for a batch job. wsid: the workspace with which to + associate the parent job. as_admin: run the job with full EE2 + permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights.) 
-> structure: + parameter "wsid" of Long, parameter "as_admin" of type "boolean" + (@range [0,1]) :returns: instance of type "BatchSubmission" -> structure: parameter "parent_job_id" of type "job_id" (A job id.), parameter "child_job_ids" of list of type "job_id" (A job id.) @@ -286,7 +364,8 @@ def run_job_batch(self, ctx, params, batch_params): job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache ) - job_ids = mr.run_job_batch(params, batch_params) + job_ids = mr.run_job_batch( + params, batch_params, as_admin=bool(batch_params.get(_AS_ADMIN))) #END run_job_batch # At some point might do deeper type checking... @@ -348,7 +427,16 @@ def run_job_concierge(self, ctx, params, concierge_params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -362,7 +450,33 @@ def run_job_concierge(self, ctx, params, concierge_params): which the job was run.) 
-> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) :param concierge_params: instance of type "ConciergeParams" (EE2Constants Concierge Params are request_cpus: int request_memory: int in MB request_disk: int in GB job_priority: @@ -430,7 +544,16 @@ def get_job_params(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -444,7 +567,33 @@ def get_job_params(self, ctx, params): which the job was run.) 
-> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) """ # ctx is the context object # return variables are: params @@ -706,7 +855,16 @@ def check_job(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -720,14 +878,40 @@ def check_job(self, ctx, params): which the job was run.) 
-> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String, parameter "created" of Long, parameter - "queued" of Long, parameter "estimating" of Long, parameter - "running" of Long, parameter "finished" of Long, parameter - "updated" of Long, parameter "error" of type "JsonRpcError" (Error - block of JSON RPC response) -> structure: parameter "name" of - String, parameter "code" of Long, parameter "message" of String, - parameter "error" of String, parameter "error_code" of Long, - parameter "errormsg" of String, parameter "terminated_code" of Long + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ # ctx is the context object # return variables are: job_state @@ -814,7 +998,16 @@ def check_job_batch(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. 
Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -828,52 +1021,77 @@ def check_job_batch(self, ctx, params): which the job was run.) -> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String, parameter "created" of Long, parameter - "queued" of Long, parameter "estimating" of Long, parameter - "running" of Long, parameter "finished" of Long, parameter - "updated" of Long, parameter "error" of type "JsonRpcError" (Error - block of JSON RPC response) -> structure: parameter "name" of - String, parameter "code" of Long, parameter "message" of String, - parameter "error" of String, parameter "error_code" of Long, - parameter "errormsg" of String, parameter "terminated_code" of - Long, parameter "child_jobstates" of list of type "JobState" - (job_id - string - id of the job user - string - user who started - the job wsid - int - optional id of the workspace where the job is - bound authstrat - string - what strategy used to authenticate the - job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in - milliseconds when the job was created finished - int - timestamp - since epoch in milliseconds when the job was finished status - - string - status of the job. 
one of the following: created - job - has been created in the service estimating - an estimation job is - running to estimate resources required for the main job, and which - queue should be used queued - job is queued to be run running - - job is running on a worker node completed - job was completed - successfully error - job is no longer running, but failed with an - error terminated - job is no longer running, terminated either due - to user cancellation, admin cancellation, or some automated task - error_code - int - internal reason why the job is an error. one of - the following: 0 - unknown 1 - job crashed 2 - job terminated by - automation 3 - job ran over time limit 4 - job was missing its - automated output document 5 - job authentication token expired - errormsg - string - message (e.g. stacktrace) accompanying an - errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the - Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. 
+ request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter + "child_jobstates" of list of type 
"JobState" (job_id - string - id + of the job user - string - user who started the job wsid - int - + optional id of the workspace where the job is bound authstrat - + string - what strategy used to authenticate the job job_input - + object - inputs to the job (from the run_job call) ## TODO - + verify updated - int - timestamp since epoch in milliseconds of + the last time the status was updated running - int - timestamp + since epoch in milliseconds of when it entered the running state + created - int - timestamp since epoch in milliseconds when the job + was created finished - int - timestamp since epoch in milliseconds + when the job was finished status - string - status of the job. one + of the following: created - job has been created in the service + estimating - an estimation job is running to estimate resources + required for the main job, and which queue should be used queued - + job is queued to be run running - job is running on a worker node + completed - job was completed successfully error - job is no + longer running, but failed with an error terminated - job is no + longer running, terminated either due to user cancellation, admin + cancellation, or some automated task error_code - int - internal + reason why the job is an error. one of the following: 0 - unknown + 1 - job crashed 2 - job terminated by automation 3 - job ran over + time limit 4 - job was missing its automated output document 5 - + job authentication token expired errormsg - string - message (e.g. 
+ stacktrace) accompanying an errored job error - object - the + JSON-RPC error package that accompanies the error code and message + terminated_code - int - internal reason why a job was terminated, + one of: 0 - user cancellation 1 - admin cancellation 2 - + terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id + of the Narrative application (UI) running this job (e.g. + repo/name) params - the parameters to pass to the method. Optional parameters: service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve @@ -888,7 +1106,16 @@ def check_job_batch(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. 
+ The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -902,14 +1129,40 @@ def check_job_batch(self, ctx, params): which the job was run.) -> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String, parameter "created" of Long, parameter - "queued" of Long, parameter "estimating" of Long, parameter - "running" of Long, parameter "finished" of Long, parameter - "updated" of Long, parameter "error" of type "JsonRpcError" (Error - block of JSON RPC response) -> structure: parameter "name" of - String, parameter "code" of Long, parameter "message" of String, - parameter "error" of String, parameter "error_code" of Long, - parameter "errormsg" of String, parameter "terminated_code" of Long + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. 
ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ # ctx is the context object # return variables are: returnVal @@ -994,7 +1247,16 @@ def check_jobs(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. 
Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -1008,14 +1270,40 @@ def check_jobs(self, ctx, params): which the job was run.) -> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String, parameter "created" of Long, parameter - "queued" of Long, parameter "estimating" of Long, parameter - "running" of Long, parameter "finished" of Long, parameter - "updated" of Long, parameter "error" of type "JsonRpcError" (Error - block of JSON RPC response) -> structure: parameter "name" of - String, parameter "code" of Long, parameter "message" of String, - parameter "error" of String, parameter "error_code" of Long, - parameter "errormsg" of String, parameter "terminated_code" of Long + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. 
client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ # ctx is the context object # return variables are: returnVal @@ -1103,7 +1391,16 @@ def check_workspace_jobs(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. 
Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -1117,14 +1414,40 @@ def check_workspace_jobs(self, ctx, params): which the job was run.) -> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String, parameter "created" of Long, parameter - "queued" of Long, parameter "estimating" of Long, parameter - "running" of Long, parameter "finished" of Long, parameter - "updated" of Long, parameter "error" of type "JsonRpcError" (Error - block of JSON RPC response) -> structure: parameter "name" of - String, parameter "code" of Long, parameter "message" of String, - parameter "error" of String, parameter "error_code" of Long, - parameter "errormsg" of String, parameter "terminated_code" of Long + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. 
+ request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ # ctx is the context object # return variables are: 
returnVal @@ -1356,7 +1679,16 @@ def check_jobs_date_range_for_user(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -1370,18 +1702,44 @@ def check_jobs_date_range_for_user(self, ctx, params): which the job was run.) 
-> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String, parameter "created" of Long, parameter - "queued" of Long, parameter "estimating" of Long, parameter - "running" of Long, parameter "finished" of Long, parameter - "updated" of Long, parameter "error" of type "JsonRpcError" (Error - block of JSON RPC response) -> structure: parameter "name" of - String, parameter "code" of Long, parameter "message" of String, - parameter "error" of String, parameter "error_code" of Long, - parameter "errormsg" of String, parameter "terminated_code" of - Long, parameter "count" of Long, parameter "query_count" of Long, - parameter "filter" of mapping from String to String, parameter - "skip" of Long, parameter "projection" of list of String, - parameter "limit" of Long, parameter "sort_order" of String + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. 
+ scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "count" of + Long, parameter "query_count" of Long, parameter "filter" of + mapping from String to String, parameter "skip" of Long, parameter + "projection" of list of String, parameter "limit" of Long, + parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal @@ -1524,7 +1882,16 @@ def check_jobs_date_range_for_all(self, ctx, params): specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned.) -> + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. 
The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> structure: parameter "method" of String, parameter "app_id" of String, parameter "params" of list of unspecified object, parameter "service_ver" of String, parameter "source_ws_objects" @@ -1538,18 +1905,44 @@ def check_jobs_date_range_for_all(self, ctx, params): which the job was run.) -> structure: parameter "run_id" of String, parameter "token_id" of String, parameter "tag" of String, parameter "cell_id" of String, parameter "wsid" of Long, parameter - "parent_job_id" of String, parameter "created" of Long, parameter - "queued" of Long, parameter "estimating" of Long, parameter - "running" of Long, parameter "finished" of Long, parameter - "updated" of Long, parameter "error" of type "JsonRpcError" (Error - block of JSON RPC response) -> structure: parameter "name" of - String, parameter "code" of Long, parameter "message" of String, - parameter "error" of String, parameter "error_code" of Long, - parameter "errormsg" of String, parameter "terminated_code" of - Long, parameter "count" of Long, parameter "query_count" of Long, - parameter "filter" of mapping from String to String, parameter - "skip" of Long, parameter "projection" of list of String, - parameter "limit" of Long, parameter "sort_order" of String + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. 
Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge ot the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter 
"count" of + Long, parameter "query_count" of Long, parameter "filter" of + mapping from String to String, parameter "skip" of Long, parameter + "projection" of list of String, parameter "limit" of Long, + parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index b3e4b4c91..149e97e8f 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -418,7 +418,7 @@ def _set_up_workspace_objects(ws_controller, token, ws_name="foo"): ) -def _get_run_job_param_set(app_id=_APP): +def _get_run_job_param_set(app_id=_APP, job_reqs=None, as_admin=False): return { "method": _MOD, "app_id": app_id, @@ -427,6 +427,8 @@ def _get_run_job_param_set(app_id=_APP): "params": [{"foo": "bar"}, 42], "service_ver": "beta", "parent_job_id": "totallywrongid", + "job_requirements": job_reqs, + "as_admin": as_admin, "meta": { "run_id": "rid", "token_id": "tid", @@ -536,15 +538,120 @@ def _check_mongo_job( def test_run_job_no_app_id(ee2_port, ws_controller, mongo_client): - _run_job(ee2_port, ws_controller, mongo_client, None, None) + _run_job( + ee2_port, + ws_controller, + mongo_client, + catalog_return=[{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + ) def test_run_job_with_app_id(ee2_port, ws_controller, mongo_client): - _run_job(ee2_port, ws_controller, mongo_client, "mod/app", "mod") + _run_job( + ee2_port, + ws_controller, + mongo_client, + app_id="mod/app", + app_mod="mod", + catalog_return=[{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + ) -def _run_job(ee2_port, ws_controller, mongo_client, app_id, app_mod): - _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) +def test_run_job_with_job_requirements_full(ee2_port, ws_controller, mongo_client): + """ + Tests running a job where all requirements are specified on input. 
+ """ + + def modify_sub(sub): + del sub["Concurrency_Limits"] + sub["requirements"] = ( + '(CLIENTGROUP == "extreme") && (after == "pantsremoval") && ' + + '(beforemy == "2pmsalonappt")' + ) + sub["+AccountingGroup"] = '"borishesgoodforit"' + sub["environment"] = sub["environment"].replace( + "DEBUG_MODE=False", "DEBUG_MODE=True" + ) + + _run_job( + ee2_port, + ws_controller, + mongo_client, + job_reqs={ + "request_cpus": 21, + "request_memory": 34, + "request_disk": 99, + "client_group": "extreme", + "client_group_regex": 0, + "bill_to_user": "borishesgoodforit", + "ignore_concurrency_limits": "true", + "scheduler_requirements": { + "beforemy": "2pmsalonappt", + "after": "pantsremoval", + }, + "debug_mode": True, + }, + modify_sub=modify_sub, + clientgroup="extreme", + cpu=21, + mem=34, + disk=99, + catalog_return=[ + { + "client_groups": [ + '{"client_group":"njs","request_cpus":8,"request_memory":5}' + ] + } + ], + as_admin=7, # truthy + user=USER_WRITE_ADMIN, + token=TOKEN_WRITE_ADMIN, + ) + + +def test_run_job_with_job_requirements_mixed(ee2_port, ws_controller, mongo_client): + """ + Tests running a job where requirements are specified on input, from the catalog, and from + the deploy.cfg file. 
+ """ + _run_job( + ee2_port, + ws_controller, + mongo_client, + job_reqs={"request_cpus": 9}, + clientgroup="njs", + cpu=9, + mem=5, + disk=30, + catalog_return=[{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + as_admin="wheee", # truthy + user=USER_WRITE_ADMIN, + token=TOKEN_WRITE_ADMIN, + ) + + +def _run_job( + ee2_port, + ws_controller, + mongo_client, + app_id=None, + app_mod=None, + job_reqs=None, + modify_sub=lambda x: x, + clientgroup="njs", + cpu=8, + mem=5, + disk=30, + catalog_return=None, + as_admin=False, + user=None, + token=None, +): + # values in the method sig are set at the time of method creation, at which time the + # user and token fields haven't yet been set by the fixtures + user = user if user else USER_NO_ADMIN + token = token if token else TOKEN_NO_ADMIN + _set_up_workspace_objects(ws_controller, token) # need to get the mock objects first so spec_set can do its magic before we mock out # the classes in the context manager sub, schedd, txn = _get_htc_mocks() @@ -559,14 +666,12 @@ def _run_job(ee2_port, ws_controller, mongo_client, app_id, app_mod): # set up the rest of the mocks _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) sub.queue.return_value = 123 - list_cgroups.return_value = [ - {"client_groups": ['{"request_cpus":8,"request_memory":5}']} - ] + list_cgroups.return_value = catalog_return or [] get_mod_ver.return_value = {"git_commit_hash": "somehash"} # run the method - ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) - params = _get_run_job_param_set(app_id) + ee2 = ee2client(f"http://localhost:{ee2_port}", token=token) + params = _get_run_job_param_set(app_id, job_reqs, as_admin) job_id = ee2.run_job(params) # check that mocks were called correctly @@ -580,30 +685,45 @@ def _run_job(ee2_port, ws_controller, mongo_client, app_id, app_mod): expected_sub = _get_condor_sub_for_rj_param_set( job_id, - USER_NO_ADMIN, - TOKEN_NO_ADMIN, - clientgroup="njs", - cpu=8, - mem=5, - disk=30, + 
user, + token, + clientgroup=clientgroup, + cpu=cpu, + mem=mem, + disk=disk, app_id=app_id, app_module=app_mod, ) + modify_sub(expected_sub) _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub) _check_mongo_job( mongo_client, job_id, - USER_NO_ADMIN, + user, app_id, - clientgroup="njs", - cpu=8, - mem=5, - disk=30, + clientgroup=clientgroup, + cpu=cpu, + mem=mem, + disk=disk, githash="somehash", ) +def test_run_job_fail_not_admin(ee2_port): + params = {"method": _MOD, "as_admin": 1} + err = "Access Denied: You are not an administrator. AdminPermissions.NONE" + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_only_read_admin(ee2_port): + params = {"method": _MOD, "as_admin": 1} + err = ( + "Access Denied: You are a read-only admin. This function requires write access" + ) + _run_job_fail(ee2_port, TOKEN_READ_ADMIN, params, err) + + def test_run_job_fail_no_workspace_access(ee2_port): params = {"method": _MOD, "wsid": 1} # this error could probably use some cleanup @@ -614,6 +734,29 @@ def test_run_job_fail_no_workspace_access(ee2_port): _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) +def test_run_job_fail_bad_cpu(ee2_port): + params = {"method": _MOD, "job_requirements": {"request_cpus": -10}} + err = "CPU count must be at least 1" + _run_job_fail(ee2_port, TOKEN_WRITE_ADMIN, params, err) + + +def test_run_job_fail_bad_scheduler_requirements(ee2_port): + params = { + "method": _MOD, + "job_requirements": {"scheduler_requirements": {"foo": ""}}, + } + # TODO non-string keys/values in schd_reqs causes a not-very-useful error + # Since it's admin only don't worry about it for now + err = "Missing input parameter: value for key 'foo' in scheduler requirements structure" + _run_job_fail(ee2_port, TOKEN_WRITE_ADMIN, params, err) + + +def test_run_job_fail_job_reqs_but_no_as_admin(ee2_port): + params = {"method": _MOD, "job_requirements": {"request_cpus": 10}} + err = "In order to specify job requirements you must be 
a full admin" + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + def test_run_job_fail_bad_method(ee2_port): params = {"method": "mod.meth.moke"} err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" @@ -1191,21 +1334,222 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "+KB_SOURCE_WS_OBJECTS": "", } ) + _check_batch_htc_calls( + sub_init, schedd_init, sub, schedd, txn, expected_sub_1, expected_sub_2 + ) + + +def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_client): + """ + A test of the run_job method focusing on job requirements and minimizing all other inputs. + Since the batch endpoint uses the same code path as the single job endpoint for processing + job requirements, we only have a single test that mixes job requirements from the input, + catalog, and deploy configuration, as opposed to the multiple tests for single jobs. + """ + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "foo") # ws 1 + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? 
+ with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.side_effect = [123, 456] + list_cgroups.side_effect = [ + [{"client_groups": ['{"client_group":"bigmem"}']}], + [{"client_groups": ['{"request_disk":8,"request_memory":5}']}], + ] + get_mod_ver.side_effect = [ + {"git_commit_hash": "somehash"}, + {"git_commit_hash": "somehash2"}, + ] - assert sub_init.call_args_list == [call(expected_sub_1), call(expected_sub_2)] - # The line above and the line below should be completely equivalent IIUC, but the line - # below fails for reasons I don't understand. The error output shows the actual calls - # for the line below having 2 extra calls that appear to be the sub.queue calls - # below. Stumped, so going with what works and moving on. - # sub_init.assert_has_calls([call(expected_sub_1), call(expected_sub_2)]) - schedd_init.call_args_list = [call(), call()] - # same deal here. Output includes stuff like `call().transaction()` so - # it appears the sub calls are being picked up, which is weird. 
- # schedd_init.assert_has_calls([call(), call()]) - schedd.transaction.call_args_list = [call(), call()] - # and again - # schedd.transaction.assert_has_calls([call(), call()]) - sub.queue.assert_has_calls([call(txn, 1), call(txn, 1)]) + # run the method + job1_params = {"method": _MOD} + job2_params = { + "method": "mod2.meth2", + "job_requirements": { + "request_memory": 42, + "client_group": "extreme", + "client_group_regex": 0, + "bill_to_user": "forrest_gump", + "ignore_concurrency_limits": "true", + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + "debug_mode": True, + }, + } + job_batch_params = {"wsid": 1, "as_admin": "foo"} + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_WRITE_ADMIN) + ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) + parent_job_id = ret["parent_job_id"] + job_id_1, job_id_2 = ret["child_job_ids"] + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "version": "release"}), + call(ANY, {"module_name": "mod2", "version": "release"}), + ] + ) + list_cgroups.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "function_name": "meth"}), + call(ANY, {"module_name": "mod2", "function_name": "meth2"}), + ] + ) + + job1 = _get_mongo_job(mongo_client, job_id_1) + job2 = _get_mongo_job(mongo_client, job_id_2) + + expected_job1 = { + "_id": ObjectId(job_id_1), + "user": USER_WRITE_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "job_input": { + "method": _MOD, + "service_ver": "somehash", + "source_ws_objects": [], + "parent_job_id": parent_job_id, + "requirements": { + "clientgroup": "bigmem", + "cpu": 4, + "memory": 2000, + "disk": 100, + }, + "narrative_cell_info": {}, + }, + "child_jobs": [], + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + } + assert job1 == expected_job1 + + expected_job2 = { + "_id": 
ObjectId(job_id_2), + "user": USER_WRITE_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "job_input": { + "method": "mod2.meth2", + "service_ver": "somehash2", + "source_ws_objects": [], + "parent_job_id": parent_job_id, + "requirements": { + "clientgroup": "extreme", + "cpu": 32, + "memory": 42, + "disk": 8, + }, + "narrative_cell_info": {}, + }, + "child_jobs": [], + "batch_job": False, + "scheduler_id": "456", + "scheduler_type": "condor", + } + assert job2 == expected_job2 + + parent_job = _get_mongo_job(mongo_client, parent_job_id, has_queued=False) + expected_parent_job = { + "_id": ObjectId(parent_job_id), + "user": USER_WRITE_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": 1, + "status": "created", + "job_input": { + "method": "batch", + "service_ver": "batch", + "app_id": "batch", + "source_ws_objects": [], + "narrative_cell_info": {}, + }, + "child_jobs": [job_id_1, job_id_2], + "batch_job": True, + } + assert parent_job == expected_parent_job + + expected_sub_1 = _get_condor_sub_for_rj_param_set( + job_id_1, + USER_WRITE_ADMIN, + TOKEN_WRITE_ADMIN, + clientgroup="bigmem", + cpu=4, + mem=2000, + disk=100, + parent_job_id=parent_job_id, + app_id=None, + app_module=None, + ) + expected_sub_1.update({"+KB_SOURCE_WS_OBJECTS": "", "+KB_WSID": ""}) + expected_sub_2 = _get_condor_sub_for_rj_param_set( + job_id_2, + USER_WRITE_ADMIN, + TOKEN_WRITE_ADMIN, + clientgroup="extreme", + cpu=32, + mem=42, + disk=8, + parent_job_id=parent_job_id, + app_id=None, + app_module=None, + ) + expected_sub_2.update( + { + "+KB_SOURCE_WS_OBJECTS": "", + "+KB_WSID": "", + "+AccountingGroup": '"forrest_gump"', + "+KB_MODULE_NAME": '"mod2"', + "+KB_FUNCTION_NAME": '"meth2"', + "requirements": '(CLIENTGROUP == "extreme") && (baz == "bat") && (foo == "bar")', + "environment": expected_sub_2["environment"].replace( + "DEBUG_MODE=False", "DEBUG_MODE=True" + ), + } + ) + del expected_sub_2["Concurrency_Limits"] + _check_batch_htc_calls( + sub_init, schedd_init, sub, 
schedd, txn, expected_sub_1, expected_sub_2 + ) + + +def _check_batch_htc_calls( + sub_init, schedd_init, sub, schedd, txn, expected_sub_1, expected_sub_2 +): + assert sub_init.call_args_list == [call(expected_sub_1), call(expected_sub_2)] + # The line above and the line below should be completely equivalent IIUC, but the line + # below fails for reasons I don't understand. The error output shows the actual calls + # for the line below having 2 extra calls that appear to be the sub.queue calls + # below. Stumped, so going with what works and moving on. + # sub_init.assert_has_calls([call(expected_sub_1), call(expected_sub_2)]) + schedd_init.call_args_list = [call(), call()] + # same deal here. Output includes stuff like `call().transaction()` so + # it appears the sub calls are being picked up, which is weird. + # schedd_init.assert_has_calls([call(), call()]) + schedd.transaction.call_args_list = [call(), call()] + # and again + # schedd.transaction.assert_has_calls([call(), call()]) + sub.queue.assert_has_calls([call(txn, 1), call(txn, 1)]) + + +def test_run_job_batch_fail_not_admin(ee2_port, ws_controller): + err = "Access Denied: You are not an administrator. AdminPermissions.NONE" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, [], {"as_admin": True}, err) + + +def test_run_job_batch_fail_only_read_admin(ee2_port, ws_controller): + err = ( + "Access Denied: You are a read-only admin. 
This function requires write access" + ) + _run_job_batch_fail(ee2_port, TOKEN_READ_ADMIN, [], {"as_admin": True}, err) def test_run_job_batch_fail_no_workspace_access_for_batch(ee2_port, ws_controller): @@ -1232,6 +1576,42 @@ def test_run_job_batch_fail_no_workspace_access_for_job(ee2_port): _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) +def test_run_job_batch_fail_bad_memory(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + {"method": _MOD}, + {"method": _MOD}, + {"method": _MOD, "job_requirements": {"request_memory": [1000]}}, + ] + err = "Job #3: Found illegal memory request '[1000]' in job requirements from input job" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_scheduler_requirements(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + {"method": _MOD, "job_requirements": {"scheduler_requirements": {"": "foo"}}}, + {"method": _MOD}, + ] + err = "Job #1: Missing input parameter: key in scheduler requirements structure" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_job_reqs_but_no_as_admin(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + {"method": _MOD}, + { + "method": _MOD, + "job_requirements": {"request_memory": 1000}, + # as_admin is only considered in the batch params for run_job_batch + "as_admin": True, + }, + ] + err = "Job #2: In order to specify job requirements you must be a full admin" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + def test_run_job_batch_fail_bad_catalog_data(ee2_port, ws_controller): _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: From 724683da94e327f97850d83cf545d1b3b5659d63 Mon Sep 17 00:00:00 2001 From: 
"dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 7 May 2021 16:16:45 -0500 Subject: [PATCH 068/109] Bump urllib3 from 1.25.3 to 1.25.8 in /test/dockerfiles/condor (#378) Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.25.3 to 1.25.8. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.25.3...1.25.8) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- test/dockerfiles/condor/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dockerfiles/condor/requirements.txt b/test/dockerfiles/condor/requirements.txt index f447095f0..42ae91a96 100644 --- a/test/dockerfiles/condor/requirements.txt +++ b/test/dockerfiles/condor/requirements.txt @@ -18,7 +18,7 @@ requests-async==0.5.0 rfc3986==1.3.2 sanic==19.6.0 ujson==1.35 -urllib3==1.25.3 +urllib3==1.25.8 uvloop==0.12.2 websockets==6.0 htcondor==8.9.2 From 46df42a6ed9fda3796cf9a68ea8088067e674936 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 7 May 2021 16:16:57 -0500 Subject: [PATCH 069/109] Bump urllib3 from 1.25.3 to 1.25.8 (#379) Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.25.3 to 1.25.8. 
- [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.25.3...1.25.8) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Pipfile | 2 +- Pipfile.lock | 8 ++++---- requirements-dev.txt | 24 ++++++++++++------------ requirements.txt | 21 +++++++++++---------- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/Pipfile b/Pipfile index d289deab1..7c7a4c470 100644 --- a/Pipfile +++ b/Pipfile @@ -65,7 +65,7 @@ toml = "==0.10.1" tqdm = "==4.42.1" typing-extensions = "==3.7.4.3" ujson = "==1.35" -urllib3 = "==1.25.3" +urllib3 = "==1.25.8" uvloop = "==0.12.2" websockets = "==6.0" yarl = "==1.5.1" diff --git a/Pipfile.lock b/Pipfile.lock index 7fc1df1ba..87868167e 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "5583d70e51897cfd1749f9b1dacbe8441656f04501f77e8ff5119ffbdd7b2dbd" + "sha256": "3b864de8c1c9f041d32e49e08edc75d786257e2df50100b083d1c4deb9009f19" }, "pipfile-spec": 6, "requires": { @@ -823,11 +823,11 @@ }, "urllib3": { "hashes": [ - "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", - "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" + "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", + "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" ], "index": "pypi", - "version": "==1.25.3" + "version": "==1.25.8" }, "uvloop": { "hashes": [ diff --git a/requirements-dev.txt b/requirements-dev.txt index e71cab6ae..f211aac16 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -wheel +-i https://pypi.org/simple/ aiofiles==0.4.0 aiohttp==3.6.3 asn1crypto==1.3.0 @@ -28,35 +28,35 @@ hyperframe==5.2.0 idna==2.8 importlib-metadata==2.0.0 iniconfig==1.1.1 -Jinja2==2.10.3 -JSONRPCBase==0.2.0 +jinja2==2.10.3 
+jsonrpcbase==0.2.0 maps==5.1.1 -MarkupSafe==1.1.1 +markupsafe==1.1.1 memory-profiler==0.55.0 mock==3.0.5 -mongoengine==0.18.2 +mongoengine==0.23.0 multidict==4.5.2 nose==1.3.7 -packaging==20.4 +packaging==20.9 pluggy==0.13.1 psutil==5.6.6 py==1.9.0 pycosat==0.6.3 pycparser==2.19 pymongo==3.8.0 -pyOpenSSL==19.1.0 +pyopenssl==19.1.0 pyparsing==2.4.7 -PySocks==1.7.1 -pytest==6.1.1 +pysocks==1.7.1 pytest-cov==2.8.1 pytest-profiling==1.7.0 +pytest==6.1.1 python-dateutil==2.8.0 python-dotenv==0.10.3 -requests==2.22.0 requests-async==0.5.0 requests-mock==1.7.0 +requests==2.22.0 rfc3986==1.3.2 -ruamel-yaml==0.15.87 +ruamel.yaml==0.15.87 sanic==19.6.0 sentry-sdk==0.14.3 six==1.14.0 @@ -65,7 +65,7 @@ toml==0.10.1 tqdm==4.42.1 typing-extensions==3.7.4.3 ujson==1.35 -urllib3==1.25.3 +urllib3==1.25.8 uvloop==0.12.2 websocket-client==0.57.0 websockets==6.0 diff --git a/requirements.txt b/requirements.txt index dcf5ca800..f211aac16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +-i https://pypi.org/simple/ aiofiles==0.4.0 aiohttp==3.6.3 asn1crypto==1.3.0 @@ -27,35 +28,35 @@ hyperframe==5.2.0 idna==2.8 importlib-metadata==2.0.0 iniconfig==1.1.1 -Jinja2==2.10.3 -JSONRPCBase==0.2.0 +jinja2==2.10.3 +jsonrpcbase==0.2.0 maps==5.1.1 -MarkupSafe==1.1.1 +markupsafe==1.1.1 memory-profiler==0.55.0 mock==3.0.5 mongoengine==0.23.0 multidict==4.5.2 nose==1.3.7 -packaging==20.4 +packaging==20.9 pluggy==0.13.1 psutil==5.6.6 py==1.9.0 pycosat==0.6.3 pycparser==2.19 pymongo==3.8.0 -pyOpenSSL==19.1.0 +pyopenssl==19.1.0 pyparsing==2.4.7 -PySocks==1.7.1 -pytest==6.1.1 +pysocks==1.7.1 pytest-cov==2.8.1 pytest-profiling==1.7.0 +pytest==6.1.1 python-dateutil==2.8.0 python-dotenv==0.10.3 -requests==2.22.0 requests-async==0.5.0 requests-mock==1.7.0 +requests==2.22.0 rfc3986==1.3.2 -ruamel-yaml==0.15.87 +ruamel.yaml==0.15.87 sanic==19.6.0 sentry-sdk==0.14.3 six==1.14.0 @@ -64,7 +65,7 @@ toml==0.10.1 tqdm==4.42.1 typing-extensions==3.7.4.3 ujson==1.35 -urllib3==1.25.3 +urllib3==1.25.8 
uvloop==0.12.2 websocket-client==0.57.0 websockets==6.0 From 73470a0fd36cf3848632755885610409ac07a0b3 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Fri, 7 May 2021 15:04:48 -0700 Subject: [PATCH 070/109] app_id is now optional (#384) --- execution_engine2.html | 2 +- execution_engine2.spec | 4 +- .../execution_engine2Impl.py | 322 +++++++++--------- 3 files changed, 164 insertions(+), 164 deletions(-) diff --git a/execution_engine2.html b/execution_engine2.html index b1456fbff..d1ed35423 100644 --- a/execution_engine2.html +++ b/execution_engine2.html @@ -1 +1 @@ -execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*Job requirements for a job. All fields are optional. To submit job requirements,
*the user must have full EE2 admin permissions. Ignored for the run concierge endpoint.
*
*request_cpus: the number of CPUs to request for the job.
*request_memory: the amount of memory, in MB, to request for the job.
*request_disk: the amount of disk space, in GB, to request for the job.
*client_group: the name of the client group on which to run the job.
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*bill_to_user: the job will be counted against the provided user's fair share quota.
*ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false.
*scheduler_requirements: arbitrary key-value pairs to be provided to the job
*scheduler. Requires knowledge of the scheduler interface.
*debug_mode: Whether to run the job in debug mode. Default false.
*/
typedefstructure{
intrequest_cpus;
intrequst_memory;
intrequest_disk;
stringclient_group;
booleanclient_group_regex;
stringbill_to_user;
booleanignore_concurrency_limits;
mapping<string,string>scheduler_requirements;
booleandebug_mode;
}
JobRequirements;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*params - the parameters to pass to the method.
*
*Optional parameters:
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*Submitting a job with a parent ID to run_job_batch will cause an error to be
*returned.
*job_requirements: the requirements for the job. The user must have full EE2
*administration rights to use this parameter. Note that the job_requirements
*are not returned along with the rest of the job parameters when querying the EE2
*API - they are only considered when submitting a job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*Note that this field is not included in returned data when querying EE2.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
JobRequirementsjob_requirements;
booleanas_admin;
}
RunJobParams;

/*
*Start a new job (long running method registered in the Catalog).
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*Additional parameters for a batch job.
*wsid: the workspace with which to associate the parent job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*/
typedefstructure{
intwsid;
booleanas_admin;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

/*
*Run a batch job, consisting of a parent job and one or more child jobs.
*Note that the as_admin parameters in the list of child jobs are ignored -
*only the as_admin parameter in the batch_params is considered.
*/
funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to record normally not allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set to true, the job will be set to running status, skipping the estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_job - state of parent job
*job_states - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with terminated_code 0.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/*
*
*
*/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdefcheck_jobs_date_range_for_user(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

funcdefcheck_jobs_date_range_for_all(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

typedefstructure{
UnspecifiedObjectheld_job;
}
HeldJob;

/*
*Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
*/
funcdefhandle_held_job(stringcluster_id)returns(HeldJob)authenticationrequired;

/*
*Check if current user has ee2 admin rights.
*/
funcdefis_admin()returns(boolean)authenticationrequired;

/*
*str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
*/
typedefstructure{
stringpermission;
}
AdminRolesResults;

/*
*Check if current user has ee2 admin rights.
*If so, return the type of rights and their roles
*/
funcdefget_admin_permission()returns(AdminRolesResults)authenticationrequired;

/*
*Get a list of clientgroups manually extracted from the config file
*/
funcdefget_client_groups()returns(list<string>client_groups)authenticationnone;
};
\ No newline at end of file +execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*Job requirements for a job. All fields are optional. To submit job requirements,
*the user must have full EE2 admin permissions. Ignored for the run concierge endpoint.
*
*request_cpus: the number of CPUs to request for the job.
*request_memory: the amount of memory, in MB, to request for the job.
*request_disk: the amount of disk space, in GB, to request for the job.
*client_group: the name of the client group on which to run the job.
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*bill_to_user: the job will be counted against the provided user's fair share quota.
*ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false.
*scheduler_requirements: arbitrary key-value pairs to be provided to the job
*scheduler. Requires knowledge of the scheduler interface.
*debug_mode: Whether to run the job in debug mode. Default false.
*/
typedefstructure{
intrequest_cpus;
intrequst_memory;
intrequest_disk;
stringclient_group;
booleanclient_group_regex;
stringbill_to_user;
booleanignore_concurrency_limits;
mapping<string,string>scheduler_requirements;
booleandebug_mode;
}
JobRequirements;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*params - the parameters to pass to the method.
*
*Optional parameters:
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*Submitting a job with a parent ID to run_job_batch will cause an error to be
*returned.
*job_requirements: the requirements for the job. The user must have full EE2
*administration rights to use this parameter. Note that the job_requirements
*are not returned along with the rest of the job parameters when querying the EE2
*API - they are only considered when submitting a job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*Note that this field is not included in returned data when querying EE2.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
JobRequirementsjob_requirements;
booleanas_admin;
}
RunJobParams;

/*
*Start a new job.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*Additional parameters for a batch job.
*wsid: the workspace with which to associate the parent job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*/
typedefstructure{
intwsid;
booleanas_admin;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

/*
*Run a batch job, consisting of a parent job and one or more child jobs.
*Note that the as_admin parameters in the list of child jobs are ignored -
*only the as_admin parameter in the batch_params is considered.
*/
funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - total number of lines (including those covered by the skip_lines
*parameter); this number can be used as the next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to a record that is normally not allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set to true, the job will be set to running status, skipping the estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: fields to exclude from the returned job state. Default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_job - state of parent job
*job_states - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with terminated_code 0 by default.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/*
*
*
*/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdefcheck_jobs_date_range_for_user(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

funcdefcheck_jobs_date_range_for_all(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

typedefstructure{
UnspecifiedObjectheld_job;
}
HeldJob;

/*
*Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
*/
funcdefhandle_held_job(stringcluster_id)returns(HeldJob)authenticationrequired;

/*
*Check if current user has ee2 admin rights.
*/
funcdefis_admin()returns(boolean)authenticationrequired;

/*
*str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
*/
typedefstructure{
stringpermission;
}
AdminRolesResults;

/*
*Check if current user has ee2 admin rights.
*If so, return the type of rights and their roles
*/
funcdefget_admin_permission()returns(AdminRolesResults)authenticationrequired;

/*
*Get a list of clientgroups manually extracted from the config file
*/
funcdefget_client_groups()returns(list<string>client_groups)authenticationnone;
};
\ No newline at end of file diff --git a/execution_engine2.spec b/execution_engine2.spec index 36941ddd9..358ea17f5 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -119,11 +119,11 @@ /* method - the SDK method to run in module.method format, e.g. 'KBaseTrees.construct_species_tree' - app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) params - the parameters to pass to the method. Optional parameters: + app_id - the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve as a diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index d9c3f249e..47434551b 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -30,7 +30,7 @@ class execution_engine2: ######################################### noqa VERSION = "0.0.5" GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" - GIT_COMMIT_HASH = "c5468aee40dbe0a3f557e38ee78d963b592f0a5d" + GIT_COMMIT_HASH = "46df42a6ed9fda3796cf9a68ea8088067e674936" #BEGIN_CLASS_HEADER MONGO_COLLECTION = "jobs" @@ -178,27 +178,27 @@ def status(self, ctx): def run_job(self, ctx, params): """ - Start a new job (long running method registered in the Catalog). + Start a new job. :param params: instance of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. 
Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned. + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. 
job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -237,7 +237,7 @@ def run_job(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -276,24 +276,24 @@ def run_job_batch(self, ctx, params, batch_params): only the as_admin parameter in the batch_params is considered. :param params: instance of list of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. 
For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned. + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -332,7 +332,7 @@ def run_job_batch(self, ctx, params, batch_params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) 
-> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -410,24 +410,24 @@ def run_job_concierge(self, ctx, params, concierge_params): """ :param params: instance of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned. + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. 
wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -466,7 +466,7 @@ def run_job_concierge(self, ctx, params, concierge_params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -527,24 +527,24 @@ def get_job_params(self, ctx, params): "as_admin" of type "boolean" (@range [0,1]) :returns: instance of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. 
Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned. + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. 
job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -583,7 +583,7 @@ def get_job_params(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -838,10 +838,10 @@ def check_job(self, ctx, params): of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method - format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id - of the Narrative application (UI) running this job (e.g. - repo/name) params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve as a source of data when running the SDK method. These references @@ -894,7 +894,7 @@ def check_job(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. 
Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -981,10 +981,10 @@ def check_job_batch(self, ctx, params): of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method - format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id - of the Narrative application (UI) running this job (e.g. - repo/name) params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve as a source of data when running the SDK method. These references @@ -1037,7 +1037,7 @@ def check_job_batch(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) 
-> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -1089,10 +1089,10 @@ def check_job_batch(self, ctx, params): of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method - format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id - of the Narrative application (UI) running this job (e.g. - repo/name) params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve as a source of data when running the SDK method. These references @@ -1145,7 +1145,7 @@ def check_job_batch(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -1230,24 +1230,24 @@ def check_jobs(self, ctx, params): "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. 
- 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned. + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. 
For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -1286,7 +1286,7 @@ def check_jobs(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -1374,24 +1374,24 @@ def check_workspace_jobs(self, ctx, params): "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). 
meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned. + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -1430,7 +1430,7 @@ def check_workspace_jobs(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. 
scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -1662,24 +1662,24 @@ def check_jobs_date_range_for_user(self, ctx, params): "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned. 
+ service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -1718,7 +1718,7 @@ def check_jobs_date_range_for_user(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of @@ -1865,24 +1865,24 @@ def check_jobs_date_range_for_all(self, ctx, params): "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. 
- 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. Submitting a job with a - parent ID to run_job_batch will cause an error to be returned. + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. 
For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -1921,7 +1921,7 @@ def check_jobs_date_range_for_all(self, ctx, params): provided user's fair share quota. ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. scheduler_requirements: arbitrary key-value pairs to be provided - to the job scheduler. Requires knowledge ot the scheduler + to the job scheduler. Requires knowledge of the scheduler interface. debug_mode: Whether to run the job in debug mode. Default false.) -> structure: parameter "request_cpus" of Long, parameter "requst_memory" of Long, parameter "request_disk" of From 25a475bca9801b04846ce517777bed22a2edb013 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 10 May 2021 10:03:30 -0700 Subject: [PATCH 071/109] Fix finish_job bug (#385) Expected an app_id all the time Searched the codebase for "app_id" and I don't see any other places where it's treated as anything other than an opaque string. 
Per the catalog, the app id based fields are optional: https://github.com/kbase/catalog/blob/master/catalog.spec#L616-L619 --- lib/execution_engine2/sdk/EE2Status.py | 9 ++++-- test/tests_for_sdkmr/EE2Status_test.py | 40 +++++++++++++++----------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index 9c6663348..63fc1a29b 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -537,8 +537,13 @@ def _send_exec_stats_to_catalog(self, job_id): log_exec_stats_params = dict() log_exec_stats_params["user_id"] = job.user app_id = job_input.app_id - log_exec_stats_params["app_module_name"] = app_id.split("/")[0] - log_exec_stats_params["app_id"] = app_id + if app_id: + # Note this will not work properly for app_ids incorrectly separated by a '.', + # which happens in some KBase code (which needs to be fixed at some point) - + # notably the narrative data download code, maybe more + # It's been this way for a long time, so leave for now + log_exec_stats_params["app_module_name"] = app_id.split("/")[0] + log_exec_stats_params["app_id"] = app_id method = job_input.method log_exec_stats_params["func_module_name"] = method.split(".")[0] log_exec_stats_params["func_name"] = method.split(".")[-1] diff --git a/test/tests_for_sdkmr/EE2Status_test.py b/test/tests_for_sdkmr/EE2Status_test.py index b1cdc311c..26596dc6f 100644 --- a/test/tests_for_sdkmr/EE2Status_test.py +++ b/test/tests_for_sdkmr/EE2Status_test.py @@ -31,7 +31,15 @@ def _finish_job_complete_minimal_get_test_job(job_id, sched, app_id, gitcommit, return job -def test_finish_job_complete_minimal(): +def test_finish_job_complete_minimal_without_app_id(): + _finish_job_complete_minimal(None, None) + + +def test_finish_job_complete_minimal_with_app_id(): + _finish_job_complete_minimal("module/myapp", "module") + + +def _finish_job_complete_minimal(app_id, app_module): """ Tests a very 
simple case of completing a job successfully by the `finish_job` method. """ @@ -39,7 +47,6 @@ def test_finish_job_complete_minimal(): job_id = "6046b539ce9c58ecf8c3e5f3" job_output = {"version": "1.1", "id": job_id, "result": [{"foo": "bar"}]} user = "someuser" - app_id = "module/myapp" gitcommit = "somecommit" resources = {"fake": "condor", "resources": "in", "here": "yo"} sched = "somescheduler" @@ -104,19 +111,18 @@ def test_finish_job_complete_minimal(): ) ) mongo.get_job.assert_called_once_with(job_id) - catalog.log_exec_stats.assert_called_once_with( - { - "user_id": user, - "app_module_name": "module", - "app_id": app_id, - "func_module_name": "module", - "func_name": "method_id", - "git_commit_hash": gitcommit, - "creation_time": 1615246649.0, # from Job ObjectId - "exec_start_time": 123.0, - "finish_time": 456.5, - "is_error": 0, - "job_id": job_id, - } - ) + les_expected = { + "user_id": user, + "func_module_name": "module", + "func_name": "method_id", + "git_commit_hash": gitcommit, + "creation_time": 1615246649.0, # from Job ObjectId + "exec_start_time": 123.0, + "finish_time": 456.5, + "is_error": 0, + "job_id": job_id, + } + if app_id: + les_expected.update({"app_id": app_id, "app_module_name": app_module}) + catalog.log_exec_stats.assert_called_once_with(les_expected) mongo.update_job_resources.assert_called_once_with(job_id, resources) From 0bc48d569c607143dbace05104102601fd6b4490 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 18 May 2021 09:55:05 -0500 Subject: [PATCH 072/109] DATAUP-424 remove slack calls (#376) * Removed extra logging * Removed extra logging * Removed slack call. 
We could add this in for single jobs and take out for batch jobs, and also put it in a thread * Removed failing test * Removed failing test Co-authored-by: Boris Sadkhin --- lib/execution_engine2/sdk/EE2Runjob.py | 3 --- test/tests_for_sdkmr/EE2Runjob_test.py | 16 +++++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index af9912e5a..c6c56bfa5 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -241,9 +241,6 @@ def _run(self, params): raise RuntimeError(error_msg) self.update_job_to_queued(job_id=job_id, scheduler_id=condor_job_id) - self.sdkmr.get_slack_client().run_job_message( - job_id=job_id, scheduler_id=condor_job_id, username=self.sdkmr.get_user_id() - ) return job_id diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 2a9729205..626160bf5 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -218,7 +218,8 @@ def _check_common_mock_calls(mocks, reqs, wsid, app=_APP): scheduler_id=_CLUSTER, ) ) - mocks[SlackClient].run_job_message.assert_called_once_with(_JOB_ID, _CLUSTER, _USER) + # Removed for now, but might be added back in at a later point + # mocks[SlackClient].run_job_message.assert_called_once_with(_JOB_ID, _CLUSTER, _USER) def _create_reqs_dict( @@ -866,12 +867,13 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): ] ) - mocks[SlackClient].run_job_message.assert_has_calls( - [ - call(job_id=_JOB_ID_1, scheduler_id=_CLUSTER_1, username=_USER), - call(job_id=_JOB_ID_2, scheduler_id=_CLUSTER_2, username=_USER), - ] - ) + # Removed for now, but might be added back in if run_job_message is re-added + # mocks[SlackClient].run_job_message.assert_has_calls( + # [ + # call(job_id=_JOB_ID_1, scheduler_id=_CLUSTER_1, username=_USER), + # call(job_id=_JOB_ID_2, scheduler_id=_CLUSTER_2, 
username=_USER), + # ] + # ) final_expected_parent_job = Job() final_expected_parent_job.id = ObjectId(_JOB_ID) From 86de5487962e6096cd7bd8f2fb84ef80d2e2d1ec Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 19 May 2021 11:44:49 -0500 Subject: [PATCH 073/109] Bump cryptography from 3.2 to 3.3.2 (#307) Bumps [cryptography](https://github.com/pyca/cryptography) from 3.2 to 3.3.2. - [Release notes](https://github.com/pyca/cryptography/releases) - [Changelog](https://github.com/pyca/cryptography/blob/master/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/3.2...3.3.2) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Pipfile | 2 +- Pipfile.lock | 44 ++++++++++++++++++-------------------------- requirements-dev.txt | 2 +- requirements.txt | 2 +- 4 files changed, 21 insertions(+), 29 deletions(-) diff --git a/Pipfile b/Pipfile index 7c7a4c470..bf7aad0ce 100644 --- a/Pipfile +++ b/Pipfile @@ -19,7 +19,7 @@ codecov = "==2.0.15" configparser = "==3.7.4" confluent-kafka = "==1.5.0" coverage = "==4.5.3" -cryptography = "==3.2" +cryptography = "==3.3.2" docker = "==4.3.1" gevent = "==20.9.0" gprof2dot = "==2019.11.30" diff --git a/Pipfile.lock b/Pipfile.lock index 87868167e..d50359dd7 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3b864de8c1c9f041d32e49e08edc75d786257e2df50100b083d1c4deb9009f19" + "sha256": "6f4a61ee0b7197767ae6fd58ed3906ade43ed349b8e141297f01cb72ffcb08b4" }, "pipfile-spec": 6, "requires": { @@ -212,31 +212,23 @@ }, "cryptography": { "hashes": [ - "sha256:22f8251f68953553af4f9c11ec5f191198bc96cff9f0ac5dd5ff94daede0ee6d", - "sha256:284e275e3c099a80831f9898fb5c9559120d27675c3521278faba54e584a7832", - "sha256:3e17d02941c0f169c5b877597ca8be895fca0e5e3eb882526a74aa4804380a98", - "sha256:52a47e60953679eea0b4d490ca3c241fb1b166a7b161847ef4667dfd49e7699d", - 
"sha256:57b8c1ed13b8aa386cabbfde3be175d7b155682470b0e259fecfe53850967f8a", - "sha256:6a8f64ed096d13f92d1f601a92d9fd1f1025dc73a2ca1ced46dcf5e0d4930943", - "sha256:6e8a3c7c45101a7eeee93102500e1b08f2307c717ff553fcb3c1127efc9b6917", - "sha256:7ef41304bf978f33cfb6f43ca13bb0faac0c99cda33693aa20ad4f5e34e8cb8f", - "sha256:87c2fffd61e934bc0e2c927c3764c20b22d7f5f7f812ee1a477de4c89b044ca6", - "sha256:88069392cd9a1e68d2cfd5c3a2b0d72a44ef3b24b8977a4f7956e9e3c4c9477a", - "sha256:8a0866891326d3badb17c5fd3e02c926b635e8923fa271b4813cd4d972a57ff3", - "sha256:8f0fd8b0751d75c4483c534b209e39e918f0d14232c0d8a2a76e687f64ced831", - "sha256:9a07e6d255053674506091d63ab4270a119e9fc83462c7ab1dbcb495b76307af", - "sha256:9a8580c9afcdcddabbd064c0a74f337af74ff4529cdf3a12fa2e9782d677a2e5", - "sha256:bd80bc156d3729b38cb227a5a76532aef693b7ac9e395eea8063ee50ceed46a5", - "sha256:d1cbc3426e6150583b22b517ef3720036d7e3152d428c864ff0f3fcad2b97591", - "sha256:e15ac84dcdb89f92424cbaca4b0b34e211e7ce3ee7b0ec0e4f3c55cee65fae5a", - "sha256:e4789b84f8dedf190148441f7c5bfe7244782d9cbb194a36e17b91e7d3e1cca9", - "sha256:f01c9116bfb3ad2831e125a73dcd957d173d6ddca7701528eff1e7d97972872c", - "sha256:f0e3986f6cce007216b23c490f093f35ce2068f3c244051e559f647f6731b7ae", - "sha256:f2aa3f8ba9e2e3fd49bd3de743b976ab192fbf0eb0348cebde5d2a9de0090a9f", - "sha256:fb70a4cedd69dc52396ee114416a3656e011fb0311fca55eb55c7be6ed9c8aef" - ], - "index": "pypi", - "version": "==3.2" + "sha256:0d7b69674b738068fa6ffade5c962ecd14969690585aaca0a1b1fc9058938a72", + "sha256:1bd0ccb0a1ed775cd7e2144fe46df9dc03eefd722bbcf587b3e0616ea4a81eff", + "sha256:3c284fc1e504e88e51c428db9c9274f2da9f73fdf5d7e13a36b8ecb039af6e6c", + "sha256:49570438e60f19243e7e0d504527dd5fe9b4b967b5a1ff21cc12b57602dd85d3", + "sha256:541dd758ad49b45920dda3b5b48c968f8b2533d8981bcdb43002798d8f7a89ed", + "sha256:5a60d3780149e13b7a6ff7ad6526b38846354d11a15e21068e57073e29e19bed", + "sha256:7951a966613c4211b6612b0352f5bf29989955ee592c4a885d8c7d0f830d0433", + 
"sha256:922f9602d67c15ade470c11d616f2b2364950602e370c76f0c94c94ae672742e", + "sha256:a0f0b96c572fc9f25c3f4ddbf4688b9b38c69836713fb255f4a2715d93cbaf44", + "sha256:a777c096a49d80f9d2979695b835b0f9c9edab73b59e4ceb51f19724dda887ed", + "sha256:a9a4ac9648d39ce71c2f63fe7dc6db144b9fa567ddfc48b9fde1b54483d26042", + "sha256:aa4969f24d536ae2268c902b2c3d62ab464b5a66bcb247630d208a79a8098e9b", + "sha256:c7390f9b2119b2b43160abb34f63277a638504ef8df99f11cb52c1fda66a2e6f", + "sha256:e18e6ab84dfb0ab997faf8cca25a86ff15dfea4027b986322026cc99e0a892da" + ], + "index": "pypi", + "version": "==3.3.2" }, "docker": { "hashes": [ diff --git a/requirements-dev.txt b/requirements-dev.txt index f211aac16..7687975cb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,7 +12,7 @@ codecov==2.0.15 configparser==3.7.4 confluent-kafka==1.5.0 coverage==4.5.3 -cryptography==3.2 +cryptography==3.3.2 docker==4.3.1 gevent==20.9.0 gprof2dot==2019.11.30 diff --git a/requirements.txt b/requirements.txt index f211aac16..7687975cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ codecov==2.0.15 configparser==3.7.4 confluent-kafka==1.5.0 coverage==4.5.3 -cryptography==3.2 +cryptography==3.3.2 docker==4.3.1 gevent==20.9.0 gprof2dot==2019.11.30 From 911282d248c78f299cee4ae6429d80c6b56f661e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 19 May 2021 12:07:15 -0500 Subject: [PATCH 074/109] Bump aiohttp from 3.6.3 to 3.7.4 (#321) Bumps [aiohttp](https://github.com/aio-libs/aiohttp) from 3.6.3 to 3.7.4. 
- [Release notes](https://github.com/aio-libs/aiohttp/releases) - [Changelog](https://github.com/aio-libs/aiohttp/blob/master/CHANGES.rst) - [Commits](https://github.com/aio-libs/aiohttp/compare/v3.6.3...v3.7.4) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Pipfile | 2 +- Pipfile.lock | 58 +++++++++++++++++++++++++++++++------------- requirements-dev.txt | 2 +- requirements.txt | 2 +- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/Pipfile b/Pipfile index bf7aad0ce..772076846 100644 --- a/Pipfile +++ b/Pipfile @@ -7,7 +7,7 @@ verify_ssl = true [packages] aiofiles = "==0.4.0" -aiohttp = "==3.6.3" +aiohttp = "==3.7.4" asn1crypto = "==1.3.0" async-timeout = "==3.0.1" attrs = "==20.2.0" diff --git a/Pipfile.lock b/Pipfile.lock index d50359dd7..baffe4670 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "6f4a61ee0b7197767ae6fd58ed3906ade43ed349b8e141297f01cb72ffcb08b4" + "sha256": "220971ebdf3a95b33a1884691c51976b1ba0bfb839ee929ae376bcda40b05394" }, "pipfile-spec": 6, "requires": { @@ -26,22 +26,46 @@ }, "aiohttp": { "hashes": [ - "sha256:1a4160579ffbc1b69e88cb6ca8bb0fbd4947dfcbf9fb1e2a4fc4c7a4a986c1fe", - "sha256:206c0ccfcea46e1bddc91162449c20c72f308aebdcef4977420ef329c8fcc599", - "sha256:2ad493de47a8f926386fa6d256832de3095ba285f325db917c7deae0b54a9fc8", - "sha256:319b490a5e2beaf06891f6711856ea10591cfe84fe9f3e71a721aa8f20a0872a", - "sha256:470e4c90da36b601676fe50c49a60d34eb8c6593780930b1aa4eea6f508dfa37", - "sha256:60f4caa3b7f7a477f66ccdd158e06901e1d235d572283906276e3803f6b098f5", - "sha256:66d64486172b032db19ea8522328b19cfb78a3e1e5b62ab6a0567f93f073dea0", - "sha256:687461cd974722110d1763b45c5db4d2cdee8d50f57b00c43c7590d1dd77fc5c", - "sha256:698cd7bc3c7d1b82bb728bae835724a486a8c376647aec336aa21a60113c3645", - "sha256:797456399ffeef73172945708810f3277f794965eb6ec9bd3a0c007c0476be98", - 
"sha256:a885432d3cabc1287bcf88ea94e1826d3aec57fd5da4a586afae4591b061d40d", - "sha256:c506853ba52e516b264b106321c424d03f3ddef2813246432fa9d1cefd361c81", - "sha256:fb83326d8295e8840e4ba774edf346e87eca78ba8a89c55d2690352842c15ba5" - ], - "index": "pypi", - "version": "==3.6.3" + "sha256:119feb2bd551e58d83d1b38bfa4cb921af8ddedec9fad7183132db334c3133e0", + "sha256:16d0683ef8a6d803207f02b899c928223eb219111bd52420ef3d7a8aa76227b6", + "sha256:2eb3efe243e0f4ecbb654b08444ae6ffab37ac0ef8f69d3a2ffb958905379daf", + "sha256:2ffea7904e70350da429568113ae422c88d2234ae776519549513c8f217f58a9", + "sha256:40bd1b101b71a18a528ffce812cc14ff77d4a2a1272dfb8b11b200967489ef3e", + "sha256:418597633b5cd9639e514b1d748f358832c08cd5d9ef0870026535bd5eaefdd0", + "sha256:481d4b96969fbfdcc3ff35eea5305d8565a8300410d3d269ccac69e7256b1329", + "sha256:4c1bdbfdd231a20eee3e56bd0ac1cd88c4ff41b64ab679ed65b75c9c74b6c5c2", + "sha256:5563ad7fde451b1986d42b9bb9140e2599ecf4f8e42241f6da0d3d624b776f40", + "sha256:58c62152c4c8731a3152e7e650b29ace18304d086cb5552d317a54ff2749d32a", + "sha256:5b50e0b9460100fe05d7472264d1975f21ac007b35dcd6fd50279b72925a27f4", + "sha256:5d84ecc73141d0a0d61ece0742bb7ff5751b0657dab8405f899d3ceb104cc7de", + "sha256:5dde6d24bacac480be03f4f864e9a67faac5032e28841b00533cd168ab39cad9", + "sha256:5e91e927003d1ed9283dee9abcb989334fc8e72cf89ebe94dc3e07e3ff0b11e9", + "sha256:62bc216eafac3204877241569209d9ba6226185aa6d561c19159f2e1cbb6abfb", + "sha256:6c8200abc9dc5f27203986100579fc19ccad7a832c07d2bc151ce4ff17190076", + "sha256:6ca56bdfaf825f4439e9e3673775e1032d8b6ea63b8953d3812c71bd6a8b81de", + "sha256:71680321a8a7176a58dfbc230789790639db78dad61a6e120b39f314f43f1907", + "sha256:7c7820099e8b3171e54e7eedc33e9450afe7cd08172632d32128bd527f8cb77d", + "sha256:7dbd087ff2f4046b9b37ba28ed73f15fd0bc9f4fdc8ef6781913da7f808d9536", + "sha256:822bd4fd21abaa7b28d65fc9871ecabaddc42767884a626317ef5b75c20e8a2d", + "sha256:8ec1a38074f68d66ccb467ed9a673a726bb397142c273f90d4ba954666e87d54", + 
"sha256:950b7ef08b2afdab2488ee2edaff92a03ca500a48f1e1aaa5900e73d6cf992bc", + "sha256:99c5a5bf7135607959441b7d720d96c8e5c46a1f96e9d6d4c9498be8d5f24212", + "sha256:b84ad94868e1e6a5e30d30ec419956042815dfaea1b1df1cef623e4564c374d9", + "sha256:bc3d14bf71a3fb94e5acf5bbf67331ab335467129af6416a437bd6024e4f743d", + "sha256:c2a80fd9a8d7e41b4e38ea9fe149deed0d6aaede255c497e66b8213274d6d61b", + "sha256:c44d3c82a933c6cbc21039326767e778eface44fca55c65719921c4b9661a3f7", + "sha256:cc31e906be1cc121ee201adbdf844522ea3349600dd0a40366611ca18cd40e81", + "sha256:d5d102e945ecca93bcd9801a7bb2fa703e37ad188a2f81b1e65e4abe4b51b00c", + "sha256:dd7936f2a6daa861143e376b3a1fb56e9b802f4980923594edd9ca5670974895", + "sha256:dee68ec462ff10c1d836c0ea2642116aba6151c6880b688e56b4c0246770f297", + "sha256:e76e78863a4eaec3aee5722d85d04dcbd9844bc6cd3bfa6aa880ff46ad16bfcb", + "sha256:eab51036cac2da8a50d7ff0ea30be47750547c9aa1aa2cf1a1b710a1827e7dbe", + "sha256:f4496d8d04da2e98cc9133e238ccebf6a13ef39a93da2e87146c8c8ac9768242", + "sha256:fbd3b5e18d34683decc00d9a360179ac1e7a320a5fee10ab8053ffd6deab76e0", + "sha256:feb24ff1226beeb056e247cf2e24bba5232519efb5645121c4aea5b6ad74c1f2" + ], + "index": "pypi", + "version": "==3.7.4" }, "asn1crypto": { "hashes": [ diff --git a/requirements-dev.txt b/requirements-dev.txt index 7687975cb..e891ab88b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,6 @@ -i https://pypi.org/simple/ aiofiles==0.4.0 -aiohttp==3.6.3 +aiohttp==3.7.4 asn1crypto==1.3.0 async-timeout==3.0.1 attrs==20.2.0 diff --git a/requirements.txt b/requirements.txt index 7687975cb..e891ab88b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -i https://pypi.org/simple/ aiofiles==0.4.0 -aiohttp==3.6.3 +aiohttp==3.7.4 asn1crypto==1.3.0 async-timeout==3.0.1 attrs==20.2.0 From 4561b9f97c0bb429a8fb70df5d93e237d8cb24e3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 19 May 2021 13:55:34 -0500 Subject: [PATCH 
075/109] Bump jinja2 from 2.10.3 to 2.11.3 (#337) Bumps [jinja2](https://github.com/pallets/jinja) from 2.10.3 to 2.11.3. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/master/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/2.10.3...2.11.3) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Pipfile | 2 +- Pipfile.lock | 8 ++++---- requirements-dev.txt | 2 +- requirements.txt | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Pipfile b/Pipfile index 772076846..18110100d 100644 --- a/Pipfile +++ b/Pipfile @@ -70,7 +70,7 @@ uvloop = "==0.12.2" websockets = "==6.0" yarl = "==1.5.1" zipp = "==3.3.1" -Jinja2 = "==2.10.3" +Jinja2 = "==2.11.3" JSONRPCBase = "==0.2.0" MarkupSafe = "==1.1.1" pyOpenSSL = "==19.1.0" diff --git a/Pipfile.lock b/Pipfile.lock index baffe4670..a93b82121 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "220971ebdf3a95b33a1884691c51976b1ba0bfb839ee929ae376bcda40b05394" + "sha256": "d622fd73be75f37d26ef2eac9743f2545e5965faa68948197bab5a5faf7a7d16" }, "pipfile-spec": 6, "requires": { @@ -416,11 +416,11 @@ }, "jinja2": { "hashes": [ - "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", - "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + "sha256:03e47ad063331dd6a3f04a43eddca8a966a26ba0c5b7207a9a9e4e08f1b29419", + "sha256:a6d58433de0ae800347cab1fa3043cebbabe8baa9d29e668f1c768cb87a333c6" ], "index": "pypi", - "version": "==2.10.3" + "version": "==2.11.3" }, "jsonrpcbase": { "hashes": [ diff --git a/requirements-dev.txt b/requirements-dev.txt index e891ab88b..aa93a9b95 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -28,7 +28,7 @@ hyperframe==5.2.0 idna==2.8 importlib-metadata==2.0.0 iniconfig==1.1.1 -jinja2==2.10.3 +jinja2==2.11.3 jsonrpcbase==0.2.0 maps==5.1.1 markupsafe==1.1.1 
diff --git a/requirements.txt b/requirements.txt index e891ab88b..aa93a9b95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,7 @@ hyperframe==5.2.0 idna==2.8 importlib-metadata==2.0.0 iniconfig==1.1.1 -jinja2==2.10.3 +jinja2==2.11.3 jsonrpcbase==0.2.0 maps==5.1.1 markupsafe==1.1.1 From bd91b2604f60165fae5c92d49a598f4fe66d3636 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 19 May 2021 14:44:09 -0500 Subject: [PATCH 076/109] build-all-feature-branches (#302) * build-all-feature-branches * Rename workflow.yml to feature_builds.yml * Update build_feature_branch.yml * Create build_on_push.yaml * Update build_on_push.yaml * Update build_on_push.yaml * Create build_on_push.yaml Co-authored-by: bio-boris --- ...ker_image.yml => build_feature_branch.yml} | 8 ++--- .github/workflows/build_on_push.yaml | 31 +++++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) rename .github/workflows/{build_and_push_docker_image.yml => build_feature_branch.yml} (84%) create mode 100644 .github/workflows/build_on_push.yaml diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_feature_branch.yml similarity index 84% rename from .github/workflows/build_and_push_docker_image.yml rename to .github/workflows/build_feature_branch.yml index 85b0292b0..b62fc1c89 100644 --- a/.github/workflows/build_and_push_docker_image.yml +++ b/.github/workflows/build_feature_branch.yml @@ -1,8 +1,6 @@ -name: Build Develop/Master +name: Build Feature Branches -on: - pull_request: - branches: [master,develop] +on: [pull_request] jobs: main: @@ -16,7 +14,7 @@ jobs: username: ${{ secrets.GHCR_USERNAME }} password: ${{ secrets.GHCR_TOKEN }} - - name: Build and push + name: Build and push this feature branch id: docker_build uses: docker/build-push-action@v2 with: diff --git a/.github/workflows/build_on_push.yaml b/.github/workflows/build_on_push.yaml new file mode 100644 index 000000000..b132e61e0 --- /dev/null +++ 
b/.github/workflows/build_on_push.yaml @@ -0,0 +1,31 @@ +name: Build Main/Develop Branches + +on: + push: + branches: + - main + - master + - develop + +jobs: + main: + runs-on: ubuntu-20.04 + steps: + - + name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ secrets.GHCR_USERNAME }} + password: ${{ secrets.GHCR_TOKEN }} + - + name: Build and push the main branch + id: docker_build + uses: docker/build-push-action@v2 + with: + push: true + tags: ghcr.io/${{ github.repository }}:${{ ref }} + + - + name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} From 9acbef38f10932e3a49e4d08e08323f8c728bcf2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 19 May 2021 15:37:12 -0500 Subject: [PATCH 077/109] Bump py from 1.9.0 to 1.10.0 (#357) Bumps [py](https://github.com/pytest-dev/py) from 1.9.0 to 1.10.0. - [Release notes](https://github.com/pytest-dev/py/releases) - [Changelog](https://github.com/pytest-dev/py/blob/master/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/py/compare/1.9.0...1.10.0) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Pipfile | 2 +- Pipfile.lock | 8 ++++---- requirements-dev.txt | 2 +- requirements.txt | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Pipfile b/Pipfile index 18110100d..ec7e9deb0 100644 --- a/Pipfile +++ b/Pipfile @@ -43,7 +43,7 @@ multidict = "==4.5.2" nose = "==1.3.7" pluggy = "==0.13.1" psutil = "==5.6.6" -py = "==1.9.0" +py = "==1.10.0" pycosat = "==0.6.3" pycparser = "==2.19" pymongo = "==3.8.0" diff --git a/Pipfile.lock b/Pipfile.lock index a93b82121..5df9ba0ff 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d622fd73be75f37d26ef2eac9743f2545e5965faa68948197bab5a5faf7a7d16" + "sha256": 
"96e75d4a0d655bab93d08d5d163c1bdb458d7ff8bc22b3e48af30e07797d0340" }, "pipfile-spec": 6, "requires": { @@ -595,11 +595,11 @@ }, "py": { "hashes": [ - "sha256:366389d1db726cd2fcfc79732e75410e5fe4d31db13692115529d34069a043c2", - "sha256:9ca6883ce56b4e8da7e79ac18787889fa5206c79dcc67fb065376cd2fe03f342" + "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3", + "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a" ], "index": "pypi", - "version": "==1.9.0" + "version": "==1.10.0" }, "pycosat": { "hashes": [ diff --git a/requirements-dev.txt b/requirements-dev.txt index aa93a9b95..06e45c8e1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -40,7 +40,7 @@ nose==1.3.7 packaging==20.9 pluggy==0.13.1 psutil==5.6.6 -py==1.9.0 +py==1.10.0 pycosat==0.6.3 pycparser==2.19 pymongo==3.8.0 diff --git a/requirements.txt b/requirements.txt index aa93a9b95..06e45c8e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,7 +40,7 @@ nose==1.3.7 packaging==20.9 pluggy==0.13.1 psutil==5.6.6 -py==1.9.0 +py==1.10.0 pycosat==0.6.3 pycparser==2.19 pymongo==3.8.0 From b8c5fdc2d33c61d46dfd2d0a0bde768a661ebbfe Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 20 May 2021 10:26:17 -0500 Subject: [PATCH 078/109] Update pull_request_template.md (#388) --- .github/pull_request_template.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f15b16722..0ffad221c 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -14,7 +14,7 @@ # Dev Checklist: -- [ ] My code follows the guidelines at https://sites.google.com/truss.works/kbasetruss/development +- [ ] My code follows the guidelines at https://sites.google.com/truss.works/kbasetruss/data-upload-project/development - [ ] I have performed a self-review of my own code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have made corresponding 
changes to the documentation From fd1a5b429c0f00ca67d2d2c7ccdeed6832bc6bce Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 20 May 2021 14:36:27 -0500 Subject: [PATCH 079/109] Update models.py (#389) --- lib/execution_engine2/db/models/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index 696dc5d62..96caf8cda 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -160,7 +160,6 @@ class JobInput(EmbeddedDocument): wsid = IntField(required=False, default=None) method = StringField(required=True) - requested_release = StringField() params = DynamicField() service_ver = StringField(required=True) app_id = StringField() From 134a9d6df38a8bc0cb02eafc4eab97b24869ee1a Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 20 May 2021 15:18:42 -0500 Subject: [PATCH 080/109] Update build_on_push.yaml --- .github/workflows/build_on_push.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_push.yaml b/.github/workflows/build_on_push.yaml index b132e61e0..acd000fc6 100644 --- a/.github/workflows/build_on_push.yaml +++ b/.github/workflows/build_on_push.yaml @@ -24,7 +24,7 @@ jobs: uses: docker/build-push-action@v2 with: push: true - tags: ghcr.io/${{ github.repository }}:${{ ref }} + tags: ghcr.io/${{ github.repository }}:${{ github.ref }} - name: Image digest From e4763d4283681210699d85bf7d9c12e3fb3fb97c Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 20 May 2021 15:19:24 -0500 Subject: [PATCH 081/109] Update build_on_push.yaml --- .github/workflows/build_on_push.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_push.yaml b/.github/workflows/build_on_push.yaml index acd000fc6..ab68ea2e2 100644 --- a/.github/workflows/build_on_push.yaml +++ b/.github/workflows/build_on_push.yaml @@ -1,4 +1,4 @@ -name: Build Main/Develop Branches +name: 
Build Main/Develop Branches on push on: push: From 08611655174180b503a100713e8838d72e7e182c Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 20 May 2021 15:21:44 -0500 Subject: [PATCH 082/109] Update build_on_push.yaml --- .github/workflows/build_on_push.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_push.yaml b/.github/workflows/build_on_push.yaml index ab68ea2e2..d781abd72 100644 --- a/.github/workflows/build_on_push.yaml +++ b/.github/workflows/build_on_push.yaml @@ -24,7 +24,7 @@ jobs: uses: docker/build-push-action@v2 with: push: true - tags: ghcr.io/${{ github.repository }}:${{ github.ref }} + tags: ghcr.io/${{ github.repository }}:${{ GITHUB_REF##*/ }} - name: Image digest From 3d85a55ffdc00ed466a9f2a161a97e090cc65532 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 20 May 2021 15:22:53 -0500 Subject: [PATCH 083/109] Update build_on_push.yaml --- .github/workflows/build_on_push.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_push.yaml b/.github/workflows/build_on_push.yaml index d781abd72..d610acfae 100644 --- a/.github/workflows/build_on_push.yaml +++ b/.github/workflows/build_on_push.yaml @@ -24,7 +24,7 @@ jobs: uses: docker/build-push-action@v2 with: push: true - tags: ghcr.io/${{ github.repository }}:${{ GITHUB_REF##*/ }} + tags: ghcr.io/${{ github.repository }}:${ GITHUB_REF##*/ } - name: Image digest From 62bb7c19735864c141af7bbb63a3e521f0f3577d Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 27 May 2021 17:07:19 -0500 Subject: [PATCH 084/109] Dataup 363 retry (#383) * Adding retry endpoint * Adding retry endpoint * Adding retry endpoint * Adding retry endpoint * Make tests work * Make tests work * Update black * ran black * exception path * Recompiled * Hack for job browser * Hack for jobbrowser * Fix tests * Fix tests * Fix tests and endpoint * fix tests * fix tests * Fix bug with retry optional * fix tests * fix tests * Fix bug with 
service_version * Fix bug with service_version * Added naive retry_jobs endpoint (#387) * Added naive retry_jobs endpoint * Revert ee2server * Revert ee2server * Revert ee2server * Revert ee2server * PR reviews * PR reviews * Fixed tests Co-authored-by: bio-boris * Fixed automicity bug * Updated docs * Updated docs * Updated docs * Updated docs var interpolation * Update return for batch retry * Update return for batch retry * ideas for retry meeting * ideas for retry meeting * updates from the meeting * updates from the meeting * Ran black * undo the repr * new client * added better message * modify parent * Fixed api tests * Removed code * Removed code * Removed code * updated execptions * Black * Testing timings * testing * testing * testing * Testing timings * Remove unused var * Remove unused var * Validate first * Validate first * Fix typo * Fix bug * fix bug * Updated error message * Changed order of db writes * unused var * unused var * Fix typo * Fixing tests * re-add codecov * Testing db errors * Increase coverage * Increase coverage * Fixed staticmethod * removed unused attr * Pleasing the coverage overlords * Disallow same job id within retry_multiple now Co-authored-by: bio-boris --- .github/workflows/ee2-tests.yml | 1 + .pre-commit-config.yaml | 2 +- execution_engine2.html | 2 +- execution_engine2.spec | 52 + lib/execution_engine2/db/models/models.py | 60 +- lib/execution_engine2/exceptions.py | 26 +- .../execution_engine2Impl.py | 79 +- .../execution_engine2Server.py | 12 + lib/execution_engine2/sdk/EE2Runjob.py | 296 +++- lib/execution_engine2/sdk/EE2StatusRange.py | 8 + lib/execution_engine2/sdk/SDKMethodRunner.py | 9 + .../execution_engine2Client.py | 1188 +++++++++++------ requirements.txt | 2 +- test/tests_for_sdkmr/EE2Runjob_test.py | 5 +- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 282 +++- test/tests_for_sdkmr/ee2_load_test.py | 33 + test/tests_for_sdkmr/ee2_retry_test.py | 135 ++ test/utils_shared/test_utils.py | 79 +- 18 files changed, 
1830 insertions(+), 441 deletions(-) create mode 100644 test/tests_for_sdkmr/ee2_retry_test.py diff --git a/.github/workflows/ee2-tests.yml b/.github/workflows/ee2-tests.yml index 8faedc45a..06c518a98 100644 --- a/.github/workflows/ee2-tests.yml +++ b/.github/workflows/ee2-tests.yml @@ -42,4 +42,5 @@ jobs: docker-compose up -d cp test/env/test.travis.env test.env make test-coverage + codecov diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9325ecde8..d3cb732c7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/ambv/black - rev: 20.8b1 + rev: 21.5b0 hooks: - id: black exclude: '.+Impl.py' diff --git a/execution_engine2.html b/execution_engine2.html index d1ed35423..0620c8ca0 100644 --- a/execution_engine2.html +++ b/execution_engine2.html @@ -1 +1 @@ -execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*Job requirements for a job. All fields are optional. To submit job requirements,
*the user must have full EE2 admin permissions. Ignored for the run concierge endpoint.
*
*request_cpus: the number of CPUs to request for the job.
*request_memory: the amount of memory, in MB, to request for the job.
*request_disk: the amount of disk space, in GB, to request for the job.
*client_group: the name of the client group on which to run the job.
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*bill_to_user: the job will be counted against the provided user's fair share quota.
*ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false.
*scheduler_requirements: arbitrary key-value pairs to be provided to the job
*scheduler. Requires knowledge of the scheduler interface.
*debug_mode: Whether to run the job in debug mode. Default false.
*/
typedefstructure{
intrequest_cpus;
intrequst_memory;
intrequest_disk;
stringclient_group;
booleanclient_group_regex;
stringbill_to_user;
booleanignore_concurrency_limits;
mapping<string,string>scheduler_requirements;
booleandebug_mode;
}
JobRequirements;

/*
 * Parameters for submitting a job.
 *
 * method - the SDK method to run in module.method format, e.g.
 *     'KBaseTrees.construct_species_tree'
 * params - the parameters to pass to the method.
 *
 * Optional parameters:
 * app_id - the id of the Narrative application (UI) running this job (e.g.
 *     repo/name)
 * service_ver - specific version of deployed service, last version is
 *     used if this parameter is not defined
 * source_ws_objects - denotes the workspace objects that will serve as a
 *     source of data when running the SDK method. These references will
 *     be added to the autogenerated provenance. Must be in UPA format (e.g.
 *     6/90/4).
 * meta - Narrative metadata to associate with the job.
 * wsid - an optional workspace id to associate with the job. This is passed to the
 *     workspace service, which will share the job based on the permissions of
 *     the workspace rather than owner of the job
 * parent_job_id - EE2 job id for the parent of the current job.
 *     For run_job and run_job_concierge, this value can be specified to denote
 *     the parent job of the job being created.
 *     Warning: No checking is done on the validity of the job ID, and the parent job
 *     record is not altered.
 *     Submitting a job with a parent ID to run_job_batch will cause an error to be
 *     returned.
 * job_requirements: the requirements for the job. The user must have full EE2
 *     administration rights to use this parameter. Note that the job_requirements
 *     are not returned along with the rest of the job parameters when querying the EE2
 *     API - they are only considered when submitting a job.
 * as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
 *     IDs are not checked for accessibility and job_requirements may be supplied. The
 *     user must have full EE2 administration rights.
 *     Note that this field is not included in returned data when querying EE2.
 */
typedef structure {
    string method;
    string app_id;
    list<UnspecifiedObject> params;
    string service_ver;
    list<wsref> source_ws_objects;
    Meta meta;
    int wsid;
    string parent_job_id;
    JobRequirements job_requirements;
    boolean as_admin;
} RunJobParams;

/*
 * Start a new job.
 */
funcdef run_job(RunJobParams params)
    returns (job_id job_id)
    authentication required;

/*
 * Additional parameters for a batch job.
 *
 * wsid: the workspace with which to associate the parent job.
 * as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
 *     IDs are not checked for accessibility and job_requirements may be supplied. The
 *     user must have full EE2 administration rights.
 */
typedef structure {
    int wsid;
    boolean as_admin;
} BatchParams;

/*
 * The id of a parent job together with the ids of its child jobs.
 */
typedef structure {
    job_id parent_job_id;
    list<job_id> child_job_ids;
} BatchSubmission;

/*
 * Input for abandon_children: a parent job id, a list of child job ids,
 * and the as_admin flag.
 */
typedef structure {
    job_id parent_job_id;
    list<job_id> child_job_ids;
    boolean as_admin;
} AbandonChildren;

/*
 * Run a batch job, consisting of a parent job and one or more child jobs.
 * Note that the as_admin parameters in the list of child jobs are ignored -
 * only the as_admin parameter in the batch_params is considered.
 */
funcdef run_job_batch(list<RunJobParams> params, BatchParams batch_params)
    returns (BatchSubmission job_ids)
    authentication required;

funcdef abandon_children(AbandonChildren params)
    returns (BatchSubmission parent_and_child_ids)
    authentication required;

/*
 * EE2Constants Concierge Params are
 *
 * request_cpu: int
 *     (the field is declared as `request_cpu`, singular; earlier documentation
 *     called it `request_cpus`)
 * request_memory: int in MB
 * request_disk: int in GB
 * job_priority: int = None range from -20 to +20, with higher values meaning better priority.
 *     Note: job_priority is currently not implemented.
 * account_group: str = None # Someone else's account
 * ignore_concurrency_limits: ignore any limits on simultaneous job runs.
 *     Default 1 (True).
 * requirements_list: list = None ['machine=worker102','color=red']
 * client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
 * client_group_regex: Whether to treat the client group string, whether provided here,
 *     from the catalog, or as a default, as a regular expression when matching
 *     clientgroups. Default True for HTC, but the default depends on the scheduler.
 *     Omit to use the default.
 * debug_mode: Whether to run the job in debug mode. Default 0 (False).
 */
typedef structure {
    int request_cpu;
    int request_memory;
    int request_disk;
    int job_priority;
    string account_group;
    boolean ignore_concurrency_limits;
    list<string> requirements_list;
    string client_group;
    boolean client_group_regex;
    boolean debug_mode;
} ConciergeParams;

funcdef run_job_concierge(RunJobParams params, ConciergeParams concierge_params)
    returns (job_id job_id)
    authentication required;

/*
 * Get job params necessary for job execution
 *
 * @optional as_admin
 */
typedef structure {
    job_id job_id;
    boolean as_admin;
} GetJobParams;

funcdef get_job_params(GetJobParams params)
    returns (RunJobParams params)
    authentication required;

/*
 * job_id - a job id
 * status - the new status to set for the job.
 */
typedef structure {
    job_id job_id;
    string status;
    boolean as_admin;
} UpdateJobStatusParams;

funcdef update_job_status(UpdateJobStatusParams params)
    returns (job_id job_id)
    authentication required;

/*
 * A single log line.
 *
 * line - string - a string to set for the log line.
 * is_error - int - if 1, then this line should be treated as an error, default 0
 * ts - int - a timestamp since epoch in milliseconds for the log line (optional)
 *
 * @optional ts
 */
typedef structure {
    string line;
    boolean is_error;
    int ts;
} LogLine;

/*
 * @success Whether or not the add operation was successful
 * @line_number the line number of the last added log
 */
typedef structure {
    boolean success;
    int line_number;
} AddJobLogsResults;

/*
 * Input for add_job_logs: a job id and the as_admin flag.
 */
typedef structure {
    job_id job_id;
    boolean as_admin;
} AddJobLogsParams;

funcdef add_job_logs(AddJobLogsParams params, list<LogLine> lines)
    returns (AddJobLogsResults results)
    authentication required;

/*
 * last_line_number - common number of lines (including those in skip_lines
 *     parameter), this number can be used as next skip_lines value to
 *     skip already loaded lines next time.
 */
typedef structure {
    list<LogLine> lines;
    int last_line_number;
    int count;
} GetJobLogsResults;

/*
 * job_id - the job id
 * optional skip_lines Legacy Parameter for Offset
 * optional offset Number of lines to skip (in case they were already loaded before).
 * optional limit optional parameter, maximum number of lines returned
 * optional as_admin request read access to record normally not allowed.
 */
typedef structure {
    job_id job_id;
    int skip_lines;
    int offset;
    int limit;
    boolean as_admin;
} GetJobLogsParams;

funcdef get_job_logs(GetJobLogsParams params)
    returns (GetJobLogsResults)
    authentication required;

/*
 * Error block of JSON RPC response
 */
typedef structure {
    string name;
    int code;
    string message;
    string error;
} JsonRpcError;

/*
 * job_id - string - the id of the job to mark completed or finished with an error
 * error_message - string - optional unless job is finished with an error
 * error_code - int - optional unless job finished with an error
 * error - JsonRpcError - optional output from SDK Job Containers
 * job_output - job output if job completed successfully
 */
typedef structure {
    job_id job_id;
    string error_message;
    int error_code;
    UnspecifiedObject job_output;
    boolean as_admin;
} FinishJobParams;

/*
 * Register results of already started job
 */
funcdef finish_job(FinishJobParams params)
    returns ()
    authentication required;

/*
 * skip_estimation: default true. If set true, job will set to running status skipping estimation step
 */
typedef structure {
    job_id job_id;
    boolean skip_estimation;
    boolean as_admin;
} StartJobParams;

funcdef start_job(StartJobParams params)
    returns ()
    authentication required;

/*
 * exclude_fields: exclude certain fields to return. default None.
 *     exclude_fields strings can be one of fields defined in
 *     execution_engine2.db.models.models.Job
 */
typedef structure {
    job_id job_id;
    list<string> exclude_fields;
    boolean as_admin;
} CheckJobParams;

/*
 * job_id - string - id of the job
 * user - string - user who started the job
 * wsid - int - optional id of the workspace where the job is bound
 * authstrat - string - what strategy used to authenticate the job
 * job_input - object - inputs to the job (from the run_job call) ## TODO - verify
 * updated - int - timestamp since epoch in milliseconds of the last time the status was updated
 * running - int - timestamp since epoch in milliseconds of when it entered the running state
 * created - int - timestamp since epoch in milliseconds when the job was created
 * finished - int - timestamp since epoch in milliseconds when the job was finished
 * status - string - status of the job. one of the following:
 *     created - job has been created in the service
 *     estimating - an estimation job is running to estimate resources required for the main
 *         job, and which queue should be used
 *     queued - job is queued to be run
 *     running - job is running on a worker node
 *     completed - job was completed successfully
 *     error - job is no longer running, but failed with an error
 *     terminated - job is no longer running, terminated either due to user cancellation,
 *         admin cancellation, or some automated task
 * error_code - int - internal reason why the job is an error. one of the following:
 *     0 - unknown
 *     1 - job crashed
 *     2 - job terminated by automation
 *     3 - job ran over time limit
 *     4 - job was missing its automated output document
 *     5 - job authentication token expired
 * errormsg - string - message (e.g. stacktrace) accompanying an errored job
 * error - object - the JSON-RPC error package that accompanies the error code and message
 *
 * terminated_code - int - internal reason why a job was terminated, one of:
 *     0 - user cancellation
 *     1 - admin cancellation
 *     2 - terminated by some automatic process
 *
 * @optional error
 * @optional error_code
 * @optional errormsg
 * @optional terminated_code
 * @optional estimating
 * @optional running
 * @optional finished
 */
typedef structure {
    job_id job_id;
    string user;
    string authstrat;
    int wsid;
    string status;
    RunJobParams job_input;
    int created;
    int queued;
    int estimating;
    int running;
    int finished;
    int updated;
    int error_code;
    string errormsg;
    int terminated_code;
} JobState;

/*
 * get current status of a job
 */
funcdef check_job(CheckJobParams params)
    returns (JobState job_state)
    authentication required;

/*
 * parent_jobstate - state of parent job
 * child_jobstates - states of child jobs
 * IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
 */
typedef structure {
    JobState parent_jobstate;
    list<JobState> child_jobstates;
} CheckJobBatchResults;

/*
 * get current status of a parent job, and its children, if it has any.
 */
funcdef check_job_batch(CheckJobParams params)
    returns (CheckJobBatchResults)
    authentication required;

/*
 * job_states - states of jobs
 *     could be mapping<job_id, JobState> or list<JobState>
 */
typedef structure {
    list<JobState> job_states;
} CheckJobsResults;

/*
 * As in check_job, exclude_fields strings can be used to exclude fields.
 * see CheckJobParams for allowed strings.
 *
 * return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
 */
typedef structure {
    list<job_id> job_ids;
    list<string> exclude_fields;
    boolean return_list;
} CheckJobsParams;

funcdef check_jobs(CheckJobsParams params)
    returns (CheckJobsResults)
    authentication required;

/*
 * Check status of all jobs in a given workspace. Only checks jobs that have been associated
 * with a workspace at their creation.
 *
 * return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
 */
typedef structure {
    string workspace_id;
    list<string> exclude_fields;
    boolean return_list;
    boolean as_admin;
} CheckWorkspaceJobsParams;

funcdef check_workspace_jobs(CheckWorkspaceJobsParams params)
    returns (CheckJobsResults)
    authentication required;

/*
 * cancel_and_sigterm
 * """
 * Reasons for why the job was cancelled
 * Current Default is `terminated_by_user 0` so as to not update narrative client
 *     terminated_by_user = 0
 *     terminated_by_admin = 1
 *     terminated_by_automation = 2
 * """
 * job_id job_id
 *
 * @optional terminated_code
 */
typedef structure {
    job_id job_id;
    int terminated_code;
    boolean as_admin;
} CancelJobParams;

/*
 * Cancels a job. This results in the status becoming "terminated" with terminated_code 0.
 */
funcdef cancel_job(CancelJobParams params)
    returns ()
    authentication required;

/*
 * job_id - id of job running method
 * finished - indicates whether job is done (including error/cancel cases) or not
 * canceled - whether the job is canceled or not.
 * ujs_url - url of UserAndJobState service used by job service
 */
typedef structure {
    job_id job_id;
    boolean finished;
    boolean canceled;
    string ujs_url;
    boolean as_admin;
} CheckJobCanceledResult;

/*
 * Check whether a job has been canceled. This method is lightweight compared to check_job.
 */
funcdef check_job_canceled(CancelJobParams params)
    returns (CheckJobCanceledResult result)
    authentication required;

/*
 * Result of get_job_status: the job's status string.
 */
typedef structure {
    string status;
} GetJobStatusResult;

/*
 * Input for get_job_status: a job id and the as_admin flag.
 */
typedef structure {
    job_id job_id;
    boolean as_admin;
} GetJobStatusParams;

/*
 * Just returns the status string for a job of a given id.
 */
funcdef get_job_status(GetJobStatusParams params)
    returns (GetJobStatusResult result)
    authentication required;

/*
 * Projection Fields
 *     user = StringField(required=True)
 *     authstrat = StringField(
 *         required=True, default="kbaseworkspace", validation=valid_authstrat
 *     )
 *     wsid = IntField(required=False)
 *     status = StringField(required=True, validation=valid_status)
 *     updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
 *     estimating = DateTimeField(default=None)  # Time when job began estimating
 *     running = DateTimeField(default=None)  # Time when job started
 *     # Time when job finished, errored out, or was terminated by the user/admin
 *     finished = DateTimeField(default=None)
 *     errormsg = StringField()
 *     msg = StringField()
 *     error = DynamicField()
 *
 *     terminated_code = IntField(validation=valid_termination_code)
 *     error_code = IntField(validation=valid_errorcode)
 *     scheduler_type = StringField()
 *     scheduler_id = StringField()
 *     scheduler_estimator_id = StringField()
 *     job_input = EmbeddedDocumentField(JobInput, required=True)
 *     job_output = DynamicField()
 *
 * (The stray nested comment openers that used to sit between the projection
 * field list and the results description have been removed; both sections
 * remain in this single comment.)
 *
 * Results of check_jobs_date_range methods.
 *
 * jobs - the jobs matching the query, up to `limit` jobs.
 * count - the number of jobs returned.
 * query_count - the number of jobs that matched the filters.
 * filter - DEPRECATED - this field may change in the future. The filters that were
 *     applied to the jobs.
 * skip - the number of jobs that were skipped prior to beginning to return jobs.
 * projection - the list of fields included in the returned job. By default all fields.
 * limit - the maximum number of jobs returned.
 * sort_order - the order in which the results were sorted by the job ID - + for
 *     ascending, - for descending.
 *
 * TODO: DOCUMENT THE RETURN OF STATS mapping
 */
typedef structure {
    list<JobState> jobs;
    int count;
    int query_count;
    mapping<string, string> filter;
    int skip;
    list<string> projection;
    int limit;
    string sort_order;
} CheckJobsDateRangeResults;

/*
 * Check job for all jobs in a given date/time range for all users (Admin function)
 *
 * Notes on start_time and end_time:
 *     These fields are designated as floats but floats, ints, and strings are all
 *     accepted. Times are determined as follows:
 *     - if the field is a float or a string that contains a float and only a float,
 *         the field value is treated as seconds since the epoch.
 *     - if the field is an int or a string that contains an int and only an int,
 *         the field value is treated as milliseconds since the epoch.
 *     - if the field is a string not matching the criteria above, it is treated as
 *         a date and time. Nearly any unambiguous format can be parsed.
 *
 * float start_time - Filter based on job creation timestamp since epoch
 * float end_time - Filter based on job creation timestamp since epoch
 * list<string> projection - A list of fields to include in the projection, default ALL
 *     See "Projection Fields" above
 * list<string> filter - DEPRECATED: this field may change or be removed in the future.
 *     A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
 *     terminated_code = 1
 * int limit - The maximum number of records to return
 * string user - The user whose job records will be returned. Optional. Default is the
 *     current user.
 * int offset - the number of jobs to skip before returning records.
 * boolean ascending - true to sort by job ID ascending, false descending.
 * boolean as_admin - true to run the query as an admin; user must have admin EE2
 *     permissions. Required if setting `user` to something other than your own.
 *     TODO: this seems to have no effect
 *
 * @optional projection
 * @optional filter
 * @optional limit
 * @optional user
 * @optional offset
 * @optional ascending
 */
typedef structure {
    float start_time;
    float end_time;
    list<string> projection;
    list<string> filter;
    int limit;
    string user;
    int offset;
    boolean ascending;
    boolean as_admin;
} CheckJobsDateRangeParams;

funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params)
    returns (CheckJobsDateRangeResults)
    authentication required;

funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params)
    returns (CheckJobsDateRangeResults)
    authentication required;

/*
 * Wrapper around the untyped held_job object.
 */
typedef structure {
    UnspecifiedObject held_job;
} HeldJob;

/*
 * Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
 */
funcdef handle_held_job(string cluster_id)
    returns (HeldJob)
    authentication required;

/*
 * Check if current user has ee2 admin rights.
 */
funcdef is_admin()
    returns (boolean)
    authentication required;

/*
 * str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
 */
typedef structure {
    string permission;
} AdminRolesResults;

/*
 * Check if current user has ee2 admin rights.
 * If so, return the type of rights and their roles
 */
funcdef get_admin_permission()
    returns (AdminRolesResults)
    authentication required;

/*
 * Get a list of clientgroups manually extracted from the config file
 */
funcdef get_client_groups()
    returns (list<string> client_groups)
    authentication none;
};
\ No newline at end of file +execution_engine2
moduleexecution_engine2{

/*
 * @range [0,1]
 */
typedef int boolean;

/*
 * A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
 * character Z (representing the UTC timezone) or the difference
 * in time to UTC in the format +/-HHMM, eg:
 *     2012-12-17T23:24:06-0500 (EST time)
 *     2013-04-03T08:56:32+0000 (UTC time)
 *     2013-04-03T08:56:32Z (UTC time)
 */
typedef string timestamp;

/*
 * A job id.
 */
typedef string job_id;

/*
 * A structure representing the Execution Engine status
 *
 * git_commit - the Git hash of the version of the module.
 * version - the semantic version for the module.
 * service - the name of the service.
 * server_time - the current server timestamp since epoch
 *
 * # TODO - add some or all of the following
 * reboot_mode - if 1, then in the process of rebooting
 * stopping_mode - if 1, then in the process of stopping
 * running_tasks_total - number of total running jobs
 * running_tasks_per_user - mapping from user id to number of running jobs for that user
 * tasks_in_queue - number of jobs in the queue that are not running
 */
typedef structure {
    string git_commit;
    string version;
    string service;
    float server_time;
} Status;

/*
 * Returns the service configuration, including URL endpoints and timeouts.
 * The returned values are:
 *     external-url - string - url of this service
 *     kbase-endpoint - string - url of the services endpoint for the KBase environment
 *     workspace-url - string - Workspace service url
 *     catalog-url - string - catalog service url
 *     shock-url - string - shock service url
 *     handle-url - string - handle service url
 *     auth-service-url - string - legacy auth service url
 *     auth-service-url-v2 - string - current auth service url
 *     auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
 *     scratch - string - local path to scratch directory
 *     executable - string - name of Job Runner executable
 *     docker_timeout - int - time in seconds before a job will be timed out and terminated
 *     initial_dir - string - initial dir for HTCondor to search for passed input/output files
 *     transfer_input_files - initial list of files to transfer to HTCondor for job running
 */
funcdef list_config()
    returns (mapping<string, string>)
    authentication optional;

/*
 * Returns the current running version of the execution_engine2 service as a semantic version string.
 */
funcdef ver()
    returns (string)
    authentication none;

/*
 * Simply check the status of this service to see queue details
 */
funcdef status()
    returns (Status)
    authentication none;

/*
 * A workspace object reference of the form X/Y/Z, where
 *     X is the workspace id,
 *     Y is the object id,
 *     Z is the version.
 */
typedef string wsref;

/*
 * Narrative metadata for a job. All fields are optional.
 *
 * run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
 * token_id - the ID of the token used to run the method.
 * tag - the release tag, e.g. dev/beta/release.
 * cell_id - the ID of the narrative cell from which the job was run.
 */
typedef structure {
    string run_id;
    string token_id;
    string tag;
    string cell_id;
} Meta;

/*
 * Job requirements for a job. All fields are optional. To submit job requirements,
 * the user must have full EE2 admin permissions. Ignored for the run concierge endpoint.
 *
 * request_cpus: the number of CPUs to request for the job.
 * request_memory: the amount of memory, in MB, to request for the job.
 *     NOTE(review): the field is declared below with the spelling `requst_memory`,
 *     which does not match the name documented here. The declaration is left
 *     unchanged because renaming it would alter the interface; confirm which
 *     spelling the server reads before correcting either side.
 * request_disk: the amount of disk space, in GB, to request for the job.
 * client_group: the name of the client group on which to run the job.
 * client_group_regex: Whether to treat the client group string, whether provided here,
 *     from the catalog, or as a default, as a regular expression when matching
 *     clientgroups. Default True for HTC, but the default depends on the scheduler.
 *     Omit to use the default.
 * bill_to_user: the job will be counted against the provided user's fair share quota.
 * ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false.
 * scheduler_requirements: arbitrary key-value pairs to be provided to the job
 *     scheduler. Requires knowledge of the scheduler interface.
 * debug_mode: Whether to run the job in debug mode. Default false.
 */
typedef structure {
    int request_cpus;
    int requst_memory;
    int request_disk;
    string client_group;
    boolean client_group_regex;
    string bill_to_user;
    boolean ignore_concurrency_limits;
    mapping<string, string> scheduler_requirements;
    boolean debug_mode;
} JobRequirements;

/*
 * Parameters for submitting a job.
 *
 * method - the SDK method to run in module.method format, e.g.
 *     'KBaseTrees.construct_species_tree'
 * params - the parameters to pass to the method.
 *
 * Optional parameters:
 * app_id - the id of the Narrative application (UI) running this job (e.g.
 *     repo/name)
 * service_ver - specific version of deployed service, last version is
 *     used if this parameter is not defined
 * source_ws_objects - denotes the workspace objects that will serve as a
 *     source of data when running the SDK method. These references will
 *     be added to the autogenerated provenance. Must be in UPA format (e.g.
 *     6/90/4).
 * meta - Narrative metadata to associate with the job.
 * wsid - an optional workspace id to associate with the job. This is passed to the
 *     workspace service, which will share the job based on the permissions of
 *     the workspace rather than owner of the job
 * parent_job_id - EE2 job id for the parent of the current job.
 *     For run_job and run_job_concierge, this value can be specified to denote
 *     the parent job of the job being created.
 *     Warning: No checking is done on the validity of the job ID, and the parent job
 *     record is not altered.
 *     Submitting a job with a parent ID to run_job_batch will cause an error to be
 *     returned.
 * job_requirements: the requirements for the job. The user must have full EE2
 *     administration rights to use this parameter. Note that the job_requirements
 *     are not returned along with the rest of the job parameters when querying the EE2
 *     API - they are only considered when submitting a job.
 * as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
 *     IDs are not checked for accessibility and job_requirements may be supplied. The
 *     user must have full EE2 administration rights.
 *     Note that this field is not included in returned data when querying EE2.
 */
typedef structure {
    string method;
    string app_id;
    list<UnspecifiedObject> params;
    string service_ver;
    list<wsref> source_ws_objects;
    Meta meta;
    int wsid;
    string parent_job_id;
    JobRequirements job_requirements;
    boolean as_admin;
} RunJobParams;

/*
 * Start a new job.
 */
funcdef run_job(RunJobParams params)
    returns (job_id job_id)
    authentication required;

/*
 * Additional parameters for a batch job.
 *
 * wsid: the workspace with which to associate the parent job.
 * as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
 *     IDs are not checked for accessibility and job_requirements may be supplied. The
 *     user must have full EE2 administration rights.
 */
typedef structure {
    int wsid;
    boolean as_admin;
} BatchParams;

/*
 * The id of a parent job together with the ids of its child jobs.
 */
typedef structure {
    job_id parent_job_id;
    list<job_id> child_job_ids;
} BatchSubmission;

/*
 * Input for abandon_children: a parent job id, a list of child job ids,
 * and the as_admin flag.
 */
typedef structure {
    job_id parent_job_id;
    list<job_id> child_job_ids;
    boolean as_admin;
} AbandonChildren;

/*
 * Run a batch job, consisting of a parent job and one or more child jobs.
 * Note that the as_admin parameters in the list of child jobs are ignored -
 * only the as_admin parameter in the batch_params is considered.
 */
funcdef run_job_batch(list<RunJobParams> params, BatchParams batch_params)
    returns (BatchSubmission job_ids)
    authentication required;

/*
 * job_id of retried job
 * retry_id: job_id of the job that was launched
 * str error: reason as to why that particular retry failed (available for bulk retry only)
 */
typedef structure {
    job_id job_id;
    job_id retry_id;
    string error;
} RetryResult;

/*
 * job_id of job to retry
 * as_admin: retry someone else's job in your namespace
 * #TODO Possibly Add JobRequirements job_requirements;
 */
typedef structure {
    job_id job_id;
    boolean as_admin;
} RetryParams;

/*
 * job_ids of job to retry
 * as_admin: retry someone else's job in your namespace
 * #TODO: Possibly Add list<JobRequirements> job_requirements;
 */
typedef structure {
    list<job_id> job_ids;
    boolean as_admin;
} BulkRetryParams;

/*
 * #TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present
 * #TODO Add retry child that checks the status of the child? to prevent multiple retries
 * Allowed Jobs
 *     * Regular Job with no children
 *     * Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call
 * Not Allowed
 *     * Regular Job with children (Should not be possible to create yet)
 *     * Batch Job Parent Container (Not a job, it won't do anything, except cancel its child jobs)
 */
funcdef retry_job(RetryParams params)
    returns (RetryResult retry_result)
    authentication required;

/*
 * Same as retry_job, but accepts multiple jobs
 */
funcdef retry_jobs(BulkRetryParams params)
    returns (list<RetryResult> retry_result)
    authentication required;

funcdef abandon_children(AbandonChildren params)
    returns (BatchSubmission parent_and_child_ids)
    authentication required;

/*
 * EE2Constants Concierge Params are
 *
 * request_cpu: int
 *     (the field is declared as `request_cpu`, singular; earlier documentation
 *     called it `request_cpus`)
 * request_memory: int in MB
 * request_disk: int in GB
 * job_priority: int = None range from -20 to +20, with higher values meaning better priority.
 *     Note: job_priority is currently not implemented.
 * account_group: str = None # Someone else's account
 * ignore_concurrency_limits: ignore any limits on simultaneous job runs.
 *     Default 1 (True).
 * requirements_list: list = None ['machine=worker102','color=red']
 * client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
 * client_group_regex: Whether to treat the client group string, whether provided here,
 *     from the catalog, or as a default, as a regular expression when matching
 *     clientgroups. Default True for HTC, but the default depends on the scheduler.
 *     Omit to use the default.
 * debug_mode: Whether to run the job in debug mode. Default 0 (False).
 */
typedef structure {
    int request_cpu;
    int request_memory;
    int request_disk;
    int job_priority;
    string account_group;
    boolean ignore_concurrency_limits;
    list<string> requirements_list;
    string client_group;
    boolean client_group_regex;
    boolean debug_mode;
} ConciergeParams;

funcdef run_job_concierge(RunJobParams params, ConciergeParams concierge_params)
    returns (job_id job_id)
    authentication required;

/*
 * Get job params necessary for job execution
 *
 * @optional as_admin
 */
typedef structure {
    job_id job_id;
    boolean as_admin;
} GetJobParams;

funcdef get_job_params(GetJobParams params)
    returns (RunJobParams params)
    authentication required;

/*
 * job_id - a job id
 * status - the new status to set for the job.
 */
typedef structure {
    job_id job_id;
    string status;
    boolean as_admin;
} UpdateJobStatusParams;

funcdef update_job_status(UpdateJobStatusParams params)
    returns (job_id job_id)
    authentication required;

/*
 * A single log line.
 *
 * line - string - a string to set for the log line.
 * is_error - int - if 1, then this line should be treated as an error, default 0
 * ts - int - a timestamp since epoch in milliseconds for the log line (optional)
 *
 * @optional ts
 */
typedef structure {
    string line;
    boolean is_error;
    int ts;
} LogLine;

/*
 * @success Whether or not the add operation was successful
 * @line_number the line number of the last added log
 */
typedef structure {
    boolean success;
    int line_number;
} AddJobLogsResults;

/*
 * Input for add_job_logs: a job id and the as_admin flag.
 */
typedef structure {
    job_id job_id;
    boolean as_admin;
} AddJobLogsParams;

funcdef add_job_logs(AddJobLogsParams params, list<LogLine> lines)
    returns (AddJobLogsResults results)
    authentication required;

/*
 * last_line_number - common number of lines (including those in skip_lines
 *     parameter), this number can be used as next skip_lines value to
 *     skip already loaded lines next time.
 */
typedef structure {
    list<LogLine> lines;
    int last_line_number;
    int count;
} GetJobLogsResults;

/*
 * job_id - the job id
 * optional skip_lines Legacy Parameter for Offset
 * optional offset Number of lines to skip (in case they were already loaded before).
 * optional limit optional parameter, maximum number of lines returned
 * optional as_admin request read access to record normally not allowed.
 */
typedef structure {
    job_id job_id;
    int skip_lines;
    int offset;
    int limit;
    boolean as_admin;
} GetJobLogsParams;

funcdef get_job_logs(GetJobLogsParams params)
    returns (GetJobLogsResults)
    authentication required;

/*
 * Error block of JSON RPC response
 */
typedef structure {
    string name;
    int code;
    string message;
    string error;
} JsonRpcError;

/*
 * job_id - string - the id of the job to mark completed or finished with an error
 * error_message - string - optional unless job is finished with an error
 * error_code - int - optional unless job finished with an error
 * error - JsonRpcError - optional output from SDK Job Containers
 * job_output - job output if job completed successfully
 */
typedef structure {
    job_id job_id;
    string error_message;
    int error_code;
    UnspecifiedObject job_output;
    boolean as_admin;
} FinishJobParams;

/*
 * Register results of already started job
 */
funcdef finish_job(FinishJobParams params)
    returns ()
    authentication required;

/*
 * skip_estimation: default true. If set true, job will set to running status skipping estimation step
 */
typedef structure {
    job_id job_id;
    boolean skip_estimation;
    boolean as_admin;
} StartJobParams;

funcdef start_job(StartJobParams params)
    returns ()
    authentication required;

/*
 * exclude_fields: exclude certain fields to return. default None.
 *     exclude_fields strings can be one of fields defined in
 *     execution_engine2.db.models.models.Job
 */
typedef structure {
    job_id job_id;
    list<string> exclude_fields;
    boolean as_admin;
} CheckJobParams;

/*
 * job_id - string - id of the job
 * user - string - user who started the job
 * wsid - int - optional id of the workspace where the job is bound
 * authstrat - string - what strategy used to authenticate the job
 * job_input - object - inputs to the job (from the run_job call) ## TODO - verify
 * updated - int - timestamp since epoch in milliseconds of the last time the status was updated
 * running - int - timestamp since epoch in milliseconds of when it entered the running state
 * created - int - timestamp since epoch in milliseconds when the job was created
 * finished - int - timestamp since epoch in milliseconds when the job was finished
 * status - string - status of the job. one of the following:
 *     created - job has been created in the service
 *     estimating - an estimation job is running to estimate resources required for the main
 *         job, and which queue should be used
 *     queued - job is queued to be run
 *     running - job is running on a worker node
 *     completed - job was completed successfully
 *     error - job is no longer running, but failed with an error
 *     terminated - job is no longer running, terminated either due to user cancellation,
 *         admin cancellation, or some automated task
 * error_code - int - internal reason why the job is an error. one of the following:
 *     0 - unknown
 *     1 - job crashed
 *     2 - job terminated by automation
 *     3 - job ran over time limit
 *     4 - job was missing its automated output document
 *     5 - job authentication token expired
 * errormsg - string - message (e.g. stacktrace) accompanying an errored job
 * error - object - the JSON-RPC error package that accompanies the error code and message
 *
 * terminated_code - int - internal reason why a job was terminated, one of:
 *     0 - user cancellation
 *     1 - admin cancellation
 *     2 - terminated by some automatic process
 *
 * @optional error
 * @optional error_code
 * @optional errormsg
 * @optional terminated_code
 * @optional estimating
 * @optional running
 * @optional finished
 */
typedef structure {
    job_id job_id;
    string user;
    string authstrat;
    int wsid;
    string status;
    RunJobParams job_input;
    int created;
    int queued;
    int estimating;
    int running;
    int finished;
    int updated;
    int error_code;
    string errormsg;
    int terminated_code;
} JobState;

/*
 * get current status of a job
 */
funcdef check_job(CheckJobParams params)
    returns (JobState job_state)
    authentication required;

/*
 * parent_jobstate - state of parent job
 * child_jobstates - states of child jobs
 * IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
 */
typedef structure {
    JobState parent_jobstate;
    list<JobState> child_jobstates;
} CheckJobBatchResults;

/*
 * get current status of a parent job, and its children, if it has any.
 */
funcdef check_job_batch(CheckJobParams params) returns (CheckJobBatchResults) authentication required;

/*
 * job_states - states of jobs
 * could be mapping<job_id, JobState> or list<JobState>
 */
typedef structure {
    list<JobState> job_states;
} CheckJobsResults;

/*
 * As in check_job, exclude_fields strings can be used to exclude fields.
 * see CheckJobParams for allowed strings.
 *
 * return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
 */
typedef structure {
    list<job_id> job_ids;
    list<string> exclude_fields;
    boolean return_list;
} CheckJobsParams;

/*
 * Get the states of the jobs listed in params.job_ids; see CheckJobsParams and
 * CheckJobsResults for the input and output structures.
 */
funcdef check_jobs(CheckJobsParams params) returns (CheckJobsResults) authentication required;

/*
 * Check status of all jobs in a given workspace. Only checks jobs that have been associated
 * with a workspace at their creation.
 *
 * return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
 */
typedef structure {
    string workspace_id;
    list<string> exclude_fields;
    boolean return_list;
    boolean as_admin;
} CheckWorkspaceJobsParams;

/*
 * Get the states of the jobs associated with params.workspace_id; see
 * CheckWorkspaceJobsParams and CheckJobsResults for the input and output structures.
 */
funcdef check_workspace_jobs(CheckWorkspaceJobsParams params) returns (CheckJobsResults) authentication required;

/*
 * cancel_and_sigterm
 * """
 * Reasons for why the job was cancelled
 * Current Default is `terminated_by_user 0` so as to not update narrative client
 * terminated_by_user = 0
 * terminated_by_admin = 1
 * terminated_by_automation = 2
 * """
 * job_id job_id
 * @optional terminated_code
 */
typedef structure {
    job_id job_id;
    int terminated_code;
    boolean as_admin;
} CancelJobParams;

/*
 * Cancels a job. This results in the status becoming "terminated" with terminated_code 0.
 */
funcdef cancel_job(CancelJobParams params) returns () authentication required;

/*
 * job_id - id of job running method
 * finished - indicates whether job is done (including error/cancel cases) or not
 * canceled - whether the job is canceled or not.
 * ujs_url - url of UserAndJobState service used by job service
 */
typedef structure {
    job_id job_id;
    boolean finished;
    boolean canceled;
    string ujs_url;
    boolean as_admin;
} CheckJobCanceledResult;

/*
 * Check whether a job has been canceled. This method is lightweight compared to check_job.
 */
funcdef check_job_canceled(CancelJobParams params) returns (CheckJobCanceledResult result) authentication required;

/*
 * status - the status string of the job, as returned by get_job_status
 */
typedef structure {
    string status;
} GetJobStatusResult;

/*
 * job_id - id of the job whose status is requested
 * as_admin - NOTE(review): presumably runs the request with EE2 admin permissions,
 *     as described for other as_admin flags in this spec; confirm against the implementation
 */
typedef structure {
    job_id job_id;
    boolean as_admin;
} GetJobStatusParams;

/*
 * Just returns the status string for a job of a given id.
 */
funcdef get_job_status(GetJobStatusParams params) returns (GetJobStatusResult result) authentication required;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/

/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedef structure {
    list<JobState> jobs;
    int count;
    int query_count;
    mapping<string, string> filter;
    int skip;
    list<string> projection;
    int limit;
    string sort_order;
} CheckJobsDateRangeResults;

/*
 * Check job for all jobs in a given date/time range for all users (Admin function)
 * Notes on start_time and end_time:
 *     These fields are designated as floats but floats, ints, and strings are all
 *     accepted. Times are determined as follows:
 *     - if the field is a float or a string that contains a float and only a float,
 *         the field value is treated as seconds since the epoch.
 *     - if the field is an int or a string that contains an int and only an int,
 *         the field value is treated as milliseconds since the epoch.
 *     - if the field is a string not matching the criteria above, it is treated as
 *         a date and time. Nearly any unambiguous format can be parsed.
 *
 * float start_time - Filter based on job creation timestamp since epoch
 * float end_time - Filter based on job creation timestamp since epoch
 * list<string> projection - A list of fields to include in the projection, default ALL
 *     See "Projection Fields" above
 * list<string> filter - DEPRECATED: this field may change or be removed in the future.
 *     A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
 *     terminated_code = 1
 * int limit - The maximum number of records to return
 * string user - The user whose job records will be returned. Optional. Default is the
 *     current user.
 * int offset - the number of jobs to skip before returning records.
 * boolean ascending - true to sort by job ID ascending, false descending.
 * boolean as_admin - true to run the query as an admin; user must have admin EE2
 *     permissions. Required if setting `user` to something other than your own.
 *     TODO: this seems to have no effect
 * @optional projection
 * @optional filter
 * @optional limit
 * @optional user
 * @optional offset
 * @optional ascending
 */
typedef structure {
    float start_time;
    float end_time;
    list<string> projection;
    list<string> filter;
    int limit;
    string user;
    int offset;
    boolean ascending;
    boolean as_admin;
} CheckJobsDateRangeParams;

/*
 * Check jobs in a date/time range for one user. Per CheckJobsDateRangeParams, `user`
 * defaults to the current user; querying another user requires as_admin.
 */
funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params) returns (CheckJobsDateRangeResults) authentication required;

/*
 * Check jobs in a date/time range for all users (Admin function); see
 * CheckJobsDateRangeParams for the accepted filters.
 */
funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params) returns (CheckJobsDateRangeResults) authentication required;

/*
 * held_job - unstructured object returned by handle_held_job
 */
typedef structure {
    UnspecifiedObject held_job;
} HeldJob;

/*
 * Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
 */
funcdef handle_held_job(string cluster_id) returns (HeldJob) authentication required;

/*
 * Check if current user has ee2 admin rights.
 */
funcdef is_admin() returns (boolean) authentication required;

/*
 * str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
 */
typedef structure {
    string permission;
} AdminRolesResults;

/*
 * Check if current user has ee2 admin rights.
 * If so, return the type of rights and their roles
 */
funcdef get_admin_permission() returns (AdminRolesResults) authentication required;

/*
 * Get a list of clientgroups manually extracted from the config file
 */
funcdef get_client_groups() returns (list<string> client_groups) authentication none;
};
\ No newline at end of file diff --git a/execution_engine2.spec b/execution_engine2.spec index 358ea17f5..5aa498bf0 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -198,10 +198,62 @@ funcdef run_job_batch(list params, BatchParams batch_params) returns (BatchSubmission job_ids) authentication required; + /* + job_id of retried job + retry_id: job_id of the job that was launched + str error: reason as to why that particular retry failed (available for bulk retry only) + */ + typedef structure { + job_id job_id; + job_id retry_id; + string error; + } RetryResult; + + /* + job_id of job to retry + as_admin: retry someone elses job in your namespace + #TODO Possibly Add JobRequirements job_requirements; + */ + typedef structure { + job_id job_id; + boolean as_admin; + } RetryParams; + + /* + job_ids of job to retry + as_admin: retry someone else's job in your namespace + #TODO: Possibly Add list job_requirements; + */ + typedef structure { + list job_ids; + boolean as_admin; + } BulkRetryParams; + + /* + #TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present + #TODO Add retry child that checks the status of the child? 
to prevent multiple retries + Allowed Jobs + * Regular Job with no children + * Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call + Not Allowed + * Regular Job with children (Should not be possible to create yet) + * Batch Job Parent Container (Not a job, it won't do anything, except cancel it's child jobs) + */ + funcdef retry_job(RetryParams params) returns (RetryResult retry_result) authentication required; + + /* + Same as retry_job, but accepts multiple jobs + */ + funcdef retry_jobs(BulkRetryParams params) returns (list retry_result) authentication required; + + + + funcdef abandon_children(AbandonChildren params) returns (BatchSubmission parent_and_child_ids) authentication required; + /* EE2Constants Concierge Params are request_cpus: int request_memory: int in MB diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index 96caf8cda..0356f037b 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -216,6 +216,7 @@ class TerminatedCode(Enum): terminated_by_admin = 1 terminated_by_automation = 2 terminated_by_batch_abort = 3 + terminated_by_server_failure = 4 class Status(Enum): @@ -311,15 +312,21 @@ class Job(Document): terminated_code = IntField(validation=valid_termination_code) error_code = IntField(validation=valid_errorcode) - + batch_job = BooleanField(default=False) scheduler_type = StringField() scheduler_id = StringField() scheduler_estimator_id = StringField() job_input = EmbeddedDocumentField(JobInput, required=True) job_output = DynamicField() condor_job_ads = DynamicField() - child_jobs = ListField() - batch_job = BooleanField(default=False) + child_jobs = ListField() # Only parent container should have child jobs + # batch_parent_container = BooleanField(default=False) # Only parent container should have this + + # Only present when a job has been retried and on the retry_parent + retry_count = 
IntField(min_value=0) + + # Only present on a retried job, not it's parent. If attempting to retry this job, use its parent instead + retry_parent = StringField() meta = {"collection": "ee2_jobs"} @@ -331,6 +338,53 @@ def __repr__(self): return self.to_json() +# class BatchJobCollection(Document): +# """ +# A container for storing related batch job containers +# Does this need to exist before creating a collection? +# """ +# +# # User and wsid are used for permission handling +# user = StringField(required=True) +# wsid = IntField(required=False, default=None) +# batch_jobs = ListField(required=True) +# updated = FloatField(default=time.time) +# title = StringField(required=False) +# description = StringField(required=False) +# +# def save(self, *args, **kwargs): +# self.updated = time.time() +# return super(BatchJobCollection, self).save(*args, **kwargs) +# +# def __repr__(self): +# return self.to_json() +# +# +# class BatchJobContainer(Document): +# """ +# A container for storing jobs information +# Can be created via run_job_batch endpoint, or through the UI/ee2 api, +# or a running job with the ee2_client +# """ +# +# meta = {"collection": "ee2_jobs"} +# user = StringField(required=True) +# wsid = IntField(required=False, default=None) +# updated = FloatField(default=time.time) +# scheduler_type = StringField(default="htcondor", required=False) +# child_jobs = ListField(required=True) +# title = StringField(required=False) +# description = StringField(required=False) +# meta = {"collection": "ee2_jobs"} +# +# def save(self, *args, **kwargs): +# self.updated = time.time() +# return super(BatchJobContainer, self).save(*args, **kwargs) +# +# def __repr__(self): +# return self.to_json() + + # Unused for now class HeldJob(Document): job_id = ReferenceField(Job) diff --git a/lib/execution_engine2/exceptions.py b/lib/execution_engine2/exceptions.py index 523ac086b..de471d4c8 100644 --- a/lib/execution_engine2/exceptions.py +++ b/lib/execution_engine2/exceptions.py @@ 
-13,11 +13,11 @@ class IncorrectParamsException(ExecutionEngineValueError): class MissingRunJobParamsException(ExecutionEngineValueError): - pass + """Missing a required run_job_parameter""" class InvalidStatusTransitionException(ExecutionEngineValueError): - pass + """Raised if the status transition is NOT ALLOWED""" class InvalidOperationForStatusException(ExecutionEngineValueError): @@ -25,30 +25,36 @@ class InvalidOperationForStatusException(ExecutionEngineValueError): class MissingCondorRequirementsException(ExecutionEngineValueError): - pass + """Raised if malformed requirements information is retrieved for an ee2 job""" class MalformedJobIdException(ExecutionEngineValueError): - pass + """Raised if bad ee2 id is passed in""" class MalformedTimestampException(ExecutionEngineException): - pass + """Bad timestamps""" class ChildrenNotFoundError(ExecutionEngineException): - pass + """Raised if children are not found for a given parent when attempting to abandon children""" class RecordNotFoundException(ExecutionEngineException): - pass + """Raised if ee2 job or ee2 job log record is not found in db""" class CondorJobNotFoundException(ExecutionEngineException): - pass + """Raised if condor job is not found""" + + +class RetryFailureException(ExecutionEngineException): + """General exception for couldn't Retry the job failures'""" + + +class CannotRetryJob(ExecutionEngineException): + """Can only retry errored or cancelled jobs, and not batch parents""" class AuthError(ExecutionEngineException): """Raised if a user is unauthorized for a particular action, or doesn't have the right auth role""" - - pass diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 47434551b..e2684a99d 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -29,8 +29,8 @@ class execution_engine2: # the latter method is running. 
######################################### noqa VERSION = "0.0.5" - GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" - GIT_COMMIT_HASH = "46df42a6ed9fda3796cf9a68ea8088067e674936" + GIT_URL = "git@github.com:kbase/execution_engine2.git" + GIT_COMMIT_HASH = "8b6f4e1917dbdfa374e6f22b1f2adbe7eca5a24c" #BEGIN_CLASS_HEADER MONGO_COLLECTION = "jobs" @@ -375,6 +375,81 @@ def run_job_batch(self, ctx, params, batch_params): # return the results return [job_ids] + def retry_job(self, ctx, params): + """ + #TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present + #TODO Add retry child that checks the status of the child? to prevent multiple retries + Allowed Jobs + Regular Job with no children + Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call + Not Allowed + Regular Job with children (Should not be possible to create yet) + Batch Job Parent Container (Not a job, it won't do anything, except cancel it's child jobs) + :param params: instance of type "RetryParams" (job_id of job to retry + as_admin: retry someone elses job in your namespace #TODO Possibly + Add JobRequirements job_requirements;) -> structure: parameter + "job_id" of type "job_id" (A job id.), parameter "as_admin" of + type "boolean" (@range [0,1]) + :returns: instance of type "RetryResult" (job_id of retried job + retry_id: job_id of the job that was launched str error: reason as + to why that particular retry failed (available for bulk retry + only)) -> structure: parameter "job_id" of type "job_id" (A job + id.), parameter "retry_id" of type "job_id" (A job id.), parameter + "error" of String + """ + # ctx is the context object + # return variables are: retry_result + #BEGIN retry_job + mr = SDKMethodRunner( + user_clients=self.gen_cfg.get_user_clients(ctx), + clients = self.clients, + job_permission_cache=self.job_permission_cache, + admin_permissions_cache=self.admin_permissions_cache + ) + retry_result = 
mr.retry(job_id=params.get('job_id'), as_admin=params.get('as_admin')) + #END retry_job + + # At some point might do deeper type checking... + if not isinstance(retry_result, dict): + raise ValueError('Method retry_job return value ' + + 'retry_result is not type dict as required.') + # return the results + return [retry_result] + + def retry_jobs(self, ctx, params): + """ + Same as retry_job, but accepts multiple jobs + :param params: instance of type "BulkRetryParams" (job_ids of job to + retry as_admin: retry someone else's job in your namespace #TODO: + Possibly Add list job_requirements;) -> + structure: parameter "job_ids" of list of type "job_id" (A job + id.), parameter "as_admin" of type "boolean" (@range [0,1]) + :returns: instance of list of type "RetryResult" (job_id of retried + job retry_id: job_id of the job that was launched str error: + reason as to why that particular retry failed (available for bulk + retry only)) -> structure: parameter "job_id" of type "job_id" (A + job id.), parameter "retry_id" of type "job_id" (A job id.), + parameter "error" of String + """ + # ctx is the context object + # return variables are: retry_result + #BEGIN retry_jobs + mr = SDKMethodRunner( + user_clients=self.gen_cfg.get_user_clients(ctx), + clients = self.clients, + job_permission_cache=self.job_permission_cache, + admin_permissions_cache=self.admin_permissions_cache + ) + retry_result = mr.retry_multiple(job_ids=params.get('job_ids'), as_admin=params.get('as_admin')) + #END retry_jobs + + # At some point might do deeper type checking... 
+ if not isinstance(retry_result, list): + raise ValueError('Method retry_jobs return value ' + + 'retry_result is not type list as required.') + # return the results + return [retry_result] + def abandon_children(self, ctx, params): """ :param params: instance of type "AbandonChildren" -> structure: diff --git a/lib/execution_engine2/execution_engine2Server.py b/lib/execution_engine2/execution_engine2Server.py index 5e3df02ef..b63fe2210 100644 --- a/lib/execution_engine2/execution_engine2Server.py +++ b/lib/execution_engine2/execution_engine2Server.py @@ -400,6 +400,18 @@ def __init__(self): self.method_authentication[ "execution_engine2.run_job_batch" ] = "required" # noqa + self.rpc_service.add( + impl_execution_engine2.retry_job, + name="execution_engine2.retry_job", + types=[dict], + ) + self.method_authentication["execution_engine2.retry_job"] = "required" # noqa + self.rpc_service.add( + impl_execution_engine2.retry_jobs, + name="execution_engine2.retry_jobs", + types=[dict], + ) + self.method_authentication["execution_engine2.retry_jobs"] = "required" # noqa self.rpc_service.add( impl_execution_engine2.abandon_children, name="execution_engine2.abandon_children", diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index c6c56bfa5..3b7f9d48d 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -6,6 +6,7 @@ """ import os import time +from collections import defaultdict from enum import Enum from typing import Optional, Dict, NamedTuple, Union, List, Any @@ -18,13 +19,20 @@ ErrorCode, TerminatedCode, ) +from execution_engine2.exceptions import ( + IncorrectParamsException, + AuthError, + CannotRetryJob, + RetryFailureException, +) +from execution_engine2.sdk.EE2Constants import CONCIERGE_CLIENTGROUP from execution_engine2.sdk.job_submission_parameters import ( JobSubmissionParameters, JobRequirements as ResolvedRequirements, AppInfo, UserCreds, ) -from 
execution_engine2.sdk.EE2Constants import CONCIERGE_CLIENTGROUP +from execution_engine2.utils.KafkaUtils import KafkaCreateJob, KafkaQueueChange from execution_engine2.utils.job_requirements_resolver import ( REQUEST_CPUS, REQUEST_DISK, @@ -36,19 +44,20 @@ DEBUG_MODE, ) from execution_engine2.utils.job_requirements_resolver import RequirementsType -from execution_engine2.utils.KafkaUtils import KafkaCreateJob, KafkaQueueChange -from execution_engine2.exceptions import IncorrectParamsException, AuthError - _JOB_REQUIREMENTS = "job_reqs" _JOB_REQUIREMENTS_INCOMING = "job_requirements" _SCHEDULER_REQUIREMENTS = "scheduler_requirements" +_META = "meta" # narrative_cell_info +_APP_PARAMS = "params" # application parameters _REQUIREMENTS_LIST = "requirements_list" _METHOD = "method" _APP_ID = "app_id" _PARENT_JOB_ID = "parent_job_id" +_PARENT_RETRY_JOB_ID = "retry_parent" _WORKSPACE_ID = "wsid" _SOURCE_WS_OBJECTS = "source_ws_objects" +_SERVICE_VER = "service_ver" class JobPermissions(Enum): @@ -79,6 +88,22 @@ def _init_job_rec( user_id: str, params: Dict, ) -> str: + f""" + Save an initial job record to the db and send a message to kafka + + *** Expected OPTIONAL Parameters *** + {_WORKSPACE_ID} (The workspace id) + {_APP_PARAMS} (job params for the app/method itself) + {_SERVICE_VER} (app version) + {_APP_ID} (app UI) + {_SOURCE_WS_OBJECTS} (collected workspace objects for this app) + {_PARENT_JOB_ID} (parent of this job, doesn't update/notify the parent) + {_META} (narrative cell information) + + *** Expected REQUIRED Parameters *** + {_METHOD} (The app method to run) + {_JOB_REQUIREMENTS} (Job Resource information) + """ job = Job() inputs = JobInput() job.user = user_id @@ -87,54 +112,67 @@ def _init_job_rec( job.status = "created" # Inputs inputs.wsid = job.wsid - inputs.method = params.get(_METHOD) + + required_job_inputs = [_JOB_REQUIREMENTS, _METHOD] + for item in required_job_inputs: + if item not in params: + raise ValueError(f"{item} is required for job 
initialization") + + inputs.method = params[_METHOD] inputs.params = params.get("params") - params["service_ver"] = self._get_module_git_commit( - params.get(_METHOD), params.get("service_ver") + # Catalog git commit + params[_SERVICE_VER] = self._get_module_git_commit( + params.get(_METHOD), params.get(_SERVICE_VER) ) - inputs.service_ver = params.get("service_ver") - + inputs.service_ver = params.get(_SERVICE_VER) inputs.app_id = params.get(_APP_ID) inputs.source_ws_objects = params.get(_SOURCE_WS_OBJECTS) - inputs.parent_job_id = str(params.get(_PARENT_JOB_ID)) + + parent_job_id = params.get(_PARENT_JOB_ID) + if parent_job_id: + inputs.parent_job_id = str(parent_job_id) + inputs.narrative_cell_info = Meta() - meta = params.get("meta") + # Meta and Requirements + meta = params.get(_META) if meta: for meta_attr in ["run_id", "token_id", "tag", "cell_id"]: inputs.narrative_cell_info[meta_attr] = meta.get(meta_attr) - - jr = JobRequirements() - jr.cpu = params[_JOB_REQUIREMENTS].cpus - jr.memory = params[_JOB_REQUIREMENTS].memory_MB - jr.disk = params[_JOB_REQUIREMENTS].disk_GB - jr.clientgroup = params[_JOB_REQUIREMENTS].client_group + resolved_reqs = params[_JOB_REQUIREMENTS] # type: ResolvedRequirements + jr = JobRequirements( + cpu=resolved_reqs.cpus, + memory=resolved_reqs.memory_MB, + disk=resolved_reqs.disk_GB, + clientgroup=resolved_reqs.client_group, + ) inputs.requirements = jr - job.job_input = inputs - job_id = self.sdkmr.save_job(job) + f""" + Set the id of the parent that was retried to get this job + The {_PARENT_RETRY_JOB_ID} will only be set on a job retry + """ + parent_retry_job_id = params.get(_PARENT_RETRY_JOB_ID) + if parent_retry_job_id: + job.retry_parent = str(parent_retry_job_id) + + job_id = self.sdkmr.save_job(job) self.sdkmr.get_kafka_client().send_kafka_message( message=KafkaCreateJob(job_id=job_id, user=user_id) ) - return job_id def _get_module_git_commit(self, method, service_ver=None) -> Optional[str]: module_name = 
method.split(".")[0] - if not service_ver: service_ver = "release" - self.logger.debug(f"Getting commit for {module_name} {service_ver}") - module_version = self.sdkmr.get_catalog().get_module_version( {"module_name": module_name, "version": service_ver} ) - git_commit_hash = module_version.get("git_commit_hash") - return git_commit_hash def _check_ws_objects(self, source_objects) -> None: @@ -149,6 +187,7 @@ def _check_ws_objects(self, source_objects) -> None: ) paths = info.get("paths") + # TODO It would be nice to show which object is inaccessible if None in paths: raise ValueError("Some workspace object is inaccessible") @@ -293,6 +332,7 @@ def _create_parent_job(self, wsid, meta): def _run_batch(self, parent_job: Job, params): child_jobs = [] + for job_param in params: job_param[_PARENT_JOB_ID] = str(parent_job.id) try: @@ -321,7 +361,8 @@ def run_batch( if type(params) != list: raise IncorrectParamsException("params must be a list") wsid = batch_params.get(_WORKSPACE_ID) - meta = batch_params.get("meta") + meta = batch_params.get(_META) + if as_admin: self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) else: @@ -338,6 +379,7 @@ def run_batch( parent_job = self._create_parent_job(wsid=wsid, meta=meta) children_jobs = self._run_batch(parent_job=parent_job, params=params) + return {_PARENT_JOB_ID: str(parent_job.id), "child_job_ids": children_jobs} # modifies the jobs in place @@ -457,6 +499,208 @@ def _check_job_arguments(self, jobs, has_parent_job=False): # although most likely jobs aren't operating on the same object self._check_ws_objects(source_objects=job.get(_SOURCE_WS_OBJECTS)) + @staticmethod + def _retryable(status: str): + return status in [Status.terminated.value, Status.error.value] + + def _safe_cancel( + self, + job_id: str, + terminated_code: TerminatedCode, + ): + try: + self.sdkmr.cancel_job(job_id=job_id, terminated_code=terminated_code.value) + except Exception as e: + self.logger.error(f"Couldn't cancel {job_id} due to {e}") + + 
def _db_update_failure( + self, job_that_failed_operation: str, job_to_abort: str, exception: Exception + ): + """Attempt to cancel created/queued/running retried job and then raise exception""" + # TODO Use and create a method in sdkmr? + msg = ( + f"Couldn't update job record:{job_that_failed_operation} during retry. Aborting:{job_to_abort}" + f" Exception:{exception} " + ) + self._safe_cancel( + job_id=job_to_abort, + terminated_code=TerminatedCode.terminated_by_server_failure, + ) + # TODO Maybe move this log into multiple so not multiple error messages are generated + self.logger.error(msg, exc_info=True, stack_info=True) + raise RetryFailureException(msg) + + def _validate_retry_presubmit(self, job_id: str, as_admin: bool = False): + """ + Validate retry request before attempting to contact scheduler + + _validate doesn't do a recursive check if if the job has a retry parent, + but the _validate call on the recursion is guaranteed to pass because + the parent was retried once already so the _validate must have passed previously. + Since the parent job's state can't have changed it would just pass again. + """ + + # Check to see if you still have permissions to the job and then optionally the parent job id + job = self.sdkmr.get_job_with_permission( + job_id, JobPermissions.WRITE, as_admin=as_admin + ) # type: Job + job_input = job.job_input # type: JobInput + + parent_job = None + if job_input.parent_job_id: + parent_job = self.sdkmr.get_job_with_permission( + job_input.parent_job_id, JobPermissions.WRITE, as_admin=as_admin + ) + + if job.batch_job: + raise CannotRetryJob( + "Cannot retry batch job parents. 
Must retry individual jobs" + ) + + if not self._retryable(job.status): + raise CannotRetryJob( + f"Error retrying job {job_id} with status {job.status}: can only retry jobs with status 'error' or 'terminated'" + ) + + return job, parent_job + + def _retry(self, job_id: str, job: Job, parent_job: Job, as_admin: bool = False): + # Cannot retry a retried job, you must retry the retry_parent + if job.retry_parent: + return self.retry(str(job.retry_parent), as_admin=as_admin) + + # Get run job params from db, and inject parent job id, then run it + run_job_params = self._get_run_job_params_from_existing_job( + job, user_id=self.sdkmr.user_id + ) + # Submit job to job scheduler or fail and not count it as a retry attempt + run_job_params[_PARENT_RETRY_JOB_ID] = job_id + retry_job_id = self.run(params=run_job_params, as_admin=as_admin) + + # Save that the job has been retried, and increment the count. Notify the parent(s) + # 1) Notify the parent container that it has a new child.. + if parent_job: + try: + parent_job.modify(push__child_jobs=retry_job_id) + except Exception as e: + self._db_update_failure( + job_that_failed_operation=str(parent_job.id), + job_to_abort=retry_job_id, + exception=e, + ) + + # 2) Notify the retry_parent that it has been retried + try: + job.modify(inc__retry_count=1) + except Exception as e: + self._db_update_failure( + job_that_failed_operation=str(job.id), + job_to_abort=retry_job_id, + exception=e, + ) + + # Should we compare the original and child job to make sure certain fields match, + # to make sure the retried job is correctly submitted? Or save that for a unit test? 
+ return {"job_id": job_id, "retry_id": retry_job_id} + + def retry(self, job_id: str, as_admin=False) -> Dict[str, Optional[str]]: + """ + #TODO Add new job requirements/cgroups as an optional param + :param job_id: The main job to retry + :param as_admin: Run with admin permission + :return: The child job id that has been retried + """ + job, parent_job = self._validate_retry_presubmit( + job_id=job_id, as_admin=as_admin + ) + return self._retry( + job_id=job_id, job=job, parent_job=parent_job, as_admin=as_admin + ) + + def retry_multiple( + self, job_ids, as_admin=False + ) -> List[Dict[str, Union[str, Any]]]: + """ + #TODO Add new job requirements/cgroups as an optional param + #TODO Notify the parent container that it has multiple new children, instead of multiple transactions? + + :param job_ids: The list of jobs to retry + :param as_admin: Run with admin permission + :return: The child job ids that have been retried or errors + """ + if not job_ids: + raise ValueError("No job_ids provided to retry") + + offending_ids = defaultdict(int) + for job_id in job_ids: + if job_ids.count(job_id) > 1: + offending_ids[job_id] += 1 + + if offending_ids.keys(): + raise ValueError( + f"Retry of the same id in the same request is not supported." 
+ f" Offending ids:{list(offending_ids.keys())} " + ) + + # Check all inputs before attempting to start submitting jobs + retried_jobs = [] + jobs = [] + parent_jobs = [] + for job_id in job_ids: + try: + job, parent_job = self._validate_retry_presubmit( + job_id=job_id, as_admin=as_admin + ) + jobs.append(job) + parent_jobs.append(parent_job) + except Exception as e: + raise RetryFailureException(e) + + # Submit all of the collected jobs + for i, job_id in enumerate(job_ids): + try: + retried_jobs.append( + self._retry( + job_id=job_id, + job=jobs[i], + parent_job=parent_jobs[i], + as_admin=as_admin, + ) + ) + except Exception as e: + retried_jobs.append({"job_id": job_id, "error": f"{e}"}) + return retried_jobs + + @staticmethod + def _get_run_job_params_from_existing_job(job: Job, user_id: str) -> Dict: + """ + Get top level fields from job model to be sent into `run_job` + """ + ji = job.job_input # type: JobInput + + meta = None + if ji.narrative_cell_info: + meta = ji.narrative_cell_info.to_mongo().to_dict() + + source_ws_objects = list() + if ji.source_ws_objects: + source_ws_objects = list(ji.source_ws_objects) + + run_job_params = { + _WORKSPACE_ID: job.wsid, + _META: meta, + _APP_PARAMS: ji.params or {}, + "user": user_id, # REQUIRED, it runs as the current user + _METHOD: ji.method, # REQUIRED + _APP_ID: ji.app_id, + _SOURCE_WS_OBJECTS: source_ws_objects, # Must be list + _SERVICE_VER: ji.service_ver, + _PARENT_JOB_ID: ji.parent_job_id, + } + + # Then the next fields are job inputs top level requirements, app run parameters, and scheduler resource requirements + return run_job_params + def run( self, params=None, as_admin=False, concierge_params: Dict = None ) -> Optional[str]: diff --git a/lib/execution_engine2/sdk/EE2StatusRange.py b/lib/execution_engine2/sdk/EE2StatusRange.py index 724dda322..18703fdbd 100644 --- a/lib/execution_engine2/sdk/EE2StatusRange.py +++ b/lib/execution_engine2/sdk/EE2StatusRange.py @@ -204,9 +204,17 @@ def 
_job_state_from_jobs(jobs): str(job_id) float(created/queued/estimating/running/finished/updated/) (Time in MS) """ + retry_keys = ["retry_parent", "retried", "retry_count"] + job_states = [] for job in jobs: mongo_rec = job.to_mongo().to_dict() + + # Hack until job browser supports these keys + for key in retry_keys: + if key in mongo_rec: + del mongo_rec[key] + mongo_rec["_id"] = str(job.id) mongo_rec["job_id"] = str(job.id) mongo_rec["created"] = int(job.id.generation_time.timestamp() * 1000) diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 41d45e5fb..d410d7934 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -299,6 +299,15 @@ def get_admin_permission(self): return self.get_ee2_auth().retrieve_admin_permissions() # ENDPOINTS: Running jobs and getting job input params + + def retry_multiple(self, job_ids, as_admin=False): + """Authorization Required Read/Write""" + return self.get_runjob().retry_multiple(job_ids=job_ids, as_admin=as_admin) + + def retry(self, job_id, as_admin=False): + """Authorization Required Read/Write""" + return self.get_runjob().retry(job_id=job_id, as_admin=as_admin) + def run_job(self, params, as_admin=False): """Authorization Required Read/Write""" return self.get_runjob().run(params=params, as_admin=as_admin) diff --git a/lib/installed_clients/execution_engine2Client.py b/lib/installed_clients/execution_engine2Client.py index 3e1d64f1e..591328e75 100644 --- a/lib/installed_clients/execution_engine2Client.py +++ b/lib/installed_clients/execution_engine2Client.py @@ -100,41 +100,76 @@ def status(self, context=None): def run_job(self, params, context=None): """ - Start a new job (long running method of service registered in ServiceRegistery). - Such job runs Docker image for this service in script mode. + Start a new job. 
:param params: instance of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. 
cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) 
-> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. 
debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) :returns: instance of type "job_id" (A job id.) """ return self._client.call_method( @@ -143,41 +178,86 @@ def run_job(self, params, context=None): def run_job_batch(self, params, batch_params, context=None): """ + Run a batch job, consisting of a parent job and one or more child jobs. + Note that the as_admin parameters in the list of child jobs are ignored - + only the as_admin parameter in the batch_params is considered. :param params: instance of list of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. 
This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String - :param batch_params: instance of type "BatchParams" -> structure: - parameter "wsid" of Long + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. 
This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) + :param batch_params: instance of type "BatchParams" (Additional + parameters for a batch job. 
wsid: the workspace with which to + associate the parent job. as_admin: run the job with full EE2 + permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights.) -> structure: + parameter "wsid" of Long, parameter "as_admin" of type "boolean" + (@range [0,1]) :returns: instance of type "BatchSubmission" -> structure: parameter "parent_job_id" of type "job_id" (A job id.), parameter "child_job_ids" of list of type "job_id" (A job id.) @@ -189,6 +269,51 @@ def run_job_batch(self, params, batch_params, context=None): context, ) + def retry_job(self, params, context=None): + """ + #TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present + #TODO Add retry child that checks the status of the child? to prevent multiple retries + Allowed Jobs + Regular Job with no children + Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call + Not Allowed + Regular Job with children (Should not be possible to create yet) + Batch Job Parent Container (Not a job, it won't do anything, except cancel it's child jobs) + :param params: instance of type "RetryParams" (job_id of job to retry + as_admin: retry someone elses job in your namespace #TODO Possibly + Add JobRequirements job_requirements;) -> structure: parameter + "job_id" of type "job_id" (A job id.), parameter "as_admin" of + type "boolean" (@range [0,1]) + :returns: instance of type "RetryResult" (job_id of retried job + retry_id: job_id of the job that was launched str error: reason as + to why that particular retry failed (available for bulk retry + only)) -> structure: parameter "job_id" of type "job_id" (A job + id.), parameter "retry_id" of type "job_id" (A job id.), parameter + "error" of String + """ + return self._client.call_method( + "execution_engine2.retry_job", [params], self._service_ver, context + ) + + def 
retry_jobs(self, params, context=None): + """ + Same as retry_job, but accepts multiple jobs + :param params: instance of type "BulkRetryParams" (job_ids of job to + retry as_admin: retry someone else's job in your namespace #TODO: + Possibly Add list job_requirements;) -> + structure: parameter "job_ids" of list of type "job_id" (A job + id.), parameter "as_admin" of type "boolean" (@range [0,1]) + :returns: instance of list of type "RetryResult" (job_id of retried + job retry_id: job_id of the job that was launched str error: + reason as to why that particular retry failed (available for bulk + retry only)) -> structure: parameter "job_id" of type "job_id" (A + job id.), parameter "retry_id" of type "job_id" (A job id.), + parameter "error" of String + """ + return self._client.call_method( + "execution_engine2.retry_jobs", [params], self._service_ver, context + ) + def abandon_children(self, params, context=None): """ :param params: instance of type "AbandonChildren" -> structure: @@ -207,37 +332,73 @@ def run_job_concierge(self, params, concierge_params, context=None): """ :param params: instance of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. 
This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. 
This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) :param concierge_params: instance of type "ConciergeParams" (EE2Constants Concierge Params are request_cpus: int request_memory: int in MB request_disk: int in GB job_priority: @@ -278,37 +439,73 @@ def get_job_params(self, params, context=None): "as_admin" of type "boolean" (@range [0,1]) :returns: instance of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. 
For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. 
Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. 
+ request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) """ return self._client.call_method( "execution_engine2.get_job_params", [params], self._service_ver, context @@ -454,10 +651,10 @@ def check_job(self, params, context=None): of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method - format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id - of the Narrative application (UI) running this job (e.g. - repo/name) params - the parameters to pass to the method. 
Optional - parameters: service_ver - specific version of deployed service, + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve as a source of data when running the SDK method. These references @@ -470,29 +667,64 @@ def check_job(self, params, context=None): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) 
-> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. 
tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ return self._client.call_method( "execution_engine2.check_job", [params], self._service_ver, context @@ -546,10 +778,10 @@ def check_job_batch(self, params, context=None): of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method - format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id - of the Narrative application (UI) running this job (e.g. - repo/name) params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. 
+ repo/name) service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve as a source of data when running the SDK method. These references @@ -562,41 +794,76 @@ def check_job_batch(self, params, context=None): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) 
-> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "child_jobstates" - of list of type "JobState" (job_id - string - id of the job user - - string - user who started the job wsid - int - optional id of the - workspace where the job is bound authstrat - string - what - strategy used to authenticate the job job_input - object - inputs - to the job (from the run_job call) ## TODO - verify updated - int - - timestamp since epoch in milliseconds of the last time the - status was updated running - int - timestamp since epoch in - milliseconds of when it entered the running state created - int - - timestamp since epoch in milliseconds when the job was created - finished - int - timestamp since epoch in milliseconds when the - job was finished status - string - status of the job. one of the - following: created - job has been created in the service + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. 
as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. 
bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter + "child_jobstates" of list of type "JobState" (job_id - string - id + of the job user - string - user who started the job wsid - int - + optional id of the workspace where the job is bound authstrat - + string - what strategy used to authenticate the job job_input - + object - inputs to the job (from the run_job call) ## TODO - + verify updated - int - timestamp since epoch in milliseconds of + the last time the status was updated running - int - timestamp + since epoch in milliseconds of when it entered the running state + created - int - timestamp since epoch in milliseconds 
when the job + was created finished - int - timestamp since epoch in milliseconds + when the job was finished status - string - status of the job. one + of the following: created - job has been created in the service estimating - an estimation job is running to estimate resources required for the main job, and which queue should be used queued - job is queued to be run running - job is running on a worker node @@ -619,10 +886,10 @@ def check_job_batch(self, params, context=None): of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method - format, e.g. 'KBaseTrees.construct_species_tree' app_id - the id - of the Narrative application (UI) running this job (e.g. - repo/name) params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, last version is used if this parameter is not defined source_ws_objects - denotes the workspace objects that will serve as a source of data when running the SDK method. These references @@ -635,29 +902,64 @@ def check_job_batch(self, params, context=None): job. For run_job and run_job_concierge, this value can be specified to denote the parent job of the job being created. Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) 
-> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. 
+ The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. 
+ scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ return self._client.call_method( "execution_engine2.check_job_batch", [params], self._service_ver, context @@ -709,45 +1011,80 @@ def check_jobs(self, params, context=None): "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. 
Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) 
-> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. 
as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. 
bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ return self._client.call_method( "execution_engine2.check_jobs", [params], self._service_ver, context @@ -800,45 +1137,80 @@ def check_workspace_jobs(self, params, context=None): "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. 
Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) 
-> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. 
as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. 
bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ return self._client.call_method( "execution_engine2.check_workspace_jobs", @@ -998,47 +1370,82 @@ def check_jobs_date_range_for_user(self, params, context=None): "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. 
repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) 
-> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "count" of Long, - parameter "query_count" of Long, parameter "filter" of mapping - from String to String, parameter "skip" of Long, parameter + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. 
Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. 
client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "count" of + Long, parameter "query_count" of Long, parameter "filter" of + mapping from String to String, parameter "skip" of Long, parameter "projection" of list of String, parameter "limit" of Long, parameter "sort_order" of String """ @@ -1146,47 +1553,82 @@ def check_jobs_date_range_for_all(self, params, 
context=None): "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' app_id - the id of the + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. repo/name) - params - the parameters to pass to the method. Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined - source_ws_objects - denotes the workspace objects that will serve - as a source of data when running the SDK method. These references - will be added to the autogenerated provenance. Must be in UPA - format (e.g. 6/90/4). meta - Narrative metadata to associate with - the job. wsid - an optional workspace id to associate with the - job. This is passed to the workspace service, which will share the - job based on the permissions of the workspace rather than owner of - the job parent_job_id - EE2 job id for the parent of the current - job. For run_job and run_job_concierge, this value can be - specified to denote the parent job of the job being created. - Warning: No checking is done on the validity of the job ID, and - the parent job record is not altered. run_job_batch ignores this - parameter when starting a job batch.) -> structure: parameter - "method" of String, parameter "app_id" of String, parameter - "params" of list of unspecified object, parameter "service_ver" of - String, parameter "source_ws_objects" of list of type "wsref" (A - workspace object reference of the form X/Y or X/Y/Z, where X is - the workspace id, Y is the object id, Z is the version.), - parameter "meta" of type "Meta" (Narrative metadata for a job. All - fields are optional. run_id - the Narrative-assigned ID of the job - run. 1:1 with a job ID. 
token_id - the ID of the token used to run - the method. tag - the release tag, e.g. dev/beta/release. cell_id - - the ID of the narrative cell from which the job was run.) -> - structure: parameter "run_id" of String, parameter "token_id" of - String, parameter "tag" of String, parameter "cell_id" of String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "count" of Long, - parameter "query_count" of Long, parameter "filter" of mapping - from String to String, parameter "skip" of Long, parameter + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. 
+ job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. 
client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "count" of + Long, parameter "query_count" of Long, parameter "filter" of + mapping from String to String, parameter "skip" of Long, parameter "projection" of list of String, parameter "limit" of Long, parameter "sort_order" of String """ diff --git a/requirements.txt b/requirements.txt index 06e45c8e1..6dc8946da 
100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ cachetools==3.1.1 certifi==2019.6.16 cffi==1.14.0 chardet==3.0.4 -codecov==2.0.15 +codecov==2.1.11 configparser==3.7.4 confluent-kafka==1.5.0 coverage==4.5.3 diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 626160bf5..5f782958a 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -117,7 +117,7 @@ def _create_job( app=_APP, state=_CREATED_STATE, git_commit=_GIT_COMMIT, - parent_job_id="None", + parent_job_id=None, source_ws_objects=None, wsid=None, ): @@ -131,7 +131,8 @@ def _create_job( ji.wsid = wsid ji.service_ver = git_commit ji.source_ws_objects = source_ws_objects - ji.parent_job_id = parent_job_id + if parent_job_id: + ji.parent_job_id = parent_job_id jr = JobRequirements() jr.clientgroup = reqs.client_group jr.cpu = reqs.cpus diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index e71df36c8..ff58930d1 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -9,17 +9,17 @@ import requests_mock from mock import MagicMock -from lib.execution_engine2.db.MongoUtil import MongoUtil -from lib.execution_engine2.db.models.models import Job -from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.exceptions import CannotRetryJob, RetryFailureException +from execution_engine2.sdk.job_submission_parameters import JobRequirements from execution_engine2.utils.clients import ( get_client_set, get_user_client_set, ) -from execution_engine2.sdk.job_submission_parameters import JobRequirements from installed_clients.CatalogClient import Catalog - +from lib.execution_engine2.db.MongoUtil import MongoUtil +from 
lib.execution_engine2.db.models.models import Job, Status +from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from lib.execution_engine2.utils.CondorTuples import SubmissionInfo from test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -247,6 +247,247 @@ def test_run_job(self, rq_mock, condor_mock): job_id = runner.run_job(params=job) print(f"Job id is {job_id} ") + @staticmethod + def check_retry_job_state(job_id: str, retry_job_id: str): + job = Job.objects.get(id=job_id) # type: Job + retry_job = Job.objects.get(id=retry_job_id) # type: Job + + check_attributes = [ + "job_input", + "wsid", + "authstrat", + "batch_job", + "scheduler_type", + ] + + for item in check_attributes: + if job[item]: + assert job[item] == retry_job[item] + + assert retry_job.retry_parent == job_id + assert job.retry_count > 0 + + @requests_mock.Mocker() + @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) + def test_retry_job_multiple(self, rq_mock, condor_mock): + # 1. 
Run the job + rq_mock.add_matcher( + run_job_adapter( + ws_perms_info={"user_id": self.user_id, "ws_perms": {self.ws_id: "a"}} + ) + ) + runner = self.getRunner() + runner.get_condor = MagicMock(return_value=condor_mock) + runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) + job = get_example_job_as_dict( + user=self.user_id, wsid=self.ws_id, source_ws_objects=[] + ) + si = SubmissionInfo(clusterid="test", submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + + parent_job_id1 = runner.run_job(params=job) + parent_job_id2 = runner.run_job(params=job) + parent_job_id3 = runner.run_job(params=job) + parent_job_id4 = runner.run_job(params=job) + + runner.update_job_status(job_id=parent_job_id1, status=Status.terminated.value) + runner.update_job_status(job_id=parent_job_id2, status=Status.error.value) + runner.update_job_status(job_id=parent_job_id3, status=Status.terminated.value) + runner.update_job_status(job_id=parent_job_id4, status=Status.error.value) + + # 2. Retry the jobs with a fake input + errmsg = ( + "'123' is not a valid ObjectId, it must be a 12-byte input or a 24-character " + "hex string" + ) + with self.assertRaisesRegexp(RetryFailureException, errmsg): + runner.retry_multiple(job_ids=[parent_job_id1, 123]) + + # 3. Retry the jobs with duplicate job ids + retry_candidates = ( + parent_job_id1, + parent_job_id2, + parent_job_id1, + parent_job_id2, + ) + fail_msg = f"Retry of the same id in the same request is not supported. Offending ids:{[parent_job_id1,parent_job_id2]} " + + with self.assertRaises(ValueError) as e: + runner.retry_multiple(retry_candidates) + assert str(e.exception) == str(ValueError(fail_msg)) + + # 4. 
Retry the jobs + retry_candidates = ( + parent_job_id1, + parent_job_id2, + parent_job_id3, + parent_job_id4, + ) + retry_job_ids = runner.retry_multiple(retry_candidates) + + assert len(retry_job_ids) == len(retry_candidates) + + # Lets retry the jobs a few times + js = runner.check_jobs( + job_ids=[ + retry_job_ids[0]["retry_id"], + retry_job_ids[1]["retry_id"], + retry_job_ids[2]["retry_id"], + retry_job_ids[3]["retry_id"], + ] + )["job_states"] + + job1, job2, job3, job4 = js + + self.check_retry_job_state(parent_job_id1, job1["job_id"]) + self.check_retry_job_state(parent_job_id2, job2["job_id"]) + self.check_retry_job_state(parent_job_id3, job3["job_id"]) + self.check_retry_job_state(parent_job_id4, job4["job_id"]) + + # Test no job ids + with self.assertRaisesRegexp(ValueError, "No job_ids provided to retry"): + runner.retry_multiple(job_ids=None) + + # Test error during retry, but passing validate + runner._ee2_runjob._retry = MagicMock( + side_effect=Exception("Job Retry Misbehaved!") + ) + misbehaving_jobs = runner.retry_multiple(retry_candidates) + for i, candidate in enumerate(retry_candidates): + assert misbehaving_jobs[i] == { + "error": "Job Retry Misbehaved!", + "job_id": candidate, + } + + @requests_mock.Mocker() + @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) + def test_retry_job(self, rq_mock, condor_mock): + # 1. Run the job + rq_mock.add_matcher( + run_job_adapter( + ws_perms_info={"user_id": self.user_id, "ws_perms": {self.ws_id: "a"}} + ) + ) + runner = self.getRunner() + runner.get_condor = MagicMock(return_value=condor_mock) + runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) + job = get_example_job_as_dict( + user=self.user_id, wsid=self.ws_id, source_ws_objects=[] + ) + si = SubmissionInfo(clusterid="test", submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + parent_job_id = runner.run_job(params=job) + + # 2a. 
Retry the job and fail because it's in progress + expected_error = f"Error retrying job {parent_job_id} with status running: can only retry jobs with status 'error' or 'terminated'" + with self.assertRaisesRegex(CannotRetryJob, expected_regex=expected_error): + runner.update_job_status(job_id=parent_job_id, status=Status.running.value) + runner.retry(job_id=parent_job_id) + + # 2b. Retry the job + runner.update_job_status(job_id=parent_job_id, status=Status.terminated.value) + retry_job_id = runner.retry(job_id=parent_job_id)["retry_id"] + + # 3. Attempt to retry a retry, and check to see that the new job is retried off of the parent + runner.update_job_status(job_id=retry_job_id, status=Status.terminated.value) + retry_from_retry_id = runner.retry(job_id=retry_job_id)["retry_id"] + + retry_from_original_again = runner.retry(job_id=parent_job_id)["retry_id"] + original_job, retried_job, retried_job2, retried_job3 = runner.check_jobs( + job_ids=[ + parent_job_id, + retry_job_id, + retry_from_retry_id, + retry_from_original_again, + ] + )["job_states"] + + self.check_retry_job_state(parent_job_id, retry_job_id) + self.check_retry_job_state(parent_job_id, retry_from_retry_id) + self.check_retry_job_state(parent_job_id, retry_from_original_again) + + for job in [original_job, retried_job, retried_job2, retried_job3]: + if job == original_job: + assert original_job["retry_count"] == 3 + else: + assert job["retry_parent"] == parent_job_id + + # 4.
Get jobs and ensure they contain the same keys and params + same_keys = ["user", "authstrat", "wsid", "scheduler_type", "job_input"] + + assert "retry_parent" not in original_job + + for key in same_keys: + assert original_job[key] == retried_job[key] + + assert original_job["job_input"]["params"] == retried_job["job_input"]["params"] + + # Some failure cases + + # TODO Retry a job that uses run_job_batch or kbparallels (Like metabat) + # TODO Retry a job without an app_id + + @requests_mock.Mocker() + @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) + def test_retry_job_with_params_and_nci_and_src_ws_objs(self, rq_mock, condor_mock): + # 1. Run the job + rq_mock.add_matcher( + run_job_adapter( + ws_perms_info={"user_id": self.user_id, "ws_perms": {self.ws_id: "a"}} + ) + ) + runner = self.getRunner() + runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) + runner.workspace_auth.can_write = MagicMock(return_value=True) + runner.get_condor = MagicMock(return_value=condor_mock) + + quast_params = { + "workspace_name": "XX:narrative_1620418248793", + "assemblies": ["62160/9/18"], + "force_glimmer": 0, + } + source_ws_objects = quast_params["assemblies"] + nci = { + "run_id": "3a211c4e-5ba8-4b94-aeae-378079ccc63d", + "token_id": "f38f09f7-5ab1-4bfc-9f3f-2b82c7a8dbdc", + "tag": "release", + "cell_id": "3ee13d64-623b-407f-98a1-72e577662132", + } + + job = get_example_job_as_dict( + user=self.user_id, + wsid=self.ws_id, + narrative_cell_info=nci, + params=quast_params, + source_ws_objects=source_ws_objects, + method_name="kb_quast.run_QUAST_app", + app_id="kb_quast/run_QUAST_app", + ) + si = SubmissionInfo(clusterid="test", submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + parent_job_id = runner.run_job(params=job) + + # 2. Retry the job + runner.update_job_status(job_id=parent_job_id, status=Status.terminated.value) + retry_job_id = runner.retry(job_id=parent_job_id)["retry_id"] + + # 3. 
Get both jobs and compare them! + original_job, retried_job = runner.check_jobs( + job_ids=[parent_job_id, retry_job_id] + )["job_states"] + + same_keys = ["user", "authstrat", "wsid", "scheduler_type", "job_input"] + assert "retry_parent" not in original_job + assert original_job["retry_count"] == 1 + assert retried_job["retry_parent"] == parent_job_id + + for key in same_keys: + assert original_job[key] == retried_job[key] + + # TODO Possible test additions Retry a job that uses run_job_batch or kbparallels (Like metabat) + # TODO Retry a job without an app_id + # TODO Check narrative_cell_info + @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_run_job_batch(self, rq_mock, condor_mock): @@ -260,11 +501,12 @@ def test_run_job_batch(self, rq_mock, condor_mock): ) runner = self.getRunner() runner.get_condor = MagicMock(return_value=condor_mock) - job = get_example_job_as_dict(user=self.user_id, wsid=self.ws_id) - + runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) + job = get_example_job_as_dict( + user=self.user_id, wsid=self.ws_id, source_ws_objects=[] + ) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) @@ -281,6 +523,28 @@ def test_run_job_batch(self, rq_mock, condor_mock): jobs = [job, job_bad] runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) + # Squeeze in a retry test here + parent_job_id = job_ids["parent_job_id"] + child_job_id = job_ids["child_job_ids"][0] + runner.update_job_status(job_id=child_job_id, status=Status.terminated.value) + parent_job = runner.check_job(job_id=parent_job_id) + assert len(parent_job["child_jobs"]) == 3 + retry_id = runner.retry(job_id=child_job_id)["retry_id"] + parent_job = runner.check_job(job_id=parent_job_id) + assert len(parent_job["child_jobs"]) == 4 + assert 
parent_job["child_jobs"][-1] == retry_id + + job = Job.objects.get(id=child_job_id) + retry_count = job.retry_count + + # Test to see if one input fails, so fail them all + with self.assertRaises(expected_exception=RetryFailureException): + retry_id = runner.retry_multiple(job_ids=[child_job_id, "grail", "fail"]) + print(retry_id) + # Check to see other job wasn't retried + job.reload() + assert job.retry_count == retry_count + @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_run_job_fail(self, rq_mock, condor_mock): diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 95bf8c740..7303a3c0c 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -243,6 +243,39 @@ def update_states(index, job_ids_queued, job_ids_running, job_ids_finish): si = SubmissionInfo(clusterid="test", submit="job", error=None) + @patch.object(Condor, "run_job", return_value=si) + @patch.object(WorkspaceAuth, "can_write", return_value=True) + @patch( + "installed_clients.CatalogClient.Catalog.list_client_group_configs", + autospec=True, + ) + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + def test_retry_job_stress( + self, cc_get_mod_ver, cc_list_cli_configs, workspace, condor + ): + """ + Not a stress test, more of an impl test + """ + cc_get_mod_ver.return_value = {"git_commit_hash": "moduleversiongoeshere"} + cc_list_cli_configs.return_value = [] + + # set job method differently to distinguish + method_1 = "app1.a_method" + job_params_1 = get_sample_job_params(method=method_1, app_id="app1/a") + + # Remove fake parent_job_id + del job_params_1["parent_job_id"] + + job_ids = [] + for i in range(10): + job_ids.append(self.impl.run_job(ctx=self.ctx, params=job_params_1)[0]) + + for job_id in job_ids: + self.impl.update_job_status( + ctx=self.ctx, params={"job_id": job_id, "status": "error"} + ) + 
self.impl.retry_job(ctx=self.ctx, params={"job_id": job_id}) + @patch.object(Condor, "run_job", return_value=si) @patch.object(WorkspaceAuth, "can_write", return_value=True) @patch( diff --git a/test/tests_for_sdkmr/ee2_retry_test.py b/test/tests_for_sdkmr/ee2_retry_test.py new file mode 100644 index 000000000..a73df8dd0 --- /dev/null +++ b/test/tests_for_sdkmr/ee2_retry_test.py @@ -0,0 +1,135 @@ +""" +Unit tests for the Retry Code +""" +from execution_engine2.exceptions import CannotRetryJob, RetryFailureException +from execution_engine2.sdk.EE2Runjob import EE2RunJob +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner + +from test.utils_shared.test_utils import get_example_job, assert_exception_correct +from unittest.mock import create_autospec, MagicMock +from pytest import raises + + +def test_retry_db_failures(): + """ + * Test correct db update failure message, and that cancel_job is called + * Test that on exception, the db_update failure is called + """ + sdkmr = MagicMock() + retry_job = get_example_job(status="error") + parent_job = get_example_job(status="error") + retry_job.job_input.parent_job_id = "123" + sdkmr.get_job_with_permission = MagicMock(return_value=retry_job) + sdkmr.cancel_job = MagicMock() + rj = EE2RunJob(sdkmr=sdkmr) + + # Check correct exception and that safe cancel/cancel_job is called + job1 = "job1" + job_to_abort = "job_to_abort" + + # Check to make sure cancel_job is called on failure + with raises(Exception) as e: + rj._db_update_failure( + job_that_failed_operation="job1", + job_to_abort="job_to_abort", + exception=Exception(123), + ) + expected_exception = RetryFailureException( + f"Couldn't update job record:{job1} during retry. 
Aborting:{job_to_abort} Exception:123 " + ) + assert_exception_correct(e.value, expected_exception) + assert sdkmr.cancel_job.call_count == 1 + + # Check to make sure safe_cancel_call is called on failure + with raises(Exception) as e: + rj._safe_cancel = MagicMock() + rj._db_update_failure( + job_that_failed_operation="job1", + job_to_abort="job_to_abort", + exception=Exception(123), + ) + expected_exception = RetryFailureException( + f"Couldn't update job record:{job1} during retry. Aborting:{job_to_abort} Exception:123 " + ) + assert_exception_correct(e.value, expected_exception) + assert rj._safe_cancel.call_count == 1 + + rj.run = MagicMock(return_value=retry_job) + # One DB failure + rj._db_update_failure = MagicMock(side_effect=Exception("Boom!")) + with raises(Exception): + rj._retry(job_id=retry_job.id, job=retry_job, parent_job=parent_job) + assert rj._db_update_failure.call_count == 1 + + # Two db failures + rj._db_update_failure = MagicMock() + rj._retry(job_id=retry_job.id, job=retry_job, parent_job=parent_job) + + +def test_validate_retry(): + sdkmr = create_autospec(SDKMethodRunner, instance=True, spec_set=True) + + # Passing case with nothing to assert, all goes well + good_job = get_example_job(status="error") + sdkmr.get_job_with_permission = MagicMock(return_value=good_job) + rj = EE2RunJob(sdkmr=sdkmr) + rj._validate_retry_presubmit("unknown") + + # Fail case with the wrong status + with raises(Exception) as e: + sdkmr.get_job_with_permission = MagicMock( + return_value=get_example_job(status="running") + ) + rj = EE2RunJob(sdkmr=sdkmr) + rj._validate_retry_presubmit("unknown") + expected_exception = CannotRetryJob( + "Error retrying job unknown with status running: can only retry jobs with " + "status 'error' or 'terminated'", + ) + assert_exception_correct(e.value, expected_exception) + + # Fail case with the batch job + with raises(Exception) as e: + good_job.batch_job = True + sdkmr.get_job_with_permission = MagicMock(return_value=good_job) 
+ rj = EE2RunJob(sdkmr=sdkmr) + rj._validate_retry_presubmit("unknown") + + expected_exception = CannotRetryJob( + "Cannot retry batch job parents. Must retry individual jobs" + ) + assert_exception_correct(e.value, expected_exception) + + +def test_retry_get_run_job_params_from_existing_job(): + """ + Test to see that the retried job matches the job it got retried from the db + Not all fields are expected back + """ + example_job = get_example_job() + example_job_as_dict = example_job.to_mongo().to_dict() + extracted_job = EE2RunJob._get_run_job_params_from_existing_job( + example_job, user_id=example_job.user + "other" + ) + # Check Top Level Fields Match + discarded_keys = [ + "user", + "authstrat", + "status", + "job_input", + "child_jobs", + "batch_job", + ] + expected_unequal_keys = [ + "updated", + "queued", + "scheduler_id", + ] + for key in example_job_as_dict.keys(): + if key in discarded_keys: + continue + if key in expected_unequal_keys: + if key in extracted_job: + assert example_job_as_dict[key] != extracted_job[key] + else: + assert example_job_as_dict[key] == extracted_job[key] diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index 47d5d75e6..73a59605d 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -40,45 +40,94 @@ def get_example_job_as_dict( wsid: int = 123, authstrat: str = "kbaseworkspace", scheduler_id: str = None, + params: dict = None, + narrative_cell_info: dict = None, + source_ws_objects: list = None, + method_name: str = None, + app_id: str = None, ): job = ( get_example_job( - user=user, wsid=wsid, authstrat=authstrat, scheduler_id=scheduler_id + user=user, + wsid=wsid, + authstrat=authstrat, + scheduler_id=scheduler_id, + params=params, + narrative_cell_info=narrative_cell_info, + source_ws_objects=source_ws_objects, + method_name=method_name, + app_id=app_id, ) .to_mongo() .to_dict() ) + # Copy fields to match run_job signature + job_input = job["job_input"] + 
job["meta"] = job_input["narrative_cell_info"] + job["narrative_cell_info"] = job_input["narrative_cell_info"] + job["params"] = job_input["params"] + job["source_ws_objects"] = job_input["source_ws_objects"] job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] return job +def get_example_job_input(wsid, params=None, method_name=None, app_id=None): + if params == None: + params = {} + + job_input = JobInput() + job_input.wsid = wsid + + job_input.method = method_name or "module.method" + job_input.params = params + job_input.service_ver = "dev" + job_input.app_id = app_id or "module/super_function" + job_input.source_ws_objects = ["1/2/3", "2/3/4", "3/5/6"] + + m = Meta() + m.cell_id = "ApplePie" + job_input.narrative_cell_info = m + + return job_input + + def get_example_job( user: str = "boris", wsid: int = 123, authstrat: str = "kbaseworkspace", + params: dict = None, scheduler_id: str = None, + narrative_cell_info: dict = None, + source_ws_objects: list = None, + method_name: str = None, + app_id: str = None, + status: str = None, ) -> Job: j = Job() j.user = user j.wsid = wsid - job_input = JobInput() - job_input.wsid = j.wsid - - job_input.method = "module.method" - job_input.requested_release = "requested_release" - job_input.params = {} - job_input.service_ver = "dev" - job_input.app_id = "module/super_function" + job_input = get_example_job_input( + params=params, wsid=wsid, method_name=method_name, app_id=app_id + ) - m = Meta() - m.cell_id = "ApplePie" - job_input.narrative_cell_info = m j.job_input = job_input j.status = "queued" j.authstrat = authstrat + if status: + j.status = status + + if params: + job_input.params = params + + if source_ws_objects: + job_input.source_ws_objects = source_ws_objects + + if narrative_cell_info: + job_input.narrative_cell_info = narrative_cell_info + if scheduler_id is None: scheduler_id = str(uuid.uuid4()) @@ -91,7 +140,11 @@ def 
get_example_job_as_dict_for_runjob( user=None, wsid=None, authstrat=None, scheduler_id=None ): job = get_example_job( - user=user, wsid=wsid, authstrat=authstrat, scheduler_id=scheduler_id + user=user, + wsid=wsid, + authstrat=authstrat, + scheduler_id=scheduler_id, + narrative_cell_info={}, ) job_dict = job.to_mongo().to_dict() job_dict["method"] = job["job_input"]["method"] From 14f5a55bc97e97f03d6dd7faccdc4a83dfee268e Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 2 Jun 2021 15:14:55 -0500 Subject: [PATCH 085/109] DATAUP-433 Create 002-Retry_endpoint.md (#367) * Create 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * Update 002-Retry_endpoint.md * ADR * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 002-Retry_endpoint.md Co-authored-by: Boris Sadkhin --- docs/adrs/002-Retry_endpoint.md | 94 ++++++++++++++++++++ docs/adrs/003-Retry_endpoint_design.md | 118 +++++++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 docs/adrs/002-Retry_endpoint.md create mode 100644 docs/adrs/003-Retry_endpoint_design.md diff --git a/docs/adrs/002-Retry_endpoint.md b/docs/adrs/002-Retry_endpoint.md new file mode 100644 index 000000000..0e15235ce --- /dev/null +++ b/docs/adrs/002-Retry_endpoint.md @@ -0,0 +1,94 @@ +# Retry Endpoint + +Date: 2021-04-27 + + +## For discussion on this ADR, see the following PR https://github.com/kbase/execution_engine2/pull/367 + +The current requirement for the Batch/Bulk UI is to be able to retry jobs. Using a job id, it should be possible to get information from the database. + +The current implementation of retry is to run jobs using the `run_job` or `run_job_batch` endpoint. 
This is not adequate due to the following deficiencies: + + +* Lack of book-keeping for the parent job and the child job relationship: +* 1) Launching a new job will not update the relationship between those jobs. +* 2) e.g. the child job can specify a parent_job_id, but the parent_job will not know about the child_job +* 3) e.g. the parent will not know about new retried jobs +* 4) e.g. the child will not know how many times it was retried + +* Lack of book-keeping for number of retries for a particular job / set of job inputs +* Lack of ability to launch multiple jobs using the `run_job_batch` endpoint without creating a new parent job +* Lack of ability to ensure that the proper catalog version /git commit of an app is used from the front end based on a tag, such as "beta/dev/release" +* Lack of ability to specify which retries succeeded and which ones failed during submit time. +* Code is split more than is necessary + +### Note about submit time vs run time for job submission +The job first needs to be submitted to ee2_runjob. It can fail there. Afterwards, it gets submitted to condor, it can fail there too. Currently those two happen at the same time, but they are supposed to happen in a thread at some point, so the current single point of submit time will become two separate points of submission. Once the job begins running, it can fail at Run Time. 
+ +## Author(s) + +@bio-boris + +## Status and Decision Outcome + +* We have decided to go with the retry endpoint in favor of retrofitting the run_job endpoint +* We are implementing a minimal retry endpoint, then iterating over a design ADR to create a more fully featured/robust Retry endpoint when time permits +* Minimal endpoint PR found at https://github.com/kbase/execution_engine2/pull/383 +* Design ADR will be added to ee2 repo + +### The new ADR will contain: +* Copy of comments to be addressed +* Link to spec file with inputs and outputs for the retry endpoint +* Link to Jira Ticket with business logic documentation for success and cancel cases + + +## Alternatives Considered + +* Not book-keeping, or doing minimal book-keeping and calling run_job multiple times +* Re-writing run_job/run_job_batch to address the aforementioned deficiencies +* Creating a retry endpoint dedicated to addressing book-keeping and job launching features + + +### Possible additional things to think about +* Creating sets of objects and what to do at the end of a batch run +* What to do about a set if a child task fails during processing +* Convenience endpoints that operate on the parent_job_id or list of child job ids may be out of scope (e.g. cancel all jobs with a certain status) + +## Consequences +* Requires 2nd ADR + +## Pros and Cons of the Alternatives + +### Not book-keeping, or doing minimal book-keeping and calling `run_job` multiple times +* `+` Can re-use existing endpoints without any additional work +* `+` Less api endpoints to manage +* `-` Issues on re-rendering/regenerating a cell based on just the job record +* `-` Loss of information about job runs, and ability to infer relationships between parents and child jobs. +* `-` Loss of control of jobs, such as the ability to restrict a job's running based on number of retries/failures. 
+* `-` Wrong version of app will run if the app was updated after job completion, and a version tag rather than a git commit was provided +* `-` Increase complexity of `run_job*` methods +* `-` The client will have to keep track of the child_job relationship, so that info is lost once the client is terminated + +### Re-writing `run_job` to address the aforementioned deficiencies without refactoring +* `+` Solves most requirements, but +* `-` Adds more complexity to `run_job` methods +* `-` Increase difficulty in maintaining and testing `run_job` method +* `-` Wrong version of app will run if the app was updated after job completion, and a version tag rather than a git commit was provided +* `-` Inefficient job submission +* `-` Possibly Insufficient error handling + +### Re-writing `run_job/run_job_batch` to address the aforementioned deficiencies with some refactoring +* `+` Same as above, but if you are refactoring, you might as well have a retry endpoint, and clean out/decouple `run_job` endpoint from having so many features and branching logic + +### Creating `retry` endpoint to address the aforementioned deficiencies with some refactoring +* `+` Decrease coupling between `run_job` and retry functionality, possibly making testing and development easier +* `+` Faster development than a full refactor +* `-` Faster development than a full refactor, but creates technical debt, might have to update both `run_job` and `retry` each time a change is made +* `-` Extra endpoint to manage + + +### Creating `retry` endpoint to address the aforementioned deficiencies with full refactoring where run_job functions are split out into their own functions +* `+` Decrease coupling between `run_job` and retry functionality, possibly making testing and development easier +* `+` Increase DRYNESS of the code +* `+` Allows retry to benefit from changes to `run_job` +* `-` Slower development for a full refactor, but decreases technical debt diff --git 
a/docs/adrs/003-Retry_endpoint_design.md b/docs/adrs/003-Retry_endpoint_design.md new file mode 100644 index 000000000..a8438639c --- /dev/null +++ b/docs/adrs/003-Retry_endpoint_design.md @@ -0,0 +1,118 @@ +# Retry Endpoint + +Date: 2021-05-19 + + +## Motivation for the Endpoint: + +The current requirement for the Batch/Bulk UI is to be able to retry jobs that have either "errored" out, or were cancelled. +The UI allows you to retry either single jobs, or multiple jobs, and saves you from having to cancel and resubmit each job individually, +which is not really possible with the UI anyway. + +### Motivation for the `code spike` for retry endpoint and follow up design ADR +``` +As I mentioned, as the product owner, I find our ability to deliver functionality to be pretty awful. +We have invested so much effort in refactoring that its killed our timeline - we started in late July, and it is now almost May with no functioning bulk uploader, which was just the first deliverable. +If we are going to refactor, we need to be able to do it in a timely fashion, and have it not kill the schedule any more than it has. +I want to see the estimate for a quick and dirty solution that implements a proposed retry endpoint, that can be deployed ASAP, and then once the API contract has been established, and the functional MVP is done, we begin the cleanup of the backend code. +Note that this is NOT business as usual, the usual way we do this is the nasty MVP gets deployed and then we don't go back until much later. +Here, we get the API working so that it doesn't block dependencies, and we immediately start the refactoring. The refactor needs to be broken down into smallish chunks of ~3 days estimated work, and each merge should maintain functionality and incrementally improve the codebase. +Tasks that take more than a couple of days are more likely to be far off in their estimate and this is how we mitigate the risk of poor estimation. 
+``` + + +### High Level Behavior of the `retry` endpoint +The current implementation of retry is to run jobs using the `retry_job` or `retry_jobs` endpoint. +The endpoint takes a job or list of job ids and then attempts to resubmit them to the queue, using the exact same set of parameters and version of the app. + +### Current Behavior +* Spec file is located at https://github.com/kbase/execution_engine2/blob/8baab8e3ac5212f4bbe59fd935980aa41b4ee06d/execution_engine2.spec#L201-L247 + +* A job id is provided. If there are sufficient permissions, the call will proceed, if not, it will error out, unless the `as_admin` flag is provided by an admin +* The retry will only continue if the status of the job to be retried is in [Status.terminated.value, Status.error.value] +* If the job id points to a job that has already been retried, it will attempt to retry that job's `retry_parent` instead. +* If the job id has never been retried, it becomes the `retry_parent` +* EE2 looks up the job versions and parameters, and then submits the job to be retried, incrementing the `retry_count` + of the job being retried, and the newly launched job gains a pointer to the `_PARENT_RETRY_JOB_ID` +* The job is submitted and upon successful submission, notifies the `retry_parent` and notifies the `parent_job_id` that a new `child_job` has been added + + +### Batch Behavior +* If a job has the attribute of `batch_job=True` the retry will fail, since there is no method to re-run. This is a bug, as it doesn't fail gracefully. +* If a job has the attribute of `batch_job=True`, but is actually a child job, the parent will be notified of this new retried job +* Multiple in-flight retries are allowed. + +## Retry_job behavior +* Blocking and single submit to HTCondor. It should be fine + +## Retry_jobs behavior +* Submitting multiple jobs uses the `run_job` endpoint, and is blocking (NOT OK!) 
+ +### Desired Behavior +* Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) +* Add retry_count to retried jobs as well to aid in more book-keeping in a new field called `retry_number` +* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) +* One single submission to HTCondor instead of multiple job submissions +* Ability to gracefully handle jobs with children +* Ability to handle database consistency during retry failure + + +### Questions + +#To Be Answered + +#### Q: should the number of retries of a job be limited, and if so, where? e.g. a max_retries field in the parent job? wait and see whether people attempt to rerun jobs that have already failed nine zillion times? +A: Unknown TBD + +#### Q: Preventing the same params from being re-run +A: We have decided to allow it + +#### Q: Finding the most recent run of the job: I would very much like to avoid anything involving iterating over a chain of jobs before you can find the most recent run or the original run -- we can come up with better data structures than that! +A: Unknown TBD, maybe the frontend does it? + +#### Q: It might be best to always submit a git commit for the module, maybe? +A: (This could be a narrative ticket) + +#### Q: How do we handle DB consistency during retry failure? +Looks like the options are +* implement db integrity checks and two-phase commits for making the relationships between a job, its retry parent, and the batch container +* accept that the db info may be incomplete and write workarounds into the clients +* (upgrade to Mongo 4.4 for better transaction support) + + + +### Sort of answered +#### Q: how to prevent incorrect parent-child relationships being created -- should the client be allowed to specify a parent ID? 
Is it currently possible to add a new child to a parent job if the child is a new job, rather than an existing job ID / set of params that is being rerun? +A: Not necessarily relevant to this endpoint, more of a run_job_batch endpoint question. Currently the `retry_parent` and `parent_job_id` are looked up from the ee2 record on retry, and not specified in this endpoint. + +#### Answered: + + Should we track a retry count? (Done) + Should users see this retry count? (Unknown TBD) + Are retried jobs saved in some sort of data structure linking them, possibly indirectly, to the parent job or are they orphaned? (Yes, retry_parent) + If the former, is the retry relationship linear or a tree? E.g. what happens if there are two simultaneous calls to retry a job? (Tree, simultaneous jobs run) + Should it be at least theoretically possible to see the list of retried jobs in order? (It is possible by sorting on creation date) + Should there be a maximum retry count? Or a warning that more retries are not likely to help? (Unknown TBD) + Can a job in states other than failed or canceled be retried? Or should the user be required to cancel a job before it can be retried? 
(Job must be in Error/Cancel state) + + +# Work estimation +Priority descending +* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) +* One single submission to HTCondor instead of multiple job submission () +* Ability to gracefully handle jobs with children (may require refactoring models) +* Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) +* Add retry_count to retried jobs as well to aid in more book-keeping in a new field called `retry_number` + +# Time / Tickets to be created +* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) +> Requires refactor of run_job_batch to add jobs to an existing batch job, and force the same app `git_commit versions` and `JobRequirements` +> Estimate 3-4 days +* One single submission to HTCondor instead of multiple job submission () +> Dependent on run_job_batch to be threaded first : Estimate 1 day +* Ability to gracefully handle jobs with children +> (may require refactoring models. Especially when children spawn more jobs) : Estimate 3 day +* Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) +> Some sort of locking mechanism or something else : Estimate 3 day +* Add retry_count to retried jobs as well to aid in more book-keeping in a new field called `retry_number` +> Requires addition to run_job and new field in model : Estimate 1.25 day From be34bca42f0a937e3c2e5034e8065e9ea27c570f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 2 Jun 2021 15:15:23 -0500 Subject: [PATCH 086/109] Bump urllib3 from 1.25.8 to 1.26.5 in /test/dockerfiles/condor (#392) Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.25.8 to 1.26.5. 
- [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.25.8...1.26.5) --- updated-dependencies: - dependency-name: urllib3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- test/dockerfiles/condor/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dockerfiles/condor/requirements.txt b/test/dockerfiles/condor/requirements.txt index 42ae91a96..a7f499bb6 100644 --- a/test/dockerfiles/condor/requirements.txt +++ b/test/dockerfiles/condor/requirements.txt @@ -18,7 +18,7 @@ requests-async==0.5.0 rfc3986==1.3.2 sanic==19.6.0 ujson==1.35 -urllib3==1.25.8 +urllib3==1.26.5 uvloop==0.12.2 websockets==6.0 htcondor==8.9.2 From 725af07212c90e9a89ef2cb9a58054ad028069d3 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 3 Jun 2021 13:42:05 -0500 Subject: [PATCH 087/109] Revert "Update models.py (#389)" (#394) This reverts commit fd1a5b429c0f00ca67d2d2c7ccdeed6832bc6bce. 
--- lib/execution_engine2/db/models/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index 0356f037b..55e91034a 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -160,6 +160,7 @@ class JobInput(EmbeddedDocument): wsid = IntField(required=False, default=None) method = StringField(required=True) + requested_release = StringField() params = DynamicField() service_ver = StringField(required=True) app_id = StringField() From 10cf8bcc5628ed3af0d5cfdb074d4c3031b63ee9 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 14 Jun 2021 13:00:58 -0500 Subject: [PATCH 088/109] DATAUP-390 Cache catalog lookups (#391) * Adding retry endpoint * Adding retry endpoint * Adding retry endpoint * Adding retry endpoint * Make tests work * Make tests work * Update black * ran black * exception path * Recompiled * Hack for job browser * Hack for jobbrowser * Fix tests * Fix tests * Fix tests and endpoint * fix tests * fix tests * Fix bug with retry optional * fix tests * fix tests * Fix bug with service_version * Fix bug with service_version * Added naive retry_jobs endpoint (#387) * Added naive retry_jobs endpoint * Revert ee2server * Revert ee2server * Revert ee2server * Revert ee2server * PR reviews * PR reviews * Fixed tests Co-authored-by: bio-boris * Fixed automicity bug * Updated docs * Updated docs * Updated docs * Updated docs var interpolation * Update return for batch retry * Update return for batch retry * ideas for retry meeting * ideas for retry meeting * updates from the meeting * updates from the meeting * Ran black * undo the repr * new client * added better message * modify parent * Fixed api tests * Removed code * Removed code * Removed code * updated execptions * Black * Testing timings * testing * testing * testing * Testing timings * Remove unused var * Remove unused var * Validate first * Validate first * Fix typo * 
Fix bug * fix bug * Create catalog util * Caching * Cache catalog lookup * add cache catalog util * add cache catalog util0 * Missing test * Add cc * Added cc * Added cc * Updated uvloop * Updated actions * Optimize imports * Fix tests * Fix tests * Fix tests * Fix tests * Fix tests * Fix tests * Fix docs * Update tests * Update tests * Update tests * Update tests * Optimize imports * Add test cases * Add test cases * Goodbye constructor * Fix tests * optimize imports * optimize imports * Remove indrection * Code review cleanup * Fixed tests Co-authored-by: bio-boris --- .github/workflows/ee2-tests.yml | 10 +- lib/execution_engine2/sdk/EE2Runjob.py | 21 +- lib/execution_engine2/sdk/SDKMethodRunner.py | 21 +- lib/execution_engine2/utils/catalog_cache.py | 93 +++++++ lib/execution_engine2/utils/clients.py | 22 +- .../utils/job_requirements_resolver.py | 37 ++- requirements.txt | 2 +- test/tests_for_auth/ee2_admin_mode_test.py | 11 +- test/tests_for_sdkmr/EE2Runjob_test.py | 61 +++-- .../ee2_SDKMethodRunner_test.py | 20 +- test/tests_for_sdkmr/ee2_load_test.py | 9 +- test/tests_for_sdkmr/ee2_retry_test.py | 10 +- test/tests_for_utils/catalog_cache_test.py | 175 +++++++++++++ test/tests_for_utils/clients_test.py | 20 +- .../job_requirements_resolver_test.py | 245 +++++++++--------- test/utils_shared/mock_utils.py | 8 +- test/utils_shared/test_utils.py | 21 +- 17 files changed, 543 insertions(+), 243 deletions(-) create mode 100644 lib/execution_engine2/utils/catalog_cache.py create mode 100644 test/tests_for_utils/catalog_cache_test.py diff --git a/.github/workflows/ee2-tests.yml b/.github/workflows/ee2-tests.yml index 06c518a98..c4920ce01 100644 --- a/.github/workflows/ee2-tests.yml +++ b/.github/workflows/ee2-tests.yml @@ -22,18 +22,18 @@ jobs: uses: actions/setup-python@v2 with: python-version: 3.8 - - name: Install dependencies + - name: Lint with flake8 and black run: | python -m pip install --upgrade pip pip install flake8 black pytest + flake8 ./lib ./test + 
black --check ./lib ./test + - name: Install dependencies + run: | if [ -f requirements.txt ]; then pip install -r requirements-dev.txt; fi cd /opt git clone https://github.com/kbase/jars cd - - - name: Lint with flake8 and black - run: | - flake8 ./lib ./test - black --check ./lib ./test - name: Build Docker Image run: | docker build . -t execution_engine2:test diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 3b7f9d48d..17636dbda 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -122,8 +122,8 @@ def _init_job_rec( inputs.params = params.get("params") # Catalog git commit - params[_SERVICE_VER] = self._get_module_git_commit( - params.get(_METHOD), params.get(_SERVICE_VER) + params[_SERVICE_VER] = self.sdkmr.get_catalog_cache().lookup_git_commit_version( + method=params.get(_METHOD), service_ver=params.get(_SERVICE_VER) ) inputs.service_ver = params.get(_SERVICE_VER) inputs.app_id = params.get(_APP_ID) @@ -164,17 +164,6 @@ def _init_job_rec( ) return job_id - def _get_module_git_commit(self, method, service_ver=None) -> Optional[str]: - module_name = method.split(".")[0] - if not service_ver: - service_ver = "release" - self.logger.debug(f"Getting commit for {module_name} {service_ver}") - module_version = self.sdkmr.get_catalog().get_module_version( - {"module_name": module_name, "version": service_ver} - ) - git_commit_hash = module_version.get("git_commit_hash") - return git_commit_hash - def _check_ws_objects(self, source_objects) -> None: """ perform sanity checks on input WS objects @@ -415,7 +404,8 @@ def _add_job_requirements(self, jobs: List[Dict[str, Any]], is_write_admin: bool try: job[_JOB_REQUIREMENTS] = jrr.resolve_requirements( - job.get(_METHOD), + method=job.get(_METHOD), + catalog_cache=self.sdkmr.get_catalog_cache(), cpus=norm.get(REQUEST_CPUS), memory_MB=norm.get(REQUEST_MEMORY), disk_GB=norm.get(REQUEST_DISK), @@ -748,7 +738,8 @@ def 
_get_job_reqs_from_concierge_params( schd_reqs[key.strip()] = val.strip() return jrr.resolve_requirements( - method, + method=method, + catalog_cache=self.sdkmr.get_catalog_cache(), cpus=norm.get(REQUEST_CPUS), memory_MB=norm.get(REQUEST_MEMORY), disk_GB=norm.get(REQUEST_DISK), diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index d410d7934..e3c692e9c 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -25,16 +25,17 @@ EE2Status, EE2Logs, ) -from lib.execution_engine2.sdk.EE2Constants import KBASE_CONCIERGE_USERNAME -from lib.execution_engine2.utils.Condor import Condor +from execution_engine2.sdk.EE2Constants import KBASE_CONCIERGE_USERNAME +from execution_engine2.utils.Condor import Condor from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver from execution_engine2.utils.clients import UserClientSet, ClientSet -from lib.execution_engine2.utils.EE2Logger import get_logger as _get_logger -from lib.execution_engine2.utils.KafkaUtils import KafkaClient -from lib.execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.EE2Logger import get_logger as _get_logger +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient from installed_clients.CatalogClient import Catalog from installed_clients.WorkspaceClient import Workspace +from execution_engine2.utils.catalog_cache import CatalogCache class JobPermissions(Enum): @@ -46,6 +47,7 @@ class JobPermissions(Enum): class SDKMethodRunner: """ The execution engine 2 api calls functions from here. 
+ The SDKMR is instantiated per call """ """ @@ -68,7 +70,10 @@ def __init__( self.mongo_util = clients.mongo_util self.condor = clients.condor self.catalog = clients.catalog + # Cache Instantiated on a per request basis + self.catalog_cache = CatalogCache(catalog=clients.catalog_no_auth) self.job_requirements_resolver = clients.requirements_resolver + self.workspace = user_clients.workspace self.workspace_auth = user_clients.workspace_auth self.auth = clients.auth @@ -158,6 +163,12 @@ def get_catalog(self) -> Catalog: """ return self.catalog + def get_catalog_cache(self) -> CatalogCache: + """ + Get the catalog cache client for this instance of SDKMR. + """ + return self.catalog_cache + def get_job_requirements_resolver(self) -> JobRequirementsResolver: """ Get the job requirements resolver for this instance of SDKMR. diff --git a/lib/execution_engine2/utils/catalog_cache.py b/lib/execution_engine2/utils/catalog_cache.py new file mode 100644 index 000000000..df5d07dd9 --- /dev/null +++ b/lib/execution_engine2/utils/catalog_cache.py @@ -0,0 +1,93 @@ +from collections import defaultdict +from typing import Dict + +from lib.installed_clients.CatalogClient import Catalog + + +class CatalogCache: + """ + Per call catalog cache used to speed up catalog lookups + Caches the "Method Version" and the "Job Resource Requirements" + There's no cache invalidation, and to refresh a cache entry you have to make a new cache + Cache is not thread safe + """ + + def __init__(self, catalog: Catalog): + """ + :param catalog: Instance of catalog client. 
Does not require authentication + """ + if not catalog: + raise ValueError("Please provide instance of catalog client") + + self._catalog = catalog + self._method_version_cache = defaultdict(dict) + self._job_requirements_cache = defaultdict(dict) + + def get_catalog(self) -> Catalog: + """Get the catalog client for this instance.""" + return self._catalog + + def get_method_version_cache(self) -> Dict: + """Get the _method_version_cache for this instance.""" + return self._method_version_cache + + def get_job_resources_cache(self) -> Dict: + """Get the _condor_resources_cache for this instance.""" + return self._job_requirements_cache + + def lookup_git_commit_version(self, method, service_ver=None) -> str: + """ + If "service_ver" is "release|beta|dev", get git commit version for that version + if "service_ver" is a semantic version, get commit version for that semantic version + If "service_ver" is a git commit hash, see if that get commit is valid + Convenience wrapper for verifying a git commit hash, or getting git commit hash from a tag + :param method: Method to look up + :param service_ver: Version to look up + :return: A git commit hash for the requested job + """ + + # Structure of cache + # { 'run_megahit' : + # { + # 'dev' : 'cc91ddfe376f907aa56cfb3dd1b1b21cae8885z6', #Tag + # '2.5.0' : 'cc91ddfe376f907aa56cfb3dd1b1b21cae8885z6', #Semantic + # 'cc91ddfe376f907aa56cfb3dd1b1b21cae8885z6' : 'cc91ddfe376f907aa56cfb3dd1b1b21cae8885z6' #vcs + # } + # } + mv_cache = self.get_method_version_cache() + if not method: + raise ValueError("Must provide a method to lookup") + + if not service_ver: + service_ver = "release" + + # If not in the cache add it + if method not in mv_cache or service_ver not in mv_cache[method]: + module_name = method.split(".")[0] + module_version = self.get_catalog().get_module_version( + {"module_name": module_name, "version": service_ver} + ) + mv_cache[method][service_ver] = module_version.get("git_commit_hash") + # Retrieve from cache 
+ return mv_cache[method][service_ver] + + def lookup_job_resource_requirements(self, module_name, function_name) -> dict: + """ + Gets required job resources and clientgroups for a job submission + :param module_name: Module name to lookup + :param function_name: Function name to lookup + :return: A cached lookup of unformatted resource requests from the catalog + """ + # Structure of cache + # { 'module_name' : {'function_name' : [group_config] } + # } + cr_cache = self.get_job_resources_cache() + # If not in the cache add it + if module_name not in cr_cache or function_name not in cr_cache[module_name]: + cr_cache[module_name][ + function_name + ] = self.get_catalog().list_client_group_configs( + {"module_name": module_name, "function_name": function_name} + ) + # Retrieve from cache + return cr_cache[module_name][function_name] diff --git a/lib/execution_engine2/utils/clients.py b/lib/execution_engine2/utils/clients.py index e5a4c928e..b47381252 100644 --- a/lib/execution_engine2/utils/clients.py +++ b/lib/execution_engine2/utils/clients.py @@ -8,17 +8,16 @@ from execution_engine2.authorization.roles import AdminAuthUtil from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.MongoUtil import MongoUtil -from execution_engine2.utils.arg_processing import not_falsy as _not_falsy -from execution_engine2.utils.Condor import Condor from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE -from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.utils.Condor import Condor from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.arg_processing import not_falsy as _not_falsy from execution_engine2.utils.arg_processing import parse_bool - -from installed_clients.authclient import KBaseAuth +from execution_engine2.utils.job_requirements_resolver import 
JobRequirementsResolver from installed_clients.CatalogClient import Catalog from installed_clients.WorkspaceClient import Workspace +from installed_clients.authclient import KBaseAuth class UserClientSet: @@ -86,8 +85,8 @@ def get_user_client_set(cfg: Dict[str, str], user_id: str, token: str): class ClientSet: """ + There is only one instance of this class globally. The codebase effectively treats this as a singleton. Clients required by EE2 for communicating with other services. - These are not user-specific and can be reused throughout the application. """ @@ -97,6 +96,7 @@ def __init__( auth_admin: AdminAuthUtil, condor: Condor, catalog: Catalog, + catalog_no_auth: Catalog, requirements_resolver: JobRequirementsResolver, kafka_client: KafkaClient, mongo_util: MongoUtil, @@ -110,6 +110,7 @@ def __init__( self.auth_admin = _not_falsy(auth_admin, "auth_admin") self.condor = _not_falsy(condor, "condor") self.catalog = _not_falsy(catalog, "catalog") + self.catalog_no_auth = _not_falsy(catalog_no_auth, "catalog_no_auth") self.requirements_resolver = _not_falsy( requirements_resolver, "requirements_resolver" ) @@ -131,6 +132,7 @@ def get_clients( AdminAuthUtil, Condor, Catalog, + Catalog, JobRequirementsResolver, KafkaClient, MongoUtil, @@ -158,10 +160,9 @@ def get_clients( # TODO check keys are present - make some general methods for dealing with this # token is needed for running log_exec_stats in EE2Status catalog = Catalog(cfg["catalog-url"], token=cfg["catalog-token"]) - # make a separate, hidden catalog instance - jrr = JobRequirementsResolver( - Catalog(cfg["catalog-url"]), cfg_file, override_client_group - ) + # instance of catalog without creds is used here + catalog_no_auth = Catalog(cfg["catalog-url"]) + jrr = JobRequirementsResolver(cfg_file, override_client_group) auth_url = cfg["auth-url"] auth = KBaseAuth(auth_url=auth_url + "/api/legacy/KBase/Sessions/Login") # TODO using hardcoded roles for now to avoid possible bugs with mismatched cfg roles @@ -184,6 
+185,7 @@ def get_clients( auth_admin, condor, catalog, + catalog_no_auth, jrr, kafka_client, mongo_util, diff --git a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py index c92dc2648..c0f17cdc6 100644 --- a/lib/execution_engine2/utils/job_requirements_resolver.py +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -4,23 +4,21 @@ import json from configparser import ConfigParser -from typing import Iterable, Dict, Union, Set from enum import Enum +from typing import Iterable, Dict, Union, Set -from lib.installed_clients.CatalogClient import Catalog - -from execution_engine2.utils.arg_processing import ( - check_string as _check_string, - not_falsy as _not_falsy, -) - -from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.exceptions import IncorrectParamsException from execution_engine2.sdk.EE2Constants import ( EE2_CONFIG_SECTION, EE2_DEFAULT_SECTION, EE2_DEFAULT_CLIENT_GROUP, ) -from execution_engine2.exceptions import IncorrectParamsException +from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.utils.arg_processing import ( + check_string as _check_string, + not_falsy as _not_falsy, +) +from execution_engine2.utils.catalog_cache import CatalogCache CLIENT_GROUP = "client_group" REQUEST_CPUS = "request_cpus" @@ -157,19 +155,15 @@ class JobRequirementsResolver: def __init__( self, - catalog: Catalog, cfgfile: Iterable[str], override_client_group: str = None, ): """ Create the resolver. - - catalog - a catalog client pointing at the relevant KBase catalog service. cfgfile - the configuration file as an open file object or other iterable. override_client_group - if provided, this client group will be used for all jobs, ignoring all other sources of client group information. 
""" - self._catalog = _not_falsy(catalog, "catalog") self._override_client_group = _check_string( override_client_group, "override_client_group", optional=True ) @@ -360,6 +354,7 @@ def _has_value(cls, inc): def resolve_requirements( self, method: str, + catalog_cache: CatalogCache, cpus: int = None, memory_MB: int = None, disk_GB: int = None, @@ -377,6 +372,7 @@ def resolve_requirements( the catalog and ee2 settings for the job. method - the method to be run in module.method format. + catalog_cache - a per request instance of a CatalogCache in order to speed up catalog lookups cpus - the number of CPUs required for the job. memory_MB - the amount of memory, in MB, required for the job. disk_GB - the amount of disk space, in GB, required for the job. @@ -413,7 +409,7 @@ def resolve_requirements( # the catalog could contain arbitrary scheduler requirements so we can't skip the # call even if all the arguments are provided - cat_reqs_all = self._get_catalog_reqs(module_name, function_name) + cat_reqs_all = self._get_catalog_reqs(module_name, function_name, catalog_cache) cat_reqs = self.normalize_job_reqs( cat_reqs_all, f"catalog method {module_name}.{function_name}", @@ -465,10 +461,13 @@ def _get_client_group(self, user_cg, catalog_cg, module_name, function_name): raise IncorrectParamsException(f"No such clientgroup: {cg}") return cg - def _get_catalog_reqs(self, module_name, function_name): + @staticmethod + def _get_catalog_reqs( + module_name: str, function_name: str, catalog_cache: CatalogCache + ): # could cache results for 30s or so to speed things up... 
YAGNI - group_config = self._catalog.list_client_group_configs( - {"module_name": module_name, "function_name": function_name} + group_config = catalog_cache.lookup_job_resource_requirements( + module_name=module_name, function_name=function_name ) # If group_config is empty, that means there's no clientgroup entry in the catalog # It'll return an empty list even for non-existent modules @@ -477,7 +476,7 @@ def _get_catalog_reqs(self, module_name, function_name): if len(group_config) > 1: raise ValueError( "Unexpected result from the Catalog service: more than one client group " - + f"configuration found for method {module_name}.{function_name}" + + f"configuration found for method {module_name}.{function_name} {group_config}" ) resources_request = group_config[0].get(_CLIENT_GROUPS, None) diff --git a/requirements.txt b/requirements.txt index 6dc8946da..363084bc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -66,7 +66,7 @@ tqdm==4.42.1 typing-extensions==3.7.4.3 ujson==1.35 urllib3==1.25.8 -uvloop==0.12.2 +uvloop==0.15.2 websocket-client==0.57.0 websockets==6.0 yarl==1.5.1 diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index 315d2bbdb..bbc16c465 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -2,19 +2,16 @@ import os import unittest from configparser import ConfigParser - from unittest.mock import create_autospec import bson from mock import MagicMock from mock import patch -from installed_clients.CatalogClient import Catalog -from installed_clients.WorkspaceClient import Workspace from execution_engine2.authorization.roles import AdminAuthUtil from execution_engine2.authorization.workspaceauth import WorkspaceAuth -from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from execution_engine2.db.models.models import Status +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from 
execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.utils.Condor import Condor from execution_engine2.utils.CondorTuples import SubmissionInfo @@ -24,14 +21,14 @@ get_client_set, get_user_client_set, ) +from installed_clients.CatalogClient import Catalog +from installed_clients.WorkspaceClient import Workspace +from test.utils_shared.mock_utils import get_client_mocks as _get_client_mocks from test.utils_shared.test_utils import ( get_sample_job_params, get_sample_condor_info, ) -from test.utils_shared.mock_utils import get_client_mocks as _get_client_mocks - - # Cause any tests that contact external services (e.g. KBASE CI auth) as part of the test to # pass automatically. SKIP_TESTS_WITH_EXTERNALITIES = False diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 5f782958a..20fc3debd 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -4,36 +4,39 @@ # Incomplete by a long way. Will add more unit tests as they come up. 
-from pytest import raises -from typing import List, Dict, Any -from bson.objectid import ObjectId from logging import Logger +from typing import List, Dict, Any from unittest.mock import create_autospec, call + +from bson.objectid import ObjectId +from pytest import raises + from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.db.MongoUtil import MongoUtil from execution_engine2.db.models.models import Job, JobInput, JobRequirements, Meta from execution_engine2.exceptions import IncorrectParamsException, AuthError from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.sdk.job_submission_parameters import ( JobSubmissionParameters, JobRequirements as ResolvedRequirements, AppInfo, UserCreds, ) -from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.utils.Condor import Condor, SubmissionInfo from execution_engine2.utils.KafkaUtils import ( KafkaClient, KafkaQueueChange, KafkaCreateJob, ) +from execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.catalog_cache import CatalogCache from execution_engine2.utils.job_requirements_resolver import ( JobRequirementsResolver, RequirementsType, ) -from execution_engine2.utils.SlackUtils import SlackClient -from execution_engine2.db.MongoUtil import MongoUtil -from installed_clients.WorkspaceClient import Workspace from installed_clients.CatalogClient import Catalog +from installed_clients.WorkspaceClient import Workspace from utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS from utils_shared.test_utils import assert_exception_correct @@ -94,7 +97,10 @@ def _set_up_mocks(user: str, token: str) -> Dict[Any, Any]: mocks[Logger] = create_autospec(Logger, spec_set=True, instance=True) mocks[Workspace] = create_autospec(Workspace, spec_set=True, instance=True) mocks[WorkspaceAuth] = 
create_autospec(WorkspaceAuth, spec_set=True, instance=True) + mocks[CatalogCache] = create_autospec(CatalogCache, spec_set=True, instance=True) + # Set up basic getter calls + sdkmr.get_catalog_cache.return_value = mocks[CatalogCache] sdkmr.get_catalog.return_value = mocks[Catalog] sdkmr.get_condor.return_value = mocks[Condor] sdkmr.get_kafka_client.return_value = mocks[KafkaClient] @@ -162,7 +168,7 @@ def _set_up_common_return_values(mocks): mocks[Workspace].get_object_info3.return_value = { "paths": [[_WS_REF_1], [_WS_REF_2]] } - mocks[Catalog].get_module_version.return_value = {"git_commit_hash": _GIT_COMMIT} + mocks[CatalogCache].lookup_git_commit_version.return_value = _GIT_COMMIT mocks[SDKMethodRunner].save_job.return_value = _JOB_ID mocks[Condor].run_job.return_value = SubmissionInfo(_CLUSTER, {}, None) retjob = Job() @@ -181,8 +187,8 @@ def _check_common_mock_calls(mocks, reqs, wsid, app=_APP): mocks[Workspace].get_object_info3.assert_called_once_with( {"objects": [{"ref": _WS_REF_1}, {"ref": _WS_REF_2}], "ignoreErrors": 1} ) - mocks[Catalog].get_module_version.assert_called_once_with( - {"module_name": "lolcats", "version": "release"} + mocks[CatalogCache].lookup_git_commit_version.assert_called_once_with( + method="lolcats.lol_unto_death", service_ver=None ) # initial job data save @@ -304,7 +310,9 @@ def test_run_job(): # check mocks called as expected. The order here is the order that they're called in the code. 
jrr.normalize_job_reqs.assert_called_once_with({}, "input job") jrr.get_requirements_type.assert_called_once_with(**_EMPTY_JOB_REQUIREMENTS) - jrr.resolve_requirements.assert_called_once_with(_METHOD, **_EMPTY_JOB_REQUIREMENTS) + jrr.resolve_requirements.assert_called_once_with( + _METHOD, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS + ) _check_common_mock_calls(mocks, reqs, None, _APP) @@ -381,7 +389,9 @@ def test_run_job_as_admin_with_job_requirements(): sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) jrr.normalize_job_reqs.assert_called_once_with(inc_reqs, "input job") jrr.get_requirements_type.assert_called_once_with(**req_args) - jrr.resolve_requirements.assert_called_once_with(_METHOD, **req_args) + jrr.resolve_requirements.assert_called_once_with( + _METHOD, mocks[CatalogCache], **req_args + ) _check_common_mock_calls(mocks, reqs, None, None) @@ -456,6 +466,7 @@ def test_run_job_as_concierge_with_wsid(): jrr.resolve_requirements.assert_called_once_with( _METHOD, + mocks[CatalogCache], cpus=cpus, memory_MB=mem, disk_GB=disk, @@ -544,6 +555,7 @@ def _run_as_concierge_empty_as_admin(concierge_params, app): jrr.resolve_requirements.assert_called_once_with( _METHOD, + mocks[CatalogCache], cpus=None, memory_MB=None, disk_GB=None, @@ -730,11 +742,13 @@ def _set_up_common_return_values_batch(mocks): returned_parent_job = Job() returned_parent_job.id = ObjectId(_JOB_ID) returned_parent_job.user = _USER - mocks[SDKMethodRunner].save_and_return_job.return_value = returned_parent_job - mocks[Catalog].get_module_version.side_effect = [ - {"git_commit_hash": _GIT_COMMIT_1}, - {"git_commit_hash": _GIT_COMMIT_2}, + mocks[CatalogCache].lookup_git_commit_version.side_effect = [ + _GIT_COMMIT_1, + _GIT_COMMIT_2, ] + + mocks[SDKMethodRunner].save_and_return_job.return_value = returned_parent_job + # create job1, update job1, create job2, update job2, update parent job mocks[SDKMethodRunner].save_job.side_effect = [ _JOB_ID_1, @@ -782,10 +796,10 @@ def 
_check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): got_parent_job = sdkmr.save_and_return_job.call_args_list[0][0][0] assert_jobs_equal(got_parent_job, expected_parent_job) - mocks[Catalog].get_module_version.assert_has_calls( + mocks[CatalogCache].lookup_git_commit_version.assert_has_calls( [ - call({"module_name": "module1", "version": "release"}), - call({"module_name": "module2", "version": "release"}), + call(method="module1.method1", service_ver=None), + call(method="module2.method2", service_ver=None), ] ) @@ -960,8 +974,8 @@ def test_run_job_batch_with_parent_job_wsid(): ) jrr.resolve_requirements.assert_has_calls( [ - call(_METHOD_1, **_EMPTY_JOB_REQUIREMENTS), - call(_METHOD_2, **_EMPTY_JOB_REQUIREMENTS), + call(_METHOD_1, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + call(_METHOD_2, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), ] ) _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid) @@ -1065,7 +1079,10 @@ def test_run_job_batch_as_admin_with_job_requirements(): [call(**_EMPTY_JOB_REQUIREMENTS), call(**req_args)] ) jrr.resolve_requirements.assert_has_calls( - [call(_METHOD_1, **_EMPTY_JOB_REQUIREMENTS), call(_METHOD_2, **req_args)] + [ + call(_METHOD_1, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + call(_METHOD_2, mocks[CatalogCache], **req_args), + ] ) _check_common_mock_calls_batch(mocks, reqs1, reqs2, None, wsid) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index 111badf4e..a90044703 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -9,32 +9,33 @@ from datetime import datetime, timedelta, timezone from pprint import pprint from unittest.mock import patch, create_autospec -from pytest import raises import bson import dateutil import requests_mock from bson import ObjectId from mock import MagicMock +from pytest import raises from 
execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.exceptions import AuthError +from execution_engine2.sdk.job_submission_parameters import JobRequirements from execution_engine2.utils.Condor import Condor from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.clients import UserClientSet, ClientSet +from execution_engine2.utils.clients import get_user_client_set, get_client_set from execution_engine2.utils.job_requirements_resolver import ( JobRequirementsResolver, RequirementsType, ) from lib.execution_engine2.db.models.models import Job, Status, TerminatedCode -from execution_engine2.exceptions import AuthError from lib.execution_engine2.exceptions import InvalidStatusTransitionException from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from execution_engine2.sdk.job_submission_parameters import JobRequirements from lib.execution_engine2.utils.CondorTuples import SubmissionInfo -from execution_engine2.utils.clients import UserClientSet, ClientSet -from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper +from test.utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS from test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -42,7 +43,6 @@ run_job_adapter, assert_exception_correct, ) -from test.utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS from tests_for_db.mongo_test_helper import MongoTestHelper logging.basicConfig(level=logging.INFO) @@ -98,6 +98,7 @@ def getRunner(self) -> SDKMethodRunner: runner.get_jobs_status() runner.get_runjob() runner.get_job_logs() + runner.get_catalog_cache() return runner def create_job_rec(self): @@ -158,6 +159,7 @@ def test_getters(self): sdkmr = SDKMethodRunner(user_clients, 
clients_and_mocks[ClientSet]) + assert sdkmr.get_catalog_cache() is sdkmr.catalog_cache assert sdkmr.get_workspace() is ws assert sdkmr.get_workspace_auth() is wsa assert sdkmr.get_user_id() == "user" @@ -277,8 +279,9 @@ def test_cancel_job2(self, rq_mock, condor_mock): # runner.get_runjob = MagicMock(return_value="git_commit_goes_here") runner.get_condor = MagicMock(return_value=condor_mock) + fixed_rj = EE2RunJob(runner) - fixed_rj._get_module_git_commit = MagicMock(return_value="hash_goes_here") + # _get_module_git_commitfixed_rj._get_module_git_commit = MagicMock(return_value="hash_goes_here") runner.get_runjob = MagicMock(return_value=fixed_rj) @@ -930,9 +933,6 @@ def test_check_jobs_date_range(self, condor_mock): runner.workspace_auth.can_read = MagicMock(return_value=True) self.mock = MagicMock(return_value=True) - runner._ee2_runjob._get_module_git_commit = MagicMock( - return_value="hash_goes_here" - ) # fixed_rj = RunJob(runner) # fixed_rj._get_module_git_commit = MagicMock(return_value='hash_goes_here') diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 7303a3c0c..c1801a909 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -73,10 +73,12 @@ def _getRunner(cls) -> SDKMethodRunner: return runner - def test_init_job_stress(self): + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + def test_init_job_stress(self, cc_get_mod_ver): """ testing initializing 3 different jobs in multiple theads. 
""" + cc_get_mod_ver.return_value = {"git_commit_hash": "123"} thread_count = self.thread_count # threads to test @@ -446,11 +448,12 @@ def update_states(index, job_ids_queued, job_ids_running, job_ids_completed): jobs.delete() self.assertEqual(ori_job_count, Job.objects.count()) - def test_check_jobs_stress(self): + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + def test_check_jobs_stress(self, cc_get_mod_ver): """ testing check jobs in multiple theads. """ - + cc_get_mod_ver.return_value = {"git_commit_hash": "123"} thread_count = self.thread_count # threads to test ori_job_count = Job.objects.count() diff --git a/test/tests_for_sdkmr/ee2_retry_test.py b/test/tests_for_sdkmr/ee2_retry_test.py index a73df8dd0..184695525 100644 --- a/test/tests_for_sdkmr/ee2_retry_test.py +++ b/test/tests_for_sdkmr/ee2_retry_test.py @@ -1,13 +1,15 @@ """ Unit tests for the Retry Code """ +from unittest.mock import create_autospec, MagicMock + +from pytest import raises + from execution_engine2.exceptions import CannotRetryJob, RetryFailureException from execution_engine2.sdk.EE2Runjob import EE2RunJob from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner - -from test.utils_shared.test_utils import get_example_job, assert_exception_correct -from unittest.mock import create_autospec, MagicMock -from pytest import raises +from test.utils_shared.test_utils import assert_exception_correct +from test.utils_shared.test_utils import get_example_job def test_retry_db_failures(): diff --git a/test/tests_for_utils/catalog_cache_test.py b/test/tests_for_utils/catalog_cache_test.py new file mode 100644 index 000000000..0c7713e78 --- /dev/null +++ b/test/tests_for_utils/catalog_cache_test.py @@ -0,0 +1,175 @@ +# This test only tests code that can be exercised without a network connection to services. +# That code is tested in integration tests. 
+from unittest.mock import create_autospec + +import pytest + +from execution_engine2.utils.catalog_cache import CatalogCache +from installed_clients.CatalogClient import Catalog +from utils_shared.test_utils import ( + assert_exception_correct, + CLIENT_GROUP_CONFIG, +) + + +@pytest.fixture +def catalog(): + return create_autospec(Catalog, spec_set=True, instance=True) + + +@pytest.fixture +def catalog_cache(): + return create_autospec(CatalogCache, spec_set=True, instance=True) + + +def test_fail_cc(): + with pytest.raises(ValueError) as e: + CatalogCache(None) + assert_exception_correct( + e.value, ValueError("Please provide instance of catalog client") + ) + + # Test that a new catalog call is made once more + with pytest.raises(ValueError) as e: + catalog_cache = CatalogCache(catalog=catalog) + catalog_cache.lookup_git_commit_version(method=None, service_ver="dev") + assert_exception_correct(e.value, ValueError("Must provide a method to lookup")) + + +def assert_call_count_and_return_val( + mock, call_count, return_value, expected_return_value +): + assert mock.call_count == call_count + assert return_value == expected_return_value + + +def test_get_catalog(catalog): + assert catalog == CatalogCache(catalog).get_catalog() + + +def test_cc_job_reqs(catalog): + """Test to see the job requirements cache is being used.""" + test_return = {"Test1"} + catalog.list_client_group_configs.return_value = test_return + catalog_cache = CatalogCache(catalog=catalog) + job_reqs_cache = catalog_cache.get_job_resources_cache() + + # Test Cache is called on second call + rv1 = catalog_cache.lookup_job_resource_requirements( + module_name="test1", function_name="test1" + ) + + assert catalog.list_client_group_configs.call_count == 1 + # Test to make sure it still returns values based on the catalog + assert rv1 == test_return + assert "test1" in job_reqs_cache and "test1" in job_reqs_cache["test1"] + catalog.list_client_group_configs.assert_called_with( + {"module_name": 
"test1", "function_name": "test1"} + ) + + catalog.list_client_group_configs.return_value = CLIENT_GROUP_CONFIG + catalog_cache._job_requirements_cache["test1"]["test1"] = "Something else" + rv2 = catalog_cache.lookup_job_resource_requirements( + module_name="test1", function_name="test1" + ) + # Test to make sure the catalog cache is being used this time, even though the underlying catalog record changed + assert rv2 != CLIENT_GROUP_CONFIG + assert rv2 == "Something else" + catalog.list_client_group_configs.assert_called_with( + {"module_name": "test1", "function_name": "test1"} + ) + + # Test to see a new catalog call is made + assert catalog.list_client_group_configs.call_count == 1 + catalog_cache.lookup_job_resource_requirements( + module_name="test1", function_name="test2" + ) + assert catalog.list_client_group_configs.call_count == 2 + assert "test1" in job_reqs_cache and "test2" in job_reqs_cache["test1"] + catalog.list_client_group_configs.assert_called_with( + {"module_name": "test1", "function_name": "test2"} + ) + + +def test_cc_git_commit_version(catalog): + """Test to see the git commit cache is being used.""" + catalog_cache = CatalogCache(catalog=catalog) + catalog_git_return_1 = {"git_commit_hash": "1234"} + catalog_git_return_2 = {"git_commit_hash": "12345"} + catalog.get_module_version.return_value = catalog_git_return_1 + method_version_cache = catalog_cache.get_method_version_cache() + + # Test Cache is called on second call + version = catalog_cache.lookup_git_commit_version( + method="method1", service_ver="any" + ) + + # Test to make sure return_value is correct + assert version == catalog_git_return_1["git_commit_hash"] + catalog.get_module_version.assert_called_with( + {"module_name": "method1", "version": "any"} + ) + + # Test to make sure same commit is returned regardless of underlying catalog data + catalog.get_module_version.return_value = catalog_git_return_2 + version2 = catalog_cache.lookup_git_commit_version( + method="method1", 
service_ver="any" + ) + assert version2 == catalog_git_return_1["git_commit_hash"] + catalog.get_module_version.assert_called_with( + {"module_name": "method1", "version": "any"} + ) + + catalog_cache.lookup_git_commit_version(method="method1", service_ver="any") + assert catalog.get_module_version.call_count == 1 + catalog.get_module_version.assert_called_with( + {"module_name": "method1", "version": "any"} + ) + catalog_cache.lookup_git_commit_version( + method="method1", + ) + assert catalog.get_module_version.call_count == 2 + catalog.get_module_version.assert_called_with( + {"module_name": "method1", "version": "release"} + ) + + assert method_version_cache["method1"] == {"any": "1234", "release": "12345"} + + # Test None defaults to release case + catalog_cache.lookup_git_commit_version(method="method3", service_ver=None) + catalog.get_module_version.assert_called_with( + {"module_name": "method3", "version": "release"} + ) + + assert None not in catalog_cache.get_method_version_cache()["method3"] + assert catalog_cache.get_method_version_cache()["method3"]["release"] + catalog.get_module_version.assert_called_with( + {"module_name": "method3", "version": "release"} + ) + + # Test module_name = method.split(".")[0] and call count + call_count = catalog.get_module_version.call_count + catalog_cache.lookup_git_commit_version( + method="MEGAHIT.run_megahit", service_ver="dev" + ) + catalog.get_module_version.assert_called_with( + {"module_name": "MEGAHIT", "version": "dev"} + ) + assert catalog.get_module_version.call_count == call_count + 1 + + # Test that the catalog is not called, from cache now + catalog_cache.lookup_git_commit_version( + method="MEGAHIT.run_megahit", service_ver="dev" + ) + assert catalog.get_module_version.call_count == call_count + 1 + + # Test that a new catalog call is made once more + catalog_cache.lookup_git_commit_version( + method="MEGAHIT.run_megahit2", service_ver="dev" + ) + catalog.get_module_version.assert_called_with( + 
{"module_name": "MEGAHIT", "version": "dev"} + ) + + assert method_version_cache["MEGAHIT.run_megahit"] == {"dev": "12345"} + assert method_version_cache["MEGAHIT.run_megahit2"] == {"dev": "12345"} diff --git a/test/tests_for_utils/clients_test.py b/test/tests_for_utils/clients_test.py index 92bd39e01..acde30020 100644 --- a/test/tests_for_utils/clients_test.py +++ b/test/tests_for_utils/clients_test.py @@ -98,21 +98,23 @@ def test_client_set_init_fail(): n = None e = ValueError("auth cannot be a value that evaluates to false") - _client_set_init_fail(n, aa, c, ca, j, k, m, s, e) + _client_set_init_fail(n, aa, c, ca, ca, j, k, m, s, e) e = ValueError("auth_admin cannot be a value that evaluates to false") - _client_set_init_fail(a, n, c, ca, j, k, m, s, e) + _client_set_init_fail(a, n, c, ca, ca, j, k, m, s, e) e = ValueError("condor cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, n, ca, j, k, m, s, e) + _client_set_init_fail(a, aa, n, ca, ca, j, k, m, s, e) e = ValueError("catalog cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, n, j, k, m, s, e) + _client_set_init_fail(a, aa, c, n, ca, j, k, m, s, e) + e = ValueError("catalog_no_auth cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, n, j, k, m, s, e) e = ValueError("requirements_resolver cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, n, k, m, s, e) + _client_set_init_fail(a, aa, c, ca, ca, n, k, m, s, e) e = ValueError("kafka_client cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, j, n, m, s, e) + _client_set_init_fail(a, aa, c, ca, ca, j, n, m, s, e) e = ValueError("mongo_util cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, j, k, n, s, e) + _client_set_init_fail(a, aa, c, ca, ca, j, k, n, s, e) e = ValueError("slack_client cannot be a value that evaluates to false") - _client_set_init_fail(a, aa, c, ca, j, k, m, n, e) + 
_client_set_init_fail(a, aa, c, ca, ca, j, k, m, n, e) def _client_set_init_fail( @@ -120,6 +122,7 @@ def _client_set_init_fail( auth_admin: AdminAuthUtil, condor: Condor, catalog: Catalog, + catalog_no_auth: Catalog, requirements_resolver: JobRequirementsResolver, kafka_client: KafkaClient, mongo_util: MongoUtil, @@ -132,6 +135,7 @@ def _client_set_init_fail( auth_admin, condor, catalog, + catalog_no_auth, requirements_resolver, kafka_client, mongo_util, diff --git a/test/tests_for_utils/job_requirements_resolver_test.py b/test/tests_for_utils/job_requirements_resolver_test.py index 9deb59c56..4cd01c26f 100644 --- a/test/tests_for_utils/job_requirements_resolver_test.py +++ b/test/tests_for_utils/job_requirements_resolver_test.py @@ -12,7 +12,7 @@ RequirementsType, ) from execution_engine2.exceptions import IncorrectParamsException -from installed_clients.CatalogClient import Catalog +from execution_engine2.utils.catalog_cache import CatalogCache from utils_shared.test_utils import assert_exception_correct @@ -449,11 +449,10 @@ def _get_simple_deploy_spec_file_obj(): def test_init(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) spec = _get_simple_deploy_spec_file_obj() - jrr = JobRequirementsResolver(catalog, spec) + jrr = JobRequirementsResolver(spec) assert jrr.get_default_client_group() == "cg2" assert jrr.get_override_client_group() is None assert jrr.get_configured_client_groups() == set(["cg1", "cg2"]) @@ -475,33 +474,23 @@ def test_init(): def test_init_with_override(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) spec = _get_simple_deploy_spec_file_obj() - jrr = JobRequirementsResolver(catalog, spec, " \t ") + jrr = JobRequirementsResolver(spec, " \t ") assert jrr.get_override_client_group() is None spec = _get_simple_deploy_spec_file_obj() - jrr = JobRequirementsResolver(catalog, spec, "cg1") + jrr = JobRequirementsResolver(spec, "cg1") assert jrr.get_override_client_group() == "cg1" def 
test_init_fail_missing_input(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) _init_fail( - None, - _get_simple_deploy_spec_file_obj(), - None, - ValueError("catalog cannot be a value that evaluates to false"), - ) - _init_fail( - catalog, None, None, ValueError("cfgfile cannot be a value that evaluates to false"), ) _init_fail( - catalog, [], None, ValueError("cfgfile cannot be a value that evaluates to false"), @@ -509,11 +498,9 @@ def test_init_fail_missing_input(): def test_init_fail_no_override_in_config(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) spec = _get_simple_deploy_spec_file_obj() _init_fail( - catalog, spec, "cg3", ValueError("No deployment configuration entry for override client group 'cg3'"), @@ -521,7 +508,6 @@ def test_init_fail_no_override_in_config(): def test_init_fail_default_config_error(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) shared_spec = """ [njs] @@ -531,7 +517,6 @@ def test_init_fail_default_config_error(): """ _init_fail( - catalog, StringIO(shared_spec), None, IncorrectParamsException( @@ -548,7 +533,6 @@ def test_init_fail_default_config_error(): """ ) _init_fail( - catalog, spec, None, IncorrectParamsException( @@ -565,7 +549,6 @@ def test_init_fail_default_config_error(): """ ) _init_fail( - catalog, spec, None, ValueError("No deployment configuration entry for default client group 'njrs'"), @@ -573,7 +556,6 @@ def test_init_fail_default_config_error(): def test_init_fail_bad_config(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) shared_spec = """ [DEFAULT] @@ -590,7 +572,6 @@ def test_init_fail_bad_config(): ) _init_fail( - catalog, StringIO(spec), None, IncorrectParamsException( @@ -609,7 +590,6 @@ def test_init_fail_bad_config(): ) _init_fail( - catalog, StringIO(spec), None, IncorrectParamsException( @@ -628,7 +608,6 @@ def test_init_fail_bad_config(): ) _init_fail( - catalog, StringIO(spec), None, IncorrectParamsException( @@ 
-638,16 +617,15 @@ def test_init_fail_bad_config(): ) -def _init_fail(catalog, spec, override, expected): +def _init_fail(spec, override, expected): with raises(Exception) as got: - JobRequirementsResolver(catalog, spec, override) + JobRequirementsResolver(spec, override) assert_exception_correct(got.value, expected) def test_get_configured_client_group_spec_fail(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) with raises(Exception) as got: jrr.get_configured_client_group_spec("cg4") @@ -662,6 +640,17 @@ def test_get_configured_client_group_spec_fail(): # testing may be required. +def get_catalog_cache_mock(catalog_return=None): + """ + :param catalog_return: Set the lookup_job_resource_requirements return value + :return: A mocked instance of the CatalogCache + """ + catalog_cache = create_autospec(CatalogCache, spec_set=True, instance=True) + if catalog_return is not None: + catalog_cache.lookup_job_resource_requirements.return_value = catalog_return + return catalog_cache + + def test_resolve_requirements_from_spec(): """ Resolve requirements when no user input and no catalog record is available. 
@@ -672,14 +661,12 @@ def test_resolve_requirements_from_spec(): def _resolve_requirements_from_spec(catalog_return): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = catalog_return + catalog_cache = get_catalog_cache_mock(catalog_return) spec = _get_simple_deploy_spec_file_obj() + jrr = JobRequirementsResolver(spec) - jrr = JobRequirementsResolver(catalog, spec) - - assert jrr.resolve_requirements(" mod.meth ") == JobRequirements( + assert jrr.resolve_requirements(" mod.meth ", catalog_cache) == JobRequirements( 8, 700, 32, @@ -687,9 +674,8 @@ def _resolve_requirements_from_spec(catalog_return): client_group_regex=False, debug_mode=True, ) - - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "mod", "function_name": "meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="mod", function_name="meth" ) @@ -697,22 +683,20 @@ def test_resolve_requirements_from_spec_with_override(): """ Test that an override ignores client group information from the catalog and deploy config. """ - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [{"client_groups": ["cg2"]}] + catalog_cache = get_catalog_cache_mock(catalog_return=[{"client_groups": ["cg2"]}]) spec = _get_simple_deploy_spec_file_obj() - - jrr = JobRequirementsResolver(catalog, spec, " cg1 ") - - assert jrr.resolve_requirements(" module2. some_meth ") == JobRequirements( + jrr = JobRequirementsResolver(spec, " cg1 ") + assert jrr.resolve_requirements( + " module2. 
some_meth ", catalog_cache + ) == JobRequirements( 4, 2000, 100, "cg1", ) - - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "module2", "function_name": "some_meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" ) @@ -721,30 +705,28 @@ def test_resolve_requirements_from_spec_with_override_and_user_client_group(): Test that a user providing a client group ignores client group information from all other sources. """ - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [{"client_groups": ["cg2"]}] + catalog_cache = get_catalog_cache_mock(catalog_return=[{"client_groups": ["cg2"]}]) spec = _get_simple_deploy_spec_file_obj() - - jrr = JobRequirementsResolver(catalog, spec, " cg2 ") + jrr = JobRequirementsResolver(spec, " cg2 ") assert jrr.resolve_requirements( - " module2. some_meth ", client_group=" cg1" + " module2. some_meth ", + client_group=" cg1", + catalog_cache=catalog_cache, ) == JobRequirements( 4, 2000, 100, "cg1", ) - - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "module2", "function_name": "some_meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" ) def test_resolve_requirements_from_catalog_full_CSV(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [ + return_value = [ { "client_groups": [ "cg1", @@ -759,11 +741,15 @@ def test_resolve_requirements_from_catalog_full_CSV(): } ] + catalog_cache = get_catalog_cache_mock(return_value) + spec = _get_simple_deploy_spec_file_obj() - jrr = JobRequirementsResolver(catalog, spec) + jrr = JobRequirementsResolver(spec) - assert jrr.resolve_requirements(" module2. some_meth ") == JobRequirements( + assert jrr.resolve_requirements( + " module2. 
some_meth ", catalog_cache + ) == JobRequirements( 78, 500, 700, @@ -775,14 +761,14 @@ def test_resolve_requirements_from_catalog_full_CSV(): True, ) - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "module2", "function_name": "some_meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" ) def test_resolve_requirements_from_catalog_partial_JSON(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [ + + return_value = [ { "client_groups": [ '{"client_group": " cg1 "', @@ -792,12 +778,15 @@ def test_resolve_requirements_from_catalog_partial_JSON(): ] } ] + catalog_cache = get_catalog_cache_mock(return_value) spec = _get_simple_deploy_spec_file_obj() - jrr = JobRequirementsResolver(catalog, spec) + jrr = JobRequirementsResolver(spec) - assert jrr.resolve_requirements(" module2. some_meth ") == JobRequirements( + assert jrr.resolve_requirements( + " module2. 
some_meth ", catalog_cache + ) == JobRequirements( 4, 300, 100000, @@ -805,8 +794,8 @@ def test_resolve_requirements_from_catalog_partial_JSON(): scheduler_requirements={"exactlythesameshape": "asathingy"}, ) - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "module2", "function_name": "some_meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" ) @@ -816,8 +805,8 @@ def test_resolve_requirements_from_user_full(): def _resolve_requirements_from_user_full(bool_val): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [ + + return_value = [ { "client_groups": [ "cg2", @@ -831,13 +820,14 @@ def _resolve_requirements_from_user_full(bool_val): ] } ] - + catalog_cache = get_catalog_cache_mock(return_value) spec = _get_simple_deploy_spec_file_obj() - jrr = JobRequirementsResolver(catalog, spec) + jrr = JobRequirementsResolver(spec) assert jrr.resolve_requirements( " module2. some_meth ", + catalog_cache, 42, 789, 1, @@ -866,8 +856,8 @@ def _resolve_requirements_from_user_full(bool_val): bool_val, ) - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "module2", "function_name": "some_meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" ) @@ -877,8 +867,8 @@ def test_resolve_requirements_from_user_partial(): Also tests that special keys are removed from the scheduler requirements. 
""" - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [ + + return_value = [ { "client_groups": [ "cg2", @@ -891,14 +881,16 @@ def test_resolve_requirements_from_user_partial(): ] } ] + catalog_cache = get_catalog_cache_mock(return_value) spec = _get_simple_deploy_spec_file_obj() - jrr = JobRequirementsResolver(catalog, spec) + jrr = JobRequirementsResolver(spec) assert jrr.resolve_requirements( " module2. some_meth ", cpus=42, + catalog_cache=catalog_cache, client_group="cg1", client_group_regex=True, scheduler_requirements={ @@ -922,17 +914,19 @@ def test_resolve_requirements_from_user_partial(): debug_mode=True, ) - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "module2", "function_name": "some_meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" ) def test_resolve_requirements_fail_illegal_inputs(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + catalog_cache = get_catalog_cache_mock() _resolve_requirements_fail( jrr, + catalog_cache, None, {}, IncorrectParamsException( @@ -941,6 +935,7 @@ def test_resolve_requirements_fail_illegal_inputs(): ) _resolve_requirements_fail( jrr, + catalog_cache, "method", {}, IncorrectParamsException( @@ -949,6 +944,7 @@ def test_resolve_requirements_fail_illegal_inputs(): ) _resolve_requirements_fail( jrr, + catalog_cache, "mod1.mod2.method", {}, IncorrectParamsException( @@ -957,36 +953,42 @@ def test_resolve_requirements_fail_illegal_inputs(): ) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {"cpus": 0}, IncorrectParamsException("CPU count must be at least 1"), ) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {"memory_MB": 0}, IncorrectParamsException("memory in MB 
must be at least 1"), ) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {"disk_GB": 0}, IncorrectParamsException("disk space in GB must be at least 1"), ) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {"client_group": " \t "}, IncorrectParamsException("Missing input parameter: client_group"), ) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {"bill_to_user": "\b"}, IncorrectParamsException("bill_to_user contains control characters"), ) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {"scheduler_requirements": {"a": None}}, IncorrectParamsException( @@ -996,34 +998,35 @@ def test_resolve_requirements_fail_illegal_inputs(): def test_resolve_requirements_fail_catalog_multiple_entries(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [{"client_groups": ["cg2"]}, {}] - jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + return_value = [{"client_groups": ["cg2"]}, {}] + catalog_cache = get_catalog_cache_mock(return_value) + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {}, ValueError( "Unexpected result from the Catalog service: more than one client group " - + "configuration found for method m.m" + + f"configuration found for method m.m {return_value}" ), ) - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "m", "function_name": "m"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="m", function_name="m" ) def test_resolve_requirements_fail_catalog_bad_JSON(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [ - {"client_groups": ['{"foo": "bar", "baz":}']} - ] - jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + return_value = [{"client_groups": ['{"foo": "bar", "baz":}']}] + catalog_cache = 
get_catalog_cache_mock(return_value) + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {}, ValueError( @@ -1031,20 +1034,20 @@ def test_resolve_requirements_fail_catalog_bad_JSON(): ), ) - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "m", "function_name": "m"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="m", function_name="m" ) def test_resolve_requirements_fail_catalog_bad_CSV(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [ - {"client_groups": ["cg", "foo is bar"]} - ] - jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + return_value = [{"client_groups": ["cg", "foo is bar"]}] + catalog_cache = get_catalog_cache_mock(return_value) + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {}, ValueError( @@ -1053,71 +1056,69 @@ def test_resolve_requirements_fail_catalog_bad_CSV(): ), ) - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "m", "function_name": "m"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="m", function_name="m" ) def test_resolve_requirements_fail_catalog_normalize(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [ - {"client_groups": ["cg", "request_memory=72TB"]} - ] - jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + return_value = [{"client_groups": ["cg", "request_memory=72TB"]}] + catalog_cache = get_catalog_cache_mock(return_value) + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) _resolve_requirements_fail( jrr, + catalog_cache, " mod . 
meth ", {}, IncorrectParamsException( "Found illegal memory request '72TB' in job requirements from catalog method mod.meth" ), ) - - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "mod", "function_name": "meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="mod", function_name="meth" ) def test_resolve_requirements_fail_catalog_clientgroup(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [ - {"client_groups": ["cg", "request_memory=72"]} - ] - jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + return_value = [{"client_groups": ["cg", "request_memory=72"]}] + catalog_cache = get_catalog_cache_mock(return_value) + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) _resolve_requirements_fail( jrr, + catalog_cache, " mod . meth ", {}, IncorrectParamsException( "Catalog specified illegal client group 'cg' for method mod.meth" ), ) - - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "mod", "function_name": "meth"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="mod", function_name="meth" ) def test_resolve_requirements_fail_input_clientgroup(): - catalog = create_autospec(Catalog, spec_set=True, instance=True) - catalog.list_client_group_configs.return_value = [] - jrr = JobRequirementsResolver(catalog, _get_simple_deploy_spec_file_obj()) + catalog_cache = get_catalog_cache_mock([]) + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) _resolve_requirements_fail( jrr, + catalog_cache, "m.m", {"client_group": "cb4"}, IncorrectParamsException("No such clientgroup: cb4"), ) - catalog.list_client_group_configs.assert_called_once_with( - {"module_name": "m", "function_name": "m"} + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="m", function_name="m" ) -def 
_resolve_requirements_fail(jrr, method, kwargs, expected): +def _resolve_requirements_fail(jrr, catalog_cache, method, kwargs, expected): + # Workaround to avoid passing catalog multiple times with raises(Exception) as got: - jrr.resolve_requirements(method, **kwargs) + jrr.resolve_requirements(method, catalog_cache, **kwargs) assert_exception_correct(got.value, expected) diff --git a/test/utils_shared/mock_utils.py b/test/utils_shared/mock_utils.py index a3d41c0a3..58cfbf71b 100644 --- a/test/utils_shared/mock_utils.py +++ b/test/utils_shared/mock_utils.py @@ -16,7 +16,7 @@ def _build_job_reqs(config, cfgfile, impls): with open(cfgfile) as cf: - return JobRequirementsResolver(impls[Catalog], cf) + return JobRequirementsResolver(cf) _CLASS_IMPLEMENTATION_BUILDERS = { @@ -56,15 +56,13 @@ def get_client_mocks(config, config_path, *to_be_mocked): if clazz in to_be_mocked: ret[clazz] = create_autospec(clazz, instance=True, spec_set=True) else: - # this is a hack - only one client depends on another (JRR -> Cat) - # so we rely on the ALL_CLIENTS sort to make sure the dependency is built before the - # dependent module. If things become more complicated we'll need a dependency graph. 
ret[clazz] = _CLASS_IMPLEMENTATION_BUILDERS[clazz](config, config_path, ret) ret[ClientSet] = ClientSet( ret[KBaseAuth], ret[AdminAuthUtil], ret[Condor], - ret[Catalog], + ret[Catalog], # This one is for "catalog" + ret[Catalog], # This one is for "catalog_no_auth" ret[JobRequirementsResolver], ret[KafkaClient], ret[MongoUtil], diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index 73a59605d..a39e5e4b8 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -1,9 +1,9 @@ import json -import os.path -import uuid import logging +import os.path import socket import time +import uuid from configparser import ConfigParser from contextlib import closing from datetime import datetime @@ -12,11 +12,10 @@ import requests from dotenv import load_dotenv -from lib.execution_engine2.db.models.models import Job, JobInput, Meta -from lib.execution_engine2.db.models.models import Status -from lib.execution_engine2.exceptions import MalformedTimestampException -from lib.execution_engine2.utils.CondorTuples import JobInfo - +from execution_engine2.db.models.models import Job, JobInput, Meta +from execution_engine2.db.models.models import Status +from execution_engine2.exceptions import MalformedTimestampException +from execution_engine2.utils.CondorTuples import JobInfo EE2_CONFIG_SECTION = "execution_engine2" KB_DEPLOY_ENV = "KB_DEPLOYMENT_CONFIG" @@ -541,3 +540,11 @@ def get_ee2_test_config() -> Dict[str, str]: cfg[nameval[0]] = nameval[1] return cfg + + +CLIENT_GROUP_CONFIG = { + "module_name": "module_name", + "function_name": "function_name", + "client_groups": ["client_groups_go_here"], +} +MODULE_VERSION = {"git_commit_hash": 123} From c38185a3613463d8c98873f093af57d139bf0e41 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 24 Jun 2021 18:52:01 -0500 Subject: [PATCH 089/109] DATAUP-461 Refactor Retry (#397) * black * ran blackity black * Fix tests * Fix tests * Updated some documentation a bit * removed comment * 
removed unused * black * Remove toggle * Removed spec doc Co-authored-by: bio-boris --- execution_engine2.spec | 40 +++++++++---- lib/execution_engine2/db/models/models.py | 8 +-- lib/execution_engine2/sdk/EE2Runjob.py | 17 +++++- lib/execution_engine2/sdk/EE2Status.py | 2 + lib/execution_engine2/sdk/EE2StatusRange.py | 5 +- test/tests_for_db/ee2_MongoUtil_test.py | 12 ++++ test/tests_for_integration/api_to_db_test.py | 41 +++++++++---- test/tests_for_sdkmr/EE2StatusRange_test.py | 1 + ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 30 +++++++--- ...ee2_SDKMethodRunner_test_EE2Status_test.py | 60 ++++++++++++++++++- test/tests_for_sdkmr/ee2_retry_test.py | 4 ++ 11 files changed, 177 insertions(+), 43 deletions(-) diff --git a/execution_engine2.spec b/execution_engine2.spec index 5aa498bf0..9c839994e 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -428,6 +428,7 @@ wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) ## TODO - verify + job_output - object - outputs from the job (from the run_job call) ## TODO - verify updated - int - timestamp since epoch in milliseconds of the last time the status was updated running - int - timestamp since epoch in milliseconds of when it entered the running state created - int - timestamp since epoch in milliseconds when the job was created @@ -452,18 +453,34 @@ errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that accompanies the error code and message - terminated_code - int - internal reason why a job was terminated, one of: - 0 - user cancellation - 1 - admin cancellation - 2 - terminated by some automatic process + #TODO, add these to the structure? 
+ condor_job_ads - dict - condor related job information - @optional error - @optional error_code - @optional errormsg - @optional terminated_code - @optional estimating - @optional running - @optional finished + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this job + retry_parent - str - job_id of the parent this retry is based off of. Not available on a retry_parent itself + + parent_job_id - str - job_id taken from job_input.parent_job_id + batch_job - bool - whether or not this is a batch parent container + child_jobs - array - Only parent container should have child job ids + + scheduler_type - str - scheduler, such as awe or condor + scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for estimation + + + terminated_code - int - internal reason why a job was terminated, one of: + 0 - user cancellation + 1 - admin cancellation + 2 - terminated by some automatic process + + @optional error + @optional error_code + @optional errormsg + @optional terminated_code + @optional estimating + @optional running + @optional finished */ @@ -484,6 +501,7 @@ int error_code; string errormsg; int terminated_code; + } JobState; /* diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index 55e91034a..37caa8cfe 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -322,12 +322,12 @@ class Job(Document): condor_job_ads = DynamicField() child_jobs = ListField() # Only parent container should have child jobs # batch_parent_container = BooleanField(default=False) # Only parent container should have this - - # Only present when a job has been retried and on the retry_parent - retry_count = IntField(min_value=0) - + retry_ids = ListField() # The retry_parent has been used to launch these jobs # Only present on a retried job, not it's parent. 
If attempting to retry this job, use its parent instead retry_parent = StringField() + retry_saved_toggle = BooleanField( + default=False + ) # Marked true when all retry steps have completed meta = {"collection": "ee2_jobs"} diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 17636dbda..c3b6ca609 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -55,6 +55,7 @@ _APP_ID = "app_id" _PARENT_JOB_ID = "parent_job_id" _PARENT_RETRY_JOB_ID = "retry_parent" +_RETRY_IDS = "retry_ids" _WORKSPACE_ID = "wsid" _SOURCE_WS_OBJECTS = "source_ws_objects" _SERVICE_VER = "service_ver" @@ -571,7 +572,7 @@ def _retry(self, job_id: str, job: Job, parent_job: Job, as_admin: bool = False) # 1) Notify the parent container that it has a new child.. if parent_job: try: - parent_job.modify(push__child_jobs=retry_job_id) + parent_job.modify(add_to_set__child_jobs=retry_job_id) except Exception as e: self._db_update_failure( job_that_failed_operation=str(parent_job.id), @@ -579,15 +580,25 @@ def _retry(self, job_id: str, job: Job, parent_job: Job, as_admin: bool = False) exception=e, ) - # 2) Notify the retry_parent that it has been retried + # 2) Notify the retry_parent that it has been retried by adding a retry id try: - job.modify(inc__retry_count=1) + job.modify(add_to_set__retry_ids=retry_job_id) except Exception as e: self._db_update_failure( job_that_failed_operation=str(job.id), job_to_abort=retry_job_id, exception=e, ) + # 3) If the retry_ids is updated and if present, the child_jobs, is updated, set toggle to true + try: + retry_job = self.sdkmr.get_mongo_util().get_job(job_id=retry_job_id) + retry_job.modify(set__retry_saved_toggle=True) + except Exception: + self.logger.error( + f"Couldn't toggle job retry state for {retry_job_id} ", + exc_info=True, + stack_info=True, + ) # Should we compare the original and child job to make sure certain fields match, # to make sure the retried job is 
correctly submitted? Or save that for a unit test? diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index 63fc1a29b..bec552e8d 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -472,7 +472,9 @@ def check_jobs( else: mongo_rec = job.to_mongo().to_dict() del mongo_rec["_id"] + mongo_rec["retry_count"] = len(job["retry_ids"]) mongo_rec["job_id"] = str(job.id) + mongo_rec["parent_job_id"] = job.job_input.parent_job_id mongo_rec["created"] = int(job.id.generation_time.timestamp() * 1000) mongo_rec["updated"] = int(job.updated * 1000) if job.estimating: diff --git a/lib/execution_engine2/sdk/EE2StatusRange.py b/lib/execution_engine2/sdk/EE2StatusRange.py index 18703fdbd..3bd22203c 100644 --- a/lib/execution_engine2/sdk/EE2StatusRange.py +++ b/lib/execution_engine2/sdk/EE2StatusRange.py @@ -204,14 +204,13 @@ def _job_state_from_jobs(jobs): str(job_id) float(created/queued/estimating/running/finished/updated/) (Time in MS) """ - retry_keys = ["retry_parent", "retried", "retry_count"] + hidden_keys = ["retry_saved_toggle"] job_states = [] for job in jobs: mongo_rec = job.to_mongo().to_dict() - # Hack until job browser supports these keys - for key in retry_keys: + for key in hidden_keys: if key in mongo_rec: del mongo_rec[key] diff --git a/test/tests_for_db/ee2_MongoUtil_test.py b/test/tests_for_db/ee2_MongoUtil_test.py index 2ddbfbf23..fc0e276bf 100644 --- a/test/tests_for_db/ee2_MongoUtil_test.py +++ b/test/tests_for_db/ee2_MongoUtil_test.py @@ -90,6 +90,8 @@ def test_get_job_ok(self): "scheduler_id", "child_jobs", "batch_job", + "retry_ids", + "retry_saved_toggle", ] self.assertCountEqual(job.keys(), expected_keys) @@ -110,6 +112,8 @@ def test_get_job_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] self.assertCountEqual(job.keys(), expected_keys) @@ -129,6 +133,8 @@ def test_get_job_ok(self): "scheduler_id", "batch_job", 
"child_jobs", + "retry_ids", + "retry_saved_toggle", ] self.assertCountEqual(job.keys(), expected_keys) @@ -161,6 +167,8 @@ def test_get_jobs_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] for job in jobs: @@ -180,6 +188,8 @@ def test_get_jobs_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] for job in jobs: self.assertCountEqual(job.to_mongo().to_dict().keys(), expected_keys) @@ -211,6 +221,8 @@ def test_connection_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] self.assertCountEqual(job.keys(), expected_keys) diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 149e97e8f..5ca063a30 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -28,19 +28,27 @@ import os import tempfile import time -import htcondor - -from bson import ObjectId from configparser import ConfigParser -from threading import Thread from pathlib import Path -import pymongo -from pytest import fixture, raises +from threading import Thread from typing import Dict from unittest.mock import patch, create_autospec, ANY, call +import htcondor +import pymongo +from bson import ObjectId +from pytest import fixture, raises + +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from installed_clients.WorkspaceClient import Workspace +from installed_clients.baseclient import ServerError +from installed_clients.execution_engine2Client import execution_engine2 as ee2client +from test.utils_shared.test_utils import bootstrap from tests_for_integration.auth_controller import AuthController from tests_for_integration.workspace_controller import WorkspaceController + +# in the future remove this +from tests_for_utils.Condor_test import _get_common_sub from utils_shared.test_utils import ( get_full_test_config, get_ee2_test_config, @@ 
-54,13 +62,8 @@ assert_close_to_now, assert_exception_correct, ) -from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE -from installed_clients.baseclient import ServerError -from installed_clients.execution_engine2Client import execution_engine2 as ee2client -from installed_clients.WorkspaceClient import Workspace -# in the future remove this -from tests_for_utils.Condor_test import _get_common_sub +bootstrap() KEEP_TEMP_FILES = False TEMP_DIR = Path("test_temp_can_delete") @@ -528,6 +531,8 @@ def _check_mongo_job( }, }, "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, "batch_job": False, "scheduler_id": "123", "scheduler_type": "condor", @@ -1243,6 +1248,8 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): }, }, "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, "batch_job": False, "scheduler_id": "123", "scheduler_type": "condor", @@ -1272,6 +1279,8 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "narrative_cell_info": {}, }, "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, "batch_job": False, "scheduler_id": "456", "scheduler_type": "condor", @@ -1299,6 +1308,8 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): }, "child_jobs": [job_id_1, job_id_2], "batch_job": True, + "retry_ids": [], + "retry_saved_toggle": False, } assert parent_job == expected_parent_job @@ -1430,6 +1441,8 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli "batch_job": False, "scheduler_id": "123", "scheduler_type": "condor", + "retry_ids": [], + "retry_saved_toggle": False, } assert job1 == expected_job1 @@ -1455,6 +1468,8 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli "batch_job": False, "scheduler_id": "456", "scheduler_type": "condor", + "retry_ids": [], + "retry_saved_toggle": False, } assert job2 == expected_job2 @@ -1474,6 +1489,8 @@ def 
test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli }, "child_jobs": [job_id_1, job_id_2], "batch_job": True, + "retry_ids": [], + "retry_saved_toggle": False, } assert parent_job == expected_parent_job diff --git a/test/tests_for_sdkmr/EE2StatusRange_test.py b/test/tests_for_sdkmr/EE2StatusRange_test.py index 7d2e14db4..f515898f1 100644 --- a/test/tests_for_sdkmr/EE2StatusRange_test.py +++ b/test/tests_for_sdkmr/EE2StatusRange_test.py @@ -84,6 +84,7 @@ def _run_minimal(user): "status": created_state, "updated": 1000000000, "user": expected_user, + "retry_ids": [], } ], "limit": 2000, diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index ff58930d1..8e29def37 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -265,7 +265,9 @@ def check_retry_job_state(job_id: str, retry_job_id: str): assert job[item] == retry_job[item] assert retry_job.retry_parent == job_id - assert job.retry_count > 0 + assert len(job.retry_ids) > 0 + assert retry_job_id in job.retry_ids + assert not job.retry_saved_toggle and retry_job.retry_saved_toggle @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -310,7 +312,7 @@ def test_retry_job_multiple(self, rq_mock, condor_mock): parent_job_id1, parent_job_id2, ) - fail_msg = f"Retry of the same id in the same request is not supported. Offending ids:{[parent_job_id1,parent_job_id2]} " + fail_msg = f"Retry of the same id in the same request is not supported. 
Offending ids:{[parent_job_id1, parent_job_id2]} " with self.assertRaises(ValueError) as e: runner.retry_multiple(retry_candidates) @@ -323,6 +325,9 @@ def test_retry_job_multiple(self, rq_mock, condor_mock): parent_job_id3, parent_job_id4, ) + check_job = runner.check_job(parent_job_id1) + assert check_job["retry_ids"] == [] + assert check_job["retry_count"] == 0 retry_job_ids = runner.retry_multiple(retry_candidates) assert len(retry_job_ids) == len(retry_candidates) @@ -407,10 +412,19 @@ def test_retry_job(self, rq_mock, condor_mock): self.check_retry_job_state(parent_job_id, retry_from_original_again) for job in [original_job, retried_job, retried_job2, retried_job3]: + j = Job.objects.get(id=job["job_id"]) if job == original_job: assert original_job["retry_count"] == 3 + assert not j.retry_saved_toggle else: assert job["retry_parent"] == parent_job_id + assert j.retry_saved_toggle + + assert [ + retried_job["job_id"], + retried_job2["job_id"], + retried_job3["job_id"], + ] == original_job["retry_ids"] # 4. 
Get jobs and ensure they contain the same keys and params same_keys = ["user", "authstrat", "wsid", "scheduler_type", "job_input"] @@ -529,21 +543,21 @@ def test_run_job_batch(self, rq_mock, condor_mock): runner.update_job_status(job_id=child_job_id, status=Status.terminated.value) parent_job = runner.check_job(job_id=parent_job_id) assert len(parent_job["child_jobs"]) == 3 + retry_id = runner.retry(job_id=child_job_id)["retry_id"] + self.check_retry_job_state(child_job_id, retry_id) parent_job = runner.check_job(job_id=parent_job_id) assert len(parent_job["child_jobs"]) == 4 assert parent_job["child_jobs"][-1] == retry_id - job = Job.objects.get(id=child_job_id) - retry_count = job.retry_count + job = runner.check_job(job_id=child_job_id) + retry_count = job["retry_count"] # Test to see if one input fails, so fail them all with self.assertRaises(expected_exception=RetryFailureException): - retry_id = runner.retry_multiple(job_ids=[child_job_id, "grail", "fail"]) - print(retry_id) + runner.retry_multiple(job_ids=[child_job_id, "grail", "fail"]) # Check to see other job wasn't retried - job.reload() - assert job.retry_count == retry_count + assert retry_count == runner.check_job(job_id=child_job_id)["retry_count"] @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index 5985ba92d..e4b845808 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -69,6 +69,64 @@ def getRunner(self) -> SDKMethodRunner: def create_job_rec(self): return self.sdkmr_test_helper.create_job_rec() + @requests_mock.Mocker() + @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) + def test_check_job(self, rq_mock, condor_mock): + rq_mock.add_matcher( + run_job_adapter( + ws_perms_info={"user_id": 
self.user_id, "ws_perms": {self.ws_id: "a"}}, + user_roles=["EE2_ADMIN"], + ) + ) + runner = self.getRunner() + runner.get_condor = MagicMock(return_value=condor_mock) + job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=self.ws_id) + si = SubmissionInfo(clusterid="test", submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + condor_mock.get_job_resource_info = MagicMock( + return_value=self.fake_used_resources + ) + job_id = runner.run_job(params=job) + job_status = runner.check_job(job_id=job_id) + expected_status = { + "authstrat": "kbaseworkspace", + "batch_job": False, + "child_jobs": [], + "created": 1623781528000, + "job_id": "60c8f0989a70bc8ec0ac0ec7", + "job_input": { + "app_id": "module/super_function", + "method": "module.method", + "narrative_cell_info": {}, + "requirements": { + "clientgroup": "njs", + "cpu": 4, + "disk": 30, + "memory": 2000, + }, + "service_ver": "some_commit_hash", + "source_ws_objects": [], + "wsid": 9999, + }, + "parent_job_id": None, + "queued": 1623781529017, + "retry_count": 0, + "retry_ids": [], + "scheduler_id": "test", + "scheduler_type": "condor", + "status": "queued", + "updated": 1623781529017, + "user": "wsadmin", + "wsid": 9999, + } + + expected_different = ["job_id", "created", "queued", "updated"] + for key, val in expected_status.items(): + if key not in expected_different: + assert job_status[key] == val + else: + assert key in job_status + @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_run_job_and_handle_held(self, rq_mock, condor_mock): @@ -99,9 +157,7 @@ def test_run_job_and_handle_held(self, rq_mock, condor_mock): print( f"Job id is {job_id}. 
Status is {check_job.get('status')} Cluster is {check_job.get('scheduler_id')} " ) - job_record = runner.handle_held_job(cluster_id=check_job.get("scheduler_id")) - print("Records are", job_record.get("condor_job_ads")) self.assertEqual(self.fake_used_resources, job_record.get("condor_job_ads")) def test_update_job_status(self): diff --git a/test/tests_for_sdkmr/ee2_retry_test.py b/test/tests_for_sdkmr/ee2_retry_test.py index 184695525..fabddec5c 100644 --- a/test/tests_for_sdkmr/ee2_retry_test.py +++ b/test/tests_for_sdkmr/ee2_retry_test.py @@ -67,6 +67,8 @@ def test_retry_db_failures(): rj._db_update_failure = MagicMock() rj._retry(job_id=retry_job.id, job=retry_job, parent_job=parent_job) + assert not retry_job.retry_saved_toggle + def test_validate_retry(): sdkmr = create_autospec(SDKMethodRunner, instance=True, spec_set=True) @@ -121,6 +123,8 @@ def test_retry_get_run_job_params_from_existing_job(): "job_input", "child_jobs", "batch_job", + "retry_ids", + "retry_saved_toggle", ] expected_unequal_keys = [ "updated", From e58c0debbb6669058d6b861ef8a781f0428d490e Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 25 Jun 2021 16:44:25 -0500 Subject: [PATCH 090/109] Fix parent job id (#400) * Update EE2Status.py * Update EE2Status.py * ran black Co-authored-by: bio-boris --- lib/execution_engine2/sdk/EE2Status.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index bec552e8d..0a4b01b26 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -474,7 +474,9 @@ def check_jobs( del mongo_rec["_id"] mongo_rec["retry_count"] = len(job["retry_ids"]) mongo_rec["job_id"] = str(job.id) - mongo_rec["parent_job_id"] = job.job_input.parent_job_id + mongo_rec["parent_job_id"] = ( + job.job_input.parent_job_id if job.job_input else None + ) mongo_rec["created"] = int(job.id.generation_time.timestamp() * 1000) mongo_rec["updated"] = 
int(job.updated * 1000) if job.estimating: From bd218d67be688a27152bf99e6538cb98ee6654e0 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Wed, 30 Jun 2021 19:54:32 -0700 Subject: [PATCH 091/109] Protect the catalog cache internals from external mutation (#401) * Protect the catalog cache from external mutation * run black --- lib/execution_engine2/utils/catalog_cache.py | 4 ++- test/tests_for_utils/catalog_cache_test.py | 38 ++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/lib/execution_engine2/utils/catalog_cache.py b/lib/execution_engine2/utils/catalog_cache.py index df5d07dd9..46400f3cc 100644 --- a/lib/execution_engine2/utils/catalog_cache.py +++ b/lib/execution_engine2/utils/catalog_cache.py @@ -1,3 +1,5 @@ +import copy + from collections import defaultdict from typing import Dict @@ -90,4 +92,4 @@ def lookup_job_resource_requirements(self, module_name, function_name) -> dict: {"module_name": module_name, "function_name": function_name} ) # Retrieve from cache - return cr_cache[module_name][function_name] + return copy.deepcopy(cr_cache[module_name][function_name]) diff --git a/test/tests_for_utils/catalog_cache_test.py b/test/tests_for_utils/catalog_cache_test.py index 0c7713e78..0ae9380ba 100644 --- a/test/tests_for_utils/catalog_cache_test.py +++ b/test/tests_for_utils/catalog_cache_test.py @@ -91,6 +91,44 @@ def test_cc_job_reqs(catalog): ) +def test_cc_job_reqs_internal_mutation(catalog): + """ + Tests that if a client alters the job requirements returned from the cache, it does not + affect the cache internals. + """ + catalog.list_client_group_configs.return_value = [{"client_groups": ["kb_upload"]}] + + cc = CatalogCache(catalog) + + # call #1. Depending on the implementation, the catalog info may be returned directly + # or added to the cache and the cache entry returned. 
+ assert cc.lookup_job_resource_requirements( + "kb_uploadmethods", "import_reads_from_staging" + ) == [{"client_groups": ["kb_upload"]}] + + # call #2. Regardless of the implementation, this data should be coming from the cache. + cgs = cc.lookup_job_resource_requirements( + "kb_uploadmethods", "import_reads_from_staging" + ) + assert cgs == [{"client_groups": ["kb_upload"]}] + + # Mutate the cache if the cache implementation allows it + cgs[0]["client_groups"].pop(0) # The job requirements resolver does this + + # call #3. Confirm that the cache was not mutated + assert cc.lookup_job_resource_requirements( + "kb_uploadmethods", "import_reads_from_staging" + ) == [{"client_groups": ["kb_upload"]}] + + # check there was only one call to the cache + catalog.list_client_group_configs.assert_called_once_with( + { + "module_name": "kb_uploadmethods", + "function_name": "import_reads_from_staging", + } + ) + + def test_cc_git_commit_version(catalog): """Test to see the git commit cache is being used.""" catalog_cache = CatalogCache(catalog=catalog) From 8d4299f8210e0a7e4e96d428c52ee10519b5272b Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Fri, 9 Jul 2021 11:20:44 -0700 Subject: [PATCH 092/109] DATAUP-499 Switch parent_job_id for batch_id for 'true' batch jobs (#404) * Switch parent_job_id for batch_id for 'true' batch jobs Per the request of the front end devs, we want a way to distinguish 'true' batch jobs created by run_job_batch from 'manual' batch jobs created by including a parent_job_id in the job input params. As such, true batch jobs now have a batch_id field in the job status dict, replacing the parent_job_id field. That field remains in the job_input. A job must only ever have zero or one of the fields, never both. 
* run black * Match spec, change some spec keys Changed the abandon_children return dict to match the spec Updated the check_jobs_batch spec to match nomenclature elsewhere * Update names and docs to batch from parent --- execution_engine2.html | 2 +- execution_engine2.spec | 13 +- lib/execution_engine2/db/models/models.py | 9 +- .../execution_engine2Impl.py | 559 ++++++++++-------- lib/execution_engine2/sdk/EE2Runjob.py | 67 ++- lib/execution_engine2/sdk/EE2Status.py | 22 +- lib/execution_engine2/sdk/SDKMethodRunner.py | 10 +- test/tests_for_integration/api_to_db_test.py | 28 +- test/tests_for_sdkmr/EE2Runjob_test.py | 29 +- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 14 +- ...ee2_SDKMethodRunner_test_EE2Status_test.py | 28 +- test/tests_for_sdkmr/ee2_retry_test.py | 4 +- 12 files changed, 454 insertions(+), 331 deletions(-) diff --git a/execution_engine2.html b/execution_engine2.html index 0620c8ca0..14eda82c6 100644 --- a/execution_engine2.html +++ b/execution_engine2.html @@ -1 +1 @@ -execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*Job requirements for a job. All fields are optional. To submit job requirements,
*the user must have full EE2 admin permissions. Ignored for the run concierge endpoint.
*
*request_cpus: the number of CPUs to request for the job.
*request_memory: the amount of memory, in MB, to request for the job.
*request_disk: the amount of disk space, in GB, to request for the job.
*client_group: the name of the client group on which to run the job.
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*bill_to_user: the job will be counted against the provided user's fair share quota.
*ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false.
*scheduler_requirements: arbitrary key-value pairs to be provided to the job
*scheduler. Requires knowledge of the scheduler interface.
*debug_mode: Whether to run the job in debug mode. Default false.
*/
typedefstructure{
intrequest_cpus;
intrequst_memory;
intrequest_disk;
stringclient_group;
booleanclient_group_regex;
stringbill_to_user;
booleanignore_concurrency_limits;
mapping<string,string>scheduler_requirements;
booleandebug_mode;
}
JobRequirements;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*params - the parameters to pass to the method.
*
*Optional parameters:
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*Submitting a job with a parent ID to run_job_batch will cause an error to be
*returned.
*job_requirements: the requirements for the job. The user must have full EE2
*administration rights to use this parameter. Note that the job_requirements
*are not returned along with the rest of the job parameters when querying the EE2
*API - they are only considered when submitting a job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*Note that this field is not included in returned data when querying EE2.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
JobRequirementsjob_requirements;
booleanas_admin;
}
RunJobParams;

/*
*Start a new job.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*Additional parameters for a batch job.
*wsid: the workspace with which to associate the parent job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*/
typedefstructure{
intwsid;
booleanas_admin;
}
BatchParams;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idparent_job_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

/*
*Run a batch job, consisting of a parent job and one or more child jobs.
*Note that the as_admin parameters in the list of child jobs are ignored -
*only the as_admin parameter in the batch_params is considered.
*/
funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

/*
*job_id of retried job
*retry_id: job_id of the job that was launched
*str error: reason as to why that particular retry failed (available for bulk retry only)
*/
typedefstructure{
job_idjob_id;
job_idretry_id;
stringerror;
}
RetryResult;

/*
*job_id of job to retry
*as_admin: retry someone else's job in your namespace
*#TODO Possibly Add JobRequirements job_requirements;
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
RetryParams;

/*
*job_ids of job to retry
*as_admin: retry someone else's job in your namespace
*#TODO: Possibly Add list<JobRequirements> job_requirements;
*/
typedefstructure{
list<job_id>job_ids;
booleanas_admin;
}
BulkRetryParams;

/*
*#TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present
*#TODO Add retry child that checks the status of the child? to prevent multiple retries
*Allowed Jobs
** Regular Job with no children
** Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call
*Not Allowed
** Regular Job with children (Should not be possible to create yet)
** Batch Job Parent Container (Not a job, it won't do anything, except cancel its child jobs)
*/
funcdefretry_job(RetryParamsparams)returns(RetryResultretry_result)authenticationrequired;

/*
*Same as retry_job, but accepts multiple jobs
*/
funcdefretry_jobs(BulkRetryParamsparams)returns(list<RetryResult>retry_result)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optionalas_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optionalts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@successWhether or not the add operation was successful
*@line_numberthe line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to record normally not allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set true, job will set to running status skipping estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optionalerror
*@optionalerror_code
*@optionalerrormsg
*@optionalterminated_code
*@optionalestimating
*@optionalrunning
*@optionalfinished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*parent_job - state of parent job
*job_states - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStateparent_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optionalterminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with termination_code 0.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/*
*
*
*/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdefcheck_jobs_date_range_for_user(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

funcdefcheck_jobs_date_range_for_all(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

typedefstructure{
UnspecifiedObjectheld_job;
}
HeldJob;

/*
*Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
*/
funcdefhandle_held_job(stringcluster_id)returns(HeldJob)authenticationrequired;

/*
*Check if current user has ee2 admin rights.
*/
funcdefis_admin()returns(boolean)authenticationrequired;

/*
*str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
*/
typedefstructure{
stringpermission;
}
AdminRolesResults;

/*
*Check if current user has ee2 admin rights.
*If so, return the type of rights and their roles
*/
funcdefget_admin_permission()returns(AdminRolesResults)authenticationrequired;

/*
*Get a list of clientgroups manually extracted from the config file
*/
funcdefget_client_groups()returns(list<string>client_groups)authenticationnone;
};
\ No newline at end of file +execution_engine2
moduleexecution_engine2{

/*
*@range[0,1]
*/
typedefintboolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedefstringtimestamp;

/*
*A job id.
*/
typedefstringjob_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedefstructure{
stringgit_commit;
stringversion;
stringservice;
floatserver_time;
}
Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdeflist_config()returns(mapping<string,string>)authenticationoptional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdefver()returns(string)authenticationnone;

/*
*Simply check the status of this service to see queue details
*/
funcdefstatus()returns(Status)authenticationnone;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedefstringwsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedefstructure{
stringrun_id;
stringtoken_id;
stringtag;
stringcell_id;
}
Meta;

/*
*Job requirements for a job. All fields are optional. To submit job requirements,
*the user must have full EE2 admin permissions. Ignored for the run concierge endpoint.
*
*request_cpus: the number of CPUs to request for the job.
*request_memory: the amount of memory, in MB, to request for the job.
*request_disk: the amount of disk space, in GB, to request for the job.
*client_group: the name of the client group on which to run the job.
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*bill_to_user: the job will be counted against the provided user's fair share quota.
*ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false.
*scheduler_requirements: arbitrary key-value pairs to be provided to the job
*scheduler. Requires knowledge of the scheduler interface.
*debug_mode: Whether to run the job in debug mode. Default false.
*/
typedefstructure{
intrequest_cpus;
intrequst_memory;
intrequest_disk;
stringclient_group;
booleanclient_group_regex;
stringbill_to_user;
booleanignore_concurrency_limits;
mapping<string,string>scheduler_requirements;
booleandebug_mode;
}
JobRequirements;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*params - the parameters to pass to the method.
*
*Optional parameters:
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*Submitting a job with a parent ID to run_job_batch will cause an error to be
*returned.
*job_requirements: the requirements for the job. The user must have full EE2
*administration rights to use this parameter. Note that the job_requirements
*are not returned along with the rest of the job parameters when querying the EE2
*API - they are only considered when submitting a job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*Note that this field is not included in returned data when querying EE2.
*/
typedefstructure{
stringmethod;
stringapp_id;
list<UnspecifiedObject>params;
stringservice_ver;
list<wsref>source_ws_objects;
Metameta;
intwsid;
stringparent_job_id;
JobRequirementsjob_requirements;
booleanas_admin;
}
RunJobParams;

/*
*Start a new job.
*/
funcdefrun_job(RunJobParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*Additional parameters for a batch job.
*wsid: the workspace with which to associate the parent job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*/
typedefstructure{
intwsid;
booleanas_admin;
}
BatchParams;

typedefstructure{
job_idbatch_id;
list<job_id>child_job_ids;
}
BatchSubmission;

typedefstructure{
job_idbatch_id;
list<job_id>child_job_ids;
booleanas_admin;
}
AbandonChildren;

/*
*Run a batch job, consisting of a parent job and one or more child jobs.
*Note that the as_admin parameters in the list of child jobs are ignored -
*only the as_admin parameter in the batch_params is considered.
*/
funcdefrun_job_batch(list<RunJobParams>params,BatchParamsbatch_params)returns(BatchSubmissionjob_ids)authenticationrequired;

/*
*job_id of retried job
*retry_id: job_id of the job that was launched
*str error: reason as to why that particular retry failed (available for bulk retry only)
*/
typedefstructure{
job_idjob_id;
job_idretry_id;
stringerror;
}
RetryResult;

/*
*job_id of job to retry
*as_admin: retry someone else's job in your namespace
*#TODO Possibly Add JobRequirements job_requirements;
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
RetryParams;

/*
*job_ids of job to retry
*as_admin: retry someone else's job in your namespace
*#TODO: Possibly Add list<JobRequirements> job_requirements;
*/
typedefstructure{
list<job_id>job_ids;
booleanas_admin;
}
BulkRetryParams;

/*
*#TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present
*#TODO Add retry child that checks the status of the child? to prevent multiple retries
*Allowed Jobs
** Regular Job with no children
** Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call
*Not Allowed
** Regular Job with children (Should not be possible to create yet)
** Batch Job Parent Container (Not a job, it won't do anything, except cancel its child jobs)
*/
funcdefretry_job(RetryParamsparams)returns(RetryResultretry_result)authenticationrequired;

/*
*Same as retry_job, but accepts multiple jobs
*/
funcdefretry_jobs(BulkRetryParamsparams)returns(list<RetryResult>retry_result)authenticationrequired;

funcdefabandon_children(AbandonChildrenparams)returns(BatchSubmissionparent_and_child_ids)authenticationrequired;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedefstructure{
intrequest_cpu;
intrequest_memory;
intrequest_disk;
intjob_priority;
stringaccount_group;
booleanignore_concurrency_limits;
list<string>requirements_list;
stringclient_group;
booleanclient_group_regex;
booleandebug_mode;
}
ConciergeParams;

funcdefrun_job_concierge(RunJobParamsparams,ConciergeParamsconcierge_params)returns(job_idjob_id)authenticationrequired;

/*
*Get job params necessary for job execution
*@optional as_admin
*/
typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobParams;

funcdefget_job_params(GetJobParamsparams)returns(RunJobParamsparams)authenticationrequired;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedefstructure{
job_idjob_id;
stringstatus;
booleanas_admin;
}
UpdateJobStatusParams;

funcdefupdate_job_status(UpdateJobStatusParamsparams)returns(job_idjob_id)authenticationrequired;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optional ts
*/
typedefstructure{
stringline;
booleanis_error;
intts;
}
LogLine;

/*
*@success Whether or not the add operation was successful
*@line_number the line number of the last added log
*/
typedefstructure{
booleansuccess;
intline_number;
}
AddJobLogsResults;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
AddJobLogsParams;

funcdefadd_job_logs(AddJobLogsParamsparams,list<LogLine>lines)returns(AddJobLogsResultsresults)authenticationrequired;

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedefstructure{
list<LogLine>lines;
intlast_line_number;
intcount;
}
GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to record normally not allowed.
*/
typedefstructure{
job_idjob_id;
intskip_lines;
intoffset;
intlimit;
booleanas_admin;
}
GetJobLogsParams;

funcdefget_job_logs(GetJobLogsParamsparams)returns(GetJobLogsResults)authenticationrequired;

/*
*Error block of JSON RPC response
*/
typedefstructure{
stringname;
intcode;
stringmessage;
stringerror;
}
JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedefstructure{
job_idjob_id;
stringerror_message;
interror_code;
UnspecifiedObjectjob_output;
booleanas_admin;
}
FinishJobParams;

/*
*Register results of already started job
*/
funcdeffinish_job(FinishJobParamsparams)returns()authenticationrequired;

/*
*skip_estimation: default true. If set true, job will set to running status skipping estimation step
*/
typedefstructure{
job_idjob_id;
booleanskip_estimation;
booleanas_admin;
}
StartJobParams;

funcdefstart_job(StartJobParamsparams)returns()authenticationrequired;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedefstructure{
job_idjob_id;
list<string>exclude_fields;
booleanas_admin;
}
CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*job_output - object - outputs from the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*#TODO, add these to the structure?
*condor_job_ads - dict - condor related job information
*
*retry_count - int - generated field based on length of retry_ids
*retry_ids - list - list of jobs that are retried based off of this job
*retry_parent - str - job_id of the parent this retry is based off of. Not available on a retry_parent itself
*
*batch_id - str - the parent of the job, if the job is a child job created via run_job_batch
*batch_job - bool - whether or not this is a batch parent container
*child_jobs - array - Only parent container should have child job ids
*
*scheduler_type - str - scheduler, such as awe or condor
*scheduler_id - str - scheduler generated id
*scheduler_estimator_id - str - id for the job spawned for estimation
*
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optional error
*@optional error_code
*@optional errormsg
*@optional terminated_code
*@optional estimating
*@optional running
*@optional finished
*/
typedefstructure{
job_idjob_id;
stringuser;
stringauthstrat;
intwsid;
stringstatus;
RunJobParamsjob_input;
intcreated;
intqueued;
intestimating;
intrunning;
intfinished;
intupdated;
interror_code;
stringerrormsg;
intterminated_code;
stringbatch_id;
}
JobState;

/*
*get current status of a job
*/
funcdefcheck_job(CheckJobParamsparams)returns(JobStatejob_state)authenticationrequired;

/*
*batch_jobstate - state of parent job of the batch
*child_jobstates - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedefstructure{
JobStatebatch_jobstate;
list<JobState>child_jobstates;
}
CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdefcheck_job_batch(CheckJobParamsparams)returns(CheckJobBatchResults)authenticationrequired;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedefstructure{
list<JobState>job_states;
}
CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedefstructure{
list<job_id>job_ids;
list<string>exclude_fields;
booleanreturn_list;
}
CheckJobsParams;

funcdefcheck_jobs(CheckJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedefstructure{
stringworkspace_id;
list<string>exclude_fields;
booleanreturn_list;
booleanas_admin;
}
CheckWorkspaceJobsParams;

funcdefcheck_workspace_jobs(CheckWorkspaceJobsParamsparams)returns(CheckJobsResults)authenticationrequired;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optional terminated_code
*/
typedefstructure{
job_idjob_id;
intterminated_code;
booleanas_admin;
}
CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with terminated_code 0.
*/
funcdefcancel_job(CancelJobParamsparams)returns()authenticationrequired;

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedefstructure{
job_idjob_id;
booleanfinished;
booleancanceled;
stringujs_url;
booleanas_admin;
}
CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdefcheck_job_canceled(CancelJobParamsparams)returns(CheckJobCanceledResultresult)authenticationrequired;

typedefstructure{
stringstatus;
}
GetJobStatusResult;

typedefstructure{
job_idjob_id;
booleanas_admin;
}
GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdefget_job_status(GetJobStatusParamsparams)returns(GetJobStatusResultresult)authenticationrequired;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/*
*
*
*/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedefstructure{
list<JobState>jobs;
intcount;
intquery_count;
mapping<string,string>filter;
intskip;
list<string>projection;
intlimit;
stringsort_order;
}
CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optional projection
*@optional filter
*@optional limit
*@optional user
*@optional offset
*@optional ascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdefcheck_jobs_date_range_for_user(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

funcdefcheck_jobs_date_range_for_all(CheckJobsDateRangeParamsparams)returns(CheckJobsDateRangeResults)authenticationrequired;

typedefstructure{
UnspecifiedObjectheld_job;
}
HeldJob;

/*
*Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
*/
funcdefhandle_held_job(stringcluster_id)returns(HeldJob)authenticationrequired;

/*
*Check if current user has ee2 admin rights.
*/
funcdefis_admin()returns(boolean)authenticationrequired;

/*
*str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
*/
typedefstructure{
stringpermission;
}
AdminRolesResults;

/*
*Check if current user has ee2 admin rights.
*If so, return the type of rights and their roles
*/
funcdefget_admin_permission()returns(AdminRolesResults)authenticationrequired;

/*
*Get a list of clientgroups manually extracted from the config file
*/
funcdefget_client_groups()returns(list<string>client_groups)authenticationnone;
};
\ No newline at end of file diff --git a/execution_engine2.spec b/execution_engine2.spec index 9c839994e..eb042674e 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -181,12 +181,12 @@ } BatchParams; typedef structure { - job_id parent_job_id; + job_id batch_id; list child_job_ids; } BatchSubmission; typedef structure { - job_id parent_job_id; + job_id batch_id; list child_job_ids; boolean as_admin; } AbandonChildren; @@ -460,7 +460,7 @@ retry_ids - list - list of jobs that are retried based off of this job retry_parent - str - job_id of the parent this retry is based off of. Not available on a retry_parent itself - parent_job_id - str - job_id taken from job_input.parent_job_id + batch_id - str - the coordinating job, if the job is a child job created via run_job_batch batch_job - bool - whether or not this is a batch parent container child_jobs - array - Only parent container should have child job ids @@ -501,6 +501,7 @@ int error_code; string errormsg; int terminated_code; + string batch_id; } JobState; @@ -510,12 +511,12 @@ funcdef check_job(CheckJobParams params) returns (JobState job_state) authentication required; /* - parent_job - state of parent job - job_states - states of child jobs + batch_jobstate - state of the coordinating job for the batch + child_jobstates - states of child jobs IDEA: ADD aggregate_states - count of all available child job states, even if they are zero */ typedef structure { - JobState parent_jobstate; + JobState batch_jobstate; list child_jobstates; } CheckJobBatchResults; diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index 37caa8cfe..99e115412 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -165,6 +165,8 @@ class JobInput(EmbeddedDocument): service_ver = StringField(required=True) app_id = StringField() source_ws_objects = ListField() + # this ID is for jobs submitted via run_job with a parent_job_id 
field included by the + # client. For this case, the parent job is not updated at all. parent_job_id = StringField() requirements = EmbeddedDocumentField(JobRequirements) narrative_cell_info = EmbeddedDocumentField(Meta, required=True) @@ -320,7 +322,12 @@ class Job(Document): job_input = EmbeddedDocumentField(JobInput, required=True) job_output = DynamicField() condor_job_ads = DynamicField() - child_jobs = ListField() # Only parent container should have child jobs + # this is the ID of the coordinating job created as part of run_job_batch. Only child jobs + # in a "true" batch job maintained by EE2 should have this field. Coordinating jobs will + # be updated with the child ID in child_jobs, unlike "fake" batch jobs that are created + # outside of the EE2 codebase using the 'parent_job_id' field. + batch_id = StringField() + child_jobs = ListField() # Only coordinating jobs should have child jobs # batch_parent_container = BooleanField(default=False) # Only parent container should have this retry_ids = ListField() # The retry_parent has been used to launch these jobs # Only present on a retried job, not it's parent. If attempting to retry this job, use its parent instead diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index e2684a99d..49e7b9b9a 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -29,8 +29,8 @@ class execution_engine2: # the latter method is running. 
######################################### noqa VERSION = "0.0.5" - GIT_URL = "git@github.com:kbase/execution_engine2.git" - GIT_COMMIT_HASH = "8b6f4e1917dbdfa374e6f22b1f2adbe7eca5a24c" + GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" + GIT_COMMIT_HASH = "2ad95ce47caa4f1e7b939651f2b1773840e67a8a" #BEGIN_CLASS_HEADER MONGO_COLLECTION = "jobs" @@ -352,8 +352,8 @@ def run_job_batch(self, ctx, params, batch_params): parameter "wsid" of Long, parameter "as_admin" of type "boolean" (@range [0,1]) :returns: instance of type "BatchSubmission" -> structure: parameter - "parent_job_id" of type "job_id" (A job id.), parameter - "child_job_ids" of list of type "job_id" (A job id.) + "batch_id" of type "job_id" (A job id.), parameter "child_job_ids" + of list of type "job_id" (A job id.) """ # ctx is the context object # return variables are: job_ids @@ -453,12 +453,12 @@ def retry_jobs(self, ctx, params): def abandon_children(self, ctx, params): """ :param params: instance of type "AbandonChildren" -> structure: - parameter "parent_job_id" of type "job_id" (A job id.), parameter + parameter "batch_id" of type "job_id" (A job id.), parameter "child_job_ids" of list of type "job_id" (A job id.), parameter "as_admin" of type "boolean" (@range [0,1]) :returns: instance of type "BatchSubmission" -> structure: parameter - "parent_job_id" of type "job_id" (A job id.), parameter - "child_job_ids" of list of type "job_id" (A job id.) + "batch_id" of type "job_id" (A job id.), parameter "child_job_ids" + of list of type "job_id" (A job id.) 
""" # ctx is the context object # return variables are: parent_and_child_ids @@ -469,7 +469,7 @@ def abandon_children(self, ctx, params): job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, ) - parent_and_child_ids = mr.abandon_children(parent_job_id=params['parent_job_id'], + parent_and_child_ids = mr.abandon_children(batch_id=params['batch_id'], child_job_ids=params['child_job_ids'], as_admin=params.get('as_admin')) #END abandon_children @@ -884,29 +884,42 @@ def check_job(self, ctx, params): id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) ## TODO - verify - updated - int - timestamp since epoch in milliseconds of the last - time the status was updated running - int - timestamp since epoch - in milliseconds of when it entered the running state created - int - - timestamp since epoch in milliseconds when the job was created - finished - int - timestamp since epoch in milliseconds when the - job was finished status - string - status of the job. one of the - following: created - job has been created in the service - estimating - an estimation job is running to estimate resources - required for the main job, and which queue should be used queued - - job is queued to be run running - job is running on a worker node - completed - job was completed successfully error - job is no - longer running, but failed with an error terminated - job is no - longer running, terminated either due to user cancellation, admin - cancellation, or some automated task error_code - int - internal - reason why the job is an error. one of the following: 0 - unknown - 1 - job crashed 2 - job terminated by automation 3 - job ran over - time limit 4 - job was missing its automated output document 5 - - job authentication token expired errormsg - string - message (e.g. 
- stacktrace) accompanying an errored job error - object - the - JSON-RPC error package that accompanies the error code and message - terminated_code - int - internal reason why a job was terminated, - one of: 0 - user cancellation 1 - admin cancellation 2 - - terminated by some automatic process @optional error @optional + job_output - object - outputs from the job (from the run_job call) + ## TODO - verify updated - int - timestamp since epoch in + milliseconds of the last time the status was updated running - int + - timestamp since epoch in milliseconds of when it entered the + running state created - int - timestamp since epoch in + milliseconds when the job was created finished - int - timestamp + since epoch in milliseconds when the job was finished status - + string - status of the job. one of the following: created - job + has been created in the service estimating - an estimation job is + running to estimate resources required for the main job, and which + queue should be used queued - job is queued to be run running - + job is running on a worker node completed - job was completed + successfully error - job is no longer running, but failed with an + error terminated - job is no longer running, terminated either due + to user cancellation, admin cancellation, or some automated task + error_code - int - internal reason why the job is an error. one of + the following: 0 - unknown 1 - job crashed 2 - job terminated by + automation 3 - job ran over time limit 4 - job was missing its + automated output document 5 - job authentication token expired + errormsg - string - message (e.g. stacktrace) accompanying an + errored job error - object - the JSON-RPC error package that + accompanies the error code and message #TODO, add these to the + structure? 
condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional error_code @optional errormsg @optional terminated_code @optional estimating @optional running @optional finished) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "user" @@ -986,7 +999,8 @@ def check_job(self, ctx, params): response) -> structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of - String, parameter "terminated_code" of Long + String, parameter "terminated_code" of Long, parameter "batch_id" + of String """ # ctx is the context object # return variables are: job_state @@ -1018,38 +1032,51 @@ def check_job_batch(self, ctx, params): "job_id" of type "job_id" (A job id.), parameter "exclude_fields" of list of String, parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobBatchResults" (parent_job - state - of parent job job_states - states of child jobs IDEA: ADD - aggregate_states - count of all available child job states, even - if they are zero) -> structure: parameter 
"parent_jobstate" of - type "JobState" (job_id - string - id of the job user - string - - user who started the job wsid - int - optional id of the workspace - where the job is bound authstrat - string - what strategy used to - authenticate the job job_input - object - inputs to the job (from - the run_job call) ## TODO - verify updated - int - timestamp - since epoch in milliseconds of the last time the status was - updated running - int - timestamp since epoch in milliseconds of - when it entered the running state created - int - timestamp since - epoch in milliseconds when the job was created finished - int - - timestamp since epoch in milliseconds when the job was finished - status - string - status of the job. one of the following: created - - job has been created in the service estimating - an estimation - job is running to estimate resources required for the main job, - and which queue should be used queued - job is queued to be run - running - job is running on a worker node completed - job was - completed successfully error - job is no longer running, but - failed with an error terminated - job is no longer running, - terminated either due to user cancellation, admin cancellation, or - some automated task error_code - int - internal reason why the job - is an error. one of the following: 0 - unknown 1 - job crashed 2 - - job terminated by automation 3 - job ran over time limit 4 - job - was missing its automated output document 5 - job authentication - token expired errormsg - string - message (e.g. 
stacktrace) - accompanying an errored job error - object - the JSON-RPC error - package that accompanies the error code and message - terminated_code - int - internal reason why a job was terminated, - one of: 0 - user cancellation 1 - admin cancellation 2 - - terminated by some automatic process @optional error @optional + :returns: instance of type "CheckJobBatchResults" (batch_jobstate - + state of parent job of the batch child_jobstates - states of child + jobs IDEA: ADD aggregate_states - count of all available child job + states, even if they are zero) -> structure: parameter + "batch_jobstate" of type "JobState" (job_id - string - id of the + job user - string - user who started the job wsid - int - optional + id of the workspace where the job is bound authstrat - string - + what strategy used to authenticate the job job_input - object - + inputs to the job (from the run_job call) ## TODO - verify + job_output - object - outputs from the job (from the run_job call) + ## TODO - verify updated - int - timestamp since epoch in + milliseconds of the last time the status was updated running - int + - timestamp since epoch in milliseconds of when it entered the + running state created - int - timestamp since epoch in + milliseconds when the job was created finished - int - timestamp + since epoch in milliseconds when the job was finished status - + string - status of the job. one of the following: created - job + has been created in the service estimating - an estimation job is + running to estimate resources required for the main job, and which + queue should be used queued - job is queued to be run running - + job is running on a worker node completed - job was completed + successfully error - job is no longer running, but failed with an + error terminated - job is no longer running, terminated either due + to user cancellation, admin cancellation, or some automated task + error_code - int - internal reason why the job is an error. 
one of + the following: 0 - unknown 1 - job crashed 2 - job terminated by + automation 3 - job ran over time limit 4 - job was missing its + automated output document 5 - job authentication token expired + errormsg - string - message (e.g. stacktrace) accompanying an + errored job error - object - the JSON-RPC error package that + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional error_code @optional errormsg @optional terminated_code @optional estimating @optional running @optional finished) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "user" @@ -1129,35 +1156,48 @@ def check_job_batch(self, ctx, params): response) -> structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of - String, parameter "terminated_code" of Long, parameter - "child_jobstates" of list of type "JobState" (job_id - string - id - of the job user - string - user who started the job wsid - int - - optional id of the workspace 
where the job is bound authstrat - - string - what strategy used to authenticate the job job_input - - object - inputs to the job (from the run_job call) ## TODO - - verify updated - int - timestamp since epoch in milliseconds of - the last time the status was updated running - int - timestamp - since epoch in milliseconds of when it entered the running state - created - int - timestamp since epoch in milliseconds when the job - was created finished - int - timestamp since epoch in milliseconds - when the job was finished status - string - status of the job. one - of the following: created - job has been created in the service - estimating - an estimation job is running to estimate resources - required for the main job, and which queue should be used queued - - job is queued to be run running - job is running on a worker node - completed - job was completed successfully error - job is no - longer running, but failed with an error terminated - job is no - longer running, terminated either due to user cancellation, admin - cancellation, or some automated task error_code - int - internal - reason why the job is an error. one of the following: 0 - unknown - 1 - job crashed 2 - job terminated by automation 3 - job ran over - time limit 4 - job was missing its automated output document 5 - - job authentication token expired errormsg - string - message (e.g. 
- stacktrace) accompanying an errored job error - object - the - JSON-RPC error package that accompanies the error code and message - terminated_code - int - internal reason why a job was terminated, - one of: 0 - user cancellation 1 - admin cancellation 2 - - terminated by some automatic process @optional error @optional + String, parameter "terminated_code" of Long, parameter "batch_id" + of String, parameter "child_jobstates" of list of type "JobState" + (job_id - string - id of the job user - string - user who started + the job wsid - int - optional id of the workspace where the job is + bound authstrat - string - what strategy used to authenticate the + job job_input - object - inputs to the job (from the run_job call) + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in + milliseconds when the job was created finished - int - timestamp + since epoch in milliseconds when the job was finished status - + string - status of the job. one of the following: created - job + has been created in the service estimating - an estimation job is + running to estimate resources required for the main job, and which + queue should be used queued - job is queued to be run running - + job is running on a worker node completed - job was completed + successfully error - job is no longer running, but failed with an + error terminated - job is no longer running, terminated either due + to user cancellation, admin cancellation, or some automated task + error_code - int - internal reason why the job is an error. 
one of + the following: 0 - unknown 1 - job crashed 2 - job terminated by + automation 3 - job ran over time limit 4 - job was missing its + automated output document 5 - job authentication token expired + errormsg - string - message (e.g. stacktrace) accompanying an + errored job error - object - the JSON-RPC error package that + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional error_code @optional errormsg @optional terminated_code @optional estimating @optional running @optional finished) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "user" @@ -1237,7 +1277,8 @@ def check_job_batch(self, ctx, params): response) -> structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of - String, parameter "terminated_code" of Long + String, parameter "terminated_code" of Long, parameter "batch_id" + of String """ # ctx is the context object # return variables are: returnVal @@ -1275,10 +1316,11 @@ def 
check_jobs(self, ctx, params): the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in milliseconds when the job was created finished - int - timestamp since epoch in milliseconds when the job was finished status - string - status of the job. one of the following: created - job @@ -1295,34 +1337,45 @@ def check_jobs(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' params - the parameters to - pass to the method. 
Optional parameters: app_id - the id of the - Narrative application (UI) running this job (e.g. repo/name) - service_ver - specific version of deployed service, last version - is used if this parameter is not defined source_ws_objects - - denotes the workspace objects that will serve as a source of data - when running the SDK method. These references will be added to the - autogenerated provenance. Must be in UPA format (e.g. 6/90/4). - meta - Narrative metadata to associate with the job. wsid - an - optional workspace id to associate with the job. This is passed to - the workspace service, which will share the job based on the - permissions of the workspace rather than owner of the job - parent_job_id - EE2 job id for the parent of the current job. For - run_job and run_job_concierge, this value can be specified to - denote the parent job of the job being created. Warning: No - checking is done on the validity of the job ID, and the parent job - record is not altered. Submitting a job with a parent ID to - run_job_batch will cause an error to be returned. + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. 
Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. 
For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -1378,7 +1431,8 @@ def check_jobs(self, ctx, params): response) -> structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of - String, parameter "terminated_code" of Long + String, parameter "terminated_code" of Long, parameter "batch_id" + of String """ # ctx is the context object # return variables are: returnVal @@ -1419,10 +1473,11 @@ def check_workspace_jobs(self, ctx, params): the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in milliseconds when the job was created finished - int - timestamp since epoch in milliseconds when the job was finished status - string - status of the job. 
one of the following: created - job @@ -1439,34 +1494,45 @@ def check_workspace_jobs(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' params - the parameters to - pass to the method. Optional parameters: app_id - the id of the - Narrative application (UI) running this job (e.g. repo/name) - service_ver - specific version of deployed service, last version - is used if this parameter is not defined source_ws_objects - - denotes the workspace objects that will serve as a source of data - when running the SDK method. These references will be added to the - autogenerated provenance. Must be in UPA format (e.g. 6/90/4). - meta - Narrative metadata to associate with the job. wsid - an - optional workspace id to associate with the job. This is passed to - the workspace service, which will share the job based on the - permissions of the workspace rather than owner of the job - parent_job_id - EE2 job id for the parent of the current job. For - run_job and run_job_concierge, this value can be specified to - denote the parent job of the job being created. 
Warning: No - checking is done on the validity of the job ID, and the parent job - record is not altered. Submitting a job with a parent ID to - run_job_batch will cause an error to be returned. + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. 
+ repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. 
Note that the job_requirements are not returned along with the rest of the @@ -1522,7 +1588,8 @@ def check_workspace_jobs(self, ctx, params): response) -> structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of - String, parameter "terminated_code" of Long + String, parameter "terminated_code" of Long, parameter "batch_id" + of String """ # ctx is the context object # return variables are: returnVal @@ -1707,10 +1774,11 @@ def check_jobs_date_range_for_user(self, ctx, params): the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in milliseconds when the job was created finished - int - timestamp since epoch in milliseconds when the job was finished status - string - status of the job. one of the following: created - job @@ -1727,34 +1795,45 @@ def check_jobs_date_range_for_user(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. 
stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' params - the parameters to - pass to the method. Optional parameters: app_id - the id of the - Narrative application (UI) running this job (e.g. repo/name) - service_ver - specific version of deployed service, last version - is used if this parameter is not defined source_ws_objects - - denotes the workspace objects that will serve as a source of data - when running the SDK method. These references will be added to the - autogenerated provenance. Must be in UPA format (e.g. 6/90/4). - meta - Narrative metadata to associate with the job. wsid - an - optional workspace id to associate with the job. This is passed to - the workspace service, which will share the job based on the - permissions of the workspace rather than owner of the job - parent_job_id - EE2 job id for the parent of the current job. For - run_job and run_job_concierge, this value can be specified to - denote the parent job of the job being created. Warning: No - checking is done on the validity of the job ID, and the parent job - record is not altered. Submitting a job with a parent ID to - run_job_batch will cause an error to be returned. + accompanies the error code and message #TODO, add these to the + structure? 
condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. 
This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -1810,11 +1889,11 @@ def check_jobs_date_range_for_user(self, ctx, params): response) -> structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of - String, parameter "terminated_code" of Long, parameter "count" of - Long, parameter "query_count" of Long, parameter "filter" of - mapping from String to String, parameter "skip" of Long, parameter - "projection" of list of String, parameter "limit" of Long, - parameter "sort_order" of String + String, parameter "terminated_code" of Long, parameter "batch_id" + of String, parameter "count" of Long, parameter "query_count" of + Long, parameter "filter" of mapping from String to String, + parameter "skip" of Long, parameter "projection" of list of + String, parameter "limit" of Long, parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal @@ -1910,10 +1989,11 @@ def check_jobs_date_range_for_all(self, ctx, params): the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp 
since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in milliseconds when the job was created finished - int - timestamp since epoch in milliseconds when the job was finished status - string - status of the job. one of the following: created - job @@ -1930,34 +2010,45 @@ def check_jobs_date_range_for_all(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - the SDK method to run in module.method format, e.g. - 'KBaseTrees.construct_species_tree' params - the parameters to - pass to the method. Optional parameters: app_id - the id of the - Narrative application (UI) running this job (e.g. 
repo/name) - service_ver - specific version of deployed service, last version - is used if this parameter is not defined source_ws_objects - - denotes the workspace objects that will serve as a source of data - when running the SDK method. These references will be added to the - autogenerated provenance. Must be in UPA format (e.g. 6/90/4). - meta - Narrative metadata to associate with the job. wsid - an - optional workspace id to associate with the job. This is passed to - the workspace service, which will share the job based on the - permissions of the workspace rather than owner of the job - parent_job_id - EE2 job id for the parent of the current job. For - run_job and run_job_concierge, this value can be specified to - denote the parent job of the job being created. Warning: No - checking is done on the validity of the job ID, and the parent job - record is not altered. Submitting a job with a parent ID to - run_job_batch will cause an error to be returned. + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. 
Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. 
For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. job_requirements: the requirements for the job. The user must have full EE2 administration rights to use this parameter. Note that the job_requirements are not returned along with the rest of the @@ -2013,11 +2104,11 @@ def check_jobs_date_range_for_all(self, ctx, params): response) -> structure: parameter "name" of String, parameter "code" of Long, parameter "message" of String, parameter "error" of String, parameter "error_code" of Long, parameter "errormsg" of - String, parameter "terminated_code" of Long, parameter "count" of - Long, parameter "query_count" of Long, parameter "filter" of - mapping from String to String, parameter "skip" of Long, parameter - "projection" of list of String, parameter "limit" of Long, - parameter "sort_order" of String + String, parameter "terminated_code" of Long, parameter "batch_id" + of String, parameter "count" of Long, parameter "query_count" of + Long, parameter "filter" of mapping from String to String, + parameter "skip" of Long, parameter "projection" of list of + String, parameter "limit" of Long, parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index c3b6ca609..6bdb84b3a 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -53,6 +53,7 @@ _REQUIREMENTS_LIST = "requirements_list" _METHOD = "method" _APP_ID = "app_id" +_BATCH_ID = "batch_id" _PARENT_JOB_ID = "parent_job_id" _PARENT_RETRY_JOB_ID = "retry_parent" _RETRY_IDS = "retry_ids" @@ -98,6 +99,7 @@ def _init_job_rec( {_SERVICE_VER} (app version) {_APP_ID} (app UI) 
{_SOURCE_WS_OBJECTS} (collected workspace objects for this app) + {_BATCH_ID} (parent of the job for EE2 batch jobs, the parent should be updated) {_PARENT_JOB_ID} (parent of this job, doesn't update/notify the parent) {_META} (narrative cell information) @@ -158,6 +160,7 @@ def _init_job_rec( parent_retry_job_id = params.get(_PARENT_RETRY_JOB_ID) if parent_retry_job_id: job.retry_parent = str(parent_retry_job_id) + job.batch_id = str(params.get(_BATCH_ID)) if params.get(_BATCH_ID) else None job_id = self.sdkmr.save_job(job) self.sdkmr.get_kafka_client().send_kafka_message( @@ -241,7 +244,10 @@ def _prepare_to_run(self, params, concierge_params=None) -> JobSubmissionParamet AppInfo(params[_METHOD], params.get(_APP_ID)), params[_JOB_REQUIREMENTS], UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), - parent_job_id=params.get(_PARENT_JOB_ID), + # a job should have a parent ID or a batch ID or nothing, but never both + # Do we want to distinguish between the two cases in the sub params? + # It's informational only for Condor + parent_job_id=params.get(_BATCH_ID) or params.get(_PARENT_JOB_ID), wsid=params.get(_WORKSPACE_ID), source_ws_objects=params.get(_SOURCE_WS_OBJECTS), ) @@ -287,7 +293,7 @@ def _abort_child_jobs(self, child_job_ids): # TODO Maybe add a retry here? 
self.logger.error(f"Couldn't cancel child job {e}") - def _create_parent_job(self, wsid, meta): + def _create_batch_job(self, wsid, meta): """ This creates the parent job for all children to mark as their ancestor :param params: @@ -320,11 +326,11 @@ def _create_parent_job(self, wsid, meta): ) return j - def _run_batch(self, parent_job: Job, params): + def _run_batch(self, batch_job: Job, params): child_jobs = [] for job_param in params: - job_param[_PARENT_JOB_ID] = str(parent_job.id) + job_param[_BATCH_ID] = str(batch_job.id) try: child_jobs.append(str(self._run(params=job_param))) except Exception as e: @@ -334,8 +340,8 @@ def _run_batch(self, parent_job: Job, params): self._abort_child_jobs(child_jobs) raise e - parent_job.child_jobs = child_jobs - self.sdkmr.save_job(parent_job) + batch_job.child_jobs = child_jobs + self.sdkmr.save_job(batch_job) return child_jobs @@ -365,12 +371,12 @@ def run_batch( self._check_workspace_permissions_list(wsids) self._add_job_requirements(params, bool(as_admin)) # as_admin checked above - self._check_job_arguments(params, has_parent_job=True) + self._check_job_arguments(params, batch_job=True) - parent_job = self._create_parent_job(wsid=wsid, meta=meta) - children_jobs = self._run_batch(parent_job=parent_job, params=params) + batch_job = self._create_batch_job(wsid=wsid, meta=meta) + children_jobs = self._run_batch(batch_job=batch_job, params=params) - return {_PARENT_JOB_ID: str(parent_job.id), "child_job_ids": children_jobs} + return {_BATCH_ID: str(batch_job.id), "child_job_ids": children_jobs} # modifies the jobs in place def _add_job_requirements(self, jobs: List[Dict[str, Any]], is_write_admin: bool): @@ -464,7 +470,7 @@ def _rethrow_incorrect_params_with_error_prefix( raise error raise IncorrectParamsException(f"{error_prefix}{error.args[0]}") from error - def _check_job_arguments(self, jobs, has_parent_job=False): + def _check_job_arguments(self, jobs, batch_job=False): # perform sanity checks before creating any jobs, 
including the parent job for batch jobs for i, job in enumerate(jobs): # Could make an argument checker method, or a class that doesn't require a job id. @@ -482,7 +488,7 @@ def _check_job_arguments(self, jobs, has_parent_job=False): ) except IncorrectParamsException as e: self._rethrow_incorrect_params_with_error_prefix(e, pre) - if has_parent_job and job.get(_PARENT_JOB_ID): + if batch_job and job.get(_PARENT_JOB_ID): raise IncorrectParamsException( f"{pre}batch jobs may not specify a parent job ID" ) @@ -535,12 +541,11 @@ def _validate_retry_presubmit(self, job_id: str, as_admin: bool = False): job = self.sdkmr.get_job_with_permission( job_id, JobPermissions.WRITE, as_admin=as_admin ) # type: Job - job_input = job.job_input # type: JobInput - parent_job = None - if job_input.parent_job_id: - parent_job = self.sdkmr.get_job_with_permission( - job_input.parent_job_id, JobPermissions.WRITE, as_admin=as_admin + batch_job = None + if job.batch_id: + batch_job = self.sdkmr.get_job_with_permission( + job.batch_id, JobPermissions.WRITE, as_admin=as_admin ) if job.batch_job: @@ -553,9 +558,9 @@ def _validate_retry_presubmit(self, job_id: str, as_admin: bool = False): f"Error retrying job {job_id} with status {job.status}: can only retry jobs with status 'error' or 'terminated'" ) - return job, parent_job + return job, batch_job - def _retry(self, job_id: str, job: Job, parent_job: Job, as_admin: bool = False): + def _retry(self, job_id: str, job: Job, batch_job: Job, as_admin: bool = False): # Cannot retry a retried job, you must retry the retry_parent if job.retry_parent: return self.retry(str(job.retry_parent), as_admin=as_admin) @@ -569,13 +574,15 @@ def _retry(self, job_id: str, job: Job, parent_job: Job, as_admin: bool = False) retry_job_id = self.run(params=run_job_params, as_admin=as_admin) # Save that the job has been retried, and increment the count. Notify the parent(s) - # 1) Notify the parent container that it has a new child.. 
- if parent_job: + # 1) Notify the batch container that it has a new child. Note that the parent jobs of + # 'manual' batch jobs using the job_input.parent_job_id field *are not* modified to + # include their children, so we don't do that here either. + if batch_job: try: - parent_job.modify(add_to_set__child_jobs=retry_job_id) + batch_job.modify(add_to_set__child_jobs=retry_job_id) except Exception as e: self._db_update_failure( - job_that_failed_operation=str(parent_job.id), + job_that_failed_operation=str(batch_job.id), job_to_abort=retry_job_id, exception=e, ) @@ -611,11 +618,11 @@ def retry(self, job_id: str, as_admin=False) -> Dict[str, Optional[str]]: :param as_admin: Run with admin permission :return: The child job id that has been retried """ - job, parent_job = self._validate_retry_presubmit( + job, batch_job = self._validate_retry_presubmit( job_id=job_id, as_admin=as_admin ) return self._retry( - job_id=job_id, job=job, parent_job=parent_job, as_admin=as_admin + job_id=job_id, job=job, batch_job=batch_job, as_admin=as_admin ) def retry_multiple( @@ -646,14 +653,14 @@ def retry_multiple( # Check all inputs before attempting to start submitting jobs retried_jobs = [] jobs = [] - parent_jobs = [] + batch_jobs = [] for job_id in job_ids: try: - job, parent_job = self._validate_retry_presubmit( + job, batch_job = self._validate_retry_presubmit( job_id=job_id, as_admin=as_admin ) jobs.append(job) - parent_jobs.append(parent_job) + batch_jobs.append(batch_job) except Exception as e: raise RetryFailureException(e) @@ -664,7 +671,7 @@ def retry_multiple( self._retry( job_id=job_id, job=jobs[i], - parent_job=parent_jobs[i], + batch_job=batch_jobs[i], as_admin=as_admin, ) ) @@ -810,6 +817,8 @@ def get_job_params(self, job_id, as_admin=False): job_params["service_ver"] = job_input.service_ver job_params[_APP_ID] = job_input.app_id job_params[_WORKSPACE_ID] = job_input.wsid + # This is specfically the data in the job params, which includes any manually submitted + # 
parent job information but does not include batch job information job_params[_PARENT_JOB_ID] = job_input.parent_job_id job_params[_SOURCE_WS_OBJECTS] = job_input.source_ws_objects diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index 0a4b01b26..f26f9ce56 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -474,9 +474,7 @@ def check_jobs( del mongo_rec["_id"] mongo_rec["retry_count"] = len(job["retry_ids"]) mongo_rec["job_id"] = str(job.id) - mongo_rec["parent_job_id"] = ( - job.job_input.parent_job_id if job.job_input else None - ) + mongo_rec["batch_id"] = job.batch_id mongo_rec["created"] = int(job.id.generation_time.timestamp() * 1000) mongo_rec["updated"] = int(job.updated * 1000) if job.estimating: @@ -560,14 +558,17 @@ def _send_exec_stats_to_catalog(self, job_id): self.sdkmr.get_catalog().log_exec_stats(log_exec_stats_params) - def abandon_children(self, parent_job_id, child_job_ids, as_admin=False) -> Dict: - if not parent_job_id: - raise ValueError("Please provide valid parent_job id") + def abandon_children(self, batch_id, child_job_ids, as_admin=False) -> Dict: + # Note this does not work for 'manual' batch jobs as the parent job is + # never updated with the child jobs. It will only work with batch jobs specifically + # created by the run_job_batch endpoint. 
+ if not batch_id: + raise ValueError("Please provide valid batch_id") if not child_job_ids: raise ValueError("Please provide job_ids of children to abandon") job = self.sdkmr.get_job_with_permission( - parent_job_id, JobPermissions.WRITE, as_admin=as_admin + batch_id, JobPermissions.WRITE, as_admin=as_admin ) # type: Job for child_job_id in child_job_ids: if child_job_id not in job.child_jobs: @@ -575,11 +576,10 @@ def abandon_children(self, parent_job_id, child_job_ids, as_admin=False) -> Dict f"Couldn't find {child_job_id} in {child_job_ids}" ) - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - job.update(pull_all__child_jobs=child_job_ids) - job.reload() + job.update(pull_all__child_jobs=child_job_ids) + job.reload() - return {"parent_job_id": parent_job_id, "child_jobs": job.child_jobs} + return {"batch_id": batch_id, "child_job_ids": job.child_jobs} def start_job(self, job_id, skip_estimation=True, as_admin=False): """ diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index e3c692e9c..5091b2fee 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -358,10 +358,10 @@ def start_job(self, job_id, skip_estimation=True, as_admin=False): ) # Endpoints: Changing a job's status - def abandon_children(self, parent_job_id, child_job_ids, as_admin=False): + def abandon_children(self, batch_id, child_job_ids, as_admin=False): """Authorization Required Read/Write""" return self.get_jobs_status().abandon_children( - parent_job_id=parent_job_id, child_job_ids=child_job_ids, as_admin=as_admin + batch_id=batch_id, child_job_ids=child_job_ids, as_admin=as_admin ) def update_job_status(self, job_id, status, as_admin=False): @@ -433,7 +433,7 @@ def get_job_status_field(self, job_id, as_admin=False): def check_job_batch( self, - parent_job_id, + batch_id, check_permission=True, exclude_fields=None, as_admin=False, @@ -448,7 +448,7 @@ def check_job_batch( 
raise ValueError("You can't exclude child jobs from this endpoint") parent_job_status = self.get_jobs_status().check_job( - job_id=parent_job_id, + job_id=batch_id, check_permission=check_permission, exclude_fields=exclude_fields, ) @@ -462,7 +462,7 @@ def check_job_batch( return_list=1, )["job_states"] return { - "parent_jobstate": parent_job_status, + "batch_jobstate": parent_job_status, "child_jobstates": child_job_states, } diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index 5ca063a30..d02a812a2 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -1202,7 +1202,7 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): } ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) - parent_job_id = ret["parent_job_id"] + batch_id = ret["batch_id"] job_id_1, job_id_2 = ret["child_job_ids"] # check that mocks were called correctly @@ -1228,12 +1228,12 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "user": USER_NO_ADMIN, "authstrat": "kbaseworkspace", "status": "queued", + "batch_id": batch_id, "job_input": { "method": _MOD, "params": [{"foo": "bar"}, 42], "service_ver": "somehash", "source_ws_objects": ["1/1/1", "1/2/1"], - "parent_job_id": parent_job_id, "requirements": { "clientgroup": "njs", "cpu": 8, @@ -1262,6 +1262,7 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "authstrat": "kbaseworkspace", "wsid": 1, "status": "queued", + "batch_id": batch_id, "job_input": { "wsid": 1, "method": "mod2.meth2", @@ -1269,7 +1270,6 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "service_ver": "somehash2", "app_id": "mod2/app2", "source_ws_objects": [], - "parent_job_id": parent_job_id, "requirements": { "clientgroup": "bigmem", "cpu": 4, @@ -1287,9 +1287,9 @@ def test_run_job_batch(ee2_port, ws_controller, 
mongo_client): } assert job2 == expected_job2 - parent_job = _get_mongo_job(mongo_client, parent_job_id, has_queued=False) + parent_job = _get_mongo_job(mongo_client, batch_id, has_queued=False) expected_parent_job = { - "_id": ObjectId(parent_job_id), + "_id": ObjectId(batch_id), "user": USER_NO_ADMIN, "authstrat": "kbaseworkspace", "wsid": 2, @@ -1321,7 +1321,7 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): cpu=8, mem=5, disk=30, - parent_job_id=parent_job_id, + parent_job_id=batch_id, app_id=None, app_module=None, ) @@ -1334,7 +1334,7 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): cpu=4, mem=2000, disk=100, - parent_job_id=parent_job_id, + parent_job_id=batch_id, ) expected_sub_2.update( { @@ -1398,7 +1398,7 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli job_batch_params = {"wsid": 1, "as_admin": "foo"} ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_WRITE_ADMIN) ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) - parent_job_id = ret["parent_job_id"] + batch_id = ret["batch_id"] job_id_1, job_id_2 = ret["child_job_ids"] # check that mocks were called correctly @@ -1424,11 +1424,11 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli "user": USER_WRITE_ADMIN, "authstrat": "kbaseworkspace", "status": "queued", + "batch_id": batch_id, "job_input": { "method": _MOD, "service_ver": "somehash", "source_ws_objects": [], - "parent_job_id": parent_job_id, "requirements": { "clientgroup": "bigmem", "cpu": 4, @@ -1451,11 +1451,11 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli "user": USER_WRITE_ADMIN, "authstrat": "kbaseworkspace", "status": "queued", + "batch_id": batch_id, "job_input": { "method": "mod2.meth2", "service_ver": "somehash2", "source_ws_objects": [], - "parent_job_id": parent_job_id, "requirements": { "clientgroup": "extreme", "cpu": 32, @@ -1473,9 +1473,9 @@ def 
test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli } assert job2 == expected_job2 - parent_job = _get_mongo_job(mongo_client, parent_job_id, has_queued=False) + parent_job = _get_mongo_job(mongo_client, batch_id, has_queued=False) expected_parent_job = { - "_id": ObjectId(parent_job_id), + "_id": ObjectId(batch_id), "user": USER_WRITE_ADMIN, "authstrat": "kbaseworkspace", "wsid": 1, @@ -1502,7 +1502,7 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli cpu=4, mem=2000, disk=100, - parent_job_id=parent_job_id, + parent_job_id=batch_id, app_id=None, app_module=None, ) @@ -1515,7 +1515,7 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli cpu=32, mem=42, disk=8, - parent_job_id=parent_job_id, + parent_job_id=batch_id, app_id=None, app_module=None, ) diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 20fc3debd..04680b284 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -123,6 +123,7 @@ def _create_job( app=_APP, state=_CREATED_STATE, git_commit=_GIT_COMMIT, + batch_id=None, parent_job_id=None, source_ws_objects=None, wsid=None, @@ -131,6 +132,7 @@ def _create_job( job.user = user job.status = state job.wsid = wsid + job.batch_id = batch_id ji = JobInput() ji.method = method ji.app_id = app @@ -177,7 +179,7 @@ def _set_up_common_return_values(mocks): mocks[MongoUtil].get_job.return_value = retjob -def _check_common_mock_calls(mocks, reqs, wsid, app=_APP): +def _check_common_mock_calls(mocks, reqs, wsid, app=_APP, parent_job_id=None): """ Check that mocks are called as expected when those calls are similar or the same for several tests. 
@@ -193,7 +195,11 @@ def _check_common_mock_calls(mocks, reqs, wsid, app=_APP): # initial job data save expected_job = _create_job( - reqs, app=app, wsid=wsid, source_ws_objects=[_WS_REF_1, _WS_REF_2] + reqs, + app=app, + wsid=wsid, + parent_job_id=parent_job_id, + source_ws_objects=[_WS_REF_1, _WS_REF_2], ) assert len(sdkmr.save_job.call_args_list) == 2 got_job = sdkmr.save_job.call_args_list[0][0][0] @@ -206,6 +212,7 @@ def _check_common_mock_calls(mocks, reqs, wsid, app=_APP): reqs, UserCreds(_USER, _TOKEN), wsid=wsid, + parent_job_id=parent_job_id, source_ws_objects=[_WS_REF_1, _WS_REF_2], ) mocks[Condor].run_job.assert_called_once_with(params=jsp_expected) @@ -316,7 +323,7 @@ def test_run_job(): _check_common_mock_calls(mocks, reqs, None, _APP) -def test_run_job_as_admin_with_job_requirements(): +def test_run_job_as_admin_with_job_requirements_and_parent_job(): """ A basic unit test of the run() method with an administrative user and job requirements. @@ -325,6 +332,8 @@ def test_run_job_as_admin_with_job_requirements(): metadata, etc. Does not include an app_id. + + Does include a parent job id. 
""" # Set up data variables @@ -382,6 +391,7 @@ def test_run_job_as_admin_with_job_requirements(): "method": _METHOD, "source_ws_objects": [_WS_REF_1, _WS_REF_2], "job_requirements": inc_reqs, + "parent_job_id": "thisislikesoooofake", } assert rj.run(params, as_admin=True) == _JOB_ID @@ -392,7 +402,9 @@ def test_run_job_as_admin_with_job_requirements(): jrr.resolve_requirements.assert_called_once_with( _METHOD, mocks[CatalogCache], **req_args ) - _check_common_mock_calls(mocks, reqs, None, None) + _check_common_mock_calls( + mocks, reqs, None, None, parent_job_id="thisislikesoooofake" + ) def test_run_job_as_concierge_with_wsid(): @@ -812,7 +824,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): app=_APP_1, git_commit=_GIT_COMMIT_1, source_ws_objects=[_WS_REF_1, _WS_REF_2], - parent_job_id=_JOB_ID, + batch_id=_JOB_ID, ) got_job_1 = sdkmr.save_job.call_args_list[0][0][0] assert_jobs_equal(got_job_1, expected_job_1) @@ -823,7 +835,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): app=_APP_2, git_commit=_GIT_COMMIT_2, wsid=wsid, - parent_job_id=_JOB_ID, + batch_id=_JOB_ID, ) # index 2 because job 1 is updated with save_job before this job is created got_job_2 = sdkmr.save_job.call_args_list[2][0][0] @@ -957,7 +969,7 @@ def test_run_job_batch_with_parent_job_wsid(): }, ] assert rj.run_batch(params, {"wsid": parent_wsid}) == { - "parent_job_id": _JOB_ID, + "batch_id": _JOB_ID, "child_job_ids": [_JOB_ID_1, _JOB_ID_2], } @@ -1066,7 +1078,7 @@ def test_run_job_batch_as_admin_with_job_requirements(): }, ] assert rj.run_batch(params, {}, as_admin=True) == { - "parent_job_id": _JOB_ID, + "batch_id": _JOB_ID, "child_job_ids": [_JOB_ID_1, _JOB_ID_2], } @@ -1324,6 +1336,7 @@ def assert_jobs_equal(got_job: Job, expected_job: Job): "condor_job_ads", "child_jobs", "batch_job", + "batch_id", ] _assert_field_subset_equal(got_job, expected_job, job_fields) diff --git 
a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 8e29def37..74b7a61ba 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -524,7 +524,7 @@ def test_run_job_batch(self, rq_mock, condor_mock): jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) - assert "parent_job_id" in job_ids and isinstance(job_ids["parent_job_id"], str) + assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) assert "child_job_ids" in job_ids and isinstance(job_ids["child_job_ids"], list) assert len(job_ids["child_job_ids"]) == len(jobs) @@ -538,17 +538,17 @@ def test_run_job_batch(self, rq_mock, condor_mock): runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) # Squeeze in a retry test here - parent_job_id = job_ids["parent_job_id"] + batch_id = job_ids["batch_id"] child_job_id = job_ids["child_job_ids"][0] runner.update_job_status(job_id=child_job_id, status=Status.terminated.value) - parent_job = runner.check_job(job_id=parent_job_id) - assert len(parent_job["child_jobs"]) == 3 + batch_job = runner.check_job(job_id=batch_id) + assert len(batch_job["child_jobs"]) == 3 retry_id = runner.retry(job_id=child_job_id)["retry_id"] self.check_retry_job_state(child_job_id, retry_id) - parent_job = runner.check_job(job_id=parent_job_id) - assert len(parent_job["child_jobs"]) == 4 - assert parent_job["child_jobs"][-1] == retry_id + batch_job = runner.check_job(job_id=batch_id) + assert len(batch_job["child_jobs"]) == 4 + assert batch_job["child_jobs"][-1] == retry_id job = runner.check_job(job_id=child_job_id) retry_count = job["retry_count"] diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index e4b845808..5b9b80a98 100644 --- 
a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -108,7 +108,7 @@ def test_check_job(self, rq_mock, condor_mock): "source_ws_objects": [], "wsid": 9999, }, - "parent_job_id": None, + "batch_id": None, "queued": 1623781529017, "retry_count": 0, "retry_ids": [], @@ -236,13 +236,13 @@ def test_cancel_job_batch(self, rq_mock, condor_mock): jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) - assert "parent_job_id" in job_ids and isinstance(job_ids["parent_job_id"], str) + assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) assert "child_job_ids" in job_ids and isinstance(job_ids["child_job_ids"], list) assert len(job_ids["child_job_ids"]) == len(jobs) - runner.cancel_job(job_id=job_ids["parent_job_id"]) + runner.cancel_job(job_id=job_ids["batch_id"]) job_status = runner.check_jobs( - job_ids=[job_ids["parent_job_id"]] + job_ids["child_job_ids"] + job_ids=[job_ids["batch_id"]] + job_ids["child_job_ids"] ) for job in job_status["job_states"]: assert job["status"] == Status.terminated.value @@ -265,18 +265,20 @@ def test_abandon_children(self, rq_mock, condor_mock): jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) - assert "parent_job_id" in job_ids and isinstance(job_ids["parent_job_id"], str) + assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) assert "child_job_ids" in job_ids and isinstance(job_ids["child_job_ids"], list) assert len(job_ids["child_job_ids"]) == len(jobs) - runner.abandon_children( - parent_job_id=job_ids["parent_job_id"], + res = runner.abandon_children( + batch_id=job_ids["batch_id"], child_job_ids=job_ids["child_job_ids"][0:2], ) + assert res == { + "batch_id": job_ids["batch_id"], + "child_job_ids": job_ids["child_job_ids"][2:], + } - job_status = runner.check_jobs(job_ids=[job_ids["parent_job_id"]])[ - "job_states" - 
][0] + job_status = runner.check_jobs(job_ids=[job_ids["batch_id"]])["job_states"][0] for job_id in job_ids["child_job_ids"][0:2]: assert job_id not in job_status["child_jobs"] @@ -301,10 +303,10 @@ def test_check_job_batch(self, rq_mock, condor_mock): jobs = [job, job, job] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) - job_status = runner.check_job_batch(parent_job_id=job_ids["parent_job_id"]) - parent_job_state = job_status["parent_jobstate"] + job_status = runner.check_job_batch(batch_id=job_ids["batch_id"]) + batch_jobstate = job_status["batch_jobstate"] child_jobstates = job_status["child_jobstates"] assert len(child_jobstates) == len(jobs) for child_job in child_jobstates: - assert child_job["job_id"] in parent_job_state.get("child_jobs") + assert child_job["job_id"] in batch_jobstate.get("child_jobs") diff --git a/test/tests_for_sdkmr/ee2_retry_test.py b/test/tests_for_sdkmr/ee2_retry_test.py index fabddec5c..c567227e3 100644 --- a/test/tests_for_sdkmr/ee2_retry_test.py +++ b/test/tests_for_sdkmr/ee2_retry_test.py @@ -60,12 +60,12 @@ def test_retry_db_failures(): # One DB failure rj._db_update_failure = MagicMock(side_effect=Exception("Boom!")) with raises(Exception): - rj._retry(job_id=retry_job.id, job=retry_job, parent_job=parent_job) + rj._retry(job_id=retry_job.id, job=retry_job, batch_job=parent_job) assert rj._db_update_failure.call_count == 1 # Two db failures rj._db_update_failure = MagicMock() - rj._retry(job_id=retry_job.id, job=retry_job, parent_job=parent_job) + rj._retry(job_id=retry_job.id, job=retry_job, batch_job=parent_job) assert not retry_job.retry_saved_toggle From 553d52b05032c6e717e033e28b4ffdd78d992160 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 15 Jul 2021 12:43:56 -0500 Subject: [PATCH 093/109] DATAUP-528 Refactor preflight and wsids (#402) * First preflight light at the end of the tunnel * Did we finally beat the tests? * Did we finally beat the tests? 
* The path * Fixing up * Fixing up * Pr * Added coverage for condor * Added coverage * Added an exception * Added an exception * Stash * Stash * pr feedback * Add test * Readded exception, removed tests code * Forgot the else Co-authored-by: bio-boris --- lib/execution_engine2/exceptions.py | 26 +- lib/execution_engine2/sdk/EE2Runjob.py | 141 ++++++++-- .../sdk/job_submission_parameters.py | 3 + lib/execution_engine2/utils/Condor.py | 2 +- test/tests_for_integration/api_to_db_test.py | 257 +++++++++++++++++- test/tests_for_sdkmr/EE2Runjob_test.py | 85 +++++- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 89 ++++-- ...ee2_SDKMethodRunner_test_EE2Status_test.py | 18 +- 8 files changed, 538 insertions(+), 83 deletions(-) diff --git a/lib/execution_engine2/exceptions.py b/lib/execution_engine2/exceptions.py index de471d4c8..13961697e 100644 --- a/lib/execution_engine2/exceptions.py +++ b/lib/execution_engine2/exceptions.py @@ -1,19 +1,33 @@ class ExecutionEngineValueError(ValueError): - """Base Class for ee2 exceptions""" + """ + Base Class for ee2 value exceptions + Subclass exceptions use docstring as default message + """ - pass + def __init__(self, msg=None, *args, **kwargs): + super().__init__(msg or self.__doc__, *args, **kwargs) class ExecutionEngineException(Exception): - pass + """ + Base Class for ee2 exceptions + Subclass exceptions use docstring as default message + """ + + def __init__(self, msg=None, *args, **kwargs): + super().__init__(msg or self.__doc__, *args, **kwargs) class IncorrectParamsException(ExecutionEngineValueError): - pass + """Wrong parameters were provided""" + + +class InvalidParameterForBatch(ExecutionEngineValueError): + """Workspace ids are not allowed in RunJobParams in Batch Mode""" class MissingRunJobParamsException(ExecutionEngineValueError): - """Missing a required run_job_parameter""" + """Provided an empty (RunJobParams) parameter mapping""" class InvalidStatusTransitionException(ExecutionEngineValueError): @@ -21,7 +35,7 @@ 
class InvalidStatusTransitionException(ExecutionEngineValueError): class InvalidOperationForStatusException(ExecutionEngineValueError): - pass + """The current operation is not valid for this job status""" class MissingCondorRequirementsException(ExecutionEngineValueError): diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 6bdb84b3a..3fb17588e 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -24,6 +24,7 @@ AuthError, CannotRetryJob, RetryFailureException, + InvalidParameterForBatch, ) from execution_engine2.sdk.EE2Constants import CONCIERGE_CLIENTGROUP from execution_engine2.sdk.job_submission_parameters import ( @@ -195,15 +196,20 @@ def _check_workspace_permissions(self, wsid): ) def _check_workspace_permissions_list(self, wsids): - perms = self.sdkmr.get_workspace_auth().can_write_list(wsids) - bad_ws = [key for key in perms.keys() if perms[key] is False] - if bad_ws: - self.logger.debug( - f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {bad_ws}." - ) - raise PermissionError( - f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {bad_ws}." + # TODO Cover this in tests once you can execute multiple independent runs + unique_not_none_not_zero_wsids = [wsid for wsid in set(wsids) if wsid] + if unique_not_none_not_zero_wsids: + perms = self.sdkmr.get_workspace_auth().can_write_list( + unique_not_none_not_zero_wsids ) + bad_ws = [key for key in perms.keys() if perms[key] is False] + if bad_ws: + self.logger.debug( + f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {bad_ws}." + ) + raise PermissionError( + f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {bad_ws}." 
+ ) def _finish_created_job( self, job_id, exception, error_code=None, error_message=None @@ -271,7 +277,7 @@ def _run(self, params): self._finish_created_job(exception=submission_info.error, job_id=job_id) raise submission_info.error if condor_job_id is None: - error_msg = "Condor job not ran, and error not found. Something went wrong" + error_msg = "Condor job not run, and error not found. Something went wrong" self._finish_created_job(job_id=job_id, exception=RuntimeError(error_msg)) raise RuntimeError(error_msg) @@ -349,26 +355,28 @@ def run_batch( self, params, batch_params, as_admin=False ) -> Dict[str, Union[Job, List[str]]]: """ + Warning: modifies params in place :param params: List of RunJobParams (See Spec File) - :param batch_params: List of Batch Params, such as wsid (See Spec file) + :param batch_params: Mapping of Batch Params, such as {wsid, as_admin} (See Spec file) :param as_admin: Allows you to run jobs in other people's workspaces :return: A list of condor job ids or a failure notification """ + if type(params) != list: raise IncorrectParamsException("params must be a list") + + if type(batch_params) != dict: + raise IncorrectParamsException("batch params must be a mapping") + wsid = batch_params.get(_WORKSPACE_ID) meta = batch_params.get(_META) - if as_admin: - self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) - else: - # Make sure you aren't running a job in someone elses workspace - self._check_workspace_permissions(wsid) - # this is very odd. Why check the parent wsid again if there's no wsid in the job? - # also, what if the parent wsid is None? - # also also, why not just put all the wsids in one list and make one ws call? 
- wsids = [job_input.get(_WORKSPACE_ID, wsid) for job_input in params] - self._check_workspace_permissions_list(wsids) + self._preflight( + runjob_params=params, + batch_params=batch_params, + new_batch_job=True, + as_admin=as_admin, + ) self._add_job_requirements(params, bool(as_admin)) # as_admin checked above self._check_job_arguments(params, batch_job=True) @@ -709,20 +717,102 @@ def _get_run_job_params_from_existing_job(job: Job, user_id: str) -> Dict: # Then the next fields are job inputs top level requirements, app run parameters, and scheduler resource requirements return run_job_params + def _check_ws_perms( + self, + runjob_params: Union[dict, list], + new_batch_job: bool, + batch_params: dict, + as_admin: bool = False, + ): + """ + Check a single job, a single batch job, or a retry_multiple request with a mix of different jobs. + """ + if as_admin: + return self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) + # Batch Param runs + if new_batch_job: + if batch_params: + return self._check_workspace_permissions(batch_params.get("wsid")) + # Single job runs + elif isinstance(runjob_params, dict): + return self._check_workspace_permissions(runjob_params.get("wsid")) + # Multiple independent job runs, think retry_multiple() + elif isinstance(runjob_params, list): + return self._check_workspace_permissions_list( + [job_param.get("wsid") for job_param in runjob_params] + ) + else: + raise IncorrectParamsException( + "Runjob params must be an instance of a dict, or a list of dicts" + ) + + @staticmethod + def _propagate_wsid_for_new_batch_jobs( + runjob_params: dict, batch_params: dict, new_batch_job: bool + ): + """ + For batch jobs, check to make sure the job params do not provide a wsid other than None + Then Modify the run job params to use the batch params wsid, which may be set to None + """ + if new_batch_job: + batch_wsid = batch_params.get("wsid") if batch_params else None + for runjob_param in runjob_params: + if runjob_param.get("wsid") is 
not None: + raise InvalidParameterForBatch() + # Do we do a deepcopy here in case the params point to the same obj? + runjob_param["wsid"] = batch_wsid + + def _preflight( + self, + runjob_params: Union[dict, list], + batch_params: dict = None, + new_batch_job: bool = False, + as_admin: bool = False, + ) -> None: + """ + Propagate and check ws permissions for job(s) + :param runjob_params: List of RunJobParams or a single RunJobParams mapping + :param batch_params: Optional mapping for Batch Jobs + :param new_batch_job: Whether or not this is a new batch job + :param as_admin: For checking ws permissions as an admin or not + """ + if batch_params and not new_batch_job: + raise IncorrectParamsException( + "Programming error, you forgot to set the new_batch_job flag to True" + ) + if batch_params == runjob_params: + raise IncorrectParamsException( + "RunJobParams and BatchParams cannot be identical" + ) + + self._propagate_wsid_for_new_batch_jobs( + runjob_params=runjob_params, + batch_params=batch_params, + new_batch_job=new_batch_job, + ) + self._check_ws_perms( + runjob_params=runjob_params, + new_batch_job=new_batch_job, + batch_params=batch_params, + as_admin=as_admin, + ) + def run( self, params=None, as_admin=False, concierge_params: Dict = None ) -> Optional[str]: """ - :param params: SpecialRunJobParamsParams object (See spec file) + Warning: modifies params in place :param params: RunJobParams object (See spec file) :param as_admin: Allows you to run jobs in other people's workspaces :param concierge_params: Allows you to specify request_cpu, request_memory, request_disk, clientgroup :return: The condor job id """ - if as_admin: - self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) - else: - self._check_workspace_permissions(params.get(_WORKSPACE_ID)) + + # TODO Test this + if type(params) != dict: + raise IncorrectParamsException("params must be a mapping") + + self._preflight(runjob_params=params, as_admin=as_admin) if concierge_params: 
self.sdkmr.check_as_concierge() @@ -734,7 +824,6 @@ def run( # as_admin checked above self._add_job_requirements([params], bool(as_admin)) self._check_job_arguments([params]) - return self._run(params=params) def _get_job_reqs_from_concierge_params( diff --git a/lib/execution_engine2/sdk/job_submission_parameters.py b/lib/execution_engine2/sdk/job_submission_parameters.py index 2a720920e..90bc58d73 100644 --- a/lib/execution_engine2/sdk/job_submission_parameters.py +++ b/lib/execution_engine2/sdk/job_submission_parameters.py @@ -252,3 +252,6 @@ def __eq__(self, other): def __hash__(self): return hash(self._params()) + + def __repr__(self): + return str(self._params()) diff --git a/lib/execution_engine2/utils/Condor.py b/lib/execution_engine2/utils/Condor.py index 882509b8d..71289b550 100644 --- a/lib/execution_engine2/utils/Condor.py +++ b/lib/execution_engine2/utils/Condor.py @@ -209,7 +209,7 @@ def run_job(self, params: JobSubmissionParameters) -> SubmissionInfo: return SubmissionInfo(None, sub, e) def get_job_resource_info( - self, job_id: Optional[str] = None, cluster_id: Optional[str] = None + self, job_id: str = None, cluster_id: str = None ) -> Dict[str, str]: if job_id is not None and cluster_id is not None: raise Exception("Use only batch name (job_id) or cluster_id, not both") diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index d02a812a2..ca67c539f 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -39,6 +39,7 @@ from bson import ObjectId from pytest import fixture, raises +from execution_engine2.exceptions import InvalidParameterForBatch from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE from installed_clients.WorkspaceClient import Workspace from installed_clients.baseclient import ServerError @@ -453,6 +454,7 @@ def _get_condor_sub_for_rj_param_set( parent_job_id="totallywrongid", app_id=_APP, 
app_module="mod", + wsid=1, ): expected_sub = _get_common_sub(job_id) expected_sub.update( @@ -464,7 +466,7 @@ def _get_condor_sub_for_rj_param_set( "+KB_FUNCTION_NAME": '"meth"', "+KB_APP_ID": f'"{app_id}"' if app_id else "", "+KB_APP_MODULE_NAME": f'"{app_module}"' if app_module else "", - "+KB_WSID": '"1"', + "+KB_WSID": f'"{wsid}"', "+KB_SOURCE_WS_OBJECTS": '"1/1/1,1/2/1"', "request_cpus": f"{cpu}", "request_memory": f"{mem}MB", @@ -1187,11 +1189,11 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): job2_params = { "method": "mod2.meth2", "app_id": "mod2/app2", - "wsid": 1, "params": [{"baz": "bat"}, 3.14], } + job_batch_wsid = 2 job_batch_params = { - "wsid": 2, + "wsid": job_batch_wsid, "meta": { "run_id": "rid2", "token_id": "tid2", @@ -1230,6 +1232,7 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "status": "queued", "batch_id": batch_id, "job_input": { + "wsid": job_batch_wsid, "method": _MOD, "params": [{"foo": "bar"}, 42], "service_ver": "somehash", @@ -1247,6 +1250,7 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "cell_id": "cid", }, }, + "wsid": job_batch_wsid, "child_jobs": [], "retry_ids": [], "retry_saved_toggle": False, @@ -1254,17 +1258,227 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "scheduler_id": "123", "scheduler_type": "condor", } + + assert job1 == expected_job1 + + expected_job2 = { + "_id": ObjectId(job_id_2), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": job_batch_wsid, + "status": "queued", + "batch_id": batch_id, + "job_input": { + "wsid": job_batch_wsid, + "method": "mod2.meth2", + "params": [{"baz": "bat"}, 3.14], + "service_ver": "somehash2", + "app_id": "mod2/app2", + "source_ws_objects": [], + "requirements": { + "clientgroup": "bigmem", + "cpu": 4, + "memory": 2000, + "disk": 100, + }, + "narrative_cell_info": {}, + }, + "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, + "batch_job": False, + "scheduler_id": "456", + 
"scheduler_type": "condor", + } + assert job2 == expected_job2 + + parent_job = _get_mongo_job(mongo_client, batch_id, has_queued=False) + expected_parent_job = { + "_id": ObjectId(batch_id), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": job_batch_wsid, + "status": "created", + "job_input": { + "method": "batch", + "service_ver": "batch", + "app_id": "batch", + "source_ws_objects": [], + "narrative_cell_info": { + "run_id": "rid2", + "token_id": "tid2", + "tag": "yourit2", + "cell_id": "cid2", + }, + }, + "child_jobs": [job_id_1, job_id_2], + "batch_job": True, + "retry_ids": [], + "retry_saved_toggle": False, + } + assert parent_job == expected_parent_job + + expected_sub_1 = _get_condor_sub_for_rj_param_set( + job_id_1, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="njs", + cpu=8, + mem=5, + disk=30, + parent_job_id=batch_id, + app_id=None, + app_module=None, + wsid=job_batch_wsid, + ) + expected_sub_1["+KB_WSID"] = f'"{job_batch_wsid}"' + expected_sub_2 = _get_condor_sub_for_rj_param_set( + job_id_2, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="bigmem", + cpu=4, + mem=2000, + disk=100, + parent_job_id=batch_id, + wsid=job_batch_wsid, + ) + expected_sub_2.update( + { + "+KB_MODULE_NAME": '"mod2"', + "+KB_FUNCTION_NAME": '"meth2"', + "+KB_APP_ID": '"mod2/app2"', + "+KB_APP_MODULE_NAME": '"mod2"', + "+KB_SOURCE_WS_OBJECTS": "", + } + ) + _check_batch_htc_calls( + sub_init, schedd_init, sub, schedd, txn, expected_sub_1, expected_sub_2 + ) + + +def test_run_job_batch_with_no_batch_wsid(ee2_port, ws_controller, mongo_client): + """ + A test of the run_job method. + """ + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "foo") # ws 1 + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "bar") # ws 2 + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? 
+ with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.side_effect = [123, 456] + list_cgroups.side_effect = [ + [{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + [{"client_groups": ['{"client_group":"bigmem"}']}], + ] + get_mod_ver.side_effect = [ + {"git_commit_hash": "somehash"}, + {"git_commit_hash": "somehash2"}, + ] + + # run the method + job1_params = { + "method": _MOD, + "source_ws_objects": ["1/1/1", "1/2/1"], + "params": [{"foo": "bar"}, 42], + "service_ver": "beta", + "meta": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + "thiskey": "getssilentlydropped", + }, + } + job2_params = { + "method": "mod2.meth2", + "app_id": "mod2/app2", + "params": [{"baz": "bat"}, 3.14], + } + + job_batch_params = { + "meta": { + "run_id": "rid2", + "token_id": "tid2", + "tag": "yourit2", + "cell_id": "cid2", + "thiskey": "getssilentlydropped2", + }, + } + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) + ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) + batch_id = ret["batch_id"] + job_id_1, job_id_2 = ret["child_job_ids"] + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "version": "beta"}), + call(ANY, {"module_name": "mod2", "version": "release"}), + ] + ) + list_cgroups.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "function_name": "meth"}), + call(ANY, {"module_name": "mod2", "function_name": "meth2"}), + ] + ) + + job1 = _get_mongo_job(mongo_client, job_id_1) 
+ job2 = _get_mongo_job(mongo_client, job_id_2) + + expected_job1 = { + "_id": ObjectId(job_id_1), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "batch_id": batch_id, + "job_input": { + "method": _MOD, + "params": [{"foo": "bar"}, 42], + "service_ver": "somehash", + "source_ws_objects": ["1/1/1", "1/2/1"], + "requirements": { + "clientgroup": "njs", + "cpu": 8, + "memory": 5, + "disk": 30, + }, + "narrative_cell_info": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + }, + }, + "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + } + assert job1 == expected_job1 expected_job2 = { "_id": ObjectId(job_id_2), "user": USER_NO_ADMIN, "authstrat": "kbaseworkspace", - "wsid": 1, "status": "queued", "batch_id": batch_id, "job_input": { - "wsid": 1, "method": "mod2.meth2", "params": [{"baz": "bat"}, 3.14], "service_ver": "somehash2", @@ -1292,7 +1506,6 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): "_id": ObjectId(batch_id), "user": USER_NO_ADMIN, "authstrat": "kbaseworkspace", - "wsid": 2, "status": "created", "job_input": { "method": "batch", @@ -1336,8 +1549,10 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): disk=100, parent_job_id=batch_id, ) + expected_sub_2.update( { + "+KB_WSID": "", "+KB_MODULE_NAME": '"mod2"', "+KB_FUNCTION_NAME": '"meth2"', "+KB_APP_ID": '"mod2/app2"', @@ -1395,7 +1610,8 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli "debug_mode": True, }, } - job_batch_params = {"wsid": 1, "as_admin": "foo"} + job_batch_wsid = 1 + job_batch_params = {"wsid": job_batch_wsid, "as_admin": "foo"} ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_WRITE_ADMIN) ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) batch_id = ret["batch_id"] @@ -1424,8 +1640,10 @@ def 
test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli "user": USER_WRITE_ADMIN, "authstrat": "kbaseworkspace", "status": "queued", + "wsid": job_batch_wsid, "batch_id": batch_id, "job_input": { + "wsid": job_batch_wsid, "method": _MOD, "service_ver": "somehash", "source_ws_objects": [], @@ -1451,8 +1669,10 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli "user": USER_WRITE_ADMIN, "authstrat": "kbaseworkspace", "status": "queued", + "wsid": job_batch_wsid, "batch_id": batch_id, "job_input": { + "wsid": job_batch_wsid, "method": "mod2.meth2", "service_ver": "somehash2", "source_ws_objects": [], @@ -1478,7 +1698,7 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli "_id": ObjectId(batch_id), "user": USER_WRITE_ADMIN, "authstrat": "kbaseworkspace", - "wsid": 1, + "wsid": job_batch_wsid, "status": "created", "job_input": { "method": "batch", @@ -1505,8 +1725,12 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli parent_job_id=batch_id, app_id=None, app_module=None, + wsid=job_batch_wsid, ) - expected_sub_1.update({"+KB_SOURCE_WS_OBJECTS": "", "+KB_WSID": ""}) + expected_sub_1.update( + {"+KB_SOURCE_WS_OBJECTS": "", "+KB_WSID": f'"{job_batch_wsid}"'} + ) + expected_sub_2 = _get_condor_sub_for_rj_param_set( job_id_2, USER_WRITE_ADMIN, @@ -1518,11 +1742,12 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli parent_job_id=batch_id, app_id=None, app_module=None, + wsid=job_batch_wsid, ) expected_sub_2.update( { "+KB_SOURCE_WS_OBJECTS": "", - "+KB_WSID": "", + "+KB_WSID": f'"{job_batch_wsid}"', "+AccountingGroup": '"forrest_gump"', "+KB_MODULE_NAME": '"mod2"', "+KB_FUNCTION_NAME": '"meth2"', @@ -1580,12 +1805,22 @@ def test_run_job_batch_fail_no_workspace_access_for_batch(ee2_port, ws_controlle _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 2}, err) -def 
test_run_job_batch_fail_no_workspace_access_for_job(ee2_port): +def test_run_job_batch_fail_no_allowed_wsid(ee2_port): params = [ {"method": _MOD}, {"method": _MOD, "wsid": 1}, ] # this error could probably use some cleanup + err = "Workspace ids are not allowed in RunJobParams in Batch Mode" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_no_workspace_access_for_job(ee2_port): + params = [ + {"method": _MOD}, + {"method": _MOD}, + ] + # this error could probably use some cleanup err = ( "('An error occurred while fetching user permissions from the Workspace', " + "ServerError('No workspace with id 1 exists'))" diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 04680b284..3698c9124 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -4,6 +4,7 @@ # Incomplete by a long way. Will add more unit tests as they come up. +import copy from logging import Logger from typing import List, Dict, Any from unittest.mock import create_autospec, call @@ -14,7 +15,11 @@ from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.MongoUtil import MongoUtil from execution_engine2.db.models.models import Job, JobInput, JobRequirements, Meta -from execution_engine2.exceptions import IncorrectParamsException, AuthError +from execution_engine2.exceptions import ( + IncorrectParamsException, + AuthError, + InvalidParameterForBatch, +) from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.sdk.job_submission_parameters import ( @@ -627,12 +632,22 @@ class and its respective composed classes, and we don't reproduce all the error Tests both the run() and run_batch() methods. 
""" + # These are extremely annoying to debug as they don't raise a stacktrace if a different exception type was thrown + # or let you know that it was an entirely different exception, or if the exception happened in the bulk version of the run + _run_and_run_batch_fail_illegal_arguments( {}, IncorrectParamsException("Missing input parameter: method ID") ) + _run_and_run_batch_fail_illegal_arguments( {"method": "foo.bar", "wsid": 0}, IncorrectParamsException("wsid must be at least 1"), + InvalidParameterForBatch(), + ) + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar", "wsid": -1}, + IncorrectParamsException("wsid must be at least 1"), + InvalidParameterForBatch(), ) _run_and_run_batch_fail_illegal_arguments( {"method": "foo.bar", "source_ws_objects": {"a": "b"}}, @@ -658,11 +673,11 @@ class and its respective composed classes, and we don't reproduce all the error ) -def _run_and_run_batch_fail_illegal_arguments(params, expected): +def _run_and_run_batch_fail_illegal_arguments(params, expected, batch_expected=None): mocks = _set_up_mocks(_USER, _TOKEN) jrr = mocks[JobRequirementsResolver] jrr.resolve_requirements.return_value = ResolvedRequirements(1, 1, 1, "cg") - _run_and_run_batch_fail(mocks[SDKMethodRunner], params, expected) + _run_and_run_batch_fail(mocks[SDKMethodRunner], params, expected, batch_expected) def test_run_job_and_run_job_batch_fail_arg_normalization(): @@ -735,12 +750,16 @@ def test_run_job_and_run_job_batch_fail_workspace_objects_check(): ) -def _run_and_run_batch_fail(sdkmr, params, expected, as_admin=True): +def _run_and_run_batch_fail( + sdkmr, params, expected, batch_expected=None, as_admin=True +): rj = EE2RunJob(sdkmr) with raises(Exception) as got: rj.run(params, as_admin=as_admin) assert_exception_correct(got.value, expected) + if batch_expected: + expected = batch_expected _run_batch_fail(rj, [params], {}, as_admin, expected) @@ -782,7 +801,7 @@ def _set_up_common_return_values_batch(mocks): 
mocks[MongoUtil].get_job.side_effect = [retjob_1, retjob_2] -def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): +def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid): """ Check that mocks are called as expected when those calls are similar or the same for several tests. @@ -824,6 +843,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): app=_APP_1, git_commit=_GIT_COMMIT_1, source_ws_objects=[_WS_REF_1, _WS_REF_2], + wsid=parent_wsid, batch_id=_JOB_ID, ) got_job_1 = sdkmr.save_job.call_args_list[0][0][0] @@ -834,7 +854,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): method=_METHOD_2, app=_APP_2, git_commit=_GIT_COMMIT_2, - wsid=wsid, + wsid=parent_wsid, batch_id=_JOB_ID, ) # index 2 because job 1 is updated with save_job before this job is created @@ -848,6 +868,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): UserCreds(_USER, _TOKEN), parent_job_id=_JOB_ID, source_ws_objects=[_WS_REF_1, _WS_REF_2], + wsid=parent_wsid, ) jsp_expected_2 = JobSubmissionParameters( _JOB_ID_2, @@ -855,7 +876,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid): reqs2, UserCreds(_USER, _TOKEN), parent_job_id=_JOB_ID, - wsid=wsid, + wsid=parent_wsid, ) mocks[Condor].run_job.assert_has_calls( [call(params=jsp_expected_1), call(params=jsp_expected_2)] @@ -918,6 +939,10 @@ def test_run_job_batch_with_parent_job_wsid(): potential code paths or provide all the possible run inputs, such as job parameters, cell metadata, etc. 
""" + # When an assertion is failed, this test doesn't show you where failed in PyCharm, so use + # Additional arguments `--no-cov -s` or run from cmd line + # PYTHONPATH=.:lib:test pytest test/tests_for_sdkmr/EE2Runjob_test.py::test_run_job_batch_with_parent_job_wsid --no-cov + # set up variables parent_wsid = 89 wsid = 32 @@ -968,6 +993,11 @@ def test_run_job_batch_with_parent_job_wsid(): "wsid": wsid, }, ] + with raises(InvalidParameterForBatch) as got: + rj.run_batch(copy.deepcopy(params), {"wsid": parent_wsid}) + assert_exception_correct(got.value, InvalidParameterForBatch()) + + params[1]["wsid"] = None assert rj.run_batch(params, {"wsid": parent_wsid}) == { "batch_id": _JOB_ID, "child_job_ids": [_JOB_ID_1, _JOB_ID_2], @@ -975,8 +1005,7 @@ def test_run_job_batch_with_parent_job_wsid(): # check mocks called as expected. The order here is the order that they're called in the code. mocks[WorkspaceAuth].can_write.assert_called_once_with(parent_wsid) - # this seems like a bug. See comments in the run_batch method - mocks[WorkspaceAuth].can_write_list.assert_called_once_with([parent_wsid, wsid]) + jrr = mocks[JobRequirementsResolver] jrr.normalize_job_reqs.assert_has_calls( [call({}, "input job"), call({}, "input job")] @@ -990,7 +1019,7 @@ def test_run_job_batch_with_parent_job_wsid(): call(_METHOD_2, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), ] ) - _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid, wsid) + _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid) def test_run_job_batch_as_admin_with_job_requirements(): @@ -1003,7 +1032,6 @@ def test_run_job_batch_as_admin_with_job_requirements(): metadata, etc. 
""" # set up variables - wsid = 32 cpus = 89 mem = 3 disk = 10000 @@ -1073,7 +1101,6 @@ def test_run_job_batch_as_admin_with_job_requirements(): { "method": _METHOD_2, "app_id": _APP_2, - "wsid": wsid, "job_requirements": inc_reqs, }, ] @@ -1096,13 +1123,37 @@ def test_run_job_batch_as_admin_with_job_requirements(): call(_METHOD_2, mocks[CatalogCache], **req_args), ] ) - _check_common_mock_calls_batch(mocks, reqs1, reqs2, None, wsid) + _check_common_mock_calls_batch(mocks, reqs1, reqs2, None) -def test_run_batch_fail_params_not_list(): +def test_run_batch_preflight_failures(): mocks = _set_up_mocks(_USER, _TOKEN) sdkmr = mocks[SDKMethodRunner] + rj = EE2RunJob(sdkmr) + with raises(Exception) as got: + rj._preflight(runjob_params=[], batch_params=[]) + + assert_exception_correct( + got.value, + expected=IncorrectParamsException( + "RunJobParams and BatchParams cannot be identical" + ), + ) + with raises(Exception) as got: + rj._preflight(runjob_params=[], batch_params={"batch": "batch"}) + + assert_exception_correct( + got.value, + expected=IncorrectParamsException( + "Programming error, you forgot to set the new_batch_job flag to True" + ), + ) + + +def test_run_batch_fail_params_not_list_or_batch_not_mapping(): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] rj = EE2RunJob(sdkmr) for params in [ None, @@ -1117,6 +1168,10 @@ def test_run_batch_fail_params_not_list(): rj, params, {}, True, IncorrectParamsException("params must be a list") ) + _run_batch_fail( + rj, [], [], True, IncorrectParamsException("batch params must be a mapping") + ) + # Note the next few tests are specifically testing that errors for multiple jobs have the # correct job number @@ -1150,7 +1205,7 @@ class and its respective composed classes, and we don't reproduce all the error [job, {"method": "foo.bar", "wsid": 0}], {}, True, - IncorrectParamsException("Job #2: wsid must be at least 1"), + InvalidParameterForBatch(), ) _run_batch_fail( rj, diff --git 
a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 74b7a61ba..b89c48fc9 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -9,7 +9,11 @@ import requests_mock from mock import MagicMock -from execution_engine2.exceptions import CannotRetryJob, RetryFailureException +from execution_engine2.exceptions import ( + CannotRetryJob, + RetryFailureException, + InvalidParameterForBatch, +) from execution_engine2.sdk.job_submission_parameters import JobRequirements from execution_engine2.utils.clients import ( get_client_set, @@ -72,11 +76,16 @@ def setUpClass(cls): cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: + """ # Initialize these clients from None + # Set up some mocks + """ runner = copy.copy(self.__class__.method_runner) # type : SDKMethodRunner runner.get_jobs_status() runner.get_runjob() runner.get_job_logs() + runner.get_workspace() + runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) return runner def create_job_rec(self): @@ -242,10 +251,26 @@ def test_run_job(self, rq_mock, condor_mock): job = get_example_job_as_dict(user=self.user_id, wsid=self.ws_id) si = SubmissionInfo(clusterid="test", submit=job, error=None) + + # OK condor_mock.run_job = MagicMock(return_value=si) + runner.run_job(params=job) + + # Condor Failure Case Coverage + condor_mock.run_job = MagicMock(return_value=si, side_effect=Exception("fail")) + runner.get_runjob()._finish_created_job = MagicMock(return_value=None) + + with self.assertRaises(expected_exception=Exception): + runner.run_job(params=job) - job_id = runner.run_job(params=job) - print(f"Job id is {job_id} ") + # Condor Failure Case Coverage #2 + with self.assertRaisesRegex( + expected_exception=RuntimeError, + expected_regex="Condor job not run, and error not found. 
Something went wrong", + ): + si = SubmissionInfo(clusterid=None, submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + runner.run_job(params=job) @staticmethod def check_retry_job_state(job_id: str, retry_job_id: str): @@ -280,7 +305,7 @@ def test_retry_job_multiple(self, rq_mock, condor_mock): ) runner = self.getRunner() runner.get_condor = MagicMock(return_value=condor_mock) - runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) + job = get_example_job_as_dict( user=self.user_id, wsid=self.ws_id, source_ws_objects=[] ) @@ -375,7 +400,7 @@ def test_retry_job(self, rq_mock, condor_mock): ) runner = self.getRunner() runner.get_condor = MagicMock(return_value=condor_mock) - runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) + job = get_example_job_as_dict( user=self.user_id, wsid=self.ws_id, source_ws_objects=[] ) @@ -451,8 +476,7 @@ def test_retry_job_with_params_and_nci_and_src_ws_objs(self, rq_mock, condor_moc ) ) runner = self.getRunner() - runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) - runner.workspace_auth.can_write = MagicMock(return_value=True) + runner.get_condor = MagicMock(return_value=condor_mock) quast_params = { @@ -515,31 +539,60 @@ def test_run_job_batch(self, rq_mock, condor_mock): ) runner = self.getRunner() runner.get_condor = MagicMock(return_value=condor_mock) - runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) + job = get_example_job_as_dict( - user=self.user_id, wsid=self.ws_id, source_ws_objects=[] + user=self.user_id, wsid=None, source_ws_objects=[] + ) + job2 = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] + ) + job3 = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] ) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - jobs = [job, job, job] - job_ids = runner.run_job_batch(params=jobs, 
batch_params={"wsid": self.ws_id}) + jobs = [job, job2, job3] + job_ids = runner.run_job_batch( + params=copy.deepcopy(jobs), batch_params={"wsid": self.ws_id} + ) + + for job in runner.check_jobs( + job_ids=job_ids["child_job_ids"] + [job_ids["batch_id"]] + )["job_states"]: + assert job.get("wsid") == self.ws_id + # Job input is forced to assume the batch wsid + if job["job_id"] != job_ids["batch_id"]: + assert job.get("job_input", {}).get("wsid") == self.ws_id assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) assert "child_job_ids" in job_ids and isinstance(job_ids["child_job_ids"], list) assert len(job_ids["child_job_ids"]) == len(jobs) + with self.assertRaises(InvalidParameterForBatch): + job_good = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] + ) + job_bad = ( + get_example_job(user=self.user_id, wsid=self.ws_id).to_mongo().to_dict() + ) + jobs = [job_good, job_bad] + runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) + # Test that you can't run a job in someone elses workspace + no_perms_ws = 111970 with self.assertRaises(PermissionError): - job_bad = get_example_job(user=self.user_id, wsid=1234).to_mongo().to_dict() - job_bad["method"] = job["job_input"]["app_id"] - job_bad["app_id"] = job["job_input"]["app_id"] - job_bad["service_ver"] = job["job_input"]["service_ver"] - jobs = [job, job_bad] - runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) + job_good = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] + ) + job_bad = get_example_job(user=self.user_id, wsid=None).to_mongo().to_dict() + jobs = [job_good, job_bad] + runner.run_job_batch(params=jobs, batch_params={"wsid": no_perms_ws}) - # Squeeze in a retry test here + # Check wsids batch_id = job_ids["batch_id"] child_job_id = job_ids["child_job_ids"][0] + + # Squeeze in a retry test here runner.update_job_status(job_id=child_job_id, status=Status.terminated.value) batch_job = 
runner.check_job(job_id=batch_id) assert len(batch_job["child_jobs"]) == 3 diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index 5b9b80a98..98e73cf70 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -229,12 +229,14 @@ def test_cancel_job_batch(self, rq_mock, condor_mock): ) runner = self.getRunner() # type: SDKMethodRunner runner.get_condor = MagicMock(return_value=condor_mock) - job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=self.ws_id) + job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job2 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job3 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - jobs = [job, job, job] + jobs = [job, job2, job3] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) assert "child_job_ids" in job_ids and isinstance(job_ids["child_job_ids"], list) @@ -257,12 +259,14 @@ def test_abandon_children(self, rq_mock, condor_mock): ) runner = self.getRunner() # type: SDKMethodRunner runner.get_condor = MagicMock(return_value=condor_mock) - job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=self.ws_id) + job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job2 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job3 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - jobs = [job, job, job] + jobs = [job, job2, job3] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) 
assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) @@ -295,12 +299,14 @@ def test_check_job_batch(self, rq_mock, condor_mock): ) runner = self.getRunner() # type: SDKMethodRunner runner.get_condor = MagicMock(return_value=condor_mock) - job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=self.ws_id) + job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job2 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job3 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - jobs = [job, job, job] + jobs = [job, job2, job3] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) job_status = runner.check_job_batch(batch_id=job_ids["batch_id"]) From 8212c8d60211bfca4c37ebe3fa422d729c64924c Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 20 Jul 2021 14:23:57 -0500 Subject: [PATCH 094/109] Update-adr (#407) * Update-adr * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md --- docs/adrs/003-Retry_endpoint_design.md | 63 ++++++++++++++++++-------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/docs/adrs/003-Retry_endpoint_design.md b/docs/adrs/003-Retry_endpoint_design.md index a8438639c..d57c86675 100644 --- a/docs/adrs/003-Retry_endpoint_design.md +++ b/docs/adrs/003-Retry_endpoint_design.md @@ -49,12 +49,24 @@ The endpoint takes a job or list of job ids and then attempts to resubmit them t * Submitting multiple jobs uses the `run_job` endpoint, and is blocking (NOT OK!) 
### Desired Behavior + +#### General * Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) -* Add retry_count to retried jobs as well to aid in more book-keeping in a new field called `retry_number` * Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) * One single submission to HTCondor instead of multiple job submissions * Ability to gracefully handle jobs with children * Ability to handle database consistentcy during retry failure +* See if we can make some preflight checks fail before job submission and handle them differently than those that appear during job submission + +#### Data inconsistency +* A new `retry_ids` field will show a list of jobs that have been retried using this parent id. Retry_count will be returned as a calculated field based off of retry_ids +* `retry_toggle` field will allow a separate process to check for jobs that didn't finish the entire retry lifecycle: +1) Launch child jobs +2) Notify the batch parent of the child, +3) Notify the retry parent of the child, +4) Update the retry_toggle field + + ### Questions @@ -79,13 +91,10 @@ Looks like the options are * accept that the db info may be incomplete and write workarounds into the clients * (upgrade to Mongo 4.4 for better transaction support) - - -### Sort of answered #### Q: how to prevent incorrect parent-child relationships being created -- should the client be allowed to specify a parent ID? Is it currently possible to add a new child to a parent job if the child is a new job, rather than an existing job ID / set of params that is being rerun? A: Not necessarily relevant to this endpoint, more of a run_job_batch endpoint question. Currently the `retry_parent` and `parent_job_id` are looked up from the ee2 record on retry, and not specified in this endpoint. -#### Answered: +#### Shorter Q and A Should we track a retry count?
(Done) Should users see this retry count? (Unknown TBD) @@ -98,21 +107,37 @@ A: Not necessarily relevant to this endpoint, more of a run_job_batch endpoint q # Work estimation Priority descending -* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) -* One single submission to HTCondor instead of multiple job submission () -* Ability to gracefully handle jobs with children (may require refactoring models) -* Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) -* Add retry_count to retried jobs as well to aid in more book-keeping in a new field called `retry_number` +* Address data inconsistency via retry_count, retry_ids and retry_toggle +> Estimate 3-4 days +> https://kbase-jira.atlassian.net/browse/DATAUP-461 -# Time / Tickets to be created -* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) -> Requires refactor of run_job_batch to add jobs to an existing batch job, and force the same app `git_commit versions` and `JobRequirements` +* Preflight checks > Estimate 3-4 days +> https://kbase-jira.atlassian.net/browse/DATAUP-528 + +* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) +* > Estimate 3-4 days +> Requires refactor of run_job_batch to be non blocking + + +> Requires retry to be able to force the same app `git_commit versions` and `JobRequirements` from the db records +https://kbase-jira.atlassian.net/browse/DATAUP-461 + + +* Hookup retries to refactored code +* Requires refactor of retry to gracefully handle jobs with children by notifying the batch containers for retry of ids not in the same batch +> Estimate 3 days +> https://kbase-jira.atlassian.net/browse/DATAUP-535 + + * One single submission to HTCondor instead of multiple job submission () -> Dependent
on run_job_batch to be threaded first : Estimate 1 day -* Ability to gracefully handle jobs with children -> (may require refactoring models. Especially when children spawn more jobs) : Estimate 3 day +> Estimate 1-2 days +> https://kbase-jira.atlassian.net/browse/DATAUP-391 + * Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) -> Some sort of locking mechanism or something else : Estimate 3 day -* Add retry_count to retried jobs as well to aid in more book-keeping in a new field called `retry_number` -> Requires addition to run_job and new field in model : Estimate 1.25 day +> Estimate 3-4 days +https://kbase-jira.atlassian.net/browse/DATAUP-439 + +* Create a created jobs and queued jobs reaper than queues created jobs older than 1 hour, and queued jobs over 14 days old. +> Estimate 2-3 days +https://kbase-jira.atlassian.net/browse/DATAUP-536 From d540f8719d530cf80c10633f611949521015e0e1 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 20 Jul 2021 14:35:44 -0500 Subject: [PATCH 095/109] Update 003-Retry_endpoint_design.md (#393) * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Finish up adr Co-authored-by: bio-boris --- docs/adrs/003-Retry_endpoint_design.md | 77 ++++++++++++++++---------- 1 file changed, 48 insertions(+), 29 
deletions(-) diff --git a/docs/adrs/003-Retry_endpoint_design.md b/docs/adrs/003-Retry_endpoint_design.md index d57c86675..8ca66257a 100644 --- a/docs/adrs/003-Retry_endpoint_design.md +++ b/docs/adrs/003-Retry_endpoint_design.md @@ -1,24 +1,23 @@ -# Retry Endpoint +# Retry Endpoint Design (Round 2!) Date: 2021-05-19 ## Motivation for the Endpoint: -The current requirement for the Batch/Bulk UI is to be able to retry jobs that have either "errored" out, or were cancelled. +The current requirement for the Batch/Bulk UI is to be able to retry jobs that have either "errored" out, or were terminated. The UI allows you to retry either single jobs, or multiple jobs, and saves you from having to cancel and resubmit each job individually, -which is not really possibly with the UI anyway. +which is not currently implemented in the UI anyway. ### Motivation for the `code spike` for retry endpoint and follow up design ADR -``` -As I mentioned, as the product owner, I find our ability to deliver functionality to be pretty awful. -We have invested so much effort in refactoring that its killed our timeline - we started in late July, and it is now almost May with no functioning bulk uploader, which was just the first deliverable. -If we are going to refactor, we need to be able to do it in a timely fashion, and have it not kill the schedule any more than it has. -I want to see the estimate for a quick and dirty solution that implements a proposed retry endpoint, that can be deployed ASAP, and then once the API contract has been established, and the functional MVP is done, we begin the cleanup of the backend code. -Note that this is NOT business as usual, the usual way we do this is the nasty MVP gets deployed and then we don't go back until much later. -Here, we get the API working so that it doesn't block dependencies, and we immediately start the refactoring. 
The refactor needs to be broken down into smallish chunks of ~3 days estimated work, and each merge should maintain functionality and incrementally improve the codebase. -Tasks that take more than a couple of days are more likely to be far off in their estimate and this is how we mitigate the risk of poor estimation. -``` +>As I mentioned, as the product owner, I find our ability to deliver functionality to be pretty awful. +>We have invested so much effort in refactoring that its killed our timeline - we started in late July, and it is now almost May with no functioning >bulk uploader, which was just the first deliverable. +>If we are going to refactor, we need to be able to do it in a timely fashion, and have it not kill the schedule any more than it has. +>I want to see the estimate for a quick and dirty solution that implements a proposed retry endpoint, that can be deployed ASAP, and then once the API >contract has been established, and the functional MVP is done, we begin the cleanup of the backend code. +>Note that this is NOT business as usual, the usual way we do this is the nasty MVP gets deployed and then we don't go back until much later. +>Here, we get the API working so that it doesn't block dependencies, and we immediately start the refactoring. The refactor needs to be broken down into >smallish chunks of ~3 days estimated work, and each merge should maintain functionality and incrementally improve the codebase. +>Tasks that take more than a couple of days are more likely to be far off in their estimate and this is how we mitigate the risk of poor estimation. +> ### High Level Behavior of the `retry` endpoint @@ -32,21 +31,29 @@ The endpoint takes a job or list of job ids and then attempts to resubmit them t * The retry will only continue if the status of the job to be retried is in [Status.terminated.value, Status.error.value] * If the job id points to a job that has already been retried, it will attempt to retry that job's `retry_parent` instead. 
* If the job id has never been retried, it becomes the `retry_parent` -* EE2 looks up the job versions and parameters, and then submits the job to be retried, incrementing the `retry_count` - of the job being retried, and the newly launched job gains a pointer to the `_PARENT_RETRY_JOB_ID` -* The job is submitted and upon successful submission, notifies the `retry_parent` and notifies the `parent_job_id` that a new `child_job` has been added +* EE2 looks up the method versions and parameters, and then submits the job to be retried, incrementing the `retry_count` of the job being retried, and the newly launched job gains a field called `retry_parent` that contains the job id of the job from the original request. +* The job is submitted and upon successful submission, the child job adds the field `retry_parent` and notifies the `parent_job_id` that a new `child_job` has been added by appending itself to the `parent_job.child_jobs[]` field +* There is no way to specify ResourceRequirements with a retry at the moment, even if the job was previously submitted by an administrator and had specified ResourceRequirements. The retry will only use resource requirements from the catalog / ee2 config. ### Batch Behavior * If a job has the attribute of `batch_job=True` the retry will fail, since there is no method to re-run. This is a bug, as it doesn't fail gracefully. * If a job has the attribute of `batch_job=True`, but is actually a child job, the parent will be notified of this new retried job * Multiple in-flight retries are allowed. +* Adds `child_job_id` to `parent_job_id.child_job_ids[]` ## Retry_job behavior -* Blocking and single submit to HTCondor. It should be fine +* Blocking and single submit to HTCondor. It should be fine as it returns relatively quickly ## Retry_jobs behavior -* Submitting multiple jobs uses the `run_job` endpoint, and is blocking (NOT OK!)
+* Submitting multiple jobs for retry serially calls the same code path +used for running a single job and blocks until all jobs have been +submitted to the condor queue. This can cause issues if the +network drops, and makes the narrative not aware of the state of +the retry. Submitting 100 jobs currently takes 9 seconds, and that +is a lot of time for things to go wrong. +* (Follow up: Hopefully the making the narrative aware of the state of the retry will be mitigated by the narrative backend. It just blocks on the call anyway, with the default service timeout, which I think is something wacky like half an hour. As long as the user doesn't kill the kernel at that time, all should be well. Of course, if it were me, and it looked frozen for more than a couple minutes, I'd probably restart. ) +* Multiple in-flight retries are allowed. ### Desired Behavior @@ -56,7 +63,7 @@ The endpoint takes a job or list of job ids and then attempts to resubmit them t * One single submission to HTCondor instead of multiple job submissions * Ability to gracefully handle jobs with children * Ability to handle database consistentcy during retry failure -* See if we can make some preflight checks fail before job submission and handle them differently than those that appear during job submission +* See if we can make some preflight (before the job starts) checks fail before job submission and handle them differently than those that appear during job submission #### Data inconsistency * A new `retry_ids` field will show a list of jobs that have been retried using this parent id. 
Retry_count will be returned as a calculated field based off of retry_ids @@ -66,33 +73,45 @@ The endpoint takes a job or list of job ids and then attempts to resubmit them t 3) Notify the retry parent of the child, 4) Update the retry_toggle field +#### Won't do +* Prevent multiple in-flight retries of the same original job to prevent the user from wasting their own resources (and the queues resources) +* Add retry_number field +## New priority +* Create a retry_jobs field, and expose list in api, and a T/F completeness toggle +* Non blocking job submission / (Possibly htcondor submit) +* Add failure conditions in run method +* Add thread to perform actions based on toggle ### Questions -#To Be Answered - +#### Answered: #### Q: should the number of retries of a job be limited, and if so, where? e.g. a max_retries field in the parent job? wait and see whether people attempt to rerun jobs that have already failed nine zillion times? -A: Unknown TBD +A: Make a ticket for this and add to backlog -#### Q: Preventing the same params from being re-run -A: We have decided to allow it +#### Q: How do we prevent jobs with identical parameters from being rerun more than once within a retry_jobs request? +A: We have decided to allow multiple jobs with the same params to be re-run in the same `retry_jobs` request. -#### Q: Finding the most recent run of the job: I would very much like to avoid anything involving iterating over a chain of jobs before you can find the most recent run or the original run -- we can come up with better data structures than that! -A: Unknown TBD, maybe the frontend does it? +#### Q: How do we find the most recent retry of a job? +A: The client using the ee2 API would have to figure it out using the `retry_parent` and job creation date fields. (Unless we added other fields to help with this) -#### Q: It might be best to always submit a git commit for the module, maybe? 
-A: (This could be a narrative ticket) +#### Q: How do we ensure that the app version is correctly run each time when submitting from the narrative? +A: We would need to change the narrative to submit the git commit hash instead of a version tag #### Q: How do we handle DB consistency during retry failure? Looks like the options are -* implement db integrity checks and two-phase commits for making the relationships between a job, its retry parent, and the batch container +* implement db integrity checks and two-phase commits for making the relationships between a job, its `retry_parent`, and the batch container * accept that the db info may be incomplete and write workarounds into the clients * (upgrade to Mongo 4.4 for better transaction support) + +##### Q: Do we want to support ResourceRequirements +A: Probably not in the short term + + #### Q: how to prevent incorrect parent-child relationships being created -- should the client be allowed to specify a parent ID? Is it currently possible to add a new child to a parent job if the child is a new job, rather than an existing job ID / set of params that is being rerun? -A: Not necessarily relevant to this endpoint, more of a run_job_batch endpoint question. Currently the `retry_parent` and `parent_job_id` are looked up from the ee2 record on retry, and not specified in this endpoint. +A: Not necessarily relevant to this endpoint, more of a `run_job_batch` endpoint question. Currently the `retry_parent` and `parent_job_id` are looked up from the ee2 record on retry, and not specified in this endpoint. 
#### Shorter Q and A From 69816994e66302dfae027ccce853df9be8dd86b1 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 20 Jul 2021 14:51:55 -0500 Subject: [PATCH 096/109] Update Release Notes (#293) * release_notes_0.0.5 * Update RELEASE_NOTES.md * Update RELEASE_NOTES.md Co-authored-by: bio-boris --- RELEASE_NOTES.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index be8674d6a..15d8b673a 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,15 +1,24 @@ # execution_engine2 (ee2) release notes ========================================= + + ## 0.0.5 + * TODO Refactor run_jobs_batch endpoint to cache catalog calls for batch jobs, submit entire batch to condor in one transaction + * TODO: Added CreatedJobsReaper + * Added retry_job and retry_jobs endpoint along with ADRs + * TODO: Will deprecate run_job transaction and use scheduler API + * Refactored tests + * Removed slack messages for running jobs * Fix a bug that caused job requirements from the catalog in CSV format to be ignored other than the client group * Full EE2 admins can now submit job requirements when running jobs via run_job_batch and run_job. See the SDK spec for details. 
+ ## 0.0.4 * Fix up tests * Remove dependency on slack - * Add batch endpoints + * Add batch endpoints, cancel_jobs now cancels child jobs * Rename prod branch to "main" ## 0.0.3.4 From 61a86869763ae17c45ca291f2733b8cbd8d6d1e8 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 22 Jul 2021 13:34:12 -0500 Subject: [PATCH 097/109] Propogate batch id (#410) * Batch job * update test * Update broken build * Update broken build Co-authored-by: bio-boris --- lib/execution_engine2/sdk/EE2Runjob.py | 1 + requirements.txt | 135 ++++++++++-------- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 10 +- 3 files changed, 82 insertions(+), 64 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 3fb17588e..0e57e3995 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -712,6 +712,7 @@ def _get_run_job_params_from_existing_job(job: Job, user_id: str) -> Dict: _SOURCE_WS_OBJECTS: source_ws_objects, # Must be list _SERVICE_VER: ji.service_ver, _PARENT_JOB_ID: ji.parent_job_id, + _BATCH_ID: job.batch_id, } # Then the next fields are job inputs top level requirements, app run parameters, and scheduler resource requirements diff --git a/requirements.txt b/requirements.txt index 363084bc3..2659ef5c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,75 +1,84 @@ -i https://pypi.org/simple/ -aiofiles==0.4.0 -aiohttp==3.7.4 -asn1crypto==1.3.0 +aiofiles==0.7.0 +aiohttp==3.7.4.post0 +asn1crypto==1.4.0 async-timeout==3.0.1 -attrs==20.2.0 -cachetools==3.1.1 -certifi==2019.6.16 -cffi==1.14.0 -chardet==3.0.4 +attrs==21.2.0 +brotlipy==0.7.0 +cachetools==4.2.2 +certifi==2021.5.30 +cffi==1.14.6 +chardet==4.0.0 codecov==2.1.11 -configparser==3.7.4 -confluent-kafka==1.5.0 -coverage==4.5.3 -cryptography==3.3.2 -docker==4.3.1 -gevent==20.9.0 -gprof2dot==2019.11.30 -greenlet==0.4.17 -gunicorn==20.0.4 -h11==0.8.1 -h2==3.1.0 -hpack==3.0.0 -htcondor==8.9.8 -httpcore==0.3.0 -httptools==0.0.13 
-hyperframe==5.2.0 -idna==2.8 -importlib-metadata==2.0.0 +conda==4.10.3 +conda-package-handling==1.7.3 +configparser==5.0.2 +confluent-kafka==1.7.0 +coverage==5.5 +cryptography==3.4.7 +docker==5.0.0 +gevent==21.1.2 +gprof2dot==2021.2.21 +greenlet==1.1.0 +gunicorn==20.1.0 +h11==0.12.0 +h2==4.0.0 +hpack==4.0.0 +htcondor==9.1.0 +httpcore==0.2.1 +httptools==0.2.0 +hyperframe==6.0.1 +idna==2.10 +importlib-metadata==4.6.1 iniconfig==1.1.1 -jinja2==2.11.3 -jsonrpcbase==0.2.0 +Jinja2==3.0.1 +JSONRPCBase==0.2.0 maps==5.1.1 -markupsafe==1.1.1 -memory-profiler==0.55.0 -mock==3.0.5 -mongoengine==0.23.0 -multidict==4.5.2 +MarkupSafe==2.0.1 +memory-profiler==0.58.0 +mock==4.0.3 +mongoengine==0.23.1 +multidict==5.1.0 nose==1.3.7 -packaging==20.9 +packaging==21.0 +pip==21.1.3 pluggy==0.13.1 -psutil==5.6.6 +psutil==5.8.0 py==1.10.0 pycosat==0.6.3 -pycparser==2.19 -pymongo==3.8.0 -pyopenssl==19.1.0 +pycparser==2.20 +pymongo==3.12.0 +pyOpenSSL==20.0.1 pyparsing==2.4.7 -pysocks==1.7.1 -pytest-cov==2.8.1 +PySocks==1.7.1 +pytest==6.2.4 +pytest-cov==2.12.1 pytest-profiling==1.7.0 -pytest==6.1.1 -python-dateutil==2.8.0 -python-dotenv==0.10.3 -requests-async==0.5.0 -requests-mock==1.7.0 -requests==2.22.0 -rfc3986==1.3.2 -ruamel.yaml==0.15.87 -sanic==19.6.0 -sentry-sdk==0.14.3 -six==1.14.0 -slackclient==2.7.1 -toml==0.10.1 -tqdm==4.42.1 -typing-extensions==3.7.4.3 -ujson==1.35 -urllib3==1.25.8 -uvloop==0.15.2 -websocket-client==0.57.0 -websockets==6.0 -yarl==1.5.1 -zipp==3.3.1 +python-dateutil==2.8.2 +python-dotenv==0.18.0 +requests==2.25.1 +requests-async==0.4.1 +requests-mock==1.9.3 +rfc3986==1.5.0 +ruamel-yaml-conda==0.15.100 +ruamel.yaml==0.17.10 +ruamel.yaml.clib==0.2.6 +sanic==21.6.0 +sanic-routing==0.7.0 +sentry-sdk==1.3.0 +setuptools==52.0.0.post20210125 +six==1.16.0 +slackclient==2.9.3 +toml==0.10.2 +tqdm==4.61.2 +typing-extensions==3.10.0.0 +ujson==4.0.2 +urllib3==1.26.6 +uvloop==0.15.3 +websocket-client==1.1.0 +websockets==9.1 +wheel==0.36.2 +yarl==1.6.3 +zipp==3.5.0 
zope.event==4.5.0 -zope.interface==5.1.2 +zope.interface==5.4.0 \ No newline at end of file diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index b89c48fc9..96ecd7349 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -274,6 +274,12 @@ def test_run_job(self, rq_mock, condor_mock): @staticmethod def check_retry_job_state(job_id: str, retry_job_id: str): + """ + Checks to see the required keys are there + :param job_id: The job that was retried + :param retry_job_id: The job id that was a result of the retry + + """ job = Job.objects.get(id=job_id) # type: Job retry_job = Job.objects.get(id=retry_job_id) # type: Job @@ -282,6 +288,7 @@ def check_retry_job_state(job_id: str, retry_job_id: str): "wsid", "authstrat", "batch_job", + "batch_id", "scheduler_type", ] @@ -597,7 +604,8 @@ def test_run_job_batch(self, rq_mock, condor_mock): batch_job = runner.check_job(job_id=batch_id) assert len(batch_job["child_jobs"]) == 3 - retry_id = runner.retry(job_id=child_job_id)["retry_id"] + retry_result = runner.retry(job_id=child_job_id) + retry_id = retry_result["retry_id"] self.check_retry_job_state(child_job_id, retry_id) batch_job = runner.check_job(job_id=batch_id) assert len(batch_job["child_jobs"]) == 4 From 00c749e7e3c811798a9b19122c9c2eb868e473f2 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 22 Jul 2021 17:12:41 -0500 Subject: [PATCH 098/109] Update deploy.cfg --- deploy.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy.cfg b/deploy.cfg index 256d289e7..8a685860c 100644 --- a/deploy.cfg +++ b/deploy.cfg @@ -80,7 +80,7 @@ request_memory = 204800M request_disk = 100GB [kb_upload] -request_cpus = {{ default .Env.kb_upload_default_cores "8" }} +request_cpus = {{ default .Env.kb_upload_default_cores "24" }} request_memory = 4500M request_disk = 50GB From 
3150622a20945cf7a28a77f8356098013a340445 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 23 Jul 2021 10:49:56 -0500 Subject: [PATCH 099/109] Fix quay build (#411) * Update requirements.txt * Update requirements.txt * Update requirements.txt * update req * forgot the maps * forgot the requests Co-authored-by: bio-boris --- requirements.txt | 56 +----------------------------------------------- 1 file changed, 1 insertion(+), 55 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2659ef5c3..618dc4908 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,84 +1,30 @@ -i https://pypi.org/simple/ aiofiles==0.7.0 aiohttp==3.7.4.post0 -asn1crypto==1.4.0 -async-timeout==3.0.1 -attrs==21.2.0 -brotlipy==0.7.0 cachetools==4.2.2 -certifi==2021.5.30 -cffi==1.14.6 -chardet==4.0.0 codecov==2.1.11 -conda==4.10.3 -conda-package-handling==1.7.3 configparser==5.0.2 confluent-kafka==1.7.0 coverage==5.5 -cryptography==3.4.7 docker==5.0.0 gevent==21.1.2 -gprof2dot==2021.2.21 greenlet==1.1.0 gunicorn==20.1.0 -h11==0.12.0 -h2==4.0.0 -hpack==4.0.0 htcondor==9.1.0 -httpcore==0.2.1 -httptools==0.2.0 -hyperframe==6.0.1 -idna==2.10 -importlib-metadata==4.6.1 -iniconfig==1.1.1 Jinja2==3.0.1 JSONRPCBase==0.2.0 -maps==5.1.1 -MarkupSafe==2.0.1 -memory-profiler==0.58.0 mock==4.0.3 +maps==5.1.1 mongoengine==0.23.1 -multidict==5.1.0 -nose==1.3.7 -packaging==21.0 -pip==21.1.3 -pluggy==0.13.1 psutil==5.8.0 -py==1.10.0 -pycosat==0.6.3 -pycparser==2.20 pymongo==3.12.0 -pyOpenSSL==20.0.1 -pyparsing==2.4.7 -PySocks==1.7.1 pytest==6.2.4 pytest-cov==2.12.1 -pytest-profiling==1.7.0 python-dateutil==2.8.2 python-dotenv==0.18.0 requests==2.25.1 -requests-async==0.4.1 requests-mock==1.9.3 -rfc3986==1.5.0 -ruamel-yaml-conda==0.15.100 -ruamel.yaml==0.17.10 -ruamel.yaml.clib==0.2.6 sanic==21.6.0 -sanic-routing==0.7.0 -sentry-sdk==1.3.0 -setuptools==52.0.0.post20210125 -six==1.16.0 slackclient==2.9.3 toml==0.10.2 -tqdm==4.61.2 -typing-extensions==3.10.0.0 -ujson==4.0.2 urllib3==1.26.6 
-uvloop==0.15.3 -websocket-client==1.1.0 -websockets==9.1 -wheel==0.36.2 -yarl==1.6.3 -zipp==3.5.0 -zope.event==4.5.0 -zope.interface==5.4.0 \ No newline at end of file From 8ee6c0b42d8657a8725585ff4268b55535efb0b9 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 6 Aug 2021 16:48:07 -0500 Subject: [PATCH 100/109] DATAUP-530 Refactor to bulk insert (#406) * Refactor to bulk insert * Refactor to bulk update * Timings * Timings * Timings * Timings * PR review * Try to prevent bad updates for cancelled jobs * Try to prevent bad updates for cancelled jobs * Not so trivial after all * Pr feedback * Fix up tests * Fix up assert * Fix missing test * FIx tests * Fix comment * Fix tests * Pr feedback * Fix updates * add post cancel * Fix tests * Add a test * Increase coverage * Removed unused, fixed tests * Removed unused, fixed tests * Removed unused, fixed tests * Add tests * Add tests * pr feedback * fix indent * Check job contents Co-authored-by: bio-boris --- lib/execution_engine2/db/MongoUtil.py | 88 ++++++- lib/execution_engine2/sdk/EE2Logs.py | 5 +- lib/execution_engine2/sdk/EE2Runjob.py | 191 +++++++++++--- lib/execution_engine2/sdk/EE2Status.py | 8 +- lib/execution_engine2/sdk/SDKMethodRunner.py | 22 +- lib/execution_engine2/utils/KafkaUtils.py | 5 +- test/tests_for_auth/ee2_admin_mode_test.py | 12 +- test/tests_for_db/ee2_MongoUtil_test.py | 90 ++++++- test/tests_for_sdkmr/EE2Runjob_test.py | 239 ++++++++++++++---- .../ee2_SDKMethodRunner_test.py | 36 ++- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 4 +- ...ee2_SDKMethodRunner_test_EE2Status_test.py | 11 +- test/tests_for_sdkmr/ee2_kafka_test.py | 2 +- test/utils_shared/producer.py | 5 +- 14 files changed, 584 insertions(+), 134 deletions(-) diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py index 1499cf267..0845cd84b 100644 --- a/lib/execution_engine2/db/MongoUtil.py +++ b/lib/execution_engine2/db/MongoUtil.py @@ -3,18 +3,20 @@ import time import traceback from 
contextlib import contextmanager -from typing import Dict +from datetime import datetime +from typing import Dict, List from bson.objectid import ObjectId from mongoengine import connect, connection -from pymongo import MongoClient +from pymongo import MongoClient, UpdateOne from pymongo.errors import ServerSelectionTimeoutError -from lib.execution_engine2.db.models.models import JobLog, Job, Status, TerminatedCode -from lib.execution_engine2.exceptions import ( +from execution_engine2.db.models.models import JobLog, Job, Status, TerminatedCode +from execution_engine2.exceptions import ( RecordNotFoundException, InvalidStatusTransitionException, ) +from execution_engine2.sdk.EE2Runjob import JobIdPair class MongoUtil: @@ -216,7 +218,9 @@ def get_job(self, job_id=None, exclude_fields=None) -> Job: return job - def get_jobs(self, job_ids=None, exclude_fields=None, sort_id_ascending=None): + def get_jobs( + self, job_ids=None, exclude_fields=None, sort_id_ascending=None + ) -> List[Job]: if not (job_ids and isinstance(job_ids, list)): raise ValueError("Please provide a non empty list of job ids") @@ -263,6 +267,68 @@ def check_if_already_finished(job_status): return True return False + def update_jobs_to_queued( + self, job_id_pairs: List[JobIdPair], scheduler_type: str = "condor" + ) -> None: + f""" + * Adds scheduler id to list of jobs + * Updates a list of {Status.created.value} jobs to queued. Does not work on jobs that already have gone through any other + status transition. 
If the record is not in the {Status.created.value} status, nothing will happen + :param job_id_pairs: A list of pairs of Job Ids and Scheduler Ids + :param scheduler_type: The scheduler this job was queued in, default condor + """ + + bulk_update_scheduler_jobs = [] + bulk_update_created_to_queued = [] + queue_time_now = datetime.utcnow().timestamp() + for job_id_pair in job_id_pairs: + if job_id_pair.job_id is None: + raise ValueError( + f"Provided a bad job_id_pair, missing job_id for {job_id_pair.scheduler_id}" + ) + elif job_id_pair.scheduler_id is None: + raise ValueError( + f"Provided a bad job_id_pair, missing scheduler_id for {job_id_pair.job_id}" + ) + + bulk_update_scheduler_jobs.append( + UpdateOne( + { + "_id": ObjectId(job_id_pair.job_id), + }, + { + "$set": { + "scheduler_id": job_id_pair.scheduler_id, + "scheduler_type": scheduler_type, + } + }, + ) + ) + bulk_update_created_to_queued.append( + UpdateOne( + { + "_id": ObjectId(job_id_pair.job_id), + "status": Status.created.value, + }, + { + "$set": { + "status": Status.queued.value, + "queued": queue_time_now, + } + }, + ) + ) + # Update provided jobs with scheduler id. Then only update non terminated jobs into updated status. + mongo_collection = self.config["mongo-jobs-collection"] + + if bulk_update_scheduler_jobs: + with self.pymongo_client(mongo_collection) as pymongo_client: + ee2_jobs_col = pymongo_client[self.mongo_database][mongo_collection] + # Bulk Update to add scheduler ids + ee2_jobs_col.bulk_write(bulk_update_scheduler_jobs, ordered=False) + # Bulk Update to add queued status ids + ee2_jobs_col.bulk_write(bulk_update_created_to_queued, ordered=False) + def cancel_job(self, job_id=None, terminated_code=None): """ #TODO Should we check for a valid state transition here also? 
@@ -420,6 +486,18 @@ def update_job_status(self, job_id, status, msg=None, error_message=None): def mongo_engine_connection(self): yield self.me_connection + def insert_jobs(self, jobs_to_insert: List[Job]) -> List[ObjectId]: + """ + Insert multiple job records using MongoEngine + :param jobs_to_insert: Multiple jobs to insert at once + :return: List of job ids from the insertion + """ + # TODO Look at pymongo write_concerns that may be useful + # TODO see if pymongo is faster + # TODO: Think about error handling + inserted = Job.objects.insert(doc_or_docs=jobs_to_insert, load_bulk=False) + return inserted + def insert_one(self, doc): """ insert a doc into collection diff --git a/lib/execution_engine2/sdk/EE2Logs.py b/lib/execution_engine2/sdk/EE2Logs.py index d96acb0b2..be04acd78 100644 --- a/lib/execution_engine2/sdk/EE2Logs.py +++ b/lib/execution_engine2/sdk/EE2Logs.py @@ -1,8 +1,8 @@ from enum import Enum from typing import Dict, NamedTuple -from lib.execution_engine2.db.models.models import JobLog as JLModel, LogLines -from lib.execution_engine2.exceptions import RecordNotFoundException +from execution_engine2.db.models.models import JobLog as JLModel, LogLines +from execution_engine2.exceptions import RecordNotFoundException # if TYPE_CHECKING: @@ -104,7 +104,6 @@ def add_job_logs(self, job_id, log_lines, as_admin=False) -> AddLogResult: self.sdkmr.get_job_with_permission( job_id, JobPermissions.WRITE, as_admin=as_admin ) - self.sdkmr.logger.debug(f"About to add logs for {job_id}") try: try: job_log = self.sdkmr.mongo_util.get_job_log_pymongo(job_id) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 0e57e3995..caaa7cdbd 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -74,6 +74,11 @@ class PreparedJobParams(NamedTuple): job_id: str +class JobIdPair(NamedTuple): + job_id: str + scheduler_id: str + + from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -87,10 
+92,8 @@ def __init__(self, sdkmr): self.logger = self.sdkmr.get_logger() def _init_job_rec( - self, - user_id: str, - params: Dict, - ) -> str: + self, user_id: str, params: Dict, save: bool = True + ) -> Union[str, Job]: f""" Save an initial job record to the db and send a message to kafka @@ -163,11 +166,13 @@ def _init_job_rec( job.retry_parent = str(parent_retry_job_id) job.batch_id = str(params.get(_BATCH_ID)) if params.get(_BATCH_ID) else None - job_id = self.sdkmr.save_job(job) - self.sdkmr.get_kafka_client().send_kafka_message( - message=KafkaCreateJob(job_id=job_id, user=user_id) - ) - return job_id + if save: + job_id = self.sdkmr.save_job(job) + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaCreateJob(job_id=job_id, user=user_id) + ) + return job_id + return job def _check_ws_objects(self, source_objects) -> None: """ @@ -234,17 +239,7 @@ def _finish_created_job( error=f"{exception}", ) - def _prepare_to_run(self, params, concierge_params=None) -> JobSubmissionParameters: - """ - Creates a job record and creates the job submission params - """ - - job_id = self._init_job_rec(self.sdkmr.get_user_id(), params) - - self.logger.debug( - f"User {self.sdkmr.get_user_id()} attempting to run job {params[_METHOD]} {params}" - ) - + def _generate_job_submission_params(self, job_id, params): return JobSubmissionParameters( job_id, AppInfo(params[_METHOD], params.get(_APP_ID)), @@ -258,6 +253,130 @@ def _prepare_to_run(self, params, concierge_params=None) -> JobSubmissionParamet source_ws_objects=params.get(_SOURCE_WS_OBJECTS), ) + def _prepare_to_run(self, params, concierge_params=None) -> JobSubmissionParameters: + """ + Creates a job record and creates the job submission params + """ + + job_id = self._init_job_rec(self.sdkmr.get_user_id(), params) + self.logger.debug( + f"User {self.sdkmr.get_user_id()} attempting to run job {params[_METHOD]} {params}" + ) + return self._generate_job_submission_params(job_id, params) + + def 
_run_multiple(self, runjob_params): + """ + Get the job records, bulk save them, then submit to condor. + If any condor submission fails, abort all of the jobs + :return: + """ + # Save records to db + job_records = [] + for runjob_param in runjob_params: + job_records.append( + self._init_job_rec(self.sdkmr.get_user_id(), runjob_param, save=False) + ) + job_ids = self.sdkmr.save_jobs(job_records) + + # Generate job submission params + job_submission_params = [] + for i, job_id in enumerate(job_ids): + job_submission_params.append( + self._generate_job_submission_params(job_id, runjob_params[i]) + ) + assert job_id == job_submission_params[i].job_id + + # Takes 2.5200018882751465 for 100 records, can shave off 2.5 secs by making this async + for job_id in job_ids: + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaCreateJob( + job_id=str(job_id), user=self.sdkmr.get_user_id() + ) + ) + # Submit to Condor + try: + submission_ids = self._submit_multiple(job_submission_params) + return submission_ids + except Exception as e: + self._abort_multiple_jobs(job_ids) + raise e + + def _update_to_queued_multiple(self, job_ids, scheduler_ids): + """ + This is called during job submission. 
If a job is terminated during job submission, + we have the chance to re-issue a termination and remove the job from the Job Queue + """ + if len(job_ids) != len(scheduler_ids): + raise Exception( + "Need to provide the same amount of job ids and scheduler_ids" + ) + jobs_to_update = list(map(JobIdPair, job_ids, scheduler_ids)) + self.sdkmr.get_mongo_util().update_jobs_to_queued(jobs_to_update) + jobs = self.sdkmr.get_mongo_util().get_jobs(job_ids) + + for job in jobs: + job_id = str(job.id) + if job.status == Status.queued.value: + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaQueueChange( + job_id=job_id, + new_status=Status.queued.value, + previous_status=Status.created.value, # TODO maybe change this to allow for estimating jobs + scheduler_id=job.scheduler_id, + ) + ) + elif job.status == Status.terminated.value: + # Remove from the queue, now that the scheduler_id is available + # The job record doesn't actually get updated in the db a 2nd time, and this TerminatedCode is only + # used by the initial transition to Terminated + self._safe_cancel(job_id, TerminatedCode.terminated_by_user) + + def _submit_multiple(self, job_submission_params): + """ + Submit multiple jobs. 
If any of the submissions are a failure, raise exception in order + to fail all submitted jobs, rather than allowing the submissions to continue + """ + begin = time.time() + job_ids = [] + condor_job_ids = [] + for job_submit_param in job_submission_params: + job_id = job_submit_param.job_id + job_ids.append(job_id) + try: + submission_info = self.sdkmr.get_condor().run_job( + params=job_submit_param + ) + condor_job_id = submission_info.clusterid + except Exception as e: + self.logger.error(e) + self._finish_created_job(job_id=job_id, exception=e) + raise e + + if submission_info.error is not None and isinstance( + submission_info.error, Exception + ): + self._finish_created_job(exception=submission_info.error, job_id=job_id) + raise submission_info.error + if condor_job_id is None: + error_msg = ( + "Condor job not run, and error not found. Something went wrong" + ) + self._finish_created_job( + job_id=job_id, exception=RuntimeError(error_msg) + ) + raise RuntimeError(error_msg) + condor_job_ids.append(condor_job_id) + + self.logger.error(f"It took {time.time() - begin} to submit jobs to condor") + # It took 4.836009502410889 to submit jobs to condor + + update_time = time.time() + self._update_to_queued_multiple(job_ids=job_ids, scheduler_ids=condor_job_ids) + # It took 1.9239885807037354 to update jobs + self.logger.error(f"It took {time.time() - update_time} to update jobs ") + + return job_ids + def _run(self, params): job_params = self._prepare_to_run(params=params) job_id = job_params.job_id @@ -265,7 +384,6 @@ def _run(self, params): try: submission_info = self.sdkmr.get_condor().run_job(params=job_params) condor_job_id = submission_info.clusterid - self.logger.debug(f"Submitted job id and got '{condor_job_id}'") except Exception as e: self.logger.error(e) self._finish_created_job(job_id=job_id, exception=e) @@ -285,14 +403,14 @@ def _run(self, params): return job_id - def _abort_child_jobs(self, child_job_ids): + def _abort_multiple_jobs(self, job_ids): 
""" Cancel a list of child jobs, and their child jobs """ - for child_job_id in child_job_ids: + for job_id in job_ids: try: self.sdkmr.cancel_job( - job_id=child_job_id, + job_id=job_id, terminated_code=TerminatedCode.terminated_by_batch_abort.value, ) except Exception as e: @@ -326,28 +444,26 @@ def _create_batch_job(self, wsid, meta): ) j = self.sdkmr.save_and_return_job(j) - # TODO Do we need a new kafka call? + # TODO Do we need a new kafka call for batch? self.sdkmr.get_kafka_client().send_kafka_message( message=KafkaCreateJob(job_id=str(j.id), user=j.user) ) return j def _run_batch(self, batch_job: Job, params): - child_jobs = [] + """Add the batch id, save the jobs to the db, run the jobs""" for job_param in params: job_param[_BATCH_ID] = str(batch_job.id) - try: - child_jobs.append(str(self._run(params=job_param))) - except Exception as e: - self.logger.debug( - msg=f"Failed to submit child job. Aborting entire batch job {e}" - ) - self._abort_child_jobs(child_jobs) - raise e - batch_job.child_jobs = child_jobs - self.sdkmr.save_job(batch_job) + child_jobs = self._run_multiple(params) + + # Cancel child jobs if we can't notify the batch job of the child jobs + try: + self.sdkmr.add_child_jobs(batch_job=batch_job, child_jobs=child_jobs) + except Exception as e: + self._abort_multiple_jobs(child_jobs) + raise e return child_jobs @@ -377,10 +493,8 @@ def run_batch( new_batch_job=True, as_admin=as_admin, ) - self._add_job_requirements(params, bool(as_admin)) # as_admin checked above self._check_job_arguments(params, batch_job=True) - batch_job = self._create_batch_job(wsid=wsid, meta=meta) children_jobs = self._run_batch(batch_job=batch_job, params=params) @@ -639,6 +753,7 @@ def retry_multiple( """ #TODO Add new job requirements/cgroups as an optional param #TODO Notify the parent container that it has multiple new children, instead of multiple transactions? + #TODO Prevent retry when multiple batch job containers? 
:param job_ids: The list of jobs to retry :param as_admin: Run with admin permission diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index f26f9ce56..053cfb77d 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -94,7 +94,6 @@ def cancel_job(self, job_id, terminated_code=None, as_admin=False): :param as_admin: Cancel the job for a different user """ # Is it inefficient to get the job twice? Is it cached? - # Maybe if the call fails, we don't actually cancel the job? job = self.sdkmr.get_job_with_permission( job_id, JobPermissions.WRITE, as_admin=as_admin @@ -123,6 +122,7 @@ def cancel_job(self, job_id, terminated_code=None, as_admin=False): ) # TODO Issue #190 IF success['TotalSuccess = 0'] == FALSE, don't send a kafka message? + self.sdkmr.get_condor().cancel_job(job_id=f"{job.scheduler_id}.0") self.sdkmr.kafka_client.send_kafka_message( message=KafkaCancelJob( @@ -373,9 +373,9 @@ def finish_job( def _update_finished_job_with_usage(self, job_id, as_admin=None) -> Dict: """ # TODO Does this need a kafka message? - :param job_id: - :param as_admin: - :return: + # TODO EE2 issue #251 : The saved job stats are inaccurate: + # The usage is not recorded until the job is completely finished. + :return: Resources at the time the job almost finished. 
""" # note this method is replaced by a magic mock in some tests job = self.sdkmr.get_job_with_permission( diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 5091b2fee..350599960 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -12,13 +12,14 @@ from datetime import datetime from enum import Enum from logging import Logger +from typing import List import dateutil -from lib.execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.MongoUtil import MongoUtil from execution_engine2.db.models.models import Job -from lib.execution_engine2.exceptions import AuthError -from lib.execution_engine2.sdk import ( +from execution_engine2.exceptions import AuthError +from execution_engine2.sdk import ( EE2Runjob, EE2StatusRange, EE2Authentication, @@ -251,6 +252,13 @@ def check_as_concierge(self): # at this point since MongoEngine creates a global connection to MongoDB # and makes it available to all the model objects. + def save_jobs(self, jobs: List[Job]) -> List[str]: + """ + Save multiple jobs to the Mongo DB at once, and return all of the job ids + """ + job_ids = self.get_mongo_util().insert_jobs(jobs_to_insert=jobs) + return [str(job_id) for job_id in job_ids] + def save_job(self, job: Job) -> str: """ Save a job record to the Mongo database and return the job's ID as a string. @@ -258,6 +266,14 @@ def save_job(self, job: Job) -> str: job.save() return str(job.id) + def add_child_jobs(self, batch_job: Job, child_jobs: List[str]): + """ + Add child jobs to a batch job record in the Mongo Database and return the updated job. + :return: + """ + batch_job.modify(add_to_set__child_jobs=child_jobs) + return batch_job + def save_and_return_job(self, job: Job) -> Job: """ Save a job record to the Mongo database and return the updated job. 
diff --git a/lib/execution_engine2/utils/KafkaUtils.py b/lib/execution_engine2/utils/KafkaUtils.py index ec2b07f7c..afb1a9473 100644 --- a/lib/execution_engine2/utils/KafkaUtils.py +++ b/lib/execution_engine2/utils/KafkaUtils.py @@ -212,11 +212,8 @@ def send_kafka_message(self, message, topic: str = DEFAULT_TOPIC): ) # TODO Remove POLL? producer.poll(2) - logger.debug( - f"Successfully sent message to kafka at topic={topic} message={json.dumps(message.__dict__)} server_address={self.server_address}" - ) except Exception as e: - logger.debug( + logger.error( f"Failed to send message to kafka at topic={topic} message={json.dumps(message.__dict__)} server_address={self.server_address}" ) raise Exception(e) diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index bbc16c465..15ee0682a 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -162,7 +162,7 @@ def test_regular_user(self): ws_auth.can_write.assert_called_once_with(self.ws_id) # RUNJOB BUT ATTEMPT TO BE AN ADMIN - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_user ): runner.run_job(params=job_params_1, as_admin=True) @@ -172,7 +172,7 @@ def test_regular_user(self): self.assertEqual(params["method"], job_params_1["method"]) # get_job_params BUT ATTEMPT TO BE AN ADMIN - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_user ): runner.get_job_params(job_id=job_id, as_admin=True) @@ -188,12 +188,12 @@ def test_regular_user(self): runner.view_job_logs(job_id=job_id) # add_job_logs and view them, BUT ATTEMPT TO BE AN ADMIN - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_user ): runner.add_job_logs(job_id=job_id, log_lines=lines, as_admin=True) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( 
expected_exception=PermissionError, expected_regex=lowly_user ): runner.view_job_logs(job_id=job_id, as_admin=True) @@ -288,7 +288,7 @@ def test_no_user(self): method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=RuntimeError, expected_regex=r"ServerError\('Token validation failed: Login failed! Server responded with code 401 Unauthorized'\)", ): @@ -317,7 +317,7 @@ def test_admin_reader(self): self.assertEqual(admin_type, {"permission": "r"}) # RUNJOB - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_admin ): runner.run_job(params=job_params_1, as_admin=True) diff --git a/test/tests_for_db/ee2_MongoUtil_test.py b/test/tests_for_db/ee2_MongoUtil_test.py index fc0e276bf..9591f16a1 100644 --- a/test/tests_for_db/ee2_MongoUtil_test.py +++ b/test/tests_for_db/ee2_MongoUtil_test.py @@ -2,12 +2,13 @@ import logging import os import unittest -from configparser import ConfigParser +from datetime import datetime from bson.objectid import ObjectId from execution_engine2.db.MongoUtil import MongoUtil -from execution_engine2.db.models.models import Job, JobLog +from execution_engine2.db.models.models import Job, JobLog, Status +from execution_engine2.sdk.EE2Runjob import JobIdPair from test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -57,6 +58,87 @@ def test_init_ok(self): mongo_util = self.getMongoUtil() self.assertTrue(set(class_attri) <= set(mongo_util.__dict__.keys())) + def test_insert_jobs(self): + """Check to see that jobs are inserted into mongo""" + job = get_example_job(status=Status.created.value) + job2 = get_example_job(status=Status.created.value) + jobs_to_insert = [job, job2] + job_ids = self.getMongoUtil().insert_jobs(jobs_to_insert) + assert len(job_ids) == len(jobs_to_insert) + retrieved_jobs = 
self.getMongoUtil().get_jobs(job_ids=job_ids) + + for i, retrieved_job in enumerate(retrieved_jobs): + assert jobs_to_insert[i].to_json() == retrieved_job.to_json() + + def test_update_jobs_enmasse(self): + """Check to see that created jobs get updated to queued""" + for state in Status: + job = get_example_job(status=Status.created.value, scheduler_id=None) + job2 = get_example_job(status=state.value, scheduler_id=None) + job3 = get_example_job(status=state.value, scheduler_id=None) + jobs = [job, job2, job3] + + for j in jobs: + j.scheduler_id = None + j.save() + assert j.scheduler_id is None + + job_ids = [job.id, job2.id, job3.id] + scheduler_ids = ["humpty", "dumpty", "alice"] + jobs_to_update = list(map(JobIdPair, job_ids, scheduler_ids)) + + now_ms = datetime.utcnow().timestamp() + + self.getMongoUtil().update_jobs_to_queued(jobs_to_update) + job.reload() + job2.reload() + job3.reload() + + # Check that sched ids are set + for i, val in enumerate(scheduler_ids): + assert jobs[i].scheduler_id == val + assert jobs[i].scheduler_type == "condor" + + # Checks that a timestamp in seconds since the epoch is within a second of the current time. 
+ for j in jobs: + assert now_ms + 1 > j.updated + assert now_ms - 1 < j.updated + + # First job always should transition to queued + assert job.status == Status.queued.value + + # Created jobs should transition + if state.value == Status.created.value: + assert all(j.status == Status.queued.value for j in [job, job2, job3]) + + else: + # Don't change their state + assert all(j.status == state.value for j in [job2, job3]) + + def test_update_jobs_enmasse_bad_job_pairs(self): + job = get_example_job(status=Status.created.value).save() + job2 = get_example_job(status=Status.created.value).save() + job3 = get_example_job(status=Status.created.value).save() + job_ids = [job.id, job2.id, job3.id] + scheduler_ids = [job.scheduler_id, job2.scheduler_id, None] + job_id_pairs = list(map(JobIdPair, job_ids, scheduler_ids)) + + with self.assertRaisesRegex( + expected_exception=ValueError, + expected_regex=f"Provided a bad job_id_pair, missing scheduler_id for {job3.id}", + ): + self.getMongoUtil().update_jobs_to_queued(job_id_pairs) + + job_ids = [job.id, job2.id, None] + scheduler_ids = [job.scheduler_id, job2.scheduler_id, job3.scheduler_id] + job_id_pairs = list(map(JobIdPair, job_ids, scheduler_ids)) + + with self.assertRaisesRegex( + expected_exception=ValueError, + expected_regex=f"Provided a bad job_id_pair, missing job_id for {job3.scheduler_id}", + ): + self.getMongoUtil().update_jobs_to_queued(job_id_pairs) + def test_get_by_cluster(self): """Get a job by its condor scheduler_id""" mongo_util = self.getMongoUtil() @@ -67,7 +149,6 @@ def test_get_by_cluster(self): self.assertEqual(str(job_id), batch) def test_get_job_ok(self): - mongo_util = self.getMongoUtil() with mongo_util.mongo_engine_connection(): @@ -142,7 +223,6 @@ def test_get_job_ok(self): self.assertEqual(ori_job_count, Job.objects.count()) def test_get_jobs_ok(self): - mongo_util = self.getMongoUtil() with mongo_util.mongo_engine_connection(): @@ -199,7 +279,6 @@ def test_get_jobs_ok(self): 
self.assertEqual(ori_job_count, Job.objects.count()) def test_connection_ok(self): - mongo_util = self.getMongoUtil() with mongo_util.mongo_engine_connection(): @@ -341,7 +420,6 @@ def test_delete_one_ok(self): self.assertEqual(col.count_documents({}), doc_count) def test_get_job_log_pymongo_ok(self): - mongo_util = self.getMongoUtil() primary_key = ObjectId() diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 3698c9124..3794104c9 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -14,13 +14,19 @@ from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.MongoUtil import MongoUtil -from execution_engine2.db.models.models import Job, JobInput, JobRequirements, Meta +from execution_engine2.db.models.models import ( + Job, + JobInput, + JobRequirements, + Meta, + Status, +) from execution_engine2.exceptions import ( IncorrectParamsException, AuthError, InvalidParameterForBatch, ) -from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions +from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions, JobIdPair from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.sdk.job_submission_parameters import ( JobSubmissionParameters, @@ -72,7 +78,6 @@ _CLUSTER_1 = "cluster1" _CLUSTER_2 = "cluster2" - _EMPTY_JOB_REQUIREMENTS = { "cpus": None, "memory_MB": None, @@ -529,7 +534,6 @@ def test_run_job_as_concierge_sched_reqs_empty_list_as_admin(): def _run_as_concierge_empty_as_admin(concierge_params, app): - # Set up data variables client_group = "concierge" # hardcoded default for run_as_concierge cpus = 1 @@ -763,7 +767,7 @@ def _run_and_run_batch_fail( _run_batch_fail(rj, [params], {}, as_admin, expected) -def _set_up_common_return_values_batch(mocks): +def _set_up_common_return_values_batch(mocks, returned_job_state=_QUEUED_STATE): """ Set up return values on mocks that are the same for 
several tests. """ @@ -773,13 +777,13 @@ def _set_up_common_return_values_batch(mocks): returned_parent_job = Job() returned_parent_job.id = ObjectId(_JOB_ID) returned_parent_job.user = _USER + + mocks[SDKMethodRunner].save_and_return_job.return_value = returned_parent_job mocks[CatalogCache].lookup_git_commit_version.side_effect = [ _GIT_COMMIT_1, _GIT_COMMIT_2, ] - mocks[SDKMethodRunner].save_and_return_job.return_value = returned_parent_job - # create job1, update job1, create job2, update job2, update parent job mocks[SDKMethodRunner].save_job.side_effect = [ _JOB_ID_1, @@ -788,6 +792,11 @@ def _set_up_common_return_values_batch(mocks): None, None, ] + + mocks[SDKMethodRunner].save_jobs.side_effect = [ + [_JOB_ID_1, _JOB_ID_2], + ] + mocks[Condor].run_job.side_effect = [ SubmissionInfo(_CLUSTER_1, {}, None), SubmissionInfo(_CLUSTER_2, {}, None), @@ -798,10 +807,25 @@ def _set_up_common_return_values_batch(mocks): retjob_2 = Job() retjob_2.id = ObjectId(_JOB_ID_2) retjob_2.status = _CREATED_STATE + + retjob_1_after_submit = Job() + retjob_1_after_submit.id = ObjectId(_JOB_ID_1) + retjob_1_after_submit.status = returned_job_state + retjob_1_after_submit.scheduler_id = _CLUSTER_1 + retjob_2_after_submit = Job() + retjob_2_after_submit.id = ObjectId(_JOB_ID_2) + retjob_2_after_submit.status = returned_job_state + retjob_2_after_submit.scheduler_id = _CLUSTER_2 + mocks[MongoUtil].get_job.side_effect = [retjob_1, retjob_2] + mocks[MongoUtil].get_jobs.side_effect = [ + [retjob_1_after_submit, retjob_2_after_submit] + ] -def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid): +def _check_common_mock_calls_batch( + mocks, reqs1, reqs2, parent_wsid, terminated_during_submit=False +): """ Check that mocks are called as expected when those calls are similar or the same for several tests. 
@@ -834,7 +858,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid): ] ) - assert len(sdkmr.save_job.call_args_list) == 5 + assert len(sdkmr.save_jobs.call_args_list) == 1 # initial child jobs data save expected_job_1 = _create_job( @@ -846,7 +870,7 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid): wsid=parent_wsid, batch_id=_JOB_ID, ) - got_job_1 = sdkmr.save_job.call_args_list[0][0][0] + got_job_1 = sdkmr.save_jobs.call_args_list[0][0][0][0] assert_jobs_equal(got_job_1, expected_job_1) expected_job_2 = _create_job( @@ -857,8 +881,8 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid): wsid=parent_wsid, batch_id=_JOB_ID, ) - # index 2 because job 1 is updated with save_job before this job is created - got_job_2 = sdkmr.save_job.call_args_list[2][0][0] + # index 1 because save_jobs returns a list of two jobs + got_job_2 = sdkmr.save_jobs.call_args_list[0][0][0][1] assert_jobs_equal(got_job_2, expected_job_2) jsp_expected_1 = JobSubmissionParameters( @@ -882,38 +906,53 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid): [call(params=jsp_expected_1), call(params=jsp_expected_2)] ) - # updated job data save - mocks[MongoUtil].get_job.assert_has_calls([call(_JOB_ID_1), call(_JOB_ID_2)]) - # update to queued state - got_queued_job_1 = sdkmr.save_job.call_args_list[1][0][0] - got_queued_job_2 = sdkmr.save_job.call_args_list[3][0][0] - _check_queued_job_save(got_queued_job_1, _JOB_ID_1, _CLUSTER_1) - _check_queued_job_save(got_queued_job_2, _JOB_ID_2, _CLUSTER_2) - - mocks[KafkaClient].send_kafka_message.assert_has_calls( - [ - call(KafkaCreateJob(job_id=_JOB_ID, user=_USER)), # parent job - call(KafkaCreateJob(job_id=_JOB_ID_1, user=_USER)), - call( - KafkaQueueChange( - job_id=_JOB_ID_1, - new_status=_QUEUED_STATE, - previous_status=_CREATED_STATE, - scheduler_id=_CLUSTER_1, - ) - ), - call(KafkaCreateJob(job_id=_JOB_ID_2, user=_USER)), - call( - KafkaQueueChange( - job_id=_JOB_ID_2, - 
new_status=_QUEUED_STATE, - previous_status=_CREATED_STATE, - scheduler_id=_CLUSTER_2, - ) - ), - ] - ) + child_job_pairs = [ + JobIdPair(_JOB_ID_1, _CLUSTER_1), + JobIdPair(_JOB_ID_2, _CLUSTER_2), + ] + mocks[MongoUtil].update_jobs_to_queued.assert_has_calls([call(child_job_pairs)]) + job_ids = [child_job_pair.job_id for child_job_pair in child_job_pairs] + mocks[MongoUtil].get_jobs.assert_has_calls([call(job_ids)]) + + if not terminated_during_submit: + mocks[KafkaClient].send_kafka_message.assert_has_calls( + [ + call(KafkaCreateJob(job_id=_JOB_ID, user=_USER)), # parent job + call(KafkaCreateJob(job_id=_JOB_ID_1, user=_USER)), + call(KafkaCreateJob(job_id=_JOB_ID_2, user=_USER)), + call( + KafkaQueueChange( + job_id=_JOB_ID_1, + new_status=_QUEUED_STATE, + previous_status=_CREATED_STATE, + scheduler_id=_CLUSTER_1, + ) + ), + call( + KafkaQueueChange( + job_id=_JOB_ID_2, + new_status=_QUEUED_STATE, + previous_status=_CREATED_STATE, + scheduler_id=_CLUSTER_2, + ) + ), + ] + ) + else: + mocks[KafkaClient].send_kafka_message.assert_has_calls( + [ + call(KafkaCreateJob(job_id=_JOB_ID, user=_USER)), # parent job + call(KafkaCreateJob(job_id=_JOB_ID_1, user=_USER)), + call(KafkaCreateJob(job_id=_JOB_ID_2, user=_USER)), + ] + ) + mocks[SDKMethodRunner].cancel_job.assert_has_calls( + [ + call(job_id=_JOB_ID_1, terminated_code=0), + call(job_id=_JOB_ID_2, terminated_code=0), + ] + ) # Removed for now, but might be added back in if run_job_message is re-added # mocks[SlackClient].run_job_message.assert_has_calls( @@ -923,12 +962,114 @@ def _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid): # ] # ) - final_expected_parent_job = Job() - final_expected_parent_job.id = ObjectId(_JOB_ID) - final_expected_parent_job.user = _USER - final_expected_parent_job.child_jobs = [_JOB_ID_1, _JOB_ID_2] - final_got_parent_job = sdkmr.save_job.call_args_list[4][0][0] - assert_jobs_equal(final_got_parent_job, final_expected_parent_job) + # Test to see if add_child jobs is 
called with correct batch_container and children + expected_batch_container = Job() + expected_batch_container.id = ObjectId(_JOB_ID) + expected_batch_container.user = _USER + + batch_job = sdkmr.add_child_jobs.call_args_list[0][1]["batch_job"] + sdkmr.add_child_jobs.assert_called_once_with( + batch_job=expected_batch_container, child_jobs=[_JOB_ID_1, _JOB_ID_2] + ) + """ + So this test doesn't actually check that the call is correct, but the assert_jobs_equal line below does + the assert below is necessary because of how equality works for Job objects + ( because they have the same object ID, which is what Job equality is based on. ) + and that the assert_called_once_with doesn't correctly check the job object + """ + assert_jobs_equal(batch_job, expected_batch_container) + + +def test_run_job_batch_with_cancellation_during_submit(): + """ + A basic unit test of the run_batch() method, providing a workspace ID for the parent job. This one also checks for + cancellation during submit causing a job cancellation request to be processed . + + This test is a fairly minimal test of the run_batch() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. + """ + # When an assertion is failed, this test doesn't show you where failed in PyCharm, so use + # Additional arguments `--no-cov -s` or run from cmd line + # PYTHONPATH=.:lib:test pytest test/tests_for_sdkmr/EE2Runjob_test.py::test_run_job_batch_with_parent_job_wsid --no-cov + + # set up variables + parent_wsid = 89 + wsid = 32 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. 
These calls are in the order they occur in the code + mocks[WorkspaceAuth].can_write.return_value = True + mocks[WorkspaceAuth].can_write_list.return_value = {wsid: True} + + jrr.normalize_job_reqs.side_effect = [{}, {}] + jrr.get_requirements_type.side_effect = [ + RequirementsType.STANDARD, + RequirementsType.STANDARD, + ] + reqs1 = ResolvedRequirements( + cpus=1, + memory_MB=2, + disk_GB=3, + client_group="cg1", + ) + reqs2 = ResolvedRequirements( + cpus=10, + memory_MB=20, + disk_GB=30, + client_group="cg2", + ) + jrr.resolve_requirements.side_effect = [reqs1, reqs2] + + _set_up_common_return_values_batch( + mocks, returned_job_state=Status.terminated.value + ) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = [ + { + "method": _METHOD_1, + "app_id": _APP_1, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + }, + { + "method": _METHOD_2, + "app_id": _APP_2, + "wsid": wsid, + }, + ] + params[1]["wsid"] = None + assert rj.run_batch(params, {"wsid": parent_wsid}) == { + "batch_id": _JOB_ID, + "child_job_ids": [_JOB_ID_1, _JOB_ID_2], + } + + # check mocks called as expected. The order here is the order that they're called in the code. 
+ mocks[WorkspaceAuth].can_write.assert_called_once_with(parent_wsid) + + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.assert_has_calls( + [call({}, "input job"), call({}, "input job")] + ) + jrr.get_requirements_type.assert_has_calls( + [call(**_EMPTY_JOB_REQUIREMENTS), call(**_EMPTY_JOB_REQUIREMENTS)] + ) + jrr.resolve_requirements.assert_has_calls( + [ + call(_METHOD_1, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + call(_METHOD_2, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + ] + ) + _check_common_mock_calls_batch( + mocks, reqs1, reqs2, parent_wsid, terminated_during_submit=True + ) def test_run_job_batch_with_parent_job_wsid(): diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index a90044703..ce98c0fb9 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -19,9 +19,13 @@ from execution_engine2.authorization.workspaceauth import WorkspaceAuth from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.models.models import Job, Status, TerminatedCode from execution_engine2.exceptions import AuthError +from execution_engine2.exceptions import InvalidStatusTransitionException +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.sdk.job_submission_parameters import JobRequirements from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.CondorTuples import SubmissionInfo from execution_engine2.utils.KafkaUtils import KafkaClient from execution_engine2.utils.SlackUtils import SlackClient from execution_engine2.utils.clients import UserClientSet, ClientSet @@ -30,10 +34,6 @@ JobRequirementsResolver, RequirementsType, ) -from lib.execution_engine2.db.models.models import Job, Status, TerminatedCode -from lib.execution_engine2.exceptions import InvalidStatusTransitionException -from lib.execution_engine2.sdk.SDKMethodRunner import 
SDKMethodRunner -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper from test.utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS from test.utils_shared.test_utils import ( @@ -48,7 +48,7 @@ logging.basicConfig(level=logging.INFO) bootstrap() -from lib.execution_engine2.sdk.EE2Runjob import EE2RunJob +from execution_engine2.sdk.EE2Runjob import EE2RunJob from installed_clients.CatalogClient import Catalog from installed_clients.WorkspaceClient import Workspace @@ -174,7 +174,7 @@ def test_getters(self): is clients_and_mocks[JobRequirementsResolver] ) - def test_save_job(self): + def test_save_job_and_save_jobs(self): ws = Workspace("https://fake.com") wsa = WorkspaceAuth("user", ws) cliset = UserClientSet("user", "token", ws, wsa) @@ -190,9 +190,31 @@ def test_save_job(self): j = create_autospec(Job, spec_set=False, instance=True) j.id = bson.objectid.ObjectId("603051cfaf2e3401b0500982") assert sdkmr.save_job(j) == "603051cfaf2e3401b0500982" - j.save.assert_called_once_with() + # Test Save Jobs + job1 = Job() + job1.id = bson.objectid.ObjectId("603051cfaf2e3401b0500980") + job2 = Job() + job2.id = bson.objectid.ObjectId("603051cfaf2e3401b0500981") + sdkmr.get_mongo_util().insert_jobs.return_value = [job1.id, job2.id] + jobs = sdkmr.save_jobs([job1, job2]) + sdkmr.get_mongo_util().insert_jobs.assert_called_with( + jobs_to_insert=[job1, job2] + ) + assert jobs == [str(job1.id), str(job2.id)] + + def test_add_child_jobs(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + cliset = UserClientSet("user", "token", ws, wsa) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + sdkmr = SDKMethodRunner(cliset, clients_and_mocks[ClientSet]) + j = create_autospec(Job, spec_set=False, instance=True) + returned_job = sdkmr.add_child_jobs(batch_job=j, child_jobs=["a", "b", "c"]) + 
j.modify.assert_called_once_with(add_to_set__child_jobs=["a", "b", "c"]) + assert returned_job == j + def test_save_and_return_job(self): ws = Workspace("https://fake.com") wsa = WorkspaceAuth("user", ws) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 96ecd7349..f30c153f7 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -334,7 +334,7 @@ def test_retry_job_multiple(self, rq_mock, condor_mock): "'123' is not a valid ObjectId, it must be a 12-byte input or a 24-character " "hex string" ) - with self.assertRaisesRegexp(RetryFailureException, errmsg): + with self.assertRaisesRegex(RetryFailureException, errmsg): runner.retry_multiple(job_ids=[parent_job_id1, 123]) # 3. Retry the jobs with duplicate job ids @@ -382,7 +382,7 @@ def test_retry_job_multiple(self, rq_mock, condor_mock): self.check_retry_job_state(parent_job_id4, job4["job_id"]) # Test no job ids - with self.assertRaisesRegexp(ValueError, "No job_ids provided to retry"): + with self.assertRaisesRegex(ValueError, "No job_ids provided to retry"): runner.retry_multiple(job_ids=None) # Test error during retry, but passing validate diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index 98e73cf70..f356d8ce0 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -131,7 +131,7 @@ def test_check_job(self, rq_mock, condor_mock): @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_run_job_and_handle_held(self, rq_mock, condor_mock): """ - Run a job, then call it held as an admin, and then check to see if the record contains condor info about the job + Run a job, then call it held as an admin, and then check to 
see if the record is set to error or terminated :param rq_mock: :param condor_mock: :return: @@ -157,8 +157,15 @@ def test_run_job_and_handle_held(self, rq_mock, condor_mock): print( f"Job id is {job_id}. Status is {check_job.get('status')} Cluster is {check_job.get('scheduler_id')} " ) + self.assertEqual(check_job.get("status"), Status.queued.value) job_record = runner.handle_held_job(cluster_id=check_job.get("scheduler_id")) - self.assertEqual(self.fake_used_resources, job_record.get("condor_job_ads")) + # This flaky test changes depending on your test environment + self.assertIn( + job_record.get("status"), [Status.terminated.value, Status.error.value] + ) + # Condor ads are actually wrong and should only be updated after the job is completed, + # so we don't need to check them in this test right now. + # See EE2 issue #251 def test_update_job_status(self): runner = self.getRunner() diff --git a/test/tests_for_sdkmr/ee2_kafka_test.py b/test/tests_for_sdkmr/ee2_kafka_test.py index ebd73a845..60856d18c 100644 --- a/test/tests_for_sdkmr/ee2_kafka_test.py +++ b/test/tests_for_sdkmr/ee2_kafka_test.py @@ -29,7 +29,7 @@ def setUpClass(cls): def test_status_change(self): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=TypeError, expected_regex=r"__init__\(\) missing 1 required positional argument: 'scheduler_id'", ): diff --git a/test/utils_shared/producer.py b/test/utils_shared/producer.py index b90926da4..0ff2f5ace 100644 --- a/test/utils_shared/producer.py +++ b/test/utils_shared/producer.py @@ -22,11 +22,8 @@ def send_kafka_message(self, message, topic=DEFAULT_TOPIC): producer = Producer({"bootstrap.servers": self.server_address}) producer.produce(topic, str(message), callback=_delivery_report) producer.poll(2) - logging.info( - f"Successfully sent message to kafka at topic={topic} message={json.dumps(message)} server_address={self.server_address}" - ) except Exception as e: - logging.info( + logging.error( f"Failed to send message 
to kafka at topic={topic} message={json.dumps(message)} server_address={self.server_address}" ) raise Exception(e) From d6fc1ccc700a8bc67350180a49f45e5c8c160c0d Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Mon, 9 Aug 2021 13:14:21 -0700 Subject: [PATCH 101/109] DATAUP-434 retry ADR3 review (#409) * remove ADR3 * readd ADR3 * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md * Update 003-Retry_endpoint_design.md Co-authored-by: bio-boris Co-authored-by: bio-boris --- docs/adrs/003-Retry_endpoint_design.md | 62 ++++++++++++++------------ 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/docs/adrs/003-Retry_endpoint_design.md b/docs/adrs/003-Retry_endpoint_design.md index 8ca66257a..e5a992b24 100644 --- a/docs/adrs/003-Retry_endpoint_design.md +++ b/docs/adrs/003-Retry_endpoint_design.md @@ -7,7 +7,7 @@ Date: 2021-05-19 The current requirement for the Batch/Bulk UI is to be able to retry jobs that have either "errored" out, or were terminated. The UI allows you to retry either single jobs, or multiple jobs, and saves you from having to cancel and resubmit each job individually, -which is not currently implemented in the UI anyway. +which is not currently implemented in the UI anyway. ### Motivation for the `code spike` for retry endpoint and follow up design ADR >As I mentioned, as the product owner, I find our ability to deliver functionality to be pretty awful. 
@@ -25,6 +25,7 @@ The current implementation of retry is to run jobs using the `retry_job` or `ret The endpoint takes a job or list of job ids and then attempts to resubmit them to the queue, using the exact same set of parameters and version of the app. ### Current Behavior + * Spec file is located at https://github.com/kbase/execution_engine2/blob/8baab8e3ac5212f4bbe59fd935980aa41b4ee06d/execution_engine2.spec#L201-L247 * A job id is provided. If there are sufficient permissions, the call will proceed, if not, it will error out, unless the `as_admin` flag is provided by an admin @@ -37,7 +38,8 @@ The endpoint takes a job or list of job ids and then attempts to resubmit them t ### Batch Behavior -* If a job has the attribute of `batch_job=True` the retry will fail, since there is no method to re-run. This is a bug, as it doesn't fail gracefully. + +* If a job has the attribute of `batch_job=True` the retry will fail, since there is no method to re-run. This is a bug, as it doesn't fail gracefully. Gracefully handling jobs with children means that it won't throw an error about not having a method to re-run, and instead will throw an error that says "Cannot retry batch job parents. Must retry individual jobs" * If a job has the attribute of `batch_job=True`, but is actually a child job, the parent will be notified of this new retried job * Multiple in-flight retries are allowed. * Adds `child_job_id` to `parent_job_id.child_job_ids[]` @@ -61,27 +63,31 @@ is a lot of time for things to go wrong. 
* Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) * Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) * One single submission to HTCondor instead of multiple job submissions -* Ability to gracefully handle jobs with children -* Ability to handle database consistentcy during retry failure +* Ability to gracefully handle batch container jobs with children to throw proper error ([See Batch Behavior](#Batch-Behavior)) +* Ability to handle database consistency during retry failure * See if we can make some preflight (before the job starts) checks fail before job submission and handle them differently than those that appear during job submission #### Data inconsistency * A new `retry_ids` field will show a list of jobs that have been retried using this parent id. Retry_count will be returned as a calculated field based off of retry_ids -* `retry_toggle` field will allow a seperate process to check for jobs that didn't finish the entire retry lifecycle: +* `retry_toggle` field will allow a separate process to check and possibly correct for jobs that didn't finish the entire retry lifecycle: 1) Launch child jobs 2) Notify the batch parent of the child, 3) Notify the retry parent of the child, 4) Update the retry_toggle field #### Won't do -* Prevent multiple in-flight retries of the same original job to prevent the user from wasting their own resources (and the queues resources) * Add retry_number field ## New priority + +For MVP * Create a retry_jobs field, and expose list in api, and a T/F completeness toggle +* Add failure conditions in run method to fail before creating db records + +Not for mvp * Non blocking job submission / (Possibly htcondor submit) -* Add failure conditions in run method -* Add thread to perform actions based on toggle +* Add thread/reaper to perform actions based on toggle + ### Questions @@ -104,7 
+110,7 @@ Looks like the options are * implement db integrity checks and two-phase commits for making the relationships between a job, its `retry_parent`, and the batch container * accept that the db info may be incomplete and write workarounds into the clients * (upgrade to Mongo 4.4 for better transaction support) - +A: We have decided to use a `retry_toggle` in order to mark that the entire transaction has occurred for a retry job, and to set up a monitor to fix the jobs that didn't finish the retry lifecycle. ##### Q: Do we want to support ResourceRequirements A: Probably not in the short term @@ -116,7 +122,7 @@ A: Not necessarily relevant to this endpoint, more of a `run_job_batch` endpoint #### Shorter Q and A Should we track a retry count? (Done) - Should users see this retry count? (Unknown TBD) + Should users see this retry count? A: Visible in the EE2 API, UI is TBD Are retried jobs saved in some sort of data structure linking them, possibly indirectly, to the parent job or are they orphaned? (Yes, retry_parent) If the former, is the retry relationship linear or a tree? E.g. what happens if there are two simultaneous calls to retry a job? (Tree, simultaneous jobs run) Should it be at least theoretically possible to see the list of retried jobs in order? (It is possible by sorting on creation date) @@ -124,39 +130,39 @@ A: Not necessarily relevant to this endpoint, more of a `run_job_batch` endpoint Can a job in states other than failed or canceled be retried? Or should the user be required to cancel a job before it can be retried? 
(Job must be in Error/Cancel state) -# Work estimation +# Work estimation for MVP Priority descending -* Address data inconsistency via retry_count, retry_ids and retry_toggle + +### Address data inconsistency via retry_count, retry_ids and retry_toggle > Estimate 3-4 days > https://kbase-jira.atlassian.net/browse/DATAUP-461 -* Preflight checks +### Preflight checks > Estimate 3-4 days > https://kbase-jira.atlassian.net/browse/DATAUP-528 - -* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) -* > Estimate 3-4 days -> Requires refactor of run_job_batch to be non blocking - - > Requires retry to be able to force the same app `git_commit versions` and `JobRequirements` from the db records https://kbase-jira.atlassian.net/browse/DATAUP-461 +### Create a created jobs and queued jobs reaper that cancels created jobs older than 1 hour, and cancels queued jobs over 14 days old. +> Estimate 2-3 days +https://kbase-jira.atlassian.net/browse/DATAUP-536 -* Hookup retrys to refactored code -* Requires refactor of retry to gracefully handle jobs with children by notifying the batch containers for retry of ids not in the same batch +# Work estimation for POST MVP + +### Hookup retries to refactored code +* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_multiple` +* Requires refactor of retry to gracefully handle jobs with children by notifying the batch containers for retry of ids not in the same batch. If you retry jobs from batch 1 and from batch 2, you want the correct batch parent to be notified. 
+* Switching from starting the retried jobs one at a time to starting them in batch mode will require refactoring how the batch and retry parents are updated > Estimate 3 days > https://kbase-jira.atlassian.net/browse/DATAUP-535 - -* One single submission to HTCondor instead of multiple job submission () +Not for MVP +### One single submission to HTCondor instead of multiple job submission () > Estimate 1-2 days > https://kbase-jira.atlassian.net/browse/DATAUP-391 - -* Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) +Not for MVP +### Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) > Estimate 3-4 days https://kbase-jira.atlassian.net/browse/DATAUP-439 -* Create a created jobs and queued jobs reaper than queues created jobs older than 1 hour, and queued jobs over 14 days old. -> Estimate 2-3 days -https://kbase-jira.atlassian.net/browse/DATAUP-536 + From b239594591938ce55aaf9dbe37e6f95a38db9a3b Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 9 Aug 2021 15:22:56 -0500 Subject: [PATCH 102/109] DATAUP-536 Bad jobs reaper (#412) * skeleton * skeleton * updated reaper works * Fix broken condor deploy reference * Setup cron * Setup cronjob * Fix retryable writes * Fix retryable writes * Fix missing env vars * Fix missing env vars * Fix env * Fix env and format * Fix env and format * Fix env and format * Fix env * Move crontab to cron * Fix cron log * Fix cron * Fix cron * Fix docker hard links * Attempt touching file and using cron without service * pr feedback * adding config var * adding config var * Fixed constants * Fix path * PR feedback * ran black Co-authored-by: bio-boris --- Dockerfile | 9 +- README.md | 18 +++ bin/PurgeBadJobs.py | 127 ++++++++++++++++++++++ bin/PurgeHeldJobs.py | 26 ++--- bin/cron_vars | 2 + bin/ee2_cronjobs | 6 + build/templates/deploy.docker.cfg.templ | 1 + deploy.cfg | 1 + 
lib/execution_engine2/db/MongoUtil.py | 7 +- lib/execution_engine2/utils/SlackUtils.py | 6 +- scripts/entrypoint.sh | 20 ++-- test/deploy.cfg | 1 + 12 files changed, 191 insertions(+), 33 deletions(-) create mode 100644 bin/PurgeBadJobs.py create mode 100644 bin/cron_vars create mode 100644 bin/ee2_cronjobs diff --git a/Dockerfile b/Dockerfile index 252b32794..dfba2f2b5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,12 +43,14 @@ RUN echo "mongodb-org hold" | dpkg --set-selections \ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \ && bash ~/miniconda.sh -b -p /miniconda-latest +# Setup Cron +COPY ./bin/ee2_cronjobs /etc/cron.d/ee2_cronjobs + # Need to change startup scripts to match this in MAKEFILE ENV PATH=/miniconda-latest/bin:$PATH RUN pip install --upgrade pip && python -V - - COPY ./requirements.txt /kb/module/requirements.txt + RUN pip install -r /kb/module/requirements.txt RUN adduser --disabled-password --gecos '' -shell /bin/bash kbase # ----------------------------------------- @@ -63,7 +65,10 @@ WORKDIR /kb/module/scripts RUN chmod +x download_runner.sh && ./download_runner.sh WORKDIR /kb/module/ + +# Set deploy.cfg location ENV KB_DEPLOYMENT_CONFIG=/kb/module/deploy.cfg +ENV PATH=/kb/module:$PATH ENTRYPOINT [ "./scripts/entrypoint.sh" ] CMD [ ] diff --git a/README.md b/README.md index f7762d853..bd3f0d54c 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,24 @@ Note that the representation of this data in the catalog API is idiosyncratic - CSV data are split by commas into parts. EE2 will detect JSON entries and reconsitute them before deserialization. 
+ +# CronJobs/Reaper Scripts + +* Notifications are sent to the #ee_notifications slack channel + +### PurgeBadJobs +* Cronjobs are copied in and launched via the Dockerfile +* There are cronjobs configured in /etc/cron.d/ee2_cronjobs +* You can monitor them by reading the logs in /root/cron-purge.log + +### PurgeHeldJobs +* This is a daemon launched by entrypoint.sh +* It is not a cronjob because there is no easy way to seek through the HTCondor EXECUTE log, which takes a while to seek through + +#### Horizontal Scaling +* These scripts will have to be rethought if we do not want multiple copies running if ee2 is horizontally scaled. + + # Help Contact @Tianhao-Gu, @bio_boris, @briehl diff --git a/bin/PurgeBadJobs.py b/bin/PurgeBadJobs.py new file mode 100644 index 000000000..d2a182c84 --- /dev/null +++ b/bin/PurgeBadJobs.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# Script to purge jobs that have been queued for too long, or stuck in the created state for too long + +import logging +import os +from configparser import ConfigParser +from datetime import datetime, timedelta, timezone +from time import sleep + +import pymongo +from bson import ObjectId + +from lib.execution_engine2.db.models.models import TerminatedCode, Status +from lib.execution_engine2.utils.SlackUtils import SlackClient +from lib.installed_clients.execution_engine2Client import execution_engine2 + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +config = ConfigParser() +config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) +ee2_endpoint = config.get(section="execution_engine2", option="ee2-url") +slack_token = config.get(section="execution_engine2", option="slack-token") + +ee2 = execution_engine2(url=ee2_endpoint, token=os.environ["EE2_ADMIN_SERVICE_TOKEN"]) +slack_client = SlackClient( + slack_token, channel="#ee_notifications", debug=True, endpoint=ee2_endpoint +) +db_client = pymongo.MongoClient( + host=config.get(section="execution_engine2", option="mongo-host"), + 
port=int(config.get(section="execution_engine2", option="mongo-port")), + username=config.get(section="execution_engine2", option="mongo-user"), + password=config.get(section="execution_engine2", option="mongo-password"), + authSource=config.get(section="execution_engine2", option="mongo-database"), + authMechanism=config.get(section="execution_engine2", option="mongo-authmechanism"), + serverSelectionTimeoutMS=1000, +) +ee2_db = db_client.get_database( + config.get(section="execution_engine2", option="mongo-database") +) +ee2_jobs_collection = ee2_db.get_collection( + config.get(section="execution_engine2", option="mongo-jobs-collection") +) + +CREATED_MINUTES_AGO = 5 +QUEUE_THRESHOLD_DAYS = 14 + + +def cancel(record): + job_id = str(record["_id"]) + scheduler_id = record.get("scheduler_id") + cjp = { + "as_admin": True, + "job_id": job_id, + "terminated_code": TerminatedCode.terminated_by_automation.value, + } + print("About to cancel ee2 job", cjp) + ee2.cancel_job(params=cjp) + slack_client.cancel_job_message( + job_id=job_id, + scheduler_id=scheduler_id, + termination_code=TerminatedCode.terminated_by_automation.value, + ) + # Avoid rate limit of 1 msg per second + sleep(1) + + +def cancel_jobs_stuck_in_queue(): + """ + For jobs over 14 days old, cancel them + Update a completed Job as necessary to test this out: + ee2.update_job_status({'job_id': '601af2afeeb773acaf9de80d', 'as_admin': True, 'status': 'queued'}) + :return: + """ + queue_threshold_days = QUEUE_THRESHOLD_DAYS + before_days = ( + datetime.today() - timedelta(days=queue_threshold_days + 1) + ).timestamp() + print({"status": "queued", "queued": {"$lt": before_days}}) + stuck_jobs = ee2_jobs_collection.find( + {"status": Status.queued.value, "queued": {"$lt": before_days}} + ) + print( + f"Found {stuck_jobs.count()} jobs that were stuck in the {Status.queued.value} state over {queue_threshold_days} days" + ) + for record in stuck_jobs: + queued_time = record["queued"] + now = 
datetime.now(timezone.utc).timestamp() + elapsed = now - queued_time + print("queued days=", elapsed / 86000) + cancel(record) + + +def cancel_created(): + """ + For jobs that are not batch jobs, and have been in the created state for more than 5 minutes, uh oh, spaghettio, time to go + """ + + five_mins_ago = ObjectId.from_datetime( + datetime.now(timezone.utc) - timedelta(minutes=CREATED_MINUTES_AGO) + ) + stuck_jobs = ee2_jobs_collection.find( + {"status": "created", "_id": {"$lt": five_mins_ago}, "batch_job": {"$ne": True}} + ) + print( + f"Found {stuck_jobs.count()} jobs that were stuck in the {Status.created.value} state for over 5 mins" + ) + for record in stuck_jobs: + cancel(record) + + +def clean_retried_jobs(): + """Clean up jobs that couldn't finish the retry lifecycle""" + # TODO + + +def purge(): + cancel_jobs_stuck_in_queue() + cancel_created() + + +if __name__ == "__main__": + try: + purge() + except Exception as e: + slack_client.ee2_reaper_failure(endpoint=ee2_endpoint, e=e) + raise e diff --git a/bin/PurgeHeldJobs.py b/bin/PurgeHeldJobs.py index 294a7623b..b8e037646 100644 --- a/bin/PurgeHeldJobs.py +++ b/bin/PurgeHeldJobs.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import logging import os -import sys import time from configparser import ConfigParser from datetime import datetime, timedelta @@ -9,31 +8,20 @@ import htcondor -# I wish a knew a better way to do this -sys.path.append(".") - from lib.execution_engine2.utils.SlackUtils import SlackClient from lib.installed_clients.execution_engine2Client import execution_engine2 -from lib.execution_engine2.utils.Condor import Condor logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) config = ConfigParser() -config_filepath = os.environ["KB_DEPLOYMENT_CONFIG"] - -# Condor -condor = Condor(config_filepath=config_filepath) -# EE2 - -cfg = condor.config -ee2_endpoint = cfg.get(section="execution_engine2", option="ee2-url") - +config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) +ee2_endpoint = 
config.get(section="execution_engine2", option="ee2-url") +slack_token = config.get(section="execution_engine2", option="slack-token") ee2 = execution_engine2(url=ee2_endpoint, token=os.environ["EE2_ADMIN_SERVICE_TOKEN"]) -# Slack -slack_token = cfg.get(section="execution_engine2", option="slack-token") -# TODO change this channel -slack_client = SlackClient(slack_token, channel="#ee_notifications", debug=True) +slack_client = SlackClient( + slack_token, channel="#ee_notifications", debug=True, endpoint=ee2_endpoint +) def read_events(path): @@ -172,4 +160,4 @@ def handle_hold_event(event): ) time.sleep(5) except Exception as e: - slack_client.ee2_reaper_failure(endpoint=ee2_endpoint) + slack_client.ee2_reaper_failure(endpoint=ee2_endpoint, e=e) diff --git a/bin/cron_vars b/bin/cron_vars new file mode 100644 index 000000000..d7b9cec77 --- /dev/null +++ b/bin/cron_vars @@ -0,0 +1,2 @@ +EE2_ADMIN_SERVICE_TOKEN=$EE2_ADMIN_SERVICE_TOKEN +KB_DEPLOYMENT_CONFIG=$KB_DEPLOYMENT_CONFIG \ No newline at end of file diff --git a/bin/ee2_cronjobs b/bin/ee2_cronjobs new file mode 100644 index 000000000..036231a15 --- /dev/null +++ b/bin/ee2_cronjobs @@ -0,0 +1,6 @@ +SHELL=/bin/bash +BASH_ENV=/etc/environment +# Check the cron-purge.log for issues why the script isn't running, such as missing `EE2_ADMIN_SERVICE_TOKEN` + +# m h dom mon dow user command + * * * * * root . 
/etc/environment; /miniconda-latest/bin/python3 /kb/module/bin/PurgeBadJobs.py >> /root/cron-purge.log 2>&1 diff --git a/build/templates/deploy.docker.cfg.templ b/build/templates/deploy.docker.cfg.templ index 849e88219..a12b338cb 100644 --- a/build/templates/deploy.docker.cfg.templ +++ b/build/templates/deploy.docker.cfg.templ @@ -27,6 +27,7 @@ mongo-database = ee2 mongo-user = travis mongo-password = travis mongo-authmechanism = DEFAULT +mongo-retry-rewrites = False start-local-mongo = 0 diff --git a/deploy.cfg b/deploy.cfg index 8a685860c..9618cb902 100644 --- a/deploy.cfg +++ b/deploy.cfg @@ -25,6 +25,7 @@ mongo-database = {{ default .Env.mongodb_database "ee2" }} mongo-user = {{ default .Env.mongodb_user "" }} mongo-password = {{ default .Env.mongodb_pwd "" }} mongo-authmechanism = {{ default .Env.mongodb_auth_mechanism "DEFAULT" }} +mongo-retry-rewrites = {{ default .Env.mongodb_retry_rewrites "False" }} start-local-mongo = {{ default .Env.start_local_mongo "0" }} mongo-collection = legacy diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py index 0845cd84b..349b066bc 100644 --- a/lib/execution_engine2/db/MongoUtil.py +++ b/lib/execution_engine2/db/MongoUtil.py @@ -5,7 +5,6 @@ from contextlib import contextmanager from datetime import datetime from typing import Dict, List - from bson.objectid import ObjectId from mongoengine import connect, connection from pymongo import MongoClient, UpdateOne @@ -16,6 +15,8 @@ RecordNotFoundException, InvalidStatusTransitionException, ) + +from lib.execution_engine2.utils.arg_processing import parse_bool from execution_engine2.sdk.EE2Runjob import JobIdPair @@ -27,6 +28,7 @@ def __init__(self, config: Dict): self.mongo_database = config["mongo-database"] self.mongo_user = config["mongo-user"] self.mongo_pass = config["mongo-password"] + self.retry_rewrites = parse_bool(config["mongo-retry-rewrites"]) self.mongo_authmechanism = config["mongo-authmechanism"] self.mongo_collection = None 
self._start_local_service() @@ -42,6 +44,7 @@ def _get_pymongo_client(self): password=self.mongo_pass, authSource=self.mongo_database, authMechanism=self.mongo_authmechanism, + retryWrites=self.retry_rewrites, ) def _get_mongoengine_client(self) -> connection: @@ -53,7 +56,9 @@ def _get_mongoengine_client(self) -> connection: password=self.mongo_pass, authentication_source=self.mongo_database, authentication_mechanism=self.mongo_authmechanism, + retryWrites=self.retry_rewrites, ) + # This MongoDB deployment does not support retryable writes def _start_local_service(self): try: diff --git a/lib/execution_engine2/utils/SlackUtils.py b/lib/execution_engine2/utils/SlackUtils.py index 836c41dc2..5a8c13fa8 100644 --- a/lib/execution_engine2/utils/SlackUtils.py +++ b/lib/execution_engine2/utils/SlackUtils.py @@ -26,8 +26,8 @@ def held_job_message(self, held_job): message = f"Held Job Stats {held_job}" self.safe_chat_post_message(channel=self.channel, text=message) - def ee2_reaper_failure(self, endpoint="Unknown EE2 URL", job_id="Unknown"): - message = f"EE2 Held Job reaper failed for {endpoint} (job {job_id}). Please check it out" + def ee2_reaper_failure(self, endpoint="Unknown EE2 URL", job_id="Unknown", e=None): + message = f"EE2 Held Job reaper failed for {endpoint} (job {job_id}), {e}. 
Please check it out" self.safe_chat_post_message(channel=self.channel, text=message) def ee2_reaper_success( @@ -55,7 +55,7 @@ def cancel_job_message(self, job_id, scheduler_id, termination_code): if self.debug is False: return - message = f"scheduler_id:{scheduler_id} job_id:{job_id} has been canceled due to {termination_code} ({self.endpoint})" + message = f"scheduler_id:`{scheduler_id}` job_id:`{job_id}` has been canceled due to `{termination_code}` ({self.endpoint})" self.safe_chat_post_message(channel=self.channel, text=message) def finish_job_message(self, job_id, scheduler_id, finish_status, error_code=None): diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index 642ca69f3..2d5fbbab2 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -2,25 +2,29 @@ cp ./deploy.cfg ./work/config.properties -#condor_shared=condor_shared - - -if [ $# -eq 0 ] ; then +if [ $# -eq 0 ]; then useradd kbase - if [ "${POOL_PASSWORD}" ] ; then - /usr/sbin/condor_store_cred -p "${POOL_PASSWORD}" -f /etc/condor/password - chown kbase:kbase /etc/condor/password + if [ "${POOL_PASSWORD}" ]; then + /usr/sbin/condor_store_cred -p "${POOL_PASSWORD}" -f /etc/condor/password + chown kbase:kbase /etc/condor/password fi chown kbase /etc/condor/password + + # Copy downloaded JobRunner to a shared volume mount cp -rf /runner/JobRunner.tgz /condor_shared cp -rf ./scripts/execute_runner.sh /condor_shared + # Give permissions to transfer logs into here mkdir /condor_shared/runner_logs && chown kbase /condor_shared/runner_logs mkdir /condor_shared/cluster_logs && chown kbase /condor_shared/cluster_logs + # Save ENV Variables to file for cron and Remove _=/usr/bin/env + envsubst /etc/environment + chmod a+rw /etc/environment + service cron start sh ./scripts/start_server.sh -elif [ "${1}" = "test" ] ; then +elif [ "${1}" = "test" ]; then echo "Run Tests" make test diff --git a/test/deploy.cfg b/test/deploy.cfg index 0bfb9bc55..fa520f56c 100644 --- a/test/deploy.cfg +++ 
b/test/deploy.cfg @@ -28,6 +28,7 @@ mongo-database = ee2 mongo-user = travis mongo-password = travis mongo-authmechanism = DEFAULT +mongo-retry-rewrites = False # mongo-in-docker-compose = mini_kb_ci-mongo_1 # mongo-in-docker-compose = condor_mongo_1 From 77231e196fdb0e59b2c26ed903826b5da9ebf29a Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 13 Aug 2021 12:19:07 -0500 Subject: [PATCH 103/109] DATAUP-555 Dedupe and fail (#415) * Dedupe and fail * Fixing up * Fixing up tests * Add space Co-authored-by: bio-boris --- lib/execution_engine2/sdk/EE2Runjob.py | 30 +++++++--------- ...ee2_SDKMethodRunner_test_EE2Runjob_test.py | 35 ++++++++++++++----- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index caaa7cdbd..3087fadf1 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -6,7 +6,7 @@ """ import os import time -from collections import defaultdict +from collections import Counter from enum import Enum from typing import Optional, Dict, NamedTuple, Union, List, Any @@ -762,44 +762,38 @@ def retry_multiple( if not job_ids: raise ValueError("No job_ids provided to retry") - offending_ids = defaultdict(int) - for job_id in job_ids: - if job_ids.count(job_id) > 1: - offending_ids[job_id] += 1 - - if offending_ids.keys(): + offending_ids = [item for item, count in Counter(job_ids).items() if count > 1] + if offending_ids: raise ValueError( f"Retry of the same id in the same request is not supported." 
- f" Offending ids:{list(offending_ids.keys())} " + f" Offending ids: {offending_ids} " ) # Check all inputs before attempting to start submitting jobs retried_jobs = [] - jobs = [] - batch_jobs = [] for job_id in job_ids: + # Check for presubmission failures try: job, batch_job = self._validate_retry_presubmit( job_id=job_id, as_admin=as_admin ) - jobs.append(job) - batch_jobs.append(batch_job) except Exception as e: - raise RetryFailureException(e) - - # Submit all of the collected jobs - for i, job_id in enumerate(job_ids): + # Collect the presubmit error and don't submit the job + retried_jobs.append({"job_id": job_id, "error": f"{e}"}) + continue + # Presubmit worked, write to the db and submit try: retried_jobs.append( self._retry( job_id=job_id, - job=jobs[i], - batch_job=batch_jobs[i], + job=job, + batch_job=batch_job, as_admin=as_admin, ) ) except Exception as e: retried_jobs.append({"job_id": job_id, "error": f"{e}"}) + return retried_jobs @staticmethod diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index f30c153f7..af441a81d 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -319,11 +319,13 @@ def test_retry_job_multiple(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) + parent_job_id0 = runner.run_job(params=job) parent_job_id1 = runner.run_job(params=job) parent_job_id2 = runner.run_job(params=job) parent_job_id3 = runner.run_job(params=job) parent_job_id4 = runner.run_job(params=job) + runner.update_job_status(job_id=parent_job_id0, status=Status.terminated.value) runner.update_job_status(job_id=parent_job_id1, status=Status.terminated.value) runner.update_job_status(job_id=parent_job_id2, status=Status.error.value) runner.update_job_status(job_id=parent_job_id3, 
status=Status.terminated.value) @@ -334,8 +336,14 @@ def test_retry_job_multiple(self, rq_mock, condor_mock): "'123' is not a valid ObjectId, it must be a 12-byte input or a 24-character " "hex string" ) - with self.assertRaisesRegex(RetryFailureException, errmsg): - runner.retry_multiple(job_ids=[parent_job_id1, 123]) + errmsg2 = ( + "'1234' is not a valid ObjectId, it must be a 12-byte input or a 24-character " + "hex string" + ) + retry_results = runner.retry_multiple(job_ids=[1234, 123, parent_job_id0]) + assert retry_results[0] == {"job_id": 1234, "error": errmsg2} + assert retry_results[1] == {"job_id": 123, "error": errmsg} + assert retry_results[2]["job_id"] == parent_job_id0 # 3. Retry the jobs with duplicate job ids retry_candidates = ( @@ -344,7 +352,7 @@ def test_retry_job_multiple(self, rq_mock, condor_mock): parent_job_id1, parent_job_id2, ) - fail_msg = f"Retry of the same id in the same request is not supported. Offending ids:{[parent_job_id1, parent_job_id2]} " + fail_msg = f"Retry of the same id in the same request is not supported. 
Offending ids: {[parent_job_id1, parent_job_id2]} " with self.assertRaises(ValueError) as e: runner.retry_multiple(retry_candidates) @@ -614,11 +622,22 @@ def test_run_job_batch(self, rq_mock, condor_mock): job = runner.check_job(job_id=child_job_id) retry_count = job["retry_count"] - # Test to see if one input fails, so fail them all - with self.assertRaises(expected_exception=RetryFailureException): - runner.retry_multiple(job_ids=[child_job_id, "grail", "fail"]) - # Check to see other job wasn't retried - assert retry_count == runner.check_job(job_id=child_job_id)["retry_count"] + # Test to see if one input fails, so keep going + results = runner.retry_multiple(job_ids=[child_job_id, "grail", "fail"]) + assert results[0]["job_id"] == child_job_id + assert "error" in results[1] + assert "error" in results[2] + + # Check to see child_job_id was retried + assert retry_count + 1 == runner.check_job(job_id=child_job_id)["retry_count"] + + # Test for duplicates + with self.assertRaises(expected_exception=ValueError) as e: + runner.retry_multiple(job_ids=[1, 2, 2]) + assert ( + e.exception.args[0] + == "Retry of the same id in the same request is not supported. 
Offending ids: [2] " + ) @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) From 96b6f697088179f8144a67b673ad929bef69a259 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 27 Aug 2021 14:19:59 -0500 Subject: [PATCH 104/109] DATAUP-565 Throw it into a thread (#419) * Throw it into a thread * Throw it into a thread Co-authored-by: bio-boris --- lib/execution_engine2/sdk/EE2Runjob.py | 39 +++++++++++++------- test/tests_for_integration/api_to_db_test.py | 12 ++++++ test/tests_for_sdkmr/EE2Runjob_test.py | 7 ++++ 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 3087fadf1..ec6d3952c 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -5,6 +5,7 @@ """ import os +import threading import time from collections import Counter from enum import Enum @@ -264,20 +265,7 @@ def _prepare_to_run(self, params, concierge_params=None) -> JobSubmissionParamet ) return self._generate_job_submission_params(job_id, params) - def _run_multiple(self, runjob_params): - """ - Get the job records, bulk save them, then submit to condor. - If any condor submission fails, abort all of the jobs - :return: - """ - # Save records to db - job_records = [] - for runjob_param in runjob_params: - job_records.append( - self._init_job_rec(self.sdkmr.get_user_id(), runjob_param, save=False) - ) - job_ids = self.sdkmr.save_jobs(job_records) - + def _submit_multiple_wrapper(self, job_ids: list, runjob_params: List[Dict]): # Generate job submission params job_submission_params = [] for i, job_id in enumerate(job_ids): @@ -301,6 +289,29 @@ def _run_multiple(self, runjob_params): self._abort_multiple_jobs(job_ids) raise e + def _run_multiple(self, runjob_params: List[Dict]): + """ + Get the job records, bulk save them, then submit to condor. 
+ If any condor submission fails, abort all of the jobs + :return: + """ + # Save records to db + job_records = [] + for runjob_param in runjob_params: + job_records.append( + self._init_job_rec(self.sdkmr.get_user_id(), runjob_param, save=False) + ) + job_ids = self.sdkmr.save_jobs(job_records) + + # Start up job submission thread + # For testing, mock this out and check to see it is called with these params? + threading.Thread( + target=self._submit_multiple_wrapper, + kwargs={"runjob_params": runjob_params, "job_ids": job_ids}, + daemon=True, + ).start() + return job_ids + def _update_to_queued_multiple(self, job_ids, scheduler_ids): """ This is called during job submission. If a job is terminated during job submission, diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index ca67c539f..aebab5726 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -1204,6 +1204,10 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): } ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) + + # May need to increase sleep if thread takes too long + time.sleep(0.1) + batch_id = ret["batch_id"] job_id_1, job_id_2 = ret["child_job_ids"] @@ -1417,6 +1421,10 @@ def test_run_job_batch_with_no_batch_wsid(ee2_port, ws_controller, mongo_client) } ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) + + # May need to increase sleep if thread takes too long + time.sleep(0.1) + batch_id = ret["batch_id"] job_id_1, job_id_2 = ret["child_job_ids"] @@ -1614,6 +1622,10 @@ def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_cli job_batch_params = {"wsid": job_batch_wsid, "as_admin": "foo"} ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_WRITE_ADMIN) ret = 
ee2.run_job_batch([job1_params, job2_params], job_batch_params) + + # May need to increase sleep if thread takes too long + time.sleep(0.1) + batch_id = ret["batch_id"] job_id_1, job_id_2 = ret["child_job_ids"] diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 3794104c9..6dec768bf 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -5,6 +5,7 @@ # Incomplete by a long way. Will add more unit tests as they come up. import copy +import time from logging import Logger from typing import List, Dict, Any from unittest.mock import create_autospec, call @@ -1050,6 +1051,8 @@ def test_run_job_batch_with_cancellation_during_submit(): "batch_id": _JOB_ID, "child_job_ids": [_JOB_ID_1, _JOB_ID_2], } + # May need to increase sleep if thread takes too long + time.sleep(0.1) # check mocks called as expected. The order here is the order that they're called in the code. mocks[WorkspaceAuth].can_write.assert_called_once_with(parent_wsid) @@ -1143,6 +1146,8 @@ def test_run_job_batch_with_parent_job_wsid(): "batch_id": _JOB_ID, "child_job_ids": [_JOB_ID_1, _JOB_ID_2], } + # May need to increase sleep if thread takes too long + time.sleep(0.1) # check mocks called as expected. The order here is the order that they're called in the code. mocks[WorkspaceAuth].can_write.assert_called_once_with(parent_wsid) @@ -1249,6 +1254,8 @@ def test_run_job_batch_as_admin_with_job_requirements(): "batch_id": _JOB_ID, "child_job_ids": [_JOB_ID_1, _JOB_ID_2], } + # May need to increase sleep if thread takes too long + time.sleep(0.1) # check mocks called as expected. The order here is the order that they're called in the code. 
sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) From 01e3226c93064f1304f7ca5f14601835c8877ab1 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 7 Sep 2021 14:33:16 -0500 Subject: [PATCH 105/109] Update README.md (#421) --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index bd3f0d54c..3b584efa4 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![codecov](https://codecov.io/gh/kbase/execution_engine2/branch/develop/graph/badge.svg)](https://codecov.io/gh/kbase/execution_engine2) [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=kbase_execution_engine2&metric=alert_status)](https://sonarcloud.io/dashboard?id=kbase_execution_engine2) - This is a [KBase](https://kbase.us) module generated by the [KBase Software Development Kit (SDK)](https://github.com/kbase/kb_sdk). You will need to have the SDK installed to use this module. [Learn more about the SDK and how to use it](https://kbase.github.io/kb_sdk_docs/). 
From ee0d6412d6b198c4f24ed272f2de9ae81bed5930 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 15 Oct 2021 12:08:14 -0500 Subject: [PATCH 106/109] check_job_batch_bugaroo (#423) * check_job_batch_bugaroo * add a test * add a test * add a test * Fix test Co-authored-by: bio-boris --- .../execution_engine2Impl.py | 2 +- test/tests_for_integration/api_to_db_test.py | 29 +++++++++++++++ test/tests_for_sdkmr/ee2_load_test.py | 35 +++++++++++++++++++ test/utils_shared/test_utils.py | 7 ++-- 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 49e7b9b9a..5b6366de5 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -1288,7 +1288,7 @@ def check_job_batch(self, ctx, params): clients=self.clients, ) returnVal = mr.check_job_batch( - parent_job_id=params["job_id"], exclude_fields=params.get("exclude_fields", None), + batch_id=params["job_id"], exclude_fields=params.get("exclude_fields", None), as_admin=params.get('as_admin') ) #END check_job_batch diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py index aebab5726..c2943d1d0 100644 --- a/test/tests_for_integration/api_to_db_test.py +++ b/test/tests_for_integration/api_to_db_test.py @@ -1360,6 +1360,35 @@ def test_run_job_batch(ee2_port, ws_controller, mongo_client): sub_init, schedd_init, sub, schedd, txn, expected_sub_1, expected_sub_2 ) + # Check to see check_job_batch has both the batch and child jobstates + ret = ee2.check_job_batch(params={"job_id": batch_id}) + batch_jobstate = ret["batch_jobstate"] + child_jobstates = ret["child_jobstates"] + + # Check to see that the BATCH jobstate is as expected + expected_batch_jobstate = expected_parent_job + del expected_batch_jobstate["_id"] + expected_batch_jobstate.update( + {"batch_id": None, "job_id": batch_id, "retry_count": 0} + ) + del 
batch_jobstate["created"] + del batch_jobstate["updated"] + assert batch_jobstate == expected_batch_jobstate + + # Check to see the child states are as expected + for expected_child_job_state in [expected_job1, expected_job2]: + expected_child_job_state["retry_count"] = 0 + expected_child_job_state["job_id"] = str(expected_child_job_state["_id"]) + del expected_child_job_state["_id"] + + for received_child_job_state in child_jobstates: + del received_child_job_state["created"] + del received_child_job_state["queued"] + del received_child_job_state["updated"] + + assert child_jobstates[0] == expected_job1 + assert child_jobstates[1] == expected_job2 + def test_run_job_batch_with_no_batch_wsid(ee2_port, ws_controller, mongo_client): """ diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index c1801a909..55b614e6f 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -278,6 +278,41 @@ def test_retry_job_stress( ) self.impl.retry_job(ctx=self.ctx, params={"job_id": job_id}) + @patch.object(Condor, "run_job", return_value=si) + @patch.object(WorkspaceAuth, "can_write", return_value=True) + @patch( + "installed_clients.CatalogClient.Catalog.list_client_group_configs", + autospec=True, + ) + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + @patch("installed_clients.CatalogClient.Catalog.log_exec_stats", autospec=True) + def test_check_job_batch_stress( + self, cc_log_stats, cc_get_mod_ver, cc_list_cli_configs, workspace, condor + ): + # Note, not a stress test, just an impl file test + cc_get_mod_ver.return_value = {"git_commit_hash": "moduleversiongoeshere"} + cc_list_cli_configs.return_value = [] + + # set job method differently to distinguish + method_1 = "app1.a_method" + method_2 = "app2.b_method" + + job_params_1 = get_sample_job_params( + method=method_1, app_id="app1/a", wsid=None, parent_job_id=None + ) + job_params_2 = get_sample_job_params( + 
method=method_2, app_id="app2/b", wsid=None, parent_job_id=None + ) + + batch_id = self.impl.run_job_batch( + ctx=self.ctx, params=[job_params_1, job_params_2], batch_params={} + )[0]["batch_id"] + check_job_batch_status = self.impl.check_job_batch( + ctx=self.ctx, params={"job_id": batch_id} + ) + assert "batch_jobstate" in check_job_batch_status[0] + assert "child_jobstates" in check_job_batch_status[0] + @patch.object(Condor, "run_job", return_value=si) @patch.object(WorkspaceAuth, "can_write", return_value=True) @patch( diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index a39e5e4b8..af5583791 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -415,7 +415,10 @@ def get_sample_condor_info(job=None, error=None): def get_sample_job_params( - method="MEGAHIT.default_method", wsid=123, app_id="MEGAHIT/run_megahit" + method="MEGAHIT.default_method", + wsid=123, + app_id="MEGAHIT/run_megahit", + parent_job_id="9998", ): job_params = { "wsid": wsid, @@ -434,7 +437,7 @@ def get_sample_job_params( } ], "job_input": {}, - "parent_job_id": "9998", + "parent_job_id": parent_job_id, "meta": {"tag": "dev", "token_id": "12345"}, } From 96c8e4849486a6721356e04b98123fd6ae293590 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 3 Nov 2021 13:45:27 -0500 Subject: [PATCH 107/109] Update RELEASE_NOTES.md --- RELEASE_NOTES.md | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 15d8b673a..0dcf295ba 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,17 +2,28 @@ ========================================= + + ## 0.0.5 - * TODO Refactor run_jobs_batch endpoint to cache catalog calls for batch jobs, submit entire batch to condor in one transaction - * TODO: Added CreatedJobsReaper - * Added retry_job and retry_jobs endpoint along with ADRs - * TODO: Will deprecate run_job transaction and use scheduler API - * Refactored tests - * 
Removed slack messages for running jobs - * Fix a bug that caused job requirements from the catalog in CSV format to be ignored other +### New Endpoints +* run_job_batch +* retry_job +* retry_jobs +* abandon_children + +### BugFixes +* Fix a bug that caused job requirements from the catalog in CSV format to be ignored other than the client group - * Full EE2 admins can now submit job requirements when running jobs via run_job_batch and - run_job. See the SDK spec for details. + +### Other features and refactoring +* Refactor run_job_batch endpoint to cache catalog calls for batch jobs, submit entire batch to condor in one transaction +* Refactored tests +* Removed slack messages for running jobs +* Added CreatedJobsReaper +* Added retry_job and retry_jobs endpoint along with ADRs +* Full EE2 admins can now submit job requirements when running jobs via run_job_batch and +run_job. See the SDK spec for details. +* Added ADRs for retry endpoint ## 0.0.4 From 8517a629355f3e04ef36ec67870480b9a0acc6e4 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 3 Nov 2021 14:04:15 -0500 Subject: [PATCH 108/109] Update RELEASE_NOTES.md --- RELEASE_NOTES.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 0dcf295ba..47d5e0548 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,8 +2,6 @@ ========================================= - - ## 0.0.5 ### New Endpoints * run_job_batch From 1e268dfb1445e100da6fcb79219402953a32c0d1 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 3 Nov 2021 14:07:33 -0500 Subject: [PATCH 109/109] Update RELEASE_NOTES.md --- RELEASE_NOTES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 47d5e0548..68376cfdd 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,9 @@ # execution_engine2 (ee2) release notes ========================================= +## 0.0.6 +* Release of MVP + ## 0.0.5 ### New Endpoints