diff --git a/.flake8 b/.flake8 index 1ddfc3680..cd50db704 100644 --- a/.flake8 +++ b/.flake8 @@ -9,4 +9,6 @@ exclude = execution_engine2Impl.py, lib/installed_clients/, lib/execution_engine2/execution_engine2Impl.py, + lib/execution_engine2/authclient.py, + lib/biokbase/log.py, *Impl.py diff --git a/.github/codeql.yml b/.github/codeql.yml new file mode 100644 index 000000000..9771ca0f4 --- /dev/null +++ b/.github/codeql.yml @@ -0,0 +1,52 @@ +name: "Code scanning - action" + +on: + push: + pull_request: + schedule: + - cron: '0 19 * * 0' + +jobs: + CodeQL-Build: + + # CodeQL runs on ubuntu-latest and windows-latest + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + # We must fetch at least the immediate parents so that if this is + # a pull request then we can checkout the head. + fetch-depth: 2 + + # If this run was triggered by a pull request event, then checkout + # the head of the pull request instead of the merge commit. + - run: git checkout HEAD^2 + if: ${{ github.event_name == 'pull_request' }} + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v1 + # Override language selection by uncommenting this and choosing your languages + # with: + # languages: go, javascript, csharp, python, cpp, java + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v1 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v1 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 4b3eaba61..0ffad221c 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -9,12 +9,12 @@ # Testing Instructions * Details for how to test the PR: -- [ ] Tests pass in travis and locally +- [ ] Tests pass in GitHub Actions and locally - [ ] Changes available by spinning up a local test suite and doing X # Dev Checklist: -- [ ] My code follows the guidelines at https://sites.google.com/truss.works/kbasetruss/development +- [ ] My code follows the guidelines at https://sites.google.com/truss.works/kbasetruss/data-upload-project/development - [ ] I have performed a self-review of my own code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation @@ -22,7 +22,7 @@ - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] Any dependent changes have been merged and published in downstream modules -- [ ] I have run Black and Flake8 on changed Python Code manually or with git precommit (and the travis build passes) +- [ ] I have run Black and Flake8 on changed Python Code manually or with git precommit (and the GitHub Actions build passes) # Updating Version and Release Notes (if applicable) diff --git a/.github/workflows/build_feature_branch.yml b/.github/workflows/build_feature_branch.yml new file mode 100644 index 000000000..b62fc1c89 --- /dev/null +++ 
b/.github/workflows/build_feature_branch.yml @@ -0,0 +1,26 @@ +name: Build Feature Branches + +on: [pull_request] + +jobs: + main: + runs-on: ubuntu-20.04 + steps: + - + name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ secrets.GHCR_USERNAME }} + password: ${{ secrets.GHCR_TOKEN }} + - + name: Build and push this feature branch + id: docker_build + uses: docker/build-push-action@v2 + with: + push: true + tags: ghcr.io/${{ github.repository }}:${{ github.head_ref }} + + - + name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/build_on_push.yaml b/.github/workflows/build_on_push.yaml new file mode 100644 index 000000000..d610acfae --- /dev/null +++ b/.github/workflows/build_on_push.yaml @@ -0,0 +1,31 @@ +name: Build Main/Develop Branches on push + +on: + push: + branches: + - main + - master + - develop + +jobs: + main: + runs-on: ubuntu-20.04 + steps: + - + name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ secrets.GHCR_USERNAME }} + password: ${{ secrets.GHCR_TOKEN }} + - + name: Build and push the main branch + id: docker_build + uses: docker/build-push-action@v2 + with: + push: true + tags: ghcr.io/${{ github.repository }}:${ GITHUB_REF##*/ } + + - + name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/ee2-tests.yml b/.github/workflows/ee2-tests.yml index 0972e556c..c4920ce01 100644 --- a/.github/workflows/ee2-tests.yml +++ b/.github/workflows/ee2-tests.yml @@ -22,15 +22,18 @@ jobs: uses: actions/setup-python@v2 with: python-version: 3.8 - - name: Install dependencies + - name: Lint with flake8 and black run: | python -m pip install --upgrade pip pip install flake8 black pytest - if [ -f requirements.txt ]; then pip install -r requirements-dev.txt; fi - - name: Lint with flake8 and black - run: | flake8 ./lib ./test black --check ./lib ./test + - name: Install dependencies + run: | + if [ -f requirements.txt ]; then pip install -r requirements-dev.txt; fi + cd /opt + git clone https://github.com/kbase/jars + cd - - name: Build Docker Image run: | docker build . 
-t execution_engine2:test @@ -39,4 +42,5 @@ jobs: docker-compose up -d cp test/env/test.travis.env test.env make test-coverage + codecov diff --git a/.gitignore b/.gitignore index 5a4d9b298..4d823b979 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ sdk.cfg .pytest_cache lib/execution_engine2/execution_engine2Impl.py.bak* +coverage.xml +test_temp_can_delete \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9325ecde8..d3cb732c7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/ambv/black - rev: 20.8b1 + rev: 21.5b0 hooks: - id: black exclude: '.+Impl.py' diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d70fe1dc9..000000000 --- a/.travis.yml +++ /dev/null @@ -1,33 +0,0 @@ -language: python -python: - - "3.7" - -dist: xenial -sudo: required - -services: - - docker - -env: - - KB_DEPLOYMENT_CONFIG=test/deploy.cfg - -before_install: - - sudo useradd kbase - - sed -i '/conda/d' ./requirements.txt - - (cd test/dockerfiles/condor && docker-compose up -d) - - cp test/env/test.travis.env test/env/test.env - - -install: - - pip install -r requirements.txt - - pip install black flake8 - - flake8 ./lib ./test - - black --check ./lib ./test - - make setup-database - -script: - - make test-coverage - # - make integration_test Doesn't yet work in travis - -after_success: - - codecov diff --git a/Dockerfile b/Dockerfile index 50144e7a3..dfba2f2b5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM kbase/sdkbase2:python +FROM quay.io/kbase/sdkbase2:python MAINTAINER KBase Developer RUN apt-get clean all && apt-get update --fix-missing -y @@ -18,6 +18,12 @@ RUN DEBIAN_FRONTEND=noninteractive wget -qO - https://research.cs.wisc.edu/htcon && apt-get update -y \ && apt-get install -y condor +# install jars +# perhaps we should have test and prod dockerfiles to avoid jars and mongo installs in prod +RUN cd /opt \ + && git clone https://github.com/kbase/jars \ + && cd - + # install mongodb RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2930ADAE8CAF5059EE73BB4B58712A2291FA4AD5 \ && echo "deb http://repo.mongodb.org/apt/debian stretch/mongodb-org/3.6 main" | tee /etc/apt/sources.list.d/mongodb-org-3.6.list \ @@ -37,12 +43,14 @@ RUN echo "mongodb-org hold" | dpkg --set-selections \ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \ && bash ~/miniconda.sh -b -p /miniconda-latest +# Setup Cron +COPY ./bin/ee2_cronjobs /etc/cron.d/ee2_cronjobs + # Need to change startup scripts to match this in MAKEFILE ENV PATH=/miniconda-latest/bin:$PATH RUN pip install --upgrade pip && python -V - - COPY ./requirements.txt /kb/module/requirements.txt + RUN pip install -r /kb/module/requirements.txt RUN adduser --disabled-password --gecos '' -shell /bin/bash kbase # ----------------------------------------- @@ -57,7 +65,10 @@ WORKDIR /kb/module/scripts RUN chmod +x download_runner.sh && ./download_runner.sh WORKDIR /kb/module/ + +# Set deploy.cfg location ENV KB_DEPLOYMENT_CONFIG=/kb/module/deploy.cfg +ENV PATH=/kb/module:$PATH ENTRYPOINT [ "./scripts/entrypoint.sh" ] CMD [ ] diff --git a/KIDLspec.css b/KIDLspec.css new file mode 100644 index 000000000..4d2a3e3af --- /dev/null +++ b/KIDLspec.css @@ -0,0 +1,65 @@ +html, body { + height: 100%; +} +html { + display: table; + margin: auto; +} +body { + background-color: white; + color: #000; + font-family: Menlo, Monaco, Consolas, "Courier New", monospace; + 
font-weight: normal; + font-size: 12px; + margin: 0; + padding: 20px; + display: table-cell; + vertical-align: middle; +} +span.space { + display: inline-block; + width: 7px; +} +span.tab { + display: inline-block; + width: 30px; +} +span.keyword { + font-weight: bold; + color: #008; +} +span.name { + color: #000; !important +} +span.deprecated { + text-decoration: line-through; +} +span.annotation { + color: #303030; +} +span.primitive { + font-weight: bold; + color: #066; +} +div.body { + background-color: #ffffff; + color: #3e4349; + padding: 0 30px; +} +div.comment { + color: #A0A0A0; +} +a { + color: #004b6b; + text-decoration: none; +} +a:hover { + color: #6d4100; + text-decoration: underline; +} +:target { + background-color: #ffa; +} +div.body p, div.body dd, div.body li { + line-height: 1.4em; +} diff --git a/Makefile b/Makefile index 0e50f5547..848d03d5f 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,10 @@ compile: --pysrvname $(SERVICE_CAPS).$(SERVICE_CAPS)Server \ --pyimplname $(SERVICE_CAPS).$(SERVICE_CAPS)Impl; + kb-sdk compile $(SPEC_FILE) \ + --out . \ + --html \ + build: chmod +x $(SCRIPTS_DIR)/entrypoint.sh diff --git a/Pipfile b/Pipfile new file mode 100644 index 000000000..ec7e9deb0 --- /dev/null +++ b/Pipfile @@ -0,0 +1,84 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +aiofiles = "==0.4.0" +aiohttp = "==3.7.4" +asn1crypto = "==1.3.0" +async-timeout = "==3.0.1" +attrs = "==20.2.0" +cachetools = "==3.1.1" +certifi = "==2019.6.16" +cffi = "==1.14.0" +chardet = "==3.0.4" +codecov = "==2.0.15" +configparser = "==3.7.4" +confluent-kafka = "==1.5.0" +coverage = "==4.5.3" +cryptography = "==3.3.2" +docker = "==4.3.1" +gevent = "==20.9.0" +gprof2dot = "==2019.11.30" +greenlet = "==0.4.17" +gunicorn = "==20.0.4" +h11 = "==0.8.1" +h2 = "==3.1.0" +hpack = "==3.0.0" +htcondor = "==8.9.8" +httpcore = "==0.3.0" +httptools = "==0.0.13" +hyperframe = "==5.2.0" +idna = "==2.8" +importlib-metadata = "==2.0.0" +iniconfig = "==1.1.1" +maps = "==5.1.1" +memory-profiler = "==0.55.0" +mock = "==3.0.5" +mongoengine = "==0.23.0" +multidict = "==4.5.2" +nose = "==1.3.7" +pluggy = "==0.13.1" +psutil = "==5.6.6" +py = "==1.10.0" +pycosat = "==0.6.3" +pycparser = "==2.19" +pymongo = "==3.8.0" +pyparsing = "==2.4.7" +pytest = "==6.1.1" +pytest-cov = "==2.8.1" +pytest-profiling = "==1.7.0" +python-dateutil = "==2.8.0" +python-dotenv = "==0.10.3" +requests = "==2.22.0" +requests-async = "==0.5.0" +requests-mock = "==1.7.0" +rfc3986 = "==1.3.2" +sanic = "==19.6.0" +sentry-sdk = "==0.14.3" +six = "==1.14.0" +slackclient = "==2.7.1" +toml = "==0.10.1" +tqdm = "==4.42.1" +typing-extensions = "==3.7.4.3" +ujson = "==1.35" +urllib3 = "==1.25.8" +uvloop = "==0.12.2" +websockets = "==6.0" +yarl = "==1.5.1" +zipp = "==3.3.1" +Jinja2 = "==2.11.3" +JSONRPCBase = "==0.2.0" +MarkupSafe = "==1.1.1" +pyOpenSSL = "==19.1.0" +PySocks = "==1.7.1" +"ruamel.yaml" = "==0.15.87" +websocket_client = "==0.57.0" +"zope.event" = "==4.5.0" +"zope.interface" = "==5.1.2" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 000000000..5df9ba0ff --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,986 @@ +{ + "_meta": { + "hash": { + "sha256": "96e75d4a0d655bab93d08d5d163c1bdb458d7ff8bc22b3e48af30e07797d0340" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "aiofiles": { + 
"hashes": [ + "sha256:021ea0ba314a86027c166ecc4b4c07f2d40fc0f4b3a950d1868a0f2571c2bbee", + "sha256:1e644c2573f953664368de28d2aa4c89dfd64550429d0c27c4680ccd3aa4985d" + ], + "index": "pypi", + "version": "==0.4.0" + }, + "aiohttp": { + "hashes": [ + "sha256:119feb2bd551e58d83d1b38bfa4cb921af8ddedec9fad7183132db334c3133e0", + "sha256:16d0683ef8a6d803207f02b899c928223eb219111bd52420ef3d7a8aa76227b6", + "sha256:2eb3efe243e0f4ecbb654b08444ae6ffab37ac0ef8f69d3a2ffb958905379daf", + "sha256:2ffea7904e70350da429568113ae422c88d2234ae776519549513c8f217f58a9", + "sha256:40bd1b101b71a18a528ffce812cc14ff77d4a2a1272dfb8b11b200967489ef3e", + "sha256:418597633b5cd9639e514b1d748f358832c08cd5d9ef0870026535bd5eaefdd0", + "sha256:481d4b96969fbfdcc3ff35eea5305d8565a8300410d3d269ccac69e7256b1329", + "sha256:4c1bdbfdd231a20eee3e56bd0ac1cd88c4ff41b64ab679ed65b75c9c74b6c5c2", + "sha256:5563ad7fde451b1986d42b9bb9140e2599ecf4f8e42241f6da0d3d624b776f40", + "sha256:58c62152c4c8731a3152e7e650b29ace18304d086cb5552d317a54ff2749d32a", + "sha256:5b50e0b9460100fe05d7472264d1975f21ac007b35dcd6fd50279b72925a27f4", + "sha256:5d84ecc73141d0a0d61ece0742bb7ff5751b0657dab8405f899d3ceb104cc7de", + "sha256:5dde6d24bacac480be03f4f864e9a67faac5032e28841b00533cd168ab39cad9", + "sha256:5e91e927003d1ed9283dee9abcb989334fc8e72cf89ebe94dc3e07e3ff0b11e9", + "sha256:62bc216eafac3204877241569209d9ba6226185aa6d561c19159f2e1cbb6abfb", + "sha256:6c8200abc9dc5f27203986100579fc19ccad7a832c07d2bc151ce4ff17190076", + "sha256:6ca56bdfaf825f4439e9e3673775e1032d8b6ea63b8953d3812c71bd6a8b81de", + "sha256:71680321a8a7176a58dfbc230789790639db78dad61a6e120b39f314f43f1907", + "sha256:7c7820099e8b3171e54e7eedc33e9450afe7cd08172632d32128bd527f8cb77d", + "sha256:7dbd087ff2f4046b9b37ba28ed73f15fd0bc9f4fdc8ef6781913da7f808d9536", + "sha256:822bd4fd21abaa7b28d65fc9871ecabaddc42767884a626317ef5b75c20e8a2d", + "sha256:8ec1a38074f68d66ccb467ed9a673a726bb397142c273f90d4ba954666e87d54", + "sha256:950b7ef08b2afdab2488ee2edaff92a03ca500a48f1e1aaa5900e73d6cf992bc", + "sha256:99c5a5bf7135607959441b7d720d96c8e5c46a1f96e9d6d4c9498be8d5f24212", + "sha256:b84ad94868e1e6a5e30d30ec419956042815dfaea1b1df1cef623e4564c374d9", + "sha256:bc3d14bf71a3fb94e5acf5bbf67331ab335467129af6416a437bd6024e4f743d", + "sha256:c2a80fd9a8d7e41b4e38ea9fe149deed0d6aaede255c497e66b8213274d6d61b", + "sha256:c44d3c82a933c6cbc21039326767e778eface44fca55c65719921c4b9661a3f7", + "sha256:cc31e906be1cc121ee201adbdf844522ea3349600dd0a40366611ca18cd40e81", + "sha256:d5d102e945ecca93bcd9801a7bb2fa703e37ad188a2f81b1e65e4abe4b51b00c", + "sha256:dd7936f2a6daa861143e376b3a1fb56e9b802f4980923594edd9ca5670974895", + "sha256:dee68ec462ff10c1d836c0ea2642116aba6151c6880b688e56b4c0246770f297", + "sha256:e76e78863a4eaec3aee5722d85d04dcbd9844bc6cd3bfa6aa880ff46ad16bfcb", + "sha256:eab51036cac2da8a50d7ff0ea30be47750547c9aa1aa2cf1a1b710a1827e7dbe", + "sha256:f4496d8d04da2e98cc9133e238ccebf6a13ef39a93da2e87146c8c8ac9768242", + "sha256:fbd3b5e18d34683decc00d9a360179ac1e7a320a5fee10ab8053ffd6deab76e0", + "sha256:feb24ff1226beeb056e247cf2e24bba5232519efb5645121c4aea5b6ad74c1f2" + ], + "index": "pypi", + "version": "==3.7.4" + }, + "asn1crypto": { + "hashes": [ + "sha256:5a215cb8dc12f892244e3a113fe05397ee23c5c4ca7a69cd6e69811755efc42d", + "sha256:831d2710d3274c8a74befdddaf9f17fcbf6e350534565074818722d6d615b315" + ], + "index": "pypi", + "version": "==1.3.0" + }, + "async-timeout": { + "hashes": [ + "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f", + 
"sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3" + ], + "index": "pypi", + "version": "==3.0.1" + }, + "attrs": { + "hashes": [ + "sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594", + "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc" + ], + "index": "pypi", + "version": "==20.2.0" + }, + "cachetools": { + "hashes": [ + "sha256:428266a1c0d36dc5aca63a2d7c5942e88c2c898d72139fca0e97fdd2380517ae", + "sha256:8ea2d3ce97850f31e4a08b0e2b5e6c34997d7216a9d2c98e0f3978630d4da69a" + ], + "index": "pypi", + "version": "==3.1.1" + }, + "certifi": { + "hashes": [ + "sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939", + "sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695" + ], + "index": "pypi", + "version": "==2019.6.16" + }, + "cffi": { + "hashes": [ + "sha256:001bf3242a1bb04d985d63e138230802c6c8d4db3668fb545fb5005ddf5bb5ff", + "sha256:00789914be39dffba161cfc5be31b55775de5ba2235fe49aa28c148236c4e06b", + "sha256:028a579fc9aed3af38f4892bdcc7390508adabc30c6af4a6e4f611b0c680e6ac", + "sha256:14491a910663bf9f13ddf2bc8f60562d6bc5315c1f09c704937ef17293fb85b0", + "sha256:1cae98a7054b5c9391eb3249b86e0e99ab1e02bb0cc0575da191aedadbdf4384", + "sha256:2089ed025da3919d2e75a4d963d008330c96751127dd6f73c8dc0c65041b4c26", + "sha256:2d384f4a127a15ba701207f7639d94106693b6cd64173d6c8988e2c25f3ac2b6", + "sha256:337d448e5a725bba2d8293c48d9353fc68d0e9e4088d62a9571def317797522b", + "sha256:399aed636c7d3749bbed55bc907c3288cb43c65c4389964ad5ff849b6370603e", + "sha256:3b911c2dbd4f423b4c4fcca138cadde747abdb20d196c4a48708b8a2d32b16dd", + "sha256:3d311bcc4a41408cf5854f06ef2c5cab88f9fded37a3b95936c9879c1640d4c2", + "sha256:62ae9af2d069ea2698bf536dcfe1e4eed9090211dbaafeeedf5cb6c41b352f66", + "sha256:66e41db66b47d0d8672d8ed2708ba91b2f2524ece3dee48b5dfb36be8c2f21dc", + "sha256:675686925a9fb403edba0114db74e741d8181683dcf216be697d208857e04ca8", + "sha256:7e63cbcf2429a8dbfe48dcc2322d5f2220b77b2e17b7ba023d6166d84655da55", + "sha256:8a6c688fefb4e1cd56feb6c511984a6c4f7ec7d2a1ff31a10254f3c817054ae4", + "sha256:8c0ffc886aea5df6a1762d0019e9cb05f825d0eec1f520c51be9d198701daee5", + "sha256:95cd16d3dee553f882540c1ffe331d085c9e629499ceadfbda4d4fde635f4b7d", + "sha256:99f748a7e71ff382613b4e1acc0ac83bf7ad167fb3802e35e90d9763daba4d78", + "sha256:b8c78301cefcf5fd914aad35d3c04c2b21ce8629b5e4f4e45ae6812e461910fa", + "sha256:c420917b188a5582a56d8b93bdd8e0f6eca08c84ff623a4c16e809152cd35793", + "sha256:c43866529f2f06fe0edc6246eb4faa34f03fe88b64a0a9a942561c8e22f4b71f", + "sha256:cab50b8c2250b46fe738c77dbd25ce017d5e6fb35d3407606e7a4180656a5a6a", + "sha256:cef128cb4d5e0b3493f058f10ce32365972c554572ff821e175dbc6f8ff6924f", + "sha256:cf16e3cf6c0a5fdd9bc10c21687e19d29ad1fe863372b5543deaec1039581a30", + "sha256:e56c744aa6ff427a607763346e4170629caf7e48ead6921745986db3692f987f", + "sha256:e577934fc5f8779c554639376beeaa5657d54349096ef24abe8c74c5d9c117c3", + "sha256:f2b0fa0c01d8a0c7483afd9f31d7ecf2d71760ca24499c8697aeb5ca37dc090c" + ], + "index": "pypi", + "version": "==1.14.0" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "index": "pypi", + "version": "==3.0.4" + }, + "codecov": { + "hashes": [ + "sha256:8ed8b7c6791010d359baed66f84f061bba5bd41174bf324c31311e8737602788", + "sha256:ae00d68e18d8a20e9c3288ba3875ae03db3a8e892115bf9b83ef20507732bed4" + ], + "index": "pypi", + "version": "==2.0.15" + }, + "configparser": 
{ + "hashes": [ + "sha256:8be81d89d6e7b4c0d4e44bcc525845f6da25821de80cb5e06e7e0238a2899e32", + "sha256:da60d0014fd8c55eb48c1c5354352e363e2d30bbf7057e5e171a468390184c75" + ], + "index": "pypi", + "version": "==3.7.4" + }, + "confluent-kafka": { + "hashes": [ + "sha256:00acc73f7d49961bf427f5e4fd6c0a220a6bfa5ccc91e0ad1f9ffa1751a169b0", + "sha256:0a59afbb90bdd22b9acdd3bb134f5ee1dff3cc5df55eaf52bf97b2f8d0d00de3", + "sha256:13b0e2011560f461ff39daf38089dd7f91404b3e66dba0456ccce0700f93c4f2", + "sha256:175c7064c8f19975616974558c45f42c147a202d4b1c0b0a83afefb920367696", + "sha256:22d7201d1aa89f1c5546749e781492925ed3eb0d7bd8f781fc57294cd45ddde3", + "sha256:3034cacc3b0d03eb3ce39cc5a64c1070d223870246f5d90c9113996be9db7df8", + "sha256:3e2d4f55ca952aeada3831d6615dc13a8a42c8e97175855ca08bbc6e6091b080", + "sha256:5a1c47320d6afc5b2599f8f8e143aed6845a2d903facde984606e02f10f11221", + "sha256:7b03bd9cc7b5e4df0a27eed359762c61a35313d4981ef1d9b418069eee454e66", + "sha256:85ff4823770ce2efaabb46d88e5ae26a840e0051fd481abaa805f21a5a84d003", + "sha256:9534cd2c0313df75b70eb4cf729382998970d97bbdda5cf3aef7081b855ccebe", + "sha256:99b13d0957a5967c85aee6138ef5f9acec90294267a549c5683744f20cf5d7b4", + "sha256:9a1c77291c1ac4b991aa0358f2f44636686eb8f52fb628502d30c312160a14e9", + "sha256:9ac812006000887f76c95b8a33a9f0b65845bf072fbc54a42a1acffd34e41120", + "sha256:9c47b8aacfe347bffd86bf75b98626718912b63df87f256dff1abc06a0355410", + "sha256:a116382ae67e0d6a54684bab4ee9b1be54e789d031a6e5e74c3edc657c79d23c", + "sha256:b1c89f3653385acc5da71570e03281f35ac6960367f2b2a426ae431deb1a1a35", + "sha256:bb77276d569f511abe4a5b32a53f8a30285bc7be68219e5711a44720bf356ac2", + "sha256:bbd9633552840ab9367fb762ea21272759db8caec2c34ff16ee28be177644cdf", + "sha256:bfdfa81e4e72d2c24e408a5e199aae0a477499ae40647dfa6906d002d9b07f38", + "sha256:c7461d6db081c23a6d38ceba348e7c178d7e974cf22c45ba8a4918ecb8855a44", + "sha256:d6a5d4c72360a75e875e88f7cce42b66a786d037ca2002303ab1c580d49caf53", + "sha256:dabed41cc60d1fc6d3cb44a90fe02e5192c9bf0f73c7b35761981e62ecabc592", + "sha256:dd544847c713eeeb525031348ff6ffea4ecdd11c13590893e599a9d4676a9bd4", + "sha256:eba169a9de8c978c9f33c763857c5279eceac46a4fd55a381c2528b9d4b3359e", + "sha256:f2d1ee0bfdf618017bbfaa42406546155c1a86263e4f286295318578c723803b" + ], + "index": "pypi", + "version": "==1.5.0" + }, + "coverage": { + "hashes": [ + "sha256:3684fabf6b87a369017756b551cef29e505cb155ddb892a7a29277b978da88b9", + "sha256:39e088da9b284f1bd17c750ac672103779f7954ce6125fd4382134ac8d152d74", + "sha256:3c205bc11cc4fcc57b761c2da73b9b72a59f8d5ca89979afb0c1c6f9e53c7390", + "sha256:465ce53a8c0f3a7950dfb836438442f833cf6663d407f37d8c52fe7b6e56d7e8", + "sha256:48020e343fc40f72a442c8a1334284620f81295256a6b6ca6d8aa1350c763bbe", + "sha256:5296fc86ab612ec12394565c500b412a43b328b3907c0d14358950d06fd83baf", + "sha256:5f61bed2f7d9b6a9ab935150a6b23d7f84b8055524e7be7715b6513f3328138e", + "sha256:68a43a9f9f83693ce0414d17e019daee7ab3f7113a70c79a3dd4c2f704e4d741", + "sha256:6b8033d47fe22506856fe450470ccb1d8ba1ffb8463494a15cfc96392a288c09", + "sha256:7ad7536066b28863e5835e8cfeaa794b7fe352d99a8cded9f43d1161be8e9fbd", + "sha256:7bacb89ccf4bedb30b277e96e4cc68cd1369ca6841bde7b005191b54d3dd1034", + "sha256:839dc7c36501254e14331bcb98b27002aa415e4af7ea039d9009409b9d2d5420", + "sha256:8f9a95b66969cdea53ec992ecea5406c5bd99c9221f539bca1e8406b200ae98c", + "sha256:932c03d2d565f75961ba1d3cec41ddde00e162c5b46d03f7423edcb807734eab", + "sha256:988529edadc49039d205e0aa6ce049c5ccda4acb2d6c3c5c550c17e8c02c05ba", + 
"sha256:998d7e73548fe395eeb294495a04d38942edb66d1fa61eb70418871bc621227e", + "sha256:9de60893fb447d1e797f6bf08fdf0dbcda0c1e34c1b06c92bd3a363c0ea8c609", + "sha256:9e80d45d0c7fcee54e22771db7f1b0b126fb4a6c0a2e5afa72f66827207ff2f2", + "sha256:a545a3dfe5082dc8e8c3eb7f8a2cf4f2870902ff1860bd99b6198cfd1f9d1f49", + "sha256:a5d8f29e5ec661143621a8f4de51adfb300d7a476224156a39a392254f70687b", + "sha256:aca06bfba4759bbdb09bf52ebb15ae20268ee1f6747417837926fae990ebc41d", + "sha256:bb23b7a6fd666e551a3094ab896a57809e010059540ad20acbeec03a154224ce", + "sha256:bfd1d0ae7e292105f29d7deaa9d8f2916ed8553ab9d5f39ec65bcf5deadff3f9", + "sha256:c62ca0a38958f541a73cf86acdab020c2091631c137bd359c4f5bddde7b75fd4", + "sha256:c709d8bda72cf4cd348ccec2a4881f2c5848fd72903c185f363d361b2737f773", + "sha256:c968a6aa7e0b56ecbd28531ddf439c2ec103610d3e2bf3b75b813304f8cb7723", + "sha256:df785d8cb80539d0b55fd47183264b7002077859028dfe3070cf6359bf8b2d9c", + "sha256:f406628ca51e0ae90ae76ea8398677a921b36f0bd71aab2099dfed08abd0322f", + "sha256:f46087bbd95ebae244a0eda01a618aff11ec7a069b15a3ef8f6b520db523dcf1", + "sha256:f8019c5279eb32360ca03e9fac40a12667715546eed5c5eb59eb381f2f501260", + "sha256:fc5f4d209733750afd2714e9109816a29500718b32dd9a5db01c0cb3a019b96a" + ], + "index": "pypi", + "version": "==4.5.3" + }, + "cryptography": { + "hashes": [ + "sha256:0d7b69674b738068fa6ffade5c962ecd14969690585aaca0a1b1fc9058938a72", + "sha256:1bd0ccb0a1ed775cd7e2144fe46df9dc03eefd722bbcf587b3e0616ea4a81eff", + "sha256:3c284fc1e504e88e51c428db9c9274f2da9f73fdf5d7e13a36b8ecb039af6e6c", + "sha256:49570438e60f19243e7e0d504527dd5fe9b4b967b5a1ff21cc12b57602dd85d3", + "sha256:541dd758ad49b45920dda3b5b48c968f8b2533d8981bcdb43002798d8f7a89ed", + "sha256:5a60d3780149e13b7a6ff7ad6526b38846354d11a15e21068e57073e29e19bed", + "sha256:7951a966613c4211b6612b0352f5bf29989955ee592c4a885d8c7d0f830d0433", + "sha256:922f9602d67c15ade470c11d616f2b2364950602e370c76f0c94c94ae672742e", + "sha256:a0f0b96c572fc9f25c3f4ddbf4688b9b38c69836713fb255f4a2715d93cbaf44", + "sha256:a777c096a49d80f9d2979695b835b0f9c9edab73b59e4ceb51f19724dda887ed", + "sha256:a9a4ac9648d39ce71c2f63fe7dc6db144b9fa567ddfc48b9fde1b54483d26042", + "sha256:aa4969f24d536ae2268c902b2c3d62ab464b5a66bcb247630d208a79a8098e9b", + "sha256:c7390f9b2119b2b43160abb34f63277a638504ef8df99f11cb52c1fda66a2e6f", + "sha256:e18e6ab84dfb0ab997faf8cca25a86ff15dfea4027b986322026cc99e0a892da" + ], + "index": "pypi", + "version": "==3.3.2" + }, + "docker": { + "hashes": [ + "sha256:13966471e8bc23b36bfb3a6fb4ab75043a5ef1dac86516274777576bed3b9828", + "sha256:bad94b8dd001a8a4af19ce4becc17f41b09f228173ffe6a4e0355389eef142f2" + ], + "index": "pypi", + "version": "==4.3.1" + }, + "gevent": { + "hashes": [ + "sha256:10110d4881aec04f218c316cb796b18c8b2cac67ae0eb5b0c5780056757268a2", + "sha256:1628a403fc9c3ea9b35924638a4d4fbe236f60ecdf4e22ed133fbbaf0bc7cb6b", + "sha256:1cfa3674866294623e324fa5b76eba7b96744d1956a605cfe24d26c5cd890f91", + "sha256:2269574444113cb4ca1c1808ab9460a87fe25e1c34a6e36d975d4af46e4afff9", + "sha256:283a021a2e14adfad718346f18982b80569d9c3a59e97cfae1b7d4c5b017941a", + "sha256:2aa70726ad1883fe7c17774e5ccc91ac6e30334efa29bafb9b8fe8ca6091b219", + "sha256:315a63a35068183dfb9bc0331c7bb3c265ee7db8a11797cbe98dadbdb45b5d35", + "sha256:324808a8558c733f7a9734525483795d52ca3bbd5662b24b361d81c075414b1f", + "sha256:33a63f230755c6813fca39d9cea2a8894df32df2ee58fd69d8bf8fcc1d8e018e", + "sha256:5f6d48051d336561ec08995431ee4d265ac723a64bba99cc58c3eb1a4d4f5c8d", + "sha256:8d338cd6d040fe2607e5305dd7991b5960b3780ae01f804c2ac5760d31d3b2c6", + 
"sha256:906175e3fb25f377a0b581e79d3ed5a7d925c136ff92fd022bb3013e25f5f3a9", + "sha256:93980e51dd2e5f81899d644a0b6ef4a73008c679fcedd50e3b21cc3451ba2424", + "sha256:9bb477f514cf39dc20651b479bf1ad4f38b9a679be2bfa3e162ec0c3785dfa2a", + "sha256:a8733a01974433d91308f8c44fa6cc13428b15bb39d46540657e260ff8852cb1", + "sha256:adbb267067f56696b2babced3d0856aa39dcf14b8ccd2dffa1fab587b00c6f80", + "sha256:afc177c37de41ce9c27d351ac84cbaf34407effcab5d6641645838f39d365be1", + "sha256:b07fcbca3e819296979d82fac3d8b44f0d5ced57b9a04dffcfd194da99c8eb2d", + "sha256:b2948566003a1030e47507755fe1f446995e8671c0c67571091539e01faf94cc", + "sha256:db208e74a32cff7f55f5aa1ba5d7d1c1a086a6325c8702ae78a5c741155552ff", + "sha256:dd4c6b2f540b25c3d0f277a725bc1a900ce30a681b90a081216e31f814be453b", + "sha256:e11de4b4d107ca2f35000eb08e9c4c4621c153103b400f48a9ea95b96d8c7e0b", + "sha256:eba19bae532d0c48d489fa16815b242ce074b1f4b63e8a8e663232cbe311ead9", + "sha256:fb33dc1ab27557bccd64ad4bf81e68c8b0d780fe937b1e2c0814558798137229" + ], + "index": "pypi", + "version": "==20.9.0" + }, + "gprof2dot": { + "hashes": [ + "sha256:b43fe04ebb3dfe181a612bbfc69e90555b8957022ad6a466f0308ed9c7f22e99" + ], + "index": "pypi", + "version": "==2019.11.30" + }, + "greenlet": { + "hashes": [ + "sha256:1023d7b43ca11264ab7052cb09f5635d4afdb43df55e0854498fc63070a0b206", + "sha256:124a3ae41215f71dc91d1a3d45cbf2f84e46b543e5d60b99ecc20e24b4c8f272", + "sha256:13037e2d7ab2145300676852fa069235512fdeba4ed1e3bb4b0677a04223c525", + "sha256:3af587e9813f9bd8be9212722321a5e7be23b2bc37e6323a90e592ab0c2ef117", + "sha256:41d8835c69a78de718e466dd0e6bfd4b46125f21a67c3ff6d76d8d8059868d6b", + "sha256:4481002118b2f1588fa3d821936ffdc03db80ef21186b62b90c18db4ba5e743b", + "sha256:47825c3a109f0331b1e54c1173d4e57fa000aa6c96756b62852bfa1af91cd652", + "sha256:5494e3baeacc371d988345fbf8aa4bd15555b3077c40afcf1994776bb6d77eaf", + "sha256:75e4c27188f28149b74e7685809f9227410fd15432a4438fc48627f518577fa5", + "sha256:97f2b01ab622a4aa4b3724a3e1fba66f47f054c434fbaa551833fa2b41e3db51", + "sha256:a34023b9eabb3525ee059f3bf33a417d2e437f7f17e341d334987d4091ae6072", + "sha256:ac85db59aa43d78547f95fc7b6fd2913e02b9e9b09e2490dfb7bbdf47b2a4914", + "sha256:be7a79988b8fdc5bbbeaed69e79cfb373da9759242f1565668be4fb7f3f37552", + "sha256:bee111161420f341a346731279dd976be161b465c1286f82cc0779baf7b729e8", + "sha256:ccd62f09f90b2730150d82f2f2ffc34d73c6ce7eac234aed04d15dc8a3023994", + "sha256:d3436110ca66fe3981031cc6aff8cc7a40d8411d173dde73ddaa5b8445385e2d", + "sha256:e495096e3e2e8f7192afb6aaeba19babc4fb2bdf543d7b7fed59e00c1df7f170", + "sha256:e66a824f44892bc4ec66c58601a413419cafa9cec895e63d8da889c8a1a4fa4a" + ], + "index": "pypi", + "version": "==0.4.17" + }, + "gunicorn": { + "hashes": [ + "sha256:1904bb2b8a43658807108d59c3f3d56c2b6121a701161de0ddf9ad140073c626", + "sha256:cd4a810dd51bf497552cf3f863b575dabd73d6ad6a91075b65936b151cbf4f9c" + ], + "index": "pypi", + "version": "==20.0.4" + }, + "h11": { + "hashes": [ + "sha256:acca6a44cb52a32ab442b1779adf0875c443c689e9e028f8d831a3769f9c5208", + "sha256:f2b1ca39bfed357d1f19ac732913d5f9faa54a5062eca7d2ec3a916cfb7ae4c7" + ], + "index": "pypi", + "version": "==0.8.1" + }, + "h2": { + "hashes": [ + "sha256:c8f387e0e4878904d4978cd688a3195f6b169d49b1ffa572a3d347d7adc5e09f", + "sha256:fd07e865a3272ac6ef195d8904de92dc7b38dc28297ec39cfa22716b6d62e6eb" + ], + "index": "pypi", + "version": "==3.1.0" + }, + "hpack": { + "hashes": [ + "sha256:0edd79eda27a53ba5be2dfabf3b15780928a0dff6eb0c60a3d6767720e970c89", + 
"sha256:8eec9c1f4bfae3408a3f30500261f7e6a65912dc138526ea054f9ad98892e9d2" + ], + "index": "pypi", + "version": "==3.0.0" + }, + "htcondor": { + "hashes": [ + "sha256:34ea1e214284aca5a06cee4d756c8873a1787477f4fe6a045d3e1a0b42702b52", + "sha256:3bed2a0c4138e37c6bf41b18a559b0513b90fac4a01f0cd97f99ce02b12d6e83", + "sha256:7acc1bde00339634806b3e35010b62ab605aa83bfa56ae1040301c6008983371", + "sha256:7c6dd6524a4f986801cc3a65c69c6b2946e9fa0e1243dffd004bfd52b56e06fa", + "sha256:937daed135d2153cd6d29562cf8253674df0c6748f2887f67e38ba9c42906e1f", + "sha256:b3f9e7557061fd6c3dd8a0ac7d75f045e0b99c6037c9bd7a120e271b6d79b02d", + "sha256:ca436eac7a27f353045278b7276f5146f3f41af618cb6c4234019e19ea6631a7" + ], + "index": "pypi", + "version": "==8.9.8" + }, + "httpcore": { + "hashes": [ + "sha256:96f910b528d47b683242ec207050c7bbaa99cd1b9a07f78ea80cf61e55556b58" + ], + "index": "pypi", + "version": "==0.3.0" + }, + "httptools": { + "hashes": [ + "sha256:e00cbd7ba01ff748e494248183abc6e153f49181169d8a3d41bb49132ca01dfc" + ], + "index": "pypi", + "version": "==0.0.13" + }, + "hyperframe": { + "hashes": [ + "sha256:5187962cb16dcc078f23cb5a4b110098d546c3f41ff2d4038a9896893bbd0b40", + "sha256:a9f5c17f2cc3c719b917c4f33ed1c61bd1f8dfac4b1bd23b7c80b3400971b41f" + ], + "index": "pypi", + "version": "==5.2.0" + }, + "idna": { + "hashes": [ + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + ], + "index": "pypi", + "version": "==2.8" + }, + "importlib-metadata": { + "hashes": [ + "sha256:77a540690e24b0305878c37ffd421785a6f7e53c8b5720d211b211de8d0e95da", + "sha256:cefa1a2f919b866c5beb7c9f7b0ebb4061f30a8a9bf16d609b000e2dfaceb9c3" + ], + "index": "pypi", + "version": "==2.0.0" + }, + "iniconfig": { + "hashes": [ + "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", + "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" + ], + "index": "pypi", + "version": "==1.1.1" + }, + "jinja2": { + "hashes": [ + "sha256:03e47ad063331dd6a3f04a43eddca8a966a26ba0c5b7207a9a9e4e08f1b29419", + "sha256:a6d58433de0ae800347cab1fa3043cebbabe8baa9d29e668f1c768cb87a333c6" + ], + "index": "pypi", + "version": "==2.11.3" + }, + "jsonrpcbase": { + "hashes": [ + "sha256:7ea67fc1a7c87756e9a876e18a342e431e80d0ef3ba867dfd6f3fac5bf3fcc0d" + ], + "index": "pypi", + "version": "==0.2.0" + }, + "maps": { + "hashes": [ + "sha256:a92131122b3f6a2acc008e6a4d341a8510da5a83da39b76ef7a49807e1b28de5" + ], + "index": "pypi", + "version": "==5.1.1" + }, + "markupsafe": { + "hashes": [ + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", + "sha256:195d7d2c4fbb0ee8139a6cf67194f3973a6b3042d742ebe0a9ed36d8b6f0c07f", + "sha256:22c178a091fc6630d0d045bdb5992d2dfe14e3259760e713c490da5323866c39", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:2beec1e0de6924ea551859edb9e7679da6e4870d32cb766240ce17e0a0ba2014", + "sha256:3b8a6499709d29c2e2399569d96719a1b21dcd94410a586a18526b143ec8470f", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + 
"sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", + "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", + "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:6f1e273a344928347c1290119b493a1f0303c52f5a5eae5f16d74f48c15d4a85", + "sha256:6fffc775d90dcc9aed1b89219549b329a9250d918fd0b8fa8d93d154918422e1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:7fed13866cf14bba33e7176717346713881f56d9d2bcebab207f7a036f41b850", + "sha256:84dee80c15f1b560d55bcfe6d47b27d070b4681c699c572af2e3c7cc90a3b8e0", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98bae9582248d6cf62321dcb52aaf5d9adf0bad3b40582925ef7c7f0ed85fceb", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + "sha256:a6a744282b7718a2a62d2ed9d993cad6f5f585605ad352c11de459f4108df0a1", + "sha256:acf08ac40292838b3cbbb06cfe9b2cb9ec78fce8baca31ddb87aaac2e2dc3bc2", + "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + "sha256:b1dba4527182c95a0db8b6060cc98ac49b9e2f5e64320e2b56e47cb2831978c7", + "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:b7d644ddb4dbd407d31ffb699f1d140bc35478da613b441c582aeb7c43838dd8", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", + "sha256:bf5aa3cbcfdf57fa2ee9cd1822c862ef23037f5c832ad09cfea57fa846dec193", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:caabedc8323f1e93231b52fc32bdcde6db817623d33e100708d9a68e1f53b26b", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:d53bc011414228441014aa71dbec320c66468c1030aae3a6e29778a3382d96e5", + "sha256:d73a845f227b0bfe8a7455ee623525ee656a9e2e749e4742706d80a6065d5e2c", + "sha256:d9be0ba6c527163cbed5e0857c451fcd092ce83947944d6c14bc95441203f032", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be", + "sha256:feb7b34d6325451ef96bc0e36e1a6c0c1c64bc1fbec4b854f4529e51887b1621" + ], + "index": "pypi", + "version": "==1.1.1" + }, + "memory-profiler": { + "hashes": [ + "sha256:5fa47b274c929dd2cbcd9190afb62fec110701251d2ac2d301caaf545c81afc1" + ], + "index": "pypi", + "version": "==0.55.0" + }, + "mock": { + "hashes": [ + "sha256:83657d894c90d5681d62155c82bda9c1187827525880eda8ff5df4ec813437c3", + "sha256:d157e52d4e5b938c550f39eb2fd15610db062441a9c2747d3dbfa9298211d0f8" + ], + "index": "pypi", + "version": "==3.0.5" + }, + "mongoengine": { + 
"hashes": [ + "sha256:136d93af442d867e1d52f90c3d3066112ad91578239d0bf5b8131b3ea15b3d75", + "sha256:4b5a4aa317a138f09df956deaac1d62c1618af24a27c1826a6893709007d9047" + ], + "index": "pypi", + "version": "==0.23.0" + }, + "multidict": { + "hashes": [ + "sha256:024b8129695a952ebd93373e45b5d341dbb87c17ce49637b34000093f243dd4f", + "sha256:041e9442b11409be5e4fc8b6a97e4bcead758ab1e11768d1e69160bdde18acc3", + "sha256:045b4dd0e5f6121e6f314d81759abd2c257db4634260abcfe0d3f7083c4908ef", + "sha256:047c0a04e382ef8bd74b0de01407e8d8632d7d1b4db6f2561106af812a68741b", + "sha256:068167c2d7bbeebd359665ac4fff756be5ffac9cda02375b5c5a7c4777038e73", + "sha256:148ff60e0fffa2f5fad2eb25aae7bef23d8f3b8bdaf947a65cdbe84a978092bc", + "sha256:1d1c77013a259971a72ddaa83b9f42c80a93ff12df6a4723be99d858fa30bee3", + "sha256:1d48bc124a6b7a55006d97917f695effa9725d05abe8ee78fd60d6588b8344cd", + "sha256:31dfa2fc323097f8ad7acd41aa38d7c614dd1960ac6681745b6da124093dc351", + "sha256:34f82db7f80c49f38b032c5abb605c458bac997a6c3142e0d6c130be6fb2b941", + "sha256:3d5dd8e5998fb4ace04789d1d008e2bb532de501218519d70bb672c4c5a2fc5d", + "sha256:4a6ae52bd3ee41ee0f3acf4c60ceb3f44e0e3bc52ab7da1c2b2aa6703363a3d1", + "sha256:4b02a3b2a2f01d0490dd39321c74273fed0568568ea0e7ea23e02bd1fb10a10b", + "sha256:4b843f8e1dd6a3195679d9838eb4670222e8b8d01bc36c9894d6c3538316fa0a", + "sha256:5de53a28f40ef3c4fd57aeab6b590c2c663de87a5af76136ced519923d3efbb3", + "sha256:61b2b33ede821b94fa99ce0b09c9ece049c7067a33b279f343adfe35108a4ea7", + "sha256:6a3a9b0f45fd75dc05d8e93dc21b18fc1670135ec9544d1ad4acbcf6b86781d0", + "sha256:76ad8e4c69dadbb31bad17c16baee61c0d1a4a73bed2590b741b2e1a46d3edd0", + "sha256:7ba19b777dc00194d1b473180d4ca89a054dd18de27d0ee2e42a103ec9b7d014", + "sha256:7c1b7eab7a49aa96f3db1f716f0113a8a2e93c7375dd3d5d21c4941f1405c9c5", + "sha256:7fc0eee3046041387cbace9314926aa48b681202f8897f8bff3809967a049036", + "sha256:8ccd1c5fff1aa1427100ce188557fc31f1e0a383ad8ec42c559aabd4ff08802d", + "sha256:8e08dd76de80539d613654915a2f5196dbccc67448df291e69a88712ea21e24a", + "sha256:c18498c50c59263841862ea0501da9f2b3659c00db54abfbf823a80787fde8ce", + "sha256:c49db89d602c24928e68c0d510f4fcf8989d77defd01c973d6cbe27e684833b1", + "sha256:ce20044d0317649ddbb4e54dab3c1bcc7483c78c27d3f58ab3d0c7e6bc60d26a", + "sha256:d1071414dd06ca2eafa90c85a079169bfeb0e5f57fd0b45d44c092546fcd6fd9", + "sha256:d3be11ac43ab1a3e979dac80843b42226d5d3cccd3986f2e03152720a4297cd7", + "sha256:db603a1c235d110c860d5f39988ebc8218ee028f07a7cbc056ba6424372ca31b" + ], + "index": "pypi", + "version": "==4.5.2" + }, + "nose": { + "hashes": [ + "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", + "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a", + "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98" + ], + "index": "pypi", + "version": "==1.3.7" + }, + "packaging": { + "hashes": [ + "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5", + "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a" + ], + "version": "==20.9" + }, + "pluggy": { + "hashes": [ + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" + ], + "index": "pypi", + "version": "==0.13.1" + }, + "psutil": { + "hashes": [ + "sha256:06660136ab88762309775fd47290d7da14094422d915f0466e0adf8e4b22214e", + "sha256:0c11adde31011a286197630ba2671e34651f004cc418d30ae06d2033a43c9e20", + "sha256:0c211eec4185725847cb6c28409646c7cfa56fdb531014b35f97b5dc7fe04ff9", + 
"sha256:0fc7a5619b47f74331add476fbc6022d7ca801c22865c7069ec0867920858963", + "sha256:3004361c6b93dbad71330d992c1ae409cb8314a6041a0b67507cc882357f583e", + "sha256:5e8dbf31871b0072bcba8d1f2861c0ec6c84c78f13c723bb6e981bce51b58f12", + "sha256:6d81b9714791ef9a3a00b2ca846ee547fc5e53d259e2a6258c3d2054928039ff", + "sha256:724390895cff80add7a1c4e7e0a04d9c94f3ee61423a2dcafd83784fabbd1ee9", + "sha256:ad21281f7bd6c57578dd53913d2d44218e9e29fd25128d10ff7819ef16fa46e7", + "sha256:f21a7bb4b207e4e7c60b3c40ffa89d790997619f04bbecec9db8e3696122bc78", + "sha256:f60042bef7dc50a78c06334ca8e25580455948ba2fa98f240d034a4fed9141a5" + ], + "index": "pypi", + "version": "==5.6.6" + }, + "py": { + "hashes": [ + "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3", + "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a" + ], + "index": "pypi", + "version": "==1.10.0" + }, + "pycosat": { + "hashes": [ + "sha256:4c99874946a7e939bb941bbb019dd2c20e6068e3107c91366e7779c69d70e0ed" + ], + "index": "pypi", + "version": "==0.6.3" + }, + "pycparser": { + "hashes": [ + "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" + ], + "index": "pypi", + "version": "==2.19" + }, + "pymongo": { + "hashes": [ + "sha256:32421df60d06f479d71b6b539642e410ece3006e8910688e68df962c8eb40a21", + "sha256:324b22a8443e11faca44c96b20e7ec8a9e59a1e664457edeeb4f796080b31cde", + "sha256:4505ff8b7923dd7a8bed1bf25c9c4d0df5ab0b8b2821f2296533f2149a55f401", + "sha256:460b224681ea711e48e3638d15be2249024031b7dcb9622ba19c2e85bd5a26cc", + "sha256:47473b70c5f3cd5ddd2c49ab3b9ceafdafbbed5bc963f147df22a9343d7978f5", + "sha256:49375839af76834e9c5c3cc78c78386873fd0b2ad9a0860a7dc4ec9fe73af9dd", + "sha256:4a65f0f71ece86c860d30a1436b646db8ea32aec518845ef2903ca569faec32e", + "sha256:530621906c5dd6d27305b39c4e017701e5f4299aa68b93cde70eb985f94ca26f", + "sha256:54f4770b5810e8dc3cbeed675874195f02bb2bc4e95a9d665068edfb3baff4f7", + "sha256:5ed9382410e938b0ff76041c34018210504729a83bcf4f6a70c7092c28169f6f", + "sha256:61cad83637ae12c1c825130d7f9325cd6c162e3a64e8747a8144866020be3ff4", + "sha256:61e8e1c58b4fdf47ab79b7c7db8bb022c1e40b3b5fcbbaeea5fc94dc5c75638d", + "sha256:6e04e496af7d156b66cce70460011c621ecbadf5dcdce325c7acbb3cd6ea245d", + "sha256:7ef89ec435e89da902451dde6845066fe2770befaf0301fe2a1ac426b51fced3", + "sha256:854e8425e5eb775ccfffad04ecd094c99923d60a2c2d49babb5c435e836a91fa", + "sha256:9569796d48498e4db4e1d56284b626a8ed15f641ce3a8b2085f06bb03f4c2c88", + "sha256:9d50c99c6388863cbfdc5db9bad62e3a7c2e5fc151554a07c7f3c2530334a34f", + "sha256:9ea016c2c011df21f77c1f806ce45129a344ba2d414bd50f9e065b13a4a134be", + "sha256:a8421f0823174888fb12a5fa675322e756499d71e77ff712b4412d4b8f3c6503", + "sha256:aef7d88384ada699976350a285c7a333f96ebc959e98e7d2c98589f47bbf3b7f", + "sha256:b4d7ff9957ee770cf03bd7156a68a2f2e838e60712d9608eadc8741c15d01e72", + "sha256:c1db85c39e6a60588f855dbc7bd68fb0dab796096148ab5aa4abecaff19e1c6e", + "sha256:cee2fc0b94e66e7230da12fc4b3d34793c49957e16ee04f6468a94e264a1e41d", + "sha256:cf1dea28379a16b23e47db312883f07b3ba8d9d6abc1c59e51d4c8ae1820ab43", + "sha256:d1cd175df7c8b5fc976bade78bf4d9fb5aa7ab465c0f59931e380bbe188ef8fc", + "sha256:d48a94edf3cdd34524936a72ea01b352682b337f33a42db10ba29a96c37147d3", + "sha256:d9cc103a4e97f78bc77a1d72759ab3722f6cdf0374ad4fb4b0c53bd3238bdf98", + "sha256:fcb9ae8aa9158106c5d98a4349ec0d90b68f052d620b2d24622ba03b91e4d81d" + ], + "index": "pypi", + "version": "==3.8.0" + }, + "pyopenssl": { + "hashes": [ + "sha256:621880965a720b8ece2f1b2f54ea2071966ab00e2970ad2ce11d596102063504", 
+ "sha256:9a24494b2602aaf402be5c9e30a0b82d4a5c67528fe8fb475e3f3bc00dd69507" + ], + "index": "pypi", + "version": "==19.1.0" + }, + "pyparsing": { + "hashes": [ + "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1", + "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b" + ], + "index": "pypi", + "version": "==2.4.7" + }, + "pysocks": { + "hashes": [ + "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", + "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", + "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" + ], + "index": "pypi", + "version": "==1.7.1" + }, + "pytest": { + "hashes": [ + "sha256:7a8190790c17d79a11f847fba0b004ee9a8122582ebff4729a082c109e81a4c9", + "sha256:8f593023c1a0f916110285b6efd7f99db07d59546e3d8c36fc60e2ab05d3be92" + ], + "index": "pypi", + "version": "==6.1.1" + }, + "pytest-cov": { + "hashes": [ + "sha256:cc6742d8bac45070217169f5f72ceee1e0e55b0221f54bcf24845972d3a47f2b", + "sha256:cdbdef4f870408ebdbfeb44e63e07eb18bb4619fae852f6e760645fa36172626" + ], + "index": "pypi", + "version": "==2.8.1" + }, + "pytest-profiling": { + "hashes": [ + "sha256:93938f147662225d2b8bd5af89587b979652426a8a6ffd7e73ec4a23e24b7f29", + "sha256:999cc9ac94f2e528e3f5d43465da277429984a1c237ae9818f8cfd0b06acb019" + ], + "index": "pypi", + "version": "==1.7.0" + }, + "python-dateutil": { + "hashes": [ + "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", + "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" + ], + "index": "pypi", + "version": "==2.8.0" + }, + "python-dotenv": { + "hashes": [ + "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", + "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + ], + "index": "pypi", + "version": "==0.10.3" + }, + "requests": { + "hashes": [ + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + ], + "index": "pypi", + "version": "==2.22.0" + }, + "requests-async": { + "hashes": [ + "sha256:8731420451383196ecf2fd96082bfc8ae5103ada90aba185888499d7784dde6f" + ], + "index": "pypi", + "version": "==0.5.0" + }, + "requests-mock": { + "hashes": [ + "sha256:510df890afe08d36eca5bb16b4aa6308a6f85e3159ad3013bac8b9de7bd5a010", + "sha256:88d3402dd8b3c69a9e4f9d3a73ad11b15920c6efd36bc27bf1f701cf4a8e4646" + ], + "index": "pypi", + "version": "==1.7.0" + }, + "rfc3986": { + "hashes": [ + "sha256:0344d0bd428126ce554e7ca2b61787b6a28d2bbd19fc70ed2dd85efe31176405", + "sha256:df4eba676077cefb86450c8f60121b9ae04b94f65f85b69f3f731af0516b7b18" + ], + "index": "pypi", + "version": "==1.3.2" + }, + "ruamel.yaml": { + "hashes": [ + "sha256:18078354bfcf00d51bcc17984aded80840379aed36036f078479e191b59bc059", + "sha256:211e6ef2530f44fc3197c713892678e7fbfbc40a1db6741179d6981514be1674", + "sha256:2e8f7cee12a2372cec4480fe81086b1fdab163f4b56e58b5592a105c52973b78", + "sha256:48cc8e948a7ec4917bf94adff2cc1255e98f1eef5e1961889886acc4ff3a7194", + "sha256:4a0c7f970aa0e30bc541f690fbd14aca19de1cab70787180de5083b902ec40b5", + "sha256:5dd0ea7c5c703e8675f3caf2898a50b4dadaa52838f8e104637a452a05e03030", + "sha256:612fb4833f1978ceb7fd7a24d86a5ebd103bcc408394f3af621293194658cf1b", + "sha256:61c421a7a2b8e2886a94fbe29866df6b99451998abaa1584b9fdc9c10c33e40b", + "sha256:6483416847980aa7090b697d177a8754c4f340683cc84abd38da7b850826687d", + "sha256:6622f3b0cae7ed6fe5d3d6a6d8d8cb9413a05b408d69a789a57b77a616bb6562", + 
"sha256:80b2acde0d1b9d25e5c041960a9149480c15c6d9f4c24b8ddb381b14e9e70ea4", + "sha256:8f9ed94be17f306485df8fd0274a30f130a73f127798657d4dc65b1f89ec7a36", + "sha256:9a6b94cc9b6e738036426498ac9fe8ca05afea4249fb9dec1be32ce4823d5756", + "sha256:a4b11dfe421a9836c723107a4ccc9cab9674de611ba60b8212e85526ea8bf254", + "sha256:a55e55c6ecb5725ba472f9b811940e8d258a32fb36f5793dbc38582d6f377f3f", + "sha256:a736ab1d8c2d5566254a1a2ee38e7c5460520bcccd4a8f0feb25a4463735e5a7", + "sha256:c29d0a3cffa5a25f5259bfeac06ffdc5e7d1fd38a0a26a6664d160192730434f", + "sha256:c33458217a8c352b59c86065c4f05f3f1ac28b01c3e1a422845c306237446bf3", + "sha256:cc9bd3c3fa8a928f7b6e19fe8de13a61deb91f257eccbe0d16114ce8c54cdc81", + "sha256:d63b7c828a7358ce5b03a3e2c2a3e5a7058a954f8919334cb09b3d8541d1fff6", + "sha256:fbd301680a3563e84d667042dac1c5d50ef402ecf1f4b1763507a6877b8181ad", + "sha256:fc67e79e2f5083be6fd1000c4646e13a891585772a503f56f51f845b547fe621" + ], + "index": "pypi", + "version": "==0.15.87" + }, + "sanic": { + "hashes": [ + "sha256:cc64978266025afb0e7c0f8be928e2b81670c5d58ddac290d04c9d0da6ec2112", + "sha256:ebd806298782400db811ea9d63e8096e835e67f0b5dc5e66e507532984a82bb3" + ], + "index": "pypi", + "version": "==19.6.0" + }, + "sentry-sdk": { + "hashes": [ + "sha256:23808d571d2461a4ce3784ec12bbee5bdb8c026c143fe79d36cef8a6d653e71f", + "sha256:bb90a4e19c7233a580715fc986cc44be2c48fc10b31e71580a2037e1c94b6950" + ], + "index": "pypi", + "version": "==0.14.3" + }, + "six": { + "hashes": [ + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" + ], + "index": "pypi", + "version": "==1.14.0" + }, + "slackclient": { + "hashes": [ + "sha256:b1b24df115e78b908565d9fa67bb3a86e66dd9a133954b953eb1c0559e7205b9", + "sha256:ccb0b8b203bc6087f7ab995fb4d2971dbe8925472afb078087ed76d1d8f939ca" + ], + "index": "pypi", + "version": "==2.7.1" + }, + "toml": { + "hashes": [ + "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f", + "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88" + ], + "index": "pypi", + "version": "==0.10.1" + }, + "tqdm": { + "hashes": [ + "sha256:251ee8440dbda126b8dfa8a7c028eb3f13704898caaef7caa699b35e119301e2", + "sha256:fe231261cfcbc6f4a99165455f8f6b9ef4e1032a6e29bccf168b4bf42012f09c" + ], + "index": "pypi", + "version": "==4.42.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918", + "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c", + "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f" + ], + "index": "pypi", + "version": "==3.7.4.3" + }, + "ujson": { + "hashes": [ + "sha256:f66073e5506e91d204ab0c614a148d5aa938bdbf104751be66f8ad7a222f5f86" + ], + "index": "pypi", + "version": "==1.35" + }, + "urllib3": { + "hashes": [ + "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", + "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" + ], + "index": "pypi", + "version": "==1.25.8" + }, + "uvloop": { + "hashes": [ + "sha256:0fcd894f6fc3226a962ee7ad895c4f52e3f5c3c55098e21efb17c071849a0573", + "sha256:2f31de1742c059c96cb76b91c5275b22b22b965c886ee1fced093fa27dde9e64", + "sha256:459e4649fcd5ff719523de33964aa284898e55df62761e7773d088823ccbd3e0", + "sha256:67867aafd6e0bc2c30a079603a85d83b94f23c5593b3cc08ec7e58ac18bf48e5", + "sha256:8c200457e6847f28d8bb91c5e5039d301716f5f2fce25646f5fb3fd65eda4a26", + 
"sha256:958906b9ca39eb158414fbb7d6b8ef1b7aee4db5c8e8e5d00fcbb69a1ce9dca7", + "sha256:ac1dca3d8f3ef52806059e81042ee397ac939e5a86c8a3cea55d6b087db66115", + "sha256:b284c22d8938866318e3b9d178142b8be316c52d16fcfe1560685a686718a021", + "sha256:c48692bf4587ce281d641087658eca275a5ad3b63c78297bbded96570ae9ce8f", + "sha256:fefc3b2b947c99737c348887db2c32e539160dcbeb7af9aa6b53db7a283538fe" + ], + "index": "pypi", + "version": "==0.12.2" + }, + "websocket-client": { + "hashes": [ + "sha256:0fc45c961324d79c781bab301359d5a1b00b13ad1b10415a4780229ef71a5549", + "sha256:d735b91d6d1692a6a181f2a8c9e0238e5f6373356f561bb9dc4c7af36f452010" + ], + "index": "pypi", + "version": "==0.57.0" + }, + "websockets": { + "hashes": [ + "sha256:0e2f7d6567838369af074f0ef4d0b802d19fa1fee135d864acc656ceefa33136", + "sha256:2a16dac282b2fdae75178d0ed3d5b9bc3258dabfae50196cbb30578d84b6f6a6", + "sha256:5a1fa6072405648cb5b3688e9ed3b94be683ce4a4e5723e6f5d34859dee495c1", + "sha256:5c1f55a1274df9d6a37553fef8cff2958515438c58920897675c9bc70f5a0538", + "sha256:669d1e46f165e0ad152ed8197f7edead22854a6c90419f544e0f234cc9dac6c4", + "sha256:695e34c4dbea18d09ab2c258994a8bf6a09564e762655408241f6a14592d2908", + "sha256:6b2e03d69afa8d20253455e67b64de1a82ff8612db105113cccec35d3f8429f0", + "sha256:79ca7cdda7ad4e3663ea3c43bfa8637fc5d5604c7737f19a8964781abbd1148d", + "sha256:7fd2dd9a856f72e6ed06f82facfce01d119b88457cd4b47b7ae501e8e11eba9c", + "sha256:82c0354ac39379d836719a77ee360ef865377aa6fdead87909d50248d0f05f4d", + "sha256:8f3b956d11c5b301206382726210dc1d3bee1a9ccf7aadf895aaf31f71c3716c", + "sha256:91ec98640220ae05b34b79ee88abf27f97ef7c61cf525eec57ea8fcea9f7dddb", + "sha256:952be9540d83dba815569d5cb5f31708801e0bbfc3a8c5aef1890b57ed7e58bf", + "sha256:99ac266af38ba1b1fe13975aea01ac0e14bb5f3a3200d2c69f05385768b8568e", + "sha256:9fa122e7adb24232247f8a89f2d9070bf64b7869daf93ac5e19546b409e47e96", + "sha256:a0873eadc4b8ca93e2e848d490809e0123eea154aa44ecd0109c4d0171869584", + "sha256:cb998bd4d93af46b8b49ecf5a72c0a98e5cc6d57fdca6527ba78ad89d6606484", + "sha256:e02e57346f6a68523e3c43bbdf35dde5c440318d1f827208ae455f6a2ace446d", + "sha256:e79a5a896bcee7fff24a788d72e5c69f13e61369d055f28113e71945a7eb1559", + "sha256:ee55eb6bcf23ecc975e6b47c127c201b913598f38b6a300075f84eeef2d3baff", + "sha256:f1414e6cbcea8d22843e7eafdfdfae3dd1aba41d1945f6ca66e4806c07c4f454" + ], + "index": "pypi", + "version": "==6.0" + }, + "yarl": { + "hashes": [ + "sha256:040b237f58ff7d800e6e0fd89c8439b841f777dd99b4a9cca04d6935564b9409", + "sha256:17668ec6722b1b7a3a05cc0167659f6c95b436d25a36c2d52db0eca7d3f72593", + "sha256:3a584b28086bc93c888a6c2aa5c92ed1ae20932f078c46509a66dce9ea5533f2", + "sha256:4439be27e4eee76c7632c2427ca5e73703151b22cae23e64adb243a9c2f565d8", + "sha256:48e918b05850fffb070a496d2b5f97fc31d15d94ca33d3d08a4f86e26d4e7c5d", + "sha256:9102b59e8337f9874638fcfc9ac3734a0cfadb100e47d55c20d0dc6087fb4692", + "sha256:9b930776c0ae0c691776f4d2891ebc5362af86f152dd0da463a6614074cb1b02", + "sha256:b3b9ad80f8b68519cc3372a6ca85ae02cc5a8807723ac366b53c0f089db19e4a", + "sha256:bc2f976c0e918659f723401c4f834deb8a8e7798a71be4382e024bcc3f7e23a8", + "sha256:c22c75b5f394f3d47105045ea551e08a3e804dc7e01b37800ca35b58f856c3d6", + "sha256:c52ce2883dc193824989a9b97a76ca86ecd1fa7955b14f87bf367a61b6232511", + "sha256:ce584af5de8830d8701b8979b18fcf450cef9a382b1a3c8ef189bedc408faf1e", + "sha256:da456eeec17fa8aa4594d9a9f27c0b1060b6a75f2419fe0c00609587b2695f4a", + "sha256:db6db0f45d2c63ddb1a9d18d1b9b22f308e52c83638c26b422d520a815c4b3fb", + 
"sha256:df89642981b94e7db5596818499c4b2219028f2a528c9c37cc1de45bf2fd3a3f", + "sha256:f18d68f2be6bf0e89f1521af2b1bb46e66ab0018faafa81d70f358153170a317", + "sha256:f379b7f83f23fe12823085cd6b906edc49df969eb99757f58ff382349a3303c6" + ], + "index": "pypi", + "version": "==1.5.1" + }, + "zipp": { + "hashes": [ + "sha256:16522f69653f0d67be90e8baa4a46d66389145b734345d68a257da53df670903", + "sha256:c1532a8030c32fd52ff6a288d855fe7adef5823ba1d26a29a68fd6314aa72baa" + ], + "index": "pypi", + "version": "==3.3.1" + }, + "zope.event": { + "hashes": [ + "sha256:2666401939cdaa5f4e0c08cf7f20c9b21423b95e88f4675b1443973bdb080c42", + "sha256:5e76517f5b9b119acf37ca8819781db6c16ea433f7e2062c4afc2b6fbedb1330" + ], + "index": "pypi", + "version": "==4.5.0" + }, + "zope.interface": { + "hashes": [ + "sha256:040f833694496065147e76581c0bf32b229a8b8c5eda120a0293afb008222387", + "sha256:11198b44e4a3d8c7a80cc20bbdd65522258a4d82fe467cd310c9fcce8ffe2ed2", + "sha256:121a9dccfe0c34be9c33b2c28225f0284f9b8e090580ffdff26c38fa16c7ffe1", + "sha256:15f3082575e7e19581a80b866664f843719b647a7f7189c811ba7f9ab3309f83", + "sha256:1d73d8986f948525536956ddd902e8a587a6846ebf4492117db16daba2865ddf", + "sha256:208e82f73b242275b8566ac07a25158e7b21fa2f14e642a7881048430612d1a6", + "sha256:2557833df892558123d791d6ff80ac4a2a0351f69c7421c7d5f0c07db72c8865", + "sha256:25ea6906f9987d42546329d06f9750e69f0ee62307a2e7092955ed0758e64f09", + "sha256:2c867914f7608674a555ac8daf20265644ac7be709e1da7d818089eebdfe544e", + "sha256:2eadac20711a795d3bb7a2bfc87c04091cb5274d9c3281b43088a1227099b662", + "sha256:37999d5ebd5d7bcd32438b725ca3470df05a7de8b1e9c0395bef24296b31ca99", + "sha256:3ae8946d51789779f76e4fa326fd6676d8c19c1c3b4c4c5e9342807185264875", + "sha256:5636cd7e60583b1608044ae4405e91575399430e66a5e1812f4bf30bcc55864e", + "sha256:570e637cb6509998555f7e4af13006d89fad6c09cfc5c4795855385391063e4b", + "sha256:590a40447ff3803c44050ce3c17c3958f11ca028dae3eacdd7b96775184394fa", + "sha256:5aab51b9c1af1b8a84f40aa49ffe1684d41810b18d6c3e94aa50194e0a563f01", + "sha256:5ffe4e0753393bcbcfc9a58133ed3d3a584634cc7cc2e667f8e3e6fbcbb2155d", + "sha256:663982381bd428a275a841009e52983cc69c471a4979ce01344fadbf72cf353d", + "sha256:6d06bf8e24dd6c473c4fbd8e16a83bd2e6d74add6ba25169043deb46d497b211", + "sha256:6e5b9a4bf133cf1887b4a04c21c10ca9f548114f19c83957b2820d5c84254940", + "sha256:70a2aed9615645bbe9d82c0f52bc7e676d2c0f8a63933d68418e0cb307f30536", + "sha256:7750746421c4395e3d2cc3d805919f4f57bb9f2a9a0ccd955566a9341050a1b4", + "sha256:7fc8708bc996e50fc7a9a2ad394e1f015348e389da26789fa6916630237143d7", + "sha256:91abd2f080065a7c007540f6bbd93ef7bdbbffa6df4a4cfab3892d8623b83c98", + "sha256:988f8b2281f3d95c66c01bdb141cefef1cc97db0d473c25c3fe2927ef00293b9", + "sha256:9f56121d8a676802044584e6cc41250bbcde069d8adf725b9b817a6b0fd87f09", + "sha256:a0f51536ce6e817a7aa25b0dca8b62feb210d4dc22cabfe8d1a92d47979372cd", + "sha256:a1cdd7390d7f66ddcebf545203ca3728c4890d605f9f2697bc8e31437906e8e7", + "sha256:b10eb4d0a77609679bf5f23708e20b1cd461a1643bd8ea42b1ca4149b1a5406c", + "sha256:b274ac8e511b55ffb62e8292316bd2baa80c10e9fe811b1aa5ce81da6b6697d8", + "sha256:c75b502af2c83fcfa2ee9c2257c1ba5806634a91a50db6129ff70e67c42c7e7b", + "sha256:c9c8e53a5472b77f6a391b515c771105011f4b40740ce53af8428d1c8ca20004", + "sha256:d867998a56c5133b9d31992beb699892e33b72150a8bf40f86cb52b8c606c83f", + "sha256:eb566cab630ec176b2d6115ed08b2cf4d921b47caa7f02cca1b4a9525223ee94", + "sha256:f61e6b95b414431ffe9dc460928fe9f351095fde074e2c2f5c6dda7b67a2192d", + 
"sha256:f718675fd071bcce4f7cbf9250cbaaf64e2e91ef1b0b32a1af596e7412647556", + "sha256:f9d4bfbd015e4b80dbad11c97049975f94592a6a0440e903ee647309f6252a1f", + "sha256:fae50fc12a5e8541f6f1cc4ed744ca8f76a9543876cf63f618fb0e6aca8f8375", + "sha256:fcf9c8edda7f7b2fd78069e97f4197815df5e871ec47b0f22580d330c6dec561", + "sha256:fdedce3bc5360bd29d4bb90396e8d4d3c09af49bc0383909fe84c7233c5ee675" + ], + "index": "pypi", + "version": "==5.1.2" + } + }, + "develop": {} +} diff --git a/Pipfile_notes.md b/Pipfile_notes.md new file mode 100644 index 000000000..a6a4c6600 --- /dev/null +++ b/Pipfile_notes.md @@ -0,0 +1,5 @@ +Note that requirements*.txt is currently the source of truth for which modules and versions +are required for this software. The Pipfile is provided as a convenience for users of tools +that consume one. + +You should verify that it is equivalent to requirements*.txt before using it. \ No newline at end of file diff --git a/README.md b/README.md index 425bac778..3b584efa4 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![codecov](https://codecov.io/gh/kbase/execution_engine2/branch/develop/graph/badge.svg)](https://codecov.io/gh/kbase/execution_engine2) [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=kbase_execution_engine2&metric=alert_status)](https://sonarcloud.io/dashboard?id=kbase_execution_engine2) - This is a [KBase](https://kbase.us) module generated by the [KBase Software Development Kit (SDK)](https://github.com/kbase/kb_sdk). You will need to have the SDK installed to use this module. [Learn more about the SDK and how to use it](https://kbase.github.io/kb_sdk_docs/). @@ -20,10 +19,15 @@ You can also learn more about the apps implemented in this module from its [cata See the .travis file for information on how to test locally -# Setup and test with docker-compose on MacOS +# Setup and test with docker-compose on MacOS/Linux ## Build and exec into the dev container +Make sure you have the latest versions of + +* docker +* docker-compose + ``` git clone https://github.com/kbase/execution_engine2.git cd execution_engine2 @@ -35,6 +39,9 @@ cd /ee2 make test-coverage ``` +Once the docker image is built, it does not need to be rebuilt after code changes to rerun tests. +Just ensure the services are up, exec into the container, and run the tests. + ## To run a specific test directory or specific file ``` PYTHONPATH=.:lib:test pytest --cov-report=xml --cov lib/execution_engine2/ --verbose test/tests_for_db/ @@ -44,14 +51,105 @@ PYTHONPATH=.:lib:test pytest --cov-report=xml --cov lib/execution_engine2/ --ver ## To run a specific test file via PyCharm See [Testing with Pycharm](docs/testing_with_pycharm.md) +## To run pre-commit hooks +`exec` into the docker container as before and switch to the `/ee2` directory. + +``` +pip install pre-commit +pre-commit install +pre-commit run --all-files +``` + +To remove the pre commit hooks: +``` +pre-commit uninstall +``` + +## Installing HTCondor Bindings from the mac +* You may not be able to load without disabling the mac Security Gatekeeper with `sudo spctl --master-disable` +* The HTCondor bindings only work on the Python.org install of python or your system install of python2.7. They will not work with anaconda. 
So download python from python.org +* Download the mac bindings at https://research.cs.wisc.edu/htcondor/tarball/current/8.9.10/release/ +* Current version is [8.9.10](https://research.cs.wisc.edu/htcondor/tarball/current/8.9.10/release/condor-8.9.10-x86_64_MacOSX-unstripped.tar.gz) +* Add /lib/python3 to PYTHONPATH. +* `import htcondor` ## Test Running Options ### PyCharm * Use a remote ssh debugger with the correct path mappings * Right click on the file you'd like to run and select run test +## Develop + +* To add a bugfix or new feature: + * Create a new feature branch, branching from `develop`. Ask a repo owner for help if + necessary. + * If you're a repo owner you can push directly to this branch. If not, make pull requests to + the branch as necessary. + * Add: + * Feature / bugfix code + * Tests + * Documentation, if applicable + * Release notes, if applicable + * See the PR template in `workflows/pull_request_template.md` for details + * Once the feature is complete, create a PR from the feature branch to `develop` and request a + review from a person with EE2 knowledge via the Github interface and via Slack. + * When the PR is approved, squash and merge into `develop` and delete the feature branch. +* To create a new release: + * Increment the version as per [semantic versioning](https://semver.org/) in `kbase.yml`. + * Update the release notes to the correct version, if necessary. + * Run `make compile`. + * Go through the process above to get the changes into `develop`. + * Make a PR from `develop` to `main`. + * Once the PR is approved, merge (no squash) to `main`. + * Tag the merge commit in GitHub with the semantic version from `kbase.yml`. +## KBase Catalog interactions + +### Client Groups + +EE2 understands client group specifications in JSON and CSV formats. Both formats have special +fields in common: +* `request_cpus` - the number of CPUs to request +* `request_memory` - the amount of memory, in MB, to request +* `request_disk` - the amount of disk space, in GB, to request +* `client_group_regex` - boolean - treat the client group (see below) as a regular expression +* `debug_mode` - boolean - run the job in debug mode + +The client group is handled differently for JSON and CSV: +* The JSON format has the `client_group` field, which is optional. +* The CSV format requires the client group in the first 'column' of the CSV. The + remainder of the 'columns' must be in `key=value` format. + +Any fields other than the above are sent on to the scheduler as key value pairs. + +For example, to set the client group to `bigmem`, request 32 CPUs, 64GB of memory, and 1TB of disk, +the following would be entered in the catalog UI: +* CSV: `bigmem, request_cpus=32, request_memory=64000, request_disk=1000` +* JSON: `{"client_group": "bigmem", "request_cpus" : "32", "request_memory" : "64000", "request_disk" : "1000"}` + +Note that the representation of this data in the catalog API is idiosyncratic - both the JSON and +CSV data are split by commas into parts. EE2 will detect JSON entries and reconstitute them before +deserialization.
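As a rough illustration of the reconstitution step described above, the sketch below shows how a catalog entry that has already been split on commas could be turned back into a requirements dictionary. The helper name and logic are illustrative assumptions, not EE2's actual implementation.

```python
import json

# Hypothetical helper: normalize a catalog client group entry (CSV or JSON)
# that the catalog API has already split on commas. Illustrative only.
def parse_clientgroup_entry(parts):
    # Rejoin the comma-split pieces; a JSON entry must be reconstituted
    # before it can be deserialized.
    joined = ",".join(parts).strip()
    if joined.startswith("{"):
        return json.loads(joined)
    # CSV: first column is the client group, the rest are key=value pairs.
    fields = [p.strip() for p in joined.split(",")]
    reqs = {"client_group": fields[0]}
    for field in fields[1:]:
        key, value = field.split("=", 1)
        reqs[key.strip()] = value.strip()
    return reqs

print(parse_clientgroup_entry(
    ["bigmem", " request_cpus=32", " request_memory=64000", " request_disk=1000"]
))
# {'client_group': 'bigmem', 'request_cpus': '32', 'request_memory': '64000', 'request_disk': '1000'}
```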
+ + +# CronJobs/Reaper Scripts + +* Notifications are sent to the #ee_notifications slack channel + +### PurgeBadJobs +* Cronjobs are copied in and launched via the Dockerfile +* There are cronjobs configured in /etc/cron.d/ee2_cronjobs +* You can monitor them by reading the logs in /root/cron-purge.log + +### PurgeHeldJobs +* This is a daemon launched by entrypoint.sh +* It is not a cronjob because there is no easy way to seek through the HTCondor EXECUTE log, which takes a while to read through + +#### Horizontal Scaling +* These scripts will have to be rethought if ee2 is horizontally scaled, since we do not want multiple copies of them running. + + # Help Contact @Tianhao-Gu, @bio_boris, @briehl diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 8328e9839..68376cfdd 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,9 +1,37 @@ # execution_engine2 (ee2) release notes ========================================= + +## 0.0.6 +* Release of MVP + + +## 0.0.5 +### New Endpoints +* run_job_batch +* retry_job +* retry_jobs +* abandon_children + +### BugFixes +* Fix a bug that caused job requirements from the catalog in CSV format to be ignored other + than the client group + +### Other features and refactoring +* Refactor run_jobs_batch endpoint to cache catalog calls for batch jobs, submit entire batch to condor in one transaction +* Refactored tests +* Removed slack messages for running jobs +* Added CreatedJobsReaper +* Added retry_job and retry_jobs endpoint along with ADRs +* Full EE2 admins can now submit job requirements when running jobs via run_job_batch and +run_job. See the SDK spec for details. +* Added ADRs for retry endpoint + + ## 0.0.4 * Fix up tests * Remove dependency on slack - * Add batch endpoints + * Add batch endpoints, cancel_jobs now cancels child jobs + * Rename prod branch to "main" ## 0.0.3.4 * Change 7 day periodic_remove to 7 day hold diff --git a/bin/PurgeBadJobs.py b/bin/PurgeBadJobs.py new file mode 100644 index 000000000..d2a182c84 --- /dev/null +++ b/bin/PurgeBadJobs.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# Script to purge jobs that have been queued for too long, or stuck in the created state for too long + +import logging +import os +from configparser import ConfigParser +from datetime import datetime, timedelta, timezone +from time import sleep + +import pymongo +from bson import ObjectId + +from lib.execution_engine2.db.models.models import TerminatedCode, Status +from lib.execution_engine2.utils.SlackUtils import SlackClient +from lib.installed_clients.execution_engine2Client import execution_engine2 + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +config = ConfigParser() +config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) +ee2_endpoint = config.get(section="execution_engine2", option="ee2-url") +slack_token = config.get(section="execution_engine2", option="slack-token") + +ee2 = execution_engine2(url=ee2_endpoint, token=os.environ["EE2_ADMIN_SERVICE_TOKEN"]) +slack_client = SlackClient( + slack_token, channel="#ee_notifications", debug=True, endpoint=ee2_endpoint +) +db_client = pymongo.MongoClient( + host=config.get(section="execution_engine2", option="mongo-host"), + port=int(config.get(section="execution_engine2", option="mongo-port")), + username=config.get(section="execution_engine2", option="mongo-user"), + password=config.get(section="execution_engine2", option="mongo-password"), + authSource=config.get(section="execution_engine2", option="mongo-database"), + authMechanism=config.get(section="execution_engine2",
option="mongo-authmechanism"), + serverSelectionTimeoutMS=1000, +) +ee2_db = db_client.get_database( + config.get(section="execution_engine2", option="mongo-database") +) +ee2_jobs_collection = ee2_db.get_collection( + config.get(section="execution_engine2", option="mongo-jobs-collection") +) + +CREATED_MINUTES_AGO = 5 +QUEUE_THRESHOLD_DAYS = 14 + + +def cancel(record): + job_id = str(record["_id"]) + scheduler_id = record.get("scheduler_id") + cjp = { + "as_admin": True, + "job_id": job_id, + "terminated_code": TerminatedCode.terminated_by_automation.value, + } + print("About to cancel ee2 job", cjp) + ee2.cancel_job(params=cjp) + slack_client.cancel_job_message( + job_id=job_id, + scheduler_id=scheduler_id, + termination_code=TerminatedCode.terminated_by_automation.value, + ) + # Avoid rate limit of 1 msg per second + sleep(1) + + +def cancel_jobs_stuck_in_queue(): + """ + For jobs over 14 days old, cancel them + Update a completed Job as necessary to test this out: + ee2.update_job_status({'job_id': '601af2afeeb773acaf9de80d', 'as_admin': True, 'status': 'queued'}) + :return: + """ + queue_threshold_days = QUEUE_THRESHOLD_DAYS + before_days = ( + datetime.today() - timedelta(days=queue_threshold_days + 1) + ).timestamp() + print({"status": "queued", "queued": {"$lt": before_days}}) + stuck_jobs = ee2_jobs_collection.find( + {"status": Status.queued.value, "queued": {"$lt": before_days}} + ) + print( + f"Found {stuck_jobs.count()} jobs that were stuck in the {Status.queued.value} state over {queue_threshold_days} days" + ) + for record in stuck_jobs: + queued_time = record["queued"] + now = datetime.now(timezone.utc).timestamp() + elapsed = now - queued_time + print("queued days=", elapsed / 86000) + cancel(record) + + +def cancel_created(): + """ + For jobs that are not batch jobs, and have been in the created state for more than 5 minutes, uh oh, spaghettio, time to go + """ + + five_mins_ago = ObjectId.from_datetime( + datetime.now(timezone.utc) - timedelta(minutes=CREATED_MINUTES_AGO) + ) + stuck_jobs = ee2_jobs_collection.find( + {"status": "created", "_id": {"$lt": five_mins_ago}, "batch_job": {"$ne": True}} + ) + print( + f"Found {stuck_jobs.count()} jobs that were stuck in the {Status.created.value} state for over 5 mins" + ) + for record in stuck_jobs: + cancel(record) + + +def clean_retried_jobs(): + """Clean up jobs that couldn't finish the retry lifecycle""" + # TODO + + +def purge(): + cancel_jobs_stuck_in_queue() + cancel_created() + + +if __name__ == "__main__": + try: + purge() + except Exception as e: + slack_client.ee2_reaper_failure(endpoint=ee2_endpoint, e=e) + raise e diff --git a/bin/PurgeHeldJobs.py b/bin/PurgeHeldJobs.py index f675de91c..b8e037646 100644 --- a/bin/PurgeHeldJobs.py +++ b/bin/PurgeHeldJobs.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import logging import os -import sys import time from configparser import ConfigParser from datetime import datetime, timedelta @@ -9,31 +8,20 @@ import htcondor -# I wish a knew a better way to do this -sys.path.append(".") - from lib.execution_engine2.utils.SlackUtils import SlackClient from lib.installed_clients.execution_engine2Client import execution_engine2 -from lib.execution_engine2.utils.Condor import Condor logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) config = ConfigParser() -config_filepath = os.environ["KB_DEPLOYMENT_CONFIG"] - -# Condor -condor = Condor(config_filepath=config_filepath) -# EE2 - -cfg = condor.config -ee2_endpoint = cfg.get(section="execution_engine2", option="ee2-url") - 
+config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) +ee2_endpoint = config.get(section="execution_engine2", option="ee2-url") +slack_token = config.get(section="execution_engine2", option="slack-token") ee2 = execution_engine2(url=ee2_endpoint, token=os.environ["EE2_ADMIN_SERVICE_TOKEN"]) -# Slack -slack_token = cfg.get(section="execution_engine2", option="slack-token") -# TODO change this channel -slack_client = SlackClient(slack_token, channel="#ee_notifications", debug=True) +slack_client = SlackClient( + slack_token, channel="#ee_notifications", debug=True, endpoint=ee2_endpoint +) def read_events(path): @@ -172,5 +160,4 @@ def handle_hold_event(event): ) time.sleep(5) except Exception as e: - slack_client.ee2_reaper_failure(endpoint=ee2_endpoint) - + slack_client.ee2_reaper_failure(endpoint=ee2_endpoint, e=e) diff --git a/bin/cron_vars b/bin/cron_vars new file mode 100644 index 000000000..d7b9cec77 --- /dev/null +++ b/bin/cron_vars @@ -0,0 +1,2 @@ +EE2_ADMIN_SERVICE_TOKEN=$EE2_ADMIN_SERVICE_TOKEN +KB_DEPLOYMENT_CONFIG=$KB_DEPLOYMENT_CONFIG \ No newline at end of file diff --git a/bin/ee2_cronjobs b/bin/ee2_cronjobs new file mode 100644 index 000000000..036231a15 --- /dev/null +++ b/bin/ee2_cronjobs @@ -0,0 +1,6 @@ +SHELL=/bin/bash +BASH_ENV=/etc/environment +# Check the cron-purge.log for issues why the script isn't running, such as missing `EE2_ADMIN_SERVICE_TOKEN` + +# m h dom mon dow user command + * * * * * root . /etc/environment; /miniconda-latest/bin/python3 /kb/module/bin/PurgeBadJobs.py >> /root/cron-purge.log 2>&1 diff --git a/build/templates/deploy.docker.cfg.templ b/build/templates/deploy.docker.cfg.templ index c237b344a..a12b338cb 100644 --- a/build/templates/deploy.docker.cfg.templ +++ b/build/templates/deploy.docker.cfg.templ @@ -27,6 +27,7 @@ mongo-database = ee2 mongo-user = travis mongo-password = travis mongo-authmechanism = DEFAULT +mongo-retry-rewrites = False start-local-mongo = 0 @@ -47,6 +48,11 @@ transfer_input_files = ../scripts/JobRunner.tgz # Log Level and sending DEBUG=true to the jobs, which means containers do not get cleaned up debug = false +#---------------------------------------------------------------------------------------# +[concierge] +request_cpus = 4 +request_memory = 23000M +request_disk = 100GB #---------------------------------------------------------------------------------------# [njs] request_cpus = 4 @@ -76,7 +82,7 @@ request_disk = 100GB [hpc] request_cpus = 4 request_memory = 2000M -request_disk = 100GBraiss +request_disk = 100GB #---------------------------------------------------------------------------------------# [DEFAULT] default_client_group = njs diff --git a/dependencies.json b/dependencies.json index 4553e5758..257a87658 100644 --- a/dependencies.json +++ b/dependencies.json @@ -1,4 +1,8 @@ [ { + "module_name" : "execution_engine2", + "type" : "core", + "file_path" : "./execution_engine2.spec" +}, { "module_name" : "FakeObjectsForTests", "type" : "sdk", "version_tag" : "release" diff --git a/deploy.cfg b/deploy.cfg index 256d289e7..9618cb902 100644 --- a/deploy.cfg +++ b/deploy.cfg @@ -25,6 +25,7 @@ mongo-database = {{ default .Env.mongodb_database "ee2" }} mongo-user = {{ default .Env.mongodb_user "" }} mongo-password = {{ default .Env.mongodb_pwd "" }} mongo-authmechanism = {{ default .Env.mongodb_auth_mechanism "DEFAULT" }} +mongo-retry-rewrites = {{ default .Env.mongodb_retry_rewrites "False" }} start-local-mongo = {{ default .Env.start_local_mongo "0" }} mongo-collection = legacy @@ -80,7 +81,7 @@ request_memory = 204800M 
request_disk = 100GB [kb_upload] -request_cpus = {{ default .Env.kb_upload_default_cores "8" }} +request_cpus = {{ default .Env.kb_upload_default_cores "24" }} request_memory = 4500M request_disk = 50GB diff --git a/docs/adrs/002-Retry_endpoint.md b/docs/adrs/002-Retry_endpoint.md new file mode 100644 index 000000000..0e15235ce --- /dev/null +++ b/docs/adrs/002-Retry_endpoint.md @@ -0,0 +1,94 @@ +# Retry Endpoint + +Date: 2021-04-27 + + +## For discussion on this ADR, see the following PR https://github.com/kbase/execution_engine2/pull/367 + +The current requirement for the Batch/Bulk UI is to be able to retry jobs. Using a job id, it should be possible to get information from the database. + +The current implementation of retry is to run jobs using the `run_job` or `run_job_batch` endpoint. This is not adequate due to the following deficiencies: + + +* Lack of book-keeping for the parent job and the child job relationship: +* 1) Launching a new job will not update the relationship between those jobs. +* 2) e.g. the child job can specify a parent_job_id, but the parent_job will not know about the child_job +* 3) e.g. the parent will not know about new retried jobs +* 4) e.g. the child will not know how many times it was retried + +* Lack of book-keeping for number of retries for a particular job / set of job inputs +* Lack of ability to launch multiple jobs using the `run_job_batch` endpoint without creating a new parent job +* Lack of ability to ensure that the proper catalog version /git commit of an app is used from the front end based on a tag, such as "beta/dev/release" +* Lack of ability to specify which retries succeeded and which ones failed during submit time. +* Code is split more than is necessary + +### Note about submit time vs run time for job submission +The job first needs to be submitted to ee2_runjob. It can fail there. Afterwards, it gets submitted to condor, it can fail there too. Currently those two happen at the same time, but they are supposed to happen in a thread at some point, so the current single point of submit time will become two separate points of submission. Once the job begins running, it can fail at Run Time. + +## Author(s) + +@bio-boris + +## Status and Decision Outcome + +* We have decided to go with the retry endpoint in favor of retrofitting the run_job endpoint +* We are implementing a minimal retry endpoint, then iterating over a design ADR to create a more fully featured/robust Retry endpoint when time permits +* Minimal endpoint PR found at https://github.com/kbase/execution_engine2/pull/383 +* Design ADR will be added to ee2 repo + +### The new ADR will contain: +* Copy of comments to be addressed +* Link to spec file with inputs and outputs for the retry endpoint +* Link to Jira Ticket with business logic documentation for success and cancel cases + + +## Alternatives Considered + +* Not book-keeping, or doing minimal book-keeping and calling run_job multiple times +* Re-writing run_job/run_job_batch to address the aforementioned deficiencies +* Creating a retry endpoint dedicated to addressing book-keeping and job launching features + + +### Possible additional things to think about +* Creating sets of objects and what to do at the end of a batch run +* What to do about a set if a child task fails during processing +* Convenience endpoints that operate on the parent_job_id or list of child job ids may be out of scope (e.g. 
cancel all jobs with a certain status) + +## Consequences +* Requires 2nd ADR + +## Pros and Cons of the Alternatives + +### Not book-keeping, or doing minimal book-keeping and calling `run_job` multiple times +* `+` Can re-use existing endpoints without any additional work +* `+` Less api endpoints to manage +* `-` Issues on re-rendering/regenerating a cell based on just the job record +* `-` Loss of information about job runs, and ability to infer relationships between parents and child jobs. +* `-` Loss of control of jobs, such as the ability to restrict a job's running based on number of retries/failures. +* `-` Wrong version of app will run if the app was updated after job completion, and a version tag rather than a git commit was provided +* `-` Increase complexity of `run_job*` methods +* `-` The client will have to keep track of the child_job relationship, so that info is lost once the client is terminated + +### Re-writing `run_job` to address the aforementioned deficiencies without refactoring +* `+` Solves most requirements, but +* `-` Adds more complexity to `run_job` methods +* `-` Increase difficulty in maintaining and testing `run_job` method +* `-` Wrong version of app will run if the app was updated after job completion, and a version tag rather than a git commit was provided +* `-` Inefficient job submission +* `-` Possibly Insufficient error handling + +### Re-writing `run_job/run_job_batch` to address the aforementioned deficiencies with some refactoring +* `+` Same as above, but if you are refactoring, you might as well have a retry endpoint, and clean out/decouple `run_job` endpoint from having so many features and branching logic + +### Creating `retry` endpoint to address the aforementioned deficiencies with some refactoring +* `+` Decrease coupling between `run_job` and retry functionality, possibly making testing and development easier +* `+` Faster development than a full refactor +* `-` Faster development than a full refactor, but creates technical debt, might have to update both `run_job` and `retry` each time a change is made +* `-` Extra endpoint to manage + + +### Creating `retry` endpoint to address the aforementioned deficiencies with full refactoring where run_job functions are split out into their own functions +* `+` Decrease coupling between `run_job` and retry functionality, possibly making testing and development easier +* `+` Increase DRYNESS of the code +* `+` Allows retry to benefit from changes to `run_job` +* `-` Slower development for a full refactor, but decreases technical debt diff --git a/docs/adrs/003-Retry_endpoint_design.md b/docs/adrs/003-Retry_endpoint_design.md new file mode 100644 index 000000000..e5a992b24 --- /dev/null +++ b/docs/adrs/003-Retry_endpoint_design.md @@ -0,0 +1,168 @@ +# Retry Endpoint Design (Round 2!) + +Date: 2021-05-19 + + +## Motivation for the Endpoint: + +The current requirement for the Batch/Bulk UI is to be able to retry jobs that have either "errored" out, or were terminated. +The UI allows you to retry either single jobs, or multiple jobs, and saves you from having to cancel and resubmit each job individually, +which is not currently implemented in the UI anyway. + +### Motivation for the `code spike` for retry endpoint and follow up design ADR +>As I mentioned, as the product owner, I find our ability to deliver functionality to be pretty awful. 
+>We have invested so much effort in refactoring that its killed our timeline - we started in late July, and it is now almost May with no functioning >bulk uploader, which was just the first deliverable. +>If we are going to refactor, we need to be able to do it in a timely fashion, and have it not kill the schedule any more than it has. +>I want to see the estimate for a quick and dirty solution that implements a proposed retry endpoint, that can be deployed ASAP, and then once the API >contract has been established, and the functional MVP is done, we begin the cleanup of the backend code. +>Note that this is NOT business as usual, the usual way we do this is the nasty MVP gets deployed and then we don't go back until much later. +>Here, we get the API working so that it doesn't block dependencies, and we immediately start the refactoring. The refactor needs to be broken down into >smallish chunks of ~3 days estimated work, and each merge should maintain functionality and incrementally improve the codebase. +>Tasks that take more than a couple of days are more likely to be far off in their estimate and this is how we mitigate the risk of poor estimation. +> + + +### High Level Behavior of the `retry` endpoint +The current implementation of retry is to run jobs using the `retry_job` or `retry_jobs` endpoint. +The endpoint takes a job or list of job ids and then attempts to resubmit them to the queue, using the exact same set of parameters and version of the app. + +### Current Behavior + +* Spec file is located at https://github.com/kbase/execution_engine2/blob/8baab8e3ac5212f4bbe59fd935980aa41b4ee06d/execution_engine2.spec#L201-L247 + +* A job id is provided. If there are sufficient permissions, the call will proceed, if not, it will error out, unless the `as_admin` flag is provided by an admin +* The retry will only continue if the status of the job to be retried is in [Status.terminated.value, Status.error.value] +* If the job id points to a job that has already been retried, it will attempt to retry that job's `retry_parent` instead. +* If the job id has never been retried, it becomes the `retry_parent` +* EE2 looks up the method versions and parameters, and then submits the job to be retried, incrementing the `retry_count` of the job being retried, and the newly launched job gains a field called `retry_parent` that contains the job id of the job from the original request. +* The job is submitted and upon successful submission, the child job adds the field `retry_parent` and notifies the `parent_job_id` that a new `child_job` has been added by appending itself to the `parent_job.child_jobs[]` field +* There is no way to specify ResourceRequirements with a retry at the moment, even if the job was previously submitted by an administrator and had specfified ResourceRequirements. The retry will only use resource requirements from the catalog / ee2 config. + + +### Batch Behavior + +* If a job has the attribute of `batch_job=True` the retry will fail, since there is no method to re-run. This is a bug, as it doesn't fail gracefully. Gracefully handling jobs with children means that it won't throw an error about not having a method to re-run, and instead will throw an error that says "Cannot retry batch job parents. Must retry individual jobs" +* If a job has the attribute of `batch_job=True`, but is actually a child job, the parent will be notified of this new retried job +* Multiple in-flight retries are allowed. 
+* Adds `child_job_id` to `parent_job_id.child_job_ids[]` + +## Retry_job behavior +* Blocking and single submit to HTCondor. It should be fine as it returns relatively quickly + +## Retry_jobs behavior +* Submitting multiple jobs for retry serially calls the same code path +used for running a single job and blocks until all jobs have been +submitted to the condor queue. This can cause issues if the +network drops, and makes the narrative not aware of the state of +the retry. Submitting 100 jobs currently takes 9 seconds, and that +is a lot of time for things to go wrong. +* (Follow up: Hopefully the making the narrative aware of the state of the retry will be mitigated by the narrative backend. It just blocks on the call anyway, with the default service timeout, which I think is something wacky like half an hour. As long as the user doesn't kill the kernel at that time, all should be well. Of course, if it were me, and it looked frozen for more than a couple minutes, I'd probably restart. ) +* Multiple in-flight retries are allowed. + +### Desired Behavior + +#### General +* Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) +* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_batch` (requires refactor of run_job_batch) +* One single submission to HTCondor instead of multiple job submissions +* Ability to gracefully handle batch container jobs with children to throw proper error ([See Batch Behavior](#Batch-Behavior)) +* Ability to handle database consistency during retry failure +* See if we can make some preflight (before the job starts) checks fail before job submission and handle them differently than those that appear during job submission + +#### Data inconsistency +* A new `retry_ids` field will show a list of jobs that have been retried using this parent id. Retry_count will be returned as a calculated field based off of retry_ids +* `retry_toggle` field will allow a seperate process to check and possibly correct for jobs that didn't finish the entire retry lifecycle: +1) Launch child jobs +2) Notify the batch parent of the child, +3) Notify the retry parent of the child, +4) Update the retry_toggle field + +#### Won't do +* Add retry_number field + +## New priority + +For MVP +* Create a retry_jobs field, and expose list in api, and a T/F completeness toggle +* Add failure conditions in run method to fail before creating db records + +Not for mvp +* Non blocking job submission / (Possibly htcondor submit) +* Add thread/reaper to perform actions based on toggle + + + +### Questions + +#### Answered: +#### Q: should the number of retries of a job be limited, and if so, where? e.g. a max_retries field in the parent job? wait and see whether people attempt to rerun jobs that have already failed nine zillion times? +A: Make a ticket for this and add to backlog + +#### Q: How do we prevent jobs with identical parameters from being rerun more than once within a retry_jobs request? +A: We have decided to allow multiple jobs with the same params to be re-run in the same `retry_jobs` request. + +#### Q: How do we find the most recent retry of a job? +A: The client using the ee2 API would have to figure it out using the `retry_parent` and job creation date fields. (Unless we added other fields to help with this) + +#### Q: How do we ensure that the app version is correctly run each time when submitting from the narrative? 
+A: We would need to change the narrative to submit the git commit hash instead of a version tag + +#### Q: How do we handle DB consistency during retry failure? +Looks like the options are +* implement db integrity checks and two-phase commits for making the relationships between a job, its `retry_parent`, and the batch container +* accept that the db info may be incomplete and write workarounds into the clients +* (upgrade to Mongo 4.4 for better transaction support) +A: We have decided to use a `retry_toggle` in order to mark that the entire transaction has occurred for a retry job, and to set up a monitor to fix the jobs that didn't finish the retry lifecycle. + +##### Q: Do we want to support ResourceRequirements +A: Probably not in the short term + + +#### Q: how to prevent incorrect parent-child relationships being created -- should the client be allowed to specify a parent ID? Is it currently possible to add a new child to a parent job if the child is a new job, rather than an existing job ID / set of params that is being rerun? +A: Not necessarily relevant to this endpoint, more of a `run_job_batch` endpoint question. Currently the `retry_parent` and `parent_job_id` are looked up from the ee2 record on retry, and not specified in this endpoint. + +#### Shorter Q and A + + Should we track a retry count? (Done) + Should users see this retry count? A: Visible in the EE2 API, UI is TBD + Are retried jobs saved in some sort of data structure linking them, possibly indirectly, to the parent job or are they orphaned? (Yes, retry_parent) + If the former, is the retry relationship linear or a tree? E.g. what happens if there are two simultaneous calls to retry a job? (Tree, simultaneous jobs run) + Should it be at least theoretically possible to see the list of retried jobs in order? (It is possible by sorting on creation date) + Should there be a maximum retry count? Or a warning that more retries are not likely to help? (Unknown TBD) + Can a job in states other than failed or canceled be retried? Or should the user be required to cancel a job before it can be retried? (Job must be in Error/Cancel state) + + +# Work estimation for MVP +Priority descending + +### Address data inconsistency via retry_count, retry_ids and retry_toggle +> Estimate 3-4 days +> https://kbase-jira.atlassian.net/browse/DATAUP-461 + +### Preflight checks +> Estimate 3-4 days +> https://kbase-jira.atlassian.net/browse/DATAUP-528 +> Requires retry to be able to force the same app `git_commit versions` and `JobRequirements` from the db records +https://kbase-jira.atlassian.net/browse/DATAUP-461 + +### Create a created jobs and queued jobs reaper that cancels created jobs older than 1 hour, and cancels queued jobs over 14 days old. +> Estimate 2-3 days +https://kbase-jira.atlassian.net/browse/DATAUP-536 + +# Work estimation for POST MVP + +### Hookup retries to refactored code +* Non blocking job submission for submitting multiple jobs, possibly via using `run_job_multiple` +* Requires refactor of retry to gracefully handle jobs with children by notifying the batch containers for retry of ids not in the same batch. If you retry jobs from batch 1 and from batch 2, you want the correct batch parent to be notified. 
+* Switching from starting the retried jobs one at a time to starting them in batch mode will require refactoring how the batch and retry parents are updated +> Estimate 3 days +> https://kbase-jira.atlassian.net/browse/DATAUP-535 + +Not for MVP +### One single submission to HTCondor instead of multiple job submission () +> Estimate 1-2 days +> https://kbase-jira.atlassian.net/browse/DATAUP-391 +Not for MVP +### Prevent multiple in-flight retries to prevent the user from wasting their own resources (and the queues resources) +> Estimate 3-4 days +https://kbase-jira.atlassian.net/browse/DATAUP-439 + + diff --git a/execution_engine2.html b/execution_engine2.html new file mode 100644 index 000000000..14eda82c6 --- /dev/null +++ b/execution_engine2.html @@ -0,0 +1 @@ +execution_engine2
module execution_engine2 {

/*
*@range [0,1]
*/
typedef int boolean;

/*
*A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
*character Z (representing the UTC timezone) or the difference
*in time to UTC in the format +/-HHMM, eg:
*2012-12-17T23:24:06-0500 (EST time)
*2013-04-03T08:56:32+0000 (UTC time)
*2013-04-03T08:56:32Z (UTC time)
*/
typedef string timestamp;

/*
*A job id.
*/
typedef string job_id;

/*
*A structure representing the Execution Engine status
*git_commit - the Git hash of the version of the module.
*version - the semantic version for the module.
*service - the name of the service.
*server_time - the current server timestamp since epoch
*
*# TODO - add some or all of the following
*reboot_mode - if 1, then in the process of rebooting
*stopping_mode - if 1, then in the process of stopping
*running_tasks_total - number of total running jobs
*running_tasks_per_user - mapping from user id to number of running jobs for that user
*tasks_in_queue - number of jobs in the queue that are not running
*/
typedef structure {
string git_commit;
string version;
string service;
float server_time;
} Status;

/*
*Returns the service configuration, including URL endpoints and timeouts.
*The returned values are:
*external-url - string - url of this service
*kbase-endpoint - string - url of the services endpoint for the KBase environment
*workspace-url - string - Workspace service url
*catalog-url - string - catalog service url
*shock-url - string - shock service url
*handle-url - string - handle service url
*auth-service-url - string - legacy auth service url
*auth-service-url-v2 - string - current auth service url
*auth-service-url-allow-insecure - boolean string (true or false) - whether to allow insecure requests
*scratch - string - local path to scratch directory
*executable - string - name of Job Runner executable
*docker_timeout - int - time in seconds before a job will be timed out and terminated
*initial_dir - string - initial dir for HTCondor to search for passed input/output files
*transfer_input_files - initial list of files to transfer to HTCondor for job running
*/
funcdef list_config() returns (mapping<string, string>) authentication optional;

/*
*Returns the current running version of the execution_engine2 service as a semantic version string.
*/
funcdef ver() returns (string) authentication none;

/*
*Simply check the status of this service to see queue details
*/
funcdef status() returns (Status) authentication none;

/*
*A workspace object reference of the form X/Y/Z, where
*X is the workspace id,
*Y is the object id,
*Z is the version.
*/
typedef string wsref;

/*
*Narrative metadata for a job. All fields are optional.
*run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID.
*token_id - the ID of the token used to run the method.
*tag - the release tag, e.g. dev/beta/release.
*cell_id - the ID of the narrative cell from which the job was run.
*/
typedef structure {
string run_id;
string token_id;
string tag;
string cell_id;
} Meta;

/*
*Job requirements for a job. All fields are optional. To submit job requirements,
*the user must have full EE2 admin permissions. Ignored for the run concierge endpoint.
*
*request_cpus: the number of CPUs to request for the job.
*request_memory: the amount of memory, in MB, to request for the job.
*request_disk: the amount of disk space, in GB, to request for the job.
*client_group: the name of the client group on which to run the job.
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*bill_to_user: the job will be counted against the provided user's fair share quota.
*ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false.
*scheduler_requirements: arbitrary key-value pairs to be provided to the job
*scheduler. Requires knowledge of the scheduler interface.
*debug_mode: Whether to run the job in debug mode. Default false.
*/
typedef structure {
int request_cpus;
int requst_memory;
int request_disk;
string client_group;
boolean client_group_regex;
string bill_to_user;
boolean ignore_concurrency_limits;
mapping<string, string> scheduler_requirements;
boolean debug_mode;
} JobRequirements;

/*
*method - the SDK method to run in module.method format, e.g.
*'KBaseTrees.construct_species_tree'
*params - the parameters to pass to the method.
*
*Optional parameters:
*app_id - the id of the Narrative application (UI) running this job (e.g.
*repo/name)
*service_ver - specific version of deployed service, last version is
*used if this parameter is not defined
*source_ws_objects - denotes the workspace objects that will serve as a
*source of data when running the SDK method. These references will
*be added to the autogenerated provenance. Must be in UPA format (e.g.
*6/90/4).
*meta - Narrative metadata to associate with the job.
*wsid - an optional workspace id to associate with the job. This is passed to the
*workspace service, which will share the job based on the permissions of
*the workspace rather than owner of the job
*parent_job_id - EE2 job id for the parent of the current job.
*For run_job and run_job_concierge, this value can be specified to denote
*the parent job of the job being created.
*Warning: No checking is done on the validity of the job ID, and the parent job
*record is not altered.
*Submitting a job with a parent ID to run_job_batch will cause an error to be
*returned.
*job_requirements: the requirements for the job. The user must have full EE2
*administration rights to use this parameter. Note that the job_requirements
*are not returned along with the rest of the job parameters when querying the EE2
*API - they are only considered when submitting a job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*Note that this field is not included in returned data when querying EE2.
*/
typedef structure {
string method;
string app_id;
list<UnspecifiedObject> params;
string service_ver;
list<wsref> source_ws_objects;
Meta meta;
int wsid;
string parent_job_id;
JobRequirements job_requirements;
boolean as_admin;
} RunJobParams;

/*
*Start a new job.
*/
funcdef run_job(RunJobParams params) returns (job_id job_id) authentication required;
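As a usage illustration (not part of the spec), here is a minimal sketch of submitting a job through the generated Python client shipped in `lib/installed_clients`, as used by the reaper scripts in this diff. The endpoint URL, token, module/method names, and parameter values are placeholders, and it is assumed the generated client accepts the spec's params structure as a single dict.

```python
from lib.installed_clients.execution_engine2Client import execution_engine2

# Placeholders: real deployments use their environment's EE2 URL and an auth token.
ee2 = execution_engine2(url="https://<kbase-env>/services/ee2", token="<auth token>")

# Minimal RunJobParams: an SDK method plus its parameters.
job_id = ee2.run_job(params={
    "method": "MyModule.my_method",          # hypothetical module.method to run
    "app_id": "MyModule/my_app",             # optional Narrative app id
    "params": [{"workspace_name": "my_workspace"}],
    "service_ver": "release",
    "wsid": 12345,                           # optional: share the job via this workspace
})
print("Submitted job", job_id)
```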

/*
*Additional parameters for a batch job.
*wsid: the workspace with which to associate the parent job.
*as_admin: run the job with full EE2 permissions, meaning that any supplied workspace
*IDs are not checked for accessibility and job_requirements may be supplied. The
*user must have full EE2 administration rights.
*/
typedef structure {
int wsid;
boolean as_admin;
} BatchParams;

typedef structure {
job_id batch_id;
list<job_id> child_job_ids;
} BatchSubmission;

typedef structure {
job_id batch_id;
list<job_id> child_job_ids;
boolean as_admin;
} AbandonChildren;

/*
*Run a batch job, consisting of a parent job and one or more child jobs.
*Note that the as_admin parameters in the list of child jobs are ignored -
*only the as_admin parameter in the batch_params is considered.
*/
funcdef run_job_batch(list<RunJobParams> params, BatchParams batch_params) returns (BatchSubmission job_ids) authentication required;
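A hedged sketch of a batch submission follows, assuming the generated client mirrors the spec's two argument names; the child jobs and workspace id are placeholders.

```python
from lib.installed_clients.execution_engine2Client import execution_engine2

ee2 = execution_engine2(url="https://<kbase-env>/services/ee2", token="<auth token>")

# Two child jobs under one batch (parent) job; as_admin on the children is ignored.
batch = ee2.run_job_batch(
    params=[
        {"method": "MyModule.my_method", "params": [{"val": 1}]},  # hypothetical method
        {"method": "MyModule.my_method", "params": [{"val": 2}]},
    ],
    batch_params={"wsid": 12345},  # associate the batch parent with this workspace
)
print(batch["batch_id"], batch["child_job_ids"])
```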

/*
*job_id of retried job
*retry_id: job_id of the job that was launched
*str error: reason as to why that particular retry failed (available for bulk retry only)
*/
typedef structure {
job_id job_id;
job_id retry_id;
string error;
} RetryResult;

/*
*job_id of job to retry
*as_admin: retry someone else's job in your namespace
*#TODO Possibly Add JobRequirements job_requirements;
*/
typedef structure {
job_id job_id;
boolean as_admin;
} RetryParams;

/*
*job_ids of job to retry
*as_admin: retry someone else's job in your namespace
*#TODO: Possibly Add list<JobRequirements> job_requirements;
*/
typedef structure {
list<job_id> job_ids;
boolean as_admin;
} BulkRetryParams;

/*
*#TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present
*#TODO Add retry child that checks the status of the child? to prevent multiple retries
*Allowed Jobs
** Regular Job with no children
** Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call
*Not Allowed
** Regular Job with children (Should not be possible to create yet)
** Batch Job Parent Container (Not a job, it won't do anything, except cancel its child jobs)
*/
funcdef retry_job(RetryParams params) returns (RetryResult retry_result) authentication required;

/*
*Same as retry_job, but accepts multiple jobs
*/
funcdef retry_jobs(BulkRetryParams params) returns (list<RetryResult> retry_result) authentication required;
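For illustration, a sketch of retrying a single terminated or errored job and then several at once; the job ids are placeholders and the client setup mirrors the earlier sketches.

```python
from lib.installed_clients.execution_engine2Client import execution_engine2

ee2 = execution_engine2(url="https://<kbase-env>/services/ee2", token="<auth token>")

# Retry one job; the result echoes the original job_id and the new retry_id.
retry = ee2.retry_job(params={"job_id": "<errored or terminated job id>"})
print(retry["job_id"], "->", retry["retry_id"])

# Retry several jobs; per-job errors are reported in the result list.
for result in ee2.retry_jobs(params={"job_ids": ["<job id 1>", "<job id 2>"]}):
    print(result["job_id"], result.get("retry_id"), result.get("error"))
```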

funcdef abandon_children(AbandonChildren params) returns (BatchSubmission parent_and_child_ids) authentication required;

/*
*EE2Constants Concierge Params are
*request_cpus: int
*request_memory: int in MB
*request_disk: int in GB
*job_priority: int = None range from -20 to +20, with higher values meaning better priority.
*Note: job_priority is currently not implemented.
*account_group: str = None # Someone else's account
*ignore_concurrency_limits: ignore any limits on simultaneous job runs.
*Default 1 (True).
*requirements_list: list = None ['machine=worker102','color=red']
*client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup
*client_group_regex: Whether to treat the client group string, whether provided here,
*from the catalog, or as a default, as a regular expression when matching
*clientgroups. Default True for HTC, but the default depends on the scheduler.
*Omit to use the default.
*debug_mode: Whether to run the job in debug mode. Default 0 (False).
*/
typedef structure {
int request_cpu;
int request_memory;
int request_disk;
int job_priority;
string account_group;
boolean ignore_concurrency_limits;
list<string> requirements_list;
string client_group;
boolean client_group_regex;
boolean debug_mode;
} ConciergeParams;

funcdef run_job_concierge(RunJobParams params, ConciergeParams concierge_params) returns (job_id job_id) authentication required;

/*
*Get job params necessary for job execution
*@optional as_admin
*/
typedef structure {
job_id job_id;
boolean as_admin;
} GetJobParams;

funcdef get_job_params(GetJobParams params) returns (RunJobParams params) authentication required;

/*
*job_id - a job id
*status - the new status to set for the job.
*/
typedef structure {
job_id job_id;
string status;
boolean as_admin;
} UpdateJobStatusParams;

funcdef update_job_status(UpdateJobStatusParams params) returns (job_id job_id) authentication required;

/*
*line - string - a string to set for the log line.
*is_error - int - if 1, then this line should be treated as an error, default 0
*ts - int - a timestamp since epoch in milliseconds for the log line (optional)
*
*@optional ts
*/
typedef structure {
string line;
boolean is_error;
int ts;
} LogLine;

/*
*@success Whether or not the add operation was successful
*@line_number the line number of the last added log
*/
typedef structure {
boolean success;
int line_number;
} AddJobLogsResults;

typedef structure {
job_id job_id;
boolean as_admin;
} AddJobLogsParams;

funcdef add_job_logs(AddJobLogsParams params, list<LogLine> lines) returns (AddJobLogsResults results) authentication required;
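A sketch of appending log lines, assuming the generated client mirrors the spec's two-argument signature; the job id and timestamps are placeholders.

```python
from lib.installed_clients.execution_engine2Client import execution_engine2

ee2 = execution_engine2(url="https://<kbase-env>/services/ee2", token="<auth token>")

lines = [
    {"line": "starting step 1", "is_error": 0},
    {"line": "step 1 failed", "is_error": 1, "ts": 1620000000000},  # optional ms since epoch
]
result = ee2.add_job_logs(params={"job_id": "<job id>"}, lines=lines)
print(result["success"], result["line_number"])
```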

/*
*last_line_number - common number of lines (including those in skip_lines
*parameter), this number can be used as next skip_lines value to
*skip already loaded lines next time.
*/
typedef structure {
list<LogLine> lines;
int last_line_number;
int count;
} GetJobLogsResults;

/*
*job id - the job id
*optional skip_lines Legacy Parameter for Offset
*optional offset Number of lines to skip (in case they were already loaded before).
*optional limit optional parameter, maximum number of lines returned
*optional as_admin request read access to record normally not allowed..
*/
typedef structure {
job_id job_id;
int skip_lines;
int offset;
int limit;
boolean as_admin;
} GetJobLogsParams;

funcdef get_job_logs(GetJobLogsParams params) returns (GetJobLogsResults) authentication required;
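A sketch of paging through a job's logs with `offset`/`limit`; the page size and job id are placeholders, and the client setup mirrors the earlier sketches.

```python
from lib.installed_clients.execution_engine2Client import execution_engine2

ee2 = execution_engine2(url="https://<kbase-env>/services/ee2", token="<auth token>")

offset = 0
while True:
    chunk = ee2.get_job_logs(params={"job_id": "<job id>", "offset": offset, "limit": 100})
    for entry in chunk["lines"]:
        print(entry["line"])
    if len(chunk["lines"]) < 100:
        break
    # last_line_number can be fed back as the next offset/skip_lines value.
    offset = chunk["last_line_number"]
```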

/*
*Error block of JSON RPC response
*/
typedef structure {
string name;
int code;
string message;
string error;
} JsonRpcError;

/*
*job_id - string - the id of the job to mark completed or finished with an error
*error_message - string - optional unless job is finished with an error
*error_code - int - optional unless job finished with an error
*error - JsonRpcError - optional output from SDK Job Containers
*job_output - job output if job completed successfully
*/
typedef structure {
job_id job_id;
string error_message;
int error_code;
UnspecifiedObject job_output;
boolean as_admin;
} FinishJobParams;

/*
*Register results of already started job
*/
funcdef finish_job(FinishJobParams params) returns () authentication required;

/*
*skip_estimation: default true. If set true, job will set to running status skipping estimation step
*/
typedef structure {
job_id job_id;
boolean skip_estimation;
boolean as_admin;
} StartJobParams;

funcdef start_job(StartJobParams params) returns () authentication required;

/*
*exclude_fields: exclude certain fields to return. default None.
*exclude_fields strings can be one of fields defined in execution_engine2.db.models.models.Job
*/
typedef structure {
job_id job_id;
list<string> exclude_fields;
boolean as_admin;
} CheckJobParams;

/*
*job_id - string - id of the job
*user - string - user who started the job
*wsid - int - optional id of the workspace where the job is bound
*authstrat - string - what strategy used to authenticate the job
*job_input - object - inputs to the job (from the run_job call) ## TODO - verify
*job_output - object - outputs from the job (from the run_job call) ## TODO - verify
*updated - int - timestamp since epoch in milliseconds of the last time the status was updated
*running - int - timestamp since epoch in milliseconds of when it entered the running state
*created - int - timestamp since epoch in milliseconds when the job was created
*finished - int - timestamp since epoch in milliseconds when the job was finished
*status - string - status of the job. one of the following:
*created - job has been created in the service
*estimating - an estimation job is running to estimate resources required for the main
*job, and which queue should be used
*queued - job is queued to be run
*running - job is running on a worker node
*completed - job was completed successfully
*error - job is no longer running, but failed with an error
*terminated - job is no longer running, terminated either due to user cancellation,
*admin cancellation, or some automated task
*error_code - int - internal reason why the job is an error. one of the following:
*0 - unknown
*1 - job crashed
*2 - job terminated by automation
*3 - job ran over time limit
*4 - job was missing its automated output document
*5 - job authentication token expired
*errormsg - string - message (e.g. stacktrace) accompanying an errored job
*error - object - the JSON-RPC error package that accompanies the error code and message
*
*#TODO, add these to the structure?
*condor_job_ads - dict - condor related job information
*
*retry_count - int - generated field based on length of retry_ids
*retry_ids - list - list of jobs that are retried based off of this job
*retry_parent - str - job_id of the parent this retry is based off of. Not available on a retry_parent itself
*
*batch_id - str - the parent of the job, if the job is a child job created via run_job_batch
*batch_job - bool - whether or not this is a batch parent container
*child_jobs - array - Only parent container should have child job ids
*
*scheduler_type - str - scheduler, such as awe or condor
*scheduler_id - str - scheduler generated id
*scheduler_estimator_id - str - id for the job spawned for estimation
*
*
*terminated_code - int - internal reason why a job was terminated, one of:
*0 - user cancellation
*1 - admin cancellation
*2 - terminated by some automatic process
*
*@optional error
*@optional error_code
*@optional errormsg
*@optional terminated_code
*@optional estimating
*@optional running
*@optional finished
*/
typedef structure {
job_id job_id;
string user;
string authstrat;
int wsid;
string status;
RunJobParams job_input;
int created;
int queued;
int estimating;
int running;
int finished;
int updated;
int error_code;
string errormsg;
int terminated_code;
string batch_id;
} JobState;

/*
*get current status of a job
*/
funcdef check_job(CheckJobParams params) returns (JobState job_state) authentication required;
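A sketch of polling a job until it reaches a terminal state, excluding the potentially large `job_input` field; the polling interval and job id are placeholders.

```python
import time

from lib.installed_clients.execution_engine2Client import execution_engine2

ee2 = execution_engine2(url="https://<kbase-env>/services/ee2", token="<auth token>")

TERMINAL = {"completed", "error", "terminated"}
while True:
    state = ee2.check_job(params={"job_id": "<job id>", "exclude_fields": ["job_input"]})
    print(state["status"])
    if state["status"] in TERMINAL:
        break
    time.sleep(30)  # arbitrary polling interval
```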

/*
*batch_jobstate - state of parent job of the batch
*child_jobstates - states of child jobs
*IDEA: ADD aggregate_states - count of all available child job states, even if they are zero
*/
typedef structure {
JobState batch_jobstate;
list<JobState> child_jobstates;
} CheckJobBatchResults;

/*
*get current status of a parent job, and its children, if it has any.
*/
funcdef check_job_batch(CheckJobParams params) returns (CheckJobBatchResults) authentication required;

/*
*job_states - states of jobs
*could be mapping<job_id, JobState> or list<JobState>
*/
typedef structure {
list<JobState> job_states;
} CheckJobsResults;

/*
*As in check_job, exclude_fields strings can be used to exclude fields.
*see CheckJobParams for allowed strings.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 1.
*/
typedef structure {
list<job_id> job_ids;
list<string> exclude_fields;
boolean return_list;
} CheckJobsParams;

funcdef check_jobs(CheckJobsParams params) returns (CheckJobsResults) authentication required;

/*
*Check status of all jobs in a given workspace. Only checks jobs that have been associated
*with a workspace at their creation.
*
*return_list - optional, return list of job state if set to 1. Otherwise return a dict. Default 0.
*/
typedef structure {
string workspace_id;
list<string> exclude_fields;
boolean return_list;
boolean as_admin;
} CheckWorkspaceJobsParams;

funcdef check_workspace_jobs(CheckWorkspaceJobsParams params) returns (CheckJobsResults) authentication required;

/*
*cancel_and_sigterm
*"""
*Reasons for why the job was cancelled
*Current Default is `terminated_by_user 0` so as to not update narrative client
*terminated_by_user = 0
*terminated_by_admin = 1
*terminated_by_automation = 2
*"""
*job_id job_id
*@optional terminated_code
*/
typedef structure {
job_id job_id;
int terminated_code;
boolean as_admin;
} CancelJobParams;

/*
*Cancels a job. This results in the status becoming "terminated" with termination_code 0.
*/
funcdef cancel_job(CancelJobParams params) returns () authentication required;
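A sketch of an admin cancellation, mirroring the call made in `bin/PurgeBadJobs.py` earlier in this diff; the job id and token are placeholders.

```python
from lib.installed_clients.execution_engine2Client import execution_engine2

ee2 = execution_engine2(url="https://<kbase-env>/services/ee2", token="<admin service token>")

ee2.cancel_job(params={
    "job_id": "<job id>",
    "as_admin": True,
    "terminated_code": 2,  # terminated by some automatic process
})
```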

/*
*job_id - id of job running method
*finished - indicates whether job is done (including error/cancel cases) or not
*canceled - whether the job is canceled or not.
*ujs_url - url of UserAndJobState service used by job service
*/
typedef structure {
job_id job_id;
boolean finished;
boolean canceled;
string ujs_url;
boolean as_admin;
} CheckJobCanceledResult;

/*
*Check whether a job has been canceled. This method is lightweight compared to check_job.
*/
funcdef check_job_canceled(CancelJobParams params) returns (CheckJobCanceledResult result) authentication required;

typedef structure {
string status;
} GetJobStatusResult;

typedef structure {
job_id job_id;
boolean as_admin;
} GetJobStatusParams;

/*
*Just returns the status string for a job of a given id.
*/
funcdef get_job_status(GetJobStatusParams params) returns (GetJobStatusResult result) authentication required;

/*
*Projection Fields
*user = StringField(required=True)
*authstrat = StringField(
*required=True, default="kbaseworkspace", validation=valid_authstrat
*)
*wsid = IntField(required=False)
*status = StringField(required=True, validation=valid_status)
*updated = DateTimeField(default=datetime.datetime.utcnow, autonow=True)
*estimating = DateTimeField(default=None) # Time when job began estimating
*running = DateTimeField(default=None) # Time when job started
*# Time when job finished, errored out, or was terminated by the user/admin
*finished = DateTimeField(default=None)
*errormsg = StringField()
*msg = StringField()
*error = DynamicField()
*
*terminated_code = IntField(validation=valid_termination_code)
*error_code = IntField(validation=valid_errorcode)
*scheduler_type = StringField()
*scheduler_id = StringField()
*scheduler_estimator_id = StringField()
*job_input = EmbeddedDocumentField(JobInput, required=True)
*job_output = DynamicField()
*/

/*
*Results of check_jobs_date_range methods.
*
*jobs - the jobs matching the query, up to `limit` jobs.
*count - the number of jobs returned.
*query_count - the number of jobs that matched the filters.
*filter - DEPRECATED - this field may change in the future. The filters that were
*applied to the jobs.
*skip - the number of jobs that were skipped prior to beginning to return jobs.
*projection - the list of fields included in the returned job. By default all fields.
*limit - the maximum number of jobs returned.
*sort_order - the order in which the results were sorted by the job ID - + for
*ascending, - for descending.
*
*TODO: DOCUMENT THE RETURN OF STATS mapping
*/
typedef structure {
list<JobState> jobs;
int count;
int query_count;
mapping<string, string> filter;
int skip;
list<string> projection;
int limit;
string sort_order;
} CheckJobsDateRangeResults;

/*
*Check job for all jobs in a given date/time range for all users (Admin function)
*Notes on start_time and end_time:
*These fields are designated as floats but floats, ints, and strings are all
*accepted. Times are determined as follows:
*- if the field is a float or a string that contains a float and only a float,
*the field value is treated as seconds since the epoch.
*- if the field is an int or a string that contains an int and only an int,
*the field value is treated as milliseconds since the epoch.
*- if the field is a string not matching the criteria above, it is treated as
*a date and time. Nearly any unambiguous format can be parsed.
*
*float start_time - Filter based on job creation timestamp since epoch
*float end_time - Filter based on job creation timestamp since epoch
*list<string> projection - A list of fields to include in the projection, default ALL
*See "Projection Fields" above
*list<string> filter - DEPRECATED: this field may change or be removed in the future.
*A list of simple filters to "AND" together, such as error_code=1, wsid=1234,
*terminated_code = 1
*int limit - The maximum number of records to return
*string user - The user whose job records will be returned. Optional. Default is the
*current user.
*int offset - the number of jobs to skip before returning records.
*boolean ascending - true to sort by job ID ascending, false descending.
*boolean as_admin - true to run the query as an admin; user must have admin EE2
*permissions. Required if setting `user` to something other than your own.
*TODO: this seems to have no effect
*@optionalprojection
*@optionalfilter
*@optionallimit
*@optionaluser
*@optionaloffset
*@optionalascending
*/
typedefstructure{
floatstart_time;
floatend_time;
list<string>projection;
list<string>filter;
intlimit;
stringuser;
intoffset;
booleanascending;
booleanas_admin;
}
CheckJobsDateRangeParams;

funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params) returns (CheckJobsDateRangeResults) authentication required;

funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params) returns (CheckJobsDateRangeResults) authentication required;
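
The three accepted time formats described above can be sketched as follows. The values are illustrative only, and the final call assumes the caller actually holds an EE2 admin role.

    # All three parameter sets select jobs created in the same window (1-2 March 2021 UTC):
    by_seconds = {"start_time": 1614556800.0, "end_time": 1614643200.0}    # float -> seconds since epoch
    by_millis  = {"start_time": 1614556800000, "end_time": 1614643200000}  # int -> milliseconds since epoch
    by_string  = {"start_time": "2021-03-01", "end_time": "2021-03-02"}    # other strings -> parsed as date/time

    # Admins can also query on behalf of another user:
    res = ee2.check_jobs_date_range_for_user(dict(by_string, user="someuser", as_admin=1))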

typedef structure {
    UnspecifiedObject held_job;
} HeldJob;

/*
 * Handle a held CONDOR job. You probably never want to run this, only the reaper should run it.
 */
funcdef handle_held_job(string cluster_id) returns (HeldJob) authentication required;

/*
 * Check if current user has ee2 admin rights.
 */
funcdef is_admin() returns (boolean) authentication required;

/*
 * str permission - One of 'r|w|x' (('read' | 'write' | 'none'))
 */
typedef structure {
    string permission;
} AdminRolesResults;

/*
 * Check if current user has ee2 admin rights.
 * If so, return the type of rights and their roles
 */
funcdef get_admin_permission() returns (AdminRolesResults) authentication required;
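
A sketch of gating admin-only calls on the permission level returned above, using the same hypothetical client. Exactly which operations each permission level unlocks is not specified in this section.

    perm = ee2.get_admin_permission()["permission"]   # 'r', 'w', or 'x' per the comment above
    if perm in ("r", "w"):
        # admin-only query across all users (see check_jobs_date_range_for_all above)
        res = ee2.check_jobs_date_range_for_all({"start_time": "2021-03-01", "end_time": "2021-03-02"})
    else:
        print("current user has no EE2 admin rights")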

/*
 * Get a list of clientgroups manually extracted from the config file
 */
funcdef get_client_groups() returns (list<string> client_groups) authentication none;
};

Function Index

abandon_children
add_job_logs
cancel_job
check_job
check_job_batch
check_job_canceled
check_jobs
check_jobs_date_range_for_all
check_jobs_date_range_for_user
check_workspace_jobs
finish_job
get_admin_permission
get_client_groups
get_job_logs
get_job_params
get_job_status
handle_held_job
is_admin
list_config
retry_job
retry_jobs
run_job
run_job_batch
run_job_concierge
start_job
status
update_job_status
ver

Type Index

AbandonChildren
AddJobLogsParams
AddJobLogsResults
AdminRolesResults
BatchParams
BatchSubmission
boolean
BulkRetryParams
CancelJobParams
CheckJobBatchResults
CheckJobCanceledResult
CheckJobParams
CheckJobsDateRangeParams
CheckJobsDateRangeResults
CheckJobsParams
CheckJobsResults
CheckWorkspaceJobsParams
ConciergeParams
FinishJobParams
GetJobLogsParams
GetJobLogsResults
GetJobParams
GetJobStatusParams
GetJobStatusResult
HeldJob
job_id
JobRequirements
JobState
JsonRpcError
LogLine
Meta
RetryParams
RetryResult
RunJobParams
StartJobParams
Status
timestamp
UpdateJobStatusParams
wsref
\ No newline at end of file diff --git a/execution_engine2.spec b/execution_engine2.spec index 9cac4d958..eb042674e 100644 --- a/execution_engine2.spec +++ b/execution_engine2.spec @@ -66,122 +66,223 @@ /*================================================================================*/ /* Running long running methods through Docker images of services from Registry */ /*================================================================================*/ - /* A workspace object reference of the form X/Y or X/Y/Z, where - X is the workspace name or id, - Y is the object name or id, - Z is the version, which is optional. + /* A workspace object reference of the form X/Y/Z, where + X is the workspace id, + Y is the object id, + Z is the version. */ typedef string wsref; - /* - time - the time the call was started; - method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); - job_id - job id if method is asynchronous (optional field). + /* Narrative metadata for a job. All fields are optional. + run_id - the Narrative-assigned ID of the job run. 1:1 with a job ID. + token_id - the ID of the token used to run the method. + tag - the release tag, e.g. dev/beta/release. + cell_id - the ID of the narrative cell from which the job was run. */ typedef structure { - timestamp time; - string method; - job_id job_id; - } MethodCall; + string run_id; + string token_id; + string tag; + string cell_id; + } Meta; + + /* Job requirements for a job. All fields are optional. To submit job requirements, + the user must have full EE2 admin permissions. Ignored for the run concierge endpoint. + + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the job. + request_disk: the amount of disk space, in GB, to request for the job. + client_group: the name of the client group on which to run the job. + client_group_regex: Whether to treat the client group string, whether provided here, + from the catalog, or as a default, as a regular expression when matching + clientgroups. Default True for HTC, but the default depends on the scheduler. + Omit to use the default. + bill_to_user: the job will be counted against the provided user's fair share quota. + ignore_concurrency_limits: ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided to the job + scheduler. Requires knowledge of the scheduler interface. + debug_mode: Whether to run the job in debug mode. Default false. - /* - call_stack - upstream calls details including nested service calls and - parent jobs where calls are listed in order from outer to inner. */ typedef structure { - list call_stack; - string run_id; - } RpcContext; + int request_cpus; + int requst_memory; + int request_disk; + string client_group; + boolean client_group_regex; + string bill_to_user; + boolean ignore_concurrency_limits; + mapping scheduler_requirements; + boolean debug_mode; + } JobRequirements; /* - method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); - params - the parameters of the method that performed this call; + method - the SDK method to run in module.method format, e.g. 
+ 'KBaseTrees.construct_species_tree' + params - the parameters to pass to the method. Optional parameters: + app_id - the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, last version is used if this parameter is not defined - rpc_context - context of current method call including nested call - history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that will serve as a source of data when running the SDK method. These references will - be added to the autogenerated provenance. - app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) - mapping meta - user defined metadata to associate with - the job. + be added to the autogenerated provenance. Must be in UPA format (e.g. + 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an optional workspace id to associate with the job. This is passed to the workspace service, which will share the job based on the permissions of the workspace rather than owner of the job - parent_job_id - EE2 id of the parent of a batch job. Batch jobs will add - this id to the EE2 database under the field "parent_job_id" + parent_job_id - EE2 job id for the parent of the current job. + For run_job and run_job_concierge, this value can be specified to denote + the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and the parent job + record is not altered. + Submitting a job with a parent ID to run_job_batch will cause an error to be + returned. + job_requirements: the requirements for the job. The user must have full EE2 + administration rights to use this parameter. Note that the job_requirements + are not returned along with the rest of the job parameters when querying the EE2 + API - they are only considered when submitting a job. + as_admin: run the job with full EE2 permissions, meaning that any supplied workspace + IDs are not checked for accessibility and job_requirements may be supplied. The + user must have full EE2 administration rights. + Note that this field is not included in returned data when querying EE2. */ typedef structure { string method; + string app_id; list params; string service_ver; - RpcContext rpc_context; - string remote_url; list source_ws_objects; - string app_id; - mapping meta; + Meta meta; int wsid; string parent_job_id; + JobRequirements job_requirements; + boolean as_admin; } RunJobParams; /* - Start a new job (long running method of service registered in ServiceRegistery). - Such job runs Docker image for this service in script mode. + Start a new job. */ funcdef run_job(RunJobParams params) returns (job_id job_id) authentication required; + /* Additional parameters for a batch job. + wsid: the workspace with which to associate the parent job. + as_admin: run the job with full EE2 permissions, meaning that any supplied workspace + IDs are not checked for accessibility and job_requirements may be supplied. The + user must have full EE2 administration rights. + */ typedef structure { int wsid; + boolean as_admin; } BatchParams; typedef structure { - job_id parent_job_id; + job_id batch_id; list child_job_ids; } BatchSubmission; typedef structure { - job_id parent_job_id; + job_id batch_id; list child_job_ids; boolean as_admin; } AbandonChildren; + /* Run a batch job, consisting of a parent job and one or more child jobs. 
+ Note that the as_admin parameters in the list of child jobs are ignored - + only the as_admin parameter in the batch_params is considered. + */ + funcdef run_job_batch(list params, BatchParams batch_params) + returns (BatchSubmission job_ids) authentication required; + + /* + job_id of retried job + retry_id: job_id of the job that was launched + str error: reason as to why that particular retry failed (available for bulk retry only) + */ + typedef structure { + job_id job_id; + job_id retry_id; + string error; + } RetryResult; + + /* + job_id of job to retry + as_admin: retry someone elses job in your namespace + #TODO Possibly Add JobRequirements job_requirements; + */ + typedef structure { + job_id job_id; + boolean as_admin; + } RetryParams; + + /* + job_ids of job to retry + as_admin: retry someone else's job in your namespace + #TODO: Possibly Add list job_requirements; + */ + typedef structure { + list job_ids; + boolean as_admin; + } BulkRetryParams; + + /* + #TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present + #TODO Add retry child that checks the status of the child? to prevent multiple retries + Allowed Jobs + * Regular Job with no children + * Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call + Not Allowed + * Regular Job with children (Should not be possible to create yet) + * Batch Job Parent Container (Not a job, it won't do anything, except cancel it's child jobs) + */ + funcdef retry_job(RetryParams params) returns (RetryResult retry_result) authentication required; + + /* + Same as retry_job, but accepts multiple jobs + */ + funcdef retry_jobs(BulkRetryParams params) returns (list retry_result) authentication required; + - funcdef run_job_batch(list params, BatchParams batch_params) returns (BatchSubmission job_ids) authentication required; - funcdef abandon_children(AbandonChildren params) returns (BatchSubmission parent_and_child_ids) authentication required; + + funcdef abandon_children(AbandonChildren params) + returns (BatchSubmission parent_and_child_ids) authentication required; + /* EE2Constants Concierge Params are request_cpus: int request_memory: int in MB - request_disk: int in MB + request_disk: int in GB job_priority: int = None range from -20 to +20, with higher values meaning better priority. + Note: job_priority is currently not implemented. account_group: str = None # Someone elses account + ignore_concurrency_limits: ignore any limits on simultaneous job runs. + Default 1 (True). requirements_list: list = None ['machine=worker102','color=red'] client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can leave default or specify a clientgroup + client_group_regex: Whether to treat the client group string, whether provided here, + from the catalog, or as a default, as a regular expression when matching + clientgroups. Default True for HTC, but the default depends on the scheduler. + Omit to use the default. + debug_mode: Whether to run the job in debug mode. Default 0 (False). 
*/ typedef structure { int request_cpu; - int request_memory_mb; - int request_disk_mb; + int request_memory; + int request_disk; int job_priority; string account_group; + boolean ignore_concurrency_limits; list requirements_list; string client_group; + boolean client_group_regex; + boolean debug_mode; } ConciergeParams; @@ -327,6 +428,7 @@ wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) ## TODO - verify + job_output - object - outputs from the job (from the run_job call) ## TODO - verify updated - int - timestamp since epoch in milliseconds of the last time the status was updated running - int - timestamp since epoch in milliseconds of when it entered the running state created - int - timestamp since epoch in milliseconds when the job was created @@ -351,18 +453,34 @@ errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that accompanies the error code and message - terminated_code - int - internal reason why a job was terminated, one of: - 0 - user cancellation - 1 - admin cancellation - 2 - terminated by some automatic process + #TODO, add these to the structure? + condor_job_ads - dict - condor related job information - @optional error - @optional error_code - @optional errormsg - @optional terminated_code - @optional estimating - @optional running - @optional finished + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this job + retry_parent - str - job_id of the parent this retry is based off of. Not available on a retry_parent itself + + batch_id - str - the coordinating job, if the job is a child job created via run_job_batch + batch_job - bool - whether or not this is a batch parent container + child_jobs - array - Only parent container should have child job ids + + scheduler_type - str - scheduler, such as awe or condor + scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for estimation + + + terminated_code - int - internal reason why a job was terminated, one of: + 0 - user cancellation + 1 - admin cancellation + 2 - terminated by some automatic process + + @optional error + @optional error_code + @optional errormsg + @optional terminated_code + @optional estimating + @optional running + @optional finished */ @@ -383,6 +501,8 @@ int error_code; string errormsg; int terminated_code; + string batch_id; + } JobState; /* @@ -391,12 +511,12 @@ funcdef check_job(CheckJobParams params) returns (JobState job_state) authentication required; /* - parent_job - state of parent job - job_states - states of child jobs + batch_jobstate - state of the coordinating job for the batch + child_jobstates - states of child jobs IDEA: ADD aggregate_states - count of all available child job states, even if they are zero */ typedef structure { - JobState parent_jobstate; + JobState batch_jobstate; list child_jobstates; } CheckJobBatchResults; @@ -525,30 +645,59 @@ /* - Results of check_jobs_date_range - TODO : DOCUMENT THE RETURN OF STATS mapping + Results of check_jobs_date_range methods. + + jobs - the jobs matching the query, up to `limit` jobs. + count - the number of jobs returned. + query_count - the number of jobs that matched the filters. + filter - DEPRECATED - this field may change in the future. The filters that were + applied to the jobs. 
+ skip - the number of jobs that were skipped prior to beginning to return jobs. + projection - the list of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. + sort_order - the order in which the results were sorted by the job ID - + for + ascending, - for descending. + + TODO: DOCUMENT THE RETURN OF STATS mapping */ typedef structure { - mapping jobs; + list jobs; int count; int query_count; - list filter; + mapping filter; int skip; list projection; int limit; string sort_order; } CheckJobsDateRangeResults; - - /* Check job for all jobs in a given date/time range for all users (Admin function) - float start_time; # Filter based on creation timestamp since epoch - float end_time; # Filter based on creation timestamp since epoch - list projection; # A list of fields to include in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, such as error_code=1, wsid=1234, terminated_code = 1 - int limit; # The maximum number of records to return - string user; # Optional. Defaults off of your token + Notes on start_time and end_time: + These fields are designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: + - if the field is a float or a string that contains a float and only a float, + the field value is treated as seconds since the epoch. + - if the field is an int or a string that contains an int and only an int, + the field value is treated as milliseconds since the epoch. + - if the field is a string not matching the criteria above, it is treated as + a date and time. Nearly any unambigous format can be parsed. + + float start_time - Filter based on job creation timestamp since epoch + float end_time - Filter based on job creation timestamp since epoch + list projection - A list of fields to include in the projection, default ALL + See "Projection Fields" above + list filter - DEPRECATED: this field may change or be removed in the future. + A list of simple filters to "AND" together, such as error_code=1, wsid=1234, + terminated_code = 1 + int limit - The maximum number of records to return + string user - The user whose job records will be returned. Optional. Default is the + current user. + int offset - the number of jobs to skip before returning records. + boolean ascending - true to sort by job ID ascending, false descending. + boolean as_admin - true to run the query as an admin; user must have admin EE2 + permissions. Required if setting `user` to something other than your own. 
+ TODO: this seems to have no effect @optional projection @optional filter @optional limit @@ -568,8 +717,10 @@ boolean as_admin; } CheckJobsDateRangeParams; - funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params) returns (CheckJobsResults) authentication required; - funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params) returns (CheckJobsResults) authentication required; + funcdef check_jobs_date_range_for_user(CheckJobsDateRangeParams params) + returns (CheckJobsDateRangeResults) authentication required; + funcdef check_jobs_date_range_for_all(CheckJobsDateRangeParams params) + returns (CheckJobsDateRangeResults) authentication required; typedef structure { UnspecifiedObject held_job; @@ -586,7 +737,7 @@ /* - str permission; # One of 'r|w|x' (('read' | 'write' | 'none')) + str permission - One of 'r|w|x' (('read' | 'write' | 'none')) */ typedef structure { string permission; diff --git a/kbase.yml b/kbase.yml index ddfbbc858..0cee4a309 100644 --- a/kbase.yml +++ b/kbase.yml @@ -8,8 +8,8 @@ service-language: python module-version: - 0.0.1 + 0.0.5 owners: - [bsadkhin, tgu2, wjriehl] + [bsadkhin, tgu2, wjriehl, gaprice] diff --git a/lgtm.yml b/lgtm.yml new file mode 100644 index 000000000..b064fdbb9 --- /dev/null +++ b/lgtm.yml @@ -0,0 +1,3 @@ +path_classifiers: + generated: + - lib/biokbase/log.py diff --git a/lib/biokbase/README.md b/lib/biokbase/README.md new file mode 100644 index 000000000..2caf66c73 --- /dev/null +++ b/lib/biokbase/README.md @@ -0,0 +1,3 @@ +log.py lives here: https://raw.githubusercontent.com/kbase/sdkbase2/python/log.py + +However, it's needed to run tests so it's checked into this repo. diff --git a/lib/biokbase/log.py b/lib/biokbase/log.py new file mode 100644 index 000000000..5626ac03f --- /dev/null +++ b/lib/biokbase/log.py @@ -0,0 +1,368 @@ +""" +NAME + log + +DESCRIPTION + A library for sending logging messages to syslog. + +METHODS + log(string subsystem, hashref constraints): Initializes log. You + should call this at the beginning of your program. Constraints are + optional. + + log_message(int level, string message): sends log message to syslog. + + * level: (0-9) The logging level for this message is compared to + the logging level that has been set in log. If it is <= + the set logging level, the message will be sent to syslog, + otherwise it will be ignored. Logging level is set to 6 + if control API cannot be reached and the user does + not set the log level. Log level can also be entered as + string (e.g. 'DEBUG') + + * message: This is the log message. + + get_log_level(): Returns the current log level as an integer. + + set_log_level(integer level) : Sets the log level. Only use this if you + wish to override the log levels that are defined by the control API. + Can also be entered as string (e.g. 'DEBUG') + + * level : priority + + * 0 : EMERG - system is unusable + + * 1 : ALERT - component must be fixed immediately + + * 2 : CRIT - secondary component must be fixed immediately + + * 3 : ERR - non-urgent failure + + * 4 : WARNING - warning that an error will occur if no action + is taken + + * 5 : NOTICE - unusual but safe conditions + + * 6 : INFO - normal operational messages + + * 7 : DEBUG - lowest level of debug + + * 8 : DEBUG2 - second level of debug + + * 9 : DEBUG3 - highest level of debug + + set_log_msg_check_count(integer count): used to set the number the + messages that log will log before querying the control API for the + log level (default is 100 messages). 
+ + set_log_msg_check_interval(integer seconds): used to set the interval, + in seconds, that will be allowed to pass before log will query the + control API for the log level (default is 300 seconds). + + update_api_log_level() : Checks the control API for the currently set + log level. + + use_api_log_level() : Removes the user-defined log level and tells log + to use the control API-defined log level. +""" + +import json as _json +import urllib.request as _urllib2 +import syslog as _syslog +import platform as _platform +import inspect as _inspect +import os as _os +import getpass as _getpass +import warnings as _warnings +from configparser import ConfigParser as _ConfigParser +import time + +MLOG_ENV_FILE = 'MLOG_CONFIG_FILE' +_GLOBAL = 'global' +MLOG_LOG_LEVEL = 'mlog_log_level' +MLOG_API_URL = 'mlog_api_url' +MLOG_LOG_FILE = 'mlog_log_file' + +DEFAULT_LOG_LEVEL = 6 +#MSG_CHECK_COUNT = 100 +#MSG_CHECK_INTERVAL = 300 # 300s = 5min +MSG_FACILITY = _syslog.LOG_LOCAL1 +EMERG_FACILITY = _syslog.LOG_LOCAL0 + +EMERG = 0 +ALERT = 1 +CRIT = 2 +ERR = 3 +WARNING = 4 +NOTICE = 5 +INFO = 6 +DEBUG = 7 +DEBUG2 = 8 +DEBUG3 = 9 +_MLOG_TEXT_TO_LEVEL = {'EMERG': EMERG, + 'ALERT': ALERT, + 'CRIT': CRIT, + 'ERR': ERR, + 'WARNING': WARNING, + 'NOTICE': NOTICE, + 'INFO': INFO, + 'DEBUG': DEBUG, + 'DEBUG2': DEBUG2, + 'DEBUG3': DEBUG3, + } +_MLOG_TO_SYSLOG = [_syslog.LOG_EMERG, _syslog.LOG_ALERT, _syslog.LOG_CRIT, + _syslog.LOG_ERR, _syslog.LOG_WARNING, _syslog.LOG_NOTICE, + _syslog.LOG_INFO, _syslog.LOG_DEBUG, _syslog.LOG_DEBUG, + _syslog.LOG_DEBUG] +#ALLOWED_LOG_LEVELS = set(_MLOG_TEXT_TO_LEVEL.values()) +_MLOG_LEVEL_TO_TEXT = {} +for k, v in _MLOG_TEXT_TO_LEVEL.items(): + _MLOG_LEVEL_TO_TEXT[v] = k +LOG_LEVEL_MIN = min(_MLOG_LEVEL_TO_TEXT.keys()) +LOG_LEVEL_MAX = max(_MLOG_LEVEL_TO_TEXT.keys()) +del k, v + + +class log(object): + """ + This class contains the methods necessary for sending log messages. 
+ """ + + def __init__(self, subsystem, constraints=None, config=None, logfile=None, + ip_address=False, authuser=False, module=False, + method=False, call_id=False, changecallback=None): + if not subsystem: + raise ValueError("Subsystem must be supplied") + + self.user = _getpass.getuser() + self.parentfile = _os.path.abspath(_inspect.getfile( + _inspect.stack()[1][0])) + self.ip_address = ip_address + self.authuser = authuser + self.module = module + self.method = method + self.call_id = call_id + noop = lambda: None + self._callback = changecallback or noop + self._subsystem = str(subsystem) + self._mlog_config_file = config + if not self._mlog_config_file: + self._mlog_config_file = _os.environ.get(MLOG_ENV_FILE, None) + if self._mlog_config_file: + self._mlog_config_file = str(self._mlog_config_file) + self._user_log_level = -1 + self._config_log_level = -1 + self._user_log_file = logfile + self._config_log_file = None + self._api_log_level = -1 + self._msgs_since_config_update = 0 + self._time_at_config_update = time.time() + self.msg_count = 0 + self._recheck_api_msg = 100 + self._recheck_api_time = 300 # 5 mins + self._log_constraints = {} if not constraints else constraints + + self._init = True + self.update_config() + self._init = False + + def _get_time_since_start(self): + time_diff = time.time() - self._time_at_config_update + return time_diff + + def get_log_level(self): + if(self._user_log_level != -1): + return self._user_log_level + elif(self._config_log_level != -1): + return self._config_log_level + elif(self._api_log_level != -1): + return self._api_log_level + else: + return DEFAULT_LOG_LEVEL + + def _get_config_items(self, cfg, section): + cfgitems = {} + if cfg.has_section(section): + for k, v in cfg.items(section): + cfgitems[k] = v + return cfgitems + + def update_config(self): + loglevel = self.get_log_level() + logfile = self.get_log_file() + + self._api_log_level = -1 + self._msgs_since_config_update = 0 + self._time_at_config_update = time.time() + + # Retrieving the control API defined log level + api_url = None + if self._mlog_config_file and _os.path.isfile(self._mlog_config_file): + cfg = _ConfigParser() + cfg.read(self._mlog_config_file) + cfgitems = self._get_config_items(cfg, _GLOBAL) + cfgitems.update(self._get_config_items(cfg, self._subsystem)) + if MLOG_LOG_LEVEL in cfgitems: + try: + self._config_log_level = int(cfgitems[MLOG_LOG_LEVEL]) + except: + _warnings.warn( + 'Cannot parse log level {} from file {} to int'.format( + cfgitems[MLOG_LOG_LEVEL], self._mlog_config_file) + + '. Keeping current log level.') + if MLOG_API_URL in cfgitems: + api_url = cfgitems[MLOG_API_URL] + if MLOG_LOG_FILE in cfgitems: + self._config_log_file = cfgitems[MLOG_LOG_FILE] + elif self._mlog_config_file: + _warnings.warn('Cannot read config file ' + self._mlog_config_file) + + if (api_url): + subsystem_api_url = api_url + "/" + self._subsystem + try: + data = _json.load(_urllib2.urlopen(subsystem_api_url, + timeout=5)) + except _urllib2.URLError as e: + code_ = None + if hasattr(e, 'code'): + code_ = ' ' + str(e.code) + _warnings.warn( + 'Could not connect to mlog api server at ' + + '{}:{} {}. 
Using default log level {}.'.format( + subsystem_api_url, code_, str(e.reason), + str(DEFAULT_LOG_LEVEL))) + else: + max_matching_level = -1 + for constraint_set in data['log_levels']: + level = constraint_set['level'] + constraints = constraint_set['constraints'] + if level <= max_matching_level: + continue + + matches = 1 + for constraint in constraints: + if constraint not in self._log_constraints: + matches = 0 + elif (self._log_constraints[constraint] != + constraints[constraint]): + matches = 0 + + if matches == 1: + max_matching_level = level + + self._api_log_level = max_matching_level + if ((self.get_log_level() != loglevel or + self.get_log_file() != logfile) and not self._init): + self._callback() + + def _resolve_log_level(self, level): + if(level in _MLOG_TEXT_TO_LEVEL): + level = _MLOG_TEXT_TO_LEVEL[level] + elif(level not in _MLOG_LEVEL_TO_TEXT): + raise ValueError('Illegal log level') + return level + + def set_log_level(self, level): + self._user_log_level = self._resolve_log_level(level) + self._callback() + + def get_log_file(self): + if self._user_log_file: + return self._user_log_file + if self._config_log_file: + return self._config_log_file + return None + + def set_log_file(self, filename): + self._user_log_file = filename + self._callback() + + def set_log_msg_check_count(self, count): + count = int(count) + if count < 0: + raise ValueError('Cannot check a negative number of messages') + self._recheck_api_msg = count + + def set_log_msg_check_interval(self, interval): + interval = int(interval) + if interval < 0: + raise ValueError('interval must be positive') + self._recheck_api_time = interval + + def clear_user_log_level(self): + self._user_log_level = -1 + self._callback() + + def _get_ident(self, level, user, parentfile, ip_address, authuser, module, + method, call_id): + infos = [self._subsystem, _MLOG_LEVEL_TO_TEXT[level], + repr(time.time()), user, parentfile, str(_os.getpid())] + if self.ip_address: + infos.append(str(ip_address) if ip_address else '-') + if self.authuser: + infos.append(str(authuser) if authuser else '-') + if self.module: + infos.append(str(module) if module else '-') + if self.method: + infos.append(str(method) if method else '-') + if self.call_id: + infos.append(str(call_id) if call_id else '-') + return "[" + "] [".join(infos) + "]" + + def _syslog(self, facility, level, ident, message): + _syslog.openlog(ident, facility) + if isinstance(message, str): + _syslog.syslog(_MLOG_TO_SYSLOG[level], message) + else: + try: + for m in message: + _syslog.syslog(_MLOG_TO_SYSLOG[level], m) + except TypeError: + _syslog.syslog(_MLOG_TO_SYSLOG[level], str(message)) + _syslog.closelog() + + def _log(self, ident, message): + ident = ' '.join([str(time.strftime( + "%Y-%m-%d %H:%M:%S", time.localtime())), + _platform.node(), ident + ': ']) + try: + with open(self.get_log_file(), 'a') as log: + if isinstance(message, str): + log.write(ident + message + '\n') + else: + try: + for m in message: + log.write(ident + m + '\n') + except TypeError: + log.write(ident + str(message) + '\n') + except Exception as e: + err = 'Could not write to log file ' + str(self.get_log_file()) + \ + ': ' + str(e) + '.' 
+ _warnings.warn(err) + + def log_message(self, level, message, ip_address=None, authuser=None, + module=None, method=None, call_id=None): +# message = str(message) + level = self._resolve_log_level(level) + + self.msg_count += 1 + self._msgs_since_config_update += 1 + + if(self._msgs_since_config_update >= self._recheck_api_msg + or self._get_time_since_start() >= self._recheck_api_time): + self.update_config() + + ident = self._get_ident(level, self.user, self.parentfile, ip_address, + authuser, module, method, call_id) + # If this message is an emergency, send a copy to the emergency + # facility first. + if(level == 0): + self._syslog(EMERG_FACILITY, level, ident, message) + + if(level <= self.get_log_level()): + self._syslog(MSG_FACILITY, level, ident, message) + if self.get_log_file(): + self._log(ident, message) + +if __name__ == '__main__': + pass diff --git a/lib/execution_engine2/README.md b/lib/execution_engine2/README.md new file mode 100644 index 000000000..84e6f898c --- /dev/null +++ b/lib/execution_engine2/README.md @@ -0,0 +1,3 @@ +authclient.py lives here: https://github.com/kbase/kb_sdk/blob/master/src/java/us/kbase/templates/authclient.py + +... but is checked in as it's needed for tests. \ No newline at end of file diff --git a/lib/execution_engine2/authclient.py b/lib/execution_engine2/authclient.py new file mode 100644 index 000000000..844f9b0c2 --- /dev/null +++ b/lib/execution_engine2/authclient.py @@ -0,0 +1,94 @@ +''' +Created on Aug 1, 2016 + +A very basic KBase auth client for the Python server. + +@author: gaprice@lbl.gov +''' +import time as _time +import requests as _requests +import threading as _threading +import hashlib + + +class TokenCache(object): + ''' A basic cache for tokens. ''' + + _MAX_TIME_SEC = 5 * 60 # 5 min + + _lock = _threading.RLock() + + def __init__(self, maxsize=2000): + self._cache = {} + self._maxsize = maxsize + self._halfmax = maxsize / 2 # int division to round down + + def get_user(self, token): + token = hashlib.sha256(token.encode('utf-8')).hexdigest() + with self._lock: + usertime = self._cache.get(token) + if not usertime: + return None + + user, intime = usertime + if _time.time() - intime > self._MAX_TIME_SEC: + return None + return user + + def add_valid_token(self, token, user): + if not token: + raise ValueError('Must supply token') + if not user: + raise ValueError('Must supply user') + token = hashlib.sha256(token.encode('utf-8')).hexdigest() + with self._lock: + self._cache[token] = [user, _time.time()] + if len(self._cache) > self._maxsize: + sorted_items = sorted( + list(self._cache.items()), + key=(lambda v: v[1][1]) + ) + for i, (t, _) in enumerate(sorted_items): + if i <= self._halfmax: + del self._cache[t] + else: + break + + +class KBaseAuth(object): + ''' + A very basic KBase auth client for the Python server. 
+ ''' + + _LOGIN_URL = 'https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login' + + def __init__(self, auth_url=None): + ''' + Constructor + ''' + self._authurl = auth_url + if not self._authurl: + self._authurl = self._LOGIN_URL + self._cache = TokenCache() + + def get_user(self, token): + if not token: + raise ValueError('Must supply token') + user = self._cache.get_user(token) + if user: + return user + + d = {'token': token, 'fields': 'user_id'} + ret = _requests.post(self._authurl, data=d) + if not ret.ok: + try: + err = ret.json() + except Exception as e: + ret.raise_for_status() + raise ValueError('Error connecting to auth service: {} {}\n{}' + .format(ret.status_code, ret.reason, + err['error']['message'])) + + user = ret.json()['user_id'] + self._cache.add_valid_token(token, user) + return user diff --git a/lib/execution_engine2/authorization/authstrategy.py b/lib/execution_engine2/authorization/authstrategy.py index 5ba8c2fe7..8ac55a034 100644 --- a/lib/execution_engine2/authorization/authstrategy.py +++ b/lib/execution_engine2/authorization/authstrategy.py @@ -10,68 +10,60 @@ KBASE_WS_AUTHSTRAT = "kbaseworkspace" -def can_read_job(job: Job, user_id: str, token: str, config: Dict[str, str]) -> bool: +def can_read_job(job: Job, user_id: str, ws_auth: WorkspaceAuth) -> bool: """ Returns True if the user has read access to the job, False otherwise. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config + :param ws_auth: a workspace authorization instance initialized with the user's token. :returns: bool - True if the user can read the job info """ - return _check_permissions(job, user_id, token, config, level="read") + return _check_permissions(job, user_id, ws_auth, level="read") -def can_write_job(job: Job, user_id: str, token: str, config: Dict[str, str]) -> bool: +def can_write_job(job: Job, user_id: str, ws_auth: WorkspaceAuth) -> bool: """ Returns True if the user has write access to the job, False otherwise. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config + :param ws_auth: a workspace authorization instance initialized with the user's token. :returns: bool - True if the user can read the job info """ - return _check_permissions(job, user_id, token, config, level="write") + return _check_permissions(job, user_id, ws_auth, level="write") -def can_read_jobs( - jobs: List[Job], user_id: str, token: str, config: Dict[str, str] -) -> List[bool]: +def can_read_jobs(jobs: List[Job], user_id: str, ws_auth: WorkspaceAuth) -> List[bool]: """ Returns a list of job permissions in the same order as the given list of Jobs. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config + :param ws_auth: a workspace authorization instance initialized with the user's token. 
:returns: List[bool] - Has True values if the user can read job info, False otherwise """ - return _check_permissions_list(jobs, user_id, token, config, level="read") + return _check_permissions_list(jobs, user_id, ws_auth, level="read") -def can_write_jobs( - jobs: List[Job], user_id: str, token: str, config: Dict[str, str] -) -> List[bool]: +def can_write_jobs(jobs: List[Job], user_id: str, ws_auth: WorkspaceAuth) -> List[bool]: """ Returns a list of job write permissions in the same order as the given list of Jobs. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config + :param ws_auth: a workspace authorization instance initialized with the user's token. :returns: List[bool] - Has True values if the user can write job info, False otherwise """ - return _check_permissions_list(jobs, user_id, token, config, level="write") + return _check_permissions_list(jobs, user_id, ws_auth, level="write") def _check_permissions( - job: Job, user_id: str, token: str, config: Dict[str, str], level="read" + job: Job, user_id: str, ws_auth: WorkspaceAuth, level="read" ) -> bool: """ Returns a job permissions, for either read or write ability :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config - :param level: string - if "read", then returns the read value, if "write", return whether the user can write. + :param ws_auth: a workspace authorization instance initialized with the user's token. + :param level: string - if "read", then returns the read value, if "write", return whether + the user can write. :returns: bool - True if the permission is valid, False otherwise. """ if user_id == job.user: @@ -79,7 +71,6 @@ def _check_permissions( if job.authstrat == KBASE_WS_AUTHSTRAT: if job.wsid is None: return False - ws_auth = WorkspaceAuth(token, user_id, config["workspace-url"]) if level == "read": return ws_auth.can_read(job.wsid) else: @@ -89,15 +80,15 @@ def _check_permissions( def _check_permissions_list( - jobs: List[Job], user_id: str, token: str, config: Dict[str, str], level="read" + jobs: List[Job], user_id: str, ws_auth: WorkspaceAuth, level="read" ) -> List[bool]: """ Returns True for each job the user has read access to, and False for the ones they don't. :param job: a Job model object :param user_id: string - the user id - :param token: string - the user's auth token - :param config: dict - the service config - :param level: string - if "read" then tests if the Job can be read, otherwise checks if it can be written + :param ws_auth: a workspace authorization instance initialized with the user's token + :param level: string - if "read" then tests if the Job can be read, otherwise checks if it + can be written :returns: List[bool] - Has True values if the user can write job info, False otherwise """ @@ -134,7 +125,6 @@ def _check_permissions_list( if len(ws_ids_to_jobs): # If there's workspaces to look up, go do it. 
- ws_auth = WorkspaceAuth(token, user_id, config["workspace-url"]) if level == "read": ws_perms = ws_auth.can_read_list( list(ws_ids_to_jobs.keys()) diff --git a/lib/execution_engine2/authorization/workspaceauth.py b/lib/execution_engine2/authorization/workspaceauth.py index 06865873f..cecdc84f8 100644 --- a/lib/execution_engine2/authorization/workspaceauth.py +++ b/lib/execution_engine2/authorization/workspaceauth.py @@ -1,8 +1,8 @@ from typing import List, Dict from enum import Enum -from lib.execution_engine2.authorization.basestrategy import AuthStrategy -from lib.installed_clients.WorkspaceClient import Workspace -from lib.installed_clients.baseclient import ServerError +from execution_engine2.authorization.basestrategy import AuthStrategy +from installed_clients.WorkspaceClient import Workspace +from installed_clients.baseclient import ServerError STRATEGY = "kbaseworkspace" @@ -15,8 +15,8 @@ class WorkspacePermission(Enum): class WorkspaceAuth(AuthStrategy): - def __init__(self, token: str, user_id: str, ws_url: str): - self.ws_client = Workspace(url=ws_url, token=token) + def __init__(self, user_id: str, workspace: Workspace): + self.ws_client = workspace self.user_id = user_id def can_read(self, auth_param: str) -> bool: diff --git a/lib/execution_engine2/cruft.py b/lib/execution_engine2/cruft.py deleted file mode 100644 index 1d290cbaf..000000000 --- a/lib/execution_engine2/cruft.py +++ /dev/null @@ -1,180 +0,0 @@ -# # def _run_admin_command(self, command, params): -# # available_commands = ["cancel_job", "view_job_logs"] -# # if command not in available_commands: -# # raise ValueError(f"{command} not an admin command. See {available_commands} ") -# # commands = {"cancel_job": self.cancel_job, "view_job_logs": self.view_job_logs} -# # p = { -# # "cancel_job": { -# # "job_id": params.get("job_id"), -# # "terminated_code": params.get( -# # "terminated_code", TerminatedCode.terminated_by_admin.value -# # ), -# # }, -# # "view_job_logs": {"job_id": params.get("job_id")}, -# # } -# # return commands[command](**p[command]) -# # -# # def admin_role(self, token): -# # """ -# # Check to see which role the given token has -# # :param token: Token to inspect -# # :return: One of 'EE2_ADMIN_RO' or 'EE2_ADMIN` or None -# # """ -# # return AdminAuthUtil(self.auth_url, self.admin_roles).get_admin_role( -# # token=token, read_role="EE2_ADMIN_RO", write_role="EE2_ADMIN" -# # ) -# # -# # def get_job_wrapper(self, job_id, required_admin_role=None): -# # """ -# # If you are an admin, you can -# # If you are not an admin, you -# # :param job_id: -# # :return: -# # """ -# # if required_admin_role is not None and required_admin_role in self.roles: -# # job = self.get_mongo_util().get_job(job_id=job_id) -# # logging.info(f"ADMIN USER has permission to cancel job {job_id}") -# # self.logger.debug(f"ADMIN USER has permission to cancel job {job_id}") -# # else: -# # job = self.get_job_with_permission(job_id, JobPermissions.WRITE) -# # logging.info(f"User has permission to cancel job {job_id}") -# # self.logger.debug(f"User has permission to cancel job {job_id}") -# # return job -# # -# # def administer(self, command, params, token): -# # """ -# # Run commands as an administrator. Requires a token for a user with an EE2 administrative role. -# # Currently allowed commands are cancel_job and view_job_logs. -# # -# # Commands are given as strings, and their parameters are given as a dictionary of keys and values. 
-# # For example: -# # administer("cancel_job", {"job_id": 12345}, auth_token) -# # is the same as running -# # cancel_job(12345) -# # but with administrative privileges. -# # :param command: The command to run (See specfile) -# # :param params: The parameters for that command that will be expanded (See specfile) -# # :param token: The auth token (Will be checked for the correct auth role) -# # :return: -# # """ -# # logging.info( -# # f'Attempting to run administrative command "{command}" as user {self.user_id}' -# # ) -# # # set admin privs, one way or the other -# # self.is_admin = self._is_admin(token) -# # if not self.is_admin: -# # raise PermissionError( -# # f"User {self.user_id} is not authorized to run administrative commands." -# # ) -# # self._run_admin_command(command, params) -# # self.is_admin = False -# -# -# def process_old_format(self, cg_resources_requirements): -# """ -# Old format is njs,request_cpu=1,request_memory=1,request_disk=1,request_color=blue -# Regex is assumed to be true -# -# :param cg_resources_requirements: -# :return: -# """ -# cg_res_req_split = cg_resources_requirements.split(",") # List -# -# # Access and remove clientgroup from the statement -# client_group = cg_res_req_split.pop(0) -# -# requirements = dict() -# for item in cg_res_req_split: -# (req, value) = item.split("=") -# requirements[req] = value -# -# # Set up default resources -# resources = self.get_default_resources(client_group) -# -# if client_group is None or client_group is "": -# client_group = resources[self.CG] -# -# requirements_statement = [] -# -# for key, value in requirements.items(): -# if key in resources: -# # Overwrite the resources with catalog entries -# resources[key] = value -# else: -# # Otherwise add it to the requirements statement -# requirements_statement.append(f"{key}={value}") -# -# # Delete special keys -# print(resources) -# print(requirements) -# -# del requirements[self.REQUEST_MEMORY] -# del requirements[self.REQUEST_CPUS] -# del requirements[self.REQUEST_DISK] -# -# # Set the clientgroup just in case it was blank -# -# # Add clientgroup to resources because it is special -# # Regex is enabled by default -# cge = f'regexp("{client_group}",CLIENTGROUP)' -# requirements_statement.append(cge) -# -# rv = dict() -# rv[self.CG] = client_group -# rv["client_group_expression"] = cge -# rv["requirements"] = "".join(requirements_statement) -# rv["requirements_statement"] = cge -# for key, value in resources.items(): -# rv[key] = value -# -# return rv -# -# -# def process_new_format(self, client_group_and_requirements): -# """ -# New format is {'client_group' : 'njs', 'request_cpu' : 1, 'request_disk' : -# :param client_group_and_requirements: -# :return: -# """ -# reqs = json.loads(client_group_and_requirements) -# -# def generate_requirements(self, cg_resources_requirements): -# print(cg_resources_requirements) -# if "{" in cg_resources_requirements: -# reqs = self.process_new_format(cg_resources_requirements) -# else: -# reqs = self.process_old_format(cg_resources_requirements) -# -# self.check_for_missing_requirements(reqs) -# -# return self.resource_requirements( -# request_cpus=reqs["request_cpus"], -# request_disk=reqs["request_disk"], -# request_memory=reqs["request_memory"], -# requirements_statement=reqs["requirements"], -# ) -# return r -# -# @staticmethod -# def check_for_missing_requirements(requirements): -# for item in ( -# "client_group_expression", -# "request_cpus", -# "request_disk", -# "request_memory", -# ): -# if item not in requirements: -# 
raise MissingCondorRequirementsException( -# f"{item} not found in requirements" -# ) -# -# def _process_requirements_new_format(self, requirements): -# requirements = dict() -# cg = requirements.get("client_group", "") -# if cg is "": -# # requirements[ -# -# if bool(requirements.get("regex", False)) is True: -# cg["client_group_requirement"] = f'regexp("{cg}",CLIENTGROUP)' -# else: -# cg["client_group_requirement"] = f"+CLIENTGROUP == {client_group} " diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py index 61fe0137b..349b066bc 100644 --- a/lib/execution_engine2/db/MongoUtil.py +++ b/lib/execution_engine2/db/MongoUtil.py @@ -3,27 +3,32 @@ import time import traceback from contextlib import contextmanager - +from datetime import datetime +from typing import Dict, List from bson.objectid import ObjectId from mongoengine import connect, connection -from pymongo import MongoClient +from pymongo import MongoClient, UpdateOne from pymongo.errors import ServerSelectionTimeoutError -from lib.execution_engine2.db.models.models import JobLog, Job, Status, TerminatedCode -from lib.execution_engine2.exceptions import ( +from execution_engine2.db.models.models import JobLog, Job, Status, TerminatedCode +from execution_engine2.exceptions import ( RecordNotFoundException, InvalidStatusTransitionException, ) +from lib.execution_engine2.utils.arg_processing import parse_bool +from execution_engine2.sdk.EE2Runjob import JobIdPair + class MongoUtil: - def __init__(self, config: dict): + def __init__(self, config: Dict): self.config = config self.mongo_host = config["mongo-host"] self.mongo_port = int(config["mongo-port"]) self.mongo_database = config["mongo-database"] self.mongo_user = config["mongo-user"] self.mongo_pass = config["mongo-password"] + self.retry_rewrites = parse_bool(config["mongo-retry-rewrites"]) self.mongo_authmechanism = config["mongo-authmechanism"] self.mongo_collection = None self._start_local_service() @@ -39,9 +44,10 @@ def _get_pymongo_client(self): password=self.mongo_pass, authSource=self.mongo_database, authMechanism=self.mongo_authmechanism, + retryWrites=self.retry_rewrites, ) - def _get_mongoengine_client(self): + def _get_mongoengine_client(self) -> connection: return connect( db=self.mongo_database, host=self.mongo_host, @@ -50,7 +56,9 @@ def _get_mongoengine_client(self): password=self.mongo_pass, authentication_source=self.mongo_database, authentication_mechanism=self.mongo_authmechanism, - ) # type: connection + retryWrites=self.retry_rewrites, + ) + # This MongoDB deployment does not support retryable writes def _start_local_service(self): try: @@ -215,7 +223,9 @@ def get_job(self, job_id=None, exclude_fields=None) -> Job: return job - def get_jobs(self, job_ids=None, exclude_fields=None, sort_id_ascending=None): + def get_jobs( + self, job_ids=None, exclude_fields=None, sort_id_ascending=None + ) -> List[Job]: if not (job_ids and isinstance(job_ids, list)): raise ValueError("Please provide a non empty list of job ids") @@ -262,6 +272,68 @@ def check_if_already_finished(job_status): return True return False + def update_jobs_to_queued( + self, job_id_pairs: List[JobIdPair], scheduler_type: str = "condor" + ) -> None: + f""" + * Adds scheduler id to list of jobs + * Updates a list of {Status.created.value} jobs to queued. Does not work on jobs that already have gone through any other + status transition. 
If the record is not in the {Status.created.value} status, nothing will happen + :param job_id_pairs: A list of pairs of Job Ids and Scheduler Ids + :param scheduler_type: The scheduler this job was queued in, default condor + """ + + bulk_update_scheduler_jobs = [] + bulk_update_created_to_queued = [] + queue_time_now = datetime.utcnow().timestamp() + for job_id_pair in job_id_pairs: + if job_id_pair.job_id is None: + raise ValueError( + f"Provided a bad job_id_pair, missing job_id for {job_id_pair.scheduler_id}" + ) + elif job_id_pair.scheduler_id is None: + raise ValueError( + f"Provided a bad job_id_pair, missing scheduler_id for {job_id_pair.job_id}" + ) + + bulk_update_scheduler_jobs.append( + UpdateOne( + { + "_id": ObjectId(job_id_pair.job_id), + }, + { + "$set": { + "scheduler_id": job_id_pair.scheduler_id, + "scheduler_type": scheduler_type, + } + }, + ) + ) + bulk_update_created_to_queued.append( + UpdateOne( + { + "_id": ObjectId(job_id_pair.job_id), + "status": Status.created.value, + }, + { + "$set": { + "status": Status.queued.value, + "queued": queue_time_now, + } + }, + ) + ) + # Update provided jobs with scheduler id. Then only update non terminated jobs into updated status. + mongo_collection = self.config["mongo-jobs-collection"] + + if bulk_update_scheduler_jobs: + with self.pymongo_client(mongo_collection) as pymongo_client: + ee2_jobs_col = pymongo_client[self.mongo_database][mongo_collection] + # Bulk Update to add scheduler ids + ee2_jobs_col.bulk_write(bulk_update_scheduler_jobs, ordered=False) + # Bulk Update to add queued status ids + ee2_jobs_col.bulk_write(bulk_update_created_to_queued, ordered=False) + def cancel_job(self, job_id=None, terminated_code=None): """ #TODO Should we check for a valid state transition here also? @@ -372,8 +444,6 @@ def update_job_status(self, job_id, status, msg=None, error_message=None): f"Cannot change already finished/terminated/errored job. {j.status} to {status}" ) - self.logger.debug(f"job status is {j.status}. 
going to update to {status}") - # A job in status running can only be terminated/error/finished if j.status == Status.running.value: if status not in [ @@ -421,6 +491,18 @@ def update_job_status(self, job_id, status, msg=None, error_message=None): def mongo_engine_connection(self): yield self.me_connection + def insert_jobs(self, jobs_to_insert: List[Job]) -> List[ObjectId]: + """ + Insert multiple job records using MongoEngine + :param jobs_to_insert: Multiple jobs to insert at once + :return: List of job ids from the insertion + """ + # TODO Look at pymongo write_concerns that may be useful + # TODO see if pymongo is faster + # TODO: Think about error handling + inserted = Job.objects.insert(doc_or_docs=jobs_to_insert, load_bulk=False) + return inserted + def insert_one(self, doc): """ insert a doc into collection @@ -442,7 +524,7 @@ def insert_one(self, doc): return rec.inserted_id def _push_job_logs(self, log_lines: JobLog, job_id: str, record_count: int): - """ append a list of job logs, and update the record count """ + """append a list of job logs, and update the record count""" update_filter = {"_id": ObjectId(job_id)} push_op = {"lines": {"$each": log_lines}} diff --git a/lib/execution_engine2/db/models/models.py b/lib/execution_engine2/db/models/models.py index f8eb128a7..99e115412 100644 --- a/lib/execution_engine2/db/models/models.py +++ b/lib/execution_engine2/db/models/models.py @@ -109,7 +109,9 @@ class Meta(EmbeddedDocument): token_id = StringField() tag = StringField() cell_id = StringField() - status = StringField() + + def __repr__(self): + return self.to_json() class CondorResourceUsage(EmbeddedDocument): @@ -147,6 +149,9 @@ class JobRequirements(EmbeddedDocument): disk = IntField() estimate = EmbeddedDocumentField(Estimate) + def __repr__(self): + return self.to_json() + class JobInput(EmbeddedDocument): """ @@ -158,12 +163,17 @@ class JobInput(EmbeddedDocument): requested_release = StringField() params = DynamicField() service_ver = StringField(required=True) - app_id = StringField(required=True) + app_id = StringField() source_ws_objects = ListField() + # this ID is for jobs submitted via run_job with a parent_job_id field included by the + # client. For this case, the parent job is not updated at all. parent_job_id = StringField() requirements = EmbeddedDocumentField(JobRequirements) narrative_cell_info = EmbeddedDocumentField(Meta, required=True) + def __repr__(self): + return self.to_json() + class JobOutput(EmbeddedDocument): """ @@ -209,6 +219,7 @@ class TerminatedCode(Enum): terminated_by_admin = 1 terminated_by_automation = 2 terminated_by_batch_abort = 3 + terminated_by_server_failure = 4 class Status(Enum): @@ -304,15 +315,26 @@ class Job(Document): terminated_code = IntField(validation=valid_termination_code) error_code = IntField(validation=valid_errorcode) - + batch_job = BooleanField(default=False) scheduler_type = StringField() scheduler_id = StringField() scheduler_estimator_id = StringField() job_input = EmbeddedDocumentField(JobInput, required=True) job_output = DynamicField() condor_job_ads = DynamicField() - child_jobs = ListField() - batch_job = BooleanField(default=False) + # this is the ID of the coordinating job created as part of run_job_batch. Only child jobs + # in a "true" batch job maintained by EE2 should have this field. Coordinating jobs will + # be updated with the child ID in child_jobs, unlike "fake" batch jobs that are created + # outside of the EE2 codebase using the 'parent_job_id' field. 
+ batch_id = StringField() + child_jobs = ListField() # Only coordinating jobs should have child jobs + # batch_parent_container = BooleanField(default=False) # Only parent container should have this + retry_ids = ListField() # The retry_parent has been used to launch these jobs + # Only present on a retried job, not it's parent. If attempting to retry this job, use its parent instead + retry_parent = StringField() + retry_saved_toggle = BooleanField( + default=False + ) # Marked true when all retry steps have completed meta = {"collection": "ee2_jobs"} @@ -324,6 +346,53 @@ def __repr__(self): return self.to_json() +# class BatchJobCollection(Document): +# """ +# A container for storing related batch job containers +# Does this need to exist before creating a collection? +# """ +# +# # User and wsid are used for permission handling +# user = StringField(required=True) +# wsid = IntField(required=False, default=None) +# batch_jobs = ListField(required=True) +# updated = FloatField(default=time.time) +# title = StringField(required=False) +# description = StringField(required=False) +# +# def save(self, *args, **kwargs): +# self.updated = time.time() +# return super(BatchJobCollection, self).save(*args, **kwargs) +# +# def __repr__(self): +# return self.to_json() +# +# +# class BatchJobContainer(Document): +# """ +# A container for storing jobs information +# Can be created via run_job_batch endpoint, or through the UI/ee2 api, +# or a running job with the ee2_client +# """ +# +# meta = {"collection": "ee2_jobs"} +# user = StringField(required=True) +# wsid = IntField(required=False, default=None) +# updated = FloatField(default=time.time) +# scheduler_type = StringField(default="htcondor", required=False) +# child_jobs = ListField(required=True) +# title = StringField(required=False) +# description = StringField(required=False) +# meta = {"collection": "ee2_jobs"} +# +# def save(self, *args, **kwargs): +# self.updated = time.time() +# return super(BatchJobContainer, self).save(*args, **kwargs) +# +# def __repr__(self): +# return self.to_json() + + # Unused for now class HeldJob(Document): job_id = ReferenceField(Job) diff --git a/lib/execution_engine2/exceptions.py b/lib/execution_engine2/exceptions.py index 523ac086b..13961697e 100644 --- a/lib/execution_engine2/exceptions.py +++ b/lib/execution_engine2/exceptions.py @@ -1,54 +1,74 @@ class ExecutionEngineValueError(ValueError): - """Base Class for ee2 exceptions""" + """ + Base Class for ee2 value exceptions + Subclass exceptions use docstring as default message + """ - pass + def __init__(self, msg=None, *args, **kwargs): + super().__init__(msg or self.__doc__, *args, **kwargs) class ExecutionEngineException(Exception): - pass + """ + Base Class for ee2 exceptions + Subclass exceptions use docstring as default message + """ + + def __init__(self, msg=None, *args, **kwargs): + super().__init__(msg or self.__doc__, *args, **kwargs) class IncorrectParamsException(ExecutionEngineValueError): - pass + """Wrong parameters were provided""" + + +class InvalidParameterForBatch(ExecutionEngineValueError): + """Workspace ids are not allowed in RunJobParams in Batch Mode""" class MissingRunJobParamsException(ExecutionEngineValueError): - pass + """Provided an empty (RunJobParams) parameter mapping""" class InvalidStatusTransitionException(ExecutionEngineValueError): - pass + """Raised if the status transition is NOT ALLOWED""" class InvalidOperationForStatusException(ExecutionEngineValueError): - pass + """The current operation is not valid for this 
job status""" class MissingCondorRequirementsException(ExecutionEngineValueError): - pass + """Raised if malformed requirements information is retrieved for an ee2 job""" class MalformedJobIdException(ExecutionEngineValueError): - pass + """Raised if bad ee2 id is passed in""" class MalformedTimestampException(ExecutionEngineException): - pass + """Bad timestamps""" class ChildrenNotFoundError(ExecutionEngineException): - pass + """Raised if children are not found for a given parent when attempting to abandon children""" class RecordNotFoundException(ExecutionEngineException): - pass + """Raised if ee2 job or ee2 job log record is not found in db""" class CondorJobNotFoundException(ExecutionEngineException): - pass + """Raised if condor job is not found""" + + +class RetryFailureException(ExecutionEngineException): + """General exception for couldn't Retry the job failures'""" + + +class CannotRetryJob(ExecutionEngineException): + """Can only retry errored or cancelled jobs, and not batch parents""" class AuthError(ExecutionEngineException): """Raised if a user is unauthorized for a particular action, or doesn't have the right auth role""" - - pass diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index fdd718e89..5b6366de5 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -1,13 +1,15 @@ # -*- coding: utf-8 -*- #BEGIN_HEADER +import os import time from cachetools import TTLCache -from lib.execution_engine2.db.MongoUtil import MongoUtil from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.APIHelpers import GenerateFromConfig +from execution_engine2.utils.clients import get_client_set - +_AS_ADMIN = "as_admin" #END_HEADER @@ -26,9 +28,9 @@ class execution_engine2: # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa - VERSION = "0.0.1" - GIT_URL = "https://bio-boris@github.com/kbase/execution_engine2" - GIT_COMMIT_HASH = "78ab4aaa17181deb81e06cd077c31bf6929b009f" + VERSION = "0.0.5" + GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" + GIT_COMMIT_HASH = "2ad95ce47caa4f1e7b939651f2b1773840e67a8a" #BEGIN_CLASS_HEADER MONGO_COLLECTION = "jobs" @@ -58,10 +60,12 @@ def __init__(self, config): self.admin_permissions_cache = TTLCache( maxsize=self.ADMIN_ROLES_CACHE_SIZE, ttl=self.ADMIN_ROLES_CACHE_EXPIRE_TIME ) - self.mongo_util = MongoUtil(config) - - - + self.gen_cfg = GenerateFromConfig(config) + # move these into GFC? Since they're only generated once it doesn't seem necessary + configpath = os.environ["KB_DEPLOYMENT_CONFIG"] + override = os.environ.get("OVERRIDE_CLIENT_GROUP") + with open(configpath) as cf: + self.clients = get_client_set(config, cf, override) #END_CONSTRUCTOR pass @@ -174,64 +178,88 @@ def status(self, ctx): def run_job(self, ctx, params): """ - Start a new job (long running method of service registered in ServiceRegistery). - Such job runs Docker image for this service in script mode. - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. 
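For reference, a minimal standalone sketch (not the full module) of the docstring-as-default-message pattern introduced in exceptions.py above; the class names mirror the diff, and the behaviour in the comments follows directly from `msg or self.__doc__`:

# Standalone sketch of the pattern used by the ee2 exception base classes above.
class ExecutionEngineException(Exception):
    """Base Class for ee2 exceptions"""

    def __init__(self, msg=None, *args, **kwargs):
        # Fall back to the subclass docstring when no explicit message is given
        super().__init__(msg or self.__doc__, *args, **kwargs)


class RecordNotFoundException(ExecutionEngineException):
    """Raised if ee2 job or ee2 job log record is not found in db"""


try:
    raise RecordNotFoundException()
except RecordNotFoundException as e:
    print(e)  # prints the docstring: "Raised if ee2 job or ee2 job log record is not found in db"

try:
    raise RecordNotFoundException("job abc123 was not found")  # hypothetical job id
except RecordNotFoundException as e:
    print(e)  # an explicit message overrides the docstring default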
- 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String + Start a new job. + :param params: instance of type "RunJobParams" (method - the SDK + method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. 
This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
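As a concrete illustration of the RunJobParams structure documented here, a hedged sketch of a run_job payload including the admin-only job_requirements block; all values are hypothetical, and note that the generated structure listing below spells the memory field "requst_memory".

# Hypothetical run_job payload matching the RunJobParams description above.
run_job_params = {
    "method": "KBaseTrees.construct_species_tree",   # module.method format
    "app_id": "KBaseTrees/construct_species_tree",
    "params": [{"input_genome_refs": ["6/90/4"]}],   # arbitrary per-method params
    "service_ver": "release",
    "source_ws_objects": ["6/90/4"],                 # UPA format
    "meta": {"run_id": "run-1", "token_id": "tok-1", "tag": "release", "cell_id": "cell-1"},
    "wsid": 12345,
    # Admin-only; requires full EE2 administration rights and is not returned on queries.
    "job_requirements": {
        "request_cpus": 4,
        "request_memory": 2000,   # MB; the generated listing spells this key "requst_memory"
        "request_disk": 100,      # GB
        "client_group": "njs",
        "client_group_regex": 0,
        "ignore_concurrency_limits": 0,
        "debug_mode": 0,
    },
    "as_admin": 1,
}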
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) :returns: instance of type "job_id" (A job id.) """ # ctx is the context object # return variables are: job_id #BEGIN run_job mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients = self.clients, job_permission_cache=self.job_permission_cache, - admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util + admin_permissions_cache=self.admin_permissions_cache, ) - job_id = mr.run_job(params) + job_id = mr.run_job(params, as_admin=bool(params.get(_AS_ADMIN))) #END run_job # At some point might do deeper type checking... @@ -243,66 +271,101 @@ def run_job(self, ctx, params): def run_job_batch(self, ctx, params, batch_params): """ - :param params: instance of list of type "RunJobParams" (method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String - :param batch_params: instance of type "BatchParams" -> structure: - parameter "wsid" of Long + Run a batch job, consisting of a parent job and one or more child jobs. + Note that the as_admin parameters in the list of child jobs are ignored - + only the as_admin parameter in the batch_params is considered. + :param params: instance of list of type "RunJobParams" (method - the + SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. 
token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) + :param batch_params: instance of type "BatchParams" (Additional + parameters for a batch job. wsid: the workspace with which to + associate the parent job. as_admin: run the job with full EE2 + permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights.) -> structure: + parameter "wsid" of Long, parameter "as_admin" of type "boolean" + (@range [0,1]) :returns: instance of type "BatchSubmission" -> structure: parameter - "parent_job_id" of type "job_id" (A job id.), parameter - "child_job_ids" of list of type "job_id" (A job id.) + "batch_id" of type "job_id" (A job id.), parameter "child_job_ids" + of list of type "job_id" (A job id.) """ # ctx is the context object # return variables are: job_ids #BEGIN run_job_batch mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients = self.clients, job_permission_cache=self.job_permission_cache, - admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util + admin_permissions_cache=self.admin_permissions_cache ) - job_ids = mr.run_job_batch(params, batch_params) + job_ids = mr.run_job_batch( + params, batch_params, as_admin=bool(batch_params.get(_AS_ADMIN))) #END run_job_batch # At some point might do deeper type checking... 
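A hedged sketch of a batch submission as documented above: a list of RunJobParams plus BatchParams, where as_admin is honored only on batch_params and child jobs omit wsid (per InvalidParameterForBatch). The ids in the expected return value are hypothetical.

# Hypothetical run_job_batch inputs matching the documentation above.
params = [
    {"method": "ModuleA.method_one", "app_id": "ModuleA/app_one", "params": [{}]},
    {"method": "ModuleB.method_two", "app_id": "ModuleB/app_two", "params": [{}]},
]  # one RunJobParams per child job; no per-child wsid in batch mode
batch_params = {"wsid": 12345, "as_admin": 0}  # wsid associates the batch parent with a workspace

# Expected BatchSubmission shape:
# {"batch_id": "60a1b2c3d4e5f6a7b8c9d0aa",
#  "child_job_ids": ["60a1b2c3d4e5f6a7b8c9d0e1", "60a1b2c3d4e5f6a7b8c9d0e2"]}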
@@ -312,25 +375,101 @@ def run_job_batch(self, ctx, params, batch_params): # return the results return [job_ids] + def retry_job(self, ctx, params): + """ + #TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present + #TODO Add retry child that checks the status of the child? to prevent multiple retries + Allowed Jobs + Regular Job with no children + Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call + Not Allowed + Regular Job with children (Should not be possible to create yet) + Batch Job Parent Container (Not a job, it won't do anything, except cancel it's child jobs) + :param params: instance of type "RetryParams" (job_id of job to retry + as_admin: retry someone elses job in your namespace #TODO Possibly + Add JobRequirements job_requirements;) -> structure: parameter + "job_id" of type "job_id" (A job id.), parameter "as_admin" of + type "boolean" (@range [0,1]) + :returns: instance of type "RetryResult" (job_id of retried job + retry_id: job_id of the job that was launched str error: reason as + to why that particular retry failed (available for bulk retry + only)) -> structure: parameter "job_id" of type "job_id" (A job + id.), parameter "retry_id" of type "job_id" (A job id.), parameter + "error" of String + """ + # ctx is the context object + # return variables are: retry_result + #BEGIN retry_job + mr = SDKMethodRunner( + user_clients=self.gen_cfg.get_user_clients(ctx), + clients = self.clients, + job_permission_cache=self.job_permission_cache, + admin_permissions_cache=self.admin_permissions_cache + ) + retry_result = mr.retry(job_id=params.get('job_id'), as_admin=params.get('as_admin')) + #END retry_job + + # At some point might do deeper type checking... + if not isinstance(retry_result, dict): + raise ValueError('Method retry_job return value ' + + 'retry_result is not type dict as required.') + # return the results + return [retry_result] + + def retry_jobs(self, ctx, params): + """ + Same as retry_job, but accepts multiple jobs + :param params: instance of type "BulkRetryParams" (job_ids of job to + retry as_admin: retry someone else's job in your namespace #TODO: + Possibly Add list job_requirements;) -> + structure: parameter "job_ids" of list of type "job_id" (A job + id.), parameter "as_admin" of type "boolean" (@range [0,1]) + :returns: instance of list of type "RetryResult" (job_id of retried + job retry_id: job_id of the job that was launched str error: + reason as to why that particular retry failed (available for bulk + retry only)) -> structure: parameter "job_id" of type "job_id" (A + job id.), parameter "retry_id" of type "job_id" (A job id.), + parameter "error" of String + """ + # ctx is the context object + # return variables are: retry_result + #BEGIN retry_jobs + mr = SDKMethodRunner( + user_clients=self.gen_cfg.get_user_clients(ctx), + clients = self.clients, + job_permission_cache=self.job_permission_cache, + admin_permissions_cache=self.admin_permissions_cache + ) + retry_result = mr.retry_multiple(job_ids=params.get('job_ids'), as_admin=params.get('as_admin')) + #END retry_jobs + + # At some point might do deeper type checking... 
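To make the retry surface concrete, a hedged sketch of the RetryParams / BulkRetryParams inputs and the RetryResult shape described above (all ids hypothetical). Per CannotRetryJob, only errored or cancelled jobs, and not batch parents, can be retried.

# Hypothetical payloads for the retry endpoints described above.
retry_params = {"job_id": "60a1b2c3d4e5f6a7b8c9d0e1", "as_admin": 0}  # retry_job
bulk_retry_params = {                                                 # retry_jobs
    "job_ids": ["60a1b2c3d4e5f6a7b8c9d0e1", "60a1b2c3d4e5f6a7b8c9d0e2"],
    "as_admin": 0,
}

# RetryResult: the original job_id, the retry_id of the newly launched job, and
# (bulk retry only) an error string when that particular retry could not be launched.
example_retry_result = {"job_id": "60a1b2c3d4e5f6a7b8c9d0e1",
                        "retry_id": "60a1b2c3d4e5f6a7b8c9d0f9"}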
+ if not isinstance(retry_result, list): + raise ValueError('Method retry_jobs return value ' + + 'retry_result is not type list as required.') + # return the results + return [retry_result] + def abandon_children(self, ctx, params): """ :param params: instance of type "AbandonChildren" -> structure: - parameter "parent_job_id" of type "job_id" (A job id.), parameter + parameter "batch_id" of type "job_id" (A job id.), parameter "child_job_ids" of list of type "job_id" (A job id.), parameter "as_admin" of type "boolean" (@range [0,1]) :returns: instance of type "BatchSubmission" -> structure: parameter - "parent_job_id" of type "job_id" (A job id.), parameter - "child_job_ids" of list of type "job_id" (A job id.) + "batch_id" of type "job_id" (A job id.), parameter "child_job_ids" + of list of type "job_id" (A job id.) """ # ctx is the context object # return variables are: parent_and_child_ids #BEGIN abandon_children mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, - admin_permissions_cache=self.admin_permissions_cache, mongo_util=self.mongo_util + admin_permissions_cache=self.admin_permissions_cache, ) - parent_and_child_ids = mr.abandon_children(parent_job_id=params['parent_job_id'], + parent_and_child_ids = mr.abandon_children(batch_id=params['batch_id'], child_job_ids=params['child_job_ids'], as_admin=params.get('as_admin')) #END abandon_children @@ -344,72 +483,106 @@ def abandon_children(self, ctx, params): def run_job_concierge(self, ctx, params, concierge_params): """ - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' 
and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String + :param params: instance of type "RunJobParams" (method - the SDK + method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. 
cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) :param concierge_params: instance of type "ConciergeParams" (EE2Constants Concierge Params are request_cpus: int - request_memory: int in MB request_disk: int in MB job_priority: + request_memory: int in MB request_disk: int in GB job_priority: int = None range from -20 to +20, with higher values meaning - better priority. account_group: str = None # Someone elses account - requirements_list: list = None ['machine=worker102','color=red'] - client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can - leave default or specify a clientgroup) -> structure: parameter - "request_cpu" of Long, parameter "request_memory_mb" of Long, - parameter "request_disk_mb" of Long, parameter "job_priority" of - Long, parameter "account_group" of String, parameter - "requirements_list" of list of String, parameter "client_group" of - String + better priority. Note: job_priority is currently not implemented. + account_group: str = None # Someone elses account + ignore_concurrency_limits: ignore any limits on simultaneous job + runs. Default 1 (True). requirements_list: list = None + ['machine=worker102','color=red'] client_group: Optional[str] = + CONCIERGE_CLIENTGROUP # You can leave default or specify a + clientgroup client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. debug_mode: Whether to run the job in debug mode. Default + 0 (False).) 
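As an illustration of the ConciergeParams described above, a hedged sketch with hypothetical values; note the prose says request_cpus while the generated structure listing below uses request_cpu, and that memory is given in MB and disk in GB.

# Hypothetical ConciergeParams payload based on the description above.
concierge_params = {
    "request_cpu": 8,                 # the prose above calls this request_cpus
    "request_memory": 16000,          # MB
    "request_disk": 100,              # GB
    "job_priority": 10,               # -20..+20; currently not implemented per the docs
    "ignore_concurrency_limits": 1,   # default 1 (True)
    "requirements_list": ["machine=worker102", "color=red"],
    # client_group omitted: defaults to CONCIERGE_CLIENTGROUP
    "client_group_regex": 1,
    "debug_mode": 0,                  # default 0 (False)
}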
-> structure: parameter "request_cpu" of Long, + parameter "request_memory" of Long, parameter "request_disk" of + Long, parameter "job_priority" of Long, parameter "account_group" + of String, parameter "ignore_concurrency_limits" of type "boolean" + (@range [0,1]), parameter "requirements_list" of list of String, + parameter "client_group" of String, parameter "client_group_regex" + of type "boolean" (@range [0,1]), parameter "debug_mode" of type + "boolean" (@range [0,1]) :returns: instance of type "job_id" (A job id.) """ # ctx is the context object # return variables are: job_id #BEGIN run_job_concierge mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) job_id = mr.run_job_concierge(params=params,concierge_params=concierge_params) #END run_job_concierge @@ -427,62 +600,84 @@ def get_job_params(self, ctx, params): necessary for job execution @optional as_admin) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "RunJobParams" (method - service defined - in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String + :returns: instance of type "RunJobParams" (method - the SDK method to + run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) """ # ctx is the context object # return variables are: params #BEGIN get_job_params mr = SDKMethodRunner( - self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) params = mr.get_job_params(job_id=params['job_id'], as_admin=params.get('as_admin')) #END get_job_params @@ -507,12 +702,10 @@ def update_job_status(self, ctx, params): # return variables are: job_id #BEGIN update_job_status mr = SDKMethodRunner( - self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) job_id = mr.update_job_status(job_id=params['job_id'], status=params['status'], @@ -546,12 +739,10 @@ def add_job_logs(self, ctx, params, lines): # return variables are: results #BEGIN add_job_logs mr = SDKMethodRunner( - self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) add_job_logs = mr.add_job_logs(job_id=params['job_id'], log_lines=lines, as_admin=params.get('as_admin')) @@ -598,12 +789,10 @@ def get_job_logs(self, ctx, params): raise ValueError("Please provide only one of skip_lines or offset") mr = SDKMethodRunner( - 
self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) returnVal = mr.view_job_logs( job_id=params["job_id"], @@ -640,12 +829,10 @@ def finish_job(self, ctx, params): # ctx is the context object #BEGIN finish_job mr = SDKMethodRunner( - self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) mr.finish_job( job_id=params["job_id"], @@ -670,12 +857,10 @@ def start_job(self, ctx, params): # ctx is the context object #BEGIN start_job mr = SDKMethodRunner( - self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) mr.start_job( params["job_id"], skip_estimation=params.get("skip_estimation", True), @@ -699,93 +884,130 @@ def check_job(self, ctx, params): id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) ## TODO - verify - updated - int - timestamp since epoch in milliseconds of the last - time the status was updated running - int - timestamp since epoch - in milliseconds of when it entered the running state created - int - - timestamp since epoch in milliseconds when the job was created - finished - int - timestamp since epoch in milliseconds when the - job was finished status - string - status of the job. one of the - following: created - job has been created in the service - estimating - an estimation job is running to estimate resources - required for the main job, and which queue should be used queued - - job is queued to be run running - job is running on a worker node - completed - job was completed successfully error - job is no - longer running, but failed with an error terminated - job is no - longer running, terminated either due to user cancellation, admin - cancellation, or some automated task error_code - int - internal - reason why the job is an error. one of the following: 0 - unknown - 1 - job crashed 2 - job terminated by automation 3 - job ran over - time limit 4 - job was missing its automated output document 5 - - job authentication token expired errormsg - string - message (e.g. - stacktrace) accompanying an errored job error - object - the - JSON-RPC error package that accompanies the error code and message - terminated_code - int - internal reason why a job was terminated, - one of: 0 - user cancellation 1 - admin cancellation 2 - - terminated by some automatic process @optional error @optional + job_output - object - outputs from the job (from the run_job call) + ## TODO - verify updated - int - timestamp since epoch in + milliseconds of the last time the status was updated running - int + - timestamp since epoch in milliseconds of when it entered the + running state created - int - timestamp since epoch in + milliseconds when the job was created finished - int - timestamp + since epoch in milliseconds when the job was finished status - + string - status of the job. 
one of the following: created - job + has been created in the service estimating - an estimation job is + running to estimate resources required for the main job, and which + queue should be used queued - job is queued to be run running - + job is running on a worker node completed - job was completed + successfully error - job is no longer running, but failed with an + error terminated - job is no longer running, terminated either due + to user cancellation, admin cancellation, or some automated task + error_code - int - internal reason why the job is an error. one of + the following: 0 - unknown 1 - job crashed 2 - job terminated by + automation 3 - job ran over time limit 4 - job was missing its + automated output document 5 - job authentication token expired + errormsg - string - message (e.g. stacktrace) accompanying an + errored job error - object - the JSON-RPC error package that + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional error_code @optional errormsg @optional terminated_code @optional estimating @optional running @optional finished) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. 
This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. 
Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
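To pull together the JobState fields called out above, a hedged, heavily trimmed sketch of a check_job result for a child job that was created as a retry (ids and timestamps hypothetical):

# Hypothetical, trimmed JobState per the field descriptions above.
job_state = {
    "job_id": "60a1b2c3d4e5f6a7b8c9d0f9",
    "user": "someuser",
    "wsid": 12345,
    "status": "completed",
    "created": 1620000000000,     # epoch milliseconds
    "running": 1620000005000,
    "finished": 1620000090000,
    "updated": 1620000090000,
    # Fields surfaced by this change set:
    "batch_id": "60a1b2c3d4e5f6a7b8c9d0aa",      # parent, when created via run_job_batch
    "batch_job": False,                          # True only on the batch parent container
    "child_jobs": [],                            # only the parent container lists child ids
    "retry_ids": [],                             # jobs retried based off of this job
    "retry_count": 0,                            # derived from len(retry_ids)
    "retry_parent": "60a1b2c3d4e5f6a7b8c9d0e1",  # job this retry was based on
    "scheduler_type": "htcondor",
    "scheduler_id": "1234567.0",
}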
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "batch_id" + of String """ # ctx is the context object # return variables are: job_state #BEGIN check_job mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) job_state = mr.check_job( params["job_id"], exclude_fields=params.get("exclude_fields", None), @@ -810,15 +1032,17 @@ def check_job_batch(self, ctx, params): "job_id" of type "job_id" (A job id.), parameter "exclude_fields" of list of String, parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobBatchResults" (parent_job - state - of parent job job_states - states of child jobs aggregate_states - - count of all available child job states, even if they are zero) -> - structure: parameter "parent_job" of type "JobState" (job_id - - string - id of the job user - string - user who started the job - wsid - int - optional id of the workspace where the job is bound - authstrat - string - what strategy used to authenticate the job - job_input - object - inputs to the job (from the run_job call) ## - TODO - verify updated - int - timestamp since epoch in + :returns: instance of type "CheckJobBatchResults" (batch_jobstate - + state of parent job of the batch child_jobstates - states of child + jobs IDEA: ADD aggregate_states - count of all available child job + states, even if they are zero) -> structure: parameter + "batch_jobstate" of type "JobState" (job_id - string - id of the + job user - string - user who started the job wsid - int - optional + id of the workspace where the job is bound authstrat - string - + what strategy used to authenticate the job job_input - object - + inputs to the job (from the run_job call) ## TODO - verify + job_output - object - outputs from the job (from the run_job call) + ## TODO - verify updated - int - timestamp since epoch in milliseconds of the last time the status was updated running - int - timestamp since epoch in milliseconds of when it entered the running state created - int - timestamp since epoch in @@ -838,163 +1062,233 @@ def check_job_batch(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. 
stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. 
Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "job_states" of - list of type "JobState" (job_id - string - id of the job user - - string - user who started the job wsid - int - optional id of the - workspace where the job is bound authstrat - string - what - strategy used to authenticate the job job_input - object - inputs - to the job (from the run_job call) ## TODO - verify updated - int - - timestamp since epoch in milliseconds of the last time the - status was updated running - int - timestamp since epoch in - milliseconds of when it entered the running state created - int - - timestamp since epoch in milliseconds when the job was created - finished - int - timestamp since epoch in milliseconds when the - job was finished status - string - status of the job. one of the - following: created - job has been created in the service - estimating - an estimation job is running to estimate resources - required for the main job, and which queue should be used queued - - job is queued to be run running - job is running on a worker node - completed - job was completed successfully error - job is no - longer running, but failed with an error terminated - job is no - longer running, terminated either due to user cancellation, admin - cancellation, or some automated task error_code - int - internal - reason why the job is an error. 
one of the following: 0 - unknown - 1 - job crashed 2 - job terminated by automation 3 - job ran over - time limit 4 - job was missing its automated output document 5 - - job authentication token expired errormsg - string - message (e.g. - stacktrace) accompanying an errored job error - object - the - JSON-RPC error package that accompanies the error code and message - terminated_code - int - internal reason why a job was terminated, - one of: 0 - user cancellation 1 - admin cancellation 2 - - terminated by some automatic process @optional error @optional + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. 
client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "batch_id" + of String, parameter "child_jobstates" of list of type "JobState" + (job_id - string - id of the job user - string - user who started + the job wsid - int - optional id of the workspace where the job is + bound authstrat - string - what strategy used to authenticate the + job job_input - object - inputs to the job (from the run_job call) + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in + milliseconds when the job was created finished - int - timestamp + since epoch in milliseconds when the job was finished status - + string - status of the job. one of the following: created - job + has been created in the service estimating - an estimation job is + running to estimate resources required for the main job, and which + queue should be used queued - job is queued to be run running - + job is running on a worker node completed - job was completed + successfully error - job is no longer running, but failed with an + error terminated - job is no longer running, terminated either due + to user cancellation, admin cancellation, or some automated task + error_code - int - internal reason why the job is an error. one of + the following: 0 - unknown 1 - job crashed 2 - job terminated by + automation 3 - job ran over time limit 4 - job was missing its + automated output document 5 - job authentication token expired + errormsg - string - message (e.g. stacktrace) accompanying an + errored job error - object - the JSON-RPC error package that + accompanies the error code and message #TODO, add these to the + structure? 
condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional error_code @optional errormsg @optional terminated_code @optional estimating @optional running @optional finished) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
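
As a hedged sketch of how a caller might use the batch- and retry-related JobState fields listed above (batch_id, batch_job, child_jobs, retry_ids, retry_count): the result shape follows the CheckJobBatchResults structure in this docstring, while the `ee2` client instance and the batch_id variable are assumed placeholders.

    # Sketch only: `ee2` is assumed to be an instance of the generated EE2 client,
    # and batch_id is a placeholder for the id of a batch parent job.
    result = ee2.check_job_batch({"job_id": batch_id, "exclude_fields": ["job_input"]})
    parent = result["batch_jobstate"]            # state of the batch parent container
    print(parent["status"], parent.get("child_jobs", []))
    for child in result["child_jobstates"]:
        # Each child points back at its parent via batch_id; retries are tracked
        # through retry_ids/retry_count and retry_parent.
        print(child["job_id"], child.get("batch_id"), child.get("retry_count", 0))
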
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long, parameter "aggregate_states" - of unspecified object + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. 
tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "batch_id" + of String """ # ctx is the context object # return variables are: returnVal #BEGIN check_job_batch mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) returnVal = mr.check_job_batch( - parent_job_id=params["job_id"], exclude_fields=params.get("exclude_fields", None), + batch_id=params["job_id"], exclude_fields=params.get("exclude_fields", None), as_admin=params.get('as_admin') ) #END check_job_batch @@ -1022,10 +1316,11 @@ def check_jobs(self, ctx, params): the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - 
timestamp since epoch in + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in milliseconds when the job was created finished - int - timestamp since epoch in milliseconds when the job was finished status - string - status of the job. one of the following: created - job @@ -1042,74 +1337,109 @@ def check_jobs(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. 
Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
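
A minimal sketch of the corresponding check_jobs call: "job_ids" matches what the implementation below reads, while "exclude_fields" and "as_admin" follow the pattern of the sibling check_* methods and, like the `ee2` client instance and the placeholder job ids, are assumptions here. "job_states" is assumed to be the list field of the unchanged CheckJobsResults return type.

    # Sketch only: job_id_1/job_id_2 and the `ee2` client are placeholders.
    check_jobs_params = {
        "job_ids": [job_id_1, job_id_2],
        "exclude_fields": ["job_input"],   # trim the large job_input block per job
        "as_admin": 0,
    }
    for state in ee2.check_jobs(check_jobs_params)["job_states"]:
        print(state["job_id"], state["status"], state.get("retry_count", 0))
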
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. 
cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "batch_id" + of String """ # ctx is the context object # return variables are: returnVal #BEGIN check_jobs mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) returnVal = mr.check_jobs( params.get("job_ids"), @@ -1143,10 +1473,11 @@ def check_workspace_jobs(self, ctx, params): the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - 
timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in milliseconds when the job was created finished - int - timestamp since epoch in milliseconds when the job was finished status - string - status of the job. one of the following: created - job @@ -1163,73 +1494,110 @@ def check_workspace_jobs(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. 
- repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. 
The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
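
For the check_workspace_jobs method this hunk documents, a minimal usage sketch: the parameter names match what the implementation below passes through (workspace_id, exclude_fields, as_admin), while the `ee2` client instance and the workspace id are assumptions.

    # Sketch only: list every job bound to a single workspace.
    ws_jobs = ee2.check_workspace_jobs({
        "workspace_id": 12345,             # assumed workspace id
        "exclude_fields": ["job_input"],
        "as_admin": 0,                     # 1 requires EE2 admin rights
    })
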
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "batch_id" + of String """ # ctx is the context object # return variables are: returnVal #BEGIN check_workspace_jobs - mr = SDKMethodRunner(self.config, user_id=ctx["user_id"], token=ctx["token"], - mongo_util=self.mongo_util) + mr = SDKMethodRunner( + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, + ) returnVal = mr.check_workspace_jobs( params.get("workspace_id"), exclude_fields=params.get("exclude_fields", None), @@ -1260,12 +1628,10 @@ def cancel_job(self, ctx, params): # ctx is the context object #BEGIN cancel_job mr = SDKMethodRunner( - self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) mr.cancel_job( @@ -1300,8 +1666,8 @@ def check_job_canceled(self, ctx, params): # return variables are: result #BEGIN check_job_canceled mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) result = mr.check_job_canceled(job_id=params["job_id"], as_admin=params.get('as_admin')) #END check_job_canceled @@ -1326,12 +1692,10 @@ def get_job_status(self, ctx, params): # return variables are: result #BEGIN get_job_status mr = SDKMethodRunner( - self.config, - user_id=ctx.get("user_id"), - token=ctx.get("token"), + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, job_permission_cache=self.job_permission_cache, admin_permissions_cache=self.admin_permissions_cache, - mongo_util=self.mongo_util ) result = mr.get_job_status_field(job_id=params['job_id'], as_admin=params.get('as_admin')) #END get_job_status @@ -1347,14 +1711,30 @@ def check_jobs_date_range_for_user(self, ctx, params): """ :param params: instance of type "CheckJobsDateRangeParams" (Check job for all jobs in a given date/time range for all users (Admin - function) float start_time; # Filter based on creation timestamp - since epoch float end_time; # Filter based on creation timestamp - since epoch list projection; # A list of fields to include - in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, - such as error_code=1, wsid=1234, terminated_code = 1 int limit; # - The maximum number of records to return string user; # Optional. 
- Defaults off of your token @optional projection @optional filter + function) Notes on start_time and end_time: These fields are + designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: - if the field is a + float or a string that contains a float and only a float, the + field value is treated as seconds since the epoch. - if the field + is an int or a string that contains an int and only an int, the + field value is treated as milliseconds since the epoch. - if the + field is a string not matching the criteria above, it is treated + as a date and time. Nearly any unambigous format can be parsed. + float start_time - Filter based on job creation timestamp since + epoch float end_time - Filter based on job creation timestamp + since epoch list projection - A list of fields to include + in the projection, default ALL See "Projection Fields" above + list filter - DEPRECATED: this field may change or be + removed in the future. A list of simple filters to "AND" together, + such as error_code=1, wsid=1234, terminated_code = 1 int limit - + The maximum number of records to return string user - The user + whose job records will be returned. Optional. Default is the + current user. int offset - the number of jobs to skip before + returning records. boolean ascending - true to sort by job ID + ascending, false descending. boolean as_admin - true to run the + query as an admin; user must have admin EE2 permissions. Required + if setting `user` to something other than your own. TODO: this + seems to have no effect @optional projection @optional filter @optional limit @optional user @optional offset @optional ascending) -> structure: parameter "start_time" of Double, parameter "end_time" of Double, parameter "projection" of list of @@ -1362,17 +1742,43 @@ def check_jobs_date_range_for_user(self, ctx, params): Long, parameter "user" of String, parameter "offset" of Long, parameter "ascending" of type "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs could be mapping or list) -> - structure: parameter "job_states" of list of type "JobState" + :returns: instance of type "CheckJobsDateRangeResults" (Projection + Fields user = StringField(required=True) authstrat = StringField( + required=True, default="kbaseworkspace", + validation=valid_authstrat ) wsid = IntField(required=False) + status = StringField(required=True, validation=valid_status) + updated = DateTimeField(default=datetime.datetime.utcnow, + autonow=True) estimating = DateTimeField(default=None) # Time + when job began estimating running = DateTimeField(default=None) # + Time when job started # Time when job finished, errored out, or + was terminated by the user/admin finished = + DateTimeField(default=None) errormsg = StringField() msg = + StringField() error = DynamicField() terminated_code = + IntField(validation=valid_termination_code) error_code = + IntField(validation=valid_errorcode) scheduler_type = + StringField() scheduler_id = StringField() scheduler_estimator_id + = StringField() job_input = EmbeddedDocumentField(JobInput, + required=True) job_output = DynamicField() /* /* Results of + check_jobs_date_range methods. jobs - the jobs matching the query, + up to `limit` jobs. count - the number of jobs returned. + query_count - the number of jobs that matched the filters. filter + - DEPRECATED - this field may change in the future. The filters + that were applied to the jobs. 
skip - the number of jobs that were + skipped prior to beginning to return jobs. projection - the list + of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. sort_order - the + order in which the results were sorted by the job ID - + for + ascending, - for descending. TODO: DOCUMENT THE RETURN OF STATS + mapping) -> structure: parameter "jobs" of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in milliseconds when the job was created finished - int - timestamp since epoch in milliseconds when the job was finished status - string - status of the job. one of the following: created - job @@ -1389,74 +1795,112 @@ def check_jobs_date_range_for_user(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - + accompanies the error code and message #TODO, add these to the + structure? condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. 
Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. 
cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "batch_id" + of String, parameter "count" of Long, parameter "query_count" of + Long, parameter "filter" of mapping from String to String, + parameter "skip" of Long, parameter "projection" of list of + String, parameter "limit" of Long, parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal #BEGIN check_jobs_date_range_for_user mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) returnVal = mr.check_jobs_date_range_for_user( creation_start_time=params.get("start_time"), @@ -1482,14 +1926,30 @@ def check_jobs_date_range_for_all(self, ctx, params): """ :param params: instance of type "CheckJobsDateRangeParams" (Check job for all jobs in a given date/time range for all users (Admin - function) float start_time; # Filter based on creation timestamp - since epoch float end_time; # Filter based on creation timestamp - since epoch list projection; # A list of fields to include - in the projection, 
default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, - such as error_code=1, wsid=1234, terminated_code = 1 int limit; # - The maximum number of records to return string user; # Optional. - Defaults off of your token @optional projection @optional filter + function) Notes on start_time and end_time: These fields are + designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: - if the field is a + float or a string that contains a float and only a float, the + field value is treated as seconds since the epoch. - if the field + is an int or a string that contains an int and only an int, the + field value is treated as milliseconds since the epoch. - if the + field is a string not matching the criteria above, it is treated + as a date and time. Nearly any unambigous format can be parsed. + float start_time - Filter based on job creation timestamp since + epoch float end_time - Filter based on job creation timestamp + since epoch list projection - A list of fields to include + in the projection, default ALL See "Projection Fields" above + list filter - DEPRECATED: this field may change or be + removed in the future. A list of simple filters to "AND" together, + such as error_code=1, wsid=1234, terminated_code = 1 int limit - + The maximum number of records to return string user - The user + whose job records will be returned. Optional. Default is the + current user. int offset - the number of jobs to skip before + returning records. boolean ascending - true to sort by job ID + ascending, false descending. boolean as_admin - true to run the + query as an admin; user must have admin EE2 permissions. Required + if setting `user` to something other than your own. TODO: this + seems to have no effect @optional projection @optional filter @optional limit @optional user @optional offset @optional ascending) -> structure: parameter "start_time" of Double, parameter "end_time" of Double, parameter "projection" of list of @@ -1497,17 +1957,43 @@ def check_jobs_date_range_for_all(self, ctx, params): Long, parameter "user" of String, parameter "offset" of Long, parameter "ascending" of type "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs could be mapping or list) -> - structure: parameter "job_states" of list of type "JobState" + :returns: instance of type "CheckJobsDateRangeResults" (Projection + Fields user = StringField(required=True) authstrat = StringField( + required=True, default="kbaseworkspace", + validation=valid_authstrat ) wsid = IntField(required=False) + status = StringField(required=True, validation=valid_status) + updated = DateTimeField(default=datetime.datetime.utcnow, + autonow=True) estimating = DateTimeField(default=None) # Time + when job began estimating running = DateTimeField(default=None) # + Time when job started # Time when job finished, errored out, or + was terminated by the user/admin finished = + DateTimeField(default=None) errormsg = StringField() msg = + StringField() error = DynamicField() terminated_code = + IntField(validation=valid_termination_code) error_code = + IntField(validation=valid_errorcode) scheduler_type = + StringField() scheduler_id = StringField() scheduler_estimator_id + = StringField() job_input = EmbeddedDocumentField(JobInput, + required=True) job_output = DynamicField() /* /* Results of + check_jobs_date_range methods. 
jobs - the jobs matching the query, + up to `limit` jobs. count - the number of jobs returned. + query_count - the number of jobs that matched the filters. filter + - DEPRECATED - this field may change in the future. The filters + that were applied to the jobs. skip - the number of jobs that were + skipped prior to beginning to return jobs. projection - the list + of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. sort_order - the + order in which the results were sorted by the job ID - + for + ascending, - for descending. TODO: DOCUMENT THE RETURN OF STATS + mapping) -> structure: parameter "jobs" of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the job job_input - object - inputs to the job (from the run_job call) - ## TODO - verify updated - int - timestamp since epoch in - milliseconds of the last time the status was updated running - int - - timestamp since epoch in milliseconds of when it entered the - running state created - int - timestamp since epoch in + ## TODO - verify job_output - object - outputs from the job (from + the run_job call) ## TODO - verify updated - int - timestamp since + epoch in milliseconds of the last time the status was updated + running - int - timestamp since epoch in milliseconds of when it + entered the running state created - int - timestamp since epoch in milliseconds when the job was created finished - int - timestamp since epoch in milliseconds when the job was finished status - string - status of the job. one of the following: created - job @@ -1524,74 +2010,112 @@ def check_jobs_date_range_for_all(self, ctx, params): automated output document 5 - job authentication token expired errormsg - string - message (e.g. stacktrace) accompanying an errored job error - object - the JSON-RPC error package that - accompanies the error code and message terminated_code - int - - internal reason why a job was terminated, one of: 0 - user - cancellation 1 - admin cancellation 2 - terminated by some - automatic process @optional error @optional error_code @optional - errormsg @optional terminated_code @optional estimating @optional - running @optional finished) -> structure: parameter "job_id" of - type "job_id" (A job id.), parameter "user" of String, parameter - "authstrat" of String, parameter "wsid" of Long, parameter - "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - + accompanies the error code and message #TODO, add these to the + structure? 
condor_job_ads - dict - condor related job information + retry_count - int - generated field based on length of retry_ids + retry_ids - list - list of jobs that are retried based off of this + job retry_parent - str - job_id of the parent this retry is based + off of. Not available on a retry_parent itself batch_id - str - + the parent of the job, if the job is a child job created via + run_job_batch batch_job - bool - whether or not this is a batch + parent container child_jobs - array - Only parent container should + have child job ids scheduler_type - str - scheduler, such as awe + or condor scheduler_id - str - scheduler generated id + scheduler_estimator_id - str - id for the job spawned for + estimation terminated_code - int - internal reason why a job was + terminated, one of: 0 - user cancellation 1 - admin cancellation 2 + - terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. 
cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "batch_id" + of String, parameter "count" of Long, parameter "query_count" of + Long, parameter "filter" of mapping from String to String, + parameter "skip" of Long, parameter "projection" of list of + String, parameter "limit" of Long, parameter "sort_order" of String """ # ctx is the context object # return variables are: returnVal #BEGIN check_jobs_date_range_for_all mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) returnVal = mr.check_jobs_date_range_for_user( creation_start_time=params.get("start_time"), @@ -1624,8 +2148,8 @@ def handle_held_job(self, ctx, cluster_id): # return variables are: returnVal #BEGIN handle_held_job mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) returnVal = mr.handle_held_job(cluster_id=cluster_id) #END handle_held_job @@ -1646,8 +2170,8 @@ def is_admin(self, 
ctx): # return variables are: returnVal #BEGIN is_admin mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) returnVal = mr.check_is_admin() #END is_admin @@ -1663,7 +2187,7 @@ def get_admin_permission(self, ctx): """ Check if current user has ee2 admin rights. If so, return the type of rights and their roles - :returns: instance of type "AdminRolesResults" (str permission; # One + :returns: instance of type "AdminRolesResults" (str permission - One of 'r|w|x' (('read' | 'write' | 'none'))) -> structure: parameter "permission" of String """ @@ -1671,8 +2195,8 @@ def get_admin_permission(self, ctx): # return variables are: returnVal #BEGIN get_admin_permission mr = SDKMethodRunner( - self.config, user_id=ctx.get("user_id"), token=ctx.get("token"), - mongo_util=self.mongo_util + user_clients=self.gen_cfg.get_user_clients(ctx), + clients=self.clients, ) returnVal = mr.get_admin_permission() #END get_admin_permission @@ -1692,6 +2216,7 @@ def get_client_groups(self, ctx): # ctx is the context object # return variables are: client_groups #BEGIN get_client_groups + # TODO I think this needs to be actually extracted from the config file client_groups = ['njs', 'bigmem', 'bigmemlong', 'extreme', 'concierge', 'hpc', 'kb_upload', 'terabyte', 'multi_tb', 'kb_upload_bulk'] #END get_client_groups diff --git a/lib/execution_engine2/execution_engine2Server.py b/lib/execution_engine2/execution_engine2Server.py index e160802d9..b63fe2210 100644 --- a/lib/execution_engine2/execution_engine2Server.py +++ b/lib/execution_engine2/execution_engine2Server.py @@ -122,7 +122,12 @@ def _call_method(self, ctx, request): newerr = JSONServerError() newerr.trace = traceback.format_exc() if len(e.args) == 1: - newerr.data = repr(e.args[0]) + # THIS WAS CHANGED INTENTIONALLY - if you recompile please restore. + # repr adds single quotes around string arguments which is not what we want. 
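A minimal sketch of the behaviour the preceding comment describes, using a hypothetical exception message: repr() wraps string arguments in quotes, so passing the raw string through spares clients the extra quoting.

    err = ValueError("Job abc123 not found")           # hypothetical error
    arg = err.args[0]
    assert repr(arg) == "'Job abc123 not found'"       # repr() adds single quotes
    assert arg == "Job abc123 not found"               # raw string, what clients should see
    data = arg if isinstance(arg, str) else repr(arg)  # mirrors the intent of the patched branch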
+ if type(e.args[0]) == str: + newerr.data = e.args[0] + else: + newerr.data = repr(e.args[0]) else: newerr.data = repr(e.args) raise newerr @@ -395,6 +400,18 @@ def __init__(self): self.method_authentication[ "execution_engine2.run_job_batch" ] = "required" # noqa + self.rpc_service.add( + impl_execution_engine2.retry_job, + name="execution_engine2.retry_job", + types=[dict], + ) + self.method_authentication["execution_engine2.retry_job"] = "required" # noqa + self.rpc_service.add( + impl_execution_engine2.retry_jobs, + name="execution_engine2.retry_jobs", + types=[dict], + ) + self.method_authentication["execution_engine2.retry_jobs"] = "required" # noqa self.rpc_service.add( impl_execution_engine2.abandon_children, name="execution_engine2.abandon_children", diff --git a/lib/execution_engine2/sdk/EE2Authentication.py b/lib/execution_engine2/sdk/EE2Authentication.py index 94ded3964..649d6ab1f 100644 --- a/lib/execution_engine2/sdk/EE2Authentication.py +++ b/lib/execution_engine2/sdk/EE2Authentication.py @@ -3,8 +3,8 @@ from cachetools import TTLCache from lib.execution_engine2.authorization.authstrategy import can_read_job, can_write_job -from lib.execution_engine2.authorization.roles import AdminAuthUtil from lib.execution_engine2.db.models.models import Job +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE class JobPermissions(Enum): @@ -30,15 +30,14 @@ def get_cache(cache, size, expire): return cache def _lookup_admin_permissions(self): - aau = AdminAuthUtil(self.sdkmr.auth_url, self.sdkmr.admin_roles) - p = aau.get_admin_role( + p = self.sdkmr.auth_admin.get_admin_role( token=self.sdkmr.token, - read_role=self.sdkmr.ADMIN_READ_ROLE, - write_role=self.sdkmr.ADMIN_WRITE_ROLE, + read_role=ADMIN_READ_ROLE, + write_role=ADMIN_WRITE_ROLE, ) - if p == self.sdkmr.ADMIN_READ_ROLE: + if p == ADMIN_READ_ROLE: return AdminPermissions.READ - elif p == self.sdkmr.ADMIN_WRITE_ROLE: + elif p == ADMIN_WRITE_ROLE: return AdminPermissions.WRITE else: return AdminPermissions.NONE @@ -149,16 +148,12 @@ def test_job_permissions( perm = False try: if level.value == JobPermissions.READ.value: - perm = can_read_job( - job, self.sdkmr.user_id, self.sdkmr.token, self.sdkmr.config - ) + perm = can_read_job(job, self.sdkmr.user_id, self.sdkmr.workspace_auth) self._update_job_permission_cache( job_id, self.sdkmr.user_id, level, perm ) elif level.value == JobPermissions.WRITE.value: - perm = can_write_job( - job, self.sdkmr.user_id, self.sdkmr.token, self.sdkmr.config - ) + perm = can_write_job(job, self.sdkmr.user_id, self.sdkmr.workspace_auth) self._update_job_permission_cache( job_id, self.sdkmr.user_id, level, perm ) diff --git a/lib/execution_engine2/sdk/EE2Constants.py b/lib/execution_engine2/sdk/EE2Constants.py index dff073399..a821ee33f 100644 --- a/lib/execution_engine2/sdk/EE2Constants.py +++ b/lib/execution_engine2/sdk/EE2Constants.py @@ -1,8 +1,20 @@ -from dataclasses import dataclass -from typing import Optional, NamedTuple +from typing import NamedTuple +# May want to make this configurable. Hardcoded for now as we want concierge data to be owned +# by this user. +# An alternative approach would be to configure a kbaseconcierge token in the config, and then +# specify an auth2 role that allows users to replace their token with the kbaseconcierge token +# when running jobs. Needs more thought. 
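Stepping back to the start_time/end_time parsing rules documented in the CheckJobsDateRangeParams docstrings earlier in this section, a small illustrative sketch of the three accepted forms; the epoch values are illustrative and assume the range 2021-03-01 to 2021-03-02 UTC.

    # Three equivalent ways to express the same range, per the documented rules:
    params_seconds = {"start_time": 1614556800.0, "end_time": 1614643200.0}    # floats -> seconds since epoch
    params_millis = {"start_time": 1614556800000, "end_time": 1614643200000}   # ints -> milliseconds since epoch
    params_strings = {"start_time": "2021-03-01T00:00:00Z",
                      "end_time": "2021-03-02T00:00:00Z"}                      # strings -> parsed as dates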
KBASE_CONCIERGE_USERNAME = "kbaseconcierge" -CONCIERGE_CLIENTGROUP = "kbase_concierge" +CONCIERGE_CLIENTGROUP = "concierge" + +EE2_CONFIG_SECTION = "execution_engine2" +EE2_DEFAULT_SECTION = "DEFAULT" +EE2_DEFAULT_CLIENT_GROUP = "default_client_group" + +# these also probably should be configurable. +ADMIN_READ_ROLE = "EE2_ADMIN_RO" +ADMIN_WRITE_ROLE = "EE2_ADMIN" class JobError(NamedTuple): @@ -10,16 +22,3 @@ class JobError(NamedTuple): message: str code: int error: str - - -@dataclass() -class ConciergeParams: - """ Set requested params. If you don't specify CG, its automatically set for you""" - - request_cpus: int - request_memory: int - request_disk: int - job_priority: int = None - account_group: str = None - requirements_list: list = None - client_group: Optional[str] = CONCIERGE_CLIENTGROUP diff --git a/lib/execution_engine2/sdk/EE2Logs.py b/lib/execution_engine2/sdk/EE2Logs.py index daca2347e..be04acd78 100644 --- a/lib/execution_engine2/sdk/EE2Logs.py +++ b/lib/execution_engine2/sdk/EE2Logs.py @@ -1,8 +1,8 @@ from enum import Enum from typing import Dict, NamedTuple -from lib.execution_engine2.db.models.models import JobLog as JLModel, LogLines -from lib.execution_engine2.exceptions import RecordNotFoundException +from execution_engine2.db.models.models import JobLog as JLModel, LogLines +from execution_engine2.exceptions import RecordNotFoundException # if TYPE_CHECKING: @@ -23,7 +23,6 @@ class AddLogResult(NamedTuple): class EE2Logs: def __init__(self, sdkmr): self.sdkmr = sdkmr - self.mongo_util = self.sdkmr.get_mongo_util() def _format_job_logs(self, record_position, log_lines): @@ -49,14 +48,13 @@ def _create_new_log(self, pk, log_lines: list): :param log_lines: The lines to add to this log :return: """ - with self.mongo_util.mongo_engine_connection(): - jl = JLModel() - jl.primary_key = pk - jl.original_line_count = 0 - jl.stored_line_count = 0 - jl.lines = self._format_job_logs(record_position=-1, log_lines=log_lines) - jl.original_line_count = jl.stored_line_count = len(log_lines) - jl.save() + jl = JLModel() + jl.primary_key = pk + jl.original_line_count = 0 + jl.stored_line_count = 0 + jl.lines = self._format_job_logs(record_position=-1, log_lines=log_lines) + jl.original_line_count = jl.stored_line_count = len(log_lines) + jl.save() return jl def _add_first_logs(self, log_lines, job_id): @@ -71,12 +69,12 @@ def _add_first_logs(self, log_lines, job_id): return AddLogResult(success=True, stored_line_count=log.stored_line_count) def _add_subsequent_logs(self, job_log, log_lines): - """ Add logs to an existing log entry """ + """Add logs to an existing log entry""" formatted_logs = self._format_job_logs( record_position=job_log["stored_line_count"] - 1, log_lines=log_lines ) record_count = int(job_log["stored_line_count"]) + len(formatted_logs) - slc = self.mongo_util._push_job_logs( + slc = self.sdkmr.mongo_util._push_job_logs( formatted_logs, job_id=job_log["_id"], record_count=record_count ) return AddLogResult(success=True, stored_line_count=slc) @@ -106,10 +104,9 @@ def add_job_logs(self, job_id, log_lines, as_admin=False) -> AddLogResult: self.sdkmr.get_job_with_permission( job_id, JobPermissions.WRITE, as_admin=as_admin ) - self.sdkmr.logger.debug(f"About to add logs for {job_id}") try: try: - job_log = self.mongo_util.get_job_log_pymongo(job_id) + job_log = self.sdkmr.mongo_util.get_job_log_pymongo(job_id) except RecordNotFoundException: return self._add_first_logs(log_lines=log_lines, job_id=job_id) return self._add_subsequent_logs(job_log, log_lines) @@ -145,7 
+142,7 @@ def _get_job_logs(self, job_id, skip_lines, limit=None) -> Dict: :return: """ - log = self.mongo_util.get_job_log_pymongo(job_id) + log = self.sdkmr.mongo_util.get_job_log_pymongo(job_id) lines = [] last_line_number = 0 count = len(log.get("lines", [])) diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index 5ffcf01d6..ec6d3952c 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -5,11 +5,13 @@ """ import os +import threading import time +from collections import Counter from enum import Enum -from typing import Optional, Dict, NamedTuple, Union, List +from typing import Optional, Dict, NamedTuple, Union, List, Any -from lib.execution_engine2.db.models.models import ( +from execution_engine2.db.models.models import ( Job, JobInput, Meta, @@ -18,9 +20,48 @@ ErrorCode, TerminatedCode, ) -from lib.execution_engine2.sdk.EE2Constants import ConciergeParams -from lib.execution_engine2.utils.CondorTuples import CondorResources -from lib.execution_engine2.utils.KafkaUtils import KafkaCreateJob, KafkaQueueChange +from execution_engine2.exceptions import ( + IncorrectParamsException, + AuthError, + CannotRetryJob, + RetryFailureException, + InvalidParameterForBatch, +) +from execution_engine2.sdk.EE2Constants import CONCIERGE_CLIENTGROUP +from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, + JobRequirements as ResolvedRequirements, + AppInfo, + UserCreds, +) +from execution_engine2.utils.KafkaUtils import KafkaCreateJob, KafkaQueueChange +from execution_engine2.utils.job_requirements_resolver import ( + REQUEST_CPUS, + REQUEST_DISK, + REQUEST_MEMORY, + CLIENT_GROUP, + CLIENT_GROUP_REGEX, + BILL_TO_USER, + IGNORE_CONCURRENCY_LIMITS, + DEBUG_MODE, +) +from execution_engine2.utils.job_requirements_resolver import RequirementsType + +_JOB_REQUIREMENTS = "job_reqs" +_JOB_REQUIREMENTS_INCOMING = "job_requirements" +_SCHEDULER_REQUIREMENTS = "scheduler_requirements" +_META = "meta" # narrative_cell_info +_APP_PARAMS = "params" # application parameters +_REQUIREMENTS_LIST = "requirements_list" +_METHOD = "method" +_APP_ID = "app_id" +_BATCH_ID = "batch_id" +_PARENT_JOB_ID = "parent_job_id" +_PARENT_RETRY_JOB_ID = "retry_parent" +_RETRY_IDS = "retry_ids" +_WORKSPACE_ID = "wsid" +_SOURCE_WS_OBJECTS = "source_ws_objects" +_SERVICE_VER = "service_ver" class JobPermissions(Enum): @@ -34,6 +75,11 @@ class PreparedJobParams(NamedTuple): job_id: str +class JobIdPair(NamedTuple): + job_id: str + scheduler_id: str + + from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -44,87 +90,90 @@ class EE2RunJob: def __init__(self, sdkmr): self.sdkmr = sdkmr # type: SDKMethodRunner self.override_clientgroup = os.environ.get("OVERRIDE_CLIENT_GROUP", None) - self.logger = self.sdkmr.logger + self.logger = self.sdkmr.get_logger() def _init_job_rec( - self, - user_id: str, - params: Dict, - resources: CondorResources = None, - concierge_params: ConciergeParams = None, - ) -> str: + self, user_id: str, params: Dict, save: bool = True + ) -> Union[str, Job]: + f""" + Save an initial job record to the db and send a message to kafka + + *** Expected OPTIONAL Parameters *** + {_WORKSPACE_ID} (The workspace id) + {_APP_PARAMS} (job params for the app/method itself) + {_SERVICE_VER} (app version) + {_APP_ID} (app UI) + {_SOURCE_WS_OBJECTS} (collected workspace objects for this app) + {_BATCH_ID} (parent of the job for EE2 batch jobs, the parent should be updated) + {_PARENT_JOB_ID} (parent of this job, 
doesn't update/notify the parent) + {_META} (narrative cell information) + + *** Expected REQUIRED Parameters *** + {_METHOD} (The app method to run) + {_JOB_REQUIREMENTS} (Job Resource information) + """ job = Job() inputs = JobInput() job.user = user_id job.authstrat = "kbaseworkspace" - job.wsid = params.get("wsid") + job.wsid = params.get(_WORKSPACE_ID) job.status = "created" # Inputs inputs.wsid = job.wsid - inputs.method = params.get("method") + + required_job_inputs = [_JOB_REQUIREMENTS, _METHOD] + for item in required_job_inputs: + if item not in params: + raise ValueError(f"{item} is required for job initialization") + + inputs.method = params[_METHOD] inputs.params = params.get("params") - params["service_ver"] = self._get_module_git_commit( - params.get("method"), params.get("service_ver") + # Catalog git commit + params[_SERVICE_VER] = self.sdkmr.get_catalog_cache().lookup_git_commit_version( + method=params.get(_METHOD), service_ver=params.get(_SERVICE_VER) ) - inputs.service_ver = params.get("service_ver") + inputs.service_ver = params.get(_SERVICE_VER) + inputs.app_id = params.get(_APP_ID) + inputs.source_ws_objects = params.get(_SOURCE_WS_OBJECTS) + + parent_job_id = params.get(_PARENT_JOB_ID) + if parent_job_id: + inputs.parent_job_id = str(parent_job_id) - inputs.app_id = params.get("app_id") - inputs.source_ws_objects = params.get("source_ws_objects") - inputs.parent_job_id = str(params.get("parent_job_id")) inputs.narrative_cell_info = Meta() - meta = params.get("meta") + # Meta and Requirements + meta = params.get(_META) if meta: - for meta_attr in ["run_id", "token_id", "tag", "cell_id", "status"]: + for meta_attr in ["run_id", "token_id", "tag", "cell_id"]: inputs.narrative_cell_info[meta_attr] = meta.get(meta_attr) - - if resources: - # TODO Should probably do some type checking on these before its passed in - jr = JobRequirements() - if concierge_params: - jr.cpu = concierge_params.request_cpus - jr.memory = concierge_params.request_memory - jr.disk = concierge_params.request_disk - jr.clientgroup = concierge_params.client_group - else: - jr.clientgroup = resources.client_group - if self.override_clientgroup: - jr.clientgroup = self.override_clientgroup - jr.cpu = resources.request_cpus - jr.memory = resources.request_memory[:-1] # Memory always in mb - jr.disk = resources.request_disk[:-2] # Space always in gb - - inputs.requirements = jr - - job.job_input = inputs - self.logger.debug(job.job_input.to_mongo().to_dict()) - - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - self.logger.debug(job.to_mongo().to_dict()) - job.save() - - self.sdkmr.kafka_client.send_kafka_message( - message=KafkaCreateJob(job_id=str(job.id), user=user_id) - ) - - return str(job.id) - - def _get_module_git_commit(self, method, service_ver=None) -> Optional[str]: - module_name = method.split(".")[0] - - if not service_ver: - service_ver = "release" - - self.logger.debug(f"Getting commit for {module_name} {service_ver}") - - module_version = self.sdkmr.catalog_utils.catalog.get_module_version( - {"module_name": module_name, "version": service_ver} + resolved_reqs = params[_JOB_REQUIREMENTS] # type: ResolvedRequirements + jr = JobRequirements( + cpu=resolved_reqs.cpus, + memory=resolved_reqs.memory_MB, + disk=resolved_reqs.disk_GB, + clientgroup=resolved_reqs.client_group, ) + inputs.requirements = jr + job.job_input = inputs - git_commit_hash = module_version.get("git_commit_hash") - - return git_commit_hash + f""" + Set the id of the parent that was retried to get this job + 
The {_PARENT_RETRY_JOB_ID} will only be set on a job retry + """ + parent_retry_job_id = params.get(_PARENT_RETRY_JOB_ID) + if parent_retry_job_id: + job.retry_parent = str(parent_retry_job_id) + job.batch_id = str(params.get(_BATCH_ID)) if params.get(_BATCH_ID) else None + + if save: + job_id = self.sdkmr.save_job(job) + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaCreateJob(job_id=job_id, user=user_id) + ) + return job_id + return job def _check_ws_objects(self, source_objects) -> None: """ @@ -138,6 +187,7 @@ def _check_ws_objects(self, source_objects) -> None: ) paths = info.get("paths") + # TODO It would be nice to show which object is inaccessible if None in paths: raise ValueError("Some workspace object is inaccessible") @@ -152,15 +202,20 @@ def _check_workspace_permissions(self, wsid): ) def _check_workspace_permissions_list(self, wsids): - perms = self.sdkmr.get_workspace_auth().can_write_list(wsids) - bad_ws = [key for key in perms.keys() if perms[key] is False] - if bad_ws: - self.logger.debug( - f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {bad_ws}." - ) - raise PermissionError( - f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {bad_ws}." + # TODO Cover this in tests once you can execute multiple independent runs + unique_not_none_not_zero_wsids = [wsid for wsid in set(wsids) if wsid] + if unique_not_none_not_zero_wsids: + perms = self.sdkmr.get_workspace_auth().can_write_list( + unique_not_none_not_zero_wsids ) + bad_ws = [key for key in perms.keys() if perms[key] is False] + if bad_ws: + self.logger.debug( + f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {bad_ws}." + ) + raise PermissionError( + f"User {self.sdkmr.user_id} doesn't have permission to run jobs in workspace {bad_ws}." + ) def _finish_created_job( self, job_id, exception, error_code=None, error_message=None @@ -185,50 +240,161 @@ def _finish_created_job( error=f"{exception}", ) - def _prepare_to_run(self, params, concierge_params=None) -> PreparedJobParams: + def _generate_job_submission_params(self, job_id, params): + return JobSubmissionParameters( + job_id, + AppInfo(params[_METHOD], params.get(_APP_ID)), + params[_JOB_REQUIREMENTS], + UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), + # a job should have a parent ID or a batch ID or nothing, but never both + # Do we want to distinguish between the two cases in the sub params? + # It's informational only for Condor + parent_job_id=params.get(_BATCH_ID) or params.get(_PARENT_JOB_ID), + wsid=params.get(_WORKSPACE_ID), + source_ws_objects=params.get(_SOURCE_WS_OBJECTS), + ) + + def _prepare_to_run(self, params, concierge_params=None) -> JobSubmissionParameters: """ - Creates a job record, grabs info about the objects, - checks the catalog resource requirements, and submits to condor + Creates a job record and creates the job submission params """ - # perform sanity checks before creating job - self._check_ws_objects(source_objects=params.get("source_ws_objects")) - method = params.get("method") - # Normalize multiple formats into one format (csv vs json) - normalized_resources = self.sdkmr.catalog_utils.get_normalized_resources(method) - # These are for saving into job inputs. Maybe its best to pass this into condor as well? 
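As a side note on the reworked _check_workspace_permissions_list above: only unique, truthy workspace ids are forwarded to can_write_list, so duplicates, None, and 0 are dropped first. A tiny sketch of that filtering with made-up ids:

    wsids = [42, None, 42, 0, 7]
    unique_not_none_not_zero_wsids = [wsid for wsid in set(wsids) if wsid]
    # e.g. [42, 7] (set order is not guaranteed); only these ids reach can_write_list()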
- extracted_resources = self.sdkmr.get_condor().extract_resources( - cgrr=normalized_resources - ) # type: CondorResources - # insert initial job document into db - job_id = self._init_job_rec( - self.sdkmr.user_id, params, extracted_resources, concierge_params + job_id = self._init_job_rec(self.sdkmr.get_user_id(), params) + self.logger.debug( + f"User {self.sdkmr.get_user_id()} attempting to run job {params[_METHOD]} {params}" ) + return self._generate_job_submission_params(job_id, params) + + def _submit_multiple_wrapper(self, job_ids: list, runjob_params: List[Dict]): + # Generate job submission params + job_submission_params = [] + for i, job_id in enumerate(job_ids): + job_submission_params.append( + self._generate_job_submission_params(job_id, runjob_params[i]) + ) + assert job_id == job_submission_params[i].job_id - params["job_id"] = job_id - params["user_id"] = self.sdkmr.user_id - params["token"] = self.sdkmr.token - params["cg_resources_requirements"] = normalized_resources + # Takes 2.5200018882751465 for 100 records, can shave off 2.5 secs by making this async + for job_id in job_ids: + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaCreateJob( + job_id=str(job_id), user=self.sdkmr.get_user_id() + ) + ) + # Submit to Condor + try: + submission_ids = self._submit_multiple(job_submission_params) + return submission_ids + except Exception as e: + self._abort_multiple_jobs(job_ids) + raise e - self.logger.debug( - f"User {self.sdkmr.user_id} attempting to run job {method} {params}" - ) + def _run_multiple(self, runjob_params: List[Dict]): + """ + Get the job records, bulk save them, then submit to condor. + If any condor submission fails, abort all of the jobs + :return: + """ + # Save records to db + job_records = [] + for runjob_param in runjob_params: + job_records.append( + self._init_job_rec(self.sdkmr.get_user_id(), runjob_param, save=False) + ) + job_ids = self.sdkmr.save_jobs(job_records) + + # Start up job submission thread + # For testing, mock this out and check to see it is called with these params? + threading.Thread( + target=self._submit_multiple_wrapper, + kwargs={"runjob_params": runjob_params, "job_ids": job_ids}, + daemon=True, + ).start() + return job_ids + + def _update_to_queued_multiple(self, job_ids, scheduler_ids): + """ + This is called during job submission. 
If a job is terminated during job submission, + we have the chance to re-issue a termination and remove the job from the Job Queue + """ + if len(job_ids) != len(scheduler_ids): + raise Exception( + "Need to provide the same amount of job ids and scheduler_ids" + ) + jobs_to_update = list(map(JobIdPair, job_ids, scheduler_ids)) + self.sdkmr.get_mongo_util().update_jobs_to_queued(jobs_to_update) + jobs = self.sdkmr.get_mongo_util().get_jobs(job_ids) + + for job in jobs: + job_id = str(job.id) + if job.status == Status.queued.value: + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaQueueChange( + job_id=job_id, + new_status=Status.queued.value, + previous_status=Status.created.value, # TODO maybe change this to allow for estimating jobs + scheduler_id=job.scheduler_id, + ) + ) + elif job.status == Status.terminated.value: + # Remove from the queue, now that the scheduler_id is available + # The job record doesn't actually get updated in the db a 2nd time, and this TerminatedCode is only + # used by the initial transition to Terminated + self._safe_cancel(job_id, TerminatedCode.terminated_by_user) - return PreparedJobParams(params=params, job_id=job_id) + def _submit_multiple(self, job_submission_params): + """ + Submit multiple jobs. If any of the submissions are a failure, raise exception in order + to fail all submitted jobs, rather than allowing the submissions to continue + """ + begin = time.time() + job_ids = [] + condor_job_ids = [] + for job_submit_param in job_submission_params: + job_id = job_submit_param.job_id + job_ids.append(job_id) + try: + submission_info = self.sdkmr.get_condor().run_job( + params=job_submit_param + ) + condor_job_id = submission_info.clusterid + except Exception as e: + self.logger.error(e) + self._finish_created_job(job_id=job_id, exception=e) + raise e - def _run(self, params, concierge_params=None): - prepared = self._prepare_to_run( - params=params, concierge_params=concierge_params - ) - params = prepared.params - job_id = prepared.job_id + if submission_info.error is not None and isinstance( + submission_info.error, Exception + ): + self._finish_created_job(exception=submission_info.error, job_id=job_id) + raise submission_info.error + if condor_job_id is None: + error_msg = ( + "Condor job not run, and error not found. 
Something went wrong" + ) + self._finish_created_job( + job_id=job_id, exception=RuntimeError(error_msg) + ) + raise RuntimeError(error_msg) + condor_job_ids.append(condor_job_id) + + self.logger.error(f"It took {time.time() - begin} to submit jobs to condor") + # It took 4.836009502410889 to submit jobs to condor + + update_time = time.time() + self._update_to_queued_multiple(job_ids=job_ids, scheduler_ids=condor_job_ids) + # It took 1.9239885807037354 to update jobs + self.logger.error(f"It took {time.time() - update_time} to update jobs ") + + return job_ids + + def _run(self, params): + job_params = self._prepare_to_run(params=params) + job_id = job_params.job_id try: - submission_info = self.sdkmr.get_condor().run_job( - params=params, concierge_params=concierge_params - ) + submission_info = self.sdkmr.get_condor().run_job(params=job_params) condor_job_id = submission_info.clusterid - self.logger.debug(f"Submitted job id and got '{condor_job_id}'") except Exception as e: self.logger.error(e) self._finish_created_job(job_id=job_id, exception=e) @@ -240,36 +406,29 @@ def _run(self, params, concierge_params=None): self._finish_created_job(exception=submission_info.error, job_id=job_id) raise submission_info.error if condor_job_id is None: - error_msg = "Condor job not ran, and error not found. Something went wrong" + error_msg = "Condor job not run, and error not found. Something went wrong" self._finish_created_job(job_id=job_id, exception=RuntimeError(error_msg)) raise RuntimeError(error_msg) - self.logger.debug( - f"Attempting to update job to queued {job_id} {condor_job_id} {submission_info}" - ) - self.update_job_to_queued(job_id=job_id, scheduler_id=condor_job_id) - self.sdkmr.slack_client.run_job_message( - job_id=job_id, scheduler_id=condor_job_id, username=self.sdkmr.user_id - ) return job_id - def _abort_child_jobs(self, child_job_ids): + def _abort_multiple_jobs(self, job_ids): """ Cancel a list of child jobs, and their child jobs """ - for child_job_id in child_job_ids: + for job_id in job_ids: try: self.sdkmr.cancel_job( - job_id=child_job_id, + job_id=job_id, terminated_code=TerminatedCode.terminated_by_batch_abort.value, ) except Exception as e: # TODO Maybe add a retry here? self.logger.error(f"Couldn't cancel child job {e}") - def _create_parent_job(self, wsid, meta): + def _create_batch_job(self, wsid, meta): """ This creates the parent job for all children to mark as their ancestor :param params: @@ -286,41 +445,36 @@ def _create_parent_job(self, wsid, meta): job_input.narrative_cell_info.token_id = meta.get("token_id") job_input.narrative_cell_info.tag = meta.get("tag") job_input.narrative_cell_info.cell_id = meta.get("cell_id") - job_input.narrative_cell_info.status = meta.get("status") - - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - j = Job( - job_input=job_input, - batch_job=True, - status=Status.created.value, - wsid=wsid, - user=self.sdkmr.user_id, - ) - j.save() - # TODO Do we need a new kafka call? - self.sdkmr.kafka_client.send_kafka_message( + j = Job( + job_input=job_input, + batch_job=True, + status=Status.created.value, + wsid=wsid, + user=self.sdkmr.get_user_id(), + ) + j = self.sdkmr.save_and_return_job(j) + + # TODO Do we need a new kafka call for batch? 
+ self.sdkmr.get_kafka_client().send_kafka_message( message=KafkaCreateJob(job_id=str(j.id), user=j.user) ) return j - def _run_batch(self, parent_job: Job, params): - child_jobs = [] + def _run_batch(self, batch_job: Job, params): + """Add the batch id, save the jobs to the db, run the jobs""" + for job_param in params: - if "parent_job_id" not in job_param: - job_param["parent_job_id"] = str(parent_job.id) - try: - child_jobs.append(str(self._run(params=job_param))) - except Exception as e: - self.logger.debug( - msg=f"Failed to submit child job. Aborting entire batch job {e}" - ) - self._abort_child_jobs(child_jobs) - raise e + job_param[_BATCH_ID] = str(batch_job.id) - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - parent_job.child_jobs = child_jobs - parent_job.save() + child_jobs = self._run_multiple(params) + + # Cancel child jobs if we can't notify the batch job of the child jobs + try: + self.sdkmr.add_child_jobs(batch_job=batch_job, child_jobs=child_jobs) + except Exception as e: + self._abort_multiple_jobs(child_jobs) + raise e return child_jobs @@ -328,69 +482,530 @@ def run_batch( self, params, batch_params, as_admin=False ) -> Dict[str, Union[Job, List[str]]]: """ + Warning: modifies params in place :param params: List of RunJobParams (See Spec File) - :param batch_params: List of Batch Params, such as wsid (See Spec file) + :param batch_params: Mapping of Batch Params, such as {wsid, as_admin} (See Spec file) :param as_admin: Allows you to run jobs in other people's workspaces :return: A list of condor job ids or a failure notification """ - wsid = batch_params.get("wsid") - meta = batch_params.get("meta") + + if type(params) != list: + raise IncorrectParamsException("params must be a list") + + if type(batch_params) != dict: + raise IncorrectParamsException("batch params must be a mapping") + + wsid = batch_params.get(_WORKSPACE_ID) + meta = batch_params.get(_META) + + self._preflight( + runjob_params=params, + batch_params=batch_params, + new_batch_job=True, + as_admin=as_admin, + ) + self._add_job_requirements(params, bool(as_admin)) # as_admin checked above + self._check_job_arguments(params, batch_job=True) + batch_job = self._create_batch_job(wsid=wsid, meta=meta) + children_jobs = self._run_batch(batch_job=batch_job, params=params) + + return {_BATCH_ID: str(batch_job.id), "child_job_ids": children_jobs} + + # modifies the jobs in place + def _add_job_requirements(self, jobs: List[Dict[str, Any]], is_write_admin: bool): + f""" + Adds the job requirements, generated from the job requirements resolver, + to the provided RunJobParams dicts. Expects the required field {_METHOD} in the param + dicts. Looks in the {_JOB_REQUIREMENTS_INCOMING} key for a dictionary containing the + optional keys {REQUEST_CPUS}, {REQUEST_MEMORY}, {REQUEST_DISK}, {CLIENT_GROUP}, + {CLIENT_GROUP_REGEX}, {BILL_TO_USER}, {IGNORE_CONCURRENCY_LIMITS}, + {_SCHEDULER_REQUIREMENTS}, and {DEBUG_MODE}. Adds the {_JOB_REQUIREMENTS} field to the + param dicts, which holds the job requirements object. 
+ """ + # could add a cache in the job requirements resolver to avoid making the same + # catalog call over and over if all the jobs have the same method + jrr = self.sdkmr.get_job_requirements_resolver() + for i, job in enumerate(jobs): + # TODO I feel like a class for just handling error formatting would be useful + # but too much work for a minor benefit + pre = f"Job #{i + 1}: " if len(jobs) > 1 else "" + job_reqs = job.get(_JOB_REQUIREMENTS_INCOMING) or {} + if not isinstance(job_reqs, dict): + raise IncorrectParamsException( + f"{pre}{_JOB_REQUIREMENTS_INCOMING} must be a mapping" + ) + try: + norm = jrr.normalize_job_reqs(job_reqs, "input job") + except IncorrectParamsException as e: + self._rethrow_incorrect_params_with_error_prefix(e, pre) + self._check_job_requirements_vs_admin( + jrr, norm, job_reqs, is_write_admin, pre + ) + + try: + job[_JOB_REQUIREMENTS] = jrr.resolve_requirements( + method=job.get(_METHOD), + catalog_cache=self.sdkmr.get_catalog_cache(), + cpus=norm.get(REQUEST_CPUS), + memory_MB=norm.get(REQUEST_MEMORY), + disk_GB=norm.get(REQUEST_DISK), + client_group=norm.get(CLIENT_GROUP), + client_group_regex=norm.get(CLIENT_GROUP_REGEX), + bill_to_user=job_reqs.get(BILL_TO_USER), + ignore_concurrency_limits=bool( + job_reqs.get(IGNORE_CONCURRENCY_LIMITS) + ), + scheduler_requirements=job_reqs.get(_SCHEDULER_REQUIREMENTS), + debug_mode=norm.get(DEBUG_MODE), + ) + except IncorrectParamsException as e: + self._rethrow_incorrect_params_with_error_prefix(e, pre) + + def _check_job_requirements_vs_admin( + self, jrr, norm, job_reqs, is_write_admin, err_prefix + ): + # just a helper method for _add_job_requirements to make that method a bit shorter. + # treat it as part of that method + try: + perm_type = jrr.get_requirements_type( + cpus=norm.get(REQUEST_CPUS), + memory_MB=norm.get(REQUEST_MEMORY), + disk_GB=norm.get(REQUEST_DISK), + client_group=norm.get(CLIENT_GROUP), + client_group_regex=norm.get(CLIENT_GROUP_REGEX), + # Note that this is never confirmed to be a real user. May want to fix that, but + # since it's admin only... YAGNI + bill_to_user=self._check_is_string( + job_reqs.get(BILL_TO_USER), "bill_to_user" + ), + ignore_concurrency_limits=bool(job_reqs.get(IGNORE_CONCURRENCY_LIMITS)), + scheduler_requirements=job_reqs.get(_SCHEDULER_REQUIREMENTS), + debug_mode=norm.get(DEBUG_MODE), + ) + except IncorrectParamsException as e: + self._rethrow_incorrect_params_with_error_prefix(e, err_prefix) + if perm_type != RequirementsType.STANDARD and not is_write_admin: + raise AuthError( + f"{err_prefix}In order to specify job requirements you must be a full admin" + ) + + def _check_is_string(self, putative_str, name): + if not putative_str: + return None + if type(putative_str) != str: + raise IncorrectParamsException(f"{name} must be a string") + return putative_str + + def _rethrow_incorrect_params_with_error_prefix( + self, error: IncorrectParamsException, error_prefix: str + ): + if not error_prefix: + raise error + raise IncorrectParamsException(f"{error_prefix}{error.args[0]}") from error + + def _check_job_arguments(self, jobs, batch_job=False): + # perform sanity checks before creating any jobs, including the parent job for batch jobs + for i, job in enumerate(jobs): + # Could make an argument checker method, or a class that doesn't require a job id. + # Seems like more code & work for no real benefit though. 
+ # Just create the class for checks, don't use yet + pre = f"Job #{i + 1}: " if len(jobs) > 1 else "" + try: + JobSubmissionParameters( + "fakejobid", + AppInfo(job.get(_METHOD), job.get(_APP_ID)), + job[_JOB_REQUIREMENTS], + UserCreds(self.sdkmr.get_user_id(), self.sdkmr.get_token()), + wsid=job.get(_WORKSPACE_ID), + source_ws_objects=job.get(_SOURCE_WS_OBJECTS), + ) + except IncorrectParamsException as e: + self._rethrow_incorrect_params_with_error_prefix(e, pre) + if batch_job and job.get(_PARENT_JOB_ID): + raise IncorrectParamsException( + f"{pre}batch jobs may not specify a parent job ID" + ) + # This is also an opportunity for caching + # although most likely jobs aren't operating on the same object + self._check_ws_objects(source_objects=job.get(_SOURCE_WS_OBJECTS)) + + @staticmethod + def _retryable(status: str): + return status in [Status.terminated.value, Status.error.value] + + def _safe_cancel( + self, + job_id: str, + terminated_code: TerminatedCode, + ): + try: + self.sdkmr.cancel_job(job_id=job_id, terminated_code=terminated_code.value) + except Exception as e: + self.logger.error(f"Couldn't cancel {job_id} due to {e}") + + def _db_update_failure( + self, job_that_failed_operation: str, job_to_abort: str, exception: Exception + ): + """Attempt to cancel created/queued/running retried job and then raise exception""" + # TODO Use and create a method in sdkmr? + msg = ( + f"Couldn't update job record:{job_that_failed_operation} during retry. Aborting:{job_to_abort}" + f" Exception:{exception} " + ) + self._safe_cancel( + job_id=job_to_abort, + terminated_code=TerminatedCode.terminated_by_server_failure, + ) + # TODO Maybe move this log into multiple so that multiple error messages are not generated + self.logger.error(msg, exc_info=True, stack_info=True) + raise RetryFailureException(msg) + + def _validate_retry_presubmit(self, job_id: str, as_admin: bool = False): + """ + Validate retry request before attempting to contact scheduler + + _validate doesn't do a recursive check if the job has a retry parent, + but the _validate call on the recursion is guaranteed to pass because + the parent was retried once already so the _validate must have passed previously. + Since the parent job's state can't have changed it would just pass again. + """ + + # Check to see if you still have permissions to the job and then optionally the parent job id + job = self.sdkmr.get_job_with_permission( + job_id, JobPermissions.WRITE, as_admin=as_admin + ) # type: Job + + batch_job = None + if job.batch_id: + batch_job = self.sdkmr.get_job_with_permission( + job.batch_id, JobPermissions.WRITE, as_admin=as_admin + ) + + if job.batch_job: + raise CannotRetryJob( + "Cannot retry batch job parents.
Must retry individual jobs" + ) + + if not self._retryable(job.status): + raise CannotRetryJob( + f"Error retrying job {job_id} with status {job.status}: can only retry jobs with status 'error' or 'terminated'" + ) + + return job, batch_job + + def _retry(self, job_id: str, job: Job, batch_job: Job, as_admin: bool = False): + # Cannot retry a retried job, you must retry the retry_parent + if job.retry_parent: + return self.retry(str(job.retry_parent), as_admin=as_admin) + + # Get run job params from db, and inject parent job id, then run it + run_job_params = self._get_run_job_params_from_existing_job( + job, user_id=self.sdkmr.user_id + ) + # Submit job to job scheduler or fail and not count it as a retry attempt + run_job_params[_PARENT_RETRY_JOB_ID] = job_id + retry_job_id = self.run(params=run_job_params, as_admin=as_admin) + + # Save that the job has been retried, and increment the count. Notify the parent(s) + # 1) Notify the batch container that it has a new child. Note that the parent jobs of + # 'manual' batch jobs using the job_input.parent_job_id field *are not* modified to + # include their children, so we don't do that here either. + if batch_job: + try: + batch_job.modify(add_to_set__child_jobs=retry_job_id) + except Exception as e: + self._db_update_failure( + job_that_failed_operation=str(batch_job.id), + job_to_abort=retry_job_id, + exception=e, + ) + + # 2) Notify the retry_parent that it has been retried by adding a retry id + try: + job.modify(add_to_set__retry_ids=retry_job_id) + except Exception as e: + self._db_update_failure( + job_that_failed_operation=str(job.id), + job_to_abort=retry_job_id, + exception=e, + ) + # 3) If the retry_ids is updated and if present, the child_jobs, is updated, set toggle to true + try: + retry_job = self.sdkmr.get_mongo_util().get_job(job_id=retry_job_id) + retry_job.modify(set__retry_saved_toggle=True) + except Exception: + self.logger.error( + f"Couldn't toggle job retry state for {retry_job_id} ", + exc_info=True, + stack_info=True, + ) + + # Should we compare the original and child job to make sure certain fields match, + # to make sure the retried job is correctly submitted? Or save that for a unit test? + return {"job_id": job_id, "retry_id": retry_job_id} + + def retry(self, job_id: str, as_admin=False) -> Dict[str, Optional[str]]: + """ + #TODO Add new job requirements/cgroups as an optional param + :param job_id: The main job to retry + :param as_admin: Run with admin permission + :return: The child job id that has been retried + """ + job, batch_job = self._validate_retry_presubmit( + job_id=job_id, as_admin=as_admin + ) + return self._retry( + job_id=job_id, job=job, batch_job=batch_job, as_admin=as_admin + ) + + def retry_multiple( + self, job_ids, as_admin=False + ) -> List[Dict[str, Union[str, Any]]]: + """ + #TODO Add new job requirements/cgroups as an optional param + #TODO Notify the parent container that it has multiple new children, instead of multiple transactions? + #TODO Prevent retry when multiple batch job containers? + + :param job_ids: The list of jobs to retry + :param as_admin: Run with admin permission + :return: The child job ids that have been retried or errors + """ + if not job_ids: + raise ValueError("No job_ids provided to retry") + + offending_ids = [item for item, count in Counter(job_ids).items() if count > 1] + if offending_ids: + raise ValueError( + f"Retry of the same id in the same request is not supported." 
+ f" Offending ids: {offending_ids} " + ) + + # Check all inputs before attempting to start submitting jobs + retried_jobs = [] + for job_id in job_ids: + # Check for presubmission failures + try: + job, batch_job = self._validate_retry_presubmit( + job_id=job_id, as_admin=as_admin + ) + except Exception as e: + # Collect the presubmit error and don't submit the job + retried_jobs.append({"job_id": job_id, "error": f"{e}"}) + continue + # Presubmit worked, write to the db and submit + try: + retried_jobs.append( + self._retry( + job_id=job_id, + job=job, + batch_job=batch_job, + as_admin=as_admin, + ) + ) + except Exception as e: + retried_jobs.append({"job_id": job_id, "error": f"{e}"}) + + return retried_jobs + + @staticmethod + def _get_run_job_params_from_existing_job(job: Job, user_id: str) -> Dict: + """ + Get top level fields from job model to be sent into `run_job` + """ + ji = job.job_input # type: JobInput + + meta = None + if ji.narrative_cell_info: + meta = ji.narrative_cell_info.to_mongo().to_dict() + + source_ws_objects = list() + if ji.source_ws_objects: + source_ws_objects = list(ji.source_ws_objects) + + run_job_params = { + _WORKSPACE_ID: job.wsid, + _META: meta, + _APP_PARAMS: ji.params or {}, + "user": user_id, # REQUIRED, it runs as the current user + _METHOD: ji.method, # REQUIRED + _APP_ID: ji.app_id, + _SOURCE_WS_OBJECTS: source_ws_objects, # Must be list + _SERVICE_VER: ji.service_ver, + _PARENT_JOB_ID: ji.parent_job_id, + _BATCH_ID: job.batch_id, + } + + # Then the next fields are job inputs top level requirements, app run parameters, and scheduler resource requirements + return run_job_params + + def _check_ws_perms( + self, + runjob_params: Union[dict, list], + new_batch_job: bool, + batch_params: dict, + as_admin: bool = False, + ): + """ + Check a single job, a single batch job, or a retry_multiple request with a mix of different jobs. 
+ """ if as_admin: - self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) + return self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) + # Batch Param runs + if new_batch_job: + if batch_params: + return self._check_workspace_permissions(batch_params.get("wsid")) + # Single job runs + elif isinstance(runjob_params, dict): + return self._check_workspace_permissions(runjob_params.get("wsid")) + # Multiple independent job runs, think retry_multiple() + elif isinstance(runjob_params, list): + return self._check_workspace_permissions_list( + [job_param.get("wsid") for job_param in runjob_params] + ) else: - # Make sure you aren't running a job in someone elses workspace - self._check_workspace_permissions(wsid) - wsids = [job_input.get("wsid", wsid) for job_input in params] - self._check_workspace_permissions_list(wsids) + raise IncorrectParamsException( + "Runjob params must be an instance of a dict, or a list of dicts" + ) - parent_job = self._create_parent_job(wsid=wsid, meta=meta) - children_jobs = self._run_batch(parent_job=parent_job, params=params) - return {"parent_job_id": str(parent_job.id), "child_job_ids": children_jobs} + @staticmethod + def _propagate_wsid_for_new_batch_jobs( + runjob_params: dict, batch_params: dict, new_batch_job: bool + ): + """ + For batch jobs, check to make sure the job params do not provide a wsid other than None + Then Modify the run job params to use the batch params wsid, which may be set to None + """ + if new_batch_job: + batch_wsid = batch_params.get("wsid") if batch_params else None + for runjob_param in runjob_params: + if runjob_param.get("wsid") is not None: + raise InvalidParameterForBatch() + # Do we do a deepcopy here in case the params point to the same obj? + runjob_param["wsid"] = batch_wsid + + def _preflight( + self, + runjob_params: Union[dict, list], + batch_params: dict = None, + new_batch_job: bool = False, + as_admin: bool = False, + ) -> None: + """ + Propagate and check ws permissions for job(s) + :param runjob_params: List of RunJobParams or a single RunJobParams mapping + :param batch_params: Optional mapping for Batch Jobs + :param new_batch_job: Whether or not this is a new batch job + :param as_admin: For checking ws permissions as an admin or not + """ + if batch_params and not new_batch_job: + raise IncorrectParamsException( + "Programming error, you forgot to set the new_batch_job flag to True" + ) + if batch_params == runjob_params: + raise IncorrectParamsException( + "RunJobParams and BatchParams cannot be identical" + ) + + self._propagate_wsid_for_new_batch_jobs( + runjob_params=runjob_params, + batch_params=batch_params, + new_batch_job=new_batch_job, + ) + self._check_ws_perms( + runjob_params=runjob_params, + new_batch_job=new_batch_job, + batch_params=batch_params, + as_admin=as_admin, + ) def run( self, params=None, as_admin=False, concierge_params: Dict = None ) -> Optional[str]: """ - :param params: SpecialRunJobParamsParams object (See spec file) + Warning: modifies params in place :param params: RunJobParams object (See spec file) :param as_admin: Allows you to run jobs in other people's workspaces :param concierge_params: Allows you to specify request_cpu, request_memory, request_disk, clientgroup :return: The condor job id """ - if as_admin: - self.sdkmr.check_as_admin(requested_perm=JobPermissions.WRITE) - else: - self._check_workspace_permissions(params.get("wsid")) + + # TODO Test this + if type(params) != dict: + raise IncorrectParamsException("params must be a mapping") + + 
self._preflight(runjob_params=params, as_admin=as_admin) if concierge_params: - cp = ConciergeParams(**concierge_params) self.sdkmr.check_as_concierge() + # we don't check requirements type because the concierge can do what they like + params[_JOB_REQUIREMENTS] = self._get_job_reqs_from_concierge_params( + params.get(_METHOD), concierge_params + ) else: - cp = None - - return self._run(params=params, concierge_params=cp) + # as_admin checked above + self._add_job_requirements([params], bool(as_admin)) + self._check_job_arguments([params]) + return self._run(params=params) + + def _get_job_reqs_from_concierge_params( + self, method: str, concierge_params: Dict[str, Any] + ) -> ResolvedRequirements: + jrr = self.sdkmr.get_job_requirements_resolver() + norm = jrr.normalize_job_reqs(concierge_params, "concierge parameters") + rl = concierge_params.get(_REQUIREMENTS_LIST) + schd_reqs = {} + if rl: + if type(rl) != list: + raise IncorrectParamsException(f"{_REQUIREMENTS_LIST} must be a list") + for s in rl: + if type(s) != str or "=" not in s: + raise IncorrectParamsException( + f"Found illegal requirement in {_REQUIREMENTS_LIST}: {s}" + ) + key, val = s.split("=") + schd_reqs[key.strip()] = val.strip() + + return jrr.resolve_requirements( + method=method, + catalog_cache=self.sdkmr.get_catalog_cache(), + cpus=norm.get(REQUEST_CPUS), + memory_MB=norm.get(REQUEST_MEMORY), + disk_GB=norm.get(REQUEST_DISK), + client_group=norm.get(CLIENT_GROUP) or CONCIERGE_CLIENTGROUP, + client_group_regex=norm.get(CLIENT_GROUP_REGEX), + # error messaging here is for 'bill_to_user' vs 'account_group' but almost impossible + # to screw up so YAGNI + # Note that this is never confirmed to be a real user. May want to fix that, but + # since it's admin only... YAGNI + bill_to_user=concierge_params.get("account_group"), + # default is to ignore concurrency limits for concierge + ignore_concurrency_limits=bool( + concierge_params.get(IGNORE_CONCURRENCY_LIMITS, 1) + ), + scheduler_requirements=schd_reqs, + debug_mode=norm.get(DEBUG_MODE), + ) def update_job_to_queued(self, job_id, scheduler_id): # TODO RETRY FOR RACE CONDITION OF RUN/CANCEL # TODO PASS QUEUE TIME IN FROM SCHEDULER ITSELF? # TODO PASS IN SCHEDULER TYPE? 
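A hedged usage sketch for the rewritten method below; the ids are made up and `runner` stands in for the EE2RunJob instance that owns this method:

    # the submission path reports that condor accepted the job
    runner.update_job_to_queued(job_id="60a7c1f0aa5a4c2e9c3b9999", scheduler_id="12345")
    # per the new body: status -> queued, queued timestamp set, scheduler_id and
    # scheduler_type recorded, the job saved via sdkmr.save_job(), and a
    # KafkaQueueChange message emitted via sdkmr.get_kafka_client()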
- with self.sdkmr.get_mongo_util().mongo_engine_connection(): - j = self.sdkmr.get_mongo_util().get_job(job_id=job_id) - previous_status = j.status - j.status = Status.queued.value - j.queued = time.time() - j.scheduler_id = scheduler_id - j.scheduler_type = "condor" - j.save() - - self.sdkmr.kafka_client.send_kafka_message( - message=KafkaQueueChange( - job_id=str(j.id), - new_status=j.status, - previous_status=previous_status, - scheduler_id=scheduler_id, - ) + j = self.sdkmr.get_mongo_util().get_job(job_id=job_id) + previous_status = j.status + j.status = Status.queued.value + j.queued = time.time() + j.scheduler_id = scheduler_id + j.scheduler_type = "condor" + self.sdkmr.save_job(j) + + self.sdkmr.get_kafka_client().send_kafka_message( + message=KafkaQueueChange( + job_id=str(j.id), + new_status=j.status, + previous_status=previous_status, + scheduler_id=scheduler_id, ) + ) def get_job_params(self, job_id, as_admin=False): """ @@ -407,12 +1022,14 @@ def get_job_params(self, job_id, as_admin=False): job_input = job.job_input - job_params["method"] = job_input.method + job_params[_METHOD] = job_input.method job_params["params"] = job_input.params job_params["service_ver"] = job_input.service_ver - job_params["app_id"] = job_input.app_id - job_params["wsid"] = job_input.wsid - job_params["parent_job_id"] = job_input.parent_job_id - job_params["source_ws_objects"] = job_input.source_ws_objects + job_params[_APP_ID] = job_input.app_id + job_params[_WORKSPACE_ID] = job_input.wsid + # This is specifically the data in the job params, which includes any manually submitted + # parent job information but does not include batch job information + job_params[_PARENT_JOB_ID] = job_input.parent_job_id + job_params[_SOURCE_WS_OBJECTS] = job_input.source_ws_objects return job_params diff --git a/lib/execution_engine2/sdk/EE2Status.py b/lib/execution_engine2/sdk/EE2Status.py index eb4bfa5f6..053cfb77d 100644 --- a/lib/execution_engine2/sdk/EE2Status.py +++ b/lib/execution_engine2/sdk/EE2Status.py @@ -18,6 +18,7 @@ ErrorCode, TerminatedCode, ) +from execution_engine2.utils.arg_processing import parse_bool from lib.execution_engine2.utils.KafkaUtils import ( KafkaCancelJob, KafkaCondorCommand, @@ -93,7 +94,6 @@ def cancel_job(self, job_id, terminated_code=None, as_admin=False): :param as_admin: Cancel the job for a different user """ # Is it inefficient to get the job twice? Is it cached? - # Maybe if the call fails, we don't actually cancel the job? job = self.sdkmr.get_job_with_permission( job_id, JobPermissions.WRITE, as_admin=as_admin @@ -122,6 +122,7 @@ def cancel_job(self, job_id, terminated_code=None, as_admin=False): ) # TODO Issue #190 IF success['TotalSuccess = 0'] == FALSE, don't send a kafka message?
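The added cancellation call below addresses the scheduler id directly; a small sketch of the id format, assuming HTCondor's usual cluster.process convention:

    scheduler_id = "12345"                  # condor cluster id stored on the job record
    condor_job_id = f"{scheduler_id}.0"     # ".0" targets the single process in the cluster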
+ self.sdkmr.get_condor().cancel_job(job_id=f"{job.scheduler_id}.0") self.sdkmr.kafka_client.send_kafka_message( message=KafkaCancelJob( @@ -354,9 +355,9 @@ def finish_job( ) ) else: - self.sdkmr.logger.debug("Finishing job with a success") + self.sdkmr.get_logger().debug("Finishing job with a success") self._finish_job_with_success(job_id=job_id, job_output=job_output) - self.sdkmr.kafka_client.send_kafka_message( + self.sdkmr.get_kafka_client().send_kafka_message( message=KafkaFinishJob( job_id=str(job_id), new_status=Status.completed.value, @@ -367,15 +368,16 @@ def finish_job( ) ) self._send_exec_stats_to_catalog(job_id=job_id) - self.update_finished_job_with_usage(job_id, as_admin=as_admin) + self._update_finished_job_with_usage(job_id, as_admin=as_admin) - def update_finished_job_with_usage(self, job_id, as_admin=None) -> Dict: + def _update_finished_job_with_usage(self, job_id, as_admin=None) -> Dict: """ # TODO Does this need a kafka message? - :param job_id: - :param as_admin: - :return: + # TODO EE2 issue #251 : The saved job stats are inaccurate: + # The usage is not recorded until the job is completely finished. + :return: Resources at the time the job almost finished. """ + # note this method is replaced by a magic mock in some tests job = self.sdkmr.get_job_with_permission( job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=as_admin ) @@ -389,7 +391,9 @@ def update_finished_job_with_usage(self, job_id, as_admin=None) -> Dict: ) condor = self.sdkmr.get_condor() resources = condor.get_job_resource_info(job_id=job_id) - self.sdkmr.logger.debug(f"Extracted the following condor job ads {resources}") + self.sdkmr.get_logger().debug( + f"Extracted the following condor job ads {resources}" + ) self.sdkmr.get_mongo_util().update_job_resources( job_id=job_id, resources=resources ) @@ -445,7 +449,7 @@ def check_jobs( "Checking for read permission to: {}".format(job_ids) ) perms = can_read_jobs( - jobs, self.sdkmr.user_id, self.sdkmr.token, self.sdkmr.config + jobs, self.sdkmr.user_id, self.sdkmr.workspace_auth ) except RuntimeError as e: self.sdkmr.logger.error( @@ -468,7 +472,9 @@ def check_jobs( else: mongo_rec = job.to_mongo().to_dict() del mongo_rec["_id"] + mongo_rec["retry_count"] = len(job["retry_ids"]) mongo_rec["job_id"] = str(job.id) + mongo_rec["batch_id"] = job.batch_id mongo_rec["created"] = int(job.id.generation_time.timestamp() * 1000) mongo_rec["updated"] = int(job.updated * 1000) if job.estimating: @@ -486,7 +492,7 @@ def check_jobs( {job_id: job_states.get(job_id, []) for job_id in job_ids} ) - if return_list is not None and self.sdkmr.parse_bool_from_string(return_list): + if return_list is not None and parse_bool(return_list): job_states = {"job_states": list(job_states.values())} return job_states @@ -502,8 +508,7 @@ def check_workspace_jobs(self, workspace_id, exclude_fields=None, return_list=No if exclude_fields is None: exclude_fields = [] - ws_auth = self.sdkmr.get_workspace_auth() - if not ws_auth.can_read(workspace_id): + if not self.sdkmr.workspace_auth.can_read(workspace_id): self.sdkmr.logger.debug( f"User {self.sdkmr.user_id} doesn't have permission to read jobs in workspace {workspace_id}." 
) @@ -534,8 +539,13 @@ def _send_exec_stats_to_catalog(self, job_id): log_exec_stats_params = dict() log_exec_stats_params["user_id"] = job.user app_id = job_input.app_id - log_exec_stats_params["app_module_name"] = app_id.split("/")[0] - log_exec_stats_params["app_id"] = app_id + if app_id: + # Note this will not work properly for app_ids incorrectly separated by a '.', + # which happens in some KBase code (which needs to be fixed at some point) - + # notably the narrative data download code, maybe more + # It's been this way for a long time, so leave for now + log_exec_stats_params["app_module_name"] = app_id.split("/")[0] + log_exec_stats_params["app_id"] = app_id method = job_input.method log_exec_stats_params["func_module_name"] = method.split(".")[0] log_exec_stats_params["func_name"] = method.split(".")[-1] @@ -546,16 +556,19 @@ def _send_exec_stats_to_catalog(self, job_id): log_exec_stats_params["is_error"] = int(job.status == Status.error.value) log_exec_stats_params["job_id"] = job_id - self.sdkmr.catalog_utils.catalog.log_exec_stats(log_exec_stats_params) + self.sdkmr.get_catalog().log_exec_stats(log_exec_stats_params) - def abandon_children(self, parent_job_id, child_job_ids, as_admin=False) -> Dict: - if not parent_job_id: - raise ValueError("Please provide valid parent_job id") + def abandon_children(self, batch_id, child_job_ids, as_admin=False) -> Dict: + # Note this does not work for 'manual' batch jobs as the parent job is + # never updated with the child jobs. It will only work with batch jobs specifically + # created by the run_job_batch endpoint. + if not batch_id: + raise ValueError("Please provide valid batch_id") if not child_job_ids: raise ValueError("Please provide job_ids of children to abandon") job = self.sdkmr.get_job_with_permission( - parent_job_id, JobPermissions.WRITE, as_admin=as_admin + batch_id, JobPermissions.WRITE, as_admin=as_admin ) # type: Job for child_job_id in child_job_ids: if child_job_id not in job.child_jobs: @@ -563,11 +576,10 @@ def abandon_children(self, parent_job_id, child_job_ids, as_admin=False) -> Dict f"Couldn't find {child_job_id} in {child_job_ids}" ) - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - job.update(pull_all__child_jobs=child_job_ids) - job.reload() + job.update(pull_all__child_jobs=child_job_ids) + job.reload() - return {"parent_job_id": parent_job_id, "child_jobs": job.child_jobs} + return {"batch_id": batch_id, "child_job_ids": job.child_jobs} def start_job(self, job_id, skip_estimation=True, as_admin=False): """ diff --git a/lib/execution_engine2/sdk/EE2StatusRange.py b/lib/execution_engine2/sdk/EE2StatusRange.py index ae64f2c15..3bd22203c 100644 --- a/lib/execution_engine2/sdk/EE2StatusRange.py +++ b/lib/execution_engine2/sdk/EE2StatusRange.py @@ -1,15 +1,16 @@ from collections import Counter from collections import namedtuple -from datetime import datetime +from datetime import datetime, timezone from enum import Enum from typing import Dict from bson import ObjectId -from lib.execution_engine2.db.models.models import Job -from lib.execution_engine2.exceptions import AuthError +from execution_engine2.utils.arg_processing import parse_bool +from execution_engine2.exceptions import AuthError +# TODO this class is duplicated all over the place, move to common file class JobPermissions(Enum): READ = "r" WRITE = "w" @@ -74,18 +75,14 @@ def check_jobs_date_range_for_user( if offset is None: offset = 0 - if self.sdkmr.token is None: - raise AuthError("Please provide a token to check jobs date range") - - 
token_user = self.sdkmr.auth.get_user(self.sdkmr.token) if user is None: - user = token_user - + user = self.sdkmr.get_user_id() # Admins can view "ALL" or check_jobs for other users - if user != token_user: + elif user != self.sdkmr.get_user_id(): if not self.sdkmr.check_is_admin(): raise AuthError( - f"You are not authorized to view all records or records for others. user={user} token={token_user}" + "You are not authorized to view all records or records for others. " + + f"user={user} token={self.sdkmr.get_user_id()}" ) dummy_ids = self._get_dummy_dates(creation_start_time, creation_end_time) @@ -121,17 +118,12 @@ def check_jobs_date_range_for_user( if user != "ALL": job_filter_temp["user"] = user - with self.sdkmr.get_mongo_util().mongo_engine_connection(): - count = Job.objects.filter(**job_filter_temp).count() - jobs = ( - Job.objects[:limit] - .filter(**job_filter_temp) - .order_by(f"{sort_order}_id") - .skip(offset) - .only(*job_projection) - ) + count = self.sdkmr.get_job_counts(job_filter_temp) + jobs = self.sdkmr.get_jobs( + job_filter_temp, job_projection, sort_order, offset, limit + ) - self.sdkmr.logger.debug( + self.sdkmr.get_logger().debug( f"Searching for jobs with id_gt {dummy_ids.start} id_lt {dummy_ids.stop}" ) @@ -161,6 +153,8 @@ def check_jobs_date_range_for_user( # TODO USE AS_PYMONGO() FOR SPEED # TODO Better define default fields # TODO Instead of SKIP use ID GT LT https://www.codementor.io/arpitbhayani/fast-and-efficient-pagination-in-mongodb-9095flbqr + # ^ this one is important - the workspace was DOSed by a single open narrative at one + # point due to skip abuse, which is why it was removed def _get_dummy_dates(self, creation_start_time, creation_end_time): @@ -170,14 +164,16 @@ def _get_dummy_dates(self, creation_start_time, creation_end_time): ) creation_start_time = self.sdkmr.check_and_convert_time(creation_start_time) - creation_start_date = datetime.fromtimestamp(creation_start_time) + creation_start_date = datetime.fromtimestamp( + creation_start_time, tz=timezone.utc + ) dummy_start_id = ObjectId.from_datetime(creation_start_date) if creation_end_time is None: raise Exception("Please provide a valid end time for when job was created") creation_end_time = self.sdkmr.check_and_convert_time(creation_end_time) - creation_end_date = datetime.fromtimestamp(creation_end_time) + creation_end_date = datetime.fromtimestamp(creation_end_time, tz=timezone.utc) dummy_end_id = ObjectId.from_datetime(creation_end_date) if creation_start_time > creation_end_time: @@ -191,7 +187,7 @@ def get_sort_order(self, ascending): if ascending is None: return "+" else: - if self.sdkmr.parse_bool_from_string(ascending): + if parse_bool(ascending): return "+" else: return "-" @@ -208,9 +204,16 @@ def _job_state_from_jobs(jobs): str(job_id) float(created/queued/estimating/running/finished/updated/) (Time in MS) """ + hidden_keys = ["retry_saved_toggle"] + job_states = [] for job in jobs: mongo_rec = job.to_mongo().to_dict() + + for key in hidden_keys: + if key in mongo_rec: + del mongo_rec[key] + mongo_rec["_id"] = str(job.id) mongo_rec["job_id"] = str(job.id) mongo_rec["created"] = int(job.id.generation_time.timestamp() * 1000) diff --git a/lib/execution_engine2/sdk/SDKMethodRunner.py b/lib/execution_engine2/sdk/SDKMethodRunner.py index 02440f791..350599960 100644 --- a/lib/execution_engine2/sdk/SDKMethodRunner.py +++ b/lib/execution_engine2/sdk/SDKMethodRunner.py @@ -8,33 +8,35 @@ * Clients are only loaded if they are necessary """ -import json -import os import time from datetime 
import datetime from enum import Enum +from logging import Logger +from typing import List import dateutil -from installed_clients.WorkspaceClient import Workspace -from installed_clients.authclient import KBaseAuth -from lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth -from lib.execution_engine2.db.MongoUtil import MongoUtil -from lib.execution_engine2.db.models.models import Job -from lib.execution_engine2.exceptions import AuthError -from lib.execution_engine2.sdk import ( +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.models.models import Job +from execution_engine2.exceptions import AuthError +from execution_engine2.sdk import ( EE2Runjob, EE2StatusRange, EE2Authentication, EE2Status, EE2Logs, ) -from lib.execution_engine2.sdk.EE2Constants import KBASE_CONCIERGE_USERNAME -from lib.execution_engine2.utils.CatalogUtils import CatalogUtils -from lib.execution_engine2.utils.Condor import Condor -from lib.execution_engine2.utils.EE2Logger import get_logger -from lib.execution_engine2.utils.KafkaUtils import KafkaClient -from lib.execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.sdk.EE2Constants import KBASE_CONCIERGE_USERNAME +from execution_engine2.utils.Condor import Condor +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.utils.clients import UserClientSet, ClientSet +from execution_engine2.utils.EE2Logger import get_logger as _get_logger +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient +from installed_clients.CatalogClient import Catalog +from installed_clients.WorkspaceClient import Workspace +from execution_engine2.utils.catalog_cache import CatalogCache class JobPermissions(Enum): @@ -46,6 +48,7 @@ class JobPermissions(Enum): class SDKMethodRunner: """ The execution engine 2 api calls functions from here. 
+ The SDKMR is instantiated per call """ """ @@ -53,35 +56,32 @@ class SDKMethodRunner: """ JOB_PERMISSION_CACHE_SIZE = 500 JOB_PERMISSION_CACHE_EXPIRE_TIME = 300 # seconds - ADMIN_READ_ROLE = "EE2_ADMIN_RO" - ADMIN_WRITE_ROLE = "EE2_ADMIN" def __init__( self, - config, - user_id=None, - token=None, + user_clients: UserClientSet, + clients: ClientSet, job_permission_cache=None, admin_permissions_cache=None, - mongo_util=None, ): - self.deployment_config_fp = os.environ["KB_DEPLOYMENT_CONFIG"] - self.config = config - self.mongo_util = mongo_util - self.condor = None - self.workspace = None - self.workspace_auth = None - self.admin_roles = config.get("admin_roles", ["EE2_ADMIN", "EE2_ADMIN_RO"]) - self.catalog_utils = CatalogUtils( - config["catalog-url"], config["catalog-token"] - ) - self.workspace_url = config.get("workspace-url") - self.auth_url = config.get("auth-url") - self.auth = KBaseAuth(auth_url=config.get("auth-service-url")) - self.user_id = user_id - self.token = token - self.debug = SDKMethodRunner.parse_bool_from_string(config.get("debug")) - self.logger = get_logger() + if not user_clients: + raise ValueError("user_clients is required") + if not clients: + raise ValueError("clients is required") + self.mongo_util = clients.mongo_util + self.condor = clients.condor + self.catalog = clients.catalog + # Cache Instantiated on a per request basis + self.catalog_cache = CatalogCache(catalog=clients.catalog_no_auth) + self.job_requirements_resolver = clients.requirements_resolver + + self.workspace = user_clients.workspace + self.workspace_auth = user_clients.workspace_auth + self.auth = clients.auth + self.auth_admin = clients.auth_admin + self.user_id = user_clients.user_id + self.token = user_clients.token + self.logger = _get_logger() self.job_permission_cache = EE2Authentication.EE2Auth.get_cache( cache=job_permission_cache, @@ -94,17 +94,13 @@ def __init__( expire=self.JOB_PERMISSION_CACHE_EXPIRE_TIME, ) - self.is_admin = False - # self.roles = self.roles_cache.get_roles(user_id,token) or list() self._ee2_runjob = None self._ee2_status = None self._ee2_logs = None self._ee2_status_range = None self._ee2_auth = None - self.kafka_client = KafkaClient(config.get("kafka-host")) - self.slack_client = SlackClient( - config.get("slack-token"), debug=self.debug, endpoint=config.get("ee2-url") - ) + self.kafka_client = clients.kafka_client + self.slack_client = clients.slack_client # Various Clients: TODO: Think about sending in just required clients, not entire SDKMR @@ -133,28 +129,89 @@ def get_jobs_status(self) -> EE2Status.JobsStatus: self._ee2_status = EE2Status.JobsStatus(self) return self._ee2_status + # A note on getters: + # Getters are commonly described as unpythonic. However, accessing instance variables + # directly, rather than via getters, causes significant problems when mocking a class in + # that instance variables cannot be detected by create_autospec with spec_set=True, and thus + # cannot be mocked in a rigorous way. The danger of not using spec_set=True is that if a + # mocked class's API changes, the unit tests will still pass. Thus the choice is between + # unpythonic getters or false positives in unit tests, and we choose the former. + # For more details: https://www.seanh.cc/2017/03/17/the-problem-with-mocks/ + + def get_workspace(self) -> Workspace: + """ + Get the workspace client for this instance of SDKMR. 
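As a hedged illustration of the testing pattern the note on getters above describes (the test-side names here are assumptions, not part of this diff):

    from unittest.mock import create_autospec
    # SDKMethodRunner would be imported from execution_engine2.sdk.SDKMethodRunner in a real test
    sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True)
    sdkmr.get_user_id.return_value = "someuser"
    # with spec_set=True, configuring anything that is not part of the real API
    # (for example a misspelled sdkmr.get_userid) raises AttributeError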
+ """ + return self.workspace + def get_workspace_auth(self) -> WorkspaceAuth: - if self.workspace_auth is None: - self.workspace_auth = WorkspaceAuth( - self.token, self.user_id, self.workspace_url - ) + """ + Get the workspace authorization client for this instance of SDKMR. + """ return self.workspace_auth + def get_logger(self) -> Logger: + """ + Get the logger for this instance of SDKMR. + """ + # There's not really any way to meaningfully test this method without passing in the + # logger, which seems... overkill? + return self.logger + + def get_catalog(self) -> Catalog: + """ + Get the catalog client for this instance of SDKMR. + """ + return self.catalog + + def get_catalog_cache(self) -> CatalogCache: + """ + Get the catalog cache client for this instance of SDKMR. + """ + return self.catalog_cache + + def get_job_requirements_resolver(self) -> JobRequirementsResolver: + """ + Get the job requirements resolver for this instance of SDKMR. + """ + return self.job_requirements_resolver + + def get_kafka_client(self) -> KafkaClient: + """ + Get the Kafka client for this instance of SDKMR. + """ + return self.kafka_client + + def get_slack_client(self) -> SlackClient: + """ + Get the Kafka client for this instance of SDKMR. + """ + return self.slack_client + + def get_user_id(self) -> str: + """ + Get the user id of the user for this instance of SDKMR. + """ + return self.user_id + + def get_token(self) -> str: + """ + Get the token of the user for this instance of SDKMR. + """ + return self.token + def get_mongo_util(self) -> MongoUtil: - if self.mongo_util is None: - self.mongo_util = MongoUtil(self.config) + """ + Get the mongo utilities for this instance of SDKMR. + """ return self.mongo_util def get_condor(self) -> Condor: - if self.condor is None: - self.condor = Condor(self.deployment_config_fp) + """ + Get the Condor interface for this instance of SDKMR + """ return self.condor - def get_workspace(self) -> Workspace: - if self.workspace is None: - self.workspace = Workspace(token=self.token, url=self.workspace_url) - return self.workspace - # Permissions Decorators #TODO Verify these actually work #TODO add as_admin to these def allow_job_read(func): @@ -190,11 +247,75 @@ def check_as_concierge(self): "You are not the concierge user. This method is not for you" ) + # The next few methods allow for unit testing the various EE2*.py classes. + # They could also be moved to the MongoUtil class, but there doesn't appear to be a need + # at this point since MongoEngine creates a global connection to MongoDB + # and makes it available to all the model objects. + + def save_jobs(self, jobs: List[Job]) -> List[str]: + """ + Save multiple jobs to the Mongo DB at once, and return all of the job ids + """ + job_ids = self.get_mongo_util().insert_jobs(jobs_to_insert=jobs) + return [str(job_id) for job_id in job_ids] + + def save_job(self, job: Job) -> str: + """ + Save a job record to the Mongo database and return the job's ID as a string. + """ + job.save() + return str(job.id) + + def add_child_jobs(self, batch_job: Job, child_jobs: List[str]): + """ + Add child jobs to a batch job record in the Mongo Database and return the updated job. + :return: + """ + batch_job.modify(add_to_set__child_jobs=child_jobs) + return batch_job + + def save_and_return_job(self, job: Job) -> Job: + """ + Save a job record to the Mongo database and return the updated job. + """ + job.save() + return job + + def get_job_counts(self, job_filter): + """ + Get the number of jobs matching a filter. 
+ + job_filter - a dict of keys to filter terms in the MongoEngine filter language. + """ + return Job.objects.filter(**job_filter).count() + + def get_jobs(self, job_filter, job_projection, sort_order, offset, limit): + """ + Get jobs from the database. + + job_filter - a dict of keys to filter terms in the MongoEngine filter language. + job_projection - a list of field names to include in the returned jobs. + sort_order - '+' to sort by job ID ascending, '-' descending. + offset - the number of jobs to skip before returning results. + limit - the maximum number of jobs to return. + """ + # TODO Instead of SKIP use ID GT LT + # https://www.codementor.io/arpitbhayani/fast-and-efficient-pagination-in-mongodb-9095flbqr + # ^ this one is important - the workspace was DOSed by a single open narrative at one + # point due to skip abuse, which is why it was removed + return ( + Job.objects[:limit] + .filter(**job_filter) + .order_by(f"{sort_order}_id") + .skip(offset) + .only(*job_projection) + ) + # API ENDPOINTS # ENDPOINTS: Admin Related Endpoints def check_is_admin(self): - """ Authorization Required Read """ + """Authorization Required Read""" # Check whether if at minimum, a read only admin" try: return self.check_as_admin(requested_perm=JobPermissions.READ) @@ -205,67 +326,76 @@ def get_admin_permission(self): return self.get_ee2_auth().retrieve_admin_permissions() # ENDPOINTS: Running jobs and getting job input params + + def retry_multiple(self, job_ids, as_admin=False): + """Authorization Required Read/Write""" + return self.get_runjob().retry_multiple(job_ids=job_ids, as_admin=as_admin) + + def retry(self, job_id, as_admin=False): + """Authorization Required Read/Write""" + return self.get_runjob().retry(job_id=job_id, as_admin=as_admin) + def run_job(self, params, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_runjob().run(params=params, as_admin=as_admin) def run_job_batch(self, params, batch_params, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_runjob().run_batch( params=params, batch_params=batch_params, as_admin=as_admin ) def run_job_concierge(self, params, concierge_params): - """ Authorization Required : Be the kbaseconcierge user """ + """Authorization Required : Be the kbaseconcierge user""" return self.get_runjob().run(params=params, concierge_params=concierge_params) def get_job_params(self, job_id, as_admin=False): - """ Authorization Required: Read """ + """Authorization Required: Read""" return self.get_runjob().get_job_params(job_id=job_id, as_admin=as_admin) # ENDPOINTS: Adding and retrieving Logs def add_job_logs(self, job_id, log_lines, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_job_logs().add_job_logs( job_id=job_id, log_lines=log_lines, as_admin=as_admin ) def view_job_logs(self, job_id, skip_lines=None, as_admin=False, limit=None): - """ Authorization Required Read """ + """Authorization Required Read""" return self.get_job_logs().view_job_logs( job_id=job_id, skip_lines=skip_lines, as_admin=as_admin, limit=limit ) # Endpoints: Changing a job's status def start_job(self, job_id, skip_estimation=True, as_admin=False): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_jobs_status().start_job( job_id=job_id, skip_estimation=skip_estimation, as_admin=as_admin ) # Endpoints: Changing a job's status - 
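For the get_job_counts and get_jobs helpers above, a hedged sketch of the filter and projection arguments; the filter keys follow MongoEngine's field__operator convention and the values here are made up:

    from bson import ObjectId
    job_filter = {
        "user": "someuser",
        "id__gt": ObjectId("5f0000000000000000000000"),   # created after this point in time
        "id__lt": ObjectId("600000000000000000000000"),
    }
    job_projection = ["user", "status", "updated"]
    count = sdkmr.get_job_counts(job_filter)
    jobs = sdkmr.get_jobs(job_filter, job_projection, "+", 0, 100)   # ascending, no offset, max 100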
def abandon_children(self, parent_job_id, child_job_ids, as_admin=False): - """ Authorization Required Read/Write """ + def abandon_children(self, batch_id, child_job_ids, as_admin=False): + """Authorization Required Read/Write""" return self.get_jobs_status().abandon_children( - parent_job_id=parent_job_id, child_job_ids=child_job_ids, as_admin=as_admin + batch_id=batch_id, child_job_ids=child_job_ids, as_admin=as_admin ) def update_job_status(self, job_id, status, as_admin=False): # TODO: Make this an ADMIN ONLY function? Why would anyone need to call this who is not an admin? - """ Authorization Required: Read/Write """ + """Authorization Required: Read/Write""" return self.get_jobs_status().force_update_job_status( job_id=job_id, status=status, as_admin=as_admin ) def cancel_job(self, job_id, terminated_code=None, as_admin=False): # TODO: Cancel Child Jobs as well - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_jobs_status().cancel_job( job_id=job_id, terminated_code=terminated_code, as_admin=as_admin ) def handle_held_job(self, cluster_id): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" if self.check_as_admin(requested_perm=JobPermissions.WRITE): return self.get_jobs_status().handle_held_job( cluster_id=cluster_id, as_admin=True @@ -280,7 +410,7 @@ def finish_job( job_output=None, as_admin=False, ): - """ Authorization Required Read/Write """ + """Authorization Required Read/Write""" return self.get_jobs_status().finish_job( job_id=job_id, @@ -294,7 +424,7 @@ def finish_job( # Endpoints: Checking a job's status def check_job(self, job_id, exclude_fields=None, as_admin=False): - """ Authorization Required: Read """ + """Authorization Required: Read""" check_permission = True if as_admin is True: @@ -308,23 +438,23 @@ def check_job(self, job_id, exclude_fields=None, as_admin=False): ) def check_job_canceled(self, job_id, as_admin=False): - """ Authorization Required: Read """ + """Authorization Required: Read""" return self.get_jobs_status().check_job_canceled( job_id=job_id, as_admin=as_admin ) def get_job_status_field(self, job_id, as_admin=False): - """ Authorization Required: Read """ + """Authorization Required: Read""" return self.get_jobs_status().get_job_status(job_id=job_id, as_admin=as_admin) def check_job_batch( self, - parent_job_id, + batch_id, check_permission=True, exclude_fields=None, as_admin=False, ): - """ Authorization Required: Read """ + """Authorization Required: Read""" if as_admin is True: self.check_as_admin(requested_perm=JobPermissions.READ) @@ -334,7 +464,7 @@ def check_job_batch( raise ValueError("You can't exclude child jobs from this endpoint") parent_job_status = self.get_jobs_status().check_job( - job_id=parent_job_id, + job_id=batch_id, check_permission=check_permission, exclude_fields=exclude_fields, ) @@ -348,7 +478,7 @@ def check_job_batch( return_list=1, )["job_states"] return { - "parent_jobstate": parent_job_status, + "batch_jobstate": parent_job_status, "child_jobstates": child_job_states, } @@ -360,7 +490,7 @@ def check_jobs( return_list=1, as_admin=False, ): - """ Authorization Required: Read """ + """Authorization Required: Read""" if as_admin: self.check_as_admin(requested_perm=JobPermissions.READ) check_permission = False @@ -384,7 +514,7 @@ def check_jobs_date_range_for_user( ascending=None, as_admin=False, ): - """ Authorization Required: Read """ + """Authorization Required: Read""" if as_admin: 
self.check_as_admin(requested_perm=JobPermissions.READ) @@ -452,8 +582,7 @@ def check_workspace_jobs( if as_admin: self.check_as_admin(requested_perm=JobPermissions.READ) else: - ws_auth = self.get_workspace_auth() - if not ws_auth.can_read(workspace_id): + if not self.workspace_auth.can_read(workspace_id): self.logger.debug( f"User {self.user_id} doesn't have permission to read jobs in workspace {workspace_id}." ) @@ -475,19 +604,6 @@ def check_workspace_jobs( return job_states - @staticmethod - def parse_bool_from_string(str_or_bool): - if isinstance(str_or_bool, bool): - return str_or_bool - - if isinstance(str_or_bool, int): - return str_or_bool - - if isinstance(json.loads(str_or_bool.lower()), bool): - return json.loads(str_or_bool.lower()) - - raise Exception("Not a boolean value") - @staticmethod def check_and_convert_time(time_input, assign_default_time=False): """ diff --git a/lib/execution_engine2/sdk/job_submission_parameters.py b/lib/execution_engine2/sdk/job_submission_parameters.py new file mode 100644 index 000000000..90bc58d73 --- /dev/null +++ b/lib/execution_engine2/sdk/job_submission_parameters.py @@ -0,0 +1,257 @@ +""" +Parameters for submitting a job to a scheduler. +""" + +from maps import FrozenMap +from typing import Dict, List, Union +from execution_engine2.utils.arg_processing import ( + check_string as _check_string, + not_falsy as _not_falsy, +) +from execution_engine2.utils.user_info import UserCreds +from execution_engine2.utils.application_info import AppInfo +from execution_engine2.exceptions import IncorrectParamsException + + +def _gt_zero(num: int, name: str, optional=False) -> Union[int, None]: + if num is None and optional: + return None + if num is None or num < 1: + raise IncorrectParamsException(f"{name} must be at least 1") + return num + + +class JobRequirements: + """ + Requirements for running a job on a scheduler. + """ + + def __init__( + self, + cpus: int, + memory_MB: int, + disk_GB: int, + client_group: str, + client_group_regex: Union[bool, None] = None, + bill_to_user: str = None, + ignore_concurrency_limits: bool = False, + scheduler_requirements: Dict[str, str] = None, + debug_mode: bool = False, + ): + """ + Create the job requirements. + + cpus - the number of CPUs required for the job. + memory_MB - the amount of memory, in MB, required for the job. + disk_GB - the amount of disk space, in GB, required for the job. + client_group - the client group in which the job will run. + client_group_regex - whether to treat the client group string as a regular expression + that can match multiple client groups. Pass None for no preference. + bill_to_user - bill the job to an alternate user; takes the user's username. + ignore_concurrency_limits - allow the user to run this job even if the user's maximum + job count has already been reached. + scheduler_requirements - arbitrary requirements for the scheduler passed as key/value + pairs. Requires knowledge of the scheduler API. + debug_mode - whether to run the job in debug mode. 
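For illustration, constructing the requirements with made-up values; only cpus, memory_MB, disk_GB and client_group are required:

    reqs = JobRequirements(
        cpus=4,
        memory_MB=2000,
        disk_GB=100,
        client_group="njs",                                  # hypothetical client group
        client_group_regex=False,
        scheduler_requirements={"some_key": "some_value"},   # passed through to the scheduler
    )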
+ """ + self.cpus = _gt_zero(cpus, "CPU count") + self.memory_MB = _gt_zero(memory_MB, "memory in MB") + self.disk_GB = _gt_zero(disk_GB, "disk space in GB") + self.client_group = _check_string(client_group, "client_group") + self.client_group_regex = ( + None if client_group_regex is None else bool(client_group_regex) + ) + self.bill_to_user = _check_string(bill_to_user, "bill_to_user", optional=True) + self.ignore_concurrency_limits = bool(ignore_concurrency_limits) + self.scheduler_requirements = FrozenMap( + self._check_scheduler_requirements(scheduler_requirements) + ) + self.debug_mode = bool(debug_mode) + + @classmethod + def _check_scheduler_requirements(cls, schd_reqs): + sr = schd_reqs if schd_reqs else {} + for key, value in sr.items(): + _check_string(key, "key in scheduler requirements structure") + _check_string( + value, f"value for key '{key}' in scheduler requirements structure" + ) + return sr + + @classmethod + def check_parameters( + cls, + cpus: int = None, + memory_MB: int = None, + disk_GB: int = None, + client_group: str = None, + client_group_regex: Union[bool, None] = None, + bill_to_user: str = None, + ignore_concurrency_limits: Union[bool, None] = None, + scheduler_requirements: Dict[str, str] = None, + debug_mode: Union[bool, None] = None, + ): + """ + Test that a set of parameters are legal and returns normalized parmeters. + All arguments are optional - parameters required for initializing the class may be missing. + + cpus - the number of CPUs required for the job. + memory_MB - the amount of memory, in MB, required for the job. + disk_GB - the amount of disk space, in GB, required for the job. + client_group - the client group in which the job will run. + client_group_regex - whether to treat the client group string as a regular expression + that can match multiple client groups. + bill_to_user - bill the job to an alternate user; takes the user's username. + ignore_concurrency_limits - allow the user to run this job even if the user's maximum + job count has already been reached. + scheduler_requirements - arbitrary requirements for the scheduler passed as key/value + pairs. Requires knowledge of the scheduler API. + """ + # Could add a check_required_parameters bool if needed, but YAGNI for now. Any missing + # required paramaters will be looked up from the catalog or EE2 config file. 
+ if cpus is not None: + _gt_zero(cpus, "CPU count") + if memory_MB is not None: + _gt_zero(memory_MB, "memory in MB") + if disk_GB is not None: + _gt_zero(disk_GB, "disk space in GB") + if client_group is not None: + client_group = _check_string(client_group, "client_group") + return ( + cpus, + memory_MB, + disk_GB, + client_group, + None if client_group_regex is None else bool(client_group_regex), + _check_string(bill_to_user, "bill_to_user", optional=True), + None + if ignore_concurrency_limits is None + else bool(ignore_concurrency_limits), + cls._check_scheduler_requirements(scheduler_requirements), + None if debug_mode is None else bool(debug_mode), + ) + + def _params(self): + return ( + self.cpus, + self.memory_MB, + self.disk_GB, + self.client_group, + self.client_group_regex, + self.bill_to_user, + self.ignore_concurrency_limits, + self.scheduler_requirements, + self.debug_mode, + ) + + def __eq__(self, other): + if type(self) == type(other): + return self._params() == ( + other.cpus, + other.memory_MB, + other.disk_GB, + other.client_group, + other.client_group_regex, + other.bill_to_user, + other.ignore_concurrency_limits, + other.scheduler_requirements, + other.debug_mode, + ) + return False + + def __hash__(self): + return hash(self._params()) + + +# move this function somewhere else? +def _is_valid_UPA(upa: str) -> (str, bool): + # returns an empty string if not a valid upa + if upa is None or not upa.strip(): + return "", False + parts = [p.strip() for p in upa.split("/")] + if not len(parts) == 3: + return "", False + for p in parts: + try: + int(p) + except ValueError: + return "", False + return "/".join(parts), True + + +class JobSubmissionParameters: + """ + Parameters for submitting a job to a job scheduler. + """ + + def __init__( + self, + job_id: str, + app_info: AppInfo, + job_reqs: JobRequirements, + user_creds: UserCreds, + parent_job_id: str = None, + wsid: int = None, + source_ws_objects: List[str] = None, + ): + """ + Create the parameters. + + job_id - the ID of the job. + app_info - information about the application to be run. + job_reqs - requirements for the job. + user_creds - user credentials. + parent_job_id - the ID of the parent job to this job, if any. + wsid - the ID of the workspace with which the job is associated, if any. + source_ws_objects - workspace objects that are part of the job input. 
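For illustration, a hedged construction with made-up values; the AppInfo and UserCreds signatures are taken from their use earlier in this diff, and source_ws_objects entries must be integer ws/object/version UPAs or IncorrectParamsException is raised:

    params = JobSubmissionParameters(
        "60a7c1f0aa5a4c2e9c3b1234",                               # hypothetical job id
        AppInfo("ModuleName.run_something", "ModuleName/run_something"),
        JobRequirements(cpus=4, memory_MB=2000, disk_GB=100, client_group="njs"),
        UserCreds("someuser", "SOME_TOKEN_HERE"),
        wsid=123,
        source_ws_objects=["123/4/5", "123/6/1"],
    )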
+ """ + self.job_id = _check_string(job_id, "job_id") + self.app_info = _not_falsy(app_info, "app_info") + self.job_reqs = _not_falsy(job_reqs, "job_reqs") + self.user_creds = _not_falsy(user_creds, "user_creds") + self.parent_job_id = _check_string( + parent_job_id, "parent_job_id", optional=True + ) + self.wsid = _gt_zero(wsid, "wsid", optional=True) + source_ws_objects = source_ws_objects if source_ws_objects else [] + if type(source_ws_objects) != list: + raise IncorrectParamsException("source_ws_objects must be a list") + for i, ref in enumerate(source_ws_objects): + upa, is_valid = _is_valid_UPA(ref) + if not is_valid: + raise IncorrectParamsException( + f"source_ws_objects index {i}, '{ref}', " + + "is not a valid Unique Permanent Address" + ) + source_ws_objects[i] = upa + self.source_ws_objects = tuple(source_ws_objects) + + def _params(self): + return ( + self.job_id, + self.app_info, + self.job_reqs, + self.user_creds, + self.parent_job_id, + self.wsid, + self.source_ws_objects, + ) + + def __eq__(self, other): + if type(self) == type(other): + return self._params() == ( + other.job_id, + other.app_info, + other.job_reqs, + other.user_creds, + other.parent_job_id, + other.wsid, + other.source_ws_objects, + ) + return False + + def __hash__(self): + return hash(self._params()) + + def __repr__(self): + return str(self._params()) diff --git a/lib/execution_engine2/utils/APIHelpers.py b/lib/execution_engine2/utils/APIHelpers.py new file mode 100644 index 000000000..65e66ed74 --- /dev/null +++ b/lib/execution_engine2/utils/APIHelpers.py @@ -0,0 +1,31 @@ +""" +Contains classes and fuctions for use with the EE2 SDK API class (e.g. the *Impl.py file). +""" + +from typing import Dict +from execution_engine2.utils.clients import UserClientSet, get_user_client_set + + +# this class is only tested as part of integration tests. +class GenerateFromConfig: + """ + Utility methods to generate constructs from the service configuration. + """ + + def __init__(self, cfg: Dict[str, str]): + """ + Create an instance from a configuration. + + cfg - the configuration. + """ + self.cfg = cfg + + def get_user_clients(self, ctx) -> UserClientSet: + """ + Create a user client set from an SDK context object. + + ctx - the context object. This is passed in to SDK methods in the *Impl.py file. It is + expected that the context object contains the user_id and token keys, and this method + will fail with a KeyError if it does not. + """ + return get_user_client_set(self.cfg, ctx["user_id"], ctx["token"]) diff --git a/lib/execution_engine2/utils/CatalogUtils.py b/lib/execution_engine2/utils/CatalogUtils.py deleted file mode 100644 index 214ac28b2..000000000 --- a/lib/execution_engine2/utils/CatalogUtils.py +++ /dev/null @@ -1,66 +0,0 @@ -import json -from typing import List, Dict - -from lib.installed_clients.CatalogClient import Catalog - - -class CatalogUtils: - def __init__(self, url, admin_token): - self.catalog = Catalog(url=url, token=admin_token) - - def get_normalized_resources(self, method) -> Dict: - """ - get client groups info from Catalog - """ - if method is None: - raise ValueError("Please input module_name.function_name") - - if method is not None and "." not in method: - raise ValueError( - "unrecognized method: {}. 
Please input module_name.function_name".format( - method - ) - ) - - module_name, function_name = method.split(".") - - group_config = self.catalog.list_client_group_configs( - {"module_name": module_name, "function_name": function_name} - ) - - job_settings = [] - if group_config and len(group_config) > 0: - job_settings = group_config[0].get("client_groups") - - normalize = self.normalize_job_settings(job_settings) - - return normalize - - @staticmethod - def normalize_job_settings(resources_request: List): - """ - Ensure that the client_groups are processed as a dictionary and has at least one value - :param resources_request: either an empty string, a json object, or cg,key1=value,key2=value - :return: - """ - - # No client group provided - if len(resources_request) == 0: - return {} - # JSON - if "{" in resources_request[0]: - json_resources_request = ", ".join(resources_request) - return json.loads(json_resources_request) - # CSV Format - rr = resources_request[0].split(",") # type: list - rv = {"client_group": rr.pop(0)} - for item in rr: - if "=" not in item: - raise Exception( - f"Malformed requirement. Format is = . Item is {item}" - ) - (key, value) = item.split("=") - rv[key] = value - # - # print("Going to return", rv) - return rv diff --git a/lib/execution_engine2/utils/Condor.py b/lib/execution_engine2/utils/Condor.py index 5dca510ac..71289b550 100644 --- a/lib/execution_engine2/utils/Condor.py +++ b/lib/execution_engine2/utils/Condor.py @@ -3,35 +3,25 @@ Functions to call condor to manage jobs and extract resource requirements """ import logging -import os import pathlib -import pwd -from configparser import ConfigParser -from typing import Dict, Optional, Any, Tuple +from typing import Dict, Optional, Any import htcondor -from lib.execution_engine2.exceptions import ( - MissingCondorRequirementsException, - MissingRunJobParamsException, +from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, + JobRequirements, ) -from lib.execution_engine2.sdk.EE2Runjob import ConciergeParams from lib.execution_engine2.utils.CondorTuples import ( - CondorResources, SubmissionInfo, JobInfo, ) -from lib.execution_engine2.utils.Scheduler import Scheduler +from execution_engine2.utils.arg_processing import not_falsy as _not_falsy -class Condor(Scheduler): +class Condor: # TODO: Should these be outside of the class? - REQUEST_CPUS = "request_cpus" - REQUEST_MEMORY = "request_memory" - REQUEST_DISK = "request_disk" CG = "+CLIENTGROUP" - EE2 = "execution_engine2" - ENDPOINT = "kbase-endpoint" EXTERNAL_URL = "external-url" EXECUTABLE = "executable" CATALOG_TOKEN = "catalog-token" @@ -41,64 +31,48 @@ class Condor(Scheduler): LEAVE_JOB_IN_QUEUE = "leavejobinqueue" TRANSFER_INPUT_FILES = "transfer_input_files" PYTHON_EXECUTABLE = "PYTHON_EXECUTABLE" - DEFAULT_CLIENT_GROUP = "default_client_group" - - def __init__(self, config_filepath): - self.config = ConfigParser() - self.override_clientgroup = os.environ.get("OVERRIDE_CLIENT_GROUP", None) - self.config.read(config_filepath) - self.ee_endpoint = self.config.get(section=self.EE2, option=self.EXTERNAL_URL) - self.python_executable = self.config.get( - section=self.EE2, - option=self.PYTHON_EXECUTABLE, - fallback="/miniconda/bin/python", - ) - self.initial_dir = self.config.get( - section=self.EE2, option=self.INITIAL_DIR, fallback="/condor_shared" + + def __init__(self, config: Dict[str, str], htc=htcondor): + """ + Create the condor wrapper. + + config - the execution_engine2 configuration. 
+ htc - the htcondor module, or an alternate implementation or mock. + """ + # TODO some nicer error messages for the required keys vs. just KeyError + self.htcondor = htc + self.ee_endpoint = config[self.EXTERNAL_URL] + self.python_executable = config.get( + self.PYTHON_EXECUTABLE, "/miniconda/bin/python" ) - executable = self.config.get(section=self.EE2, option=self.EXECUTABLE) - if not pathlib.Path(executable).exists() and not pathlib.Path( - self.initial_dir + "/" + executable + self.initial_dir = config.get(self.INITIAL_DIR, "/condor_shared") + self.executable = config[self.EXECUTABLE] + if not pathlib.Path(self.executable).exists() and not pathlib.Path( + self.initial_dir + "/" + self.executable ): - raise FileNotFoundError(executable) - self.executable = executable - self.catalog_token = self.config.get( - section=self.EE2, option=self.CATALOG_TOKEN - ) - self.docker_timeout = self.config.get( - section=self.EE2, option=self.DOCKER_TIMEOUT, fallback="604801" - ) - self.pool_user = self.config.get( - section=self.EE2, option=self.POOL_USER, fallback="condor_pool" - ) - self.leave_job_in_queue = self.config.get( - section=self.EE2, option=self.LEAVE_JOB_IN_QUEUE, fallback="True" - ) - self.transfer_input_files = self.config.get( - section=self.EE2, - option=self.TRANSFER_INPUT_FILES, - fallback="/condor_shared/JobRunner.tgz", + raise FileNotFoundError(self.executable) + self.catalog_token = config[self.CATALOG_TOKEN] + self.docker_timeout = config.get(self.DOCKER_TIMEOUT, "604801") + self.pool_user = config.get(self.POOL_USER, "condor_pool") + self.leave_job_in_queue = config.get(self.LEAVE_JOB_IN_QUEUE, "True") + self.transfer_input_files = config.get( + self.TRANSFER_INPUT_FILES, "/condor_shared/JobRunner.tgz" ) self.logger = logging.getLogger("ee2") - def setup_environment_vars(self, params: Dict, client_group: str) -> str: + def _setup_environment_vars(self, params: JobSubmissionParameters) -> str: # 7 day docker job timeout default, Catalog token used to get access to volume mounts - dm = ( - str(params["cg_resources_requirements"].get("debug_mode", "")).lower() - == "true" - ) - environment_vars = { "DOCKER_JOB_TIMEOUT": self.docker_timeout, "KB_ADMIN_AUTH_TOKEN": self.catalog_token, - "KB_AUTH_TOKEN": params.get("token"), - "CLIENTGROUP": client_group, - "JOB_ID": params.get("job_id"), + "KB_AUTH_TOKEN": params.user_creds.token, + "CLIENTGROUP": params.job_reqs.client_group, + "JOB_ID": params.job_id, # "WORKDIR": f"{config.get('WORKDIR')}/{params.get('USER')}/{params.get('JOB_ID')}", "CONDOR_ID": "$(Cluster).$(Process)", "PYTHON_EXECUTABLE": self.python_executable, - "DEBUG_MODE": str(dm), - "PARENT_JOB_ID": params.get("parent_job_id", ""), + "DEBUG_MODE": str(params.job_reqs.debug_mode), + "PARENT_JOB_ID": params.parent_job_id or "", } environment = "" @@ -107,89 +81,6 @@ def setup_environment_vars(self, params: Dict, client_group: str) -> str: return f'"{environment}"' - @staticmethod - def _check_for_missing_runjob_params(params: Dict[str, str]) -> None: - """ - Check for missing runjob parameters - :param params: Params saved when the job was created - """ - for item in ("token", "user_id", "job_id", "cg_resources_requirements"): - if item not in params: - raise MissingRunJobParamsException(f"{item} not found in params") - - def extract_resources(self, cgrr: Dict[str, str]) -> CondorResources: - """ - # TODO Validate MB/GB from both config and catalog. 
- Checks to see if request_cpus/memory/disk is available - If not, it sets them based on defaults from the config - :param cgrr: - :return: - """ - self.logger.debug(f"About to extract from {cgrr}") - - client_group = cgrr.get("client_group", "") - if client_group is None or client_group == "": - client_group = self.config.get( - section="DEFAULT", option=self.DEFAULT_CLIENT_GROUP - ) - - if client_group not in self.config.sections(): - raise ValueError(f"{client_group} not found in {self.config.sections()}") - - # TODO Validate that they are a resource followed by a unit - for key in [self.REQUEST_DISK, self.REQUEST_CPUS, self.REQUEST_MEMORY]: - if key not in cgrr or cgrr[key] in ["", None]: - cgrr[key] = self.config.get(section=client_group, option=key) - - if self.override_clientgroup: - client_group = self.override_clientgroup - - cr = CondorResources( - str(cgrr.get(self.REQUEST_CPUS)), - str(cgrr.get(self.REQUEST_DISK)), - str(cgrr.get(self.REQUEST_MEMORY)), - client_group, - ) - - return cr - - def extract_requirements( - self, cgrr: Optional[dict] = None, client_group: Optional[str] = None - ): - """ - - :param cgrr: Client Groups and Resource Requirements - :param client_group: Client Group - :return: A list of condor submit file requirements in (key == value) format - """ - if cgrr is None or client_group is None: - raise MissingCondorRequirementsException( - "Please provide normalized cgrr and client_group" - ) - - requirements_statement = [] - - # Default to using a regex - if str(cgrr.get("client_group_regex", True)).lower() == "true": - requirements_statement.append(f'regexp("{client_group}",CLIENTGROUP)') - else: - requirements_statement.append(f'(CLIENTGROUP == "{client_group}")') - - restricted_requirements = [ - "client_group", - "client_group_regex", - self.REQUEST_MEMORY, - self.REQUEST_DISK, - self.REQUEST_CPUS, - "debug_mode", - ] - - for key, value in cgrr.items(): - if key.lower() not in restricted_requirements: - requirements_statement.append(f'({key} == "{value}")') - - return requirements_statement - @staticmethod def _add_hardcoded_attributes(sub, job_id): sub["universe"] = "vanilla" @@ -226,94 +117,69 @@ def _add_configurable_attributes(self, sub): return sub def _extract_resources_and_requirements( - self, sub: Dict[str, Any], cgrr: Dict[str, str] - ) -> Tuple[Dict[str, Any], str]: + self, sub: Dict[str, Any], job_reqs: JobRequirements + ) -> Dict[str, Any]: # Extract minimum condor resource requirements and client_group - resources = self.extract_resources(cgrr) - sub["request_cpus"] = resources.request_cpus - sub["request_memory"] = resources.request_memory - sub["request_disk"] = resources.request_disk - client_group = resources.client_group + sub["request_cpus"] = job_reqs.cpus + sub["request_memory"] = f"{job_reqs.memory_MB}MB" + sub["request_disk"] = f"{job_reqs.disk_GB}GB" # Set requirements statement - requirements = self.extract_requirements(cgrr=cgrr, client_group=client_group) - sub["requirements"] = " && ".join(requirements) - sub["+KB_CLIENTGROUP"] = f'"{client_group}"' - return (sub, client_group) - - @staticmethod - def _modify_with_concierge(sub, concierge_params): - # Remove Concurrency Limits for this Job - del sub["Concurrency_Limits"] - # Override Clientgroup - sub["+KB_CLIENTGROUP"] = f'"{concierge_params.client_group}"' - if concierge_params.account_group: - sub["+AccountingGroup"] = concierge_params.account_group - # Override Resource Requirements - sub["request_cpus"] = concierge_params.request_cpus - sub["request_memory"] = 
concierge_params.request_memory - sub["request_disk"] = concierge_params.request_disk - # Build up requirements w/ custom requirements - sub["requirements"] = f'(CLIENTGROUP == "{concierge_params.client_group}")' - requirements = [] - if concierge_params.requirements_list: - for item in concierge_params.requirements_list: - key, value = item.split("=") - requirements.append(f'({key} == "{value}")') - sub["requirements"] += " && ".join(requirements) - + sub["requirements"] = self._create_requirements_statement(job_reqs) + sub["+KB_CLIENTGROUP"] = f'"{job_reqs.client_group}"' return sub + def _create_requirements_statement(self, job_reqs: JobRequirements) -> str: + reqs = [] + if job_reqs.client_group_regex is not False: + # Default is True, so a value of None means True + reqs = [f'regexp("{job_reqs.client_group}",CLIENTGROUP)'] + else: + reqs = [f'(CLIENTGROUP == "{job_reqs.client_group}")'] + for key in sorted(job_reqs.scheduler_requirements): + reqs.append(f'({key} == "{job_reqs.scheduler_requirements[key]}")') + return " && ".join(reqs) + def _add_resources_and_special_attributes( - self, params: Dict, concierge_params: ConciergeParams = None - ) -> Dict: + self, params: JobSubmissionParameters + ) -> Dict[str, str]: sub = dict() - sub["JobBatchName"] = params.get("job_id") - sub["arguments"] = f"{params['job_id']} {self.ee_endpoint}" - sub = self.add_job_labels(sub=sub, params=params) + sub["JobBatchName"] = params.job_id + sub["arguments"] = f"{params.job_id} {self.ee_endpoint}" + sub = self._add_job_labels(sub=sub, params=params) # Extract special requirements - (sub, client_group) = self._extract_resources_and_requirements( - sub, params["cg_resources_requirements"] - ) + sub = self._extract_resources_and_requirements(sub, params.job_reqs) - sub["+AccountingGroup"] = params.get("user_id") - sub["Concurrency_Limits"] = params.get("user_id") - if concierge_params: - sub = self._modify_with_concierge(sub, concierge_params) - client_group = concierge_params.client_group - sub["+AccountingGroup"] = f'"{sub["+AccountingGroup"]}"' + btu = params.job_reqs.bill_to_user + user = btu if btu else params.user_creds.username + if not params.job_reqs.ignore_concurrency_limits: + sub["Concurrency_Limits"] = user + sub["+AccountingGroup"] = f'"{user}"' - sub["environment"] = self.setup_environment_vars( - params, client_group=client_group - ) + sub["environment"] = self._setup_environment_vars(params) return sub - # TODO Copy stuff from Concierge Params into #AcctGroup/Clientgroup/JobPrio, CPu/MEMORY/DISK/ - def create_submit( - self, params: Dict, concierge_params: ConciergeParams = None - ) -> Dict: - self._check_for_missing_runjob_params(params) + def _create_submit(self, params: JobSubmissionParameters) -> Dict[str, str]: + # note some tests call this function directly and will need to be updated if the + # signature is changed - sub = self._add_resources_and_special_attributes(params, concierge_params) - sub = self._add_hardcoded_attributes(sub=sub, job_id=params["job_id"]) + sub = self._add_resources_and_special_attributes(params) + sub = self._add_hardcoded_attributes(sub=sub, job_id=params.job_id) sub = self._add_configurable_attributes(sub) # Ensure all values are a string for item in sub.keys(): sub[item] = str(sub[item]) return sub - def concierge(self, sub, concierge_params): - pass - @staticmethod - def add_job_labels(sub: Dict, params: Dict[str, str]): - sub["+KB_PARENT_JOB_ID"] = params.get("parent_job_id", "") - sub["+KB_MODULE_NAME"] = params.get("method", "").split(".")[0] - 
sub["+KB_FUNCTION_NAME"] = params.get("method", "").split(".")[-1] - sub["+KB_APP_ID"] = params.get("app_id", "") - sub["+KB_APP_MODULE_NAME"] = params.get("app_id", "").split("/")[0] - sub["+KB_WSID"] = params.get("wsid", "") - sub["+KB_SOURCE_WS_OBJECTS"] = ",".join(params.get("source_ws_objects", list())) + def _add_job_labels(sub: Dict, params: JobSubmissionParameters): + sub["+KB_PARENT_JOB_ID"] = params.parent_job_id or "" + sub["+KB_MODULE_NAME"] = params.app_info.module + sub["+KB_FUNCTION_NAME"] = params.app_info.method + sub["+KB_APP_ID"] = params.app_info.get_application_id() or "" + sub["+KB_APP_MODULE_NAME"] = params.app_info.application_module or "" + sub["+KB_WSID"] = params.wsid or "" + sub["+KB_SOURCE_WS_OBJECTS"] = ",".join(params.source_ws_objects) # Ensure double quoted user inputs for key in sub.keys(): @@ -324,47 +190,31 @@ def add_job_labels(sub: Dict, params: Dict[str, str]): return sub - def run_job( - self, - params: Dict[str, str], - submit_file: Dict[str, str] = None, - concierge_params: Dict[str, str] = None, - ) -> SubmissionInfo: + def run_job(self, params: JobSubmissionParameters) -> SubmissionInfo: """ TODO: Add a retry TODO: Add list of required params - :param params: Params to run the job, such as the username, job_id, token, client_group_and_requirements - :param submit_file: A optional completed Submit File - :param concierge_params: Concierge Options for Submit Files - :return: + :param params: Params to run the job. + :return: ClusterID, Submit File, and Info about Errors """ - if submit_file is None: - submit_file = self.create_submit(params, concierge_params) - - return self.run_submit(submit_file) + # Contains sensitive information to be sent to condor + submit = self._create_submit(_not_falsy(params, "params")) - def run_submit(self, submit: Dict[str, str]) -> SubmissionInfo: - - sub = htcondor.Submit(submit) + sub = self.htcondor.Submit(submit) try: - schedd = htcondor.Schedd() - self.logger.debug(schedd) - self.logger.debug(submit) - self.logger.debug(os.getuid()) - self.logger.debug(pwd.getpwuid(os.getuid()).pw_name) - self.logger.debug(submit) + schedd = self.htcondor.Schedd() with schedd.transaction() as txn: return SubmissionInfo(str(sub.queue(txn, 1)), sub, None) except Exception as e: return SubmissionInfo(None, sub, e) def get_job_resource_info( - self, job_id: Optional[str] = None, cluster_id: Optional[str] = None + self, job_id: str = None, cluster_id: str = None ) -> Dict[str, str]: if job_id is not None and cluster_id is not None: raise Exception("Use only batch name (job_id) or cluster_id, not both") - condor_stats = self.get_job_info(job_id=job_id, cluster_id=cluster_id) + condor_stats = self._get_job_info(job_id=job_id, cluster_id=cluster_id) # Don't leak token into the logs here job_info = condor_stats.info if job_info is None: @@ -397,9 +247,11 @@ def get_job_resource_info( return extracted_resources - def get_job_info( + def _get_job_info( self, job_id: Optional[str] = None, cluster_id: Optional[str] = None ) -> JobInfo: + # note some tests replace this function with a MagicMock and will need to be updated if + # the signature is changed if job_id is not None and cluster_id is not None: return JobInfo( @@ -416,7 +268,7 @@ def get_job_info( ) try: - job = htcondor.Schedd().query(constraint=constraint, limit=1) + job = self.htcondor.Schedd().query(constraint=constraint, limit=1) if len(job) == 0: job = [{}] return JobInfo(info=job[0], error=None) @@ -425,18 +277,15 @@ def get_job_info( raise e # return JobInfo(info=None, error=e) 
- def get_user_info(self, user_id, projection=None): - pass - def cancel_job(self, job_id: str) -> bool: """ :param job_id: :return: """ - return self.cancel_jobs([f"{job_id}"]) + return self._cancel_jobs([f"{job_id}"]) - def cancel_jobs(self, scheduler_ids: list): + def _cancel_jobs(self, scheduler_ids: list): """ Possible return structure like this [ @@ -457,11 +306,9 @@ def cancel_jobs(self, scheduler_ids: list): raise Exception("Please provide a list of condor ids to cancel") try: - cancel_jobs = htcondor.Schedd().act( - action=htcondor.JobAction.Remove, job_spec=scheduler_ids + cancel_jobs = self.htcondor.Schedd().act( + action=self.htcondor.JobAction.Remove, job_spec=scheduler_ids ) - self.logger.info(f"Cancel job message for {scheduler_ids} is") - self.logger.debug(f"{cancel_jobs}") return cancel_jobs except Exception: self.logger.error( diff --git a/lib/execution_engine2/utils/CondorTuples.py b/lib/execution_engine2/utils/CondorTuples.py index bec435de1..048568084 100644 --- a/lib/execution_engine2/utils/CondorTuples.py +++ b/lib/execution_engine2/utils/CondorTuples.py @@ -15,13 +15,6 @@ class SubmissionInfo(NamedTuple): error: Optional[Exception] -class CondorResources(NamedTuple): - request_cpus: str - request_disk: str - request_memory: str - client_group: str - - class JobStatusCodes(enum.Enum): UNEXPANDED = 0 IDLE = 1 diff --git a/lib/execution_engine2/utils/KafkaUtils.py b/lib/execution_engine2/utils/KafkaUtils.py index ec2b07f7c..afb1a9473 100644 --- a/lib/execution_engine2/utils/KafkaUtils.py +++ b/lib/execution_engine2/utils/KafkaUtils.py @@ -212,11 +212,8 @@ def send_kafka_message(self, message, topic: str = DEFAULT_TOPIC): ) # TODO Remove POLL? producer.poll(2) - logger.debug( - f"Successfully sent message to kafka at topic={topic} message={json.dumps(message.__dict__)} server_address={self.server_address}" - ) except Exception as e: - logger.debug( + logger.error( f"Failed to send message to kafka at topic={topic} message={json.dumps(message.__dict__)} server_address={self.server_address}" ) raise Exception(e) diff --git a/lib/execution_engine2/utils/Scheduler.py b/lib/execution_engine2/utils/Scheduler.py deleted file mode 100644 index f945248e2..000000000 --- a/lib/execution_engine2/utils/Scheduler.py +++ /dev/null @@ -1,32 +0,0 @@ -from abc import ABC, abstractmethod - - -class Scheduler(ABC): - @abstractmethod - def run_job(self, params, submit_file=None): - raise NotImplementedError - - @abstractmethod - def create_submit(self, params): - raise NotImplementedError - - def validate_submit_file( - self, - ): - raise NotImplementedError - - @abstractmethod - def run_submit(self, submit): - raise NotImplementedError - - @abstractmethod - def get_job_info(self, job_id, cluster_id): - raise NotImplementedError - - @abstractmethod - def get_user_info(self, user_id, projection=None): - raise NotImplementedError - - @abstractmethod - def cancel_job(self, job_id): - raise NotImplementedError diff --git a/lib/execution_engine2/utils/Scheduler.pyi b/lib/execution_engine2/utils/Scheduler.pyi deleted file mode 100644 index f22afc81e..000000000 --- a/lib/execution_engine2/utils/Scheduler.pyi +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Dict, List -from abc import ABC - -class Scheduler(ABC): - def run_job( - self, params: Dict[str, str], submit_file: Dict[str, str] = None - ) -> str: ... - def create_submit(self, params: Dict[str, str]) -> str: ... - def validate_submit_file(self, submit_file_path) -> bool: ... - def cleanup_submit_file(self, submit_file_path) -> bool: ... 
- def run_submit(self, submit) -> str: ... - def get_job_info(self, job_id: str, cluster_id: str = None) -> Dict[str, str]: ... - def get_user_info( - self, user_id: str, projection: List[str] = None - ) -> Dict[str, str]: ... - def cancel_job(self, job_id: str) -> bool: ... diff --git a/lib/execution_engine2/utils/SlackUtils.py b/lib/execution_engine2/utils/SlackUtils.py index 836c41dc2..5a8c13fa8 100644 --- a/lib/execution_engine2/utils/SlackUtils.py +++ b/lib/execution_engine2/utils/SlackUtils.py @@ -26,8 +26,8 @@ def held_job_message(self, held_job): message = f"Held Job Stats {held_job}" self.safe_chat_post_message(channel=self.channel, text=message) - def ee2_reaper_failure(self, endpoint="Unknown EE2 URL", job_id="Unknown"): - message = f"EE2 Held Job reaper failed for {endpoint} (job {job_id}). Please check it out" + def ee2_reaper_failure(self, endpoint="Unknown EE2 URL", job_id="Unknown", e=None): + message = f"EE2 Held Job reaper failed for {endpoint} (job {job_id}), {e}. Please check it out" self.safe_chat_post_message(channel=self.channel, text=message) def ee2_reaper_success( @@ -55,7 +55,7 @@ def cancel_job_message(self, job_id, scheduler_id, termination_code): if self.debug is False: return - message = f"scheduler_id:{scheduler_id} job_id:{job_id} has been canceled due to {termination_code} ({self.endpoint})" + message = f"scheduler_id:`{scheduler_id}` job_id:`{job_id}` has been canceled due to `{termination_code}` ({self.endpoint})" self.safe_chat_post_message(channel=self.channel, text=message) def finish_job_message(self, job_id, scheduler_id, finish_status, error_code=None): diff --git a/lib/execution_engine2/utils/application_info.py b/lib/execution_engine2/utils/application_info.py new file mode 100644 index 000000000..5ebc6d02d --- /dev/null +++ b/lib/execution_engine2/utils/application_info.py @@ -0,0 +1,144 @@ +""" +Contains information about KBase applications. +""" + +from typing import Union +from execution_engine2.utils.arg_processing import check_string as _check_string +from execution_engine2.exceptions import IncorrectParamsException + + +def _get2part_string(s, sep, name, err_pt1, err_pt2, desired_sep=None): + desired_sep = desired_sep if desired_sep else sep + parts = s.split(sep) + if len(parts) != 2: + raise IncorrectParamsException( + f"Expected exactly one '{desired_sep}' in {name} '{s}'" + ) + return _check_string(parts[0], err_pt1), _check_string(parts[1], err_pt2) + + +class AppInfo: + """ + Information about a KBase app. + + Instance variables: + module - the app's module, e.g. kb_uploadmethods. + method - the SDK method the app will run, e.g. import_reads_from_staging + application_module - the module containing the application. Under normal conditions this + will be the same as 'module', if not None. Always supplied if 'application' is not None. + application - the id of the application, e.g. import_fastq_interleaved_as_reads_from_staging. + This is the name of the folder in the 'ui/narrative/methods' folder in the app repo + containing the spec files for the app. May be None. + """ + + def __init__(self, method: str, app_id: Union[str, None] = None, strict=True): + """ + Create the application information. + + method - the method name, e.g. kb_uploadmethods.import_reads_from_staging + app_id - the app name in the module/app_name format (e.g. + kb_uploadmethods/import_fastq_interleaved_as_reads_from_staging). For historical + reasons, this class will also accept only the module name or the module.app_name + format. 
In both cases the module name must match that given for the method argument. + Optional. + strict - whether the app_id should be processed strictly or not. Without strict=True, + the application module name may be different from the method module name. + """ + # Implementation notes: as of this writing, there are KBase processes that + # submit app_ids to ee2 that: + # 1) have a . separator rather than a / + # - Narrative downloads are a known place where this happens, although + # there are many other jobs in the database with this pattern, so there may be + # unknown processes submitting jobs like this. In most cases, this is just the + # process using the method for the app_id (and note that is often inaccurate). + # 2) consist only of a module ID with no separator + # - KBParallel does this. That may be the only source or there may be other sources + # as well, unknown. + # There are also some records in the database where the module for the application and + # method is not the same - to the best of our knowledge this was one off test data and + # shouldn't be expected to happen in practice. + # As such: + # 1) The only requirement for the app ID is that, if provided, it starts with the module + # given in the method argument. That must be followed by either nothing, or + # a '.' or '/' separator containing an arbitrary string. + # 2) We provide a 'strict' argument to disable even that check, which should be used for + # data loaded from the database. + self.module, self.method = _get2part_string( + _check_string(method, "method ID"), + ".", + "method ID", + "module portion of method ID", + "method portion of method ID", + ) + app_id = _check_string(app_id, "application ID", optional=True) + app = None + sep = None + mod = None + if app_id: + err1 = "module portion of application ID" + err2 = "application portion of application ID" + if "/" in app_id and "." in app_id: + raise IncorrectParamsException( + f"Application ID '{app_id}' has both '/' and '.' separators" + ) + if "/" in app_id: + mod, app = _get2part_string(app_id, "/", "application ID", err1, err2) + sep = "/" + elif "." in app_id: + mod, app = _get2part_string( + app_id, ".", "application ID", err1, err2, "/" + ) + sep = "." + else: + mod = app_id + if strict and mod and mod != self.module: + raise IncorrectParamsException( + f"Application module '{mod}' must equal method module '{self.module}'" + ) + self.application = app + self.application_module = mod + self._sep = sep + + def get_method_id(self) -> str: + """ + Get the method id, e.g. module.method. + """ + return f"{self.module}.{self.method}" + + def get_application_id(self) -> str: + """ + Get the application id, e.g. 
module/application, if present + """ + if not self.application_module: + return None + if self.application: + return f"{self.application_module}{self._sep}{self.application}" + return self.application_module + + def __eq__(self, other): + if type(self) == type(other): + return ( + self.module, + self.method, + self.application_module, + self.application, + self._sep, + ) == ( + other.module, + other.method, + other.application_module, + other.application, + other._sep, + ) + return False + + def __hash__(self): + return hash( + ( + self.module, + self.method, + self.application_module, + self.application, + self._sep, + ) + ) diff --git a/lib/execution_engine2/utils/arg_processing.py b/lib/execution_engine2/utils/arg_processing.py new file mode 100644 index 000000000..688964b69 --- /dev/null +++ b/lib/execution_engine2/utils/arg_processing.py @@ -0,0 +1,159 @@ +""" +Functions for processing arguments / parameters, including argument validity checkers and +normalizers. +""" + +from typing import Optional, Iterable, TypeVar, Union +import datetime +import unicodedata +from execution_engine2.exceptions import IncorrectParamsException + +T = TypeVar("T") + + +def parse_bool(putative_bool: Union[str, bool, int, float, None]) -> bool: + """ + Parse a string, bool, int, or float to a boolean value. + Strings containing 'true' or 'false', regardless of capitalization, are considered booleans. + Strings containing ints or floats are parsed to floats before processing. + + Raises IncorrectParamsException if the value cannot be parsed. + """ + pb = putative_bool + if pb is None: + return False + + if isinstance(pb, bool) or isinstance(pb, int) or isinstance(pb, float): + return bool(pb) + + if isinstance(pb, str): + try: + return bool(float(pb)) + except ValueError: + pass # check for 'true' and 'false' strings next + # they're more likely and if we really wanted to optimize they should go first. + # probably doesn't matter at all and it makes the code a bit simpler + if pb.lower() == "true": + return True + if pb.lower() == "false": + return False + + raise IncorrectParamsException(f"{pb} is not a boolean value") + + +# The remaining methods are ported from +# https://github.com/kbase/sample_service/blob/master/lib/SampleService/core/arg_checkers.py +# with slight changes. +# Should probably make a package or see if there are equivalent 3rd party functions at some point. +# Although if you want to use custom exceptions as here that won't work + + +def not_falsy(item: T, item_name: str) -> T: + """ + Check if a value is falsy and throw an exception if so. + :param item: the item to check for falsiness. + :param item_name: the name of the item to include in any exception. + :raises ValueError: if the item is falsy. + :returns: the item. + """ + if not item: + raise ValueError(f"{item_name} cannot be a value that evaluates to false") + return item + + +def not_falsy_in_iterable( + iterable: Optional[Iterable[T]], name: str, allow_none: bool = False +) -> Optional[Iterable[T]]: + """ + Check that an iterable is not None and contains no falsy items. Empty iterables are accepted. + :param iterable: the iterable to check. + :param name: the name of the iterable to be used in error messages. + :param allow_none: allow the iterable to be None - in this case return None. The contents of + the iterable may not be None. + :returns: the iterable. + :raises ValueError: if the iterable is None or contains falsy items. 
+ """ + # probably need to allow for 0 as an option + if iterable is None: + if allow_none: + return None + raise ValueError(f"{name} cannot be None") + for i, item in enumerate(iterable): + not_falsy(item, f"Index {i} of iterable {name}") + return iterable + + +def _contains_control_characters(string: str) -> bool: + """ + Check if a string contains control characters, as denoted by the Unicode character category + starting with a C. + :param string: the string to check. + :returns: True if the string contains control characters, False otherwise. + """ + # make public if needed + # See https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python # noqa: E501 + for c in string: + if unicodedata.category(c)[0] == "C": + return True + return False + + +def _no_control_characters(string: str, name: str) -> str: + """ + Checks that a string contains no control characters and throws an exception if it does. + See :meth:`contains_control_characters` for more information. + :param string: The string to check. + :param name: the name of the string to include in any exception. + :raises IncorrectParamsException: if the string contains control characters. + :returns: the string. + """ + # make public if needed + if _contains_control_characters(string): + raise IncorrectParamsException(name + " contains control characters") + return string + + +def check_string( + string: Optional[str], name: str, max_len: int = None, optional: bool = False +) -> Optional[str]: + """ + Check that a string meets a set of criteria: + - it is not None or whitespace only (unless the optional parameter is specified) + - it contains no control characters + - (optional) it is less than some specified maximum length + :param string: the string to test. + :param name: the name of the string to be used in error messages. + :param max_len: the maximum length of the string. + :param optional: True if no error should be thrown if the string is None. + :returns: the stripped string or None if the string was optional and None or whitespace only. + :raises IncorrectParamsException: if the string is None, whitespace only, too long, or + contains illegal characters. + """ + if max_len is not None and max_len < 1: + raise ValueError("max_len must be > 0 if provided") + if not string or not string.strip(): + if optional: + return None + raise IncorrectParamsException("Missing input parameter: " + name) + string = string.strip() + _no_control_characters(string, name) + if max_len and len(string) > max_len: + raise IncorrectParamsException(f"{name} exceeds maximum length of {max_len}") + return string + + +def check_timestamp(timestamp: datetime.datetime, name: str): + """ + Check that a timestamp is not None and not naive. See + https://docs.python.org/3.8/library/datetime.html#aware-and-naive-objects + :param timestamp: the timestamp to check. + :param name: the name of the variable to use in thrown errors. + :returns: the timestamp. + :raises ValueError: if the check fails. 
+ """ + if not_falsy(timestamp, name).tzinfo is None: + # The docs say you should also check savetime.tzinfo.utcoffset(savetime) is not None, + # but initializing a datetime with a tzinfo subclass that returns None for that method + # causes the constructor to throw an error + raise ValueError(f"{name} cannot be a naive datetime") + return timestamp diff --git a/lib/execution_engine2/utils/catalog_cache.py b/lib/execution_engine2/utils/catalog_cache.py new file mode 100644 index 000000000..46400f3cc --- /dev/null +++ b/lib/execution_engine2/utils/catalog_cache.py @@ -0,0 +1,95 @@ +import copy + +from collections import defaultdict +from typing import Dict + +from lib.installed_clients.CatalogClient import Catalog + + +class CatalogCache: + """ + Per call catalog cache used to speed up catalog lookups + Caches the "Method Version" and the "Job Resource Requirements" + There's no cache invalidation, and to refresh a cache entry you have to make a new cache + Cache is not thread safe + """ + + def __init__(self, catalog: Catalog): + """ + :param catalog: Instance of catalog client. Does not require authentication + """ + if not catalog: + raise ValueError("Please provide instance of catalog client") + + self._catalog = catalog + self._method_version_cache = defaultdict(dict) + self._job_requirements_cache = defaultdict(dict) + + def get_catalog(self) -> Catalog: + """Get the catalog client for this instance.""" + return self._catalog + + def get_method_version_cache(self) -> Dict: + """Get the _method_version_cache for this instance.""" + return self._method_version_cache + + def get_job_resources_cache(self) -> Dict: + """Get the _condor_resources_cache for this instance.""" + return self._job_requirements_cache + + def lookup_git_commit_version(self, method, service_ver=None) -> str: + """ + If "service_ver" is "release|beta|dev", get git commit version for that version + if "service_ver" is a semantic version, get commit version for that semantic version + If "service_ver" is a git commit hash, see if that get commit is valid + Convenience wrapper for verifying a git commit hash, or getting git commit hash from a tag + :param method: Method to look up + :param service_ver: Version to look up + :return: A git commit hash for the requested job + """ + + # Structure of cache + # { 'run_megahit' : + # { + # 'dev' : 'cc91ddfe376f907aa56cfb3dd1b1b21cae8885z6', #Tag + # '2.5.0' : 'cc91ddfe376f907aa56cfb3dd1b1b21cae8885z6', #Semantic + # 'cc91ddfe376f907aa56cfb3dd1b1b21cae8885z6' : 'cc91ddfe376f907aa56cfb3dd1b1b21cae8885z6' #vcs + # } + # } + mv_cache = self.get_method_version_cache() + if not method: + raise ValueError("Must provide a method to lookup") + + if not service_ver: + service_ver = "release" + + # If not in the cache add it + if method not in mv_cache or service_ver not in mv_cache[method]: + module_name = method.split(".")[0] + module_version = self.get_catalog().get_module_version( + {"module_name": module_name, "version": service_ver} + ) + mv_cache[method][service_ver] = module_version.get("git_commit_hash") + # Retrieve from cache + return mv_cache[method][service_ver] + + def lookup_job_resource_requirements(self, module_name, function_name) -> dict: + """ + Gets required job resources and clientgroups for a job submission + :param module_name: Module name to lookup + :param function_name: Function name to lookup + :return: A cached lookup of unformatted resource requests from the catalog + """ + # Structure of cache + # { 'module_name' : {'function_name' : [group_config] } + # } 
+ cr_cache = self.get_job_resources_cache() + # If not in the cache add it + if module_name not in cr_cache or function_name not in cr_cache[module_name]: + cr_cache[module_name][ + function_name + ] = self.get_catalog().list_client_group_configs( + {"module_name": module_name, "function_name": function_name} + ) + # Retrieve from cache + return copy.deepcopy(cr_cache[module_name][function_name]) diff --git a/lib/execution_engine2/utils/clients.py b/lib/execution_engine2/utils/clients.py new file mode 100644 index 000000000..b47381252 --- /dev/null +++ b/lib/execution_engine2/utils/clients.py @@ -0,0 +1,218 @@ +""" Contains the various clients EE2 needs to communicate with other services it depends on. """ + +# Note on testing - this class is not generally unit-testable, and is only tested fully in +# integration tests. + +from typing import Dict, Iterable + +from execution_engine2.authorization.roles import AdminAuthUtil +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.arg_processing import not_falsy as _not_falsy +from execution_engine2.utils.arg_processing import parse_bool +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from installed_clients.CatalogClient import Catalog +from installed_clients.WorkspaceClient import Workspace +from installed_clients.authclient import KBaseAuth + + +class UserClientSet: + """ + Clients required by EE2 for communicating with other services that need to be instantiated + on a per user basis. Also contains the user credentials for ease of use. + """ + + def __init__( + self, + user_id: str, + token: str, + workspace: Workspace, + workspace_auth: WorkspaceAuth, + ): + """ + Initialize the client set. + + user_id - The user's ID. + token - The users's token + workspace - A workspace client initialized with the user's token. + workspace_auth - A workspace auth client initialized with the user's token. + """ + if not user_id or not user_id.strip(): + raise ValueError("user_id is required") + if not token or not token.strip(): + raise ValueError("token is required") + if not workspace: + raise ValueError("workspace is required") + if not workspace_auth: + raise ValueError("workspace_auth is required") + self.user_id = user_id + self.token = token + self.workspace = workspace + self.workspace_auth = workspace_auth + + +def get_user_client_set(cfg: Dict[str, str], user_id: str, token: str): + """ + Create the client set from a configuration dictionary. + + cfg - the configuration dictionary + user_id - the ID of the user to be used to initialize the client set. + token - the token of the user to be used to initialize the client set. Note that the set + trusts that the token actually belongs to the user ID, and currently does not + independently check the validity of the user ID. + + Expected keys in config: + workspace-url - the URL of the kbase workspace service + """ + if not cfg: + raise ValueError("cfg is required") + # Do a check that the url actually points to the workspace? + # Also maybe consider passing in the workspace url rather than the dict, but the ClientSet + # below will need lots of params so a dict makes sense there, maybe keep the apis similar? 
+ # TODO the client throws a 'X is not a valid url' error if the url isn't valid, improve + # by catching & rethrowing with a more clear message that the config is wrong + ws_url = cfg.get("workspace-url") # may want to make the keys constants? + if not ws_url or not ws_url.strip(): + raise ValueError("missing workspace-url in configuration") + workspace = Workspace(ws_url, token=token) + workspace_auth = WorkspaceAuth(user_id, workspace) + return UserClientSet(user_id, token, workspace, workspace_auth) + + +class ClientSet: + """ + There is only one instance of this class globally. The codebase effectively treats this as a singleton. + Clients required by EE2 for communicating with other services. + These are not user-specific and can be reused throughout the application. + """ + + def __init__( + self, + auth: KBaseAuth, + auth_admin: AdminAuthUtil, + condor: Condor, + catalog: Catalog, + catalog_no_auth: Catalog, + requirements_resolver: JobRequirementsResolver, + kafka_client: KafkaClient, + mongo_util: MongoUtil, + slack_client: SlackClient, + ): + """ + Initialize the client set from the individual clients. + """ + + self.auth = _not_falsy(auth, "auth") + self.auth_admin = _not_falsy(auth_admin, "auth_admin") + self.condor = _not_falsy(condor, "condor") + self.catalog = _not_falsy(catalog, "catalog") + self.catalog_no_auth = _not_falsy(catalog_no_auth, "catalog_no_auth") + self.requirements_resolver = _not_falsy( + requirements_resolver, "requirements_resolver" + ) + self.kafka_client = _not_falsy(kafka_client, "kafka_client") + self.mongo_util = _not_falsy(mongo_util, "mongo_util") + self.slack_client = _not_falsy(slack_client, "slack_client") + + +# the constructor allows for mix and match of mocks and real implementations as needed +# the method below handles all the client set up for going straight from a config + + +def get_clients( + cfg: Dict[str, str], + cfg_file: Iterable[str], + override_client_group: str = None, +) -> ( + KBaseAuth, + AdminAuthUtil, + Condor, + Catalog, + Catalog, + JobRequirementsResolver, + KafkaClient, + MongoUtil, + SlackClient, +): + """ + Get the set of clients used in the EE2 application that are not user-specific and can be + reused from user to user. + + cfg - the configuration dictionary + cfg_file - the full configuration file as a file like object or iterable. + override_client_group - a client group name to override any client groups provided by + users or the catalog service. + + Expected keys in config: + auth-url - the root URL of the kbase auth service + catalog-url - the URL of the catalog service + catalog-token - a token to use with the catalog service. Ideally a service token + kafka-host - the host string for a Kafka service + slack-token - a token for contacting Slack + """ + # Condor needs access to the entire deploy.cfg file, not just the ee2 section + condor = Condor(cfg) + # Do a check to ensure the urls and tokens actually work correctly? 
+ # TODO check keys are present - make some general methods for dealing with this + # token is needed for running log_exec_stats in EE2Status + catalog = Catalog(cfg["catalog-url"], token=cfg["catalog-token"]) + # instance of catalog without creds is used here + catalog_no_auth = Catalog(cfg["catalog-url"]) + jrr = JobRequirementsResolver(cfg_file, override_client_group) + auth_url = cfg["auth-url"] + auth = KBaseAuth(auth_url=auth_url + "/api/legacy/KBase/Sessions/Login") + # TODO using hardcoded roles for now to avoid possible bugs with mismatched cfg roles + # these should probably be configurable. + # See https://github.com/kbase/execution_engine2/issues/295 + auth_admin = AdminAuthUtil(auth_url, [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE]) + + # KafkaClient has a nice error message when the arg is None + kafka_client = KafkaClient(cfg.get("kafka-host")) + + debug = parse_bool(cfg.get("debug")) + # SlackClient handles None arguments + slack_client = SlackClient( + cfg.get("slack-token"), debug=debug, endpoint=cfg.get("ee2-url") + ) + # TODO check how MongoUtil handles a bad config + that error messages are understandable + mongo_util = MongoUtil(cfg) + return ( + auth, + auth_admin, + condor, + catalog, + catalog_no_auth, + jrr, + kafka_client, + mongo_util, + slack_client, + ) + + +def get_client_set( + cfg: Dict[str, str], + cfg_file: Iterable[str], + override_client_group: str = None, +) -> ClientSet: + """ + A helper method to create a ClientSet from a config dict rather than constructing and passing + in clients individually. + + cfg - the configuration dictionary + cfg_file - the full configuration file as a file like object or iterable. + override_client_group - a client group name to override any client groups provided by + users or the catalog service. + + Expected keys in config: + auth-url - the root URL of the kbase auth service + catalog-url - the URL of the catalog service + catalog-token - a token to use with the catalog service. Ideally a service token + kafka-host - the host string for a Kafka service + slack-token - a token for contacting Slack + """ + + return ClientSet(*get_clients(cfg, cfg_file, override_client_group)) diff --git a/lib/execution_engine2/utils/job_requirements_resolver.py b/lib/execution_engine2/utils/job_requirements_resolver.py new file mode 100644 index 000000000..c0f17cdc6 --- /dev/null +++ b/lib/execution_engine2/utils/job_requirements_resolver.py @@ -0,0 +1,512 @@ +""" +Contains resolvers for job requirements. 
+""" + +import json +from configparser import ConfigParser +from enum import Enum +from typing import Iterable, Dict, Union, Set + +from execution_engine2.exceptions import IncorrectParamsException +from execution_engine2.sdk.EE2Constants import ( + EE2_CONFIG_SECTION, + EE2_DEFAULT_SECTION, + EE2_DEFAULT_CLIENT_GROUP, +) +from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.utils.arg_processing import ( + check_string as _check_string, + not_falsy as _not_falsy, +) +from execution_engine2.utils.catalog_cache import CatalogCache + +CLIENT_GROUP = "client_group" +REQUEST_CPUS = "request_cpus" +REQUEST_MEMORY = "request_memory" +REQUEST_DISK = "request_disk" +CLIENT_GROUP_REGEX = "client_group_regex" +BILL_TO_USER = "bill_to_user" +IGNORE_CONCURRENCY_LIMITS = "ignore_concurrency_limits" +DEBUG_MODE = "debug_mode" +_RESOURCES = set([CLIENT_GROUP, REQUEST_CPUS, REQUEST_MEMORY, REQUEST_DISK]) +_ALL_SPECIAL_KEYS = _RESOURCES | set( + [CLIENT_GROUP_REGEX, DEBUG_MODE, BILL_TO_USER, IGNORE_CONCURRENCY_LIMITS] +) + +_CLIENT_GROUPS = "client_groups" + + +def _remove_special_keys(inc_dict): + return {k: inc_dict[k] for k in set(inc_dict) - _ALL_SPECIAL_KEYS} + + +class RequirementsType(Enum): + """ + A classification of the type of requirements requested by the user. + """ + + STANDARD = 1 + """ + No special requests. + """ + + PROCESSING = 2 + """ + The user requests special processing such as a CPU count, removal of concurrency limits, etc. + """ + + BILLING = 3 + """ + The user requests that they bill another user. + """ + + +def _check_raise(name, value, source): + raise IncorrectParamsException( + f"Found illegal {name} '{value}' in job requirements from {source}" + ) + + +def _check_clientgroup(clientgroup, source): + clientgroup = _string_request(clientgroup, "client group", source) + # this is a possible error mode from the catalog since it uses key=value pairs in CSV + # format + if "=" in clientgroup: + _check_raise("client group", clientgroup, source) + return clientgroup + + +def _string_request(putative_string, name, source): + if type(putative_string) != str: + _check_raise(name, putative_string, source) + return putative_string.strip() + + +def _int_request(putative_int, original, name, source): + if type(putative_int) == float: + _check_raise(f"{name} request", original, source) + try: + return int(putative_int) + except (ValueError, TypeError): + _check_raise(f"{name} request", original, source) + + +def _check_cpus(cpus, source): + return _int_request(cpus, cpus, "cpu", source) + + +def _check_memory(memory, source): + if type(memory) == int: + return memory + memory2 = _string_request(memory, "memory request", source) + if memory2.endswith("M"): + memory2 = memory2[:-1] + elif memory2.endswith("MB"): + memory2 = memory2[:-2] + return _int_request(memory2, memory, "memory", source) + + +def _check_disk(disk, source): + if type(disk) == int: + return disk + disk2 = _string_request(disk, "disk request", source) + if disk2.endswith("GB"): + disk2 = disk2[:-2] + return _int_request(disk2, disk, "disk", source) + + +def _bool_request(putative_bool, name, source): + if type(putative_bool) == bool or type(putative_bool) == int: + return bool(putative_bool) + pbs = _string_request(putative_bool, name, source).lower() + if pbs == "true": + return True + if pbs == "false": + return False + _check_raise(name, putative_bool, source) + + +def _check_client_group_regex(client_group_regex, source) -> bool: + return _bool_request(client_group_regex, 
"client group regex", source) + + +def _check_debug_mode(debug_mode, source) -> bool: + return _bool_request(debug_mode, "debug mode", source) + + +_KEY_CHECKERS = { + CLIENT_GROUP: _check_clientgroup, + REQUEST_CPUS: _check_cpus, + REQUEST_MEMORY: _check_memory, + REQUEST_DISK: _check_disk, + CLIENT_GROUP_REGEX: _check_client_group_regex, + DEBUG_MODE: _check_debug_mode, +} + + +class JobRequirementsResolver: + """ + Resolves requirements for a job (e.g. CPU, memory, etc.) given a method id and optional input + parameters. Order of precedence is: + 1) Parameters submitted by the client programmer + 2) Requirements in the KBase Catalog service + 3) Requirements from the EE2 configuration file (deploy.cfg). + """ + + def __init__( + self, + cfgfile: Iterable[str], + override_client_group: str = None, + ): + """ + Create the resolver. + cfgfile - the configuration file as an open file object or other iterable. + override_client_group - if provided, this client group will be used for all jobs, ignoring + all other sources of client group information. + """ + self._override_client_group = _check_string( + override_client_group, "override_client_group", optional=True + ) + config = ConfigParser() + config.read_file(_not_falsy(cfgfile, "cfgfile")) + self._default_client_group = _check_string( + config.get( + section=EE2_DEFAULT_SECTION, + option=EE2_DEFAULT_CLIENT_GROUP, + fallback=None, + ), + f"value for {EE2_DEFAULT_SECTION}.{EE2_DEFAULT_CLIENT_GROUP} in deployment config file", + ) + self._clientgroup_default_configs = self._build_config(config) + if self._default_client_group not in self._clientgroup_default_configs: + raise ValueError( + "No deployment configuration entry for default " + + f"client group '{self._default_client_group}'" + ) + if ( + self._override_client_group + and self._override_client_group not in self._clientgroup_default_configs + ): + raise ValueError( + "No deployment configuration entry for override " + + f"client group '{self._override_client_group}'" + ) + + def _build_config(self, config): + ret = {} + for sec in config.sections(): + # if the default section is left as DEFAULT configparser shouldn't include it + # in the list, but just in case it changes... + if sec != EE2_CONFIG_SECTION and sec != EE2_DEFAULT_SECTION: + reqspec = {item[0]: item[1] for item in config.items(sec)} + reqspec[CLIENT_GROUP] = sec + ret[sec] = self.normalize_job_reqs( + reqspec, + f"section '{sec}' of the deployment configuration", + require_all_resources=True, + ) + return ret + + def get_override_client_group(self) -> Union[str, None]: + """ + Get the override client group, if any. This client group supercedes all others. + """ + return self._override_client_group + + def get_default_client_group(self) -> str: + """ + Get the default client group used if a client group is not provided by override, the user, + or the catalog. + """ + return self._default_client_group + + def get_configured_client_groups(self) -> Set[str]: + """ + Get the client groups configured in the configuration file. + """ + return self._clientgroup_default_configs.keys() + + def get_configured_client_group_spec( + self, clientgroup: str + ) -> Dict[str, Union[int, str]]: + f""" + Get the client group specification in normalized format. Includes the {CLIENT_GROUP}, + {REQUEST_CPUS}, {REQUEST_MEMORY}, and {REQUEST_DISK} keys. May, but usually will not, + include the {DEBUG_MODE} and {CLIENT_GROUP_REGEX} keys. 
+ """ + if clientgroup not in self._clientgroup_default_configs: + raise ValueError(f"Client group '{clientgroup}' is not configured") + # make a copy to prevent accidental mutation by the caller + return dict(self._clientgroup_default_configs[clientgroup]) + + @classmethod + def get_requirements_type( + self, + cpus: int = None, + memory_MB: int = None, + disk_GB: int = None, + client_group: str = None, + client_group_regex: Union[bool, None] = None, + bill_to_user: str = None, + ignore_concurrency_limits: bool = False, + scheduler_requirements: Dict[str, str] = None, + debug_mode: bool = False, + ) -> RequirementsType: + f""" + Determine what type of requirements are being requested. + + All parameters are optional. + + cpus - the number of CPUs required for the job. + memory_MB - the amount of memory, in MB, required for the job. + disk_GB - the amount of disk space, in GB, required for the job. + client_group - the client group in which the job will run. + client_group_regex - whether to treat the client group string as a regular expression + that can match multiple client groups. Pass None for no preference. + bill_to_user - bill the job to an alternate user; takes the user's username. + ignore_concurrency_limits - allow the user to run this job even if the user's maximum + job count has already been reached. + scheduler_requirements - arbitrary requirements for the scheduler passed as key/value + pairs. Requires knowledge of the scheduler API. + debug_mode - whether to run the job in debug mode. + + Returns the type of requirements requested by the user: + {RequirementsType.STANDARD.name} - if no requirements are requested + {RequirementsType.PROCESSING.name} - if any requirements other than bill_to_user are + requested + {RequirementsType.BILLING.name} - if bill_to_user is requested + """ + args = JobRequirements.check_parameters( + cpus, + memory_MB, + disk_GB, + client_group, + client_group_regex, + bill_to_user, + ignore_concurrency_limits, + scheduler_requirements, + debug_mode, + ) + if args[5]: # bill_to_user + return RequirementsType.BILLING + if any(args) or args[4] is False: + # regex False means the user is asking for non default + return RequirementsType.PROCESSING + return RequirementsType.STANDARD + + @classmethod + def normalize_job_reqs( + cls, reqs: Dict[str, Union[str, int]], source: str, require_all_resources=False + ) -> Dict[str, Union[str, int]]: + f""" + Massage job requirements into a standard format. Does the following to specific keys of + the reqs argument: + + {CLIENT_GROUP}: ensures it does not contain an =. This error mode is more probable in + the KBase catalog UI. + {REQUEST_CPUS}: parses to an int + {REQUEST_MEMORY}: parses to an int, removing a trailing 'M' or 'MB' if necessary. + {REQUEST_DISK}: parses to an int, removing a trailing 'GB' if necessary. + {CLIENT_GROUP_REGEX}: parses to a boolean or None. The strings true and false are + parsed to booleans, case-insensitive. Ints are parsed directly to booleans. + {DEBUG_MODE}: parses to a boolean. The strings true and false are parsed to booleans, + case-insensitive. Ints are parsed directly to booleans. + + reqs - the job requirements + source - the source of the job requirements, e.g. catalog, user, etc. + require_all_resources - True to throw an error if all four keys resources keys + ({CLIENT_GROUP}, {REQUEST_CPUS}, {REQUEST_MEMORY}, {REQUEST_DISK}) aren't present + with valid values. + + Returns a new dictionary with the altered keys. If any key is not present no action is + taken for that key. 
+ """ + # TODO could support more units and convert as necessary (see checker funcs at start + # of module). YAGNI for now. + if reqs is None: + reqs = {} + ret = {} + for key in [ + CLIENT_GROUP, + REQUEST_CPUS, + REQUEST_MEMORY, + REQUEST_DISK, + CLIENT_GROUP_REGEX, + DEBUG_MODE, + ]: + if not cls._has_value(reqs.get(key)): + if require_all_resources and key in _RESOURCES: + raise IncorrectParamsException( + f"Missing {key} key in job requirements from {source}" + ) + else: + ret[key] = _KEY_CHECKERS[key](reqs.get(key), source) + return ret + + @classmethod + def _has_value(cls, inc): + if inc is None: + return False + if type(inc) == str and not inc.strip(): + return False + return True + + def resolve_requirements( + self, + method: str, + catalog_cache: CatalogCache, + cpus: int = None, + memory_MB: int = None, + disk_GB: int = None, + client_group: str = None, + client_group_regex: Union[bool, None] = None, + bill_to_user: str = None, + ignore_concurrency_limits: bool = False, + scheduler_requirements: Dict[str, str] = None, + debug_mode: bool = None, + ) -> JobRequirements: + """ + Resolve jobs requirements for a method. + + All parameters are optional other than the method and supplying them will override + the catalog and ee2 settings for the job. + + method - the method to be run in module.method format. + catalog_cache - a per request instance of a CatalogCache in order to speed up catalog lookups + cpus - the number of CPUs required for the job. + memory_MB - the amount of memory, in MB, required for the job. + disk_GB - the amount of disk space, in GB, required for the job. + client_group - the client group in which the job will run. + client_group_regex - whether to treat the client group string as a regular expression + that can match multiple client groups. Pass None for no preference. + bill_to_user - bill the job to an alternate user; takes the user's username. + ignore_concurrency_limits - allow the user to run this job even if the user's maximum + job count has already been reached. + scheduler_requirements - arbitrary requirements for the scheduler passed as key/value + pairs. Requires knowledge of the scheduler API. + debug_mode - whether to run the job in debug mode. + + Returns the job requirements. + """ + + if method is None or len(method.split(".")) != 2: + raise IncorrectParamsException( + f"Unrecognized method: '{method}'. 
Please input module_name.function_name" + ) + module_name, function_name = [m.strip() for m in method.split(".")] + + args = JobRequirements.check_parameters( + cpus, + memory_MB, + disk_GB, + client_group, + client_group_regex, + bill_to_user, + ignore_concurrency_limits, + scheduler_requirements, + debug_mode, + ) + + # the catalog could contain arbitrary scheduler requirements so we can't skip the + # call even if all the arguments are provided + cat_reqs_all = self._get_catalog_reqs(module_name, function_name, catalog_cache) + cat_reqs = self.normalize_job_reqs( + cat_reqs_all, + f"catalog method {module_name}.{function_name}", + ) + client_group = self._get_client_group( + args[3], cat_reqs.get(CLIENT_GROUP), module_name, function_name + ) + + # don't mutate the spec, make a copy + reqs = dict(self._clientgroup_default_configs[client_group]) + reqs.update(cat_reqs) + + scheduler_requirements = _remove_special_keys(cat_reqs_all) + # don't mutate args, check_parameters doesn't make a copy of the incoming args + scheduler_requirements.update(_remove_special_keys(dict(args[7]))) + + cgr = args[4] if (args[4] is not None) else reqs.pop(CLIENT_GROUP_REGEX, None) + dm = args[8] if (args[8] is not None) else reqs.pop(DEBUG_MODE, None) + + return JobRequirements( + args[0] or reqs[REQUEST_CPUS], + args[1] or reqs[REQUEST_MEMORY], + args[2] or reqs[REQUEST_DISK], + client_group, + client_group_regex=cgr, + bill_to_user=args[5], + ignore_concurrency_limits=args[6], + scheduler_requirements=scheduler_requirements, + debug_mode=dm, + ) + + def _get_client_group(self, user_cg, catalog_cg, module_name, function_name): + cg = next( + i + for i in [ + user_cg, + self._override_client_group, + catalog_cg, + self._default_client_group, + ] + if i is not None + ) + if cg not in self._clientgroup_default_configs: + if cg == catalog_cg: + raise IncorrectParamsException( + f"Catalog specified illegal client group '{cg}' for method " + + f"{module_name}.{function_name}" + ) + raise IncorrectParamsException(f"No such clientgroup: {cg}") + return cg + + @staticmethod + def _get_catalog_reqs( + module_name: str, function_name: str, catalog_cache: CatalogCache + ): + # could cache results for 30s or so to speed things up... YAGNI + group_config = catalog_cache.lookup_job_resource_requirements( + module_name=module_name, function_name=function_name + ) + # If group_config is empty, that means there's no clientgroup entry in the catalog + # It'll return an empty list even for non-existent modules + if not group_config: + return {} + if len(group_config) > 1: + raise ValueError( + "Unexpected result from the Catalog service: more than one client group " + + f"configuration found for method {module_name}.{function_name} {group_config}" + ) + + resources_request = group_config[0].get(_CLIENT_GROUPS, None) + + # No client group provided + if not resources_request: + return {} + # JSON + if "{" in resources_request[0]: + try: + rv = json.loads(", ".join(resources_request)) + except ValueError: + raise ValueError( + "Unable to parse JSON client group entry from catalog " + + f"for method {module_name}.{function_name}" + ) + return {k.strip(): rv[k] for k in rv} + # CSV Format + # This presents as CSV in the Catalog UI, e.g. 
+ # clientgroup, key1=value1, key2=value2
+ # and so on
+ # The UI splits by comma before sending the data to the catalog, which is what we
+ # get when we pull the data
+ rv = {CLIENT_GROUP: resources_request.pop(0)}
+ for item in resources_request:
+ if "=" not in item:
+ raise ValueError(
+ f"Malformed requirement. Format is <key>=<value>. Item is '{item}' for "
+ f"catalog method {module_name}.{function_name}"
+ )
+ (key, value) = item.split("=", 1)
+ rv[key.strip()] = value.strip()
+ return rv
diff --git a/lib/execution_engine2/utils/user_info.py b/lib/execution_engine2/utils/user_info.py
new file mode 100644
index 000000000..1a7a4e10f
--- /dev/null
+++ b/lib/execution_engine2/utils/user_info.py
@@ -0,0 +1,36 @@
+"""
+User information classes and methods.
+"""
+
+from execution_engine2.utils.arg_processing import check_string as _check_string
+
+
+class UserCreds:
+ """
+ Contains a user's username and token.
+
+ Instance variables:
+ username - the user's username.
+ token - the user's token.
+ """
+
+ # TODO replace the creds in the clients.UserClientSet with this class
+
+ def __init__(self, username: str, token: str):
+ """
+ Create the creds.
+
+ username - the user's username.
+ token - the user's token. It is expected that the client programmer verifies that the
+ token is indeed tied to the user.
+ """
+ self.username = _check_string(username, "username")
+ self.token = _check_string(token, "token")
+
+ def __eq__(self, other):
+ if type(self) == type(other):
+ return (self.username, self.token) == (other.username, other.token)
+ return False
+
+ def __hash__(self):
+ return hash((self.username, self.token))
diff --git a/lib/installed_clients/execution_engine2Client.py b/lib/installed_clients/execution_engine2Client.py
index 77b02084c..591328e75 100644
--- a/lib/installed_clients/execution_engine2Client.py
+++ b/lib/installed_clients/execution_engine2Client.py
@@ -100,119 +100,328 @@ def status(self, context=None):
 def run_job(self, params, context=None):
 """
- Start a new job (long running method of service registered in ServiceRegistery). - Such job runs Docker image for this service in script mode. - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. 
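For reference, here is a minimal sketch of the two client_groups formats that _get_catalog_reqs accepts, assuming the CLIENT_GROUP constant maps to the string "client_group"; the parse() helper is illustrative only and simply mirrors the JSON/CSV branches shown above. Note that the CSV form yields string values while the JSON form keeps native JSON types.

import json

def parse(resources_request):
    # Illustrative re-statement of the JSON/CSV branches in _get_catalog_reqs.
    if "{" in resources_request[0]:
        # JSON form: the UI split the entry on commas, so rejoin before parsing.
        rv = json.loads(", ".join(resources_request))
        return {k.strip(): rv[k] for k in rv}
    # CSV form: first item is the client group, the rest are key=value pairs.
    rv = {"client_group": resources_request.pop(0)}  # assumes CLIENT_GROUP == "client_group"
    for item in resources_request:
        key, value = item.split("=", 1)
        rv[key.strip()] = value.strip()
    return rv

print(parse(["bigmem", "request_cpus=4", "request_memory=2000"]))
# {'client_group': 'bigmem', 'request_cpus': '4', 'request_memory': '2000'}
print(parse(['{"client_group": "bigmem"', '"request_cpus": 4', '"request_memory": 2000}']))
# {'client_group': 'bigmem', 'request_cpus': 4, 'request_memory': 2000}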
Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String + Start a new job. + :param params: instance of type "RunJobParams" (method - the SDK + method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. 
Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) :returns: instance of type "job_id" (A job id.) """ return self._client.call_method( "execution_engine2.run_job", [params], self._service_ver, context ) + def run_job_batch(self, params, batch_params, context=None): + """ + Run a batch job, consisting of a parent job and one or more child jobs. + Note that the as_admin parameters in the list of child jobs are ignored - + only the as_admin parameter in the batch_params is considered. + :param params: instance of list of type "RunJobParams" (method - the + SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. 
repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. 
debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) + :param batch_params: instance of type "BatchParams" (Additional + parameters for a batch job. wsid: the workspace with which to + associate the parent job. as_admin: run the job with full EE2 + permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights.) -> structure: + parameter "wsid" of Long, parameter "as_admin" of type "boolean" + (@range [0,1]) + :returns: instance of type "BatchSubmission" -> structure: parameter + "parent_job_id" of type "job_id" (A job id.), parameter + "child_job_ids" of list of type "job_id" (A job id.) + """ + return self._client.call_method( + "execution_engine2.run_job_batch", + [params, batch_params], + self._service_ver, + context, + ) + + def retry_job(self, params, context=None): + """ + #TODO write retry parent tests to ensure BOTH the parent_job_id is present, and retry_job_id is present + #TODO Add retry child that checks the status of the child? to prevent multiple retries + Allowed Jobs + Regular Job with no children + Regular job with/without parent_id that runs a kbparallel call or a run_job_batch call + Not Allowed + Regular Job with children (Should not be possible to create yet) + Batch Job Parent Container (Not a job, it won't do anything, except cancel it's child jobs) + :param params: instance of type "RetryParams" (job_id of job to retry + as_admin: retry someone elses job in your namespace #TODO Possibly + Add JobRequirements job_requirements;) -> structure: parameter + "job_id" of type "job_id" (A job id.), parameter "as_admin" of + type "boolean" (@range [0,1]) + :returns: instance of type "RetryResult" (job_id of retried job + retry_id: job_id of the job that was launched str error: reason as + to why that particular retry failed (available for bulk retry + only)) -> structure: parameter "job_id" of type "job_id" (A job + id.), parameter "retry_id" of type "job_id" (A job id.), parameter + "error" of String + """ + return self._client.call_method( + "execution_engine2.retry_job", [params], self._service_ver, context + ) + + def retry_jobs(self, params, context=None): + """ + Same as retry_job, but accepts multiple jobs + :param params: instance of type "BulkRetryParams" (job_ids of job to + retry as_admin: retry someone else's job in your namespace #TODO: + Possibly Add list job_requirements;) -> + structure: parameter "job_ids" of list of type "job_id" (A job + id.), parameter "as_admin" of type "boolean" (@range [0,1]) + :returns: instance of list of type "RetryResult" (job_id of retried + job retry_id: job_id of the job that was launched str error: + reason as to why that particular retry failed (available for bulk + retry only)) -> structure: parameter "job_id" of type "job_id" (A + job id.), parameter "retry_id" of type "job_id" (A job id.), + parameter "error" of String + """ + return self._client.call_method( + 
"execution_engine2.retry_jobs", [params], self._service_ver, context + ) + + def abandon_children(self, params, context=None): + """ + :param params: instance of type "AbandonChildren" -> structure: + parameter "parent_job_id" of type "job_id" (A job id.), parameter + "child_job_ids" of list of type "job_id" (A job id.), parameter + "as_admin" of type "boolean" (@range [0,1]) + :returns: instance of type "BatchSubmission" -> structure: parameter + "parent_job_id" of type "job_id" (A job id.), parameter + "child_job_ids" of list of type "job_id" (A job id.) + """ + return self._client.call_method( + "execution_engine2.abandon_children", [params], self._service_ver, context + ) + def run_job_concierge(self, params, concierge_params, context=None): """ - :param params: instance of type "RunJobParams" (method - service - defined in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String + :param params: instance of type "RunJobParams" (method - the SDK + method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) :param concierge_params: instance of type "ConciergeParams" (EE2Constants Concierge Params are request_cpus: int - request_memory: int in MB request_disk: int in MB job_priority: + request_memory: int in MB request_disk: int in GB job_priority: int = None range from -20 to +20, with higher values meaning - better priority. account_group: str = None # Someone elses account - requirements_list: list = None ['machine=worker102','color=red'] - client_group: Optional[str] = CONCIERGE_CLIENTGROUP # You can - leave default or specify a clientgroup) -> structure: parameter - "request_cpu" of Long, parameter "request_memory_mb" of Long, - parameter "request_disk_mb" of Long, parameter "job_priority" of - Long, parameter "account_group" of String, parameter - "requirements_list" of list of String, parameter "client_group" of - String + better priority. Note: job_priority is currently not implemented. + account_group: str = None # Someone elses account + ignore_concurrency_limits: ignore any limits on simultaneous job + runs. Default 1 (True). requirements_list: list = None + ['machine=worker102','color=red'] client_group: Optional[str] = + CONCIERGE_CLIENTGROUP # You can leave default or specify a + clientgroup client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. debug_mode: Whether to run the job in debug mode. Default + 0 (False).) 
-> structure: parameter "request_cpu" of Long, + parameter "request_memory" of Long, parameter "request_disk" of + Long, parameter "job_priority" of Long, parameter "account_group" + of String, parameter "ignore_concurrency_limits" of type "boolean" + (@range [0,1]), parameter "requirements_list" of list of String, + parameter "client_group" of String, parameter "client_group_regex" + of type "boolean" (@range [0,1]), parameter "debug_mode" of type + "boolean" (@range [0,1]) :returns: instance of type "job_id" (A job id.) """ return self._client.call_method( @@ -228,51 +437,75 @@ def get_job_params(self, params, context=None): necessary for job execution @optional as_admin) -> structure: parameter "job_id" of type "job_id" (A job id.), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "RunJobParams" (method - service defined - in standard JSON RPC way, typically it's module name from - spec-file followed by '.' and name of funcdef from spec-file - corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String + :returns: instance of type "RunJobParams" (method - the SDK method to + run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]) """ return self._client.call_method( "execution_engine2.get_job_params", [params], self._service_ver, context @@ -417,63 +650,321 @@ def check_job(self, params, context=None): parameter "job_id" of type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type - "RunJobParams" (method - service defined in standard JSON RPC way, - typically it's module name from spec-file followed by '.' and name - of funcdef from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. 
This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. 
Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ return self._client.call_method( "execution_engine2.check_job", [params], self._service_ver, context ) + def check_job_batch(self, params, context=None): + """ + get current status of a parent job, and it's children, if it has any. + :param params: instance of type "CheckJobParams" (exclude_fields: + exclude certain fields to return. default None. exclude_fields + strings can be one of fields defined in + execution_engine2.db.models.models.Job) -> structure: parameter + "job_id" of type "job_id" (A job id.), parameter "exclude_fields" + of list of String, parameter "as_admin" of type "boolean" (@range + [0,1]) + :returns: instance of type "CheckJobBatchResults" (parent_job - state + of parent job job_states - states of child jobs IDEA: ADD + aggregate_states - count of all available child job states, even + if they are zero) -> structure: parameter "parent_jobstate" of + type "JobState" (job_id - string - id of the job user - string - + user who started the job wsid - int - optional id of the workspace + where the job is bound authstrat - string - what strategy used to + authenticate the job job_input - object - inputs to the job (from + the run_job call) ## TODO - verify updated - int - timestamp + since epoch in milliseconds of the last time the status was + updated running - int - timestamp since epoch in milliseconds of + when it entered the running state created - int - timestamp since + epoch in milliseconds when the job was created finished - int - + timestamp since epoch in milliseconds when the job was finished + status - string - status of the job. one of the following: created + - job has been created in the service estimating - an estimation + job is running to estimate resources required for the main job, + and which queue should be used queued - job is queued to be run + running - job is running on a worker node completed - job was + completed successfully error - job is no longer running, but + failed with an error terminated - job is no longer running, + terminated either due to user cancellation, admin cancellation, or + some automated task error_code - int - internal reason why the job + is an error. one of the following: 0 - unknown 1 - job crashed 2 - + job terminated by automation 3 - job ran over time limit 4 - job + was missing its automated output document 5 - job authentication + token expired errormsg - string - message (e.g. 
stacktrace) + accompanying an errored job error - object - the JSON-RPC error + package that accompanies the error code and message + terminated_code - int - internal reason why a job was terminated, + one of: 0 - user cancellation 1 - admin cancellation 2 - + terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. 
To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter + "child_jobstates" of list of type "JobState" (job_id - string - id + of the job user - string - user who started the job wsid - int - + optional id of the workspace where the job is bound authstrat - + string - what strategy used to authenticate the job job_input - + object - inputs to the job (from the run_job call) ## TODO - + verify updated - int - timestamp since epoch in milliseconds of + the last time the status was updated running - int - timestamp + since epoch in milliseconds of when it entered the running state + created - int - timestamp since epoch in milliseconds when the job + was created finished - int - timestamp since epoch in milliseconds + when the job was finished status - string - status of the job. one + of the following: created - job has been created in the service + estimating - an estimation job is running to estimate resources + required for the main job, and which queue should be used queued - + job is queued to be run running - job is running on a worker node + completed - job was completed successfully error - job is no + longer running, but failed with an error terminated - job is no + longer running, terminated either due to user cancellation, admin + cancellation, or some automated task error_code - int - internal + reason why the job is an error. 
one of the following: 0 - unknown + 1 - job crashed 2 - job terminated by automation 3 - job ran over + time limit 4 - job was missing its automated output document 5 - + job authentication token expired errormsg - string - message (e.g. + stacktrace) accompanying an errored job error - object - the + JSON-RPC error package that accompanies the error code and message + terminated_code - int - internal reason why a job was terminated, + one of: 0 - user cancellation 1 - admin cancellation 2 - + terminated by some automatic process @optional error @optional + error_code @optional errormsg @optional terminated_code @optional + estimating @optional running @optional finished) -> structure: + parameter "job_id" of type "job_id" (A job id.), parameter "user" + of String, parameter "authstrat" of String, parameter "wsid" of + Long, parameter "status" of String, parameter "job_input" of type + "RunJobParams" (method - the SDK method to run in module.method + format, e.g. 'KBaseTrees.construct_species_tree' params - the + parameters to pass to the method. Optional parameters: app_id - + the id of the Narrative application (UI) running this job (e.g. + repo/name) service_ver - specific version of deployed service, + last version is used if this parameter is not defined + source_ws_objects - denotes the workspace objects that will serve + as a source of data when running the SDK method. These references + will be added to the autogenerated provenance. Must be in UPA + format (e.g. 6/90/4). meta - Narrative metadata to associate with + the job. wsid - an optional workspace id to associate with the + job. This is passed to the workspace service, which will share the + job based on the permissions of the workspace rather than owner of + the job parent_job_id - EE2 job id for the parent of the current + job. For run_job and run_job_concierge, this value can be + specified to denote the parent job of the job being created. + Warning: No checking is done on the validity of the job ID, and + the parent job record is not altered. Submitting a job with a + parent ID to run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) 
-> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long + """ + return self._client.call_method( + "execution_engine2.check_job_batch", [params], self._service_ver, context + ) + def check_jobs(self, params, context=None): """ :param params: instance of type "CheckJobsParams" (As in check_job, @@ -519,58 +1010,81 @@ def check_jobs(self, params, context=None): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. 
source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. 
This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
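As a hedged usage sketch, a check_jobs call through the generated client might look like the lines below; the import path and the "job_ids" key of CheckJobsParams are assumptions based on KBase client conventions, since CheckJobsParams is only partially reproduced above.

# Assumed import path and params keys; adjust to the deployed client.
from installed_clients.execution_engine2Client import execution_engine2

ee2 = execution_engine2(url="https://<ee2 endpoint>", token="<auth token>")
result = ee2.check_jobs({"job_ids": ["<job id 1>", "<job id 2>"]})
# result carries the JobState structures described above for each requested job.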
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ return self._client.call_method( "execution_engine2.check_jobs", [params], self._service_ver, context @@ -622,58 +1136,81 @@ def check_workspace_jobs(self, params, context=None): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) 
-> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) 
-> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
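For illustration, a job_requirements block using only the field names listed in the description above might look like the sketch below; per that description it is honored only at submission time and requires full EE2 admin rights.

# Sketch only: values are examples; "njs" matches the client group named in the
# deploy.cfg section of this change set.
job_requirements = {
    "request_cpus": 4,
    "request_memory": 2000,        # MB; the structure above declares this key as "requst_memory"
    "request_disk": 100,           # GB
    "client_group": "njs",
    "client_group_regex": 0,       # booleans are 0/1 per @range [0,1]
    "ignore_concurrency_limits": 0,
    "scheduler_requirements": {},  # scheduler-specific key/value pairs
    "debug_mode": 0,
}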
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long """ return self._client.call_method( "execution_engine2.check_workspace_jobs", @@ -740,14 +1277,30 @@ def check_jobs_date_range_for_user(self, params, context=None): """ :param params: instance of type "CheckJobsDateRangeParams" (Check job for all jobs in a given date/time range for all users (Admin - function) float start_time; # Filter based on creation timestamp - since epoch float end_time; # Filter based on creation timestamp - since epoch list projection; # A list of fields to include - in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, - such as error_code=1, wsid=1234, terminated_code = 1 int limit; # - The maximum number of records to return string user; # Optional. - Defaults off of your token @optional projection @optional filter + function) Notes on start_time and end_time: These fields are + designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: - if the field is a + float or a string that contains a float and only a float, the + field value is treated as seconds since the epoch. - if the field + is an int or a string that contains an int and only an int, the + field value is treated as milliseconds since the epoch. - if the + field is a string not matching the criteria above, it is treated + as a date and time. Nearly any unambigous format can be parsed. + float start_time - Filter based on job creation timestamp since + epoch float end_time - Filter based on job creation timestamp + since epoch list projection - A list of fields to include + in the projection, default ALL See "Projection Fields" above + list filter - DEPRECATED: this field may change or be + removed in the future. A list of simple filters to "AND" together, + such as error_code=1, wsid=1234, terminated_code = 1 int limit - + The maximum number of records to return string user - The user + whose job records will be returned. Optional. Default is the + current user. int offset - the number of jobs to skip before + returning records. boolean ascending - true to sort by job ID + ascending, false descending. boolean as_admin - true to run the + query as an admin; user must have admin EE2 permissions. Required + if setting `user` to something other than your own. 
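A hedged sketch of a date-range query follows, showing the accepted time formats described above; the client import path is an assumption based on KBase client conventions.

from installed_clients.execution_engine2Client import execution_engine2

ee2 = execution_engine2(url="https://<ee2 endpoint>", token="<auth token>")
result = ee2.check_jobs_date_range_for_user({
    "start_time": 1609459200.0,  # float -> seconds since the epoch (2021-01-01 UTC)
    "end_time": "2021-07-01",    # non-numeric string -> parsed as a date/time
    "projection": ["status", "updated"],
    "limit": 100,
    "offset": 0,
    "ascending": 1,              # booleans are 0/1
})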
TODO: this + seems to have no effect @optional projection @optional filter @optional limit @optional user @optional offset @optional ascending) -> structure: parameter "start_time" of Double, parameter "end_time" of Double, parameter "projection" of list of @@ -755,9 +1308,34 @@ def check_jobs_date_range_for_user(self, params, context=None): Long, parameter "user" of String, parameter "offset" of Long, parameter "ascending" of type "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs could be mapping or list) -> - structure: parameter "job_states" of list of type "JobState" + :returns: instance of type "CheckJobsDateRangeResults" (Projection + Fields user = StringField(required=True) authstrat = StringField( + required=True, default="kbaseworkspace", + validation=valid_authstrat ) wsid = IntField(required=False) + status = StringField(required=True, validation=valid_status) + updated = DateTimeField(default=datetime.datetime.utcnow, + autonow=True) estimating = DateTimeField(default=None) # Time + when job began estimating running = DateTimeField(default=None) # + Time when job started # Time when job finished, errored out, or + was terminated by the user/admin finished = + DateTimeField(default=None) errormsg = StringField() msg = + StringField() error = DynamicField() terminated_code = + IntField(validation=valid_termination_code) error_code = + IntField(validation=valid_errorcode) scheduler_type = + StringField() scheduler_id = StringField() scheduler_estimator_id + = StringField() job_input = EmbeddedDocumentField(JobInput, + required=True) job_output = DynamicField() /* /* Results of + check_jobs_date_range methods. jobs - the jobs matching the query, + up to `limit` jobs. count - the number of jobs returned. + query_count - the number of jobs that matched the filters. filter + - DEPRECATED - this field may change in the future. The filters + that were applied to the jobs. skip - the number of jobs that were + skipped prior to beginning to return jobs. projection - the list + of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. sort_order - the + order in which the results were sorted by the job ID - + for + ascending, - for descending. TODO: DOCUMENT THE RETURN OF STATS + mapping) -> structure: parameter "jobs" of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the @@ -791,58 +1369,85 @@ def check_jobs_date_range_for_user(self, params, context=None): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. 
source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. 
This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. debug_mode: Whether to run the job in debug mode. + Default false.) 
-> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "count" of + Long, parameter "query_count" of Long, parameter "filter" of + mapping from String to String, parameter "skip" of Long, parameter + "projection" of list of String, parameter "limit" of Long, + parameter "sort_order" of String """ return self._client.call_method( "execution_engine2.check_jobs_date_range_for_user", @@ -855,14 +1460,30 @@ def check_jobs_date_range_for_all(self, params, context=None): """ :param params: instance of type "CheckJobsDateRangeParams" (Check job for all jobs in a given date/time range for all users (Admin - function) float start_time; # Filter based on creation timestamp - since epoch float end_time; # Filter based on creation timestamp - since epoch list projection; # A list of fields to include - in the projection, default ALL See "Projection Fields" - list filter; # A list of simple filters to "AND" together, - such as error_code=1, wsid=1234, terminated_code = 1 int limit; # - The maximum number of records to return string user; # Optional. - Defaults off of your token @optional projection @optional filter + function) Notes on start_time and end_time: These fields are + designated as floats but floats, ints, and strings are all + accepted. Times are determined as follows: - if the field is a + float or a string that contains a float and only a float, the + field value is treated as seconds since the epoch. - if the field + is an int or a string that contains an int and only an int, the + field value is treated as milliseconds since the epoch. - if the + field is a string not matching the criteria above, it is treated + as a date and time. Nearly any unambigous format can be parsed. + float start_time - Filter based on job creation timestamp since + epoch float end_time - Filter based on job creation timestamp + since epoch list projection - A list of fields to include + in the projection, default ALL See "Projection Fields" above + list filter - DEPRECATED: this field may change or be + removed in the future. A list of simple filters to "AND" together, + such as error_code=1, wsid=1234, terminated_code = 1 int limit - + The maximum number of records to return string user - The user + whose job records will be returned. Optional. Default is the + current user. int offset - the number of jobs to skip before + returning records. boolean ascending - true to sort by job ID + ascending, false descending. boolean as_admin - true to run the + query as an admin; user must have admin EE2 permissions. 
Required + if setting `user` to something other than your own. TODO: this + seems to have no effect @optional projection @optional filter @optional limit @optional user @optional offset @optional ascending) -> structure: parameter "start_time" of Double, parameter "end_time" of Double, parameter "projection" of list of @@ -870,9 +1491,34 @@ def check_jobs_date_range_for_all(self, params, context=None): Long, parameter "user" of String, parameter "offset" of Long, parameter "ascending" of type "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" (@range [0,1]) - :returns: instance of type "CheckJobsResults" (job_states - states of - jobs could be mapping or list) -> - structure: parameter "job_states" of list of type "JobState" + :returns: instance of type "CheckJobsDateRangeResults" (Projection + Fields user = StringField(required=True) authstrat = StringField( + required=True, default="kbaseworkspace", + validation=valid_authstrat ) wsid = IntField(required=False) + status = StringField(required=True, validation=valid_status) + updated = DateTimeField(default=datetime.datetime.utcnow, + autonow=True) estimating = DateTimeField(default=None) # Time + when job began estimating running = DateTimeField(default=None) # + Time when job started # Time when job finished, errored out, or + was terminated by the user/admin finished = + DateTimeField(default=None) errormsg = StringField() msg = + StringField() error = DynamicField() terminated_code = + IntField(validation=valid_termination_code) error_code = + IntField(validation=valid_errorcode) scheduler_type = + StringField() scheduler_id = StringField() scheduler_estimator_id + = StringField() job_input = EmbeddedDocumentField(JobInput, + required=True) job_output = DynamicField() /* /* Results of + check_jobs_date_range methods. jobs - the jobs matching the query, + up to `limit` jobs. count - the number of jobs returned. + query_count - the number of jobs that matched the filters. filter + - DEPRECATED - this field may change in the future. The filters + that were applied to the jobs. skip - the number of jobs that were + skipped prior to beginning to return jobs. projection - the list + of fields included in the returned job. By default all fields. + limit - the maximum number of jobs returned. sort_order - the + order in which the results were sorted by the job ID - + for + ascending, - for descending. TODO: DOCUMENT THE RETURN OF STATS + mapping) -> structure: parameter "jobs" of list of type "JobState" (job_id - string - id of the job user - string - user who started the job wsid - int - optional id of the workspace where the job is bound authstrat - string - what strategy used to authenticate the @@ -906,58 +1552,85 @@ def check_jobs_date_range_for_all(self, params, context=None): type "job_id" (A job id.), parameter "user" of String, parameter "authstrat" of String, parameter "wsid" of Long, parameter "status" of String, parameter "job_input" of type "RunJobParams" - (method - service defined in standard JSON RPC way, typically it's - module name from spec-file followed by '.' and name of funcdef - from spec-file corresponding to running method (e.g. 
- 'KBaseTrees.construct_species_tree' from trees service); params - - the parameters of the method that performed this call; Optional - parameters: service_ver - specific version of deployed service, - last version is used if this parameter is not defined rpc_context - - context of current method call including nested call history - remote_url - run remote service call instead of local command line - execution. source_ws_objects - denotes the workspace objects that - will serve as a source of data when running the SDK method. These - references will be added to the autogenerated provenance. app_id - - the id of the Narrative application (UI) running this job (e.g. - repo/name) mapping meta - user defined metadata to - associate with the job. wsid - an optional workspace id to - associate with the job. This is passed to the workspace service, - which will share the job based on the permissions of the workspace - rather than owner of the job parent_job_id - EE2 id of the parent - of a batch job. Batch jobs will add this id to the EE2 database - under the field "parent_job_id") -> structure: parameter "method" - of String, parameter "params" of list of unspecified object, - parameter "service_ver" of String, parameter "rpc_context" of type - "RpcContext" (call_stack - upstream calls details including nested - service calls and parent jobs where calls are listed in order from - outer to inner.) -> structure: parameter "call_stack" of list of - type "MethodCall" (time - the time the call was started; method - - service defined in standard JSON RPC way, typically it's module - name from spec-file followed by '.' and name of funcdef from - spec-file corresponding to running method (e.g. - 'KBaseTrees.construct_species_tree' from trees service); job_id - - job id if method is asynchronous (optional field).) -> structure: - parameter "time" of type "timestamp" (A time in the format - YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z - (representing the UTC timezone) or the difference in time to UTC - in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) - 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC - time)), parameter "method" of String, parameter "job_id" of type - "job_id" (A job id.), parameter "run_id" of String, parameter - "remote_url" of String, parameter "source_ws_objects" of list of - type "wsref" (A workspace object reference of the form X/Y or - X/Y/Z, where X is the workspace name or id, Y is the object name - or id, Z is the version, which is optional.), parameter "app_id" - of String, parameter "meta" of mapping from String to String, - parameter "wsid" of Long, parameter "parent_job_id" of String, - parameter "created" of Long, parameter "queued" of Long, parameter - "estimating" of Long, parameter "running" of Long, parameter - "finished" of Long, parameter "updated" of Long, parameter "error" - of type "JsonRpcError" (Error block of JSON RPC response) -> - structure: parameter "name" of String, parameter "code" of Long, - parameter "message" of String, parameter "error" of String, - parameter "error_code" of Long, parameter "errormsg" of String, - parameter "terminated_code" of Long + (method - the SDK method to run in module.method format, e.g. + 'KBaseTrees.construct_species_tree' params - the parameters to + pass to the method. Optional parameters: app_id - the id of the + Narrative application (UI) running this job (e.g. 
repo/name) + service_ver - specific version of deployed service, last version + is used if this parameter is not defined source_ws_objects - + denotes the workspace objects that will serve as a source of data + when running the SDK method. These references will be added to the + autogenerated provenance. Must be in UPA format (e.g. 6/90/4). + meta - Narrative metadata to associate with the job. wsid - an + optional workspace id to associate with the job. This is passed to + the workspace service, which will share the job based on the + permissions of the workspace rather than owner of the job + parent_job_id - EE2 job id for the parent of the current job. For + run_job and run_job_concierge, this value can be specified to + denote the parent job of the job being created. Warning: No + checking is done on the validity of the job ID, and the parent job + record is not altered. Submitting a job with a parent ID to + run_job_batch will cause an error to be returned. + job_requirements: the requirements for the job. The user must have + full EE2 administration rights to use this parameter. Note that + the job_requirements are not returned along with the rest of the + job parameters when querying the EE2 API - they are only + considered when submitting a job. as_admin: run the job with full + EE2 permissions, meaning that any supplied workspace IDs are not + checked for accessibility and job_requirements may be supplied. + The user must have full EE2 administration rights. Note that this + field is not included in returned data when querying EE2.) -> + structure: parameter "method" of String, parameter "app_id" of + String, parameter "params" of list of unspecified object, + parameter "service_ver" of String, parameter "source_ws_objects" + of list of type "wsref" (A workspace object reference of the form + X/Y/Z, where X is the workspace id, Y is the object id, Z is the + version.), parameter "meta" of type "Meta" (Narrative metadata for + a job. All fields are optional. run_id - the Narrative-assigned ID + of the job run. 1:1 with a job ID. token_id - the ID of the token + used to run the method. tag - the release tag, e.g. + dev/beta/release. cell_id - the ID of the narrative cell from + which the job was run.) -> structure: parameter "run_id" of + String, parameter "token_id" of String, parameter "tag" of String, + parameter "cell_id" of String, parameter "wsid" of Long, parameter + "parent_job_id" of String, parameter "job_requirements" of type + "JobRequirements" (Job requirements for a job. All fields are + optional. To submit job requirements, the user must have full EE2 + admin permissions. Ignored for the run concierge endpoint. + request_cpus: the number of CPUs to request for the job. + request_memory: the amount of memory, in MB, to request for the + job. request_disk: the amount of disk space, in GB, to request for + the job. client_group: the name of the client group on which to + run the job. client_group_regex: Whether to treat the client group + string, whether provided here, from the catalog, or as a default, + as a regular expression when matching clientgroups. Default True + for HTC, but the default depends on the scheduler. Omit to use the + default. bill_to_user: the job will be counted against the + provided user's fair share quota. ignore_concurrency_limits: + ignore any limits on simultaneous job runs. Default false. + scheduler_requirements: arbitrary key-value pairs to be provided + to the job scheduler. Requires knowledge of the scheduler + interface. 
debug_mode: Whether to run the job in debug mode. + Default false.) -> structure: parameter "request_cpus" of Long, + parameter "requst_memory" of Long, parameter "request_disk" of + Long, parameter "client_group" of String, parameter + "client_group_regex" of type "boolean" (@range [0,1]), parameter + "bill_to_user" of String, parameter "ignore_concurrency_limits" of + type "boolean" (@range [0,1]), parameter "scheduler_requirements" + of mapping from String to String, parameter "debug_mode" of type + "boolean" (@range [0,1]), parameter "as_admin" of type "boolean" + (@range [0,1]), parameter "created" of Long, parameter "queued" of + Long, parameter "estimating" of Long, parameter "running" of Long, + parameter "finished" of Long, parameter "updated" of Long, + parameter "error" of type "JsonRpcError" (Error block of JSON RPC + response) -> structure: parameter "name" of String, parameter + "code" of Long, parameter "message" of String, parameter "error" + of String, parameter "error_code" of Long, parameter "errormsg" of + String, parameter "terminated_code" of Long, parameter "count" of + Long, parameter "query_count" of Long, parameter "filter" of + mapping from String to String, parameter "skip" of Long, parameter + "projection" of list of String, parameter "limit" of Long, + parameter "sort_order" of String """ return self._client.call_method( "execution_engine2.check_jobs_date_range_for_all", @@ -993,7 +1666,7 @@ def get_admin_permission(self, context=None): """ Check if current user has ee2 admin rights. If so, return the type of rights and their roles - :returns: instance of type "AdminRolesResults" (str permission; # One + :returns: instance of type "AdminRolesResults" (str permission - One of 'r|w|x' (('read' | 'write' | 'none'))) -> structure: parameter "permission" of String """ @@ -1003,6 +1676,7 @@ def get_admin_permission(self, context=None): def get_client_groups(self, context=None): """ + Get a list of clientgroups manually extracted from the config file :returns: instance of list of String """ return self._client.call_method( diff --git a/pyproject.toml b/pyproject.toml index 757d99be5..234b8d86b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,8 @@ exclude = ''' | __pycache__ | lib/__pycache__ | lib/execution_engine2/execution_engine2Impl.py + | lib/execution_engine2/authclient.py + | lib/biokbase/log.py | lib/installed_clients/* ) ''' diff --git a/requirements-dev.txt b/requirements-dev.txt index 366b4859e..06e45c8e1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,6 @@ -wheel +-i https://pypi.org/simple/ aiofiles==0.4.0 -aiohttp==3.6.3 +aiohttp==3.7.4 asn1crypto==1.3.0 async-timeout==3.0.1 attrs==20.2.0 @@ -12,7 +12,7 @@ codecov==2.0.15 configparser==3.7.4 confluent-kafka==1.5.0 coverage==4.5.3 -cryptography==3.2 +cryptography==3.3.2 docker==4.3.1 gevent==20.9.0 gprof2dot==2019.11.30 @@ -28,34 +28,35 @@ hyperframe==5.2.0 idna==2.8 importlib-metadata==2.0.0 iniconfig==1.1.1 -Jinja2==2.10.3 -JSONRPCBase==0.2.0 -MarkupSafe==1.1.1 +jinja2==2.11.3 +jsonrpcbase==0.2.0 +maps==5.1.1 +markupsafe==1.1.1 memory-profiler==0.55.0 mock==3.0.5 -mongoengine==0.18.2 +mongoengine==0.23.0 multidict==4.5.2 nose==1.3.7 -packaging==20.4 +packaging==20.9 pluggy==0.13.1 psutil==5.6.6 -py==1.9.0 +py==1.10.0 pycosat==0.6.3 pycparser==2.19 pymongo==3.8.0 -pyOpenSSL==19.1.0 +pyopenssl==19.1.0 pyparsing==2.4.7 -PySocks==1.7.1 -pytest==6.1.1 +pysocks==1.7.1 pytest-cov==2.8.1 pytest-profiling==1.7.0 +pytest==6.1.1 python-dateutil==2.8.0 python-dotenv==0.10.3 
-requests==2.22.0 requests-async==0.5.0 requests-mock==1.7.0 +requests==2.22.0 rfc3986==1.3.2 -ruamel-yaml==0.15.87 +ruamel.yaml==0.15.87 sanic==19.6.0 sentry-sdk==0.14.3 six==1.14.0 @@ -64,7 +65,7 @@ toml==0.10.1 tqdm==4.42.1 typing-extensions==3.7.4.3 ujson==1.35 -urllib3==1.25.3 +urllib3==1.25.8 uvloop==0.12.2 websocket-client==0.57.0 websockets==6.0 diff --git a/requirements.txt b/requirements.txt index bf77a6d0f..618dc4908 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,73 +1,30 @@ -aiofiles==0.4.0 -aiohttp==3.6.3 -asn1crypto==1.3.0 -async-timeout==3.0.1 -attrs==20.2.0 -cachetools==3.1.1 -certifi==2019.6.16 -cffi==1.14.0 -chardet==3.0.4 -codecov==2.0.15 -configparser==3.7.4 -confluent-kafka==1.5.0 -coverage==4.5.3 -cryptography==3.2 -docker==4.3.1 -gevent==20.9.0 -gprof2dot==2019.11.30 -greenlet==0.4.17 -gunicorn==20.0.4 -h11==0.8.1 -h2==3.1.0 -hpack==3.0.0 -htcondor==8.9.8 -httpcore==0.3.0 -httptools==0.0.13 -hyperframe==5.2.0 -idna==2.8 -importlib-metadata==2.0.0 -iniconfig==1.1.1 -Jinja2==2.10.3 +-i https://pypi.org/simple/ +aiofiles==0.7.0 +aiohttp==3.7.4.post0 +cachetools==4.2.2 +codecov==2.1.11 +configparser==5.0.2 +confluent-kafka==1.7.0 +coverage==5.5 +docker==5.0.0 +gevent==21.1.2 +greenlet==1.1.0 +gunicorn==20.1.0 +htcondor==9.1.0 +Jinja2==3.0.1 JSONRPCBase==0.2.0 -MarkupSafe==1.1.1 -memory-profiler==0.55.0 -mock==3.0.5 -mongoengine==0.18.2 -multidict==4.5.2 -nose==1.3.7 -packaging==20.4 -pluggy==0.13.1 -psutil==5.6.6 -py==1.9.0 -pycosat==0.6.3 -pycparser==2.19 -pymongo==3.8.0 -pyOpenSSL==19.1.0 -pyparsing==2.4.7 -PySocks==1.7.1 -pytest==6.1.1 -pytest-cov==2.8.1 -pytest-profiling==1.7.0 -python-dateutil==2.8.0 -python-dotenv==0.10.3 -requests==2.22.0 -requests-async==0.5.0 -requests-mock==1.7.0 -rfc3986==1.3.2 -ruamel-yaml==0.15.87 -sanic==19.6.0 -sentry-sdk==0.14.3 -six==1.14.0 -slackclient==2.7.1 -toml==0.10.1 -tqdm==4.42.1 -typing-extensions==3.7.4.3 -ujson==1.35 -urllib3==1.25.3 -uvloop==0.12.2 -websocket-client==0.57.0 -websockets==6.0 -yarl==1.5.1 -zipp==3.3.1 -zope.event==4.5.0 -zope.interface==5.1.2 +mock==4.0.3 +maps==5.1.1 +mongoengine==0.23.1 +psutil==5.8.0 +pymongo==3.12.0 +pytest==6.2.4 +pytest-cov==2.12.1 +python-dateutil==2.8.2 +python-dotenv==0.18.0 +requests==2.25.1 +requests-mock==1.9.3 +sanic==21.6.0 +slackclient==2.9.3 +toml==0.10.2 +urllib3==1.26.6 diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index 642ca69f3..2d5fbbab2 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -2,25 +2,29 @@ cp ./deploy.cfg ./work/config.properties -#condor_shared=condor_shared - - -if [ $# -eq 0 ] ; then +if [ $# -eq 0 ]; then useradd kbase - if [ "${POOL_PASSWORD}" ] ; then - /usr/sbin/condor_store_cred -p "${POOL_PASSWORD}" -f /etc/condor/password - chown kbase:kbase /etc/condor/password + if [ "${POOL_PASSWORD}" ]; then + /usr/sbin/condor_store_cred -p "${POOL_PASSWORD}" -f /etc/condor/password + chown kbase:kbase /etc/condor/password fi chown kbase /etc/condor/password + + # Copy downloaded JobRunner to a shared volume mount cp -rf /runner/JobRunner.tgz /condor_shared cp -rf ./scripts/execute_runner.sh /condor_shared + # Give permissions to transfer logs into here mkdir /condor_shared/runner_logs && chown kbase /condor_shared/runner_logs mkdir /condor_shared/cluster_logs && chown kbase /condor_shared/cluster_logs + # Save ENV Variables to file for cron and Remove _=/usr/bin/env + envsubst /etc/environment + chmod a+rw /etc/environment + service cron start sh ./scripts/start_server.sh -elif [ "${1}" = "test" ] ; then +elif [ "${1}" = 
"test" ]; then echo "Run Tests" make test diff --git a/test/deploy.cfg b/test/deploy.cfg index 17889078e..fa520f56c 100644 --- a/test/deploy.cfg +++ b/test/deploy.cfg @@ -28,6 +28,7 @@ mongo-database = ee2 mongo-user = travis mongo-password = travis mongo-authmechanism = DEFAULT +mongo-retry-rewrites = False # mongo-in-docker-compose = mini_kb_ci-mongo_1 # mongo-in-docker-compose = condor_mongo_1 @@ -51,6 +52,11 @@ transfer_input_files = ../scripts/JobRunner.tgz # Log Level and sending DEBUG=true to the jobs, which means containers do not get cleaned up debug = false +#---------------------------------------------------------------------------------------# +[concierge] +request_cpus = 4 +request_memory = 23000M +request_disk = 100GB #---------------------------------------------------------------------------------------# [njs] request_cpus = 4 @@ -80,7 +86,7 @@ request_disk = 100GB [hpc] request_cpus = 4 request_memory = 2000M -request_disk = 100GBraiss +request_disk = 100GB #---------------------------------------------------------------------------------------# [DEFAULT] default_client_group = njs diff --git a/test/dockerfiles/condor/requirements.txt b/test/dockerfiles/condor/requirements.txt index f447095f0..a7f499bb6 100644 --- a/test/dockerfiles/condor/requirements.txt +++ b/test/dockerfiles/condor/requirements.txt @@ -18,7 +18,7 @@ requests-async==0.5.0 rfc3986==1.3.2 sanic==19.6.0 ujson==1.35 -urllib3==1.25.3 +urllib3==1.26.5 uvloop==0.12.2 websockets==6.0 htcondor==8.9.2 diff --git a/test/test_clients/authclient.py b/test/test_clients/authclient.py index 2087f463f..950b5760f 100644 --- a/test/test_clients/authclient.py +++ b/test/test_clients/authclient.py @@ -12,7 +12,7 @@ class TokenCache(object): - """ A basic cache for tokens. """ + """A basic cache for tokens.""" _MAX_TIME_SEC = 5 * 60 # 5 min diff --git a/test/tests_for_auth/ee2_admin_mode_test.py b/test/tests_for_auth/ee2_admin_mode_test.py index abf89cbe3..15ee0682a 100644 --- a/test/tests_for_auth/ee2_admin_mode_test.py +++ b/test/tests_for_auth/ee2_admin_mode_test.py @@ -2,30 +2,44 @@ import os import unittest from configparser import ConfigParser +from unittest.mock import create_autospec import bson from mock import MagicMock from mock import patch +from execution_engine2.authorization.roles import AdminAuthUtil +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.db.models.models import Status +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.utils.clients import ( + UserClientSet, + ClientSet, + get_client_set, + get_user_client_set, +) from installed_clients.CatalogClient import Catalog -from lib.execution_engine2.authorization.roles import AdminAuthUtil -from lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth -from lib.execution_engine2.db.models.models import Status -from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.Condor import Condor -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo +from installed_clients.WorkspaceClient import Workspace +from test.utils_shared.mock_utils import get_client_mocks as _get_client_mocks from test.utils_shared.test_utils import ( get_sample_job_params, get_sample_condor_info, ) +# Cause any tests that contact 
external services (e.g. KBASE CI auth) as part of the test to +# pass automatically. +SKIP_TESTS_WITH_EXTERNALITIES = False + class EE2TestAdminMode(unittest.TestCase): @classmethod def setUpClass(cls): - config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") + cls.config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") config_parser = ConfigParser() - config_parser.read(config_file) + config_parser.read(cls.config_file) cls.cfg = {} @@ -40,28 +54,18 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token - ) - def setUp(self) -> None: """ - Patch out Catalog and Condor + Patch out Condor :return: """ - self.catalog_patch = patch( - "lib.installed_clients.CatalogClient.Catalog.get_module_version" - ) - self.catalog = self.catalog_patch.start() - self.catalog.return_value = {"git_commit_hash": "moduleversiongoeshere"} - si = SubmissionInfo(clusterid="123", submit={}, error=None) self.condor_patch = patch.object( target=Condor, attribute="run_job", return_value=si ) self.condor_patch2 = patch.object( target=Condor, - attribute="get_job_info", + attribute="_get_job_info", return_value=get_sample_condor_info(), ) @@ -78,15 +82,17 @@ def setUp(self) -> None: # self.good_job_id_user2 = setup_runner.run_job(params=job_params_1,as_admin=False) def tearDown(self) -> None: - self.catalog_patch.stop() self.condor_patch.stop() self.condor_patch2.start() - def getRunner(self) -> SDKMethodRunner: + def getRunner(self, user_clients=None, clients=None) -> SDKMethodRunner: # Initialize these clients from None - runner = SDKMethodRunner( - self.cfg, user_id=self.user_id, token=self.token - ) # type : SDKMethodRunner + if not user_clients: + user_clients = get_user_client_set(self.cfg, self.user_id, self.token) + if not clients: + with open(self.config_file) as cf: + clients = get_client_set(self.cfg, cf) + runner = SDKMethodRunner(user_clients, clients) # type : SDKMethodRunner runner.get_jobs_status() runner.get_runjob() runner.get_job_logs() @@ -96,7 +102,7 @@ def getRunner(self) -> SDKMethodRunner: def get_runner_with_condor(self) -> SDKMethodRunner: runner = self.getRunner() condor = MagicMock(return_value={}) - condor.get_job_info = MagicMock(return_value="") + condor._get_job_info = MagicMock(return_value="") condor.get_job_resource_info = MagicMock(return_value="njs") runner.condor = condor @@ -104,21 +110,47 @@ def get_runner_with_condor(self) -> SDKMethodRunner: # TODO How do you test ADMIN_MODE without increasing too much coverage - @patch.object(Catalog, "get_module_version", return_value="module.version") - @patch.object(WorkspaceAuth, "can_write", return_value=True) - @patch.object(AdminAuthUtil, "_fetch_user_roles") - def test_regular_user(self, aau, workspace, catalog): + def get_user_mocks( + self, user_id=None, token=None + ) -> (UserClientSet, Workspace, WorkspaceAuth): + user_id = user_id if user_id else self.user_id + token = token if token else self.token + ws = create_autospec(Workspace, instance=True, spec_set=True) + wsa = create_autospec(WorkspaceAuth, instance=True, spec_set=True) + ucs = UserClientSet(user_id, token, ws, wsa) + return ucs, ws, wsa + + def get_client_mocks(self, *to_be_mocked): + return _get_client_mocks(self.cfg, self.config_file, *to_be_mocked) + + def test_regular_user(self): # Regular User lowly_user = "Access Denied: You are not an administrator" - runner = self.getRunner() - aau.return_value = ["RegularJoe"] + user_client_set, _, 
ws_auth = self.get_user_mocks() + clients_and_mocks = self.get_client_mocks(AdminAuthUtil, Catalog) + aau = clients_and_mocks[AdminAuthUtil] + catalog = clients_and_mocks[Catalog] + # TODO check catalog called as expected + catalog.get_module_version.return_value = { + "git_commit_hash": "moduleversiongoeshere" + } + catalog.list_client_group_configs.return_value = [] + aau.get_admin_role.return_value = None + ws_auth.can_write.return_value = True + runner = self.getRunner(user_client_set, clients_and_mocks[ClientSet]) method_1 = "module_name.function_name" - job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) + job_params_1 = get_sample_job_params( + method=method_1, wsid=self.ws_id, app_id="module_name/foo" + ) # Check Admin Status is_admin = runner.check_is_admin() self.assertFalse(is_admin) + aau.get_admin_role.assert_called_once_with( + self.token, ADMIN_READ_ROLE, ADMIN_WRITE_ROLE + ) + # Check Admin Status admin_type = runner.get_admin_permission() self.assertEqual(admin_type, {"permission": "n"}) @@ -127,9 +159,10 @@ def test_regular_user(self, aau, workspace, catalog): job_id = runner.run_job(params=job_params_1, as_admin=False) self.assertTrue(bson.objectid.ObjectId.is_valid(job_id)) + ws_auth.can_write.assert_called_once_with(self.ws_id) # RUNJOB BUT ATTEMPT TO BE AN ADMIN - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_user ): runner.run_job(params=job_params_1, as_admin=True) @@ -139,7 +172,7 @@ def test_regular_user(self, aau, workspace, catalog): self.assertEqual(params["method"], job_params_1["method"]) # get_job_params BUT ATTEMPT TO BE AN ADMIN - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_user ): runner.get_job_params(job_id=job_id, as_admin=True) @@ -155,12 +188,12 @@ def test_regular_user(self, aau, workspace, catalog): runner.view_job_logs(job_id=job_id) # add_job_logs and view them, BUT ATTEMPT TO BE AN ADMIN - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_user ): runner.add_job_logs(job_id=job_id, log_lines=lines, as_admin=True) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_user ): runner.view_job_logs(job_id=job_id, as_admin=True) @@ -180,16 +213,24 @@ def test_regular_user(self, aau, workspace, catalog): # TODO do the above with as_admin=True and assert failure each time - # Start the job and get it's status as an admin + # Start the job and get its status as an admin - @patch.object(Catalog, "get_module_version", return_value="module.version") @patch.object(WorkspaceAuth, "can_write", return_value=True) - @patch.object(AdminAuthUtil, "_fetch_user_roles") - def test_admin_writer(self, aau, workspace, catalog): + def test_admin_writer(self, workspace): # Admin User with WRITE - runner = self.getRunner() - aau.return_value = [runner.ADMIN_READ_ROLE] + clients_and_mocks = self.get_client_mocks(AdminAuthUtil, Catalog) + clients = clients_and_mocks[ClientSet] + adminauth = clients_and_mocks[AdminAuthUtil] + catalog = clients_and_mocks[Catalog] + # TODO check catalog called as expected + catalog.get_module_version.return_value = { + "git_commit_hash": "moduleversiongoeshere" + } + catalog.list_client_group_configs.return_value = [] + + runner = self.getRunner(None, clients) + adminauth.get_admin_role.return_value = ADMIN_READ_ROLE method_1 = 
"module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -197,14 +238,20 @@ def test_admin_writer(self, aau, workspace, catalog): is_admin = runner.check_is_admin() self.assertTrue(is_admin) + adminauth.get_admin_role.assert_called_once_with( + self.token, ADMIN_READ_ROLE, ADMIN_WRITE_ROLE + ) + # Admin User with WRITE - runner = self.getRunner() + runner = self.getRunner(None, clients) # SET YOUR ADMIN STATUS HERE - aau.return_value = [runner.ADMIN_WRITE_ROLE] + adminauth.get_admin_role.return_value = ADMIN_WRITE_ROLE method_1 = "module_name.function_name" - job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) + job_params_1 = get_sample_job_params( + method=method_1, wsid=self.ws_id, app_id="module_name/foo" + ) # Check Admin Status is_admin = runner.check_is_admin() @@ -231,23 +278,29 @@ def test_admin_writer(self, aau, workspace, catalog): # These tests should throw the most errors def test_no_user(self): - # No Token + if SKIP_TESTS_WITH_EXTERNALITIES: + return + # Passes a fake token to the auth server, guaranteed to fail. + # Auth is *not mocked*, hits the real auth service. Will fail if CI is down. + # Not sure of the value of this test - if a client actually passes a bad token to the + # server it'll get caught in the Server.py file before the Impl file is reached. runner = self.getRunner() method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=RuntimeError, expected_regex=r"ServerError\('Token validation failed: Login failed! Server responded with code 401 Unauthorized'\)", ): runner.run_job(params=job_params_1, as_admin=False) - @patch.object(AdminAuthUtil, "_fetch_user_roles") - def test_admin_reader(self, aau): - # Admin User with WRITE + def test_admin_reader(self): + # Admin User with READ lowly_admin = r"Access Denied: You are a read-only admin. 
This function requires write access" - runner = self.getRunner() - aau.return_value = [runner.ADMIN_READ_ROLE] + clients_and_mocks = self.get_client_mocks(AdminAuthUtil) + adminauth = clients_and_mocks[AdminAuthUtil] + runner = self.getRunner(None, clients_and_mocks[ClientSet]) + adminauth.get_admin_role.return_value = ADMIN_READ_ROLE method_1 = "module_name.function_name" job_params_1 = get_sample_job_params(method=method_1, wsid=self.ws_id) @@ -255,12 +308,16 @@ def test_admin_reader(self, aau): is_admin = runner.check_is_admin() self.assertTrue(is_admin) + adminauth.get_admin_role.assert_called_once_with( + self.token, ADMIN_READ_ROLE, ADMIN_WRITE_ROLE + ) + # Check Admin Status admin_type = runner.get_admin_permission() self.assertEqual(admin_type, {"permission": "r"}) # RUNJOB - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=PermissionError, expected_regex=lowly_admin ): runner.run_job(params=job_params_1, as_admin=True) diff --git a/test/tests_for_auth/ee2_authstrategy_test.py b/test/tests_for_auth/ee2_authstrategy_test.py index cae61d41e..a4cfd776d 100644 --- a/test/tests_for_auth/ee2_authstrategy_test.py +++ b/test/tests_for_auth/ee2_authstrategy_test.py @@ -15,6 +15,9 @@ custom_ws_perm_maker, ) +from installed_clients.WorkspaceClient import Workspace +from lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth + class AuthStrategyTestCase(unittest.TestCase): @classmethod @@ -73,13 +76,18 @@ def _mock_ws_deleted(self, rq_mock, ws_id): "POST", self.ws_url, [{"json": response, "status_code": 500}] ) + def _get_workspace_auth(self, token) -> WorkspaceAuth: + # TODO these tests can be converted to unit tests by mocking the WorkspaceAuth class + return WorkspaceAuth(self.user, Workspace(url=self.ws_url, token=token)) + @requests_mock.Mocker() def test_can_read_job_ok(self, rq_mock): rq_mock.add_matcher(custom_ws_perm_maker(self.user, self.ws_access)) (jobs, expected_perms) = self._generate_all_test_jobs(perm="read") for idx, job in enumerate(jobs): self.assertEqual( - expected_perms[idx], can_read_job(job, self.user, "foo", self.cfg) + expected_perms[idx], + can_read_job(job, self.user, self._get_workspace_auth("foo")), ) @requests_mock.Mocker() @@ -89,7 +97,7 @@ def test_can_read_job_fail(self, rq_mock): user=self.other_user, wsid=123, authstrat="kbaseworkspace" ) with self.assertRaises(RuntimeError) as e: - can_read_job(job, self.user, "token", self.cfg) + can_read_job(job, self.user, self._get_workspace_auth("token")) self.assertIn("Workspace 123 is deleted", str(e.exception)) @requests_mock.Mocker() @@ -98,7 +106,8 @@ def test_can_write_job_ok(self, rq_mock): (jobs, expected_perms) = self._generate_all_test_jobs(perm="write") for idx, job in enumerate(jobs): self.assertEqual( - expected_perms[idx], can_write_job(job, self.user, "foo", self.cfg) + expected_perms[idx], + can_write_job(job, self.user, self._get_workspace_auth("foo")), ) @requests_mock.Mocker() @@ -108,7 +117,7 @@ def test_can_write_job_fail(self, rq_mock): user=self.other_user, wsid=123, authstrat="kbaseworkspace" ) with self.assertRaises(RuntimeError) as e: - can_write_job(job, self.user, "token", self.cfg) + can_write_job(job, self.user, self._get_workspace_auth("token")) self.assertIn("Workspace 123 is deleted", str(e.exception)) @requests_mock.Mocker() @@ -121,7 +130,8 @@ def test_can_read_jobs_ok(self, rq_mock): (jobs, expected_perms) = self._generate_all_test_jobs(perm="read") for idx, job in enumerate(jobs): self.assertEqual( - [expected_perms[idx]], 
can_read_jobs([job], self.user, "foo", self.cfg) + [expected_perms[idx]], + can_read_jobs([job], self.user, self._get_workspace_auth("foo")), ) @requests_mock.Mocker() @@ -131,7 +141,7 @@ def test_can_read_jobs_fail(self, rq_mock): user=self.other_user, wsid=123, authstrat="kbaseworkspace" ) with self.assertRaises(RuntimeError) as e: - can_read_jobs([job], self.user, "token", self.cfg) + can_read_jobs([job], self.user, self._get_workspace_auth("token")) self.assertIn("Workspace 123 is deleted", str(e.exception)) @requests_mock.Mocker() @@ -140,7 +150,8 @@ def test_can_write_jobs_ok(self, rq_mock): (jobs, expected_perms) = self._generate_all_test_jobs(perm="write") for idx, job in enumerate(jobs): self.assertEqual( - [expected_perms[idx]], can_write_jobs([job], self.user, "foo", self.cfg) + [expected_perms[idx]], + can_write_jobs([job], self.user, self._get_workspace_auth("foo")), ) @requests_mock.Mocker() @@ -150,5 +161,5 @@ def test_can_write_jobs_fail(self, rq_mock): user=self.other_user, wsid=123, authstrat="kbaseworkspace" ) with self.assertRaises(RuntimeError) as e: - can_write_jobs([job], self.user, "token", self.cfg) + can_write_jobs([job], self.user, self._get_workspace_auth("token")) self.assertIn("Workspace 123 is deleted", str(e.exception)) diff --git a/test/tests_for_auth/ee2_workspaceauth_test.py b/test/tests_for_auth/ee2_workspaceauth_test.py index af4d614c0..4412afb3b 100644 --- a/test/tests_for_auth/ee2_workspaceauth_test.py +++ b/test/tests_for_auth/ee2_workspaceauth_test.py @@ -3,6 +3,7 @@ import requests_mock +from installed_clients.WorkspaceClient import Workspace from execution_engine2.authorization.workspaceauth import WorkspaceAuth from test.utils_shared.test_utils import read_config_into_dict @@ -37,13 +38,17 @@ def _mock_ws_deleted(self, rq_mock, ws_id): "POST", self.ws_url, [{"json": response, "status_code": 500}] ) + def _get_ws(self, token) -> Workspace: + # TODO these tests can be converted to unit tests by mocking the Workspace class + return Workspace(url=self.ws_url, token=token) + @requests_mock.Mocker() def test_can_read_ok(self, rq_mock): cases = {"123": True, "456": True, "789": False, "321": True} ws_id_map = {"123": "r", "456": "a", "789": "n", "321": "w"} for ws_id in ws_id_map.keys(): self._mock_ok_ws_perms(rq_mock, self.user, {ws_id: ws_id_map[ws_id]}) - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) perms = wsauth.can_read(ws_id) self.assertEqual(perms, cases[ws_id]) @@ -52,7 +57,7 @@ def test_can_read_fail(self, rq_mock): ws_id = 67890 self._mock_ws_deleted(rq_mock, ws_id) with self.assertRaises(RuntimeError) as e: - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) wsauth.can_read(ws_id) self.assertIn( "An error occurred while fetching user permissions from the Workspace", @@ -65,7 +70,7 @@ def test_can_write_ok(self, rq_mock): ws_id_map = {"123": "r", "456": "a", "789": "n", "321": "w"} for ws_id in ws_id_map.keys(): self._mock_ok_ws_perms(rq_mock, self.user, {ws_id: ws_id_map[ws_id]}) - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) perms = wsauth.can_write(ws_id) self.assertEqual(perms, cases[ws_id]) @@ -74,7 +79,7 @@ def test_can_write_fail(self, rq_mock): ws_id = 67890 self._mock_ws_deleted(rq_mock, ws_id) with self.assertRaises(RuntimeError) as e: - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, 
self._get_ws("foo")) wsauth.can_write(ws_id) self.assertIn( "An error occurred while fetching user permissions from the Workspace", @@ -86,7 +91,7 @@ def test_can_read_list_ok(self, rq_mock): ws_id_map = {"123": "r", "456": "a", "789": "n", "321": "w"} cases = {"123": True, "456": True, "789": False, "321": True} self._mock_ok_ws_perms(rq_mock, self.user, ws_id_map) - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) perms = wsauth.can_read_list(list(ws_id_map.keys())) self.assertEqual(perms, cases) @@ -95,7 +100,7 @@ def test_can_read_list_fail(self, rq_mock): ws_id = 67890 self._mock_ws_deleted(rq_mock, ws_id) with self.assertRaises(RuntimeError) as e: - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) wsauth.can_read_list([ws_id]) self.assertIn( "An error occurred while fetching user permissions from the Workspace", @@ -107,7 +112,7 @@ def test_can_write_list_ok(self, rq_mock): ws_id_map = {"123": "r", "456": "a", "789": "n", "321": "w"} cases = {"123": False, "456": True, "789": False, "321": True} self._mock_ok_ws_perms(rq_mock, self.user, ws_id_map) - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) perms = wsauth.can_write_list(list(ws_id_map.keys())) self.assertEqual(perms, cases) @@ -116,7 +121,7 @@ def test_can_write_list_fail(self, rq_mock): ws_id = 67890 self._mock_ws_deleted(rq_mock, ws_id) with self.assertRaises(RuntimeError) as e: - wsauth = WorkspaceAuth("foo", self.user, self.ws_url) + wsauth = WorkspaceAuth(self.user, self._get_ws("foo")) wsauth.can_write_list([ws_id]) self.assertIn( "An error occurred while fetching user permissions from the Workspace", diff --git a/test/tests_for_db/ee2_MongoUtil_test.py b/test/tests_for_db/ee2_MongoUtil_test.py index eb558df01..9591f16a1 100644 --- a/test/tests_for_db/ee2_MongoUtil_test.py +++ b/test/tests_for_db/ee2_MongoUtil_test.py @@ -2,12 +2,13 @@ import logging import os import unittest -from configparser import ConfigParser +from datetime import datetime from bson.objectid import ObjectId from execution_engine2.db.MongoUtil import MongoUtil -from execution_engine2.db.models.models import Job, JobLog +from execution_engine2.db.models.models import Job, JobLog, Status +from execution_engine2.sdk.EE2Runjob import JobIdPair from test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -57,8 +58,89 @@ def test_init_ok(self): mongo_util = self.getMongoUtil() self.assertTrue(set(class_attri) <= set(mongo_util.__dict__.keys())) + def test_insert_jobs(self): + """Check to see that jobs are inserted into mongo""" + job = get_example_job(status=Status.created.value) + job2 = get_example_job(status=Status.created.value) + jobs_to_insert = [job, job2] + job_ids = self.getMongoUtil().insert_jobs(jobs_to_insert) + assert len(job_ids) == len(jobs_to_insert) + retrieved_jobs = self.getMongoUtil().get_jobs(job_ids=job_ids) + + for i, retrieved_job in enumerate(retrieved_jobs): + assert jobs_to_insert[i].to_json() == retrieved_job.to_json() + + def test_update_jobs_enmasse(self): + """Check to see that created jobs get updated to queued""" + for state in Status: + job = get_example_job(status=Status.created.value, scheduler_id=None) + job2 = get_example_job(status=state.value, scheduler_id=None) + job3 = get_example_job(status=state.value, scheduler_id=None) + jobs = [job, job2, job3] + + for j in jobs: + j.scheduler_id = None + j.save() + assert 
j.scheduler_id is None + + job_ids = [job.id, job2.id, job3.id] + scheduler_ids = ["humpty", "dumpty", "alice"] + jobs_to_update = list(map(JobIdPair, job_ids, scheduler_ids)) + + now_ms = datetime.utcnow().timestamp() + + self.getMongoUtil().update_jobs_to_queued(jobs_to_update) + job.reload() + job2.reload() + job3.reload() + + # Check that sched ids are set + for i, val in enumerate(scheduler_ids): + assert jobs[i].scheduler_id == val + assert jobs[i].scheduler_type == "condor" + + # Checks that a timestamp in seconds since the epoch is within a second of the current time. + for j in jobs: + assert now_ms + 1 > j.updated + assert now_ms - 1 < j.updated + + # First job always should transition to queued + assert job.status == Status.queued.value + + # Created jobs should transition + if state.value == Status.created.value: + assert all(j.status == Status.queued.value for j in [job, job2, job3]) + + else: + # Don't change their state + assert all(j.status == state.value for j in [job2, job3]) + + def test_update_jobs_enmasse_bad_job_pairs(self): + job = get_example_job(status=Status.created.value).save() + job2 = get_example_job(status=Status.created.value).save() + job3 = get_example_job(status=Status.created.value).save() + job_ids = [job.id, job2.id, job3.id] + scheduler_ids = [job.scheduler_id, job2.scheduler_id, None] + job_id_pairs = list(map(JobIdPair, job_ids, scheduler_ids)) + + with self.assertRaisesRegex( + expected_exception=ValueError, + expected_regex=f"Provided a bad job_id_pair, missing scheduler_id for {job3.id}", + ): + self.getMongoUtil().update_jobs_to_queued(job_id_pairs) + + job_ids = [job.id, job2.id, None] + scheduler_ids = [job.scheduler_id, job2.scheduler_id, job3.scheduler_id] + job_id_pairs = list(map(JobIdPair, job_ids, scheduler_ids)) + + with self.assertRaisesRegex( + expected_exception=ValueError, + expected_regex=f"Provided a bad job_id_pair, missing job_id for {job3.scheduler_id}", + ): + self.getMongoUtil().update_jobs_to_queued(job_id_pairs) + def test_get_by_cluster(self): - """ Get a job by its condor scheduler_id""" + """Get a job by its condor scheduler_id""" mongo_util = self.getMongoUtil() with mongo_util.mongo_engine_connection(): job = get_example_job() @@ -67,7 +149,6 @@ def test_get_by_cluster(self): self.assertEqual(str(job_id), batch) def test_get_job_ok(self): - mongo_util = self.getMongoUtil() with mongo_util.mongo_engine_connection(): @@ -90,6 +171,8 @@ def test_get_job_ok(self): "scheduler_id", "child_jobs", "batch_job", + "retry_ids", + "retry_saved_toggle", ] self.assertCountEqual(job.keys(), expected_keys) @@ -110,6 +193,8 @@ def test_get_job_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] self.assertCountEqual(job.keys(), expected_keys) @@ -129,6 +214,8 @@ def test_get_job_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] self.assertCountEqual(job.keys(), expected_keys) @@ -136,7 +223,6 @@ def test_get_job_ok(self): self.assertEqual(ori_job_count, Job.objects.count()) def test_get_jobs_ok(self): - mongo_util = self.getMongoUtil() with mongo_util.mongo_engine_connection(): @@ -161,6 +247,8 @@ def test_get_jobs_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] for job in jobs: @@ -180,6 +268,8 @@ def test_get_jobs_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] for job in jobs: self.assertCountEqual(job.to_mongo().to_dict().keys(), expected_keys) @@ -189,7 
+279,6 @@ def test_get_jobs_ok(self): self.assertEqual(ori_job_count, Job.objects.count()) def test_connection_ok(self): - mongo_util = self.getMongoUtil() with mongo_util.mongo_engine_connection(): @@ -211,6 +300,8 @@ def test_connection_ok(self): "scheduler_id", "batch_job", "child_jobs", + "retry_ids", + "retry_saved_toggle", ] self.assertCountEqual(job.keys(), expected_keys) @@ -329,7 +420,6 @@ def test_delete_one_ok(self): self.assertEqual(col.count_documents({}), doc_count) def test_get_job_log_pymongo_ok(self): - mongo_util = self.getMongoUtil() primary_key = ObjectId() diff --git a/test/tests_for_integration/api_to_db_test.py b/test/tests_for_integration/api_to_db_test.py new file mode 100644 index 000000000..c2943d1d0 --- /dev/null +++ b/test/tests_for_integration/api_to_db_test.py @@ -0,0 +1,2006 @@ +""" +Integration tests that cover the entire codebase from API to database. + +NOTE 1: These tests are designed to only be runnable after running docker-compose up. + +NOTE 2: These tests were set up quickly in order to debug a problem with administration related +calls. As such, the auth server was set up to run in test mode locally. If more integrations +are needed, they will need to be added either locally or as docker containers. +If the latter, the test auth and workspace integrations will likely need to be converted to +docker containers or exposed to other containers. + +NOTE 3: Although this is supposed to be an integration test, the catalog service and htcondor +are still mocked out as bringing them up would take a large amount of effort. Someday... + +NOTE 4: Kafka notes + a) Currently nothing listens to the kafka feed. + b) When running the tests, the kafka producer logs that kafka cannot be reached. However, + this error is silent otherwise. + c) I wasn't able to contact the docker kafka service with the kafka-python client either. + d) As such, Kafka is not tested. Once tests are added, at least one test should check that + something sensible happens if a kafka message cannot be sent. + +NOTE 5: EE2 posting to Slack always fails silently in tests. Currently slack calls are not tested. 
+""" + +# TODO add more integration tests, these are not necessarily exhaustive + +import os +import tempfile +import time +from configparser import ConfigParser +from pathlib import Path +from threading import Thread +from typing import Dict +from unittest.mock import patch, create_autospec, ANY, call + +import htcondor +import pymongo +from bson import ObjectId +from pytest import fixture, raises + +from execution_engine2.exceptions import InvalidParameterForBatch +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from installed_clients.WorkspaceClient import Workspace +from installed_clients.baseclient import ServerError +from installed_clients.execution_engine2Client import execution_engine2 as ee2client +from test.utils_shared.test_utils import bootstrap +from tests_for_integration.auth_controller import AuthController +from tests_for_integration.workspace_controller import WorkspaceController + +# in the future remove this +from tests_for_utils.Condor_test import _get_common_sub +from utils_shared.test_utils import ( + get_full_test_config, + get_ee2_test_config, + EE2_CONFIG_SECTION, + KB_DEPLOY_ENV, + find_free_port, + create_auth_login_token, + create_auth_user, + create_auth_role, + set_custom_roles, + assert_close_to_now, + assert_exception_correct, +) + +bootstrap() + +KEEP_TEMP_FILES = False +TEMP_DIR = Path("test_temp_can_delete") + +# may need to make this configurable +JARS_DIR = Path("/opt/jars/lib/jars") + +USER_READ_ADMIN = "readuser" +TOKEN_READ_ADMIN = None +USER_NO_ADMIN = "nouser" +TOKEN_NO_ADMIN = None +USER_WRITE_ADMIN = "writeuser" +TOKEN_WRITE_ADMIN = None + +USER_KBASE_CONCIERGE = "kbaseconcierge" +TOKEN_KBASE_CONCIERGE = None + +USER_WS_READ_ADMIN = "wsreadadmin" +TOKEN_WS_READ_ADMIN = None +USER_WS_FULL_ADMIN = "wsfulladmin" +TOKEN_WS_FULL_ADMIN = None +WS_READ_ADMIN = "WS_READ_ADMIN" +WS_FULL_ADMIN = "WS_FULL_ADMIN" + +CAT_GET_MODULE_VERSION = "installed_clients.CatalogClient.Catalog.get_module_version" +CAT_LIST_CLIENT_GROUPS = ( + "installed_clients.CatalogClient.Catalog.list_client_group_configs" +) + +# from test/deploy.cfg +MONGO_EE2_DB = "ee2" +MONGO_EE2_JOBS_COL = "ee2_jobs" + +_MOD = "mod.meth" +_APP = "mod/app" + + +@fixture(scope="module") +def config() -> Dict[str, str]: + yield get_ee2_test_config() + + +@fixture(scope="module") +def full_config() -> ConfigParser: + yield get_full_test_config() + + +@fixture(scope="module") +def mongo_client(config): + mc = pymongo.MongoClient( + config["mongo-host"], + username=config["mongo-user"], + password=config["mongo-password"], + ) + yield mc + + mc.close() + + +def _clean_db(mongo_client, db, db_user): + try: + mongo_client[db].command("dropUser", db_user) + except pymongo.errors.OperationFailure as e: + if f"User '{db_user}@{db}' not found" not in e.args[0]: + raise # otherwise ignore and continue, user is already toast + mongo_client.drop_database(db) + + +def _create_db_user(mongo_client, db, db_user, password): + mongo_client[db].command("createUser", db_user, pwd=password, roles=["readWrite"]) + + +def _set_up_auth_user(auth_url, user, display, roles=None): + create_auth_user(auth_url, user, display) + if roles: + set_custom_roles(auth_url, user, roles) + return create_auth_login_token(auth_url, user) + + +def _set_up_auth_users(auth_url): + create_auth_role(auth_url, ADMIN_READ_ROLE, "ee2 admin read doohickey") + create_auth_role(auth_url, ADMIN_WRITE_ROLE, "ee2 admin write thinger") + create_auth_role(auth_url, WS_READ_ADMIN, "wsr") + create_auth_role(auth_url, 
WS_FULL_ADMIN, "wsf") + + global TOKEN_READ_ADMIN + TOKEN_READ_ADMIN = _set_up_auth_user( + auth_url, USER_READ_ADMIN, "display1", [ADMIN_READ_ROLE] + ) + + global TOKEN_NO_ADMIN + TOKEN_NO_ADMIN = _set_up_auth_user(auth_url, USER_NO_ADMIN, "display2") + + global TOKEN_WRITE_ADMIN + TOKEN_WRITE_ADMIN = _set_up_auth_user( + auth_url, USER_WRITE_ADMIN, "display3", [ADMIN_WRITE_ROLE] + ) + + global TOKEN_KBASE_CONCIERGE + TOKEN_KBASE_CONCIERGE = _set_up_auth_user( + auth_url, USER_KBASE_CONCIERGE, "concierge" + ) + + global TOKEN_WS_READ_ADMIN + TOKEN_WS_READ_ADMIN = _set_up_auth_user( + auth_url, USER_WS_READ_ADMIN, "wsra", [WS_READ_ADMIN] + ) + + global TOKEN_WS_FULL_ADMIN + TOKEN_WS_FULL_ADMIN = _set_up_auth_user( + auth_url, USER_WS_FULL_ADMIN, "wsrf", [WS_FULL_ADMIN] + ) + + +@fixture(scope="module") +def auth_url(config, mongo_client): + auth_db = "api_to_db_auth_test" + auth_mongo_user = "auth" + # clean up from any previously failed test runs that left the db in place + _clean_db(mongo_client, auth_db, auth_mongo_user) + + # make a user for the auth db + _create_db_user(mongo_client, auth_db, auth_mongo_user, "authpwd") + + auth = AuthController( + JARS_DIR, + config["mongo-host"], + auth_db, + TEMP_DIR, + mongo_user=auth_mongo_user, + mongo_pwd="authpwd", + ) + print( + f"Started KBase Auth2 {auth.version} on port {auth.port} " + + f"in dir {auth.temp_dir} in {auth.startup_count}s" + ) + url = f"http://localhost:{auth.port}" + + _set_up_auth_users(url) + + yield url + + print(f"shutting down auth, KEEP_TEMP_FILES={KEEP_TEMP_FILES}") + auth.destroy(not KEEP_TEMP_FILES) + + # Because the tests are run with mongo in a persistent docker container via docker-compose, + # we need to clean up after ourselves. + _clean_db(mongo_client, auth_db, auth_mongo_user) + + +def _add_ws_types(ws_controller): + wsc = Workspace(f"http://localhost:{ws_controller.port}", token=TOKEN_WS_FULL_ADMIN) + wsc.request_module_ownership("Trivial") + wsc.administer({"command": "approveModRequest", "module": "Trivial"}) + wsc.register_typespec( + { + "spec": """ + module Trivial { + /* @optional dontusethisfieldorifyoudomakesureitsastring */ + typedef structure { + string dontusethisfieldorifyoudomakesureitsastring; + } Object; + }; + """, + "dryrun": 0, + "new_types": ["Object"], + } + ) + wsc.release_module("Trivial") + + +@fixture(scope="module") +def ws_controller(config, mongo_client, auth_url): + ws_db = "api_to_db_ws_test" + ws_types_db = "api_to_db_ws_types_test" + ws_mongo_user = "workspace" + # clean up from any previously failed test runs that left the db in place + _clean_db(mongo_client, ws_db, ws_mongo_user) + _clean_db(mongo_client, ws_types_db, ws_mongo_user) + + # make a user for the ws dbs + _create_db_user(mongo_client, ws_db, ws_mongo_user, "wspwd") + _create_db_user(mongo_client, ws_types_db, ws_mongo_user, "wspwd") + + ws = WorkspaceController( + JARS_DIR, + config["mongo-host"], + ws_db, + ws_types_db, + auth_url + "/testmode/", + TEMP_DIR, + mongo_user=ws_mongo_user, + mongo_pwd="wspwd", + ) + print( + f"Started KBase Workspace {ws.version} on port {ws.port} " + + f"in dir {ws.temp_dir} in {ws.startup_count}s" + ) + _add_ws_types(ws) + + yield ws + + print(f"shutting down workspace, KEEP_TEMP_FILES={KEEP_TEMP_FILES}") + ws.destroy(not KEEP_TEMP_FILES) + + # Because the tests are run with mongo in a persistent docker container via docker-compose, + # we need to clean up after ourselves. 
+ _clean_db(mongo_client, ws_db, ws_mongo_user) + _clean_db(mongo_client, ws_types_db, ws_mongo_user) + + +def _update_config_and_create_config_file(full_config, auth_url, ws_controller): + """ + Updates the config in place with the correct auth url for the tests and + writes the updated config to a temporary file. + + Returns the file path. + """ + # Don't call get_ee2_test_config here, we *want* to update the config object in place + # so any other tests that use the config fixture run against the test auth server if they + # access those keys + ee2c = full_config[EE2_CONFIG_SECTION] + ee2c["auth-service-url"] = auth_url + "/testmode/api/legacy/KBase/Sessions/Login" + ee2c["auth-service-url-v2"] = auth_url + "/testmode/api/v2/token" + ee2c["auth-url"] = auth_url + "/testmode" + ee2c["auth-service-url-allow-insecure"] = "true" + ee2c["workspace-url"] = f"http://localhost:{ws_controller.port}" + + deploy = tempfile.mkstemp(".cfg", "deploy-", dir=TEMP_DIR, text=True) + os.close(deploy[0]) + + with open(deploy[1], "w") as handle: + full_config.write(handle) + + return deploy[1] + + +def _clear_dbs( + mc: pymongo.MongoClient, config: Dict[str, str], ws_controller: WorkspaceController +): + ee2 = mc[config["mongo-database"]] + for name in ee2.list_collection_names(): + if not name.startswith("system."): + # don't drop collection since that drops indexes + ee2.get_collection(name).delete_many({}) + ws_controller.clear_db() + + +@fixture(scope="module") +def service(full_config, auth_url, mongo_client, config, ws_controller): + # also updates the config in place so it contains the correct auth urls for any other + # methods that use the config fixture + cfgpath = _update_config_and_create_config_file( + full_config, auth_url, ws_controller + ) + print(f"created test deploy at {cfgpath}") + _clear_dbs(mongo_client, config, ws_controller) + + prior_deploy = os.environ[KB_DEPLOY_ENV] + # from this point on, calling the get_*_test_config methods will get the temp config file + os.environ[KB_DEPLOY_ENV] = cfgpath + # The server creates the configuration, impl, and application *AT IMPORT TIME* so we have to + # import *after* setting the config path. + # This is terrible design. Awful. It definitely wasn't me that wrote it over Xmas in 2012 + from execution_engine2 import execution_engine2Server + + portint = find_free_port() + Thread( + target=execution_engine2Server.start_server, + kwargs={"port": portint}, + daemon=True, + ).start() + time.sleep(0.05) + port = str(portint) + print("running ee2 service at localhost:" + port) + yield port + + # shutdown the server + # SampleServiceServer.stop_server() <-- this causes an error. + # See the server file for the full scoop, but in short, the stop method expects a _proc + # package variable to be set, but start doesn't always set it, and that causes an error. 
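# (Illustrative sketch only, separate from the patch: the service fixture above
#  starts the real ee2 server with find_free_port() and a daemon Thread. The
#  same "serve on a free port in a daemon thread" pattern, reduced to the
#  standard library so it runs on its own, looks roughly like this; the HTTP
#  handler is only a stand-in for execution_engine2Server.)
import socket
from http.server import BaseHTTPRequestHandler, HTTPServer
from threading import Thread
from urllib.request import urlopen

def find_free_port() -> int:
    # bind to port 0 and let the OS pick an unused port
    with socket.socket() as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]

class _StubHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b"ok")

if __name__ == "__main__":
    port = find_free_port()
    server = HTTPServer(("localhost", port), _StubHandler)
    # daemon=True means the server thread dies with the test process, so no
    # explicit shutdown is needed even when a stop_server() call is unusable
    Thread(target=server.serve_forever, daemon=True).start()
    with urlopen(f"http://localhost:{port}") as resp:
        print(resp.status, resp.read())  # 200 b'ok'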
+ + # Tests are run in the same process so we need to be put the environment back the way it was + os.environ[KB_DEPLOY_ENV] = prior_deploy + + if not KEEP_TEMP_FILES: + os.remove(cfgpath) + + +@fixture +def ee2_port(service, mongo_client, config, ws_controller): + _clear_dbs(mongo_client, config, ws_controller) + + yield service + + +def test_is_admin_success(ee2_port): + ee2cli_read = ee2client("http://localhost:" + ee2_port, token=TOKEN_READ_ADMIN) + ee2cli_no = ee2client("http://localhost:" + ee2_port, token=TOKEN_NO_ADMIN) + ee2cli_write = ee2client("http://localhost:" + ee2_port, token=TOKEN_WRITE_ADMIN) + + # note that if we ever need to have Java talk to ee2 these responses will break the SDK client + assert ee2cli_read.is_admin() is True + assert ee2cli_no.is_admin() is False + assert ee2cli_write.is_admin() is True + + +def test_get_admin_permission_success(ee2_port): + ee2cli_read = ee2client("http://localhost:" + ee2_port, token=TOKEN_READ_ADMIN) + ee2cli_no = ee2client("http://localhost:" + ee2_port, token=TOKEN_NO_ADMIN) + ee2cli_write = ee2client("http://localhost:" + ee2_port, token=TOKEN_WRITE_ADMIN) + + assert ee2cli_read.get_admin_permission() == {"permission": "r"} + assert ee2cli_no.get_admin_permission() == {"permission": "n"} + assert ee2cli_write.get_admin_permission() == {"permission": "w"} + + +######## run_job tests ######## + + +def _get_htc_mocks(): + sub = create_autospec(htcondor.Submit, spec_set=True, instance=True) + schedd = create_autospec(htcondor.Schedd, spec_set=True, instance=True) + txn = create_autospec(htcondor.Transaction, spec_set=True, instance=True) + return sub, schedd, txn + + +def _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn): + sub_init.return_value = sub + schedd_init.return_value = schedd + # mock context manager ops + schedd.transaction.return_value = txn + txn.__enter__.return_value = txn + return sub, schedd, txn + + +def _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub): + sub_init.assert_called_once_with(expected_sub) + schedd_init.assert_called_once_with() + schedd.transaction.assert_called_once_with() + sub.queue.assert_called_once_with(txn, 1) + + +def _set_up_workspace_objects(ws_controller, token, ws_name="foo"): + wsc = Workspace(ws_controller.get_url(), token=token) + wsid = wsc.create_workspace({"workspace": ws_name})[0] + wsc.save_objects( + { + "id": wsid, + "objects": [ + {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, + {"name": "two", "type": "Trivial.Object-1.0", "data": {}}, + ], + } + ) + + +def _get_run_job_param_set(app_id=_APP, job_reqs=None, as_admin=False): + return { + "method": _MOD, + "app_id": app_id, + "wsid": 1, + "source_ws_objects": ["1/1/1", "1/2/1"], + "params": [{"foo": "bar"}, 42], + "service_ver": "beta", + "parent_job_id": "totallywrongid", + "job_requirements": job_reqs, + "as_admin": as_admin, + "meta": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + "thiskey": "getssilentlydropped", + }, + } + + +def _get_condor_sub_for_rj_param_set( + job_id, + user, + token, + clientgroup, + cpu, + mem, + disk, + parent_job_id="totallywrongid", + app_id=_APP, + app_module="mod", + wsid=1, +): + expected_sub = _get_common_sub(job_id) + expected_sub.update( + { + "JobBatchName": job_id, + "arguments": f"{job_id} https://ci.kbase.us/services/ee2", + "+KB_PARENT_JOB_ID": f'"{parent_job_id}"', + "+KB_MODULE_NAME": '"mod"', + "+KB_FUNCTION_NAME": '"meth"', + "+KB_APP_ID": f'"{app_id}"' if app_id else "", + "+KB_APP_MODULE_NAME": 
f'"{app_module}"' if app_module else "", + "+KB_WSID": f'"{wsid}"', + "+KB_SOURCE_WS_OBJECTS": '"1/1/1,1/2/1"', + "request_cpus": f"{cpu}", + "request_memory": f"{mem}MB", + "request_disk": f"{disk}GB", + "requirements": f'regexp("{clientgroup}",CLIENTGROUP)', + "+KB_CLIENTGROUP": f'"{clientgroup}"', + "Concurrency_Limits": f"{user}", + "+AccountingGroup": f'"{user}"', + "environment": ( + '"DOCKER_JOB_TIMEOUT=604805 KB_ADMIN_AUTH_TOKEN=test_auth_token ' + + f"KB_AUTH_TOKEN={token} CLIENTGROUP={clientgroup} JOB_ID={job_id} " + + "CONDOR_ID=$(Cluster).$(Process) PYTHON_EXECUTABLE=/miniconda/bin/python " + + f'DEBUG_MODE=False PARENT_JOB_ID={parent_job_id} "' + ), + "leavejobinqueue": "true", + "initial_dir": "../scripts/", + "+Owner": '"condor_pool"', + "executable": "../scripts//../scripts/execute_runner.sh", + "transfer_input_files": "../scripts/JobRunner.tgz", + } + ) + return expected_sub + + +def _get_mongo_job(mongo_client, job_id, has_queued=True): + # also checks and removes the queued and updated times + job = mongo_client[MONGO_EE2_DB][MONGO_EE2_JOBS_COL].find_one( + {"_id": ObjectId(job_id)} + ) + assert_close_to_now(job.pop("updated")) + if has_queued: + assert_close_to_now(job.pop("queued")) + return job + + +def _check_mongo_job( + mongo_client, job_id, user, app_id, clientgroup, cpu, mem, disk, githash +): + job = _get_mongo_job(mongo_client, job_id) + expected_job = { + "_id": ObjectId(job_id), + "user": user, + "authstrat": "kbaseworkspace", + "wsid": 1, + "status": "queued", + "job_input": { + "wsid": 1, + "method": _MOD, + "params": [{"foo": "bar"}, 42], + "service_ver": githash, + "source_ws_objects": ["1/1/1", "1/2/1"], + "parent_job_id": "totallywrongid", + "requirements": { + "clientgroup": clientgroup, + "cpu": cpu, + "memory": mem, + "disk": disk, + }, + "narrative_cell_info": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + }, + }, + "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + } + if app_id: + expected_job["job_input"]["app_id"] = app_id + assert job == expected_job + + +def test_run_job_no_app_id(ee2_port, ws_controller, mongo_client): + _run_job( + ee2_port, + ws_controller, + mongo_client, + catalog_return=[{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + ) + + +def test_run_job_with_app_id(ee2_port, ws_controller, mongo_client): + _run_job( + ee2_port, + ws_controller, + mongo_client, + app_id="mod/app", + app_mod="mod", + catalog_return=[{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + ) + + +def test_run_job_with_job_requirements_full(ee2_port, ws_controller, mongo_client): + """ + Tests running a job where all requirements are specified on input. 
+ """ + + def modify_sub(sub): + del sub["Concurrency_Limits"] + sub["requirements"] = ( + '(CLIENTGROUP == "extreme") && (after == "pantsremoval") && ' + + '(beforemy == "2pmsalonappt")' + ) + sub["+AccountingGroup"] = '"borishesgoodforit"' + sub["environment"] = sub["environment"].replace( + "DEBUG_MODE=False", "DEBUG_MODE=True" + ) + + _run_job( + ee2_port, + ws_controller, + mongo_client, + job_reqs={ + "request_cpus": 21, + "request_memory": 34, + "request_disk": 99, + "client_group": "extreme", + "client_group_regex": 0, + "bill_to_user": "borishesgoodforit", + "ignore_concurrency_limits": "true", + "scheduler_requirements": { + "beforemy": "2pmsalonappt", + "after": "pantsremoval", + }, + "debug_mode": True, + }, + modify_sub=modify_sub, + clientgroup="extreme", + cpu=21, + mem=34, + disk=99, + catalog_return=[ + { + "client_groups": [ + '{"client_group":"njs","request_cpus":8,"request_memory":5}' + ] + } + ], + as_admin=7, # truthy + user=USER_WRITE_ADMIN, + token=TOKEN_WRITE_ADMIN, + ) + + +def test_run_job_with_job_requirements_mixed(ee2_port, ws_controller, mongo_client): + """ + Tests running a job where requirements are specified on input, from the catalog, and from + the deploy.cfg file. + """ + _run_job( + ee2_port, + ws_controller, + mongo_client, + job_reqs={"request_cpus": 9}, + clientgroup="njs", + cpu=9, + mem=5, + disk=30, + catalog_return=[{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + as_admin="wheee", # truthy + user=USER_WRITE_ADMIN, + token=TOKEN_WRITE_ADMIN, + ) + + +def _run_job( + ee2_port, + ws_controller, + mongo_client, + app_id=None, + app_mod=None, + job_reqs=None, + modify_sub=lambda x: x, + clientgroup="njs", + cpu=8, + mem=5, + disk=30, + catalog_return=None, + as_admin=False, + user=None, + token=None, +): + # values in the method sig are set at the time of method creation, at which time the + # user and token fields haven't yet been set by the fixtures + user = user if user else USER_NO_ADMIN + token = token if token else TOKEN_NO_ADMIN + _set_up_workspace_objects(ws_controller, token) + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? 
+ with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.return_value = 123 + list_cgroups.return_value = catalog_return or [] + get_mod_ver.return_value = {"git_commit_hash": "somehash"} + + # run the method + ee2 = ee2client(f"http://localhost:{ee2_port}", token=token) + params = _get_run_job_param_set(app_id, job_reqs, as_admin) + job_id = ee2.run_job(params) + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_called_once_with( + ANY, {"module_name": "mod", "version": "beta"} + ) + list_cgroups.assert_called_once_with( + ANY, {"module_name": "mod", "function_name": "meth"} + ) + + expected_sub = _get_condor_sub_for_rj_param_set( + job_id, + user, + token, + clientgroup=clientgroup, + cpu=cpu, + mem=mem, + disk=disk, + app_id=app_id, + app_module=app_mod, + ) + modify_sub(expected_sub) + _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub) + + _check_mongo_job( + mongo_client, + job_id, + user, + app_id, + clientgroup=clientgroup, + cpu=cpu, + mem=mem, + disk=disk, + githash="somehash", + ) + + +def test_run_job_fail_not_admin(ee2_port): + params = {"method": _MOD, "as_admin": 1} + err = "Access Denied: You are not an administrator. AdminPermissions.NONE" + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_only_read_admin(ee2_port): + params = {"method": _MOD, "as_admin": 1} + err = ( + "Access Denied: You are a read-only admin. This function requires write access" + ) + _run_job_fail(ee2_port, TOKEN_READ_ADMIN, params, err) + + +def test_run_job_fail_no_workspace_access(ee2_port): + params = {"method": _MOD, "wsid": 1} + # this error could probably use some cleanup + err = ( + "('An error occurred while fetching user permissions from the Workspace', " + + "ServerError('No workspace with id 1 exists'))" + ) + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_bad_cpu(ee2_port): + params = {"method": _MOD, "job_requirements": {"request_cpus": -10}} + err = "CPU count must be at least 1" + _run_job_fail(ee2_port, TOKEN_WRITE_ADMIN, params, err) + + +def test_run_job_fail_bad_scheduler_requirements(ee2_port): + params = { + "method": _MOD, + "job_requirements": {"scheduler_requirements": {"foo": ""}}, + } + # TODO non-string keys/values in schd_reqs causes a not-very-useful error + # Since it's admin only don't worry about it for now + err = "Missing input parameter: value for key 'foo' in scheduler requirements structure" + _run_job_fail(ee2_port, TOKEN_WRITE_ADMIN, params, err) + + +def test_run_job_fail_job_reqs_but_no_as_admin(ee2_port): + params = {"method": _MOD, "job_requirements": {"request_cpus": 10}} + err = "In order to specify job requirements you must be a full admin" + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_bad_method(ee2_port): + params = {"method": "mod.meth.moke"} + err = "Unrecognized method: 'mod.meth.moke'. 
Please input module_name.function_name" + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_bad_app(ee2_port): + params = {"method": _MOD, "app_id": "mod.ap\bp"} + err = "application ID contains control characters" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_bad_upa(ee2_port): + params = { + "method": _MOD, + "source_ws_objects": ["ws/obj/1"], + } + err = ( + "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" + ) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def test_run_job_fail_no_such_object(ee2_port, ws_controller): + # Set up workspace and objects + wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) + wsc.create_workspace({"workspace": "foo"}) + wsc.save_objects( + { + "id": 1, + "objects": [ + {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, + ], + } + ) + params = {"method": _MOD, "source_ws_objects": ["1/2/1"]} + err = "Some workspace object is inaccessible" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_fail(ee2_port, TOKEN_NO_ADMIN, params, err) + + +def _run_job_fail(ee2_port, token, params, expected, throw_exception=False): + client = ee2client(f"http://localhost:{ee2_port}", token=token) + if throw_exception: + client.run_job(params) + else: + with raises(ServerError) as got: + client.run_job(params) + assert_exception_correct(got.value, ServerError("name", 1, expected)) + + +######## run_job_concierge tests ######## + + +def test_run_job_concierge_minimal(ee2_port, ws_controller, mongo_client): + def modify_sub(sub): + del sub["Concurrency_Limits"] + + _run_job_concierge( + ee2_port=ee2_port, + ws_controller=ws_controller, + mongo_client=mongo_client, + # if the concierge dict is empty, regular old run_job gets run + conc_params={"trigger": "concierge"}, # contents are ignored + modify_sub=modify_sub, + clientgroup="concierge", + cpu=4, + mem=23000, + disk=100, + ) + + +def test_run_job_concierge_mixed(ee2_port, ws_controller, mongo_client): + """ + Gets cpu from the input, memory from deploy.cfg, and disk from the catalog. 
+ """ + + def modify_sub(sub): + del sub["Concurrency_Limits"] + + _run_job_concierge( + ee2_port=ee2_port, + ws_controller=ws_controller, + mongo_client=mongo_client, + conc_params={"client_group": "extreme", "request_cpus": 76}, + modify_sub=modify_sub, + clientgroup="extreme", + cpu=76, + mem=250000, + disk=7, + catalog_return=[{"client_groups": ['{"request_cpus":8,"request_disk":7}']}], + ) + + +def test_run_job_concierge_maximal(ee2_port, ws_controller, mongo_client): + def modify_sub(sub): + sub[ + "requirements" + ] = '(CLIENTGROUP == "bigmem") && (baz == "bat") && (foo == "bar")' + sub["Concurrency_Limits"] = "some_sucker" + sub["+AccountingGroup"] = '"some_sucker"' + sub["environment"] = sub["environment"].replace( + "DEBUG_MODE=False", "DEBUG_MODE=True" + ) + + _run_job_concierge( + ee2_port=ee2_port, + ws_controller=ws_controller, + mongo_client=mongo_client, + conc_params={ + "client_group": "bigmem", + "request_cpus": 42, + "request_memory": 56, + "request_disk": 89, + "client_group_regex": False, + "account_group": "some_sucker", + "ignore_concurrency_limits": False, + "requirements_list": ["foo=bar", "baz=bat"], + "debug_mode": "true", + }, + modify_sub=modify_sub, + clientgroup="bigmem", + cpu=42, + mem=56, + disk=89, + ) + + +def _run_job_concierge( + ee2_port, + ws_controller, + mongo_client, + conc_params, + modify_sub, + clientgroup, + cpu, + mem, + disk, + catalog_return=None, +): + _set_up_workspace_objects(ws_controller, TOKEN_KBASE_CONCIERGE) + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? + with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.return_value = 123 + list_cgroups.return_value = catalog_return or [] + get_mod_ver.return_value = {"git_commit_hash": "somehash"} + + # run the method + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_KBASE_CONCIERGE) + # if the concierge dict is empty, regular old run_job gets run + job_id = ee2.run_job_concierge(_get_run_job_param_set(), conc_params) + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_called_once_with( + ANY, {"module_name": "mod", "version": "beta"} + ) + list_cgroups.assert_called_once_with( + ANY, {"module_name": "mod", "function_name": "meth"} + ) + + expected_sub = _get_condor_sub_for_rj_param_set( + job_id, + USER_KBASE_CONCIERGE, + TOKEN_KBASE_CONCIERGE, + clientgroup, + cpu, + mem, + disk, + ) + modify_sub(expected_sub) + + _check_htc_calls(sub_init, sub, schedd_init, schedd, txn, expected_sub) + + _check_mongo_job( + mongo_client, + job_id, + USER_KBASE_CONCIERGE, + app_id="mod/app", + clientgroup=clientgroup, + cpu=cpu, + mem=mem, + disk=disk, + githash="somehash", + ) + + +def test_run_job_concierge_fail_no_workspace_access(ee2_port): + params = {"method": _MOD, "wsid": 1} + # this error could probably use some cleanup + err = ( + "('An error occurred while fetching user permissions from the Workspace', " + + "ServerError('No workspace with id 1 exists'))" + 
) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + + +def test_run_job_concierge_fail_not_concierge(ee2_port): + params = {"method": _MOD} + err = "You are not the concierge user. This method is not for you" + _run_job_concierge_fail(ee2_port, TOKEN_NO_ADMIN, params, {"a": "b"}, err) + + +def test_run_job_concierge_fail_bad_method(ee2_port): + params = {"method": "mod.meth.moke"} + err = "Unrecognized method: 'mod.meth.moke'. Please input module_name.function_name" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err) + + +def test_run_job_concierge_fail_reqs_list_not_list(ee2_port): + params = {"method": _MOD} + conc_params = {"requirements_list": {"a": "b"}} + err = "requirements_list must be a list" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_reqs_list_bad_req(ee2_port): + params = {"method": _MOD} + conc_params = {"requirements_list": ["a=b", "touchmymonkey"]} + err = "Found illegal requirement in requirements_list: touchmymonkey" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_cpu(ee2_port): + params = {"method": _MOD} + conc_params = {"request_cpus": [2]} + err = ( + "Found illegal cpu request '[2]' in job requirements from concierge parameters" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_mem(ee2_port): + params = {"method": _MOD} + conc_params = {"request_memory": "-3"} + err = "memory in MB must be at least 1" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_disk(ee2_port): + params = {"method": _MOD} + conc_params = {"request_disk": 4.5} + err = ( + "Found illegal disk request '4.5' in job requirements from concierge parameters" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_clientgroup(ee2_port): + params = {"method": _MOD} + conc_params = {"client_group": "fakefakefake"} + err = "No such clientgroup: fakefakefake" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err + ) + + +def test_run_job_concierge_fail_bad_clientgroup_regex(ee2_port): + params = {"method": _MOD} + conc_params = {"client_group_regex": "now I have 2 problems"} + err = ( + "Found illegal client group regex 'now I have 2 problems' in job requirements " + + "from concierge parameters" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_catalog_data(ee2_port): + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [{"client_groups": ['{"request_cpus":-8}']}] + + params = {"method": _MOD} + conc_params = {"request_memory": 9} + # TODO this is not a useful error for the user. 
Need to change the job reqs resolver + # However, getting this wrong in the catalog is not super likely so not urgent + err = "CPU count must be at least 1" + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err + ) + + +def test_run_job_concierge_fail_bad_reqs_item(ee2_port): + params = {"method": _MOD} + conc_params = {"requirements_list": ["a=b", "=c"]} + # this error isn't the greatest but as I understand it the concierge endpoint is going + # to become redundant so don't worry about it for now + err = "Missing input parameter: key in scheduler requirements structure" + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_debug_mode(ee2_port): + params = {"method": _MOD} + conc_params = {"debug_mode": "debug debug debug"} + err = ( + "Found illegal debug mode 'debug debug debug' in job requirements from " + + "concierge parameters" + ) + _run_job_concierge_fail(ee2_port, TOKEN_KBASE_CONCIERGE, params, conc_params, err) + + +def test_run_job_concierge_fail_bad_app(ee2_port): + params = {"method": _MOD, "app_id": "mo\bd.app"} + err = "application ID contains control characters" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err + ) + + +def test_run_job_concierge_fail_bad_upa(ee2_port): + params = { + "method": _MOD, + "source_ws_objects": ["ws/obj/1"], + } + err = ( + "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" + ) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err + ) + + +def test_run_job_concierge_fail_no_such_object(ee2_port, ws_controller): + # Set up workspace and objects + wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) + wsc.create_workspace({"workspace": "foo"}) + wsc.save_objects( + { + "id": 1, + "objects": [ + {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, + ], + } + ) + params = {"method": _MOD, "source_ws_objects": ["1/2/1"]} + err = "Some workspace object is inaccessible" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_concierge_fail( + ee2_port, TOKEN_KBASE_CONCIERGE, params, {"a": "b"}, err + ) + + +def _run_job_concierge_fail( + ee2_port, token, params, conc_params, expected, throw_exception=False +): + client = ee2client(f"http://localhost:{ee2_port}", token=token) + if throw_exception: + client.run_job_concierge(params, conc_params) + else: + with raises(ServerError) as got: + client.run_job_concierge(params, conc_params) + assert_exception_correct(got.value, ServerError("name", 1, expected)) + + +######## run_job_batch tests ######## + + +def test_run_job_batch(ee2_port, ws_controller, mongo_client): + """ + A test of the run_job method. + """ + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "foo") # ws 1 + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "bar") # ws 2 + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? 
+ with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.side_effect = [123, 456] + list_cgroups.side_effect = [ + [{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + [{"client_groups": ['{"client_group":"bigmem"}']}], + ] + get_mod_ver.side_effect = [ + {"git_commit_hash": "somehash"}, + {"git_commit_hash": "somehash2"}, + ] + + # run the method + job1_params = { + "method": _MOD, + "source_ws_objects": ["1/1/1", "1/2/1"], + "params": [{"foo": "bar"}, 42], + "service_ver": "beta", + "meta": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + "thiskey": "getssilentlydropped", + }, + } + job2_params = { + "method": "mod2.meth2", + "app_id": "mod2/app2", + "params": [{"baz": "bat"}, 3.14], + } + job_batch_wsid = 2 + job_batch_params = { + "wsid": job_batch_wsid, + "meta": { + "run_id": "rid2", + "token_id": "tid2", + "tag": "yourit2", + "cell_id": "cid2", + "thiskey": "getssilentlydropped2", + }, + } + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) + ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) + + # May need to increase sleep if thread takes too long + time.sleep(0.1) + + batch_id = ret["batch_id"] + job_id_1, job_id_2 = ret["child_job_ids"] + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "version": "beta"}), + call(ANY, {"module_name": "mod2", "version": "release"}), + ] + ) + list_cgroups.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "function_name": "meth"}), + call(ANY, {"module_name": "mod2", "function_name": "meth2"}), + ] + ) + + job1 = _get_mongo_job(mongo_client, job_id_1) + job2 = _get_mongo_job(mongo_client, job_id_2) + + expected_job1 = { + "_id": ObjectId(job_id_1), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "batch_id": batch_id, + "job_input": { + "wsid": job_batch_wsid, + "method": _MOD, + "params": [{"foo": "bar"}, 42], + "service_ver": "somehash", + "source_ws_objects": ["1/1/1", "1/2/1"], + "requirements": { + "clientgroup": "njs", + "cpu": 8, + "memory": 5, + "disk": 30, + }, + "narrative_cell_info": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + }, + }, + "wsid": job_batch_wsid, + "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + } + + assert job1 == expected_job1 + + expected_job2 = { + "_id": ObjectId(job_id_2), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": job_batch_wsid, + "status": "queued", + "batch_id": batch_id, + "job_input": { + "wsid": job_batch_wsid, + "method": "mod2.meth2", + "params": [{"baz": "bat"}, 3.14], + "service_ver": "somehash2", + "app_id": "mod2/app2", + "source_ws_objects": [], + "requirements": { + "clientgroup": "bigmem", + "cpu": 4, + "memory": 2000, + "disk": 100, + }, + "narrative_cell_info": {}, + }, + "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, + "batch_job": False, + "scheduler_id": "456", + "scheduler_type": "condor", 
+ } + assert job2 == expected_job2 + + parent_job = _get_mongo_job(mongo_client, batch_id, has_queued=False) + expected_parent_job = { + "_id": ObjectId(batch_id), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": job_batch_wsid, + "status": "created", + "job_input": { + "method": "batch", + "service_ver": "batch", + "app_id": "batch", + "source_ws_objects": [], + "narrative_cell_info": { + "run_id": "rid2", + "token_id": "tid2", + "tag": "yourit2", + "cell_id": "cid2", + }, + }, + "child_jobs": [job_id_1, job_id_2], + "batch_job": True, + "retry_ids": [], + "retry_saved_toggle": False, + } + assert parent_job == expected_parent_job + + expected_sub_1 = _get_condor_sub_for_rj_param_set( + job_id_1, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="njs", + cpu=8, + mem=5, + disk=30, + parent_job_id=batch_id, + app_id=None, + app_module=None, + wsid=job_batch_wsid, + ) + expected_sub_1["+KB_WSID"] = f'"{job_batch_wsid}"' + expected_sub_2 = _get_condor_sub_for_rj_param_set( + job_id_2, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="bigmem", + cpu=4, + mem=2000, + disk=100, + parent_job_id=batch_id, + wsid=job_batch_wsid, + ) + expected_sub_2.update( + { + "+KB_MODULE_NAME": '"mod2"', + "+KB_FUNCTION_NAME": '"meth2"', + "+KB_APP_ID": '"mod2/app2"', + "+KB_APP_MODULE_NAME": '"mod2"', + "+KB_SOURCE_WS_OBJECTS": "", + } + ) + _check_batch_htc_calls( + sub_init, schedd_init, sub, schedd, txn, expected_sub_1, expected_sub_2 + ) + + # Check to see check_job_batch has both the batch and child jobstates + ret = ee2.check_job_batch(params={"job_id": batch_id}) + batch_jobstate = ret["batch_jobstate"] + child_jobstates = ret["child_jobstates"] + + # Check to see that the BATCH jobstate is as expected + expected_batch_jobstate = expected_parent_job + del expected_batch_jobstate["_id"] + expected_batch_jobstate.update( + {"batch_id": None, "job_id": batch_id, "retry_count": 0} + ) + del batch_jobstate["created"] + del batch_jobstate["updated"] + assert batch_jobstate == expected_batch_jobstate + + # Check to see the child states are as expected + for expected_child_job_state in [expected_job1, expected_job2]: + expected_child_job_state["retry_count"] = 0 + expected_child_job_state["job_id"] = str(expected_child_job_state["_id"]) + del expected_child_job_state["_id"] + + for received_child_job_state in child_jobstates: + del received_child_job_state["created"] + del received_child_job_state["queued"] + del received_child_job_state["updated"] + + assert child_jobstates[0] == expected_job1 + assert child_jobstates[1] == expected_job2 + + +def test_run_job_batch_with_no_batch_wsid(ee2_port, ws_controller, mongo_client): + """ + A test of the run_job method. + """ + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "foo") # ws 1 + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "bar") # ws 2 + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? 
+ with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.side_effect = [123, 456] + list_cgroups.side_effect = [ + [{"client_groups": ['{"request_cpus":8,"request_memory":5}']}], + [{"client_groups": ['{"client_group":"bigmem"}']}], + ] + get_mod_ver.side_effect = [ + {"git_commit_hash": "somehash"}, + {"git_commit_hash": "somehash2"}, + ] + + # run the method + job1_params = { + "method": _MOD, + "source_ws_objects": ["1/1/1", "1/2/1"], + "params": [{"foo": "bar"}, 42], + "service_ver": "beta", + "meta": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + "thiskey": "getssilentlydropped", + }, + } + job2_params = { + "method": "mod2.meth2", + "app_id": "mod2/app2", + "params": [{"baz": "bat"}, 3.14], + } + + job_batch_params = { + "meta": { + "run_id": "rid2", + "token_id": "tid2", + "tag": "yourit2", + "cell_id": "cid2", + "thiskey": "getssilentlydropped2", + }, + } + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_NO_ADMIN) + ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) + + # May need to increase sleep if thread takes too long + time.sleep(0.1) + + batch_id = ret["batch_id"] + job_id_1, job_id_2 = ret["child_job_ids"] + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "version": "beta"}), + call(ANY, {"module_name": "mod2", "version": "release"}), + ] + ) + list_cgroups.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "function_name": "meth"}), + call(ANY, {"module_name": "mod2", "function_name": "meth2"}), + ] + ) + + job1 = _get_mongo_job(mongo_client, job_id_1) + job2 = _get_mongo_job(mongo_client, job_id_2) + + expected_job1 = { + "_id": ObjectId(job_id_1), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "batch_id": batch_id, + "job_input": { + "method": _MOD, + "params": [{"foo": "bar"}, 42], + "service_ver": "somehash", + "source_ws_objects": ["1/1/1", "1/2/1"], + "requirements": { + "clientgroup": "njs", + "cpu": 8, + "memory": 5, + "disk": 30, + }, + "narrative_cell_info": { + "run_id": "rid", + "token_id": "tid", + "tag": "yourit", + "cell_id": "cid", + }, + }, + "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + } + + assert job1 == expected_job1 + + expected_job2 = { + "_id": ObjectId(job_id_2), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "batch_id": batch_id, + "job_input": { + "method": "mod2.meth2", + "params": [{"baz": "bat"}, 3.14], + "service_ver": "somehash2", + "app_id": "mod2/app2", + "source_ws_objects": [], + "requirements": { + "clientgroup": "bigmem", + "cpu": 4, + "memory": 2000, + "disk": 100, + }, + "narrative_cell_info": {}, + }, + "child_jobs": [], + "retry_ids": [], + "retry_saved_toggle": False, + "batch_job": False, + "scheduler_id": "456", + "scheduler_type": "condor", + } + assert job2 == expected_job2 + + parent_job = _get_mongo_job(mongo_client, batch_id, has_queued=False) + expected_parent_job = { + "_id": 
ObjectId(batch_id), + "user": USER_NO_ADMIN, + "authstrat": "kbaseworkspace", + "status": "created", + "job_input": { + "method": "batch", + "service_ver": "batch", + "app_id": "batch", + "source_ws_objects": [], + "narrative_cell_info": { + "run_id": "rid2", + "token_id": "tid2", + "tag": "yourit2", + "cell_id": "cid2", + }, + }, + "child_jobs": [job_id_1, job_id_2], + "batch_job": True, + "retry_ids": [], + "retry_saved_toggle": False, + } + assert parent_job == expected_parent_job + + expected_sub_1 = _get_condor_sub_for_rj_param_set( + job_id_1, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="njs", + cpu=8, + mem=5, + disk=30, + parent_job_id=batch_id, + app_id=None, + app_module=None, + ) + expected_sub_1["+KB_WSID"] = "" + expected_sub_2 = _get_condor_sub_for_rj_param_set( + job_id_2, + USER_NO_ADMIN, + TOKEN_NO_ADMIN, + clientgroup="bigmem", + cpu=4, + mem=2000, + disk=100, + parent_job_id=batch_id, + ) + + expected_sub_2.update( + { + "+KB_WSID": "", + "+KB_MODULE_NAME": '"mod2"', + "+KB_FUNCTION_NAME": '"meth2"', + "+KB_APP_ID": '"mod2/app2"', + "+KB_APP_MODULE_NAME": '"mod2"', + "+KB_SOURCE_WS_OBJECTS": "", + } + ) + _check_batch_htc_calls( + sub_init, schedd_init, sub, schedd, txn, expected_sub_1, expected_sub_2 + ) + + +def test_run_job_batch_as_admin_with_job_reqs(ee2_port, ws_controller, mongo_client): + """ + A test of the run_job method focusing on job requirements and minimizing all other inputs. + Since the batch endpoint uses the same code path as the single job endpoint for processing + job requirements, we only have a single test that mixes job requirements from the input, + catalog, and deploy configuration, as opposed to the multiple tests for single jobs. + """ + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN, "foo") # ws 1 + # need to get the mock objects first so spec_set can do its magic before we mock out + # the classes in the context manager + sub, schedd, txn = _get_htc_mocks() + # seriously black you're killing me here. This is readable? 
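The _check_batch_htc_calls helper defined further below notes that assert_has_calls appears to pick up extra calls such as call().transaction(). That is standard unittest.mock behaviour rather than anything ee2-specific: calls made through a mock's return value are recorded in the parent mock's mock_calls, which assert_has_calls consults, while call_args_list records only direct calls to the mock itself. A self-contained sketch, independent of the ee2 code:

from unittest.mock import MagicMock, call

schedd_init = MagicMock()
schedd = schedd_init()  # recorded on schedd_init as call()
schedd.transaction()    # recorded on schedd_init as call().transaction()

assert schedd_init.call_args_list == [call()]  # direct calls only
assert schedd_init.mock_calls == [call(), call().transaction()]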
+ with patch("htcondor.Submit", spec_set=True, autospec=True) as sub_init, patch( + "htcondor.Schedd", spec_set=True, autospec=True + ) as schedd_init, patch( + CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True + ) as list_cgroups, patch( + CAT_GET_MODULE_VERSION, spec_set=True, autospec=True + ) as get_mod_ver: + # set up the rest of the mocks + _finish_htc_mocks(sub_init, schedd_init, sub, schedd, txn) + sub.queue.side_effect = [123, 456] + list_cgroups.side_effect = [ + [{"client_groups": ['{"client_group":"bigmem"}']}], + [{"client_groups": ['{"request_disk":8,"request_memory":5}']}], + ] + get_mod_ver.side_effect = [ + {"git_commit_hash": "somehash"}, + {"git_commit_hash": "somehash2"}, + ] + + # run the method + job1_params = {"method": _MOD} + job2_params = { + "method": "mod2.meth2", + "job_requirements": { + "request_memory": 42, + "client_group": "extreme", + "client_group_regex": 0, + "bill_to_user": "forrest_gump", + "ignore_concurrency_limits": "true", + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + "debug_mode": True, + }, + } + job_batch_wsid = 1 + job_batch_params = {"wsid": job_batch_wsid, "as_admin": "foo"} + ee2 = ee2client(f"http://localhost:{ee2_port}", token=TOKEN_WRITE_ADMIN) + ret = ee2.run_job_batch([job1_params, job2_params], job_batch_params) + + # May need to increase sleep if thread takes too long + time.sleep(0.1) + + batch_id = ret["batch_id"] + job_id_1, job_id_2 = ret["child_job_ids"] + + # check that mocks were called correctly + # Since these are class methods, the first argument is self, which we ignore + get_mod_ver.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "version": "release"}), + call(ANY, {"module_name": "mod2", "version": "release"}), + ] + ) + list_cgroups.assert_has_calls( + [ + call(ANY, {"module_name": "mod", "function_name": "meth"}), + call(ANY, {"module_name": "mod2", "function_name": "meth2"}), + ] + ) + + job1 = _get_mongo_job(mongo_client, job_id_1) + job2 = _get_mongo_job(mongo_client, job_id_2) + + expected_job1 = { + "_id": ObjectId(job_id_1), + "user": USER_WRITE_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "wsid": job_batch_wsid, + "batch_id": batch_id, + "job_input": { + "wsid": job_batch_wsid, + "method": _MOD, + "service_ver": "somehash", + "source_ws_objects": [], + "requirements": { + "clientgroup": "bigmem", + "cpu": 4, + "memory": 2000, + "disk": 100, + }, + "narrative_cell_info": {}, + }, + "child_jobs": [], + "batch_job": False, + "scheduler_id": "123", + "scheduler_type": "condor", + "retry_ids": [], + "retry_saved_toggle": False, + } + assert job1 == expected_job1 + + expected_job2 = { + "_id": ObjectId(job_id_2), + "user": USER_WRITE_ADMIN, + "authstrat": "kbaseworkspace", + "status": "queued", + "wsid": job_batch_wsid, + "batch_id": batch_id, + "job_input": { + "wsid": job_batch_wsid, + "method": "mod2.meth2", + "service_ver": "somehash2", + "source_ws_objects": [], + "requirements": { + "clientgroup": "extreme", + "cpu": 32, + "memory": 42, + "disk": 8, + }, + "narrative_cell_info": {}, + }, + "child_jobs": [], + "batch_job": False, + "scheduler_id": "456", + "scheduler_type": "condor", + "retry_ids": [], + "retry_saved_toggle": False, + } + assert job2 == expected_job2 + + parent_job = _get_mongo_job(mongo_client, batch_id, has_queued=False) + expected_parent_job = { + "_id": ObjectId(batch_id), + "user": USER_WRITE_ADMIN, + "authstrat": "kbaseworkspace", + "wsid": job_batch_wsid, + "status": "created", + "job_input": { + "method": "batch", + "service_ver": "batch", + 
"app_id": "batch", + "source_ws_objects": [], + "narrative_cell_info": {}, + }, + "child_jobs": [job_id_1, job_id_2], + "batch_job": True, + "retry_ids": [], + "retry_saved_toggle": False, + } + assert parent_job == expected_parent_job + + expected_sub_1 = _get_condor_sub_for_rj_param_set( + job_id_1, + USER_WRITE_ADMIN, + TOKEN_WRITE_ADMIN, + clientgroup="bigmem", + cpu=4, + mem=2000, + disk=100, + parent_job_id=batch_id, + app_id=None, + app_module=None, + wsid=job_batch_wsid, + ) + expected_sub_1.update( + {"+KB_SOURCE_WS_OBJECTS": "", "+KB_WSID": f'"{job_batch_wsid}"'} + ) + + expected_sub_2 = _get_condor_sub_for_rj_param_set( + job_id_2, + USER_WRITE_ADMIN, + TOKEN_WRITE_ADMIN, + clientgroup="extreme", + cpu=32, + mem=42, + disk=8, + parent_job_id=batch_id, + app_id=None, + app_module=None, + wsid=job_batch_wsid, + ) + expected_sub_2.update( + { + "+KB_SOURCE_WS_OBJECTS": "", + "+KB_WSID": f'"{job_batch_wsid}"', + "+AccountingGroup": '"forrest_gump"', + "+KB_MODULE_NAME": '"mod2"', + "+KB_FUNCTION_NAME": '"meth2"', + "requirements": '(CLIENTGROUP == "extreme") && (baz == "bat") && (foo == "bar")', + "environment": expected_sub_2["environment"].replace( + "DEBUG_MODE=False", "DEBUG_MODE=True" + ), + } + ) + del expected_sub_2["Concurrency_Limits"] + _check_batch_htc_calls( + sub_init, schedd_init, sub, schedd, txn, expected_sub_1, expected_sub_2 + ) + + +def _check_batch_htc_calls( + sub_init, schedd_init, sub, schedd, txn, expected_sub_1, expected_sub_2 +): + assert sub_init.call_args_list == [call(expected_sub_1), call(expected_sub_2)] + # The line above and the line below should be completely equivalent IIUC, but the line + # below fails for reasons I don't understand. The error output shows the actual calls + # for the line below having 2 extra calls that appear to be the sub.queue calls + # below. Stumped, so going with what works and moving on. + # sub_init.assert_has_calls([call(expected_sub_1), call(expected_sub_2)]) + schedd_init.call_args_list = [call(), call()] + # same deal here. Output includes stuff like `call().transaction()` so + # it appears the sub calls are being picked up, which is weird. + # schedd_init.assert_has_calls([call(), call()]) + schedd.transaction.call_args_list = [call(), call()] + # and again + # schedd.transaction.assert_has_calls([call(), call()]) + sub.queue.assert_has_calls([call(txn, 1), call(txn, 1)]) + + +def test_run_job_batch_fail_not_admin(ee2_port, ws_controller): + err = "Access Denied: You are not an administrator. AdminPermissions.NONE" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, [], {"as_admin": True}, err) + + +def test_run_job_batch_fail_only_read_admin(ee2_port, ws_controller): + err = ( + "Access Denied: You are a read-only admin. 
This function requires write access" + ) + _run_job_batch_fail(ee2_port, TOKEN_READ_ADMIN, [], {"as_admin": True}, err) + + +def test_run_job_batch_fail_no_workspace_access_for_batch(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [{"method": _MOD}] + # this error could probably use some cleanup + err = ( + "('An error occurred while fetching user permissions from the Workspace', " + + "ServerError('No workspace with id 2 exists'))" + ) + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 2}, err) + + +def test_run_job_batch_fail_no_allowed_wsid(ee2_port): + params = [ + {"method": _MOD}, + {"method": _MOD, "wsid": 1}, + ] + # this error could probably use some cleanup + err = "Workspace ids are not allowed in RunJobParams in Batch Mode" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_no_workspace_access_for_job(ee2_port): + params = [ + {"method": _MOD}, + {"method": _MOD}, + ] + # this error could probably use some cleanup + err = ( + "('An error occurred while fetching user permissions from the Workspace', " + + "ServerError('No workspace with id 1 exists'))" + ) + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_memory(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + {"method": _MOD}, + {"method": _MOD}, + {"method": _MOD, "job_requirements": {"request_memory": [1000]}}, + ] + err = "Job #3: Found illegal memory request '[1000]' in job requirements from input job" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_scheduler_requirements(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + {"method": _MOD, "job_requirements": {"scheduler_requirements": {"": "foo"}}}, + {"method": _MOD}, + ] + err = "Job #1: Missing input parameter: key in scheduler requirements structure" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_job_reqs_but_no_as_admin(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + {"method": _MOD}, + { + "method": _MOD, + "job_requirements": {"request_memory": 1000}, + # as_admin is only considered in the batch params for run_job_batch + "as_admin": True, + }, + ] + err = "Job #2: In order to specify job requirements you must be a full admin" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_catalog_data(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [{"client_groups": ['{"request_cpus":-8}']}] + + params = [{"method": _MOD}] + # TODO this is not a useful error for the user. Need to change the job reqs resolver + # However, getting this wrong in the catalog is not super likely so not urgent + err = "CPU count must be at least 1" + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_method(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + {"method": _MOD}, + {"method": "mod.meth.moke"}, + ] + err = "Job #2: Unrecognized method: 'mod.meth.moke'. 
Please input module_name.function_name" + # TODO this test surfaced a bug - if a batch wsid is not supplied and any job does not have + # a wsid an error occurs + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_app(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [{"method": _MOD, "app_id": "mod.\bapp"}] + err = "application ID contains control characters" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_bad_upa(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + params = [ + { + "method": _MOD, + "source_ws_objects": ["ws/obj/1"], + } + ] + err = ( + "source_ws_objects index 0, 'ws/obj/1', is not a valid Unique Permanent Address" + ) + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_parent_id(ee2_port, ws_controller): + _set_up_workspace_objects(ws_controller, TOKEN_NO_ADMIN) + + params = [{"method": _MOD, "parent_job_id": "ae"}] + err = "batch jobs may not specify a parent job ID" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + params = [ + {"method": _MOD}, + {"method": _MOD, "parent_job_id": "ae"}, + ] + err = "Job #2: batch jobs may not specify a parent job ID" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def test_run_job_batch_fail_no_such_object(ee2_port, ws_controller): + # Set up workspace and objects + wsc = Workspace(ws_controller.get_url(), token=TOKEN_NO_ADMIN) + wsc.create_workspace({"workspace": "foo"}) + wsc.save_objects( + { + "id": 1, + "objects": [ + {"name": "one", "type": "Trivial.Object-1.0", "data": {}}, + ], + } + ) + params = [{"method": _MOD, "source_ws_objects": ["1/2/1"]}] + err = "Some workspace object is inaccessible" + with patch(CAT_LIST_CLIENT_GROUPS, spec_set=True, autospec=True) as list_cgroups: + list_cgroups.return_value = [] + _run_job_batch_fail(ee2_port, TOKEN_NO_ADMIN, params, {"wsid": 1}, err) + + +def _run_job_batch_fail( + ee2_port, token, params, batch_params, expected, throw_exception=False +): + client = ee2client(f"http://localhost:{ee2_port}", token=token) + if throw_exception: + client.run_job_batch(params, batch_params) + else: + with raises(ServerError) as got: + client.run_job_batch(params, batch_params) + assert_exception_correct(got.value, ServerError("name", 1, expected)) diff --git a/test/tests_for_integration/auth_controller.py b/test/tests_for_integration/auth_controller.py new file mode 100644 index 000000000..fc7484dc2 --- /dev/null +++ b/test/tests_for_integration/auth_controller.py @@ -0,0 +1,161 @@ +""" +A controller for the KBase Auth2 service (https://github.com/kbase/auth2) for use in testing +auth-enabled applications. 
+""" + +# Ported from: +# https://github.com/kbase/sample_service/blob/master/test/auth_controller.py +# May want to set up a python package for this...? + +import os +import requests +import shutil +import subprocess +import tempfile +import time +import zipfile + +from pathlib import Path +from utils_shared.test_utils import TestException +from utils_shared import test_utils + +_AUTH_CLASS = "us.kbase.test.auth2.StandaloneAuthServer" +_JARS_FILE = Path(__file__).resolve().parent.joinpath("authjars") +_RETRY_COUNT = 40 + + +class AuthController: + """ + The main Auth controller class. + + Attributes: + version - the version of the auth service + port - the port for the Auth service. + temp_dir - the location of the Auth data and logs. + """ + + def __init__( + self, + jars_dir: Path, + mongo_host: str, + mongo_db: str, + root_temp_dir: Path, + mongo_user: str = None, + mongo_pwd: str = None, + ): + """ + Create and start a new Auth service. An unused port will be selected for the server. + + :param jars_dir: The path to the lib/jars dir of the KBase Jars repo + (https://github.com/kbase/jars), e.g /path_to_repo/lib/jars. + :param mongo_host: The address of the MongoDB server to use as the Auth service database, + e.g. localhost:27017. + :param mongo_db: The database in which to store Auth data. + :param root_temp_dir: A temporary directory in which to store Auth data and log files. + The files will be stored inside a child directory that is unique per invocation. + :param mongo_user: if the MongoDB server requires authentication, the user name. + :param mongo_pwd: if the MongoDB server requires authentication, the user password. + """ + if not jars_dir or not os.access(jars_dir, os.X_OK): + raise TestException( + "jars_dir {} does not exist or is not executable.".format(jars_dir) + ) + if not mongo_host: + raise TestException("mongo_host must be provided") + if not mongo_db: + raise TestException("mongo_db must be provided") + if not root_temp_dir: + raise TestException("root_temp_dir is None") + + if bool(mongo_user) ^ bool(mongo_pwd): # xor + raise TestException( + "Neither or both of mongo_user and mongo_pwd is required" + ) + + jars_dir = jars_dir.resolve() + class_path = self._get_class_path(jars_dir) + + # make temp dirs + root_temp_dir = root_temp_dir.absolute() + os.makedirs(root_temp_dir, exist_ok=True) + self.temp_dir = Path( + tempfile.mkdtemp(prefix="AuthController-", dir=str(root_temp_dir)) + ) + + self.port = test_utils.find_free_port() + + template_dir = self.temp_dir.joinpath("templates") + self._install_templates(jars_dir, template_dir) + + command = [ + "java", + "-classpath", + class_path, + "-DAUTH2_TEST_MONGOHOST=" + mongo_host, + "-DAUTH2_TEST_MONGODB=" + mongo_db, + "-DAUTH2_TEST_TEMPLATE_DIR=" + str(template_dir), + _AUTH_CLASS, + str(self.port), + ] + if mongo_user: + command.insert(5, "-DAUTH2_TEST_MONGOPWD=" + mongo_pwd) + command.insert(5, "-DAUTH2_TEST_MONGOUSER=" + mongo_user) + + self._outfile = open(self.temp_dir.joinpath("auth.log"), "w") + + self._proc = subprocess.Popen( + command, stdout=self._outfile, stderr=subprocess.STDOUT + ) + + for count in range(_RETRY_COUNT): + err = None + time.sleep(1) # wait for server to start + try: + res = requests.get( + f"http://localhost:{self.port}", + headers={"accept": "application/json"}, + ) + if res.ok: + self.version = res.json()["version"] + break + err = TestException(res.text) + except requests.exceptions.ConnectionError as e: + err = TestException(e.args[0]) + err.__cause__ = e + if err: + raise err + 
self.startup_count = count + 1 + + def destroy(self, delete_temp_files: bool = True): + """ + Shut down the server and optionally delete any files generated. + + :param delete_temp_files: if true, delete all the temporary files generated as part of + running the server. + """ + if self._proc: + self._proc.terminate() + if self._outfile: + self._outfile.close() + if delete_temp_files and self.temp_dir: + shutil.rmtree(self.temp_dir) + + def _install_templates(self, jars_dir: Path, template_dir: Path): + with open(_JARS_FILE) as jf: + template_zip_file = jars_dir.joinpath(jf.readline().strip()) + with zipfile.ZipFile(template_zip_file) as z: + # should really check to see that the entries are safe, but it's our zipfile, so + # don't bother for now. + z.extractall(template_dir) + + def _get_class_path(self, jars_dir: Path): + cp = [] + with open(_JARS_FILE) as jf: + jf.readline() # 1st line is template file + for line in jf: + if line.strip() and not line.startswith("#"): + p = jars_dir.joinpath(line.strip()) + if not p.is_file(): + raise TestException(f"Required jar does not exist: {p}") + cp.append(str(p)) + return ":".join(cp) diff --git a/test/tests_for_integration/authjars b/test/tests_for_integration/authjars new file mode 100644 index 000000000..c79b8d685 --- /dev/null +++ b/test/tests_for_integration/authjars @@ -0,0 +1,74 @@ +kbase/auth2/kbase-auth2templates-0.4.3.zip + +kbase/auth2/kbase-auth2-0.4.3.jar +kbase/auth2/kbase-auth2test-0.4.3.jar + +#lib +apache_commons/commons-codec-1.8.jar +apache_commons/commons-validator-1.5.1.jar +google/guava-18.0.jar +ini4j/ini4j-0.5.2.jar +jcommander/jcommander-1.48.jar +mongo/mongo-java-driver-3.8.2.jar +mustache/compiler-0.9.3.jar +nulab-inc/zxcvbn-1.2.2.jar + +#logging +kbase/common/kbase-common-0.0.22.jar +jna/jna-3.4.0.jar +logback/logback-core-1.1.2.jar +logback/logback-classic-1.1.2.jar +slf4j/slf4j-api-1.7.25.jar +syslog4j/syslog4j-0.9.46.jar + +#yauaa +yauaa/yauaa-1.3.jar +apache_commons/commons-lang3-3.5.jar +apache_commons/commons-collections4-4.1.jar +apache_commons/commons-logging-1.2.jar +apache_commons/commons-io-2.4.jar +kohsuke/args4j-2.33.jar +snakeyaml/snakeyaml-1.18.jar + +#jackson +jackson/jackson-annotations-2.5.4.jar +jackson/jackson-core-2.5.4.jar +jackson/jackson-databind-2.5.4.jar +jackson/jackson-jaxrs-base-2.5.4.jar +jackson/jackson-jaxrs-json-provider-2.5.4.jar +jackson/jackson-module-jaxb-annotations-2.5.4.jar + +#jersey +jersey/entity-filtering/jersey-entity-filtering-2.23.2.jar +jersey/entity-filtering/jersey-media-json-jackson-2.23.2.jar +jersey/mvc/jersey-mvc-2.23.2.jar +jersey/mvc/jersey-mvc-mustache-2.23.2.jar +jersey/jersey-client-2.23.2.jar +jersey/jersey-common-2.23.2.jar +jersey/jersey-container-servlet-2.23.2.jar +jersey/jersey-container-servlet-core-2.23.2.jar +jersey/jersey-guava-2.23.2.jar +jersey/jersey-media-jaxb-2.23.2.jar +jersey/jersey-server-2.23.2.jar + +#jerseydeps +annotation/javax.annotation-api-1.2.jar +asm/asm-debug-all-5.0.4.jar +inject/javax.inject-2.5.0-b05.jar +javassist/javassist-3.20.0-GA.jar +jaxb/jaxb-api-2.2.7.jar +jaxrs/javax.ws.rs-api-2.0.1.jar +osgi/org.osgi.core-4.2.0.jar +persistence/persistence-api-1.0.jar +servlet/javax.servlet-api-3.0.1.jar +validationapi/validation-api-1.1.0.Final.jar + +#jerseydep_hk2 +hk2/aopalliance-repackaged-2.5.0-b05.jar +hk2/hk2-api-2.5.0-b05.jar +hk2/hk2-locator-2.5.0-b05.jar +hk2/hk2-utils-2.5.0-b05.jar +hk2/osgi-resource-locator-1.0.1.jar + +#test +jetty/jetty-all-9.3.11.v20160721-uber.jar diff --git 
a/test/tests_for_integration/workspace_controller.py b/test/tests_for_integration/workspace_controller.py new file mode 100644 index 000000000..92fa59733 --- /dev/null +++ b/test/tests_for_integration/workspace_controller.py @@ -0,0 +1,269 @@ +""" +Q&D Utility to run a Workspace server for the purposes of testing. + +Initializes a GridFS backend and does not support handles, bytestreams or samples. +""" + +import os as _os +import shutil as _shutil +import subprocess as _subprocess +import tempfile as _tempfile +import time as _time +from pathlib import Path as _Path +from pymongo.mongo_client import MongoClient + +import requests as _requests + +from configparser import ConfigParser as _ConfigParser +from installed_clients.WorkspaceClient import Workspace as _Workspace +from installed_clients.baseclient import ServerError as _ServerError + +from utils_shared.test_utils import TestException as _TestException +from utils_shared import test_utils as _test_utils + +_WS_CLASS = "us.kbase.workspace.WorkspaceServer" +_JARS_FILE = _Path(__file__).resolve().parent.joinpath("wsjars") + + +class WorkspaceController: + """ + The main Workspace controller class. The Workspace will allow users with the KBase Auth + service WS_READ_ADMIN role to use read-only administration methods and WS_FULL_ADMIN role + to use all administration methods. + + Attributes: + version - the version of the Workspace service + port - the port for the Workspace service. + temp_dir - the location of the Workspace data and logs. + """ + + # TODO This code likely belongs somewhere else. Not quite sure where though, maybe in WS repo. + # TODO This code is similar to the auth controller code, DRY it up? + + def __init__( + self, + jars_dir: _Path, + mongo_host: str, + mongo_db: str, + mongo_type_db: str, + auth_url: str, + root_temp_dir: _Path, + mongo_user: str = None, + mongo_pwd: str = None, + ): + """ + Create and start a new Workspace service. An unused port will be selected for the server. + + :param jars_dir: The path to the lib/jars dir of the KBase Jars repo + (https://github.com/kbase/jars), e.g /path_to_repo/lib/jars. + :param mongo_host: The address for the MongoDB host. + :param mongo_db: The database in which to store Workspace data. + :param mongo_type_db: The database in which to store Workspace type specifications. + :param auth_url: The root url of an instance of the KBase auth service. + :param root_temp_dir: A temporary directory in which to store Workspace data and log files. + The files will be stored inside a child directory that is unique per invocation. + :param mongo_user: The username for the Mongo account, if provided. The user is expected + to be a user in the provided databases with readWrite permission. + :param mongo_pwd: The password for the Mongo account, if provided.
+ """ + self._check_params( + jars_dir, + mongo_host, + mongo_db, + mongo_type_db, + auth_url, + root_temp_dir, + mongo_user, + mongo_pwd, + ) + + self._db = mongo_db + jars_dir = jars_dir.resolve() + class_path = self._get_class_path(jars_dir) + + # make temp dirs + root_temp_dir = root_temp_dir.absolute() + _os.makedirs(root_temp_dir, exist_ok=True) + self.temp_dir = _Path( + _tempfile.mkdtemp(prefix="WorkspaceController-", dir=str(root_temp_dir)) + ) + ws_temp_dir = self.temp_dir.joinpath("temp_files") + _os.makedirs(ws_temp_dir) + + configfile = self._create_deploy_cfg( + self.temp_dir, + ws_temp_dir, + mongo_host, + mongo_db, + mongo_type_db, + auth_url, + mongo_user, + mongo_pwd, + ) + newenv = _os.environ.copy() + newenv["KB_DEPLOYMENT_CONFIG"] = configfile + + self.port = _test_utils.find_free_port() + + command = ["java", "-classpath", class_path, _WS_CLASS, str(self.port)] + + self._wslog = self.temp_dir / "ws.log" + self._outfile = open(self._wslog, "w") + + self._proc = _subprocess.Popen( + command, stdout=self._outfile, stderr=_subprocess.STDOUT, env=newenv + ) + + self.version, self.startup_count = self._wait_for_service() + self._mongo_client = self._get_mongo_client( + mongo_host, mongo_db, mongo_user, mongo_pwd + ) + + def _check_params( + self, + jars_dir: _Path, + mongo_host: str, + mongo_db: str, + mongo_type_db: str, + auth_url: str, + root_temp_dir: _Path, + mongo_user: str, + mongo_pwd: str, + ): + if not jars_dir or not _os.access(jars_dir, _os.X_OK): + raise _TestException( + "jars_dir {} does not exist or is not executable.".format(jars_dir) + ) + if not mongo_host: + raise _TestException("mongo_controller must be provided") + if not mongo_db: + raise _TestException("mongo_db must be provided") + if not mongo_type_db: + raise _TestException("mongo_type_db must be provided") + if not auth_url: + raise _TestException("auth_url must be provided") + if not root_temp_dir: + raise _TestException("root_temp_dir is None") + if bool(mongo_user) ^ bool(mongo_pwd): # xor + raise _TestException( + "Neither or both of mongo_user and mongo_pwd is required" + ) + + def _get_class_path(self, jars_dir: _Path): + cp = [] + with open(_JARS_FILE) as jf: + for line in jf: + if line.strip() and not line.startswith("#"): + p = jars_dir.joinpath(line.strip()) + if not p.is_file(): + raise _TestException(f"Required jar does not exist: {p}") + cp.append(str(p)) + return ":".join(cp) + + def _create_deploy_cfg( + self, + temp_dir, + ws_temp_dir, + mongo_host, + mongo_db, + mongo_type_db, + auth_url, + mongo_user, + mongo_pwd, + ): + cp = _ConfigParser() + cp["Workspace"] = { + "mongodb-host": mongo_host, + "mongodb-database": mongo_db, + "mongodb-type-database": mongo_type_db, + "backend-type": "GridFS", + "auth-service-url": auth_url + "/api/legacy/KBase", + "auth-service-url-allow-insecure": "true", + # TODO WS trailing slash should not be necessary + # see https://github.com/kbase/workspace_deluxe/issues/350 + "auth2-service-url": auth_url + "/", + "temp-dir": str(ws_temp_dir), + "ignore-handle-service": "true", + "auth2-ws-admin-read-only-roles": "WS_READ_ADMIN", + "auth2-ws-admin-full-roles": "WS_FULL_ADMIN", + } + if mongo_user: + cp["Workspace"]["mongodb-user"] = mongo_user + cp["Workspace"]["mongodb-pwd"] = mongo_pwd + f = temp_dir / "test.cfg" + with open(f, "w") as inifile: + cp.write(inifile) + return f + + def _wait_for_service(self): + ws = _Workspace(f"http://localhost:{self.port}") + for count in range(40): + err = None + _time.sleep(1) # wait for server to start + try: + 
version = ws.ver() + break + except (_ServerError, _requests.exceptions.ConnectionError) as se: + err = _TestException(se.args[0]) + err.__cause__ = se + if err: + print("Error starting workspace service. Dumping logs and throwing error") + self._print_ws_logs() + raise err + return version, count + 1 + + def _get_mongo_client(self, mongo_host, mongo_db, mongo_user, mongo_pwd): + if mongo_user: + mongo_client = MongoClient( + mongo_host, username=mongo_user, password=mongo_pwd, authSource=mongo_db + ) + else: + mongo_client = MongoClient(mongo_host) + # check that the client is correctly connected. See + # https://api.mongodb.com/python/3.7.0/api/pymongo/mongo_client.html + # #pymongo.mongo_client.MongoClient + mongo_client.admin.command("ismaster") + return mongo_client + + def get_url(self): + """ + Get the url for the running workspace instance. + """ + return f"http://localhost:{self.port}" + + def clear_db(self): + """ + Remove all data, but not indexes, from the database. Do not remove any installed types. + """ + db = self._mongo_client[self._db] + for name in db.list_collection_names(): + if not name.startswith("system."): + # don't drop collection since that drops indexes + db.get_collection(name).delete_many({}) + + def destroy(self, delete_temp_files: bool = True, dump_logs_to_stdout: bool = True): + """ + Shut down the server and optionally delete any files generated. + + :param delete_temp_files: if true, delete all the temporary files generated as part of + running the server. + :param dump_logs_to_stdout: Write the contents of the workspace log file to stdout. + This is useful in the context of 3rd party CI services, where the log file is not + necessarily accessible. + """ + if self._proc: + self._proc.terminate() + self._print_ws_logs(dump_logs_to_stdout=dump_logs_to_stdout) + if delete_temp_files and self.temp_dir: + _shutil.rmtree(self.temp_dir) + if self._mongo_client: + self._mongo_client.close() + + # closes logfile + def _print_ws_logs(self, dump_logs_to_stdout=True): + if self._outfile: + self._outfile.close() + if dump_logs_to_stdout: + with open(self._wslog) as f: + for line in f: + print(line) diff --git a/test/tests_for_integration/wsjars b/test/tests_for_integration/wsjars new file mode 100644 index 000000000..c2d134ddc --- /dev/null +++ b/test/tests_for_integration/wsjars @@ -0,0 +1,40 @@ +kbase/workspace/WorkspaceService-0.11.2.jar + +# server code +kbase/common/kbase-common-0.0.24.jar +ini4j/ini4j-0.5.2.jar +jetty/jetty-all-7.0.0.jar +jna/jna-3.4.0.jar +servlet/servlet-api-2.5.jar +syslog4j/syslog4j-0.9.46.jar +joda/joda-time-2.2.jar +annotation/javax.annotation-api-1.3.2.jar + +junit/junit-4.12.jar +hamcrest/hamcrest-core-1.3.jar +kbase/auth/kbase-auth-0.4.4.jar +jackson/jackson-annotations-2.2.3.jar +jackson/jackson-core-2.2.3.jar +jackson/jackson-databind-2.2.3.jar + +# shock client +kbase/shock/shock-client-0.0.16.jar +apache_commons/commons-logging-1.1.1.jar +apache_commons/http/httpclient-4.3.1.jar +apache_commons/http/httpcore-4.3.jar +apache_commons/http/httpmime-4.3.1.jar + +kbase/kidl/kbase-kidl-parser-1409261812-7863aef.jar +apache_commons/commons-codec-1.8.jar +apache_commons/commons-io-2.4.jar +apache_commons/commons-lang3-3.1.jar +mongo/mongo-java-driver-3.8.2.jar +bson4jackson/bson4jackson-2.2.0-2.2.0.jar +slf4j/slf4j-api-1.7.7.jar +logback/logback-core-1.1.2.jar +logback/logback-classic-1.1.2.jar +google/guava-14.0.1.jar +kafka/kafka-clients-2.1.0.jar +kbase/handle/AbstractHandleClient-1.0.0.jar + +# leaving out S3 libs as this is a test 
instance of the WS and will save all data in GFS \ No newline at end of file diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py new file mode 100644 index 000000000..6dec768bf --- /dev/null +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -0,0 +1,1559 @@ +""" +Unit tests for the EE2Runjob class. +""" + +# Incomplete by a long way. Will add more unit tests as they come up. + +import copy +import time +from logging import Logger +from typing import List, Dict, Any +from unittest.mock import create_autospec, call + +from bson.objectid import ObjectId +from pytest import raises + +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.models.models import ( + Job, + JobInput, + JobRequirements, + Meta, + Status, +) +from execution_engine2.exceptions import ( + IncorrectParamsException, + AuthError, + InvalidParameterForBatch, +) +from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions, JobIdPair +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, + JobRequirements as ResolvedRequirements, + AppInfo, + UserCreds, +) +from execution_engine2.utils.Condor import Condor, SubmissionInfo +from execution_engine2.utils.KafkaUtils import ( + KafkaClient, + KafkaQueueChange, + KafkaCreateJob, +) +from execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.catalog_cache import CatalogCache +from execution_engine2.utils.job_requirements_resolver import ( + JobRequirementsResolver, + RequirementsType, +) +from installed_clients.CatalogClient import Catalog +from installed_clients.WorkspaceClient import Workspace +from utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS +from utils_shared.test_utils import assert_exception_correct + +# common variables +_JOB_ID = "603051cfaf2e3401b0500982" +_GIT_COMMIT = "git5678" +_WS_REF_1 = "1/2/3" +_WS_REF_2 = "4/5/6" +_CLUSTER = "cluster42" +_METHOD = "lolcats.lol_unto_death" +_APP = "lolcats/itsmypartyilllolifiwantto" +_USER = "someuser" +_TOKEN = "tokentokentoken" +_OTHER_USER = "some_sucker" +_CREATED_STATE = "created" +_QUEUED_STATE = "queued" + +# batch common variables +_BATCH = "batch" +_GIT_COMMIT_1 = "commit1" +_GIT_COMMIT_2 = "commit2" +_JOB_ID_1 = "603051cfaf2e3401b0500985" +_JOB_ID_2 = "603051cfaf2e3401b0500986" +_METHOD_1 = "module1.method1" +_APP_1 = "module1/app1" +_METHOD_2 = "module2.method2" +_APP_2 = "module2/app2" +_CLUSTER_1 = "cluster1" +_CLUSTER_2 = "cluster2" + +_EMPTY_JOB_REQUIREMENTS = { + "cpus": None, + "memory_MB": None, + "disk_GB": None, + "client_group": None, + "client_group_regex": None, + "bill_to_user": None, + "ignore_concurrency_limits": False, + "scheduler_requirements": None, + "debug_mode": None, +} + + +def _set_up_mocks(user: str, token: str) -> Dict[Any, Any]: + """ + Returns a dictionary of the class that is mocked to the mock of the class, and initializes + the SDKMR getters to return the mocks. + """ + # Can't seem to find a mypy annotation for a class, so Any it is + + # The amount of mocking required here implies the method should be broken up into smaller + # classes that are individually mockable. Or maybe it's just really complicated and this + # is the best we can do. Worth looking into at some point though. 
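The mocks set up just below are all built with create_autospec(..., spec_set=True, instance=True). As a brief standalone illustration, using a throwaway class rather than an ee2 one, this is what those flags buy: the mock enforces the real call signatures and rejects attributes the real class does not have, so typos fail loudly instead of silently recording calls.

from unittest.mock import create_autospec

class FakeClient:
    def ping(self, payload):
        """A stand-in method with one required argument."""

client = create_autospec(FakeClient, spec_set=True, instance=True)
client.ping("hello")                          # matches the real signature
client.ping.assert_called_once_with("hello")
# client.ping()     -> TypeError: missing a required argument: 'payload'
# client.pong = 42  -> AttributeError: spec_set forbids unknown attributes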
+ mocks = get_client_mocks(None, None, *ALL_CLIENTS) + sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) + mocks[SDKMethodRunner] = sdkmr + mocks[Logger] = create_autospec(Logger, spec_set=True, instance=True) + mocks[Workspace] = create_autospec(Workspace, spec_set=True, instance=True) + mocks[WorkspaceAuth] = create_autospec(WorkspaceAuth, spec_set=True, instance=True) + mocks[CatalogCache] = create_autospec(CatalogCache, spec_set=True, instance=True) + + # Set up basic getter calls + sdkmr.get_catalog_cache.return_value = mocks[CatalogCache] + sdkmr.get_catalog.return_value = mocks[Catalog] + sdkmr.get_condor.return_value = mocks[Condor] + sdkmr.get_kafka_client.return_value = mocks[KafkaClient] + sdkmr.get_logger.return_value = mocks[Logger] + sdkmr.get_mongo_util.return_value = mocks[MongoUtil] + sdkmr.get_job_requirements_resolver.return_value = mocks[JobRequirementsResolver] + sdkmr.get_slack_client.return_value = mocks[SlackClient] + sdkmr.get_token.return_value = token + sdkmr.get_user_id.return_value = user + sdkmr.get_workspace.return_value = mocks[Workspace] + sdkmr.get_workspace_auth.return_value = mocks[WorkspaceAuth] + + return mocks + + +def _create_job( + reqs: ResolvedRequirements, + user=_USER, + method=_METHOD, + app=_APP, + state=_CREATED_STATE, + git_commit=_GIT_COMMIT, + batch_id=None, + parent_job_id=None, + source_ws_objects=None, + wsid=None, +): + job = Job() + job.user = user + job.status = state + job.wsid = wsid + job.batch_id = batch_id + ji = JobInput() + ji.method = method + ji.app_id = app + ji.wsid = wsid + ji.service_ver = git_commit + ji.source_ws_objects = source_ws_objects + if parent_job_id: + ji.parent_job_id = parent_job_id + jr = JobRequirements() + jr.clientgroup = reqs.client_group + jr.cpu = reqs.cpus + jr.memory = reqs.memory_MB + jr.disk = reqs.disk_GB + ji.requirements = jr + ji.narrative_cell_info = Meta() + job.job_input = ji + return job + + +def _check_queued_job_save(got_job, job_id, cluster): + expected_job = Job() + expected_job.id = ObjectId(job_id) + expected_job.status = _QUEUED_STATE + # no way to test this really without code refactoring + expected_job.queued = got_job.queued + expected_job.scheduler_type = "condor" + expected_job.scheduler_id = cluster + assert_jobs_equal(got_job, expected_job) + + +def _set_up_common_return_values(mocks): + """ + Set up return values on mocks that are the same for several tests. + """ + mocks[Workspace].get_object_info3.return_value = { + "paths": [[_WS_REF_1], [_WS_REF_2]] + } + mocks[CatalogCache].lookup_git_commit_version.return_value = _GIT_COMMIT + mocks[SDKMethodRunner].save_job.return_value = _JOB_ID + mocks[Condor].run_job.return_value = SubmissionInfo(_CLUSTER, {}, None) + retjob = Job() + retjob.id = ObjectId(_JOB_ID) + retjob.status = _CREATED_STATE + mocks[MongoUtil].get_job.return_value = retjob + + +def _check_common_mock_calls(mocks, reqs, wsid, app=_APP, parent_job_id=None): + """ + Check that mocks are called as expected when those calls are similar or the same for + several tests. 
+ """ + sdkmr = mocks[SDKMethodRunner] + kafka = mocks[KafkaClient] + mocks[Workspace].get_object_info3.assert_called_once_with( + {"objects": [{"ref": _WS_REF_1}, {"ref": _WS_REF_2}], "ignoreErrors": 1} + ) + mocks[CatalogCache].lookup_git_commit_version.assert_called_once_with( + method="lolcats.lol_unto_death", service_ver=None + ) + + # initial job data save + expected_job = _create_job( + reqs, + app=app, + wsid=wsid, + parent_job_id=parent_job_id, + source_ws_objects=[_WS_REF_1, _WS_REF_2], + ) + assert len(sdkmr.save_job.call_args_list) == 2 + got_job = sdkmr.save_job.call_args_list[0][0][0] + assert_jobs_equal(got_job, expected_job) + + kafka.send_kafka_message.assert_any_call(KafkaCreateJob(_USER, _JOB_ID)) + jsp_expected = JobSubmissionParameters( + _JOB_ID, + AppInfo(_METHOD, app), + reqs, + UserCreds(_USER, _TOKEN), + wsid=wsid, + parent_job_id=parent_job_id, + source_ws_objects=[_WS_REF_1, _WS_REF_2], + ) + mocks[Condor].run_job.assert_called_once_with(params=jsp_expected) + + # updated job data save + mocks[MongoUtil].get_job.assert_called_once_with(_JOB_ID) + + # update to queued state + got_job = sdkmr.save_job.call_args_list[1][0][0] + _check_queued_job_save(got_job, _JOB_ID, _CLUSTER) + + kafka.send_kafka_message.assert_called_with( # update to queued state + KafkaQueueChange( + job_id=_JOB_ID, + new_status=_QUEUED_STATE, + previous_status=_CREATED_STATE, + scheduler_id=_CLUSTER, + ) + ) + # Removed for now, but might be added back in at a later point + # mocks[SlackClient].run_job_message.assert_called_once_with(_JOB_ID, _CLUSTER, _USER) + + +def _create_reqs_dict( + cpu, + mem, + disk, + clientgroup, + client_group_regex=None, + ignore_concurrency_limits=None, + debug_mode=None, + merge_with=None, + internal_representation=False, +): + # the bill to user and scheduler requirements keys are different for the concierge endpoint + # so we don't include them. If needed use the merge_with parameter. + if internal_representation: + ret = { + "cpus": cpu, + "memory_MB": mem, + "disk_GB": disk, + } + else: + ret = { + "request_cpus": cpu, + "request_memory": mem, + "request_disk": disk, + } + ret.update( + { + "client_group": clientgroup, + "client_group_regex": client_group_regex, + "ignore_concurrency_limits": ignore_concurrency_limits, + "debug_mode": debug_mode, + } + ) + if merge_with: + ret.update(merge_with) + return ret + + +def test_run_job(): + """ + A basic unit test of the run() method. + + This test is a fairly minimal test of the run() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. + """ + + # Set up data variables + client_group = "myfirstclientgroup" + cpus = 1 + mem = 1 + disk = 1 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. 
These calls are in the order they occur in the code + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.return_value = RequirementsType.STANDARD + reqs = ResolvedRequirements( + cpus=cpus, memory_MB=mem, disk_GB=disk, client_group=client_group + ) + jrr.resolve_requirements.return_value = reqs + _set_up_common_return_values(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": _APP, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + } + assert rj.run(params) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. + jrr.normalize_job_reqs.assert_called_once_with({}, "input job") + jrr.get_requirements_type.assert_called_once_with(**_EMPTY_JOB_REQUIREMENTS) + jrr.resolve_requirements.assert_called_once_with( + _METHOD, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS + ) + _check_common_mock_calls(mocks, reqs, None, _APP) + + +def test_run_job_as_admin_with_job_requirements_and_parent_job(): + """ + A basic unit test of the run() method with an administrative user and job requirements. + + This test is a fairly minimal test of the run() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. + + Does not include an app_id. + + Does include a parent job id. + """ + + # Set up data variables + client_group = "grotesquememlong" + cpus = 4 + mem = 32 + disk = 2600 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. These calls are in the order they occur in the code + jrr.normalize_job_reqs.return_value = _create_reqs_dict( + cpus, mem, disk, client_group, client_group_regex=True, debug_mode=True + ) + jrr.get_requirements_type.return_value = RequirementsType.BILLING + req_args = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=True, + ignore_concurrency_limits=True, + debug_mode=True, + merge_with={ + "bill_to_user": _OTHER_USER, + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + }, + internal_representation=True, + ) + reqs = ResolvedRequirements(**req_args) + jrr.resolve_requirements.return_value = reqs + _set_up_common_return_values(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + inc_reqs = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=1, + ignore_concurrency_limits="righty ho, luv", + debug_mode="true", + merge_with={ + "bill_to_user": _OTHER_USER, + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + }, + ) + params = { + "method": _METHOD, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + "job_requirements": inc_reqs, + "parent_job_id": "thisislikesoooofake", + } + assert rj.run(params, as_admin=True) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. 
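In this test the incoming job_requirements use loose values (client_group_regex=1, debug_mode="true") while the resolved req_args holds real booleans; that coercion belongs to JobRequirementsResolver.normalize_job_reqs, which is mocked out here to return already-normalized values. Purely as an assumed, simplified sketch of that kind of coercion, and not the actual ee2 implementation:

def _as_bool(value):
    # Assumed approximation only; the real normalization lives in
    # JobRequirementsResolver.normalize_job_reqs, which this test mocks out.
    if isinstance(value, str):
        return value.strip().lower() == "true"
    return bool(value)

assert _as_bool(1) is True        # client_group_regex=1 -> True
assert _as_bool("true") is True   # debug_mode="true" -> True
assert _as_bool(0) is False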
+ sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) + jrr.normalize_job_reqs.assert_called_once_with(inc_reqs, "input job") + jrr.get_requirements_type.assert_called_once_with(**req_args) + jrr.resolve_requirements.assert_called_once_with( + _METHOD, mocks[CatalogCache], **req_args + ) + _check_common_mock_calls( + mocks, reqs, None, None, parent_job_id="thisislikesoooofake" + ) + + +def test_run_job_as_concierge_with_wsid(): + """ + A unit test of the run() method with a concierge - but not admin - user. + + Includes an app ID. + """ + + # Set up data variables + client_group = "tinymem" + cpus = 4 + mem = 32 + disk = 2600 + wsid = 78 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + wsauth = mocks[WorkspaceAuth] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. These calls are in the order they occur in the code + wsauth.can_write.return_value = True + jrr.normalize_job_reqs.return_value = _create_reqs_dict( + cpus, mem, disk, client_group, client_group_regex=False, debug_mode=True + ) + reqs = ResolvedRequirements( + cpus=cpus, + memory_MB=mem, + disk_GB=disk, + client_group=client_group, + client_group_regex=False, + ignore_concurrency_limits=False, + bill_to_user=_OTHER_USER, + scheduler_requirements={"foo": "bar", "baz": "bat"}, + debug_mode=True, + ) + jrr.resolve_requirements.return_value = reqs + _set_up_common_return_values(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": _APP, + "wsid": wsid, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + } + conc_params = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=0, + ignore_concurrency_limits=0, + debug_mode=1, + merge_with={ + "account_group": _OTHER_USER, + "requirements_list": [" foo = bar ", "baz=bat"], + }, + ) + assert rj.run(params, concierge_params=conc_params) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. + sdkmr.check_as_concierge.assert_called_once_with() + wsauth.can_write.assert_called_once_with(wsid) + jrr.normalize_job_reqs.assert_called_once_with(conc_params, "concierge parameters") + + jrr.resolve_requirements.assert_called_once_with( + _METHOD, + mocks[CatalogCache], + cpus=cpus, + memory_MB=mem, + disk_GB=disk, + client_group=client_group, + client_group_regex=False, + ignore_concurrency_limits=False, + bill_to_user=_OTHER_USER, + scheduler_requirements={"foo": "bar", "baz": "bat"}, + debug_mode=True, + ) + _check_common_mock_calls(mocks, reqs, wsid) + + +def test_run_job_as_concierge_empty_as_admin(): + """ + A unit test of the run() method with an effectively empty concierge dict and admin privs. + The fake key should be ignored but is required to make the concierge params truthy and + trigger the pathway. + + Also provides a module only app ID, as some KBase processes provide these. + """ + _run_as_concierge_empty_as_admin({"fake": "foo"}, "lolcats") + + +def test_run_job_as_concierge_sched_reqs_None_as_admin(): + """ + A unit test of the run() method with an concierge dict containing None for the scheduler + requirements and admin privs. + + Also provides an app ID with a . 
instead of a / + """ + _run_as_concierge_empty_as_admin( + {"requirements_list": None}, "lolcats.itsmypartyilllolifiwantto" + ) + + +def test_run_job_as_concierge_sched_reqs_empty_list_as_admin(): + """ + A unit test of the run() method with an concierge dict containing an empty list for the + scheduler requirements and admin privs. + """ + _run_as_concierge_empty_as_admin({"requirements_list": []}, _APP) + + +def _run_as_concierge_empty_as_admin(concierge_params, app): + # Set up data variables + client_group = "concierge" # hardcoded default for run_as_concierge + cpus = 1 + mem = 1 + disk = 1 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. These calls are in the order they occur in the code + jrr.normalize_job_reqs.return_value = {} + reqs = ResolvedRequirements( + cpus=cpus, + memory_MB=mem, + disk_GB=disk, + client_group=client_group, + ) + jrr.resolve_requirements.return_value = reqs + _set_up_common_return_values(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": app, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + } + assert rj.run(params, concierge_params=concierge_params, as_admin=True) == _JOB_ID + + # check mocks called as expected. The order here is the order that they're called in the code. + sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) + sdkmr.check_as_concierge.assert_called_once_with() + jrr.normalize_job_reqs.assert_called_once_with( + concierge_params, "concierge parameters" + ) + + jrr.resolve_requirements.assert_called_once_with( + _METHOD, + mocks[CatalogCache], + cpus=None, + memory_MB=None, + disk_GB=None, + client_group=client_group, + client_group_regex=None, + ignore_concurrency_limits=True, + bill_to_user=None, + scheduler_requirements={}, + debug_mode=None, + ) + _check_common_mock_calls(mocks, reqs, None, app) + + +def test_run_job_concierge_fail_bad_params(): + """ + Test that submitting invalid concierge params causes the job to fail. Note that most + error checking happens in the mocked out job requirements resolver, so we only check for + errors that EE2RunJob is responsible for handling. + """ + _run_fail_concierge_params( + {"requirements_list": {"a", "b"}}, + IncorrectParamsException("requirements_list must be a list"), + ) + for err in [None, "", 42, "foo:bar"]: + _run_fail_concierge_params( + {"requirements_list": [err]}, + IncorrectParamsException( + f"Found illegal requirement in requirements_list: {err}" + ), + ) + + +def _run_fail_concierge_params(concierge_params, expected): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + + rj = EE2RunJob(sdkmr) + params = { + "method": _METHOD, + "app_id": _APP, + } + with raises(Exception) as got: + rj.run(params, concierge_params=concierge_params) + assert_exception_correct(got.value, expected) + + +def test_run_job_and_run_job_batch_fail_illegal_arguments(): + """ + Test that illegal arguments cause the job to fail. Note that not all arguments are + checked - this test checks arguments that are checked in the _check_job_arguments() + method. 
Furthermore, most argument checking occurs in the job submission parameters + class and its respective composed classes, and we don't reproduce all the error conditions + possible - just enough to ensure the error checking occurs. If major changes are made to + the error checking code then more tests may need to be written. + + Tests both the run() and run_batch() methods. + """ + # These are extremely annoying to debug as they don't raise a stacktrace if a different exception type was thrown + # or let you know that it was an entirely different exception, or if the exception happened in the bulk version of the run + + _run_and_run_batch_fail_illegal_arguments( + {}, IncorrectParamsException("Missing input parameter: method ID") + ) + + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar", "wsid": 0}, + IncorrectParamsException("wsid must be at least 1"), + InvalidParameterForBatch(), + ) + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar", "wsid": -1}, + IncorrectParamsException("wsid must be at least 1"), + InvalidParameterForBatch(), + ) + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar", "source_ws_objects": {"a": "b"}}, + IncorrectParamsException("source_ws_objects must be a list"), + ) + _run_and_run_batch_fail_illegal_arguments( + {"method": "foo.bar", "job_requirements": ["10 bob", "a pickled egg"]}, + IncorrectParamsException("job_requirements must be a mapping"), + ) + _run_and_run_batch_fail_illegal_arguments( + { + "method": "foo.bar", + "job_requirements": { + "bill_to_user": { + "Bill": "$3.78", + "Boris": "$2.95", + "AJ": "one BILIIOOOON dollars", + "Sumin": "$1,469,890.42", + } + }, + }, + IncorrectParamsException("bill_to_user must be a string"), + ) + + +def _run_and_run_batch_fail_illegal_arguments(params, expected, batch_expected=None): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.resolve_requirements.return_value = ResolvedRequirements(1, 1, 1, "cg") + _run_and_run_batch_fail(mocks[SDKMethodRunner], params, expected, batch_expected) + + +def test_run_job_and_run_job_batch_fail_arg_normalization(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + e = "Found illegal request_cpus 'like 10 I guess? IDK' in job requirements from input job" + jrr.normalize_job_reqs.side_effect = IncorrectParamsException(e) + _run_and_run_batch_fail( + mocks[SDKMethodRunner], + { + "method": "foo.bar", + "job_requirements": {"request_cpus": "like 10 I guess? 
IDK"}, + }, + IncorrectParamsException(e), + ) + + +def test_run_job_and_run_job_batch_fail_get_requirements_type(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + e = "bill_to_user contains control characters" + jrr.get_requirements_type.side_effect = IncorrectParamsException(e) + _run_and_run_batch_fail( + mocks[SDKMethodRunner], + {"method": "foo.bar", "job_requirements": {"bill_to_user": "ding\bding"}}, + IncorrectParamsException(e), + ) + + +def test_run_job_and_run_job_batch_fail_not_admin_with_job_reqs(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.return_value = RequirementsType.PROCESSING + _run_and_run_batch_fail( + mocks[SDKMethodRunner], + {"method": "foo.bar", "job_requirements": {"ignore_concurrency_limits": 1}}, + AuthError("In order to specify job requirements you must be a full admin"), + as_admin=False, + ) + + +def test_run_job_and_run_job_batch_fail_resolve_requirements(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.return_value = RequirementsType.STANDARD + e = "Unrecognized method: 'None'. Please input module_name.function_name" + jrr.resolve_requirements.side_effect = IncorrectParamsException(e) + _run_and_run_batch_fail(mocks[SDKMethodRunner], {}, IncorrectParamsException(e)) + + +def test_run_job_and_run_job_batch_fail_workspace_objects_check(): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + jrr.resolve_requirements.return_value = ResolvedRequirements(1, 1, 1, "cg") + mocks[Workspace].get_object_info3.return_value = { + "paths": ["1/2/3", None, "21/34/55"] + } + + params = { + "method": "foo.bar", + "app_id": "foo/baz", + "source_ws_objects": ["1/2/3", "5/8/13", "21/34/55"], + } + _run_and_run_batch_fail( + sdkmr, params, ValueError("Some workspace object is inaccessible") + ) + + +def _run_and_run_batch_fail( + sdkmr, params, expected, batch_expected=None, as_admin=True +): + rj = EE2RunJob(sdkmr) + with raises(Exception) as got: + rj.run(params, as_admin=as_admin) + assert_exception_correct(got.value, expected) + + if batch_expected: + expected = batch_expected + _run_batch_fail(rj, [params], {}, as_admin, expected) + + +def _set_up_common_return_values_batch(mocks, returned_job_state=_QUEUED_STATE): + """ + Set up return values on mocks that are the same for several tests. 
+ """ + mocks[Workspace].get_object_info3.return_value = { + "paths": [[_WS_REF_1], [_WS_REF_2]] + } + returned_parent_job = Job() + returned_parent_job.id = ObjectId(_JOB_ID) + returned_parent_job.user = _USER + + mocks[SDKMethodRunner].save_and_return_job.return_value = returned_parent_job + mocks[CatalogCache].lookup_git_commit_version.side_effect = [ + _GIT_COMMIT_1, + _GIT_COMMIT_2, + ] + + # create job1, update job1, create job2, update job2, update parent job + mocks[SDKMethodRunner].save_job.side_effect = [ + _JOB_ID_1, + None, + _JOB_ID_2, + None, + None, + ] + + mocks[SDKMethodRunner].save_jobs.side_effect = [ + [_JOB_ID_1, _JOB_ID_2], + ] + + mocks[Condor].run_job.side_effect = [ + SubmissionInfo(_CLUSTER_1, {}, None), + SubmissionInfo(_CLUSTER_2, {}, None), + ] + retjob_1 = Job() + retjob_1.id = ObjectId(_JOB_ID_1) + retjob_1.status = _CREATED_STATE + retjob_2 = Job() + retjob_2.id = ObjectId(_JOB_ID_2) + retjob_2.status = _CREATED_STATE + + retjob_1_after_submit = Job() + retjob_1_after_submit.id = ObjectId(_JOB_ID_1) + retjob_1_after_submit.status = returned_job_state + retjob_1_after_submit.scheduler_id = _CLUSTER_1 + retjob_2_after_submit = Job() + retjob_2_after_submit.id = ObjectId(_JOB_ID_2) + retjob_2_after_submit.status = returned_job_state + retjob_2_after_submit.scheduler_id = _CLUSTER_2 + + mocks[MongoUtil].get_job.side_effect = [retjob_1, retjob_2] + mocks[MongoUtil].get_jobs.side_effect = [ + [retjob_1_after_submit, retjob_2_after_submit] + ] + + +def _check_common_mock_calls_batch( + mocks, reqs1, reqs2, parent_wsid, terminated_during_submit=False +): + """ + Check that mocks are called as expected when those calls are similar or the same for + several tests. + """ + sdkmr = mocks[SDKMethodRunner] + mocks[Workspace].get_object_info3.assert_called_once_with( + {"objects": [{"ref": _WS_REF_1}, {"ref": _WS_REF_2}], "ignoreErrors": 1} + ) + + # parent job initial save + expected_parent_job = Job() + job_input = JobInput() + job_input.service_ver = _BATCH + job_input.app_id = _BATCH + job_input.method = _BATCH + job_input.narrative_cell_info = Meta() + expected_parent_job.job_input = job_input + expected_parent_job.batch_job = True + expected_parent_job.status = _CREATED_STATE + expected_parent_job.wsid = parent_wsid + expected_parent_job.user = _USER + assert len(sdkmr.save_and_return_job.call_args_list) == 1 + got_parent_job = sdkmr.save_and_return_job.call_args_list[0][0][0] + assert_jobs_equal(got_parent_job, expected_parent_job) + + mocks[CatalogCache].lookup_git_commit_version.assert_has_calls( + [ + call(method="module1.method1", service_ver=None), + call(method="module2.method2", service_ver=None), + ] + ) + + assert len(sdkmr.save_jobs.call_args_list) == 1 + + # initial child jobs data save + expected_job_1 = _create_job( + reqs1, + method=_METHOD_1, + app=_APP_1, + git_commit=_GIT_COMMIT_1, + source_ws_objects=[_WS_REF_1, _WS_REF_2], + wsid=parent_wsid, + batch_id=_JOB_ID, + ) + got_job_1 = sdkmr.save_jobs.call_args_list[0][0][0][0] + assert_jobs_equal(got_job_1, expected_job_1) + + expected_job_2 = _create_job( + reqs2, + method=_METHOD_2, + app=_APP_2, + git_commit=_GIT_COMMIT_2, + wsid=parent_wsid, + batch_id=_JOB_ID, + ) + # index 1 because save_jobs returns a list of two jobs + got_job_2 = sdkmr.save_jobs.call_args_list[0][0][0][1] + assert_jobs_equal(got_job_2, expected_job_2) + + jsp_expected_1 = JobSubmissionParameters( + _JOB_ID_1, + AppInfo(_METHOD_1, _APP_1), + reqs1, + UserCreds(_USER, _TOKEN), + parent_job_id=_JOB_ID, + 
source_ws_objects=[_WS_REF_1, _WS_REF_2], + wsid=parent_wsid, + ) + jsp_expected_2 = JobSubmissionParameters( + _JOB_ID_2, + AppInfo(_METHOD_2, _APP_2), + reqs2, + UserCreds(_USER, _TOKEN), + parent_job_id=_JOB_ID, + wsid=parent_wsid, + ) + mocks[Condor].run_job.assert_has_calls( + [call(params=jsp_expected_1), call(params=jsp_expected_2)] + ) + + # update to queued state + child_job_pairs = [ + JobIdPair(_JOB_ID_1, _CLUSTER_1), + JobIdPair(_JOB_ID_2, _CLUSTER_2), + ] + mocks[MongoUtil].update_jobs_to_queued.assert_has_calls([call(child_job_pairs)]) + job_ids = [child_job_pair.job_id for child_job_pair in child_job_pairs] + mocks[MongoUtil].get_jobs.assert_has_calls([call(job_ids)]) + + if not terminated_during_submit: + mocks[KafkaClient].send_kafka_message.assert_has_calls( + [ + call(KafkaCreateJob(job_id=_JOB_ID, user=_USER)), # parent job + call(KafkaCreateJob(job_id=_JOB_ID_1, user=_USER)), + call(KafkaCreateJob(job_id=_JOB_ID_2, user=_USER)), + call( + KafkaQueueChange( + job_id=_JOB_ID_1, + new_status=_QUEUED_STATE, + previous_status=_CREATED_STATE, + scheduler_id=_CLUSTER_1, + ) + ), + call( + KafkaQueueChange( + job_id=_JOB_ID_2, + new_status=_QUEUED_STATE, + previous_status=_CREATED_STATE, + scheduler_id=_CLUSTER_2, + ) + ), + ] + ) + else: + mocks[KafkaClient].send_kafka_message.assert_has_calls( + [ + call(KafkaCreateJob(job_id=_JOB_ID, user=_USER)), # parent job + call(KafkaCreateJob(job_id=_JOB_ID_1, user=_USER)), + call(KafkaCreateJob(job_id=_JOB_ID_2, user=_USER)), + ] + ) + mocks[SDKMethodRunner].cancel_job.assert_has_calls( + [ + call(job_id=_JOB_ID_1, terminated_code=0), + call(job_id=_JOB_ID_2, terminated_code=0), + ] + ) + + # Removed for now, but might be added back in if run_job_message is re-added + # mocks[SlackClient].run_job_message.assert_has_calls( + # [ + # call(job_id=_JOB_ID_1, scheduler_id=_CLUSTER_1, username=_USER), + # call(job_id=_JOB_ID_2, scheduler_id=_CLUSTER_2, username=_USER), + # ] + # ) + + # Test to see if add_child jobs is called with correct batch_container and children + expected_batch_container = Job() + expected_batch_container.id = ObjectId(_JOB_ID) + expected_batch_container.user = _USER + + batch_job = sdkmr.add_child_jobs.call_args_list[0][1]["batch_job"] + sdkmr.add_child_jobs.assert_called_once_with( + batch_job=expected_batch_container, child_jobs=[_JOB_ID_1, _JOB_ID_2] + ) + """ + So this test doesn't actually check that the call is correct, but the assert_jobs_equal line below does + the assert below is necessary because of how equality works for Job objects + ( because they have the same object ID, which is what Job equality is based on. ) + and that the assert_called_once_with doesn't correctly check the job object + """ + assert_jobs_equal(batch_job, expected_batch_container) + + +def test_run_job_batch_with_cancellation_during_submit(): + """ + A basic unit test of the run_batch() method, providing a workspace ID for the parent job. This one also checks for + cancellation during submit causing a job cancellation request to be processed . + + This test is a fairly minimal test of the run_batch() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. 
+ """ + # When an assertion is failed, this test doesn't show you where failed in PyCharm, so use + # Additional arguments `--no-cov -s` or run from cmd line + # PYTHONPATH=.:lib:test pytest test/tests_for_sdkmr/EE2Runjob_test.py::test_run_job_batch_with_parent_job_wsid --no-cov + + # set up variables + parent_wsid = 89 + wsid = 32 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. These calls are in the order they occur in the code + mocks[WorkspaceAuth].can_write.return_value = True + mocks[WorkspaceAuth].can_write_list.return_value = {wsid: True} + + jrr.normalize_job_reqs.side_effect = [{}, {}] + jrr.get_requirements_type.side_effect = [ + RequirementsType.STANDARD, + RequirementsType.STANDARD, + ] + reqs1 = ResolvedRequirements( + cpus=1, + memory_MB=2, + disk_GB=3, + client_group="cg1", + ) + reqs2 = ResolvedRequirements( + cpus=10, + memory_MB=20, + disk_GB=30, + client_group="cg2", + ) + jrr.resolve_requirements.side_effect = [reqs1, reqs2] + + _set_up_common_return_values_batch( + mocks, returned_job_state=Status.terminated.value + ) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = [ + { + "method": _METHOD_1, + "app_id": _APP_1, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + }, + { + "method": _METHOD_2, + "app_id": _APP_2, + "wsid": wsid, + }, + ] + params[1]["wsid"] = None + assert rj.run_batch(params, {"wsid": parent_wsid}) == { + "batch_id": _JOB_ID, + "child_job_ids": [_JOB_ID_1, _JOB_ID_2], + } + # May need to increase sleep if thread takes too long + time.sleep(0.1) + + # check mocks called as expected. The order here is the order that they're called in the code. + mocks[WorkspaceAuth].can_write.assert_called_once_with(parent_wsid) + + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.assert_has_calls( + [call({}, "input job"), call({}, "input job")] + ) + jrr.get_requirements_type.assert_has_calls( + [call(**_EMPTY_JOB_REQUIREMENTS), call(**_EMPTY_JOB_REQUIREMENTS)] + ) + jrr.resolve_requirements.assert_has_calls( + [ + call(_METHOD_1, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + call(_METHOD_2, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + ] + ) + _check_common_mock_calls_batch( + mocks, reqs1, reqs2, parent_wsid, terminated_during_submit=True + ) + + +def test_run_job_batch_with_parent_job_wsid(): + """ + A basic unit test of the run_batch() method, providing a workspace ID for the parent job. + + This test is a fairly minimal test of the run_batch() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. + """ + # When an assertion is failed, this test doesn't show you where failed in PyCharm, so use + # Additional arguments `--no-cov -s` or run from cmd line + # PYTHONPATH=.:lib:test pytest test/tests_for_sdkmr/EE2Runjob_test.py::test_run_job_batch_with_parent_job_wsid --no-cov + + # set up variables + parent_wsid = 89 + wsid = 32 + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. 
+ + # Set up call returns. These calls are in the order they occur in the code + mocks[WorkspaceAuth].can_write.return_value = True + mocks[WorkspaceAuth].can_write_list.return_value = {wsid: True} + + jrr.normalize_job_reqs.side_effect = [{}, {}] + jrr.get_requirements_type.side_effect = [ + RequirementsType.STANDARD, + RequirementsType.STANDARD, + ] + reqs1 = ResolvedRequirements( + cpus=1, + memory_MB=2, + disk_GB=3, + client_group="cg1", + ) + reqs2 = ResolvedRequirements( + cpus=10, + memory_MB=20, + disk_GB=30, + client_group="cg2", + ) + jrr.resolve_requirements.side_effect = [reqs1, reqs2] + + _set_up_common_return_values_batch(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + params = [ + { + "method": _METHOD_1, + "app_id": _APP_1, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + }, + { + "method": _METHOD_2, + "app_id": _APP_2, + "wsid": wsid, + }, + ] + with raises(InvalidParameterForBatch) as got: + rj.run_batch(copy.deepcopy(params), {"wsid": parent_wsid}) + assert_exception_correct(got.value, InvalidParameterForBatch()) + + params[1]["wsid"] = None + assert rj.run_batch(params, {"wsid": parent_wsid}) == { + "batch_id": _JOB_ID, + "child_job_ids": [_JOB_ID_1, _JOB_ID_2], + } + # May need to increase sleep if thread takes too long + time.sleep(0.1) + + # check mocks called as expected. The order here is the order that they're called in the code. + mocks[WorkspaceAuth].can_write.assert_called_once_with(parent_wsid) + + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.assert_has_calls( + [call({}, "input job"), call({}, "input job")] + ) + jrr.get_requirements_type.assert_has_calls( + [call(**_EMPTY_JOB_REQUIREMENTS), call(**_EMPTY_JOB_REQUIREMENTS)] + ) + jrr.resolve_requirements.assert_has_calls( + [ + call(_METHOD_1, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + call(_METHOD_2, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + ] + ) + _check_common_mock_calls_batch(mocks, reqs1, reqs2, parent_wsid) + + +def test_run_job_batch_as_admin_with_job_requirements(): + """ + A basic unit test of the run_batch() method with an administrative user and supplied job + requirements. + + This test is a fairly minimal test of the run_batch() method. It does not exercise all the + potential code paths or provide all the possible run inputs, such as job parameters, cell + metadata, etc. + """ + # set up variables + cpus = 89 + mem = 3 + disk = 10000 + client_group = "verylargeclientgroup" + + # set up mocks + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + jrr = mocks[JobRequirementsResolver] + # We intentionally do not check the logger methods as there are a lot of them and this is + # already a very large test. This may be something to be added later when needed. + + # Set up call returns. 
These calls are in the order they occur in the code + jrr.normalize_job_reqs.side_effect = [ + {}, + _create_reqs_dict( + cpus, mem, disk, client_group, client_group_regex=True, debug_mode=True + ), + ] + jrr.get_requirements_type.side_effect = [ + RequirementsType.STANDARD, + RequirementsType.BILLING, + ] + req_args = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=True, + ignore_concurrency_limits=True, + debug_mode=True, + merge_with={ + "bill_to_user": _OTHER_USER, + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + }, + internal_representation=True, + ) + reqs1 = ResolvedRequirements( + cpus=1, memory_MB=1, disk_GB=1, client_group="verysmallclientgroup" + ) + reqs2 = ResolvedRequirements(**req_args) + jrr.resolve_requirements.side_effect = [reqs1, reqs2] + + _set_up_common_return_values_batch(mocks) + + # set up the class to be tested and run the method + rj = EE2RunJob(sdkmr) + inc_reqs = _create_reqs_dict( + cpus, + mem, + disk, + client_group, + client_group_regex=1, + ignore_concurrency_limits="righty ho, luv", + debug_mode="true", + merge_with={ + "bill_to_user": _OTHER_USER, + "scheduler_requirements": {"foo": "bar", "baz": "bat"}, + }, + ) + params = [ + { + "method": _METHOD_1, + "app_id": _APP_1, + "source_ws_objects": [_WS_REF_1, _WS_REF_2], + }, + { + "method": _METHOD_2, + "app_id": _APP_2, + "job_requirements": inc_reqs, + }, + ] + assert rj.run_batch(params, {}, as_admin=True) == { + "batch_id": _JOB_ID, + "child_job_ids": [_JOB_ID_1, _JOB_ID_2], + } + # May need to increase sleep if thread takes too long + time.sleep(0.1) + + # check mocks called as expected. The order here is the order that they're called in the code. + sdkmr.check_as_admin.assert_called_once_with(JobPermissions.WRITE) + jrr.normalize_job_reqs.assert_has_calls( + [call({}, "input job"), call(inc_reqs, "input job")] + ) + jrr.get_requirements_type.assert_has_calls( + [call(**_EMPTY_JOB_REQUIREMENTS), call(**req_args)] + ) + jrr.resolve_requirements.assert_has_calls( + [ + call(_METHOD_1, mocks[CatalogCache], **_EMPTY_JOB_REQUIREMENTS), + call(_METHOD_2, mocks[CatalogCache], **req_args), + ] + ) + _check_common_mock_calls_batch(mocks, reqs1, reqs2, None) + + +def test_run_batch_preflight_failures(): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + rj = EE2RunJob(sdkmr) + with raises(Exception) as got: + rj._preflight(runjob_params=[], batch_params=[]) + + assert_exception_correct( + got.value, + expected=IncorrectParamsException( + "RunJobParams and BatchParams cannot be identical" + ), + ) + + with raises(Exception) as got: + rj._preflight(runjob_params=[], batch_params={"batch": "batch"}) + + assert_exception_correct( + got.value, + expected=IncorrectParamsException( + "Programming error, you forgot to set the new_batch_job flag to True" + ), + ) + + +def test_run_batch_fail_params_not_list_or_batch_not_mapping(): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + rj = EE2RunJob(sdkmr) + for params in [ + None, + {}, + { + 1, + }, + "a", + 8, + ]: + _run_batch_fail( + rj, params, {}, True, IncorrectParamsException("params must be a list") + ) + + _run_batch_fail( + rj, [], [], True, IncorrectParamsException("batch params must be a mapping") + ) + + +# Note the next few tests are specifically testing that errors for multiple jobs have the +# correct job number + + +def test_run_job_batch_fail_illegal_arguments(): + """ + Test that illegal arguments cause the job to fail. 
Note that not all arguments are + checked - this test checks arguments that are checked in the _check_job_arguments() + method. Furthermore, most argument checking occurs in the job submission parameters + class and its respective composed classes, and we don't reproduce all the error conditions + possible - just enough to ensure the error checking occurs. If major changes are made to + the error checking code then more tests may need to be written. + + """ + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.resolve_requirements.return_value = ResolvedRequirements(1, 1, 1, "cg") + rj = EE2RunJob(mocks[SDKMethodRunner]) + job = {"method": "foo.bar"} + + _run_batch_fail( + rj, + [job, job, {}], + {}, + True, + IncorrectParamsException("Job #3: Missing input parameter: method ID"), + ) + _run_batch_fail( + rj, + [job, {"method": "foo.bar", "wsid": 0}], + {}, + True, + InvalidParameterForBatch(), + ) + _run_batch_fail( + rj, + [{"method": "foo.bar", "source_ws_objects": {"a": "b"}}, job], + {}, + True, + IncorrectParamsException("Job #1: source_ws_objects must be a list"), + ) + _run_batch_fail( + rj, + [job, {"method": "foo.bar", "job_requirements": ["10 bob", "a pickled egg"]}], + {}, + True, + IncorrectParamsException("Job #2: job_requirements must be a mapping"), + ) + _run_batch_fail( + rj, + [{"method": "foo.bar", "job_requirements": {"bill_to_user": 1}}, job], + {}, + True, + IncorrectParamsException("Job #1: bill_to_user must be a string"), + ) + + +def test_run_job_batch_fail_arg_normalization(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + e = "Found illegal request_cpus 'like 10 I guess? IDK' in job requirements from input job" + jrr.normalize_job_reqs.side_effect = [{}, IncorrectParamsException(e)] + _run_batch_fail( + EE2RunJob(mocks[SDKMethodRunner]), + [ + {"method": "foo.bar"}, + { + "method": "foo.bar", + "job_requirements": {"request_cpus": "like 10 I guess? IDK"}, + }, + ], + {}, + True, + IncorrectParamsException("Job #2: " + e), + ) + + +def test_run_job_batch_fail_get_requirements_type(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + e = "bill_to_user contains control characters" + jrr.get_requirements_type.side_effect = [ + RequirementsType.STANDARD, + RequirementsType.STANDARD, + IncorrectParamsException(e), + ] + _run_batch_fail( + EE2RunJob(mocks[SDKMethodRunner]), + [ + {"method": "foo.bar"}, + {"method": "foo.bar"}, + {"method": "foo.bar", "job_requirements": {"bill_to_user": "ding\bding"}}, + ], + {}, + False, + IncorrectParamsException("Job #3: " + e), + ) + + +def test_run_job_batch_fail_not_admin_with_job_reqs(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.side_effect = [ + RequirementsType.PROCESSING, + RequirementsType.STANDARD, + ] + _run_batch_fail( + EE2RunJob(mocks[SDKMethodRunner]), + [ + {"method": "foo.bar", "job_requirements": {"ignore_concurrency_limits": 1}}, + {"method": "foo.bar"}, + ], + {}, + False, + AuthError( + "Job #1: In order to specify job requirements you must be a full admin" + ), + ) + + +def test_run_job_batch_fail_resolve_requirements(): + mocks = _set_up_mocks(_USER, _TOKEN) + jrr = mocks[JobRequirementsResolver] + jrr.normalize_job_reqs.return_value = {} + jrr.get_requirements_type.return_value = RequirementsType.STANDARD + e = "Unrecognized method: 'None'. 
Please input module_name.function_name" + jr = ResolvedRequirements(cpus=4, memory_MB=4, disk_GB=4, client_group="cg") + jrr.resolve_requirements.side_effect = [jr, IncorrectParamsException(e)] + _run_batch_fail( + EE2RunJob(mocks[SDKMethodRunner]), + [{}, {"method": "foo.bar"}], + {}, + False, + IncorrectParamsException("Job #2: " + e), + ) + + +def test_run_job_batch_fail_parent_id_included(): + mocks = _set_up_mocks(_USER, _TOKEN) + sdkmr = mocks[SDKMethodRunner] + rj = EE2RunJob(sdkmr) + + _run_batch_fail( + rj, + [{"method": "foo.bar", "app_id": "foo/bat", "parent_job_id": "a"}], + {}, + True, + IncorrectParamsException("batch jobs may not specify a parent job ID"), + ) + + _run_batch_fail( + rj, + [ + {"method": "foo.bar", "app_id": "foo/bat"}, + {"method": "foo.bar", "app_id": "foo/bat", "parent_job_id": "a"}, + ], + {}, + True, + IncorrectParamsException("Job #2: batch jobs may not specify a parent job ID"), + ) + + +def _run_batch_fail(run_job, params, batch_params, as_admin, expected): + with raises(Exception) as got: + run_job.run_batch(params, batch_params, as_admin=as_admin) + assert_exception_correct(got.value, expected) + + +def assert_jobs_equal(got_job: Job, expected_job: Job): + """ + Checks that the two jobs are equivalent, except that the 'updated' fields are checked that + they're within 1 second of each other. + """ + # Job inherits from Document which inherits from BaseDocument in MongoEngine. BD provides + # the __eq__ method for the hierarchy, which bases equality on the Jobs having equal id + # fields, or if no id is present, on identity. Therefore + # assert job1 == job2 + # will not work as a test mechanic. + # JobInput and its contained classes inherit from EmbeddedDocument which *does* have an + # __eq__ method that takes the class fields into account. + # Also note that all these classes use __slots__ so vars() and __dict__ are empty other + # than the class name. + # Hence we do this disgusting hack instead. Note it will need to be updated any time a + # job field is added. + + if not hasattr(got_job, "id"): + assert not hasattr(expected_job, "id") + else: + assert got_job.id == expected_job.id + + # The Job class fills the updated field with the output of time.time on instantiation + # so we can't do a straight equality + assert abs(got_job.updated - expected_job.updated) < 1 + + job_fields = [ + "user", + "authstrat", + "wsid", + "status", + "queued", + "estimating", + "running", + "finished", + "errormsg", + "msg", + "error", + "terminated_code", + "error_code", + "scheduler_type", + "scheduler_id", + "scheduler_estimator_id", + "job_input", + "job_output", + "condor_job_ads", + "child_jobs", + "batch_job", + "batch_id", + ] + + _assert_field_subset_equal(got_job, expected_job, job_fields) + + +def _assert_field_subset_equal(obj1: object, obj2: object, fields: List[str]): + """ + Checks that field subsets from two objects are the same. + + :param obj1: The first object + :param obj2: The second object + :param fields: The fields in the objects to compare for equality. Any fields in the object + not in this list are ignored and not included in the equality calculation. + :raises AttributeError: If the field is not present in one or both of the objects. 
+ """ + for field in fields: + assert getattr(obj1, field) == getattr(obj2, field), field diff --git a/test/tests_for_sdkmr/EE2StatusRange_test.py b/test/tests_for_sdkmr/EE2StatusRange_test.py new file mode 100644 index 000000000..f515898f1 --- /dev/null +++ b/test/tests_for_sdkmr/EE2StatusRange_test.py @@ -0,0 +1,134 @@ +""" +Unit tests for the EE2StatusRange class. +""" + +from pytest import raises + +from logging import Logger +from unittest.mock import create_autospec, call +from bson.objectid import ObjectId + +from execution_engine2.exceptions import AuthError +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.sdk.EE2StatusRange import JobStatusRange +from execution_engine2.db.models.models import Job + +from utils_shared.test_utils import assert_exception_correct + +# Incomplete by a long way. Will add more unit tests as they come up. + +USER1 = "user1" + + +def test_run_minimal_no_user_in_input(): + """ + Tests a minimal run of the job lookup method as a standard user with no username passed into + the method. + The returned job has minimal fields. + """ + _run_minimal(None) + + +def test_run_minimal_self_user_in_input(): + """ + Tests a minimal run of the job lookup method as a standard user with the user's own username + passed into the method. + The returned job has minimal fields. + """ + _run_minimal(USER1) + + +def _run_minimal(user): + # set up constants + expected_user = USER1 + job_count = 26 + objectid = "603051cfaf2e3401b0500982" + created_state = "created" + expected_job_filter = { + "id__gt": "000000230000000000000000", + "id__lt": "0000005c0000000000000000", + "user": expected_user, + } + + # set up mock return values. Ordered as per the call order in the EE2SR code. + sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) + logger = create_autospec(Logger, spec_set=True, instance=True) + sdkmr.get_logger.return_value = logger + sdkmr.get_user_id.return_value = expected_user + sdkmr.check_and_convert_time.side_effect = [35.6, 92.4] + sdkmr.get_job_counts.return_value = job_count + + j = Job() + j.id = ObjectId(objectid) + j.user = expected_user + j.updated = 1000000.0 + j.status = created_state + sdkmr.get_jobs.return_value = [j] + + # call the method + ee2sr = JobStatusRange(sdkmr) + ret = ee2sr.check_jobs_date_range_for_user("5/6/21", "7/6/21", user=user) + + assert ret == { + "count": 1, + "filter": expected_job_filter, + "jobs": [ + { + "_id": objectid, + "authstrat": "kbaseworkspace", + "batch_job": False, + "child_jobs": [], + # this comes from the ObjectID, which has an embedded date + "created": 1613779407000, + "job_id": objectid, + "status": created_state, + "updated": 1000000000, + "user": expected_user, + "retry_ids": [], + } + ], + "limit": 2000, + "projection": [], + "query_count": job_count, + "skip": 0, + "sort_order": "+", + "stats": { + "app_id": {None: 1}, + "clientgroup": {None: 1}, + "method": {None: 1}, + "status": {created_state: 1}, + "user": {expected_user: 1}, + "wsid": {None: 1}, + }, + } + + # check mocks called as expected. 
Ordered as per the call order in the EE2SR code + sdkmr.check_and_convert_time.assert_has_calls([call("5/6/21"), call("7/6/21")]) + sdkmr.get_job_counts.assert_called_once_with(expected_job_filter) + sdkmr.get_jobs.assert_called_once_with(expected_job_filter, [], "+", 0, 2000) + logger.debug.assert_called_once_with( + "Searching for jobs with id_gt 000000230000000000000000 id_lt 0000005c0000000000000000" + ) + + +def test_run_with_non_matching_user_and_not_admin(): + """ + Test that a user trying to see another user's jobs without admin privs fails as expected. + """ + sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) + sdkmr.get_user_id.return_value = "user1" + sdkmr.check_is_admin.return_value = False + + ee2sr = JobStatusRange(sdkmr) + with raises(Exception) as got: + ee2sr.check_jobs_date_range_for_user("5/6/21", "7/6/21", user="user2") + assert_exception_correct( + got.value, + AuthError( + "You are not authorized to view all records or records for others. " + + "user=user2 token=user1" + ), + ) + + sdkmr.get_user_id.assert_has_calls([call(), call()]) + sdkmr.check_is_admin.assert_called_once_with() diff --git a/test/tests_for_sdkmr/EE2Status_test.py b/test/tests_for_sdkmr/EE2Status_test.py new file mode 100644 index 000000000..26596dc6f --- /dev/null +++ b/test/tests_for_sdkmr/EE2Status_test.py @@ -0,0 +1,128 @@ +""" +Unit tests for the EE2Status class. +""" + +from logging import Logger +from unittest.mock import create_autospec, call +from bson.objectid import ObjectId + +from execution_engine2.db.models.models import Job, Status, JobInput +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.sdk.EE2Status import JobsStatus, JobPermissions +from execution_engine2.db.MongoUtil import MongoUtil +from lib.execution_engine2.utils.KafkaUtils import KafkaClient, KafkaFinishJob +from lib.execution_engine2.utils.Condor import Condor +from installed_clients.CatalogClient import Catalog + + +def _finish_job_complete_minimal_get_test_job(job_id, sched, app_id, gitcommit, user): + job = Job() + job.id = ObjectId(job_id) + job.running = 123.0 + job.finished = 456.5 + job.status = Status.running.value + job.scheduler_id = sched + job_input = JobInput() + job.job_input = job_input + job_input.app_id = app_id + job_input.method = "module.method_id" + job_input.service_ver = gitcommit + job.user = user + return job + + +def test_finish_job_complete_minimal_without_app_id(): + _finish_job_complete_minimal(None, None) + + +def test_finish_job_complete_minimal_with_app_id(): + _finish_job_complete_minimal("module/myapp", "module") + + +def _finish_job_complete_minimal(app_id, app_module): + """ + Tests a very simple case of completing a job successfully by the `finish_job` method. 
+ """ + # set up constants + job_id = "6046b539ce9c58ecf8c3e5f3" + job_output = {"version": "1.1", "id": job_id, "result": [{"foo": "bar"}]} + user = "someuser" + gitcommit = "somecommit" + resources = {"fake": "condor", "resources": "in", "here": "yo"} + sched = "somescheduler" + + # set up mocks + sdkmr = create_autospec(SDKMethodRunner, spec_set=True, instance=True) + logger = create_autospec(Logger, spec_set=True, instance=True) + mongo = create_autospec(MongoUtil, spec_set=True, instance=True) + kafka = create_autospec(KafkaClient, spec_set=True, instance=True) + catalog = create_autospec(Catalog, spec_set=True, instance=True) + condor = create_autospec(Condor, spec_set=True, instance=True) + sdkmr.get_mongo_util.return_value = mongo + sdkmr.get_logger.return_value = logger + sdkmr.get_kafka_client.return_value = kafka + sdkmr.get_condor.return_value = condor + sdkmr.get_catalog.return_value = catalog + + # set up return values for mocks. Ordered as per order of operations in code + job1 = _finish_job_complete_minimal_get_test_job( + job_id, sched, app_id, gitcommit, user + ) + job2 = _finish_job_complete_minimal_get_test_job( + job_id, sched, app_id, gitcommit, user + ) + job2.status = Status.completed.value + + sdkmr.get_job_with_permission.side_effect = [job1, job2] + mongo.get_job.return_value = job2 # gets the job 3x...? + condor.get_job_resource_info.return_value = resources + + # call the method + JobsStatus(sdkmr).finish_job(job_id, job_output=job_output) # no return + + # check mocks called as expected. Ordered as per order of operations in code + + sdkmr.get_job_with_permission.assert_has_calls( + [ + call( + job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=False + ), + call( + job_id=job_id, requested_job_perm=JobPermissions.WRITE, as_admin=False + ), + ] + ) + logger.debug.assert_has_calls( + [ + call("Finishing job with a success"), + # depending on stable dict ordering for this test to pass + call(f"Extracted the following condor job ads {resources}"), + ] + ) + mongo.finish_job_with_success.assert_called_once_with(job_id, job_output) + kafka.send_kafka_message.assert_called_once_with( + KafkaFinishJob( + job_id=job_id, + new_status=Status.completed.value, + previous_status=Status.running.value, + scheduler_id=sched, + error_code=None, + error_message=None, + ) + ) + mongo.get_job.assert_called_once_with(job_id) + les_expected = { + "user_id": user, + "func_module_name": "module", + "func_name": "method_id", + "git_commit_hash": gitcommit, + "creation_time": 1615246649.0, # from Job ObjectId + "exec_start_time": 123.0, + "finish_time": 456.5, + "is_error": 0, + "job_id": job_id, + } + if app_id: + les_expected.update({"app_id": app_id, "app_module_name": app_module}) + catalog.log_exec_stats.assert_called_once_with(les_expected) + mongo.update_job_resources.assert_called_once_with(job_id, resources) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py index a70178c41..279cf0438 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_EE2Logs_test.py @@ -7,9 +7,10 @@ import requests_mock -from lib.execution_engine2.db.MongoUtil import MongoUtil -from lib.execution_engine2.db.models.models import Job, JobLog -from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.models.models import Job, JobLog +from 
execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.utils_shared.test_utils import ( bootstrap, run_job_adapter, @@ -34,9 +35,11 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token - ) + with open(deploy) as cf: + cls.method_runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cf), + ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -44,7 +47,7 @@ def setUpClass(cls): db=cls.cfg["mongo-database"], col=cls.cfg["mongo-jobs-collection"] ) - cls.test_helper = ee2_sdkmr_test_helper(cls.method_runner) + cls.test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: return copy.copy(self.__class__.method_runner) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py index 5a0cd0933..ce98c0fb9 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test.py @@ -6,45 +6,63 @@ import time import unittest from configparser import ConfigParser -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from pprint import pprint -from unittest.mock import patch +from unittest.mock import patch, create_autospec import bson import dateutil import requests_mock from bson import ObjectId from mock import MagicMock - -from lib.execution_engine2.db.MongoUtil import MongoUtil -from lib.execution_engine2.db.models.models import Job, Status, TerminatedCode -from lib.execution_engine2.exceptions import AuthError -from lib.execution_engine2.exceptions import InvalidStatusTransitionException -from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from pytest import raises + +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.models.models import Job, Status, TerminatedCode +from execution_engine2.exceptions import AuthError +from execution_engine2.exceptions import InvalidStatusTransitionException +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient +from execution_engine2.utils.clients import UserClientSet, ClientSet +from execution_engine2.utils.clients import get_user_client_set, get_client_set +from execution_engine2.utils.job_requirements_resolver import ( + JobRequirementsResolver, + RequirementsType, +) from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper +from test.utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS from test.utils_shared.test_utils import ( bootstrap, get_example_job, validate_job_state, run_job_adapter, + assert_exception_correct, ) from tests_for_db.mongo_test_helper import MongoTestHelper logging.basicConfig(level=logging.INFO) bootstrap() -from lib.execution_engine2.sdk.EE2Runjob import EE2RunJob +from execution_engine2.sdk.EE2Runjob import EE2RunJob + +from 
installed_clients.CatalogClient import Catalog +from installed_clients.WorkspaceClient import Workspace +# TODO this isn't necessary with pytest, can just use regular old functions class ee2_SDKMethodRunner_test(unittest.TestCase): @classmethod def setUpClass(cls): - config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") - logging.info(f"Loading config from {config_file}") + cls.config_file = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") + logging.info(f"Loading config from {cls.config_file}") config_parser = ConfigParser() - config_parser.read(config_file) + config_parser.read(cls.config_file) cls.cfg = {} @@ -59,9 +77,11 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token - ) + with open(cls.config_file) as cf: + cls.method_runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cf), + ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -69,13 +89,7 @@ def setUpClass(cls): db=cls.cfg["mongo-database"], col=cls.cfg["mongo-jobs-collection"] ) - cls.cr = CondorResources( - request_cpus="1", - request_disk="1GB", - request_memory="100M", - client_group="njs", - ) - cls.sdkmr_test_helper = ee2_sdkmr_test_helper(mr=cls.method_runner) + cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: # Initialize these clients from None @@ -84,6 +98,7 @@ def getRunner(self) -> SDKMethodRunner: runner.get_jobs_status() runner.get_runjob() runner.get_job_logs() + runner.get_catalog_cache() return runner def create_job_rec(self): @@ -121,6 +136,97 @@ def create_job_rec(self): # self.assertEqual(len(git_commit_1), len(git_commit_2)) # self.assertNotEqual(git_commit_1, git_commit_2) + def test_init_fail(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + user_clients = UserClientSet("user", "token", ws, wsa) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + clients = clients_and_mocks[ClientSet] + + self._init_fail(None, clients, ValueError("user_clients is required")) + self._init_fail(user_clients, None, ValueError("clients is required")) + + def _init_fail(self, cfg, user_clients, expected): + with raises(Exception) as e: + SDKMethodRunner(cfg, user_clients) + assert_exception_correct(e.value, expected) + + def test_getters(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + user_clients = UserClientSet("user", "token", ws, wsa) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + + sdkmr = SDKMethodRunner(user_clients, clients_and_mocks[ClientSet]) + + assert sdkmr.get_catalog_cache() is sdkmr.catalog_cache + assert sdkmr.get_workspace() is ws + assert sdkmr.get_workspace_auth() is wsa + assert sdkmr.get_user_id() == "user" + assert sdkmr.get_token() == "token" + assert sdkmr.get_kafka_client() is clients_and_mocks[KafkaClient] + assert sdkmr.get_mongo_util() is clients_and_mocks[MongoUtil] + assert sdkmr.get_slack_client() is clients_and_mocks[SlackClient] + assert sdkmr.get_condor() is clients_and_mocks[Condor] + assert sdkmr.get_catalog() is clients_and_mocks[Catalog] + assert ( + sdkmr.get_job_requirements_resolver() + is clients_and_mocks[JobRequirementsResolver] + ) + + def test_save_job_and_save_jobs(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + cliset = UserClientSet("user", "token", ws, wsa) + 
clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + sdkmr = SDKMethodRunner(cliset, clients_and_mocks[ClientSet]) + + # We cannot use spec_set=True here because the code must access the Job.id field, + # which is set dynamically. This means if the Job api changes, this test could pass + # when it should fail, but there doesn't seem to be a way around that other than + # completely rewriting how the code interfaces with MongoDB. + # For a discussion of spec_set see + # https://www.seanh.cc/2017/03/17/the-problem-with-mocks/ + j = create_autospec(Job, spec_set=False, instance=True) + j.id = bson.objectid.ObjectId("603051cfaf2e3401b0500982") + assert sdkmr.save_job(j) == "603051cfaf2e3401b0500982" + j.save.assert_called_once_with() + + # Test Save Jobs + job1 = Job() + job1.id = bson.objectid.ObjectId("603051cfaf2e3401b0500980") + job2 = Job() + job2.id = bson.objectid.ObjectId("603051cfaf2e3401b0500981") + sdkmr.get_mongo_util().insert_jobs.return_value = [job1.id, job2.id] + jobs = sdkmr.save_jobs([job1, job2]) + sdkmr.get_mongo_util().insert_jobs.assert_called_with( + jobs_to_insert=[job1, job2] + ) + assert jobs == [str(job1.id), str(job2.id)] + + def test_add_child_jobs(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + cliset = UserClientSet("user", "token", ws, wsa) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + sdkmr = SDKMethodRunner(cliset, clients_and_mocks[ClientSet]) + j = create_autospec(Job, spec_set=False, instance=True) + returned_job = sdkmr.add_child_jobs(batch_job=j, child_jobs=["a", "b", "c"]) + j.modify.assert_called_once_with(add_to_set__child_jobs=["a", "b", "c"]) + assert returned_job == j + + def test_save_and_return_job(self): + ws = Workspace("https://fake.com") + wsa = WorkspaceAuth("user", ws) + cliset = UserClientSet("user", "token", ws, wsa) + clients_and_mocks = get_client_mocks(self.cfg, self.config_file, *ALL_CLIENTS) + sdkmr = SDKMethodRunner(cliset, clients_and_mocks[ClientSet]) + + j = create_autospec(Job, spec_set=True, instance=True) + assert sdkmr.save_and_return_job(j) == j + + j.save.assert_called_once_with() + # Status @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_cancel_job(self, condor): @@ -128,12 +234,11 @@ def test_cancel_job(self, condor): sdk = self.getRunner() sdk.condor = condor - with sdk.get_mongo_util().mongo_engine_connection(): - job = get_example_job() - job.user = self.user_id - job.wsid = self.ws_id - job.save() - job_id = job.id + job = get_example_job() + job.user = self.user_id + job.wsid = self.ws_id + job.save() + job_id = job.id logging.info( f"Created job in wsid={job.wsid} status={job.status} scheduler={job.scheduler_id}. About to cancel {job_id}" @@ -150,11 +255,10 @@ def test_cancel_job(self, condor): TerminatedCode.terminated_by_user, ) - with sdk.get_mongo_util().mongo_engine_connection(): - job = get_example_job() - job.user = self.user_id - job.wsid = self.ws_id - job_id = job.save().id + job = get_example_job() + job.user = self.user_id + job.wsid = self.ws_id + job_id = job.save().id logging.info( f"Created job {job_id} in {job.wsid} status {job.status}. 
About to cancel" @@ -189,8 +293,6 @@ def test_cancel_job2(self, rq_mock, condor_mock): runner = self.getRunner() runner.workspace_auth = MagicMock() runner.auth.get_user = MagicMock(return_value=user_name) - runner.is_admin = True - runner._is_admin = MagicMock(return_value=True) runner.workspace_auth.can_read = MagicMock(return_value=True) runner.get_permissions_for_workspace = MagicMock(return_value=True) @@ -199,24 +301,21 @@ def test_cancel_job2(self, rq_mock, condor_mock): # runner.get_runjob = MagicMock(return_value="git_commit_goes_here") runner.get_condor = MagicMock(return_value=condor_mock) + fixed_rj = EE2RunJob(runner) - fixed_rj._get_module_git_commit = MagicMock(return_value="hash_goes_here") - fixed_rj.sdkmr.catalog_utils.list_client_group_configs = MagicMock( - return_value="cg goes her" - ) + # _get_module_git_commitfixed_rj._get_module_git_commit = MagicMock(return_value="hash_goes_here") runner.get_runjob = MagicMock(return_value=fixed_rj) # ctx = {"user_id": self.user_id, "wsid": self.ws_id, "token": self.token} job = get_example_job().to_mongo().to_dict() - job["method"] = job["job_input"]["app_id"] + job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) print("About to run job with params") pprint(job) job_id0 = runner.run_job(params=job) @@ -287,50 +386,49 @@ def test_check_job_canceled(self, mongo_util): # # runner = self.getRunner() - with self.mongo_util.mongo_engine_connection(): - job_id = self.create_job_rec() - - call_count = 0 - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertFalse(rv["finished"]) - call_count += 1 - # estimating - runner.update_job_status(job_id=job_id, status=Status.estimating.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertFalse(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.queued.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertFalse(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.running.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertFalse(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.completed.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertTrue(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.error.value) - rv = runner.check_job_canceled(job_id) - self.assertFalse(rv["canceled"]) - self.assertTrue(rv["finished"]) - call_count += 1 - - runner.update_job_status(job_id=job_id, status=Status.terminated.value) - rv = runner.check_job_canceled(job_id) - self.assertTrue(rv["canceled"]) - self.assertTrue(rv["finished"]) - call_count += 1 + job_id = self.create_job_rec() + + call_count = 0 + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertFalse(rv["finished"]) + call_count += 1 + # estimating + runner.update_job_status(job_id=job_id, status=Status.estimating.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertFalse(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, 
status=Status.queued.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertFalse(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, status=Status.running.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertFalse(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, status=Status.completed.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertTrue(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, status=Status.error.value) + rv = runner.check_job_canceled(job_id) + self.assertFalse(rv["canceled"]) + self.assertTrue(rv["finished"]) + call_count += 1 + + runner.update_job_status(job_id=job_id, status=Status.terminated.value) + rv = runner.check_job_canceled(job_id) + self.assertTrue(rv["canceled"]) + self.assertTrue(rv["finished"]) + call_count += 1 @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -349,13 +447,12 @@ def test_run_job_and_add_log(self, rq_mock, condor_mock): ) runner.get_condor = MagicMock(return_value=condor_mock) job = get_example_job(user=self.user_id, wsid=self.ws_id).to_mongo().to_dict() - job["method"] = job["job_input"]["app_id"] + job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) job_id = runner.run_job(params=job) logging.info(f"Job id is {job_id} ") @@ -489,86 +586,82 @@ def test_run_job_and_add_log(self, rq_mock, condor_mock): @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_finish_job(self, condor): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) - - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - - runner = self.getRunner() - runner._test_job_permissions = MagicMock(return_value=True) - runner.catalog_utils.catalog.log_exec_stats = MagicMock(return_value=True) - - # test missing job_id input - with self.assertRaises(ValueError) as context1: - logging.info("Finish Job Case 0 Raises Error") - runner.finish_job(job_id=None) - self.assertEqual("Please provide a valid job id", str(context1.exception)) - - # test finish job with invalid status (This was removed) - # with self.assertRaises(ValueError) as context2: - # logging.info("Finish Job Case 1 Raises Error") - # runner.finish_job(job_id=job_id) - # self.assertIn("Unexpected job status", str(context2.exception)) - - # update job status to running - - runner.start_job(job_id=job_id, skip_estimation=True) - - # self.mongo_util.update_job_status(job_id=job_id, status=Status.running.value) - # job.running = datetime.datetime.utcnow() - # job.save() - - # test finish job without error - job_output = dict() - job_output["version"] = "1" - job_output["id"] = "5d54bdcb9b402d15271b3208" # A valid objectid - job_output["result"] = {"output": "output"} - logging.info("Case2 : Finish a running job") - - print(f"About to finish job {job_id}. 
The job status is currently") - print(runner.get_job_status_field(job_id)) - try: - runner.finish_job(job_id=job_id, job_output=job_output) - except: - pass - print("Job is now finished, status is") - print(runner.get_job_status_field(job_id)) - self.assertEqual( - {"status": "completed"}, runner.get_job_status_field(job_id) - ) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, Status.completed.value) - self.assertFalse(job.errormsg) - self.assertTrue(job.finished) - # if job_output not a dict# - # job_output2 = job.job_output.to_mongo().to_dict() - job_output2 = job.job_output - self.assertEqual(job_output2["version"], "1") - self.assertEqual(str(job_output2["id"]), job_output["id"]) - - # update finished status to running - with self.assertRaises(InvalidStatusTransitionException): - self.mongo_util.update_job_status( - job_id=job_id, status=Status.running.value - ) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + + runner = self.getRunner() + runner._test_job_permissions = MagicMock(return_value=True) + runner.get_catalog().log_exec_stats = MagicMock(return_value=True) + + # test missing job_id input + with self.assertRaises(ValueError) as context1: + logging.info("Finish Job Case 0 Raises Error") + runner.finish_job(job_id=None) + self.assertEqual("Please provide a valid job id", str(context1.exception)) + + # test finish job with invalid status (This was removed) + # with self.assertRaises(ValueError) as context2: + # logging.info("Finish Job Case 1 Raises Error") + # runner.finish_job(job_id=job_id) + # self.assertIn("Unexpected job status", str(context2.exception)) + + # update job status to running + + runner.start_job(job_id=job_id, skip_estimation=True) + + # self.mongo_util.update_job_status(job_id=job_id, status=Status.running.value) + # job.running = datetime.datetime.utcnow() + # job.save() + + # test finish job without error + job_output = dict() + job_output["version"] = "1" + job_output["id"] = "5d54bdcb9b402d15271b3208" # A valid objectid + job_output["result"] = {"output": "output"} + logging.info("Case2 : Finish a running job") + + print(f"About to finish job {job_id}. 
The job status is currently") + print(runner.get_job_status_field(job_id)) + try: + runner.finish_job(job_id=job_id, job_output=job_output) + except: + pass + print("Job is now finished, status is") + print(runner.get_job_status_field(job_id)) + self.assertEqual({"status": "completed"}, runner.get_job_status_field(job_id)) + + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, Status.completed.value) + self.assertFalse(job.errormsg) + self.assertTrue(job.finished) + # if job_output not a dict# + # job_output2 = job.job_output.to_mongo().to_dict() + job_output2 = job.job_output + self.assertEqual(job_output2["version"], "1") + self.assertEqual(str(job_output2["id"]), job_output["id"]) + + # update finished status to running + with self.assertRaises(InvalidStatusTransitionException): + self.mongo_util.update_job_status( + job_id=job_id, status=Status.running.value + ) @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_finish_job_with_error_message(self, condor): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - job = self.mongo_util.get_job(job_id=job_id) - new_count = Job.objects.count() - self.assertEqual(ori_job_count, new_count - 1) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + job = self.mongo_util.get_job(job_id=job_id) + new_count = Job.objects.count() + self.assertEqual(ori_job_count, new_count - 1) runner = self.getRunner() - condor.get_job_info = MagicMock(return_value={}) + condor._get_job_info = MagicMock(return_value={}) condor.get_job_resource_info = MagicMock(return_value={}) runner.condor = condor runner._send_exec_stats_to_catalog = MagicMock(return_value=True) @@ -593,10 +686,8 @@ def test_finish_job_with_error_message(self, condor): self.assertIsNone(job.error) self.assertTrue(job.finished) - with self.mongo_util.mongo_engine_connection(): - job_id = runner.update_job_status( - job_id, "running" - ) # put job back to running status + # put job back to running status + job_id = runner.update_job_status(job_id, "running") error = { "message": "error message", @@ -628,43 +719,44 @@ def test_check_job_global_perm(self, rq_mock): user_roles=[], ) ) - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) - - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - self.assertFalse(job.running) - self.assertFalse(job.estimating) - - # test check_job - runner = self.getRunner() - job_state = runner.check_job(job_id) - json.dumps(job_state) # make sure it's JSON serializable - self.assertTrue(validate_job_state(job_state)) - self.assertEqual(job_state["status"], "created") - self.assertEqual(job_state["wsid"], self.ws_id) - - self.assertAlmostEqual( - job_state["created"] / 1000.0, job_state["updated"] / 1000.0, places=-1 - ) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - # test globally - job_states = runner.get_jobs_status().check_workspace_jobs(self.ws_id) - self.assertTrue(job_id in job_states) - self.assertEqual(job_states[job_id]["status"], "created") + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + self.assertFalse(job.running) + self.assertFalse(job.estimating) - # now test with a 
different user + # test check_job + runner = self.getRunner() + job_state = runner.check_job(job_id) + json.dumps(job_state) # make sure it's JSON serializable + self.assertTrue(validate_job_state(job_state)) + self.assertEqual(job_state["status"], "created") + self.assertEqual(job_state["wsid"], self.ws_id) + + self.assertAlmostEqual( + job_state["created"] / 1000.0, job_state["updated"] / 1000.0, places=-1 + ) + + # test globally + job_states = runner.get_jobs_status().check_workspace_jobs(self.ws_id) + self.assertTrue(job_id in job_states) + self.assertEqual(job_states[job_id]["status"], "created") + + # now test with a different user + with open(self.config_file) as cf: other_method_runner = SDKMethodRunner( - self.cfg, user_id="some_other_user", token="other_token" - ) - job_states = other_method_runner.get_jobs_status().check_workspace_jobs( - self.ws_id + get_user_client_set(self.cfg, "some_other_user", "other_token"), + get_client_set(self.cfg, cf), ) - self.assertTrue(job_id in job_states) - self.assertEqual(job_states[job_id]["status"], "created") + job_states = other_method_runner.get_jobs_status().check_workspace_jobs( + self.ws_id + ) + self.assertTrue(job_id in job_states) + self.assertEqual(job_states[job_id]["status"], "created") @requests_mock.Mocker() def test_check_job_ok(self, rq_mock): @@ -675,165 +767,144 @@ def test_check_job_ok(self, rq_mock): ) ) - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - job_id_1 = self.create_job_rec() - job_id_fake = str(bson.objectid.ObjectId()) - print(f"Saved job_id {job_id}") - print(f"Saved job_id_1 {job_id_1}") - print(f"Created fake {job_id_fake}") - - new_count = Job.objects.count() - self.assertEqual(ori_job_count, new_count - 2) - - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - self.assertFalse(job.running) - self.assertFalse(job.estimating) - - runner = self.getRunner() - runner._test_job_permissions = MagicMock(return_value=True) - - # test missing job_id input - with self.assertRaises(ValueError) as context: - runner.check_job(None) - self.assertEqual("Please provide valid job_id", str(context.exception)) - - # test check_job in a regular way - job_state = runner.check_job(job_id) - json.dumps(job_state) # make sure it's JSON serializable - self.assertTrue(validate_job_state(job_state)) - self.assertEqual(job_state["status"], "created") - self.assertEqual(job_state["wsid"], self.ws_id) - # Test both - job_state1 = runner.check_job(job_id_1) - self.assertEqual(job_state1["status"], "created") - - print(f'Job status of {job_id}={job_state["status"]}') - print(f'Job status of {job_id_1}={job_state1["status"]}') - - # test check_job with exclude_fields - job_state_exclude = runner.check_job(job_id, exclude_fields=["status"]) - self.assertFalse("status" in job_state_exclude.keys()) - self.assertEqual(job_state_exclude["wsid"], self.ws_id) - - # test check_job with exclude_fields - job_state_exclude2 = runner.check_job(job_id, exclude_fields=["status"]) - self.assertFalse("status" in job_state_exclude2.keys()) - self.assertEqual(job_state_exclude2["wsid"], self.ws_id) - - # test check_jobs - job_states_rl_0 = runner.check_jobs( - [job_id, job_id_1, job_id_fake], return_list=0 - ) - logging.info( - json.dumps(job_states_rl_0) - ) # make sure it's JSON serializable - self.assertEqual(len(job_states_rl_0.keys()), 3) - self.assertEqual(list(job_states_rl_0.keys())[0], job_id) - 
self.assertEqual(list(job_states_rl_0.keys())[1], job_id_1) - self.assertEqual(list(job_states_rl_0.keys())[2], job_id_fake) - self.assertTrue(validate_job_state(job_states_rl_0[job_id])) - self.assertTrue(job_id in job_states_rl_0) - self.assertEqual(job_states_rl_0[job_id]["status"], "created") - self.assertEqual(job_states_rl_0[job_id]["wsid"], self.ws_id) - - # test check_jobs return list - job_states_rl_1 = runner.check_jobs( - [job_id, job_id_1, job_id_fake], return_list=1 - )["job_states"] - json.dumps(job_states_rl_1) # make sure it's JSON serializable - self.assertEqual(len(job_states_rl_1), 3) - self.assertEqual(job_states_rl_1[0]["job_id"], job_id) - self.assertEqual(job_states_rl_1[1]["job_id"], job_id_1) - self.assertEqual(job_states_rl_1[2], []) - self.assertTrue(isinstance(job_states_rl_1, list)) - print(type(job_states_rl_1)) - self.assertCountEqual(job_states_rl_1, list(job_states_rl_0.values())) - - job_states_list_rl_t = runner.check_jobs( - [job_id, job_id_1], return_list="True" - )["job_states"] - json.dumps(job_states_list_rl_t) # make sure it's JSON serializable - self.assertEqual(job_states_list_rl_t[0]["job_id"], job_id) - self.assertEqual(job_states_list_rl_t[1]["job_id"], job_id_1) - self.assertTrue(isinstance(job_states_list_rl_t, list)) - self.assertCountEqual( - job_states_list_rl_t, list(job_states_rl_0.values())[:2] - ) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + job_id_1 = self.create_job_rec() + job_id_fake = str(bson.objectid.ObjectId()) + print(f"Saved job_id {job_id}") + print(f"Saved job_id_1 {job_id_1}") + print(f"Created fake {job_id_fake}") - # test check_jobs with exclude_fields - job_states_rl0_exclude_wsid = runner.check_jobs( - [job_id], exclude_fields=["wsid"], return_list=0 - ) - self.assertTrue(job_id in job_states_rl0_exclude_wsid) - self.assertFalse("wsid" in job_states_rl0_exclude_wsid[job_id].keys()) - self.assertEqual(job_states_rl0_exclude_wsid[job_id]["status"], "created") - - # test check_workspace_jobs - job_states_from_workspace_check = ( - runner.get_jobs_status().check_workspace_jobs( - self.ws_id, return_list="False" - ) - ) - for job_id_from_wsid in job_states_from_workspace_check: - self.assertTrue(job_states_from_workspace_check[job_id_from_wsid]) - print("Job States are") - for job_key in job_states_from_workspace_check: - if job_key in job_states_rl_1: - print( - job_key, - job_states_from_workspace_check[job_key]["status"], - runner.check_job(job_id=job_key)["status"], - job_states_rl_0[job], - ) + new_count = Job.objects.count() + self.assertEqual(ori_job_count, new_count - 2) - json.dumps( - job_states_from_workspace_check - ) # make sure it's JSON serializable - self.assertTrue(job_id in job_states_from_workspace_check) - self.assertEqual( - job_states_from_workspace_check[job_id]["status"], "created" - ) - self.assertEqual( - job_states_from_workspace_check[job_id]["wsid"], self.ws_id - ) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + self.assertFalse(job.running) + self.assertFalse(job.estimating) - self.assertTrue(job_id_1 in job_states_from_workspace_check) - self.assertEqual( - job_states_from_workspace_check[job_id_1]["status"], "created" - ) - self.assertEqual( - job_states_from_workspace_check[job_id_1]["wsid"], self.ws_id - ) + runner = self.getRunner() + runner._test_job_permissions = MagicMock(return_value=True) + + # test missing job_id input + with self.assertRaises(ValueError) as context: + 
runner.check_job(None) + self.assertEqual("Please provide valid job_id", str(context.exception)) + + # test check_job in a regular way + job_state = runner.check_job(job_id) + json.dumps(job_state) # make sure it's JSON serializable + self.assertTrue(validate_job_state(job_state)) + self.assertEqual(job_state["status"], "created") + self.assertEqual(job_state["wsid"], self.ws_id) + # Test both + job_state1 = runner.check_job(job_id_1) + self.assertEqual(job_state1["status"], "created") + + print(f'Job status of {job_id}={job_state["status"]}') + print(f'Job status of {job_id_1}={job_state1["status"]}') + + # test check_job with exclude_fields + job_state_exclude = runner.check_job(job_id, exclude_fields=["status"]) + self.assertFalse("status" in job_state_exclude.keys()) + self.assertEqual(job_state_exclude["wsid"], self.ws_id) + + # test check_job with exclude_fields + job_state_exclude2 = runner.check_job(job_id, exclude_fields=["status"]) + self.assertFalse("status" in job_state_exclude2.keys()) + self.assertEqual(job_state_exclude2["wsid"], self.ws_id) + + # test check_jobs + job_states_rl_0 = runner.check_jobs( + [job_id, job_id_1, job_id_fake], return_list=0 + ) + logging.info(json.dumps(job_states_rl_0)) # make sure it's JSON serializable + self.assertEqual(len(job_states_rl_0.keys()), 3) + self.assertEqual(list(job_states_rl_0.keys())[0], job_id) + self.assertEqual(list(job_states_rl_0.keys())[1], job_id_1) + self.assertEqual(list(job_states_rl_0.keys())[2], job_id_fake) + self.assertTrue(validate_job_state(job_states_rl_0[job_id])) + self.assertTrue(job_id in job_states_rl_0) + self.assertEqual(job_states_rl_0[job_id]["status"], "created") + self.assertEqual(job_states_rl_0[job_id]["wsid"], self.ws_id) + + # test check_jobs return list + job_states_rl_1 = runner.check_jobs( + [job_id, job_id_1, job_id_fake], return_list=1 + )["job_states"] + json.dumps(job_states_rl_1) # make sure it's JSON serializable + self.assertEqual(len(job_states_rl_1), 3) + self.assertEqual(job_states_rl_1[0]["job_id"], job_id) + self.assertEqual(job_states_rl_1[1]["job_id"], job_id_1) + self.assertEqual(job_states_rl_1[2], []) + self.assertTrue(isinstance(job_states_rl_1, list)) + print(type(job_states_rl_1)) + self.assertCountEqual(job_states_rl_1, list(job_states_rl_0.values())) + + job_states_list_rl_t = runner.check_jobs( + [job_id, job_id_1], return_list="True" + )["job_states"] + json.dumps(job_states_list_rl_t) # make sure it's JSON serializable + self.assertEqual(job_states_list_rl_t[0]["job_id"], job_id) + self.assertEqual(job_states_list_rl_t[1]["job_id"], job_id_1) + self.assertTrue(isinstance(job_states_list_rl_t, list)) + self.assertCountEqual(job_states_list_rl_t, list(job_states_rl_0.values())[:2]) + + # test check_jobs with exclude_fields + job_states_rl0_exclude_wsid = runner.check_jobs( + [job_id], exclude_fields=["wsid"], return_list=0 + ) + self.assertTrue(job_id in job_states_rl0_exclude_wsid) + self.assertFalse("wsid" in job_states_rl0_exclude_wsid[job_id].keys()) + self.assertEqual(job_states_rl0_exclude_wsid[job_id]["status"], "created") - # test check_workspace_jobs with exclude_fields - job_states_with_exclude_wsid = ( - runner.get_jobs_status().check_workspace_jobs( - self.ws_id, exclude_fields=["wsid"], return_list=False + # test check_workspace_jobs + job_states_from_workspace_check = runner.get_jobs_status().check_workspace_jobs( + self.ws_id, return_list="False" + ) + for job_id_from_wsid in job_states_from_workspace_check: + 
self.assertTrue(job_states_from_workspace_check[job_id_from_wsid]) + print("Job States are") + for job_key in job_states_from_workspace_check: + if job_key in job_states_rl_1: + print( + job_key, + job_states_from_workspace_check[job_key]["status"], + runner.check_job(job_id=job_key)["status"], + job_states_rl_0[job], ) - ) - logging.info( - json.dumps(job_states_with_exclude_wsid) - ) # make sure it's JSON serializable - self.assertTrue(job_id in job_states_with_exclude_wsid) - self.assertFalse("wsid" in job_states_with_exclude_wsid[job_id].keys()) - self.assertEqual(job_states_with_exclude_wsid[job_id]["status"], "created") - self.assertTrue(job_id_1 in job_states_with_exclude_wsid) - self.assertFalse("wsid" in job_states_with_exclude_wsid[job_id_1].keys()) - self.assertEqual( - job_states_with_exclude_wsid[job_id_1]["status"], "created" - ) + json.dumps(job_states_from_workspace_check) # make sure it's JSON serializable + self.assertTrue(job_id in job_states_from_workspace_check) + self.assertEqual(job_states_from_workspace_check[job_id]["status"], "created") + self.assertEqual(job_states_from_workspace_check[job_id]["wsid"], self.ws_id) - with self.assertRaises(PermissionError) as e: - runner.get_jobs_status().check_workspace_jobs(1234) - self.assertIn( - f"User {self.user_id} does not have permission to read jobs in workspace {1234}", - str(e.exception), - ) + self.assertTrue(job_id_1 in job_states_from_workspace_check) + self.assertEqual(job_states_from_workspace_check[job_id_1]["status"], "created") + self.assertEqual(job_states_from_workspace_check[job_id_1]["wsid"], self.ws_id) + + # test check_workspace_jobs with exclude_fields + job_states_with_exclude_wsid = runner.get_jobs_status().check_workspace_jobs( + self.ws_id, exclude_fields=["wsid"], return_list=False + ) + + logging.info( + json.dumps(job_states_with_exclude_wsid) + ) # make sure it's JSON serializable + self.assertTrue(job_id in job_states_with_exclude_wsid) + self.assertFalse("wsid" in job_states_with_exclude_wsid[job_id].keys()) + self.assertEqual(job_states_with_exclude_wsid[job_id]["status"], "created") + self.assertTrue(job_id_1 in job_states_with_exclude_wsid) + self.assertFalse("wsid" in job_states_with_exclude_wsid[job_id_1].keys()) + self.assertEqual(job_states_with_exclude_wsid[job_id_1]["status"], "created") + + with self.assertRaises(PermissionError) as e: + runner.get_jobs_status().check_workspace_jobs(1234) + self.assertIn( + f"User {self.user_id} does not have permission to read jobs in workspace {1234}", + str(e.exception), + ) @staticmethod def create_job_from_job(job, new_job_id): @@ -848,11 +919,15 @@ def create_job_from_job(job, new_job_id): return j def replace_job_id(self, job1, new_id): - with self.mongo_util.mongo_engine_connection(): - job2 = self.create_job_from_job(job1, new_id) - job2.save() - print("Saved job with id", job2.id, job2.id.generation_time) - job1.delete() + job2 = self.create_job_from_job(job1, new_id) + job2.save() + print( + "Saved job with id", + job2.id, + job2.id.generation_time, + job2.id.generation_time.timestamp(), + ) + job1.delete() # flake8: noqa: C901 @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -861,33 +936,38 @@ def test_check_jobs_date_range(self, condor_mock): runner = self.getRunner() + # TODO redo this test with dependency injection & autospec vs. 
monkey patching + resolver = create_autospec( + JobRequirementsResolver, spec_set=True, instance=True + ) runner.workspace_auth = MagicMock() + runner.get_job_requirements_resolver = MagicMock(return_value=resolver) + resolver.get_requirements_type.return_value = RequirementsType.STANDARD + resolver.resolve_requirements.return_value = JobRequirements( + cpus=1, + memory_MB=100, + disk_GB=1, + client_group="njs", + ) runner.auth.get_user = MagicMock(return_value=user_name) - runner.is_admin = True runner.check_is_admin = MagicMock(return_value=True) runner.workspace_auth.can_read = MagicMock(return_value=True) self.mock = MagicMock(return_value=True) - runner._ee2_runjob._get_module_git_commit = MagicMock( - return_value="hash_goes_here" - ) # fixed_rj = RunJob(runner) # fixed_rj._get_module_git_commit = MagicMock(return_value='hash_goes_here') - runner._get_module_git_commit = MagicMock(return_value="git_commit_goes_here") - runner.get_condor = MagicMock(return_value=condor_mock) # ctx = {"user_id": self.user_id, "wsid": self.ws_id, "token": self.token} job = get_example_job().to_mongo().to_dict() - job["method"] = job["job_input"]["app_id"] + job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) job_id1 = runner.run_job(params=job) job_id2 = runner.run_job(params=job) @@ -899,7 +979,7 @@ def test_check_jobs_date_range(self, condor_mock): new_job_ids = [] - now = datetime.utcnow() + now = datetime.now(tz=timezone.utc) last_month = now - timedelta(days=30) last_month_and_1_hour = now - timedelta(days=30) - timedelta(hours=1) @@ -908,347 +988,345 @@ def test_check_jobs_date_range(self, condor_mock): tomorrow = now + timedelta(days=1) day_after = now + timedelta(days=2) - with self.mongo_util.mongo_engine_connection(): - # Last Month - job = Job.objects.with_id(job_id1) # type : Job - new_id_last_month = ObjectId.from_datetime(last_month) - print(last_month, new_id_last_month, new_id_last_month.generation_time) - - print("About to replace job id") - print(job) - print(new_id_last_month) - self.replace_job_id(job, new_id_last_month) - new_job_ids.append(str(new_id_last_month)) - - # Last week - job = Job.objects.with_id(job_id2) # type : Job - new_id_last_week = ObjectId.from_datetime(last_week) - self.replace_job_id(job, new_id_last_week) - new_job_ids.append(str(new_id_last_week)) - - # Yesterday - job = Job.objects.with_id(job_id3) # type : Job - new_id_yesterday = ObjectId.from_datetime(yesterday) - self.replace_job_id(job, new_id_yesterday) - new_job_ids.append(str(new_id_yesterday)) - - # Now - job = Job.objects.with_id(job_id4) # type : Job - new_id_now = ObjectId.from_datetime(now) - self.replace_job_id(job, new_id_now) - new_job_ids.append(str(new_id_now)) - - # Tomorrow - job = Job.objects.with_id(job_id5) # type : Job - new_id_tomorrow = ObjectId.from_datetime(tomorrow) - self.replace_job_id(job, new_id_tomorrow) - new_job_ids.append(str(new_id_tomorrow)) - - # Day After - job = Job.objects.with_id(job_id6) # type : Job - new_id_day_after = ObjectId.from_datetime(day_after) - self.replace_job_id(job, new_id_day_after) - new_job_ids.append(str(new_id_day_after)) + print( + f"Last month - 1 hour: {last_month_and_1_hour} " + + f"ts: {last_month_and_1_hour.timestamp()}" + ) + print(f"Last month: {last_month} ts: 
{last_month.timestamp()}") + print(f"Last Week: {last_week} ts: {last_week.timestamp()}") + print(f"Yesterday: {yesterday} ts: {yesterday.timestamp()}") + print(f"Now: {now} ts: {now.timestamp()}") + print(f"Tomorrow: {tomorrow} ts: {tomorrow.timestamp()}") + print(f"Day after: {day_after} ts: {day_after.timestamp()}") + + # Last Month + job = Job.objects.with_id(job_id1) # type : Job + new_id_last_month = ObjectId.from_datetime(last_month) + print(last_month, new_id_last_month, new_id_last_month.generation_time) + + print("About to replace job id") + print(job) + print(new_id_last_month) + self.replace_job_id(job, new_id_last_month) + new_job_ids.append(str(new_id_last_month)) + + # Last week + job = Job.objects.with_id(job_id2) # type : Job + new_id_last_week = ObjectId.from_datetime(last_week) + self.replace_job_id(job, new_id_last_week) + new_job_ids.append(str(new_id_last_week)) + + # Yesterday + job = Job.objects.with_id(job_id3) # type : Job + new_id_yesterday = ObjectId.from_datetime(yesterday) + self.replace_job_id(job, new_id_yesterday) + new_job_ids.append(str(new_id_yesterday)) + + # Now + job = Job.objects.with_id(job_id4) # type : Job + new_id_now = ObjectId.from_datetime(now) + self.replace_job_id(job, new_id_now) + new_job_ids.append(str(new_id_now)) + + # Tomorrow + job = Job.objects.with_id(job_id5) # type : Job + new_id_tomorrow = ObjectId.from_datetime(tomorrow) + self.replace_job_id(job, new_id_tomorrow) + new_job_ids.append(str(new_id_tomorrow)) + + # Day After + job = Job.objects.with_id(job_id6) # type : Job + new_id_day_after = ObjectId.from_datetime(day_after) + self.replace_job_id(job, new_id_day_after) + new_job_ids.append(str(new_id_day_after)) # JOB ID GETS GENERATED HERE - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) - - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - self.false = self.assertFalse(job.running) - self.assertFalse(job.estimating) - - runner.check_permission_for_job = MagicMock(return_value=True) - # runner.get_permissions_for_workspace = MagicMock( - # return_value=SDKMethodRunner.WorkspacePermissions.ADMINISTRATOR - # ) - runner.is_admin = MagicMock(return_value=True) - - print( - "Test case 1. Retrieving Jobs from last_week and tomorrow_max (yesterday and now jobs) " - ) - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=last_week.timestamp(), # test timestamp input - user="ALL", - ) - count = 0 - for js in job_state["jobs"]: - job_id = js["job_id"] - print("Job is id", job_id) - if job_id in new_job_ids: - count += 1 - self.assertIn(js["status"], ["created", "queued"]) - print(js["created"]) - print(type(js["created"])) - date = SDKMethodRunner.check_and_convert_time(js["created"]) - ts = date - print( - f"Creation date {date}, LastWeek:{last_week}, Tomorrow{tomorrow})" - ) - print(ts, last_week.timestamp()) - self.assertTrue(float(ts) >= last_week.timestamp()) - print(ts, tomorrow.timestamp()) - self.assertTrue(float(ts) <= tomorrow.timestamp()) - self.assertEqual(2, count) - - print( - "Test case 2A. 
Retrieving Jobs from last_month and tomorrow_max (last_month, last_week, yesterday and now jobs) " - ) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str( - tomorrow.timestamp() - ), # test timestamp string input - creation_start_time=last_month_and_1_hour, # test datetime input - user="ALL", - ) - - count = 0 - for js in job_state["jobs"]: - job_id = js["job_id"] - print("Job is id", job_id) - if job_id in new_job_ids: - count += 1 - self.assertIn(js["status"], ["created", "queued"]) - date = SDKMethodRunner.check_and_convert_time(js["created"]) - ts = date - print(date, last_week, tomorrow) - print(ts, last_week.timestamp(), tomorrow.timestamp()) - self.assertTrue(ts > last_month_and_1_hour.timestamp()) - self.assertTrue(ts < tomorrow.timestamp()) - self.assertEqual(4, count) - - print("Found all of the jobs", len(new_job_ids)) - - with self.assertRaises(Exception) as context: - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(yesterday), - creation_start_time=str(tomorrow), - user="ALL", - ) - self.assertEqual( - "The start date cannot be greater than the end date.", - str(context.exception), + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + self.false = self.assertFalse(job.running) + self.assertFalse(job.estimating) + + runner.check_permission_for_job = MagicMock(return_value=True) + # runner.get_permissions_for_workspace = MagicMock( + # return_value=SDKMethodRunner.WorkspacePermissions.ADMINISTRATOR + # ) + + print( + "Test case 1. Retrieving Jobs from last_week and tomorrow_max (yesterday and now jobs) " + ) + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=last_week.timestamp(), # test timestamp input + user="ALL", + ) + count = 0 + for js in job_state["jobs"]: + job_id = js["job_id"] + print("Job is id", job_id) + if job_id in new_job_ids: + count += 1 + self.assertIn(js["status"], ["created", "queued"]) + print(js["created"]) + print(type(js["created"])) + date = SDKMethodRunner.check_and_convert_time(js["created"]) + ts = date + print( + f"Creation date {date}, LastWeek:{last_week}, Tomorrow{tomorrow})" ) + print(ts, last_week.timestamp()) + self.assertTrue(float(ts) >= last_week.timestamp()) + print(ts, tomorrow.timestamp()) + self.assertTrue(float(ts) <= tomorrow.timestamp()) + self.assertEqual(2, count) + + print( + "Test case 2A. Retrieving Jobs from last_month and tomorrow_max (last_month, last_week, yesterday and now jobs) " + ) - print("Test case 2B. Same as above but with FAKE user (NO ADMIN) ") - runner.is_admin = False - runner.check_is_admin = MagicMock(return_value=False) - with self.assertRaisesRegex( - AuthError, - "You are not authorized to view all records or records for others.", - ) as error: - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="FAKE", - ) - print("Exception raised is", error) + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow.timestamp()), # test timestamp string input + creation_start_time=last_month_and_1_hour, # test datetime input + user="ALL", + ) - print("Test case 2C. 
Same as above but with FAKE_TEST_USER + ADMIN) ") - runner.is_admin = True - runner.check_is_admin = MagicMock(return_value=True) + count = 0 + for js in job_state["jobs"]: + job_id = js["job_id"] + print("Job is id", job_id) + if job_id in new_job_ids: + count += 1 + self.assertIn(js["status"], ["created", "queued"]) + ts = SDKMethodRunner.check_and_convert_time(js["created"]) + print(f"Timestamp: {ts}") + self.assertTrue(ts > last_month_and_1_hour.timestamp()) + self.assertTrue(ts < tomorrow.timestamp()) + self.assertEqual(4, count) + + print("Found all of the jobs", len(new_job_ids)) + + with self.assertRaises(Exception) as context: job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user=user_name, + creation_end_time=str(yesterday), + creation_start_time=str(tomorrow), + user="ALL", + ) + self.assertEqual( + "The start date cannot be greater than the end date.", + str(context.exception), ) - count = 0 - for js in job_state["jobs"]: - job_id = js["job_id"] - print("Job is id", job_id) - if job_id in new_job_ids: - count += 1 - self.assertIn(js["status"], ["created", "queued"]) - date = SDKMethodRunner.check_and_convert_time(js["created"]) - ts = date - print(date, last_week, tomorrow) - print(ts, last_week.timestamp(), tomorrow.timestamp()) - self.assertTrue(ts > last_month_and_1_hour.timestamp()) - self.assertTrue(ts < tomorrow.timestamp()) - - # May need to change this if other db entries get added - self.assertEqual(4, count) - - print("Found all of the jobs", len(new_job_ids)) - - print("Test case 3. Assert Raises error") - - with self.assertRaises(Exception) as context: - job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(yesterday), - creation_start_time=str(tomorrow), - user="ALL", - ) - self.assertEqual( - "The start date cannot be greater than the end date.", - str(context.exception), - ) - - print("Test case 4, find the original job") + print("Test case 2B. Same as above but with FAKE user (NO ADMIN) ") + runner.check_is_admin = MagicMock(return_value=False) + with self.assertRaisesRegex( + AuthError, + "You are not authorized to view all records or records for others.", + ) as error: job_state = runner.check_jobs_date_range_for_user( creation_end_time=str(tomorrow), creation_start_time=str(last_month_and_1_hour), - user=user_name, + user="FAKE", ) - self.assertTrue(len(job_state["jobs"][0].keys()) > 0) - print(f"Checking {job_id}") + print("Exception raised is", error) - found = False - for job in job_state["jobs"]: - if job_id == job["job_id"]: - found = True + print("Test case 2C. 
Same as above but with FAKE_TEST_USER + ADMIN) ") + runner.check_is_admin = MagicMock(return_value=True) + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user=user_name, + ) - if found is False: - raise Exception("Didn't find the original job") + count = 0 + for js in job_state["jobs"]: + job_id = js["job_id"] + print("Job is id", job_id) + if job_id in new_job_ids: + count += 1 + self.assertIn(js["status"], ["created", "queued"]) + ts = SDKMethodRunner.check_and_convert_time(js["created"]) + print(f"Timestamp: {ts}") + self.assertTrue(ts > last_month_and_1_hour.timestamp()) + self.assertTrue(ts < tomorrow.timestamp()) - print(job_state) + # May need to change this if other db entries get added + self.assertEqual(4, count) - print("Test 5, find the original job, but with projections") - job_states = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user=user_name, - job_projection=["wsid"], - ) - job_state_with_proj = None - for job in job_states["jobs"]: - if job_id == job["job_id"]: - job_state_with_proj = job - - example_job_stat = { - "_id": "5d892ede9ea3d7d3b824dbff", - "authstrat": "kbaseworkspace", - "wsid": 9999, - "updated": "2019-09-23 20:45:19.468032", - "job_id": "5d892ede9ea3d7d3b824dbff", - "created": "2019-09-23 20:45:18+00:00", - } - - required_headers = list(example_job_stat.keys()) - required_headers.append("wsid") - - for member in required_headers: - self.assertIn(member, job_state_with_proj) - self.assertNotIn("status", job_state_with_proj) - - print("Test 6a, find the original job, but with projections and filters") + print("Found all of the jobs", len(new_job_ids)) + + print("Test case 3. 
Assert Raises error") + + with self.assertRaises(Exception) as context: job_state = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), + creation_end_time=str(yesterday), + creation_start_time=str(tomorrow), user="ALL", - job_projection=["wsid", "status"], - job_filter={"wsid": 9999}, + ) + self.assertEqual( + "The start date cannot be greater than the end date.", + str(context.exception), ) - for record in job_state["jobs"]: + print("Test case 4, find the original job") + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user=user_name, + ) + self.assertTrue(len(job_state["jobs"][0].keys()) > 0) + print(f"Checking {job_id}") - print(record) - if record["wsid"] != 9999: - raise Exception("Only records with wsid 9999 should be allowed") - self.assertIn("wsid", record) - self.assertIn("status", record) - self.assertNotIn("service_ver", record) - print("job state is", "len is", len(job_state["jobs"])) + found = False + for job in job_state["jobs"]: + if job_id == job["job_id"]: + found = True - self.assertTrue(len(job_state["jobs"]) >= 1) + if found is False: + raise Exception("Didn't find the original job") - print("Test 6b, find the original job, but with projections and filters") - job_state2 = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="ALL", - job_projection=["wsid", "status"], - job_filter=["wsid=123"], - ) + print(job_state) - for record in job_state2["jobs"]: + print("Test 5, find the original job, but with projections") + job_states = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user=user_name, + job_projection=["wsid"], + ) + job_state_with_proj = None + for job in job_states["jobs"]: + if job_id == job["job_id"]: + job_state_with_proj = job + + example_job_stat = { + "_id": "5d892ede9ea3d7d3b824dbff", + "authstrat": "kbaseworkspace", + "wsid": 9999, + "updated": "2019-09-23 20:45:19.468032", + "job_id": "5d892ede9ea3d7d3b824dbff", + "created": "2019-09-23 20:45:18+00:00", + } - if record["wsid"] != 123: - print(record) - print("ID IS", record["wsid"]) - raise Exception("Only records with wsid 123 should be allowed") - self.assertIn("wsid", record) - self.assertIn("status", record) - self.assertNotIn("service_ver", record) + required_headers = list(example_job_stat.keys()) + required_headers.append("wsid") - print(len(job_state2["jobs"])) - self.assertTrue(len(job_state2["jobs"]) > 0) + for member in required_headers: + self.assertIn(member, job_state_with_proj) + self.assertNotIn("status", job_state_with_proj) - print( - "Test 7, find same jobs as test 2 or 3, but also filter, project, and limit" - ) - job_state_limit = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="ALL", - job_projection=["wsid", "status"], - job_filter=["wsid=123"], - limit=2, - ) + print("Test 6a, find the original job, but with projections and filters") + job_state = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + job_filter={"wsid": 9999}, + ) - self.assertTrue(len(job_state_limit["jobs"]) > 0) + for record in job_state["jobs"]: + + print(record) + if record["wsid"] != 9999: + 
raise Exception("Only records with wsid 9999 should be allowed") + self.assertIn("wsid", record) + self.assertIn("status", record) + self.assertNotIn("service_ver", record) + print("job state is", "len is", len(job_state["jobs"])) + + self.assertTrue(len(job_state["jobs"]) >= 1) + + print("Test 6b, find the original job, but with projections and filters") + job_state2 = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + job_filter=["wsid=123"], + ) - print( - "Test 8, ascending and descending (maybe should verify jobs count > 2)" - ) - job_state_limit_asc = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="ALL", - job_projection=["wsid", "status"], - ascending="True", - ) + for record in job_state2["jobs"]: - epoch = datetime.utcfromtimestamp(0) + if record["wsid"] != 123: + print(record) + print("ID IS", record["wsid"]) + raise Exception("Only records with wsid 123 should be allowed") + self.assertIn("wsid", record) + self.assertIn("status", record) + self.assertNotIn("service_ver", record) - job_id_temp = str(ObjectId.from_datetime(epoch)) - for item in job_state_limit_asc["jobs"]: - job_id = item["job_id"] - if ObjectId(job_id) > ObjectId(job_id_temp): - job_id_temp = job_id - else: - raise Exception( - "Not ascending" - + "JobIdPrev" - + str(job_id_temp) - + "JobIdNext" - + str(job_id) - ) + print(len(job_state2["jobs"])) + self.assertTrue(len(job_state2["jobs"]) > 0) - job_state_limit_desc = runner.check_jobs_date_range_for_user( - creation_end_time=str(tomorrow), - creation_start_time=str(last_month_and_1_hour), - user="ALL", - job_projection=["wsid", "status"], - ascending="False", - ) + print( + "Test 7, find same jobs as test 2 or 3, but also filter, project, and limit" + ) + job_state_limit = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + job_filter=["wsid=123"], + limit=2, + ) - # TimeDelta Over 9999 days - job_id_temp = str(ObjectId.from_datetime(now + timedelta(days=9999))) + self.assertTrue(len(job_state_limit["jobs"]) > 0) - for item in job_state_limit_desc["jobs"]: - job_id = item["job_id"] - if ObjectId(job_id) < ObjectId(job_id_temp): - job_id_temp = job_id - else: - raise Exception( - "Not Descending" - + "JobIdPrev:" - + str(job_id_temp) - + "JobIdNext:" - + str(job_id) - ) + print("Test 8, ascending and descending (maybe should verify jobs count > 2)") + job_state_limit_asc = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + ascending="True", + ) + + epoch = datetime.utcfromtimestamp(0) + + job_id_temp = str(ObjectId.from_datetime(epoch)) + for item in job_state_limit_asc["jobs"]: + job_id = item["job_id"] + if ObjectId(job_id) > ObjectId(job_id_temp): + job_id_temp = job_id + else: + raise Exception( + "Not ascending" + + "JobIdPrev" + + str(job_id_temp) + + "JobIdNext" + + str(job_id) + ) + + job_state_limit_desc = runner.check_jobs_date_range_for_user( + creation_end_time=str(tomorrow), + creation_start_time=str(last_month_and_1_hour), + user="ALL", + job_projection=["wsid", "status"], + ascending="False", + ) + + # TimeDelta Over 9999 days + job_id_temp = str(ObjectId.from_datetime(now + 
timedelta(days=9999))) + + for item in job_state_limit_desc["jobs"]: + job_id = item["job_id"] + if ObjectId(job_id) < ObjectId(job_id_temp): + job_id_temp = job_id + else: + raise Exception( + "Not Descending" + + "JobIdPrev:" + + str(job_id_temp) + + "JobIdNext:" + + str(job_id) + ) - for key in job_state_limit_desc.keys(): - print(key) - print(job_state_limit_desc[key]) + for key in job_state_limit_desc.keys(): + print(key) + print(job_state_limit_desc[key]) # TODO TEST _finish_job_with_success, TEST finish_job_with_error diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py index 039edfdfa..af441a81d 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Runjob_test.py @@ -9,10 +9,21 @@ import requests_mock from mock import MagicMock +from execution_engine2.exceptions import ( + CannotRetryJob, + RetryFailureException, + InvalidParameterForBatch, +) +from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.utils.clients import ( + get_client_set, + get_user_client_set, +) +from installed_clients.CatalogClient import Catalog from lib.execution_engine2.db.MongoUtil import MongoUtil -from lib.execution_engine2.db.models.models import Job +from lib.execution_engine2.db.models.models import Job, Status from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from lib.execution_engine2.utils.CondorTuples import SubmissionInfo from test.utils_shared.test_utils import ( bootstrap, get_example_job, @@ -49,9 +60,11 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token - ) + with open(config_file) as cf: + cls.method_runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cf), + ) cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -60,165 +73,170 @@ def setUpClass(cls): db=cls.cfg["mongo-database"], col=cls.cfg["mongo-jobs-collection"] ) - cls.cr = CondorResources( - request_cpus="1", - request_disk="1GB", - request_memory="100M", - client_group="njs", - ) - cls.sdkmr_test_helper = ee2_sdkmr_test_helper(mr=cls.method_runner) + cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: + """ # Initialize these clients from None + # Set up some mocks + """ runner = copy.copy(self.__class__.method_runner) # type : SDKMethodRunner runner.get_jobs_status() runner.get_runjob() runner.get_job_logs() + runner.get_workspace() + runner.workspace.get_object_info3 = MagicMock(return_value={"paths": []}) return runner def create_job_rec(self): return self.sdkmr_test_helper.create_job_rec() def test_init_ok(self): - class_attri = ["config", "catalog_utils", "workspace", "mongo_util", "condor"] + class_attri = ["workspace", "mongo_util", "condor"] runner = self.getRunner() self.assertTrue(set(class_attri) <= set(runner.__dict__.keys())) - def test_init_job_rec(self): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.getRunner() - - job_params = { - "wsid": self.ws_id, - "method": "MEGAHIT.run_megahit", - "app_id": "MEGAHIT/run_megahit", - "service_ver": "2.2.1", - "params": [ - { - "workspace_name": "wjriehl:1475006266615", - 
"read_library_refs": ["18836/5/1"], - "output_contigset_name": "rhodo_contigs", - "recipe": "auto", - "assembler": None, - "pipeline": None, - "min_contig_len": None, - } - ], - "source_ws_objects": ["a/b/c", "e/d"], - "parent_job_id": "9998", - "meta": {"tag": "dev", "token_id": "12345"}, - } + @patch.object(Catalog, "get_module_version") + def test_init_job_rec(self, get_mod_ver): + ori_job_count = Job.objects.count() + runner = self.getRunner() - job_id = runner.get_runjob()._init_job_rec(self.user_id, job_params) + job_params = { + "wsid": self.ws_id, + "method": "MEGAHIT.run_megahit", + "app_id": "MEGAHIT/run_megahit", + "service_ver": "2.2.1", + "params": [ + { + "workspace_name": "wjriehl:1475006266615", + "read_library_refs": ["18836/5/1"], + "output_contigset_name": "rhodo_contigs", + "recipe": "auto", + "assembler": None, + "pipeline": None, + "min_contig_len": None, + } + ], + "job_reqs": JobRequirements(1, 1, 1, "njs"), + "source_ws_objects": ["a/b/c", "e/d"], + "parent_job_id": "9998", + "meta": {"tag": "dev", "token_id": "12345"}, + } + + get_mod_ver.return_value = { + "git_commit_hash": "048baf3c2b76cb923b3b4c52008ed77dbe20292d" + } + + job_id = runner.get_runjob()._init_job_rec(self.user_id, job_params) + + get_mod_ver.assert_called_once_with( + {"module_name": "MEGAHIT", "version": "2.2.1"} + ) - self.assertEqual(ori_job_count, Job.objects.count() - 1) + self.assertEqual(ori_job_count, Job.objects.count() - 1) - job = Job.objects.get(id=job_id) + job = Job.objects.get(id=job_id) - self.assertEqual(job.user, self.user_id) - self.assertEqual(job.authstrat, "kbaseworkspace") - self.assertEqual(job.wsid, self.ws_id) + self.assertEqual(job.user, self.user_id) + self.assertEqual(job.authstrat, "kbaseworkspace") + self.assertEqual(job.wsid, self.ws_id) - job_input = job.job_input + job_input = job.job_input - self.assertEqual(job_input.wsid, self.ws_id) - self.assertEqual(job_input.method, "MEGAHIT.run_megahit") - self.assertEqual(job_input.app_id, "MEGAHIT/run_megahit") - # TODO this is an integration test - # self.assertEqual(job_input.service_ver, "2.2.1") - self.assertEqual( - job_input.service_ver, "048baf3c2b76cb923b3b4c52008ed77dbe20292d" - ) + self.assertEqual(job_input.wsid, self.ws_id) + self.assertEqual(job_input.method, "MEGAHIT.run_megahit") + self.assertEqual(job_input.app_id, "MEGAHIT/run_megahit") + # TODO this is an integration test + # self.assertEqual(job_input.service_ver, "2.2.1") + self.assertEqual( + job_input.service_ver, "048baf3c2b76cb923b3b4c52008ed77dbe20292d" + ) - self.assertCountEqual(job_input.source_ws_objects, ["a/b/c", "e/d"]) - self.assertEqual(job_input.parent_job_id, "9998") + self.assertCountEqual(job_input.source_ws_objects, ["a/b/c", "e/d"]) + self.assertEqual(job_input.parent_job_id, "9998") - narrative_cell_info = job_input.narrative_cell_info - self.assertEqual(narrative_cell_info.tag, "dev") - self.assertEqual(narrative_cell_info.token_id, "12345") - self.assertFalse(narrative_cell_info.status) + narrative_cell_info = job_input.narrative_cell_info + self.assertEqual(narrative_cell_info.tag, "dev") + self.assertEqual(narrative_cell_info.token_id, "12345") - self.assertFalse(job.job_output) + self.assertFalse(job.job_output) - self.mongo_util.get_job(job_id=job_id).delete() - self.assertEqual(ori_job_count, Job.objects.count()) + self.mongo_util.get_job(job_id=job_id).delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_get_job_params(self): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = 
Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) - - runner = self.getRunner() - runner._test_job_permissions = MagicMock(return_value=True) - params = runner.get_job_params(job_id) - - expected_params_keys = [ - "wsid", - "method", - "params", - "service_ver", - "app_id", - "source_ws_objects", - "parent_job_id", - ] - self.assertCountEqual(params.keys(), expected_params_keys) - self.assertEqual(params["wsid"], self.ws_id) - self.assertEqual(params["method"], "MEGAHIT.run_megahit") - self.assertEqual(params["app_id"], "MEGAHIT/run_megahit") - self.assertEqual(params["service_ver"], "2.2.1") - self.assertCountEqual(params["source_ws_objects"], ["a/b/c", "e/d"]) - self.assertEqual(params["parent_job_id"], "9998") + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - self.mongo_util.get_job(job_id=job_id).delete() - self.assertEqual(ori_job_count, Job.objects.count()) + runner = self.getRunner() + runner._test_job_permissions = MagicMock(return_value=True) + params = runner.get_job_params(job_id) + + expected_params_keys = [ + "wsid", + "method", + "params", + "service_ver", + "app_id", + "source_ws_objects", + "parent_job_id", + ] + self.assertCountEqual(params.keys(), expected_params_keys) + self.assertEqual(params["wsid"], self.ws_id) + self.assertEqual(params["method"], "MEGAHIT.run_megahit") + self.assertEqual(params["app_id"], "MEGAHIT/run_megahit") + self.assertEqual(params["service_ver"], "2.2.1") + self.assertCountEqual(params["source_ws_objects"], ["a/b/c", "e/d"]) + self.assertEqual(params["parent_job_id"], "9998") + + self.mongo_util.get_job(job_id=job_id).delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_start_job(self): - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - job_id = self.create_job_rec() - self.assertEqual(ori_job_count, Job.objects.count() - 1) + ori_job_count = Job.objects.count() + job_id = self.create_job_rec() + self.assertEqual(ori_job_count, Job.objects.count() - 1) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "created") - self.assertFalse(job.finished) - self.assertFalse(job.running) - self.assertFalse(job.estimating) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "created") + self.assertFalse(job.finished) + self.assertFalse(job.running) + self.assertFalse(job.estimating) - runner = self.getRunner() - runner._test_job_permissions = MagicMock(return_value=True) + runner = self.getRunner() + runner._test_job_permissions = MagicMock(return_value=True) - # test missing job_id input - with self.assertRaises(ValueError) as context: - runner.start_job(None) - self.assertEqual("Please provide valid job_id", str(context.exception)) + # test missing job_id input + with self.assertRaises(ValueError) as context: + runner.start_job(None) + self.assertEqual("Please provide valid job_id", str(context.exception)) - # start a created job, set job to estimation status - runner.start_job(job_id, skip_estimation=False) + # start a created job, set job to estimation status + runner.start_job(job_id, skip_estimation=False) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "estimating") - self.assertFalse(job.running) - self.assertTrue(job.estimating) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "estimating") + self.assertFalse(job.running) + 
self.assertTrue(job.estimating) - # start a estimating job, set job to running status - runner.start_job(job_id, skip_estimation=False) + # start an estimating job, set job to running status + runner.start_job(job_id, skip_estimation=False) - job = self.mongo_util.get_job(job_id=job_id) - self.assertEqual(job.status, "running") - self.assertTrue(job.running) - self.assertTrue(job.estimating) + job = self.mongo_util.get_job(job_id=job_id) + self.assertEqual(job.status, "running") + self.assertTrue(job.running) + self.assertTrue(job.estimating) - # test start a job with invalid status - with self.assertRaises(ValueError) as context: - runner.start_job(job_id) - self.assertIn("Unexpected job status", str(context.exception)) + # test start a job with invalid status + with self.assertRaises(ValueError) as context: + runner.start_job(job_id) + self.assertIn("Unexpected job status", str(context.exception)) - self.mongo_util.get_job(job_id=job_id).delete() - self.assertEqual(ori_job_count, Job.objects.count()) + self.mongo_util.get_job(job_id=job_id).delete() + self.assertEqual(ori_job_count, Job.objects.count()) @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -232,12 +250,296 @@ def test_run_job(self, rq_mock, condor_mock): runner.get_condor = MagicMock(return_value=condor_mock) job = get_example_job_as_dict(user=self.user_id, wsid=self.ws_id) + si = SubmissionInfo(clusterid="test", submit=job, error=None) + + # OK + condor_mock.run_job = MagicMock(return_value=si) + runner.run_job(params=job) + + # Condor Failure Case Coverage + condor_mock.run_job = MagicMock(return_value=si, side_effect=Exception("fail")) + runner.get_runjob()._finish_created_job = MagicMock(return_value=None) + + with self.assertRaises(expected_exception=Exception): + runner.run_job(params=job) + + # Condor Failure Case Coverage #2 + with self.assertRaisesRegex( + expected_exception=RuntimeError, + expected_regex="Condor job not run, and error not found. Something went wrong", + ): + si = SubmissionInfo(clusterid=None, submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + runner.run_job(params=job) + + @staticmethod + def check_retry_job_state(job_id: str, retry_job_id: str): + """ + Checks that the required keys are present + :param job_id: The job that was retried + :param retry_job_id: The job id that was a result of the retry + + """ + job = Job.objects.get(id=job_id) # type: Job + retry_job = Job.objects.get(id=retry_job_id) # type: Job + + check_attributes = [ + "job_input", + "wsid", + "authstrat", + "batch_job", + "batch_id", + "scheduler_type", + ] + + for item in check_attributes: + if job[item]: + assert job[item] == retry_job[item] + + assert retry_job.retry_parent == job_id + assert len(job.retry_ids) > 0 + assert retry_job_id in job.retry_ids + assert not job.retry_saved_toggle and retry_job.retry_saved_toggle + + @requests_mock.Mocker() + @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) + def test_retry_job_multiple(self, rq_mock, condor_mock): + # 1. 
Run the job + rq_mock.add_matcher( + run_job_adapter( + ws_perms_info={"user_id": self.user_id, "ws_perms": {self.ws_id: "a"}} + ) + ) + runner = self.getRunner() + runner.get_condor = MagicMock(return_value=condor_mock) + + job = get_example_job_as_dict( + user=self.user_id, wsid=self.ws_id, source_ws_objects=[] + ) + si = SubmissionInfo(clusterid="test", submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + + parent_job_id0 = runner.run_job(params=job) + parent_job_id1 = runner.run_job(params=job) + parent_job_id2 = runner.run_job(params=job) + parent_job_id3 = runner.run_job(params=job) + parent_job_id4 = runner.run_job(params=job) + + runner.update_job_status(job_id=parent_job_id0, status=Status.terminated.value) + runner.update_job_status(job_id=parent_job_id1, status=Status.terminated.value) + runner.update_job_status(job_id=parent_job_id2, status=Status.error.value) + runner.update_job_status(job_id=parent_job_id3, status=Status.terminated.value) + runner.update_job_status(job_id=parent_job_id4, status=Status.error.value) + + # 2. Retry the jobs with a fake input + errmsg = ( + "'123' is not a valid ObjectId, it must be a 12-byte input or a 24-character " + "hex string" + ) + errmsg2 = ( + "'1234' is not a valid ObjectId, it must be a 12-byte input or a 24-character " + "hex string" + ) + retry_results = runner.retry_multiple(job_ids=[1234, 123, parent_job_id0]) + assert retry_results[0] == {"job_id": 1234, "error": errmsg2} + assert retry_results[1] == {"job_id": 123, "error": errmsg} + assert retry_results[2]["job_id"] == parent_job_id0 + + # 3. Retry the jobs with duplicate job ids + retry_candidates = ( + parent_job_id1, + parent_job_id2, + parent_job_id1, + parent_job_id2, + ) + fail_msg = f"Retry of the same id in the same request is not supported. Offending ids: {[parent_job_id1, parent_job_id2]} " + + with self.assertRaises(ValueError) as e: + runner.retry_multiple(retry_candidates) + assert str(e.exception) == str(ValueError(fail_msg)) + + # 4. Retry the jobs + retry_candidates = ( + parent_job_id1, + parent_job_id2, + parent_job_id3, + parent_job_id4, + ) + check_job = runner.check_job(parent_job_id1) + assert check_job["retry_ids"] == [] + assert check_job["retry_count"] == 0 + retry_job_ids = runner.retry_multiple(retry_candidates) + + assert len(retry_job_ids) == len(retry_candidates) + + # Lets retry the jobs a few times + js = runner.check_jobs( + job_ids=[ + retry_job_ids[0]["retry_id"], + retry_job_ids[1]["retry_id"], + retry_job_ids[2]["retry_id"], + retry_job_ids[3]["retry_id"], + ] + )["job_states"] + + job1, job2, job3, job4 = js + + self.check_retry_job_state(parent_job_id1, job1["job_id"]) + self.check_retry_job_state(parent_job_id2, job2["job_id"]) + self.check_retry_job_state(parent_job_id3, job3["job_id"]) + self.check_retry_job_state(parent_job_id4, job4["job_id"]) + + # Test no job ids + with self.assertRaisesRegex(ValueError, "No job_ids provided to retry"): + runner.retry_multiple(job_ids=None) + + # Test error during retry, but passing validate + runner._ee2_runjob._retry = MagicMock( + side_effect=Exception("Job Retry Misbehaved!") + ) + misbehaving_jobs = runner.retry_multiple(retry_candidates) + for i, candidate in enumerate(retry_candidates): + assert misbehaving_jobs[i] == { + "error": "Job Retry Misbehaved!", + "job_id": candidate, + } + + @requests_mock.Mocker() + @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) + def test_retry_job(self, rq_mock, condor_mock): + # 1. 
Run the job + rq_mock.add_matcher( + run_job_adapter( + ws_perms_info={"user_id": self.user_id, "ws_perms": {self.ws_id: "a"}} + ) + ) + runner = self.getRunner() + runner.get_condor = MagicMock(return_value=condor_mock) + + job = get_example_job_as_dict( + user=self.user_id, wsid=self.ws_id, source_ws_objects=[] + ) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) + parent_job_id = runner.run_job(params=job) + + # 2a. Retry the job and fail because it's in progress + expected_error = f"Error retrying job {parent_job_id} with status running: can only retry jobs with status 'error' or 'terminated'" + with self.assertRaisesRegex(CannotRetryJob, expected_regex=expected_error): + runner.update_job_status(job_id=parent_job_id, status=Status.running.value) + runner.retry(job_id=parent_job_id) + + # 2b. Retry the job + runner.update_job_status(job_id=parent_job_id, status=Status.terminated.value) + retry_job_id = runner.retry(job_id=parent_job_id)["retry_id"] + + # 3. Attempt to retry a retry, and check that the new job is retried off of the parent + runner.update_job_status(job_id=retry_job_id, status=Status.terminated.value) + retry_from_retry_id = runner.retry(job_id=retry_job_id)["retry_id"] + + retry_from_original_again = runner.retry(job_id=parent_job_id)["retry_id"] + original_job, retried_job, retried_job2, retried_job3 = runner.check_jobs( + job_ids=[ + parent_job_id, + retry_job_id, + retry_from_retry_id, + retry_from_original_again, + ] + )["job_states"] + + self.check_retry_job_state(parent_job_id, retry_job_id) + self.check_retry_job_state(parent_job_id, retry_from_retry_id) + self.check_retry_job_state(parent_job_id, retry_from_original_again) - job_id = runner.run_job(params=job) - print(f"Job id is {job_id} ") + for job in [original_job, retried_job, retried_job2, retried_job3]: + j = Job.objects.get(id=job["job_id"]) + if job == original_job: + assert original_job["retry_count"] == 3 + assert not j.retry_saved_toggle + else: + assert job["retry_parent"] == parent_job_id + assert j.retry_saved_toggle + + assert [ + retried_job["job_id"], + retried_job2["job_id"], + retried_job3["job_id"], + ] == original_job["retry_ids"] + + # 4. Get jobs and ensure they contain the same keys and params + same_keys = ["user", "authstrat", "wsid", "scheduler_type", "job_input"] + + assert "retry_parent" not in original_job + + for key in same_keys: + assert original_job[key] == retried_job[key] + + assert original_job["job_input"]["params"] == retried_job["job_input"]["params"] + + # Some failure cases + + # TODO Retry a job that uses run_job_batch or kbparallels (Like metabat) + # TODO Retry a job without an app_id + + @requests_mock.Mocker() + @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) + def test_retry_job_with_params_and_nci_and_src_ws_objs(self, rq_mock, condor_mock): + # 1. 
Run the job + rq_mock.add_matcher( + run_job_adapter( + ws_perms_info={"user_id": self.user_id, "ws_perms": {self.ws_id: "a"}} + ) + ) + runner = self.getRunner() + + runner.get_condor = MagicMock(return_value=condor_mock) + + quast_params = { + "workspace_name": "XX:narrative_1620418248793", + "assemblies": ["62160/9/18"], + "force_glimmer": 0, + } + source_ws_objects = quast_params["assemblies"] + nci = { + "run_id": "3a211c4e-5ba8-4b94-aeae-378079ccc63d", + "token_id": "f38f09f7-5ab1-4bfc-9f3f-2b82c7a8dbdc", + "tag": "release", + "cell_id": "3ee13d64-623b-407f-98a1-72e577662132", + } + + job = get_example_job_as_dict( + user=self.user_id, + wsid=self.ws_id, + narrative_cell_info=nci, + params=quast_params, + source_ws_objects=source_ws_objects, + method_name="kb_quast.run_QUAST_app", + app_id="kb_quast/run_QUAST_app", + ) + si = SubmissionInfo(clusterid="test", submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + parent_job_id = runner.run_job(params=job) + + # 2. Retry the job + runner.update_job_status(job_id=parent_job_id, status=Status.terminated.value) + retry_job_id = runner.retry(job_id=parent_job_id)["retry_id"] + + # 3. Get both jobs and compare them! + original_job, retried_job = runner.check_jobs( + job_ids=[parent_job_id, retry_job_id] + )["job_states"] + + same_keys = ["user", "authstrat", "wsid", "scheduler_type", "job_input"] + assert "retry_parent" not in original_job + assert original_job["retry_count"] == 1 + assert retried_job["retry_parent"] == parent_job_id + + for key in same_keys: + assert original_job[key] == retried_job[key] + + # TODO Possible test additions Retry a job that uses run_job_batch or kbparallels (Like metabat) + # TODO Retry a job without an app_id + # TODO Check narrative_cell_info @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -252,27 +554,90 @@ def test_run_job_batch(self, rq_mock, condor_mock): ) runner = self.getRunner() runner.get_condor = MagicMock(return_value=condor_mock) - job = get_example_job_as_dict(user=self.user_id, wsid=self.ws_id) + job = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] + ) + job2 = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] + ) + job3 = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] + ) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) + jobs = [job, job2, job3] + job_ids = runner.run_job_batch( + params=copy.deepcopy(jobs), batch_params={"wsid": self.ws_id} + ) - jobs = [job, job, job] - job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) + for job in runner.check_jobs( + job_ids=job_ids["child_job_ids"] + [job_ids["batch_id"]] + )["job_states"]: + assert job.get("wsid") == self.ws_id + # Job input is forced to assume the batch wsid + if job["job_id"] != job_ids["batch_id"]: + assert job.get("job_input", {}).get("wsid") == self.ws_id - assert "parent_job_id" in job_ids and isinstance(job_ids["parent_job_id"], str) + assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) assert "child_job_ids" in job_ids and isinstance(job_ids["child_job_ids"], list) assert len(job_ids["child_job_ids"]) == len(jobs) + with self.assertRaises(InvalidParameterForBatch): + job_good = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] + ) + job_bad = ( + 
get_example_job(user=self.user_id, wsid=self.ws_id).to_mongo().to_dict() + ) + jobs = [job_good, job_bad] + runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) + # Test that you can't run a job in someone else's workspace + no_perms_ws = 111970 with self.assertRaises(PermissionError): - job_bad = get_example_job(user=self.user_id, wsid=1234).to_mongo().to_dict() - job_bad["method"] = job["job_input"]["app_id"] - job_bad["app_id"] = job["job_input"]["app_id"] - job_bad["service_ver"] = job["job_input"]["service_ver"] - jobs = [job, job_bad] - runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) + job_good = get_example_job_as_dict( + user=self.user_id, wsid=None, source_ws_objects=[] + ) + job_bad = get_example_job(user=self.user_id, wsid=None).to_mongo().to_dict() + jobs = [job_good, job_bad] + runner.run_job_batch(params=jobs, batch_params={"wsid": no_perms_ws}) + + # Check wsids + batch_id = job_ids["batch_id"] + child_job_id = job_ids["child_job_ids"][0] + + # Squeeze in a retry test here + runner.update_job_status(job_id=child_job_id, status=Status.terminated.value) + batch_job = runner.check_job(job_id=batch_id) + assert len(batch_job["child_jobs"]) == 3 + + retry_result = runner.retry(job_id=child_job_id) + retry_id = retry_result["retry_id"] + self.check_retry_job_state(child_job_id, retry_id) + batch_job = runner.check_job(job_id=batch_id) + assert len(batch_job["child_jobs"]) == 4 + assert batch_job["child_jobs"][-1] == retry_id + + job = runner.check_job(job_id=child_job_id) + retry_count = job["retry_count"] + + # Test that if some inputs fail, the rest are still processed + results = runner.retry_multiple(job_ids=[child_job_id, "grail", "fail"]) + assert results[0]["job_id"] == child_job_id + assert "error" in results[1] + assert "error" in results[2] + + # Check to see that child_job_id was retried + assert retry_count + 1 == runner.check_job(job_id=child_job_id)["retry_count"] + + # Test for duplicates + with self.assertRaises(expected_exception=ValueError) as e: + runner.retry_multiple(job_ids=[1, 2, 2]) + assert ( + e.exception.args[0] + == "Retry of the same id in the same request is not supported. 
Offending ids: [2] " + ) @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -288,7 +653,6 @@ def test_run_job_fail(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) with self.assertRaises(expected_exception=RuntimeError): runner.run_job(params=job) diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py index c34ec4f0a..f356d8ce0 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_EE2Status_test.py @@ -12,7 +12,8 @@ from lib.execution_engine2.db.models.models import Job from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo, CondorResources +from lib.execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.tests_for_sdkmr.ee2_SDKMethodRunner_test_utils import ee2_sdkmr_test_helper from test.utils_shared.test_utils import bootstrap, get_example_job @@ -44,22 +45,18 @@ def setUpClass(cls): cls.ws_id = 9999 cls.token = "token" - cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token - ) - cls.cr = CondorResources( - request_cpus="1", - request_disk="1GB", - request_memory="100M", - client_group="njs", - ) + with open(config_file) as cf: + cls.method_runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cf), + ) cls.fake_used_resources = { "RemoteUserCpu": "1", "DiskUsage_RAW": "1", "DiskUsage": "1", } cls.mongo_util = cls.method_runner.get_mongo_util() - cls.sdkmr_test_helper = ee2_sdkmr_test_helper(mr=cls.method_runner) + cls.sdkmr_test_helper = ee2_sdkmr_test_helper(cls.user_id) def getRunner(self) -> SDKMethodRunner: # Initialize these clients from None @@ -72,11 +69,69 @@ def getRunner(self) -> SDKMethodRunner: def create_job_rec(self): return self.sdkmr_test_helper.create_job_rec() + @requests_mock.Mocker() + @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) + def test_check_job(self, rq_mock, condor_mock): + rq_mock.add_matcher( + run_job_adapter( + ws_perms_info={"user_id": self.user_id, "ws_perms": {self.ws_id: "a"}}, + user_roles=["EE2_ADMIN"], + ) + ) + runner = self.getRunner() + runner.get_condor = MagicMock(return_value=condor_mock) + job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=self.ws_id) + si = SubmissionInfo(clusterid="test", submit=job, error=None) + condor_mock.run_job = MagicMock(return_value=si) + condor_mock.get_job_resource_info = MagicMock( + return_value=self.fake_used_resources + ) + job_id = runner.run_job(params=job) + job_status = runner.check_job(job_id=job_id) + expected_status = { + "authstrat": "kbaseworkspace", + "batch_job": False, + "child_jobs": [], + "created": 1623781528000, + "job_id": "60c8f0989a70bc8ec0ac0ec7", + "job_input": { + "app_id": "module/super_function", + "method": "module.method", + "narrative_cell_info": {}, + "requirements": { + "clientgroup": "njs", + "cpu": 4, + "disk": 30, + "memory": 2000, + }, + "service_ver": "some_commit_hash", + "source_ws_objects": [], + "wsid": 9999, + }, + "batch_id": None, + "queued": 1623781529017, + "retry_count": 0, + "retry_ids": [], + "scheduler_id": 
"test", + "scheduler_type": "condor", + "status": "queued", + "updated": 1623781529017, + "user": "wsadmin", + "wsid": 9999, + } + + expected_different = ["job_id", "created", "queued", "updated"] + for key, val in expected_status.items(): + if key not in expected_different: + assert job_status[key] == val + else: + assert key in job_status + @requests_mock.Mocker() @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) def test_run_job_and_handle_held(self, rq_mock, condor_mock): """ - Run a job, then call it held as an admin, and then check to see if the record contains condor info about the job + Run a job, then call it held as an admin, and then check to see if the record is set to error or terminated :param rq_mock: :param condor_mock: :return: @@ -93,7 +148,6 @@ def test_run_job_and_handle_held(self, rq_mock, condor_mock): si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) condor_mock.get_job_resource_info = MagicMock( return_value=self.fake_used_resources ) @@ -103,10 +157,15 @@ def test_run_job_and_handle_held(self, rq_mock, condor_mock): print( f"Job id is {job_id}. Status is {check_job.get('status')} Cluster is {check_job.get('scheduler_id')} " ) - + self.assertEqual(check_job.get("status"), Status.queued.value) job_record = runner.handle_held_job(cluster_id=check_job.get("scheduler_id")) - print("Records are", job_record.get("condor_job_ads")) - self.assertEqual(self.fake_used_resources, job_record.get("condor_job_ads")) + # This flaky test changes depending on your test environment + self.assertIn( + job_record.get("status"), [Status.terminated.value, Status.error.value] + ) + # Condor ads are actually wrong and should only be updated after the job is completed, + # so we don't need to check them in this test right now. 
+ # See EE2 issue #251 def test_update_job_status(self): runner = self.getRunner() @@ -177,21 +236,22 @@ def test_cancel_job_batch(self, rq_mock, condor_mock): ) runner = self.getRunner() # type: SDKMethodRunner runner.get_condor = MagicMock(return_value=condor_mock) - job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=self.ws_id) + job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job2 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job3 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) - jobs = [job, job, job] + jobs = [job, job2, job3] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) - assert "parent_job_id" in job_ids and isinstance(job_ids["parent_job_id"], str) + assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) assert "child_job_ids" in job_ids and isinstance(job_ids["child_job_ids"], list) assert len(job_ids["child_job_ids"]) == len(jobs) - runner.cancel_job(job_id=job_ids["parent_job_id"]) + runner.cancel_job(job_id=job_ids["batch_id"]) job_status = runner.check_jobs( - job_ids=[job_ids["parent_job_id"]] + job_ids["child_job_ids"] + job_ids=[job_ids["batch_id"]] + job_ids["child_job_ids"] ) for job in job_status["job_states"]: assert job["status"] == Status.terminated.value @@ -206,27 +266,30 @@ def test_abandon_children(self, rq_mock, condor_mock): ) runner = self.getRunner() # type: SDKMethodRunner runner.get_condor = MagicMock(return_value=condor_mock) - job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=self.ws_id) + job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job2 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job3 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) - jobs = [job, job, job] + jobs = [job, job2, job3] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) - assert "parent_job_id" in job_ids and isinstance(job_ids["parent_job_id"], str) + assert "batch_id" in job_ids and isinstance(job_ids["batch_id"], str) assert "child_job_ids" in job_ids and isinstance(job_ids["child_job_ids"], list) assert len(job_ids["child_job_ids"]) == len(jobs) - runner.abandon_children( - parent_job_id=job_ids["parent_job_id"], + res = runner.abandon_children( + batch_id=job_ids["batch_id"], child_job_ids=job_ids["child_job_ids"][0:2], ) + assert res == { + "batch_id": job_ids["batch_id"], + "child_job_ids": job_ids["child_job_ids"][2:], + } - job_status = runner.check_jobs(job_ids=[job_ids["parent_job_id"]])[ - "job_states" - ][0] + job_status = runner.check_jobs(job_ids=[job_ids["batch_id"]])["job_states"][0] for job_id in job_ids["child_job_ids"][0:2]: assert job_id not in job_status["child_jobs"] @@ -243,19 +306,20 @@ def test_check_job_batch(self, rq_mock, condor_mock): ) runner = self.getRunner() # type: SDKMethodRunner runner.get_condor = MagicMock(return_value=condor_mock) - job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=self.ws_id) + job = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job2 = get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) + job3 = 
get_example_job_as_dict_for_runjob(user=self.user_id, wsid=None) si = SubmissionInfo(clusterid="test", submit=job, error=None) condor_mock.run_job = MagicMock(return_value=si) - condor_mock.extract_resources = MagicMock(return_value=self.cr) - jobs = [job, job, job] + jobs = [job, job2, job3] job_ids = runner.run_job_batch(params=jobs, batch_params={"wsid": self.ws_id}) - job_status = runner.check_job_batch(parent_job_id=job_ids["parent_job_id"]) - parent_job_state = job_status["parent_jobstate"] + job_status = runner.check_job_batch(batch_id=job_ids["batch_id"]) + batch_jobstate = job_status["batch_jobstate"] child_jobstates = job_status["child_jobstates"] assert len(child_jobstates) == len(jobs) for child_job in child_jobstates: - assert child_job["job_id"] in parent_job_state.get("child_jobs") + assert child_job["job_id"] in batch_jobstate.get("child_jobs") diff --git a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_utils.py b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_utils.py index 5f5fdd094..9ec251f22 100644 --- a/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_utils.py +++ b/test/tests_for_sdkmr/ee2_SDKMethodRunner_test_utils.py @@ -5,14 +5,15 @@ class ee2_sdkmr_test_helper: - def __init__(self, mr: SDKMethodRunner, wsid: str = 9999): - self.user_id = mr.user_id + def __init__(self, user_id: str, wsid: str = 9999): + self.user_id = user_id self.ws_id = wsid - self.token = mr.token - self.method_runner = mr def create_job_rec(self): - """ Save a job, forgoing runjob.run""" + """ + Save a job, forgoing runjob.run + Requires a MongoEngine connection + """ job = Job() @@ -53,7 +54,6 @@ def create_job_rec(self): job.job_output = None job.scheduler_id = "123" - with self.method_runner.get_mongo_util().mongo_engine_connection(): - job.save() + job.save() return str(job.id) diff --git a/test/tests_for_sdkmr/ee2_kafka_test.py b/test/tests_for_sdkmr/ee2_kafka_test.py index ebd73a845..60856d18c 100644 --- a/test/tests_for_sdkmr/ee2_kafka_test.py +++ b/test/tests_for_sdkmr/ee2_kafka_test.py @@ -29,7 +29,7 @@ def setUpClass(cls): def test_status_change(self): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( expected_exception=TypeError, expected_regex=r"__init__\(\) missing 1 required positional argument: 'scheduler_id'", ): diff --git a/test/tests_for_sdkmr/ee2_load_test.py b/test/tests_for_sdkmr/ee2_load_test.py index 3671d2c1e..55b614e6f 100644 --- a/test/tests_for_sdkmr/ee2_load_test.py +++ b/test/tests_for_sdkmr/ee2_load_test.py @@ -9,14 +9,16 @@ from configparser import ConfigParser from unittest.mock import patch -from lib.execution_engine2.authorization.workspaceauth import WorkspaceAuth -from lib.execution_engine2.db.MongoUtil import MongoUtil -from lib.execution_engine2.db.models.models import Job, Status -from lib.execution_engine2.execution_engine2Impl import execution_engine2 -from lib.execution_engine2.sdk.EE2Status import JobsStatus -from lib.execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner -from lib.execution_engine2.utils.Condor import Condor -from lib.execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.models.models import Job, Status +from execution_engine2.execution_engine2Impl import execution_engine2 +from execution_engine2.sdk.EE2Status import JobsStatus +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from execution_engine2.sdk.job_submission_parameters import 
JobRequirements +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.CondorTuples import SubmissionInfo +from execution_engine2.utils.clients import get_user_client_set, get_client_set from test.utils_shared.test_utils import ( bootstrap, get_sample_job_params, @@ -32,9 +34,9 @@ class ee2_server_load_test(unittest.TestCase): @classmethod def setUpClass(cls): - deploy = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") - print("Deploy is", deploy) - config = read_config_into_dict(deploy) + cls.deploy = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") + print("Deploy is", cls.deploy) + config = read_config_into_dict(cls.deploy) cls.cfg = config cls.user_id = "wsadmin" cls.ws_id = 9999 @@ -42,9 +44,7 @@ def setUpClass(cls): cls.ctx = {"token": cls.token, "user_id": cls.user_id} cls.impl = execution_engine2(cls.cfg) - cls.method_runner = SDKMethodRunner( - cls.cfg, user_id=cls.user_id, token=cls.token - ) + cls.method_runner = cls._getRunner() cls.mongo_util = MongoUtil(cls.cfg) cls.mongo_helper = MongoTestHelper(cls.cfg) @@ -54,187 +54,191 @@ def setUpClass(cls): cls.thread_count = 5 - def getRunner(self) -> SDKMethodRunner: + @classmethod + def _getRunner(cls) -> SDKMethodRunner: + with open(cls.deploy) as cf: + runner = SDKMethodRunner( + get_user_client_set(cls.cfg, cls.user_id, cls.token), + get_client_set(cls.cfg, cf), + ) # Initialize these clients from None - runner = copy.copy(self.__class__.method_runner) # type : SDKMethodRunner - runner._ee2_status = runner.get_jobs_status() # type: JobsStatus - runner._ee2_status._send_exec_stats_to_catalog = MagicMock(return_value=True) - runner._ee2_status.update_finished_job_with_usage = MagicMock(return_value=True) - runner.get_runjob() - runner._ee2_runjob._get_module_git_commit = MagicMock( - return_value="GitCommithash" - ) + status = runner.get_jobs_status() # type: JobsStatus + status._send_exec_stats_to_catalog = MagicMock(return_value=True) + status._update_finished_job_with_usage = MagicMock(return_value=True) + runjob = runner.get_runjob() + runjob._get_module_git_commit = MagicMock(return_value="GitCommithash") runner.get_job_logs() - runner.get_condor() runner.condor = MagicMock(autospec=True) # runner.get_job_resource_info = MagicMock(return_val={}) return runner - def test_init_job_stress(self): + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + def test_init_job_stress(self, cc_get_mod_ver): """ testing initializing 3 different jobs in multiple theads. 
""" + cc_get_mod_ver.return_value = {"git_commit_hash": "123"} thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.getRunner() - - # set job method differently to distinguish - method_1 = "app_1.a_method" - method_2 = "app_1.b_method" - job_params_1 = get_sample_job_params(method=method_1) - job_params_2 = get_sample_job_params(method=method_2) - - threads = list() - job_ids = list() - que = queue.Queue() - - # execute _init_job_rec for 2 different jobs in threads - for index in range(thread_count): - x = threading.Thread( - target=que.put( - runner.get_runjob()._init_job_rec(self.user_id, job_params_1) - ) + ori_job_count = Job.objects.count() + runner = self.method_runner + # set job method differently to distinguish + method_1 = "app_1.a_method" + method_2 = "app_1.b_method" + job_params_1 = get_sample_job_params(method=method_1) + job_params_1["job_reqs"] = JobRequirements(1, 1, 1, "njs") + job_params_2 = get_sample_job_params(method=method_2) + job_params_2["job_reqs"] = JobRequirements(1, 1, 1, "njs") + + threads = list() + job_ids = list() + que = queue.Queue() + + # execute _init_job_rec for 2 different jobs in threads + for index in range(thread_count): + x = threading.Thread( + target=que.put( + runner.get_runjob()._init_job_rec(self.user_id, job_params_1) ) - threads.append(x) - x.start() - y = threading.Thread( - target=que.put( - runner.get_runjob()._init_job_rec(self.user_id, job_params_2) - ) + ) + threads.append(x) + x.start() + y = threading.Thread( + target=que.put( + runner.get_runjob()._init_job_rec(self.user_id, job_params_2) ) - threads.append(y) - y.start() + ) + threads.append(y) + y.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - while not que.empty(): - job_ids.append(que.get()) + while not que.empty(): + job_ids.append(que.get()) - jobs = self.mongo_util.get_jobs(job_ids=job_ids) # testing get jobs + jobs = self.mongo_util.get_jobs(job_ids=job_ids) # testing get jobs - methods = [job.job_input.method for job in jobs] # examing methods returned - self.assertEqual(len(methods), thread_count * 2) - self.assertEqual(methods.count(method_1), thread_count) - self.assertEqual(methods.count(method_2), thread_count) + methods = [job.job_input.method for job in jobs] # examing methods returned + self.assertEqual(len(methods), thread_count * 2) + self.assertEqual(methods.count(method_1), thread_count) + self.assertEqual(methods.count(method_2), thread_count) - self.assertEqual( - len(set(job_ids)), thread_count * 2 - ) # testing identicalness of job_ids returned - self.assertEqual(len(job_ids), len(set(job_ids))) + self.assertEqual( + len(set(job_ids)), thread_count * 2 + ) # testing identicalness of job_ids returned + self.assertEqual(len(job_ids), len(set(job_ids))) - self.assertEqual( - ori_job_count, Job.objects.count() - thread_count * 2 - ) # testing job numbers created + self.assertEqual( + ori_job_count, Job.objects.count() - thread_count * 2 + ) # testing job numbers created - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_update_job_status_stress(self): """ testing update jobs into different status in multiple threads """ - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.getRunner() + ori_job_count = Job.objects.count() + runner = 
self.method_runner - job_params = get_sample_job_params() + job_params = get_sample_job_params() + job_params["job_reqs"] = JobRequirements(1, 1, 1, "njs") - thread_count = self.thread_count # threads to test + thread_count = self.thread_count # threads to test - job_ids_queued = list() # jobs to be set into 'queued' status - job_ids_running = list() # jobs to be set into 'running' status - job_ids_completed = list() # jobs to be set into 'completed' status + job_ids_queued = list() # jobs to be set into 'queued' status + job_ids_running = list() # jobs to be set into 'running' status + job_ids_completed = list() # jobs to be set into 'completed' status - # initializing jobs to be tested - for index in range(thread_count): - job_ids_queued.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) - job_ids_running.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) - job_ids_completed.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) + # initializing jobs to be tested + for index in range(thread_count): + job_ids_queued.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) + job_ids_running.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) + job_ids_completed.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) - # examing newly created job status - queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) - for job in queued_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNone(job_rec.get("queued")) - self.assertEqual(job_rec.get("status"), "created") - - running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) - for job in running_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNone(job_rec.get("running")) - self.assertEqual(job_rec.get("status"), "created") - - finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) - for job in finish_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNone(job_rec.get("finished")) - self.assertEqual(job_rec.get("status"), "created") - - threads = list() - - def update_states(index, job_ids_queued, job_ids_running, job_ids_finish): - """ - update jobs status in one thread - """ - runner.get_runjob().update_job_to_queued( - job_ids_queued[index], "scheduler_id" - ) - runner.get_jobs_status().start_job(job_ids_running[index]) - runner.get_jobs_status().start_job(job_ids_finish[index]) - job_output = { - "version": "11", - "result": {"result": 1}, - "id": "5d54bdcb9b402d15271b3208", - } - runner.finish_job(job_id=job_ids_finish[index], job_output=job_output) - - for index in range(thread_count): - x = threading.Thread( - target=update_states( - index, job_ids_queued, job_ids_running, job_ids_completed - ) + # examing newly created job status + queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) + for job in queued_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNone(job_rec.get("queued")) + self.assertEqual(job_rec.get("status"), "created") + + running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) + for job in running_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNone(job_rec.get("running")) + self.assertEqual(job_rec.get("status"), "created") + + finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) + for job in finish_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNone(job_rec.get("finished")) + self.assertEqual(job_rec.get("status"), "created") + + threads = list() + + def update_states(index, job_ids_queued, 
job_ids_running, job_ids_finish): + """ + update jobs status in one thread + """ + runner.get_runjob().update_job_to_queued( + job_ids_queued[index], "scheduler_id" + ) + runner.get_jobs_status().start_job(job_ids_running[index]) + runner.get_jobs_status().start_job(job_ids_finish[index]) + job_output = { + "version": "11", + "result": {"result": 1}, + "id": "5d54bdcb9b402d15271b3208", + } + runner.finish_job(job_id=job_ids_finish[index], job_output=job_output) + + for index in range(thread_count): + x = threading.Thread( + target=update_states( + index, job_ids_queued, job_ids_running, job_ids_completed ) - threads.append(x) - x.start() - - for index, thread in enumerate(threads): - thread.join() - - # examing updateed job status - queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) - for job in queued_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNotNone(job_rec.get("queued")) - self.assertEqual(job_rec.get("status"), "queued") - - running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) - for job in running_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNotNone(job_rec.get("running")) - self.assertEqual(job_rec.get("status"), "running") - - finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) - for job in finish_jobs: - job_rec = job.to_mongo().to_dict() - self.assertIsNotNone(job_rec.get("finished")) - self.assertEqual(job_rec.get("status"), "completed") - - jobs = self.mongo_util.get_jobs( - job_ids=(job_ids_queued + job_ids_running + job_ids_completed) ) - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + threads.append(x) + x.start() + + for index, thread in enumerate(threads): + thread.join() + + # examing updateed job status + queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) + for job in queued_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNotNone(job_rec.get("queued")) + self.assertEqual(job_rec.get("status"), "queued") + + running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) + for job in running_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNotNone(job_rec.get("running")) + self.assertEqual(job_rec.get("status"), "running") + + finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) + for job in finish_jobs: + job_rec = job.to_mongo().to_dict() + self.assertIsNotNone(job_rec.get("finished")) + self.assertEqual(job_rec.get("status"), "completed") + + jobs = self.mongo_util.get_jobs( + job_ids=(job_ids_queued + job_ids_running + job_ids_completed) + ) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) # @patch.object(Catalog, "get_module_version", return_value="module.version") # @patch("lib.execution_engine2.utils.Condor.Condor", autospec=True) @@ -244,225 +248,297 @@ def update_states(index, job_ids_queued, job_ids_running, job_ids_finish): @patch.object(Condor, "run_job", return_value=si) @patch.object(WorkspaceAuth, "can_write", return_value=True) @patch( - "lib.installed_clients.CatalogClient.Catalog.get_module_version", autospec=True + "installed_clients.CatalogClient.Catalog.list_client_group_configs", + autospec=True, ) - @patch("lib.installed_clients.CatalogClient.Catalog.log_exec_stats", autospec=True) - def test_run_job_stress(self, ccles, cc, workspace, condor): + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + def test_retry_job_stress( + self, cc_get_mod_ver, cc_list_cli_configs, workspace, condor + ): + """ + Not a stress test, more of an impl test + """ + 
cc_get_mod_ver.return_value = {"git_commit_hash": "moduleversiongoeshere"} + cc_list_cli_configs.return_value = [] + + # set job method differently to distinguish + method_1 = "app1.a_method" + job_params_1 = get_sample_job_params(method=method_1, app_id="app1/a") + + # Remove fake parent_job_id + del job_params_1["parent_job_id"] + + job_ids = [] + for i in range(10): + job_ids.append(self.impl.run_job(ctx=self.ctx, params=job_params_1)[0]) + + for job_id in job_ids: + self.impl.update_job_status( + ctx=self.ctx, params={"job_id": job_id, "status": "error"} + ) + self.impl.retry_job(ctx=self.ctx, params={"job_id": job_id}) + + @patch.object(Condor, "run_job", return_value=si) + @patch.object(WorkspaceAuth, "can_write", return_value=True) + @patch( + "installed_clients.CatalogClient.Catalog.list_client_group_configs", + autospec=True, + ) + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + @patch("installed_clients.CatalogClient.Catalog.log_exec_stats", autospec=True) + def test_check_job_batch_stress( + self, cc_log_stats, cc_get_mod_ver, cc_list_cli_configs, workspace, condor + ): + # Note, not a stress test, just an impl file test + cc_get_mod_ver.return_value = {"git_commit_hash": "moduleversiongoeshere"} + cc_list_cli_configs.return_value = [] + + # set job method differently to distinguish + method_1 = "app1.a_method" + method_2 = "app2.b_method" + + job_params_1 = get_sample_job_params( + method=method_1, app_id="app1/a", wsid=None, parent_job_id=None + ) + job_params_2 = get_sample_job_params( + method=method_2, app_id="app2/b", wsid=None, parent_job_id=None + ) + + batch_id = self.impl.run_job_batch( + ctx=self.ctx, params=[job_params_1, job_params_2], batch_params={} + )[0]["batch_id"] + check_job_batch_status = self.impl.check_job_batch( + ctx=self.ctx, params={"job_id": batch_id} + ) + assert "batch_jobstate" in check_job_batch_status[0] + assert "child_jobstates" in check_job_batch_status[0] + + @patch.object(Condor, "run_job", return_value=si) + @patch.object(WorkspaceAuth, "can_write", return_value=True) + @patch( + "installed_clients.CatalogClient.Catalog.list_client_group_configs", + autospec=True, + ) + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + @patch("installed_clients.CatalogClient.Catalog.log_exec_stats", autospec=True) + def test_run_job_stress( + self, cc_log_stats, cc_get_mod_ver, cc_list_cli_configs, workspace, condor + ): """ testing running 3 different jobs in multiple theads. 
""" - cc.return_value = {"git_commit_hash": "moduleversiongoeshere"} + cc_get_mod_ver.return_value = {"git_commit_hash": "moduleversiongoeshere"} + cc_list_cli_configs.return_value = [] thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() + ori_job_count = Job.objects.count() - # set job method differently to distinguish - method_1 = "app1.a_method" - method_2 = "app2.b_method" - method_3 = "app3.c_method" + # set job method differently to distinguish + method_1 = "app1.a_method" + method_2 = "app2.b_method" + method_3 = "app3.c_method" - job_params_1 = get_sample_job_params(method=method_1) - job_params_2 = get_sample_job_params(method=method_2) - job_params_3 = get_sample_job_params(method=method_3) + job_params_1 = get_sample_job_params(method=method_1, app_id="app1/a") + job_params_2 = get_sample_job_params(method=method_2, app_id="app2/b") + job_params_3 = get_sample_job_params(method=method_3, app_id="app3/c") - threads = list() - job_ids = list() - que = queue.Queue() + threads = list() + job_ids = list() + que = queue.Queue() - # execute run_job for 3 different jobs in threads - for index in range(thread_count): - x = threading.Thread( - target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_1)) - ) - threads.append(x) - x.start() + # execute run_job for 3 different jobs in threads + for index in range(thread_count): + x = threading.Thread( + target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_1)) + ) + threads.append(x) + x.start() - y = threading.Thread( - target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_2)) - ) - threads.append(y) - y.start() + y = threading.Thread( + target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_2)) + ) + threads.append(y) + y.start() - z = threading.Thread( - target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_3)) - ) - threads.append(z) - z.start() + z = threading.Thread( + target=que.put(self.impl.run_job(ctx=self.ctx, params=job_params_3)) + ) + threads.append(z) + z.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - while not que.empty(): - job_ids.append(que.get()[0]) + while not que.empty(): + job_ids.append(que.get()[0]) - jobs = self.mongo_util.get_jobs(job_ids=job_ids) # testing get jobs + jobs = self.mongo_util.get_jobs(job_ids=job_ids) # testing get jobs - methods = [job.job_input.method for job in jobs] # examing methods returned - self.assertEqual(len(methods), thread_count * 3) - self.assertEqual(methods.count(method_1), thread_count) - self.assertEqual(methods.count(method_2), thread_count) - self.assertEqual(methods.count(method_3), thread_count) + methods = [job.job_input.method for job in jobs] # examing methods returned + self.assertEqual(len(methods), thread_count * 3) + self.assertEqual(methods.count(method_1), thread_count) + self.assertEqual(methods.count(method_2), thread_count) + self.assertEqual(methods.count(method_3), thread_count) - status = [ - job.status for job in jobs - ] # all jobs should eventually be put to 'queued' status - self.assertCountEqual(status, [Status.queued.value] * thread_count * 3) + status = [ + job.status for job in jobs + ] # all jobs should eventually be put to 'queued' status + self.assertCountEqual(status, [Status.queued.value] * thread_count * 3) - self.assertEqual( - len(set(job_ids)), thread_count * 3 - ) # testing identicalness of job_ids returned - 
self.assertEqual(len(job_ids), len(set(job_ids))) + self.assertEqual( + len(set(job_ids)), thread_count * 3 + ) # testing identicalness of job_ids returned + self.assertEqual(len(job_ids), len(set(job_ids))) - self.assertEqual( - ori_job_count, Job.objects.count() - thread_count * 3 - ) # testing job numbers created + self.assertEqual( + ori_job_count, Job.objects.count() - thread_count * 3 + ) # testing job numbers created - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_update_job_status(self): """ testing update jobs into different status in multiple threads """ - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.getRunner() + ori_job_count = Job.objects.count() + runner = self.method_runner - job_params = get_sample_job_params() + job_params = get_sample_job_params() + job_params["job_reqs"] = JobRequirements(1, 1, 1, "njs") - thread_count = self.thread_count # threads to test + thread_count = self.thread_count # threads to test - job_ids_queued = list() # jobs to be set into 'queued' status - job_ids_running = list() # jobs to be set into 'running' status - job_ids_completed = list() # jobs to be set into 'completed' status + job_ids_queued = list() # jobs to be set into 'queued' status + job_ids_running = list() # jobs to be set into 'running' status + job_ids_completed = list() # jobs to be set into 'completed' status - # initializing jobs to be tested - for index in range(thread_count): - job_ids_queued.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) - job_ids_running.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) - job_ids_completed.append( - runner.get_runjob()._init_job_rec(self.user_id, job_params) - ) + # initializing jobs to be tested + for index in range(thread_count): + job_ids_queued.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) + job_ids_running.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) + job_ids_completed.append( + runner.get_runjob()._init_job_rec(self.user_id, job_params) + ) - # examing newly created job status - init_jobs = self.mongo_util.get_jobs( - job_ids=job_ids_queued + job_ids_running + job_ids_completed + # examing newly created job status + init_jobs = self.mongo_util.get_jobs( + job_ids=job_ids_queued + job_ids_running + job_ids_completed + ) + for job in init_jobs: + self.assertEqual(job.to_mongo().to_dict().get("status"), "created") + + threads = list() + + def update_states(index, job_ids_queued, job_ids_running, job_ids_completed): + """ + update jobs status in one thread + """ + self.impl.update_job_status( + ctx=self.ctx, + params={"job_id": job_ids_queued[index], "status": "queued"}, + ) + self.impl.update_job_status( + ctx=self.ctx, + params={"job_id": job_ids_running[index], "status": "running"}, + ) + self.impl.update_job_status( + ctx=self.ctx, + params={"job_id": job_ids_completed[index], "status": "completed"}, ) - for job in init_jobs: - self.assertEqual(job.to_mongo().to_dict().get("status"), "created") - - threads = list() - - def update_states( - index, job_ids_queued, job_ids_running, job_ids_completed - ): - """ - update jobs status in one thread - """ - self.impl.update_job_status( - ctx=self.ctx, - params={"job_id": job_ids_queued[index], "status": "queued"}, - ) - self.impl.update_job_status( - ctx=self.ctx, - params={"job_id": job_ids_running[index], "status": "running"}, 
- ) - self.impl.update_job_status( - ctx=self.ctx, - params={"job_id": job_ids_completed[index], "status": "completed"}, - ) - for index in range(thread_count): - x = threading.Thread( - target=update_states( - index, job_ids_queued, job_ids_running, job_ids_completed - ) + for index in range(thread_count): + x = threading.Thread( + target=update_states( + index, job_ids_queued, job_ids_running, job_ids_completed ) - threads.append(x) - x.start() + ) + threads.append(x) + x.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - # examing updateed job status - queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) - for job in queued_jobs: - self.assertEqual(job.to_mongo().to_dict().get("status"), "queued") + # examing updateed job status + queued_jobs = self.mongo_util.get_jobs(job_ids=job_ids_queued) + for job in queued_jobs: + self.assertEqual(job.to_mongo().to_dict().get("status"), "queued") - running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) - for job in running_jobs: - self.assertEqual(job.to_mongo().to_dict().get("status"), "running") + running_jobs = self.mongo_util.get_jobs(job_ids=job_ids_running) + for job in running_jobs: + self.assertEqual(job.to_mongo().to_dict().get("status"), "running") - finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) - for job in finish_jobs: - self.assertEqual(job.to_mongo().to_dict().get("status"), "completed") + finish_jobs = self.mongo_util.get_jobs(job_ids=job_ids_completed) + for job in finish_jobs: + self.assertEqual(job.to_mongo().to_dict().get("status"), "completed") - jobs = self.mongo_util.get_jobs( - job_ids=(job_ids_queued + job_ids_running + job_ids_completed) - ) - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs = self.mongo_util.get_jobs( + job_ids=(job_ids_queued + job_ids_running + job_ids_completed) + ) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) - def test_check_jobs_stress(self): + @patch("installed_clients.CatalogClient.Catalog.get_module_version", autospec=True) + def test_check_jobs_stress(self, cc_get_mod_ver): """ testing check jobs in multiple theads. 
""" - + cc_get_mod_ver.return_value = {"git_commit_hash": "123"} thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.getRunner() - - # set job method differently to distinguish - method_1 = "a_method" - method_2 = "b_method" - job_params_1 = get_sample_job_params(method=method_1) - job_params_2 = get_sample_job_params(method=method_2) - - # create jobs - job_id_1 = runner.get_runjob()._init_job_rec(self.user_id, job_params_1) - job_id_2 = runner.get_runjob()._init_job_rec(self.user_id, job_params_2) - - threads = list() - job_status = list() - que = queue.Queue() - - # execute check_jobs in multiple threads - for index in range(thread_count): - x = threading.Thread( - target=que.put( - self.impl.check_jobs( - ctx=self.ctx, params={"job_ids": [job_id_1, job_id_2]} - ) + ori_job_count = Job.objects.count() + runner = self.method_runner + + # set job method differently to distinguish + method_1 = "a_method" + method_2 = "b_method" + job_params_1 = get_sample_job_params(method=method_1) + job_params_1["job_reqs"] = JobRequirements(1, 1, 1, "njs") + job_params_2 = get_sample_job_params(method=method_2) + job_params_2["job_reqs"] = JobRequirements(1, 1, 1, "njs") + + # create jobs + job_id_1 = runner.get_runjob()._init_job_rec(self.user_id, job_params_1) + job_id_2 = runner.get_runjob()._init_job_rec(self.user_id, job_params_2) + + threads = list() + job_status = list() + que = queue.Queue() + + # execute check_jobs in multiple threads + for index in range(thread_count): + x = threading.Thread( + target=que.put( + self.impl.check_jobs( + ctx=self.ctx, params={"job_ids": [job_id_1, job_id_2]} ) ) - threads.append(x) - x.start() + ) + threads.append(x) + x.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - while not que.empty(): - job_status.append(que.get()) + while not que.empty(): + job_status.append(que.get()) - # exam returned job status - for job_status in job_status: - job_status = job_status[0]["job_states"] - job_ids = [js["job_id"] for js in job_status] - job_methods = [js["job_input"]["method"] for js in job_status] - self.assertCountEqual(job_ids, [job_id_1, job_id_2]) - self.assertCountEqual(job_methods, [method_1, method_2]) + # exam returned job status + for job_status in job_status: + job_status = job_status[0]["job_states"] + job_ids = [js["job_id"] for js in job_status] + job_methods = [js["job_input"]["method"] for js in job_status] + self.assertCountEqual(job_ids, [job_id_1, job_id_2]) + self.assertCountEqual(job_methods, [method_1, method_2]) - jobs = self.mongo_util.get_jobs(job_ids=[job_id_1, job_id_2]) - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs = self.mongo_util.get_jobs(job_ids=[job_id_1, job_id_2]) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_check_job_canceled_stress(self): """ @@ -471,106 +547,102 @@ def test_check_job_canceled_stress(self): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.getRunner() + ori_job_count = Job.objects.count() + runner = self.method_runner - job_params = get_sample_job_params() + job_params = get_sample_job_params() + job_params["job_reqs"] = JobRequirements(1, 1, 1, "njs") - # create jobs - job_id_running = runner.get_runjob()._init_job_rec(self.user_id, job_params) - job_id_terminated = 
runner.get_runjob()._init_job_rec( - self.user_id, job_params - ) - job_id_completed = runner.get_runjob()._init_job_rec( - self.user_id, job_params - ) + # create jobs + job_id_running = runner.get_runjob()._init_job_rec(self.user_id, job_params) + job_id_terminated = runner.get_runjob()._init_job_rec(self.user_id, job_params) + job_id_completed = runner.get_runjob()._init_job_rec(self.user_id, job_params) - self.impl.update_job_status( - ctx=self.ctx, params={"job_id": job_id_running, "status": "running"} - ) - self.impl.update_job_status( - ctx=self.ctx, - params={"job_id": job_id_terminated, "status": "terminated"}, - ) - self.impl.update_job_status( - ctx=self.ctx, params={"job_id": job_id_completed, "status": "completed"} - ) + self.impl.update_job_status( + ctx=self.ctx, params={"job_id": job_id_running, "status": "running"} + ) + self.impl.update_job_status( + ctx=self.ctx, + params={"job_id": job_id_terminated, "status": "terminated"}, + ) + self.impl.update_job_status( + ctx=self.ctx, params={"job_id": job_id_completed, "status": "completed"} + ) - threads = list() - job_canceled_status = list() - que = queue.Queue() - - # execute check_job_canceled in multiple threads - for index in range(thread_count): - x = threading.Thread( - target=que.put( - self.impl.check_job_canceled( - ctx=self.ctx, params={"job_id": job_id_running} - ) + threads = list() + job_canceled_status = list() + que = queue.Queue() + + # execute check_job_canceled in multiple threads + for index in range(thread_count): + x = threading.Thread( + target=que.put( + self.impl.check_job_canceled( + ctx=self.ctx, params={"job_id": job_id_running} ) ) - threads.append(x) - x.start() - - y = threading.Thread( - target=que.put( - self.impl.check_job_canceled( - ctx=self.ctx, params={"job_id": job_id_terminated} - ) + ) + threads.append(x) + x.start() + + y = threading.Thread( + target=que.put( + self.impl.check_job_canceled( + ctx=self.ctx, params={"job_id": job_id_terminated} ) ) - threads.append(y) - y.start() - - z = threading.Thread( - target=que.put( - self.impl.check_job_canceled( - ctx=self.ctx, params={"job_id": job_id_completed} - ) + ) + threads.append(y) + y.start() + + z = threading.Thread( + target=que.put( + self.impl.check_job_canceled( + ctx=self.ctx, params={"job_id": job_id_completed} ) ) - threads.append(z) - z.start() - - for index, thread in enumerate(threads): - thread.join() - - while not que.empty(): - job_canceled_status.append(que.get()) - - # exam correct job ids returned - job_ids_returned = [ - jcs_return[0]["job_id"] for jcs_return in job_canceled_status - ] - self.assertEqual( - len(job_ids_returned), thread_count * 3 - ) # exam total job number returned - self.assertEqual(job_ids_returned.count(job_id_running), thread_count) - self.assertEqual(job_ids_returned.count(job_id_terminated), thread_count) - self.assertEqual(job_ids_returned.count(job_id_completed), thread_count) - - # exam returned job canceled status - for job_canceled_status_return in job_canceled_status: - job_canceled_status_return = job_canceled_status_return[0] - if job_canceled_status_return["job_id"] == job_id_running: - self.assertFalse(job_canceled_status_return["canceled"]) - self.assertFalse(job_canceled_status_return["finished"]) - if job_canceled_status_return["job_id"] == job_id_terminated: - self.assertTrue(job_canceled_status_return["canceled"]) - self.assertTrue(job_canceled_status_return["finished"]) - if job_canceled_status_return["job_id"] == job_id_completed: - 
self.assertFalse(job_canceled_status_return["canceled"]) - self.assertTrue(job_canceled_status_return["finished"]) - - jobs = self.mongo_util.get_jobs( - job_ids=[job_id_running, job_id_terminated, job_id_completed] ) + threads.append(z) + z.start() + + for index, thread in enumerate(threads): + thread.join() + + while not que.empty(): + job_canceled_status.append(que.get()) + + # exam correct job ids returned + job_ids_returned = [ + jcs_return[0]["job_id"] for jcs_return in job_canceled_status + ] + self.assertEqual( + len(job_ids_returned), thread_count * 3 + ) # exam total job number returned + self.assertEqual(job_ids_returned.count(job_id_running), thread_count) + self.assertEqual(job_ids_returned.count(job_id_terminated), thread_count) + self.assertEqual(job_ids_returned.count(job_id_completed), thread_count) + + # exam returned job canceled status + for job_canceled_status_return in job_canceled_status: + job_canceled_status_return = job_canceled_status_return[0] + if job_canceled_status_return["job_id"] == job_id_running: + self.assertFalse(job_canceled_status_return["canceled"]) + self.assertFalse(job_canceled_status_return["finished"]) + if job_canceled_status_return["job_id"] == job_id_terminated: + self.assertTrue(job_canceled_status_return["canceled"]) + self.assertTrue(job_canceled_status_return["finished"]) + if job_canceled_status_return["job_id"] == job_id_completed: + self.assertFalse(job_canceled_status_return["canceled"]) + self.assertTrue(job_canceled_status_return["finished"]) + + jobs = self.mongo_util.get_jobs( + job_ids=[job_id_running, job_id_terminated, job_id_completed] + ) - for job in jobs: - job.delete() + for job in jobs: + job.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + self.assertEqual(ori_job_count, Job.objects.count()) def test_get_job_logs_stress(self): """ @@ -579,57 +651,54 @@ def test_get_job_logs_stress(self): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - ori_job_count = Job.objects.count() - runner = self.getRunner() + ori_job_count = Job.objects.count() + runner = self.method_runner - # create job - job_id = runner.get_runjob()._init_job_rec( - self.user_id, get_sample_job_params() - ) + # create job + params = get_sample_job_params() + params["job_reqs"] = JobRequirements(1, 1, 1, "njs") + job_id = runner.get_runjob()._init_job_rec(self.user_id, params) - # add one line to job - ts = time.time() - job_line = [{"line": "hello ee2", "is_error": 1, "ts": ts}] - self.impl.add_job_logs( - ctx=self.ctx, params={"job_id": job_id}, lines=job_line - ) + # add one line to job + ts = time.time() + job_line = [{"line": "hello ee2", "is_error": 1, "ts": ts}] + self.impl.add_job_logs(ctx=self.ctx, params={"job_id": job_id}, lines=job_line) - threads = list() - job_lines = list() - que = queue.Queue() + threads = list() + job_lines = list() + que = queue.Queue() - # execute get_job_logs in multiple threads - for index in range(thread_count): - x = threading.Thread( - target=que.put( - self.impl.get_job_logs(ctx=self.ctx, params={"job_id": job_id}) - ) + # execute get_job_logs in multiple threads + for index in range(thread_count): + x = threading.Thread( + target=que.put( + self.impl.get_job_logs(ctx=self.ctx, params={"job_id": job_id}) ) - threads.append(x) - x.start() + ) + threads.append(x) + x.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - while not que.empty(): - job_lines.append(que.get()) + while 
not que.empty(): + job_lines.append(que.get()) - self.assertEqual( - len(job_lines), thread_count - ) # exam total number of job lines returned + self.assertEqual( + len(job_lines), thread_count + ) # exam total number of job lines returned - # exam each get_job_logs result - for job_line in job_lines: - job_line = job_line[0]["lines"][0] - self.assertEqual(job_line["line"], "hello ee2") - self.assertEqual(job_line["linepos"], 0) - self.assertEqual(job_line["is_error"], 1) - self.assertEqual(job_line["ts"], int(ts * 1000)) + # exam each get_job_logs result + for job_line in job_lines: + job_line = job_line[0]["lines"][0] + self.assertEqual(job_line["line"], "hello ee2") + self.assertEqual(job_line["linepos"], 0) + self.assertEqual(job_line["is_error"], 1) + self.assertEqual(job_line["ts"], int(ts * 1000)) - jobs = self.mongo_util.get_jobs(job_ids=[job_id]) - jobs.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + jobs = self.mongo_util.get_jobs(job_ids=[job_id]) + jobs.delete() + self.assertEqual(ori_job_count, Job.objects.count()) def test_add_job_logs_stress(self): """ @@ -638,61 +707,57 @@ def test_add_job_logs_stress(self): thread_count = self.thread_count # threads to test - with self.mongo_util.mongo_engine_connection(): - - ori_job_count = Job.objects.count() - print("original job count is", ori_job_count) - runner = self.getRunner() - - # create job - job_id = runner.get_runjob()._init_job_rec( - self.user_id, get_sample_job_params() - ) - - # job line to be added - ts = time.time() - job_line = [{"line": "hello ee2", "is_error": 1, "ts": ts}] - - threads = list() - que = queue.Queue() - # execute add_job_logs in multiple threads - print("Number of threads are", thread_count) - for index in range(thread_count): - x = threading.Thread( - target=que.put( - self.impl.add_job_logs( - ctx=self.ctx, params={"job_id": job_id}, lines=job_line - ) + ori_job_count = Job.objects.count() + print("original job count is", ori_job_count) + runner = self.method_runner + + # create job + params = get_sample_job_params() + params["job_reqs"] = JobRequirements(1, 1, 1, "njs") + job_id = runner.get_runjob()._init_job_rec(self.user_id, params) + + # job line to be added + ts = time.time() + job_line = [{"line": "hello ee2", "is_error": 1, "ts": ts}] + + threads = list() + que = queue.Queue() + # execute add_job_logs in multiple threads + print("Number of threads are", thread_count) + for index in range(thread_count): + x = threading.Thread( + target=que.put( + self.impl.add_job_logs( + ctx=self.ctx, params={"job_id": job_id}, lines=job_line ) ) - threads.append(x) - x.start() + ) + threads.append(x) + x.start() - for index, thread in enumerate(threads): - thread.join() + for index, thread in enumerate(threads): + thread.join() - job_lines = self.impl.get_job_logs(ctx=self.ctx, params={"job_id": job_id})[ - 0 - ] + job_lines = self.impl.get_job_logs(ctx=self.ctx, params={"job_id": job_id})[0] - self.assertEqual( - job_lines["last_line_number"], thread_count - 1 - ) # exam total number of job lines created by add_job_logs + self.assertEqual( + job_lines["last_line_number"], thread_count - 1 + ) # exam total number of job lines created by add_job_logs - # exam each line created by add_job_logs - lines = job_lines["lines"] - self.assertEqual(len(lines), thread_count) - line_pos = list() - for line in lines: - self.assertEqual(line["line"], "hello ee2") - self.assertEqual(line["is_error"], 1) - self.assertEqual(line["ts"], int(ts * 1000)) - line_pos.append(line["linepos"]) - 
self.assertCountEqual(line_pos, list(range(0, thread_count))) + # exam each line created by add_job_logs + lines = job_lines["lines"] + self.assertEqual(len(lines), thread_count) + line_pos = list() + for line in lines: + self.assertEqual(line["line"], "hello ee2") + self.assertEqual(line["is_error"], 1) + self.assertEqual(line["ts"], int(ts * 1000)) + line_pos.append(line["linepos"]) + self.assertCountEqual(line_pos, list(range(0, thread_count))) - jobs = self.mongo_util.get_jobs(job_ids=[job_id]) + jobs = self.mongo_util.get_jobs(job_ids=[job_id]) - for job in jobs: - job.delete() + for job in jobs: + job.delete() - self.assertEqual(ori_job_count, Job.objects.count()) + self.assertEqual(ori_job_count, Job.objects.count()) diff --git a/test/tests_for_sdkmr/ee2_retry_test.py b/test/tests_for_sdkmr/ee2_retry_test.py new file mode 100644 index 000000000..c567227e3 --- /dev/null +++ b/test/tests_for_sdkmr/ee2_retry_test.py @@ -0,0 +1,141 @@ +""" +Unit tests for the Retry Code +""" +from unittest.mock import create_autospec, MagicMock + +from pytest import raises + +from execution_engine2.exceptions import CannotRetryJob, RetryFailureException +from execution_engine2.sdk.EE2Runjob import EE2RunJob +from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner +from test.utils_shared.test_utils import assert_exception_correct +from test.utils_shared.test_utils import get_example_job + + +def test_retry_db_failures(): + """ + * Test correct db update failure message, and that cancel_job is called + * Test that on exception, the db_update failure is called + """ + sdkmr = MagicMock() + retry_job = get_example_job(status="error") + parent_job = get_example_job(status="error") + retry_job.job_input.parent_job_id = "123" + sdkmr.get_job_with_permission = MagicMock(return_value=retry_job) + sdkmr.cancel_job = MagicMock() + rj = EE2RunJob(sdkmr=sdkmr) + + # Check correct exception and that safe cancel/cancel_job is called + job1 = "job1" + job_to_abort = "job_to_abort" + + # Check to make sure cancel_job is called on failure + with raises(Exception) as e: + rj._db_update_failure( + job_that_failed_operation="job1", + job_to_abort="job_to_abort", + exception=Exception(123), + ) + expected_exception = RetryFailureException( + f"Couldn't update job record:{job1} during retry. Aborting:{job_to_abort} Exception:123 " + ) + assert_exception_correct(e.value, expected_exception) + assert sdkmr.cancel_job.call_count == 1 + + # Check to make sure safe_cancel_call is called on failure + with raises(Exception) as e: + rj._safe_cancel = MagicMock() + rj._db_update_failure( + job_that_failed_operation="job1", + job_to_abort="job_to_abort", + exception=Exception(123), + ) + expected_exception = RetryFailureException( + f"Couldn't update job record:{job1} during retry. 
Aborting:{job_to_abort} Exception:123 " + ) + assert_exception_correct(e.value, expected_exception) + assert rj._safe_cancel.call_count == 1 + + rj.run = MagicMock(return_value=retry_job) + # One DB failure + rj._db_update_failure = MagicMock(side_effect=Exception("Boom!")) + with raises(Exception): + rj._retry(job_id=retry_job.id, job=retry_job, batch_job=parent_job) + assert rj._db_update_failure.call_count == 1 + + # Two db failures + rj._db_update_failure = MagicMock() + rj._retry(job_id=retry_job.id, job=retry_job, batch_job=parent_job) + + assert not retry_job.retry_saved_toggle + + +def test_validate_retry(): + sdkmr = create_autospec(SDKMethodRunner, instance=True, spec_set=True) + + # Passing case with nothing to assert, all goes well + good_job = get_example_job(status="error") + sdkmr.get_job_with_permission = MagicMock(return_value=good_job) + rj = EE2RunJob(sdkmr=sdkmr) + rj._validate_retry_presubmit("unknown") + + # Fail case with the wrong status + with raises(Exception) as e: + sdkmr.get_job_with_permission = MagicMock( + return_value=get_example_job(status="running") + ) + rj = EE2RunJob(sdkmr=sdkmr) + rj._validate_retry_presubmit("unknown") + expected_exception = CannotRetryJob( + "Error retrying job unknown with status running: can only retry jobs with " + "status 'error' or 'terminated'", + ) + assert_exception_correct(e.value, expected_exception) + + # Fail case with the batch job + with raises(Exception) as e: + good_job.batch_job = True + sdkmr.get_job_with_permission = MagicMock(return_value=good_job) + rj = EE2RunJob(sdkmr=sdkmr) + rj._validate_retry_presubmit("unknown") + + expected_exception = CannotRetryJob( + "Cannot retry batch job parents. Must retry individual jobs" + ) + assert_exception_correct(e.value, expected_exception) + + +def test_retry_get_run_job_params_from_existing_job(): + """ + Test to see that the retried job matches the job it got retried from the db + Not all fields are expected back + """ + example_job = get_example_job() + example_job_as_dict = example_job.to_mongo().to_dict() + extracted_job = EE2RunJob._get_run_job_params_from_existing_job( + example_job, user_id=example_job.user + "other" + ) + # Check Top Level Fields Match + discarded_keys = [ + "user", + "authstrat", + "status", + "job_input", + "child_jobs", + "batch_job", + "retry_ids", + "retry_saved_toggle", + ] + expected_unequal_keys = [ + "updated", + "queued", + "scheduler_id", + ] + for key in example_job_as_dict.keys(): + if key in discarded_keys: + continue + if key in expected_unequal_keys: + if key in extracted_job: + assert example_job_as_dict[key] != extracted_job[key] + else: + assert example_job_as_dict[key] == extracted_job[key] diff --git a/test/tests_for_sdkmr/ee2_scheduler_test.py b/test/tests_for_sdkmr/ee2_scheduler_test.py deleted file mode 100644 index 66ec00622..000000000 --- a/test/tests_for_sdkmr/ee2_scheduler_test.py +++ /dev/null @@ -1,232 +0,0 @@ -# -*- coding: utf-8 -*- -import logging -import os -import unittest - -from lib.execution_engine2.sdk.EE2Runjob import ConciergeParams -from lib.execution_engine2.utils.CatalogUtils import CatalogUtils -from lib.execution_engine2.utils.Condor import Condor -from test.utils_shared.test_utils import bootstrap - -logging.basicConfig(level=logging.INFO) - -bootstrap() - - -class ExecutionEngine2SchedulerTest(unittest.TestCase): - @classmethod - def setUpClass(cls): - deploy = os.environ.get("KB_DEPLOYMENT_CONFIG", "test/deploy.cfg") - cls.condor = Condor(deploy) - cls.job_id = "1234" - cls.user = "kbase" - 
cls.catalog_utils = CatalogUtils( - url="https://ci.kbase.us/services/Catalog", admin_token="123" - ) - - @classmethod - def tearDownClass(cls): - if hasattr(cls, "wsName"): - cls.wsClient.delete_workspace({"workspace": cls.wsName}) - print("Test workspace was deleted") - - def _create_sample_params(self, cgroups): - params = dict() - params["job_id"] = self.job_id - params["user_id"] = "kbase" - params["token"] = "test_token" - rr = CatalogUtils.normalize_job_settings(cgroups) - - print(rr) - params["cg_resources_requirements"] = rr - - return params - - def test_empty_params(self): - c = self.condor - params = {"job_id": "test_job_id", "user_id": "test", "token": "test_token"} - with self.assertRaisesRegex( - Exception, "cg_resources_requirements not found in params" - ): - c.create_submit(params) - - def test_create_submit_file(self): - # Test with empty clientgroup - logging.info("Testing with njs clientgroup") - c = self.condor - params = self._create_sample_params(cgroups=["njs"]) - - default_sub = c.create_submit(params) - - sub = default_sub - self.assertEqual(sub["executable"], c.initial_dir + "/" + c.executable) - self.assertEqual(sub["arguments"], f"{params['job_id']} {c.ee_endpoint}") - self.assertEqual(sub["universe"], "vanilla") - self.assertEqual(sub["+AccountingGroup"], '"' + params["user_id"] + '"') - self.assertEqual(sub["Concurrency_Limits"], params["user_id"]) - self.assertEqual(sub["+Owner"], '"condor_pool"') - self.assertEqual(sub["ShouldTransferFiles"], "YES") - self.assertEqual(sub["When_To_Transfer_Output"], "ON_EXIT_OR_EVICT") - - self.assertEqual(sub[Condor.REQUEST_CPUS], c.config["njs"][Condor.REQUEST_CPUS]) - self.assertEqual( - sub[Condor.REQUEST_MEMORY], c.config["njs"][Condor.REQUEST_MEMORY] - ) - self.assertEqual(sub[Condor.REQUEST_DISK], c.config["njs"][Condor.REQUEST_DISK]) - - # TODO Test this variable somehow - # environment = sub["environment"].split(" ") - - # Test with filled out clientgroup - logging.info("Testing with complex-empty clientgroup") - - params = self._create_sample_params( - cgroups=["njs,request_cpus=8,request_memory=10GB,request_apples=5"] - ) - - njs_sub = c.create_submit(params) - sub = njs_sub - - self.assertIn("njs", sub["requirements"]) - - self.assertIn('regexp("njs",CLIENTGROUP)', sub["requirements"]) - - self.assertIn('request_apples == "5"', sub["requirements"]) - - self.assertEqual(sub[Condor.REQUEST_CPUS], "8") - self.assertEqual(sub[Condor.REQUEST_MEMORY], "10GB") - self.assertEqual(sub[Condor.REQUEST_DISK], c.config["njs"][Condor.REQUEST_DISK]) - - logging.info("Testing with regex disabled in old format (no effect)") - - # with self.assertRaisesRegex( - # ValueError, "Illegal argument! 
Old format does not support this option" - # ): - # params = self._create_sample_params( - # cgroups=["njs,request_cpus=8,request_memory=10GB,request_apples=5,client_group_regex=False"] - # ) - # c.create_submit(params) # pragma: no cover - - # Test with json version of clientgroup - - logging.info("Testing with empty clientgroup defaulting to njs") - - params = self._create_sample_params(cgroups="") - - empty_sub = c.create_submit(params) - sub = empty_sub - - self.assertEqual(sub[Condor.REQUEST_CPUS], c.config["njs"][Condor.REQUEST_CPUS]) - self.assertEqual( - sub[Condor.REQUEST_MEMORY], c.config["njs"][Condor.REQUEST_MEMORY] - ) - self.assertEqual(sub[Condor.REQUEST_DISK], c.config["njs"][Condor.REQUEST_DISK]) - - # logging.info("Testing with empty dict (raises typeerror)") - # - # with self.assertRaises(TypeError): - # params = self._create_sample_params(cgroups={}) - # print(params) - # empty_json_sub = c.create_submit(params) - - logging.info("Testing with empty dict as a string ") - - params = self._create_sample_params(cgroups=["{}"]) - - empty_json_sub = c.create_submit(params) - - params = self._create_sample_params(cgroups=['{"client_group" : "njs"}']) - - json_sub = c.create_submit(params) - - params = self._create_sample_params( - cgroups=['{"client_group" : "njs", "client_group_regex" : "false"}'] - ) - - json_sub_with_regex_disabled_njs = c.create_submit(params) - - # json_sub_with_regex_disabled - - logging.info("Testing with real valid json ") - for sub in [empty_json_sub, json_sub, json_sub_with_regex_disabled_njs]: - self.assertEqual( - sub[Condor.REQUEST_CPUS], c.config["njs"][Condor.REQUEST_CPUS] - ) - self.assertEqual( - sub[Condor.REQUEST_MEMORY], c.config["njs"][Condor.REQUEST_MEMORY] - ) - self.assertEqual( - sub[Condor.REQUEST_DISK], c.config["njs"][Condor.REQUEST_DISK] - ) - - with self.assertRaises(ValueError): - logging.info("Testing with real json invalid cgroup {bigmemzlong} ") - params = self._create_sample_params( - cgroups='{"client_group" : "bigmemzlong", "client_group_regex" : "FaLsE"}' - ) - - # json_sub_with_regex_disabled - c.create_submit(params) - - logging.info("Testing with real json, regex disabled, bigmem") - - params = self._create_sample_params( - cgroups=['{"client_group" : "bigmem", "client_group_regex" : "FaLsE"}'] - ) - - json_sub_with_regex_disabled_bigmem = c.create_submit(params) - self.assertIn( - '(CLIENTGROUP == "bigmem', - json_sub_with_regex_disabled_bigmem["requirements"], - ) - - def _get_concierge_params(self, cg=None): - cp = {} - cp["request_cpus"] = 100 - cp["request_memory"] = 200 - cp["request_disk"] = 1000 - if cg: - cp["client_group"] = cg - return ConciergeParams(**cp) - - def test_create_submit_file_concierge(self): - logging.info("Testing with concierge clientgroup") - c = self.condor - params = self._create_sample_params(cgroups=["njs"]) - cp = self._get_concierge_params() - sub = c.create_submit(params=params, concierge_params=cp) - # Concurrency limits removed - self.assertNotIn("Concurrency_Limits", sub) - self.assertEqual(sub["+AccountingGroup"], '"' + params["user_id"] + '"') - self.assertEqual(sub[Condor.REQUEST_CPUS], str(cp.request_cpus)) - self.assertEqual(sub[Condor.REQUEST_MEMORY], str(cp.request_memory)) - self.assertEqual(sub[Condor.REQUEST_DISK], str(cp.request_disk)) - self.assertEqual(sub["+KB_CLIENTGROUP"], f'"{str(cp.client_group)}"') - - cp.client_group = "LeConcierge" - cp.account_group = "LeCat" - sub2 = c.create_submit(params=params, concierge_params=cp) - 
self.assertEqual(sub2["+KB_CLIENTGROUP"], f'"{str(cp.client_group)}"') - self.assertEqual(sub2["+AccountingGroup"], '"' + cp.account_group + '"') - self.assertNotIn("Concurrency_Limits", sub2) - - # submission_info = c.run_submit(sub2) - # - # self.assertIsNotNone(submission_info.clusterid) - # self.assertIsNotNone(submission_info.submit) - # self.assertIsNone(submission_info.error) - - # - # def test_extract(self): - # logging.info("Testing with concierge clientgroup") - # c = self.condor - # params = self._create_sample_params(cgroups=["njs"]) - # cp = self._get_concierge_params() - # sub = c.create_submit(params=params, concierge_params=cp) - # submission_info = c.run_submit(sub) - # print(submission_info) - # - # - # def test_get_usage(self): - # job_id = '732' - # print(self.condor.get_job_resource_info(cluster_id=job_id)) diff --git a/test/tests_for_sdkmr/job_submission_parameters_test.py b/test/tests_for_sdkmr/job_submission_parameters_test.py new file mode 100644 index 000000000..33ab3e16b --- /dev/null +++ b/test/tests_for_sdkmr/job_submission_parameters_test.py @@ -0,0 +1,644 @@ +from pytest import raises +from execution_engine2.sdk.job_submission_parameters import ( + JobRequirements, + JobSubmissionParameters, +) +from execution_engine2.utils.user_info import UserCreds +from execution_engine2.utils.application_info import AppInfo +from execution_engine2.exceptions import IncorrectParamsException +from utils_shared.test_utils import assert_exception_correct + + +def test_job_req_init_minimal(): + jr = JobRequirements(1, 1, 1, "njs") + + assert jr.cpus == 1 + assert jr.memory_MB == 1 + assert jr.disk_GB == 1 + assert jr.client_group == "njs" + assert jr.client_group_regex is None + assert jr.bill_to_user is None + assert jr.ignore_concurrency_limits is False + assert jr.scheduler_requirements == {} + assert jr.debug_mode is False + + +def test_job_req_init_maximal(): + jr = JobRequirements( + 6, + 7, + 8, + " bigmemlong \t ", + True, + "\tsomeuser ", + True, + {"proc": "x286", "maxmem": "640k"}, + True, + ) + + assert jr.cpus == 6 + assert jr.memory_MB == 7 + assert jr.disk_GB == 8 + assert jr.client_group == "bigmemlong" + assert jr.client_group_regex is True + assert jr.bill_to_user == "someuser" + assert jr.ignore_concurrency_limits is True + assert jr.scheduler_requirements == {"proc": "x286", "maxmem": "640k"} + assert jr.debug_mode is True + + +def test_job_req_init_non_bools(): + for inp, expected in { + 1: True, + " ": True, + (1,): True, + 0: False, + "": False, + tuple(): False, + }.items(): + jr = JobRequirements( + 6, + 7, + 8, + "cg", + client_group_regex=inp, + ignore_concurrency_limits=inp, + debug_mode=inp, + ) + + assert jr.client_group_regex is expected + assert jr.ignore_concurrency_limits is expected + assert jr.debug_mode is expected + + +def test_job_req_init_None_for_bools(): + jr = JobRequirements( + 6, + 7, + 8, + "cg", + client_group_regex=None, + ignore_concurrency_limits=None, + debug_mode=None, + ) + + assert jr.client_group_regex is None + assert jr.ignore_concurrency_limits is False + assert jr.debug_mode is False + + +def test_job_req_init_fail(): + n = None + _job_req_init_fail( + n, 1, 1, "f", n, n, IncorrectParamsException("CPU count must be at least 1") + ) + _job_req_init_fail( + 0, 1, 1, "f", n, n, IncorrectParamsException("CPU count must be at least 1") + ) + _job_req_init_fail( + 1, n, 1, "f", n, n, IncorrectParamsException("memory in MB must be at least 1") + ) + _job_req_init_fail( + 1, 0, 1, "f", n, n, IncorrectParamsException("memory in 
MB must be at least 1") + ) + _job_req_init_fail( + 1, + 1, + n, + "f", + n, + n, + IncorrectParamsException("disk space in GB must be at least 1"), + ) + _job_req_init_fail( + 1, + 1, + 0, + "f", + n, + n, + IncorrectParamsException("disk space in GB must be at least 1"), + ) + _job_req_init_fail( + 1, + 1, + 1, + n, + n, + n, + IncorrectParamsException("Missing input parameter: client_group"), + ) + _job_req_init_fail( + 1, + 1, + 1, + " \t ", + n, + n, + IncorrectParamsException("Missing input parameter: client_group"), + ) + # as_user is optional, so this is the only possible failure mode + _job_req_init_fail( + 1, + 1, + 1, + "f", + "user\tname", + n, + IncorrectParamsException("bill_to_user contains control characters"), + ) + _job_req_init_fail( + 1, + 1, + 1, + "f", + n, + {n: "a"}, + IncorrectParamsException( + "Missing input parameter: key in scheduler requirements structure" + ), + ) + _job_req_init_fail( + 1, + 1, + 1, + "f", + n, + {" \t ": "a"}, + IncorrectParamsException( + "Missing input parameter: key in scheduler requirements structure" + ), + ) + _job_req_init_fail( + 1, + 1, + 1, + "f", + n, + {"a": n}, + IncorrectParamsException( + "Missing input parameter: value for key 'a' in scheduler requirements structure" + ), + ) + _job_req_init_fail( + 1, + 1, + 1, + "f", + n, + {"a": " \t "}, + IncorrectParamsException( + "Missing input parameter: value for key 'a' in scheduler requirements structure" + ), + ) + + +def _job_req_init_fail(cpus, mem, disk, cgroup, user, reqs, expected): + with raises(Exception) as got: + JobRequirements(cpus, mem, disk, cgroup, False, user, False, reqs) + assert_exception_correct(got.value, expected) + + +def test_job_req_check_parameters_no_input(): + n = None + assert JobRequirements.check_parameters() == (n, n, n, n, n, n, n, {}, n) + assert JobRequirements.check_parameters(n, n, n, n, n, n, n, n, n) == ( + n, + n, + n, + n, + n, + n, + n, + {}, + n, + ) + + +def test_job_req_check_parameters_full_input(): + assert ( + JobRequirements.check_parameters( + 1, + 1, + 1, + " b ", + "x", + " user ", + 890, + {"proc": "x286", "maxmem": "640k"}, + [], + ) + == (1, 1, 1, "b", True, "user", True, {"proc": "x286", "maxmem": "640k"}, False) + ) + + +def test_job_req_check_parameters_whitespace_as_user(): + assert ( + JobRequirements.check_parameters( + 1, + 1, + 1, + " b ", + 0, + " \t ", + 890, + {"proc": "x286", "maxmem": "640k"}, + 1, + ) + == (1, 1, 1, "b", False, None, True, {"proc": "x286", "maxmem": "640k"}, True) + ) + + +def test_job_req_check_parameters_fail(): + n = None + _job_req_check_parameters_fail( + 0, 1, 1, "c", "u", n, IncorrectParamsException("CPU count must be at least 1") + ) + _job_req_check_parameters_fail( + 1, + 0, + 1, + "c", + "u", + n, + IncorrectParamsException("memory in MB must be at least 1"), + ) + _job_req_check_parameters_fail( + 1, + 1, + 0, + "c", + "u", + n, + IncorrectParamsException("disk space in GB must be at least 1"), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + " \t ", + "u", + n, + IncorrectParamsException("Missing input parameter: client_group"), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + " j\bi ", + n, + IncorrectParamsException("bill_to_user contains control characters"), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + "u", + {None: 1}, + IncorrectParamsException( + "Missing input parameter: key in scheduler requirements structure" + ), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + "u", + {"a": None}, + IncorrectParamsException( + "Missing input 
parameter: value for key 'a' in scheduler requirements structure" + ), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + "u", + {" \t ": 1}, + IncorrectParamsException( + "Missing input parameter: key in scheduler requirements structure" + ), + ) + _job_req_check_parameters_fail( + 1, + 1, + 1, + "c", + "u", + {"b": " \t "}, + IncorrectParamsException( + "Missing input parameter: value for key 'b' in scheduler requirements structure" + ), + ) + + +def _job_req_check_parameters_fail(cpu, mem, disk, cgroup, user, reqs, expected): + with raises(Exception) as got: + JobRequirements(cpu, mem, disk, cgroup, True, user, True, reqs) + assert_exception_correct(got.value, expected) + + +def test_job_req_equals(): + c1 = "cligroupf" + c1a = "cligroupf" + c2 = "cligroupg" + t = True + f = False + u1 = "user1" + u1a = "user1" + u2 = "user2" + r1 = {"a": "b"} + r1a = {"a": "b"} + r2 = {"a": "c"} + + jr_sm = JobRequirements(1, 1, 1, c1) + jr_lg = JobRequirements(1, 1, 1, c1, t, u1, f, r1, t) + + assert jr_sm == JobRequirements(1, 1, 1, c1a) + assert jr_lg == JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a, t) + + assert jr_sm != JobRequirements(2, 1, 1, c1a) + assert jr_sm != JobRequirements(1, 2, 1, c1a) + assert jr_sm != JobRequirements(1, 1, 2, c1a) + assert jr_sm != JobRequirements(1, 1, 1, c2) + assert jr_sm != (1, 1, 1, c1) + + assert jr_lg != JobRequirements(1, 1, 1, c1a, f, u1a, f, r1a, t) + assert jr_lg != JobRequirements(1, 1, 1, c1a, t, u2, f, r1a, t) + assert jr_lg != JobRequirements(1, 1, 1, c1a, t, u1a, t, r1a, t) + assert jr_lg != JobRequirements(1, 1, 1, c1a, t, u1a, f, r2, t) + assert jr_lg != JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a, f) + assert jr_lg != (1, 1, 1, c1a, t, u1a, f, r1a, t) + + +def test_job_req_hash(): + # hashes will change from instance to instance of the python interpreter, and therefore + # tests can't be written that directly test the hash value. 
See + # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + c1 = "cligroupf" + c1a = "cligroupf" + c2 = "cligroupg" + t = True + f = False + u1 = "user1" + u1a = "user1" + u2 = "user2" + r1 = {"a": "b"} + r1a = {"a": "b"} + r2 = {"a": "c"} + + jr_sm = JobRequirements(1, 1, 1, c1) + jr_lg = JobRequirements(1, 1, 1, c1, t, u1, f, r1, t) + + assert hash(jr_sm) == hash(JobRequirements(1, 1, 1, c1a)) + assert hash(jr_lg) == hash(JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a, t)) + + assert hash(jr_sm) != hash(JobRequirements(2, 1, 1, c1a)) + assert hash(jr_sm) != hash(JobRequirements(1, 2, 1, c1a)) + assert hash(jr_sm) != hash(JobRequirements(1, 1, 2, c1a)) + assert hash(jr_sm) != hash(JobRequirements(1, 1, 1, c2)) + + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, f, u1a, f, r1a, t)) + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, t, u2, f, r1a, t)) + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, t, u1a, t, r1a, t)) + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, t, u1a, f, r2, t)) + assert hash(jr_lg) != hash(JobRequirements(1, 1, 1, c1a, t, u1a, f, r1a, f)) + + +def test_job_sub_init_minimal(): + jsp = JobSubmissionParameters( + "jobid", + AppInfo("a.b", "a/x"), + JobRequirements(6, 7, 4, "cligroup"), + UserCreds("user", "tokeytoken"), + ) + + assert jsp.job_id == "jobid" + assert jsp.app_info == AppInfo("a.b", "a/x") + assert jsp.job_reqs == JobRequirements(6, 7, 4, "cligroup") + assert jsp.user_creds == UserCreds("user", "tokeytoken") + assert jsp.parent_job_id is None + assert jsp.wsid is None + assert jsp.source_ws_objects == tuple() + + +def test_job_sub_init_maximal(): + jsp = JobSubmissionParameters( + " jobid \t ", + AppInfo("a.b", "a/x"), + JobRequirements(6, 7, 4, "cligroup"), + UserCreds("user", "tokeytoken"), + " parentid \t ", + 1, + [" 1 /\t2 / 4", "6/7/8"], + ) + + assert jsp.job_id == "jobid" + assert jsp.app_info == AppInfo("a.b", "a/x") + assert jsp.job_reqs == JobRequirements(6, 7, 4, "cligroup") + assert jsp.user_creds == UserCreds("user", "tokeytoken") + assert jsp.parent_job_id == "parentid" + assert jsp.wsid == 1 + assert jsp.source_ws_objects == ("1/2/4", "6/7/8") + + +def test_job_sub_init_fail(): + n = None + j = "jobby job job" + a = AppInfo("a.b", "a/x") + r = JobRequirements(6, 7, 4, "cligroup") + u = UserCreds("user", "tokeytoken") + + _job_sub_init_fail( + n, a, r, u, n, n, n, IncorrectParamsException("Missing input parameter: job_id") + ) + _job_sub_init_fail( + " \t ", + a, + r, + u, + n, + n, + n, + IncorrectParamsException("Missing input parameter: job_id"), + ) + _job_sub_init_fail( + j, + n, + r, + u, + n, + n, + n, + ValueError("app_info cannot be a value that evaluates to false"), + ) + _job_sub_init_fail( + j, + a, + n, + u, + n, + n, + n, + ValueError("job_reqs cannot be a value that evaluates to false"), + ) + _job_sub_init_fail( + j, + a, + r, + n, + n, + n, + n, + ValueError("user_creds cannot be a value that evaluates to false"), + ) + # the only way to get parent id to to fail is with a control char + _job_sub_init_fail( + j, + a, + r, + u, + "par\bent", + n, + n, + IncorrectParamsException("parent_job_id contains control characters"), + ) + _job_sub_init_fail( + j, a, r, u, n, 0, n, IncorrectParamsException("wsid must be at least 1") + ) + _job_sub_init_fail( + j, + a, + r, + u, + n, + n, + ["1/2/3", n], + IncorrectParamsException( + "source_ws_objects index 1, 'None', is not a valid Unique Permanent Address" + ), + ) + _job_sub_init_fail( + j, + a, + r, + u, + n, + n, + {"1/2/3": 
"5/6/7"}, + IncorrectParamsException("source_ws_objects must be a list"), + ) + _job_sub_init_fail( + j, + a, + r, + u, + n, + n, + ["1/2/3", " \t "], + IncorrectParamsException( + "source_ws_objects index 1, ' \t ', is not a valid Unique Permanent Address" + ), + ) + for o in ["1/2", "1/2/", "/1/2", "1/2/3/4", "x/2/3", "1/x/3", "1/2/x"]: + _job_sub_init_fail( + j, + a, + r, + u, + n, + n, + [o], + IncorrectParamsException( + f"source_ws_objects index 0, '{o}', is not a valid Unique Permanent Address" + ), + ) + + +def _job_sub_init_fail(jobid, appinfo, jobreq, usercred, parentid, wsid, wso, expected): + with raises(Exception) as got: + JobSubmissionParameters(jobid, appinfo, jobreq, usercred, parentid, wsid, wso) + assert_exception_correct(got.value, expected) + + +def test_job_sub_equals(): + j1 = "jobby job job" + j1a = "jobby job job" + j2 = "jobby job job JOB" + a1 = AppInfo("a.b", "a/x") + a1a = AppInfo("a.b", "a/x") + a2 = AppInfo("a.b", "a/y") + r1 = JobRequirements(6, 7, 4, "cligroup") + r1a = JobRequirements(6, 7, 4, "cligroup") + r2 = JobRequirements(6, 7, 4, "cligroup2") + u1 = UserCreds("user", "tokeytoken") + u1a = UserCreds("user", "tokeytoken") + u2 = UserCreds("user", "tokeytoken2") + p1 = "I'm so miserable and you just don't care" + p1a = "I'm so miserable and you just don't care" + p2 = "Oh do shut up Portia" + w1 = ["1/2/3"] + w1a = ["1/2/3"] + w2 = ["1/2/4"] + + JSP = JobSubmissionParameters + jsp_sm = JSP(j1, a1, r1, u1) + jsp_lg = JSP(j1, a1, r1, u1, p1, 1, w1) + + assert jsp_sm == JSP(j1a, a1a, r1a, u1a) + assert jsp_lg == JSP(j1a, a1a, r1a, u1a, p1a, 1, w1a) + + assert jsp_sm != JSP(j2, a1a, r1a, u1a) + assert jsp_sm != JSP(j1a, a2, r1a, u1a) + assert jsp_sm != JSP(j1a, a1a, r2, u1a) + assert jsp_sm != JSP(j1a, a1a, r1a, u2) + assert jsp_sm != (j1a, a1a, r1a, u1a) + + assert jsp_lg != JSP(j1a, a1a, r1a, u1a, p2, 1, w1a) + assert jsp_lg != JSP(j1a, a1a, r1a, u1a, p1a, 2, w1a) + assert jsp_lg != JSP(j1a, a1a, r1a, u1a, p1a, 1, w2) + assert jsp_lg != (j1a, a1a, r1a, u1a, p1a, 1, w1a) + + +def test_job_sub_hash(): + # hashes will change from instance to instance of the python interpreter, and therefore + # tests can't be written that directly test the hash value. 
See + # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + j1 = "jobby job job" + j1a = "jobby job job" + j2 = "jobby job job JOB" + a1 = AppInfo("a.b", "a/x") + a1a = AppInfo("a.b", "a/x") + a2 = AppInfo("a.b", "a/y") + r1 = JobRequirements(6, 7, 4, "cligroup") + r1a = JobRequirements(6, 7, 4, "cligroup") + r2 = JobRequirements(6, 7, 4, "cligroup2") + u1 = UserCreds("user", "tokeytoken") + u1a = UserCreds("user", "tokeytoken") + u2 = UserCreds("user", "tokeytoken2") + p1 = "I'm so miserable and you just don't care" + p1a = "I'm so miserable and you just don't care" + p2 = "Oh do shut up Portia" + w1 = ["1/2/3"] + w1a = ["1/2/3"] + w2 = ["1/2/4"] + + JSP = JobSubmissionParameters + jsp_sm = JSP(j1, a1, r1, u1) + jsp_lg = JSP(j1, a1, r1, u1, p1, 1, w1) + + assert hash(jsp_sm) == hash(JSP(j1a, a1a, r1a, u1a)) + assert hash(jsp_lg) == hash(JSP(j1a, a1a, r1a, u1a, p1a, 1, w1a)) + + assert hash(jsp_sm) != hash(JSP(j2, a1a, r1a, u1a)) + assert hash(jsp_sm) != hash(JSP(j1a, a2, r1a, u1a)) + assert hash(jsp_sm) != hash(JSP(j1a, a1a, r2, u1a)) + assert hash(jsp_sm) != hash(JSP(j1a, a1a, r1a, u2)) + + assert hash(jsp_lg) != hash(JSP(j1a, a1a, r1a, u1a, p2, 1, w1a)) + assert hash(jsp_lg) != hash(JSP(j1a, a1a, r1a, u1a, p1a, 2, w1a)) + assert hash(jsp_lg) != hash(JSP(j1a, a1a, r1a, u1a, p1a, 1, w2)) diff --git a/test/tests_for_utils/Condor_test.py b/test/tests_for_utils/Condor_test.py new file mode 100644 index 000000000..d3c32d09e --- /dev/null +++ b/test/tests_for_utils/Condor_test.py @@ -0,0 +1,212 @@ +""" +Unit tests for the Condor wrapper. +""" + +# TODO Add tests for get_job_resource_info and cancel_job + +import htcondor +from unittest.mock import create_autospec + +from execution_engine2.sdk.job_submission_parameters import ( + JobSubmissionParameters, + JobRequirements, +) +from execution_engine2.utils.application_info import AppInfo +from execution_engine2.utils.user_info import UserCreds +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.CondorTuples import SubmissionInfo + +# Note the executable existence code in the constructor appears to be buggy and will never +# throw an error. If it checks for existence of initial-dir/executable as well as just executable +# that makes testing a bit ungainly as executable will have to exist in the current directory. 
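The mocking style used throughout this file relies on create_autospec(..., spec_set=True) rather than bare MagicMocks, so that attribute typos or calls outside the real htcondor API fail loudly instead of silently returning another mock. A small self-contained sketch of that difference, using a hypothetical FakeSchedd class that is not part of this PR:

from unittest.mock import MagicMock, create_autospec


class FakeSchedd:  # hypothetical stand-in, used only for this illustration
    def transaction(self):
        ...


loose = MagicMock()
loose.transactoin()  # typo, but a plain MagicMock silently returns another mock

strict = create_autospec(FakeSchedd, spec_set=True, instance=True)
strict.transaction()  # fine: the attribute exists on the real class
try:
    strict.transactoin()  # typo: rejected, the attribute is not part of the spec
except AttributeError:
    print("autospec caught the typo")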
+# TODO fix the executable existence bug in the Condor constructor + + +def _mock_htc(): + htc = create_autospec(htcondor, spec_set=True) + sub = create_autospec(htcondor.Submit, spec_set=True, instance=True) + htc.Submit.return_value = sub + schedd = create_autospec(htcondor.Schedd, spec_set=True, instance=True) + htc.Schedd.return_value = schedd + txn = create_autospec(htcondor.Transaction, spec_set=True, instance=True) + # mock context manager ops + schedd.transaction.return_value = txn + txn.__enter__.return_value = txn + return htc, sub, schedd, txn + + +def _get_common_sub(job_id): + return { + "universe": "vanilla", + "ShouldTransferFiles": "YES", + "on_exit_hold": "ExitCode =!= 0", + "JobLeaseDuration": "43200", + "MaxJobRetirementTime": "43200", + "Periodic_Hold": "( RemoteWallClockTime > 604800 )", + "log": "runner_logs/$(Cluster).$(Process).log", + "error": f"runner_logs/{job_id}.err", + "output": f"runner_logs/{job_id}.out", + "transfer_output_remaps": f'"runner_logs/{job_id}.err=cluster_logs/{job_id}.err;' + + f'runner_logs/{job_id}.out=cluster_logs/{job_id}.out"', + "When_To_Transfer_Output": "ON_EXIT_OR_EVICT", + "getenv": "false", + } + + +def _check_calls(htc, schedd, sub, txn, expected_sub): + htc.Submit.assert_called_once_with(expected_sub) + htc.Schedd.assert_called_once_with() + schedd.transaction.assert_called_once_with() + sub.queue.assert_called_once_with(txn, 1) + + +def test_run_job_minimal(): + htc, sub, schedd, txn = _mock_htc() + c = Condor( + { + "external-url": "https://fake.com", + "executable": "file.exe", + "catalog-token": "cattoken", + }, + htc=htc, + ) + sub.queue.return_value = 123 + + subinfo = c.run_job( + JobSubmissionParameters( + "jobbyjob", + AppInfo("foo.bar"), + JobRequirements(2, 3, 4, "cg"), + UserCreds("user1", "token"), + ) + ) + # presumably sub being part of the submission info is a bug. I assume that it's intended + # to be the submission dictionary. However, that contains admin tokens and SubmissionInfo + # gets logged so maybe it's better this way. + assert subinfo == SubmissionInfo("123", sub, None) + + expected_sub = _get_common_sub("jobbyjob") + expected_sub.update( + { + "JobBatchName": "jobbyjob", + "arguments": "jobbyjob https://fake.com", + "+KB_PARENT_JOB_ID": "", + "+KB_MODULE_NAME": '"foo"', + "+KB_FUNCTION_NAME": '"bar"', + "+KB_APP_ID": "", + "+KB_APP_MODULE_NAME": "", + "+KB_WSID": "", + "+KB_SOURCE_WS_OBJECTS": "", + "request_cpus": "2", + "request_memory": "3MB", + "request_disk": "4GB", + "requirements": 'regexp("cg",CLIENTGROUP)', + "+KB_CLIENTGROUP": '"cg"', + "Concurrency_Limits": "user1", + "+AccountingGroup": '"user1"', + "environment": ( + '"DOCKER_JOB_TIMEOUT=604801 KB_ADMIN_AUTH_TOKEN=cattoken KB_AUTH_TOKEN=token ' + + "CLIENTGROUP=cg JOB_ID=jobbyjob CONDOR_ID=$(Cluster).$(Process) " + + 'PYTHON_EXECUTABLE=/miniconda/bin/python DEBUG_MODE=False PARENT_JOB_ID= "' + ), + "leavejobinqueue": "True", + "initial_dir": "/condor_shared", + "+Owner": '"condor_pool"', + "executable": "/condor_shared/file.exe", + "transfer_input_files": "/condor_shared/JobRunner.tgz", + } + ) + _check_calls(htc, schedd, sub, txn, expected_sub) + + +def test_run_job_maximal_with_concurrency_limits(): + """ + Tests with all constructor arguments and method arguments with concurrency limits. + """ + _run_job_maximal(True, {}) + + +def test_run_job_maximal_without_concurrency_limits(): + """ + Tests with all constructor arguments and method arguments without concurrency limits. 
+ """ + _run_job_maximal(False, {"Concurrency_Limits": "sucker"}) + + +def _run_job_maximal(ignore_concurrency_limits, update): + htc, sub, schedd, txn = _mock_htc() + c = Condor( + { + "external-url": "https://fake2.com", + "executable": "somefile.exe", + "catalog-token": "catsupertoken", + "PYTHON_EXECUTABLE": "python1.3", + "initialdir": "/somedir", + "docker_timeout": 42, + "pool_user": "thosedamnkidsnextdoor", + "leavejobinqueue": "False", + "transfer_input_files": "alan_alda_nude.tiff", + }, + htc=htc, + ) + + sub.queue.return_value = 789 + + subinfo = c.run_job( + JobSubmissionParameters( + "a_job_id", + AppInfo("kb_quast.run_quast_app", "kb_quast/run_QUAST_app"), + JobRequirements( + 6, + 28, + 496, + "clientclientclient", + client_group_regex=False, + bill_to_user="sucker", + ignore_concurrency_limits=ignore_concurrency_limits, + scheduler_requirements={"a": "b", "c": "d"}, + debug_mode=True, + ), + UserCreds("user2", "suparsekrit"), + parent_job_id="old_n_gross", + wsid=89, + source_ws_objects=["1/2/3", "4/5/7"], + ) + ) + # presumably sub being part of the submission info is a bug. I assume that it's intended + # to be the submission dictionary. However, that contains admin tokens and SubmissionInfo + # gets logged so maybe it's better this way. + assert subinfo == SubmissionInfo("789", sub, None) + + expected_sub = _get_common_sub("a_job_id") + expected_sub.update(update) + expected_sub.update( + { + "JobBatchName": "a_job_id", + "arguments": "a_job_id https://fake2.com", + "+KB_PARENT_JOB_ID": '"old_n_gross"', + "+KB_MODULE_NAME": '"kb_quast"', + "+KB_FUNCTION_NAME": '"run_quast_app"', + "+KB_APP_ID": '"kb_quast/run_QUAST_app"', + "+KB_APP_MODULE_NAME": '"kb_quast"', + "+KB_WSID": '"89"', + "+KB_SOURCE_WS_OBJECTS": '"1/2/3,4/5/7"', + "request_cpus": "6", + "request_memory": "28MB", + "request_disk": "496GB", + "requirements": '(CLIENTGROUP == "clientclientclient") && (a == "b") && (c == "d")', + "+KB_CLIENTGROUP": '"clientclientclient"', + "+AccountingGroup": '"sucker"', + "environment": ( + '"DOCKER_JOB_TIMEOUT=42 KB_ADMIN_AUTH_TOKEN=catsupertoken KB_AUTH_TOKEN=suparsekrit ' + + "CLIENTGROUP=clientclientclient JOB_ID=a_job_id CONDOR_ID=$(Cluster).$(Process) " + + 'PYTHON_EXECUTABLE=python1.3 DEBUG_MODE=True PARENT_JOB_ID=old_n_gross "' + ), + "leavejobinqueue": "False", + "initial_dir": "/somedir", + "+Owner": '"thosedamnkidsnextdoor"', + "executable": "/somedir/somefile.exe", + "transfer_input_files": "alan_alda_nude.tiff", + } + ) + _check_calls(htc, schedd, sub, txn, expected_sub) diff --git a/test/tests_for_utils/application_info_test.py b/test/tests_for_utils/application_info_test.py new file mode 100644 index 000000000..364cecd5d --- /dev/null +++ b/test/tests_for_utils/application_info_test.py @@ -0,0 +1,222 @@ +from pytest import raises +from execution_engine2.utils.application_info import AppInfo +from execution_engine2.exceptions import IncorrectParamsException +from utils_shared.test_utils import assert_exception_correct + + +def test_app_info_init_success_minimal_strict(): + ai = AppInfo(" \t mod . meth ") + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module is None + assert ai.application is None + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() is None + + +def test_app_info_init_success_no_app_id_strict(): + for appid in [None, " \t "]: + ai = AppInfo(" \t mod . 
meth ", appid) + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module is None + assert ai.application is None + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() is None + + +def test_app_info_init_success_strict_full(): + ai = AppInfo(" \t mod . meth ", "mod/ appthing") + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module == "mod" + assert ai.application == "appthing" + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() == "mod/appthing" + + +def test_app_info_init_success_strict_full_dot_separator(): + ai = AppInfo(" \t mod . meth ", "mod . appthing") + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module == "mod" + assert ai.application == "appthing" + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() == "mod.appthing" + + +def test_app_info_init_success_strict_with_app_module_only(): + ai = AppInfo(" \t mod . meth ", " mod \t ") + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module == "mod" + assert ai.application is None + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() == "mod" + + +def test_app_info_init_success_non_strict(): + ai = AppInfo(" \t mod . meth ", "mod2/ appthing", strict=False) + assert ai.module == "mod" + assert ai.method == "meth" + assert ai.application_module == "mod2" + assert ai.application == "appthing" + assert ai.get_method_id() == "mod.meth" + assert ai.get_application_id() == "mod2/appthing" + + +def test_app_info_init_fail(): + m = "m.n" + _app_info_init_fail( + None, + None, + False, + IncorrectParamsException("Missing input parameter: method ID"), + ) + _app_info_init_fail( + " \t ", + None, + False, + IncorrectParamsException("Missing input parameter: method ID"), + ) + _app_info_init_fail( + " method ", + None, + False, + IncorrectParamsException("Expected exactly one '.' in method ID 'method'"), + ) + _app_info_init_fail( + " mod.innermod.method ", + None, + False, + IncorrectParamsException( + "Expected exactly one '.' in method ID 'mod.innermod.method'" + ), + ) + _app_info_init_fail( + " . meth", + None, + False, + IncorrectParamsException( + "Missing input parameter: module portion of method ID" + ), + ) + _app_info_init_fail( + " mod . ", + None, + False, + IncorrectParamsException( + "Missing input parameter: method portion of method ID" + ), + ) + _app_info_init_fail( + m, + "mod / me\tth ", + False, + IncorrectParamsException("application ID contains control characters"), + ) + _app_info_init_fail( + m, + "mod / meth.bak ", + False, + IncorrectParamsException( + "Application ID 'mod / meth.bak' has both '/' and '.' 
separators" + ), + ) + _app_info_init_fail( + m, + "mod / meth / bak ", + False, + IncorrectParamsException( + "Expected exactly one '/' in application ID 'mod / meth / bak'" + ), + ) + _app_info_init_fail( + m, + "mod.meth.anothermeth", + False, + IncorrectParamsException( + "Expected exactly one '/' in application ID 'mod.meth.anothermeth'" + ), + ) + _app_info_init_fail( + "mod.meth", + " mod2 /meth", + True, + IncorrectParamsException( + "Application module 'mod2' must equal method module 'mod'" + ), + ) + + _app_info_init_fail( + m, + "mod/", + False, + IncorrectParamsException( + "Missing input parameter: application portion of application ID" + ), + ) + _app_info_init_fail( + m, + "/meth", + False, + IncorrectParamsException( + "Missing input parameter: module portion of application ID" + ), + ) + _app_info_init_fail( + m, + "mod. ", + False, + IncorrectParamsException( + "Missing input parameter: application portion of application ID" + ), + ) + _app_info_init_fail( + m, + " .meth", + False, + IncorrectParamsException( + "Missing input parameter: module portion of application ID" + ), + ) + + +def _app_info_init_fail(meth, app, strict, expected): + with raises(Exception) as got: + AppInfo(meth, app, strict) + assert_exception_correct(got.value, expected) + + +def test_equals(): + assert AppInfo("m.n") == AppInfo("m.n") + assert AppInfo("m.n", "m") == AppInfo("m.n", "m") + assert AppInfo("m.n", "m/p") == AppInfo("m.n", "m/p") + assert AppInfo("m.n", "m.p") == AppInfo("m.n", "m.p") + assert AppInfo("m.n", "p/p", False) == AppInfo("m.n", "p/p", False) + assert AppInfo("m.n", "p.p", False) == AppInfo("m.n", "p.p", False) + + assert AppInfo("m.n", "m/p", False) != AppInfo("n.n", "m/p", False) + assert AppInfo("m.n", "m/p") != AppInfo("m.x", "m/p") + assert AppInfo("m.n", "m/p") != AppInfo("m.n", "m.p") + assert AppInfo("m.n", "m/p", False) != AppInfo("m.n", "x/p", False) + assert AppInfo("m.n", "m/p") != AppInfo("m.n", "m/x") + assert AppInfo("m.n", "m/p") != ("m.n", "m/x") + + +def test_hashcode(): + # hashes will change from instance to instance of the python interpreter, and therefore + # tests can't be written that directly test the hash value. 
See + # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + assert hash(AppInfo("m.n")) == hash(AppInfo("m.n")) + assert hash(AppInfo("m.n", "m")) == hash(AppInfo("m.n", "m")) + assert hash(AppInfo("m.n", "m/p")) == hash(AppInfo("m.n", "m/p")) + assert hash(AppInfo("m.n", "m.p")) == hash(AppInfo("m.n", "m.p")) + assert hash(AppInfo("m.n", "p/p", False)) == hash(AppInfo("m.n", "p/p", False)) + assert hash(AppInfo("m.n", "p.p", False)) == hash(AppInfo("m.n", "p.p", False)) + + assert hash(AppInfo("m.n", "m/p", False)) != hash(AppInfo("n.n", "m/p", False)) + assert hash(AppInfo("m.n", "m/p")) != hash(AppInfo("m.x", "m/p")) + assert hash(AppInfo("m.n", "m/p")) != hash(AppInfo("m.n", "m.p")) + assert hash(AppInfo("m.n", "m/p", False)) != hash(AppInfo("m.n", "x/p", False)) + assert hash(AppInfo("m.n", "m/p")) != hash(AppInfo("m.n", "m/x")) diff --git a/test/tests_for_utils/arg_processing_test.py b/test/tests_for_utils/arg_processing_test.py new file mode 100644 index 000000000..f3282ca05 --- /dev/null +++ b/test/tests_for_utils/arg_processing_test.py @@ -0,0 +1,205 @@ +from pytest import raises + +import datetime +from execution_engine2.utils.arg_processing import ( + parse_bool, + check_string, + not_falsy, + not_falsy_in_iterable, + check_timestamp, +) +from execution_engine2.exceptions import IncorrectParamsException +from utils_shared.test_utils import assert_exception_correct + + +def test_parse_bool_success(): + testcases = { + None: False, + True: True, + False: False, + # ints + -1: True, + 1: True, + 0: False, + 100: True, + -100: True, + # floats + -1.3: True, + 1.7: True, + 100.89: True, + -100.7: True, + # ints as strings + "-1": True, + "1": True, + "0": False, + "100": True, + "-100": True, + # floats as strings + "-1.3": True, + "1.7": True, + "0.0": False, + "100.89": True, + "-100.7": True, + # booleans as strings + "True": True, + "TRUE": True, + "true": True, + "False": False, + "FALSE": False, + "false": False, + } + + for arg, expected in testcases.items(): + assert parse_bool(arg) is expected, f"Testcase: {arg}" + + # can't go in the hash since equivalent to 0 + assert parse_bool(0.0) is False + + +def test_parse_bool_failure(): + testcases = ["Truthy", "fawlse", " ", "f1", "f1.3"] + + for tc in testcases: + with raises(Exception) as e: + parse_bool(tc) + assert_exception_correct( + e.value, IncorrectParamsException(f"{tc} is not a boolean value") + ) + + +def test_falsy_true(): + for t in ["a", 1, True, [1], {"a": 1}, {1}]: + assert not_falsy(t, "foo") is t + + +def test_falsy_fail(): + for f in ["", 0, False, [], dict(), {}]: + with raises(Exception) as got: + not_falsy(f, "my name") + assert_exception_correct( + got.value, ValueError("my name cannot be a value that evaluates to false") + ) + + +def test_falsy_in_iterable_true(): + for t in [[], [1, "a"], [True], [{"foo"}]]: + assert not_falsy_in_iterable(t, "foo") is t + + +def test_falsy_in_iterable_allow_none(): + assert not_falsy_in_iterable(None, "yay", allow_none=True) is None + + +def test_falsy_in_iterable_no_iterable(): + with raises(Exception) as got: + not_falsy_in_iterable(None, "whee") + assert_exception_correct(got.value, ValueError("whee cannot be None")) + + +def test_falsy_in_iterable_false_insides(): + for item, pos in [ + [["", "bar"], 0], + [["foo", 0], 1], + [[True, True, False, True], 2], + [[[]], 0], + [[dict()], 0], + [[{}], 0], + ]: + with raises(Exception) as got: + not_falsy_in_iterable(item, "my name") + assert_exception_correct( + got.value, + ValueError( + f"Index {pos} of 
iterable my name cannot be a value that evaluates to false" + ), + ) + + +def test_check_string(): + for string, expected in { + " foo": "foo", + " \t baɷr ": "baɷr", + "baᚠz \t ": "baᚠz", + "bat": "bat", + "a" * 1000: "a" * 1000, + }.items(): + assert check_string(string, "name") == expected + + +def test_check_string_bad_max_len(): + for max_len in [0, -1, -100]: + with raises(Exception) as got: + check_string("str", "var name", max_len=max_len) + assert_exception_correct( + got.value, ValueError("max_len must be > 0 if provided") + ) + + +def test_check_string_optional_true(): + for string in [None, " \t "]: + assert check_string(string, "name", optional=True) is None + + +def test_check_string_optional_false(): + for string in [None, " \t "]: + with raises(Exception) as got: + check_string(string, "var name") + assert_exception_correct( + got.value, IncorrectParamsException("Missing input parameter: var name") + ) + + +def test_check_string_control_characters(): + for string in ["foo \b bar", "foo\u200bbar", "foo\0bar", "foo\bbar"]: + with raises(Exception) as got: + check_string(string, "var name") + assert_exception_correct( + got.value, IncorrectParamsException("var name contains control characters") + ) + + +def test_check_string_max_len(): + for string, length in { + "123456789": 9, + "a": 1, + "a" * 100: 100, + "a" * 10000: 10000, + "a" * 10000: 1000000, + }.items(): + assert check_string(string, "name", max_len=length) == string + + +def test_check_string_long_fail(): + for string, length in {"123456789": 8, "ab": 1, "a" * 100: 99}.items(): + with raises(Exception) as got: + check_string(string, "var name", max_len=length) + assert_exception_correct( + got.value, + IncorrectParamsException(f"var name exceeds maximum length of {length}"), + ) + + +def _dt(timestamp): + return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc) + + +def test_check_timestamp(): + for t in [-1000000, -256, -1, 0, 1, 6, 100, 100000000000]: + assert check_timestamp(_dt(t), "name") == _dt(t) + + +def test_check_timestamp_fail_bad_args(): + _check_timestamp_fail( + None, "ts", ValueError("ts cannot be a value that evaluates to false") + ) + _check_timestamp_fail( + datetime.datetime.now(), + "tymestampz", + ValueError("tymestampz cannot be a naive datetime"), + ) + + +def _check_timestamp_fail(ts, name, expected): + with raises(Exception) as got: + check_timestamp(ts, name) + assert_exception_correct(got.value, expected) diff --git a/test/tests_for_utils/catalog_cache_test.py b/test/tests_for_utils/catalog_cache_test.py new file mode 100644 index 000000000..0ae9380ba --- /dev/null +++ b/test/tests_for_utils/catalog_cache_test.py @@ -0,0 +1,213 @@ +# This test only tests code that can be exercised without a network connection to services. +# That code is tested in integration tests. 
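The tests in this file pin down a simple memoization contract: the first lookup for a given key calls the Catalog client, and later lookups return the cached answer even if the client would now return something different. A minimal sketch of that contract for the git-commit lookup, assuming only what the assertions below exercise (TinyVersionCache is a hypothetical stand-in, not the real CatalogCache):

class TinyVersionCache:  # hypothetical stand-in, for illustration only
    def __init__(self, catalog_client):
        self._catalog = catalog_client
        self._versions = {}  # {method: {version_tag: git_commit_hash}}

    def lookup_git_commit_version(self, method, service_ver=None):
        module_name = method.split(".")[0]  # "MEGAHIT.run_megahit" -> "MEGAHIT"
        tag = service_ver if service_ver else "release"  # None falls back to "release"
        per_method = self._versions.setdefault(method, {})
        if tag not in per_method:  # only the first lookup hits the catalog
            ver = self._catalog.get_module_version(
                {"module_name": module_name, "version": tag}
            )
            per_method[tag] = ver["git_commit_hash"]
        return per_method[tag]

The real CatalogCache also caches job resource requirements per module/function pair, which test_cc_job_reqs exercises in the same first-call-then-cache way.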
+from unittest.mock import create_autospec + +import pytest + +from execution_engine2.utils.catalog_cache import CatalogCache +from installed_clients.CatalogClient import Catalog +from utils_shared.test_utils import ( + assert_exception_correct, + CLIENT_GROUP_CONFIG, +) + + +@pytest.fixture +def catalog(): + return create_autospec(Catalog, spec_set=True, instance=True) + + +@pytest.fixture +def catalog_cache(): + return create_autospec(CatalogCache, spec_set=True, instance=True) + + +def test_fail_cc(): + with pytest.raises(ValueError) as e: + CatalogCache(None) + assert_exception_correct( + e.value, ValueError("Please provide instance of catalog client") + ) + + # Test that a new catalog call is made once more + with pytest.raises(ValueError) as e: + catalog_cache = CatalogCache(catalog=catalog) + catalog_cache.lookup_git_commit_version(method=None, service_ver="dev") + assert_exception_correct(e.value, ValueError("Must provide a method to lookup")) + + +def assert_call_count_and_return_val( + mock, call_count, return_value, expected_return_value +): + assert mock.call_count == call_count + assert return_value == expected_return_value + + +def test_get_catalog(catalog): + assert catalog == CatalogCache(catalog).get_catalog() + + +def test_cc_job_reqs(catalog): + """Test to see the job requirements cache is being used.""" + test_return = {"Test1"} + catalog.list_client_group_configs.return_value = test_return + catalog_cache = CatalogCache(catalog=catalog) + job_reqs_cache = catalog_cache.get_job_resources_cache() + + # Test Cache is called on second call + rv1 = catalog_cache.lookup_job_resource_requirements( + module_name="test1", function_name="test1" + ) + + assert catalog.list_client_group_configs.call_count == 1 + # Test to make sure it still returns values based on the catalog + assert rv1 == test_return + assert "test1" in job_reqs_cache and "test1" in job_reqs_cache["test1"] + catalog.list_client_group_configs.assert_called_with( + {"module_name": "test1", "function_name": "test1"} + ) + + catalog.list_client_group_configs.return_value = CLIENT_GROUP_CONFIG + catalog_cache._job_requirements_cache["test1"]["test1"] = "Something else" + rv2 = catalog_cache.lookup_job_resource_requirements( + module_name="test1", function_name="test1" + ) + # Test to make sure the catalog cache is being used this time, even though the underlying catalog record changed + assert rv2 != CLIENT_GROUP_CONFIG + assert rv2 == "Something else" + catalog.list_client_group_configs.assert_called_with( + {"module_name": "test1", "function_name": "test1"} + ) + + # Test to see a new catalog call is made + assert catalog.list_client_group_configs.call_count == 1 + catalog_cache.lookup_job_resource_requirements( + module_name="test1", function_name="test2" + ) + assert catalog.list_client_group_configs.call_count == 2 + assert "test1" in job_reqs_cache and "test2" in job_reqs_cache["test1"] + catalog.list_client_group_configs.assert_called_with( + {"module_name": "test1", "function_name": "test2"} + ) + + +def test_cc_job_reqs_internal_mutation(catalog): + """ + Tests that if a client alters the job requirements returned from the cache, it does not + affect the cache internals. + """ + catalog.list_client_group_configs.return_value = [{"client_groups": ["kb_upload"]}] + + cc = CatalogCache(catalog) + + # call #1. Depending on the implementation, the catalog info may be returned directly + # or added to the cache and the cache entry returned. 
+ assert cc.lookup_job_resource_requirements( + "kb_uploadmethods", "import_reads_from_staging" + ) == [{"client_groups": ["kb_upload"]}] + + # call #2. Regardless of the implementation, this data should be coming from the cache. + cgs = cc.lookup_job_resource_requirements( + "kb_uploadmethods", "import_reads_from_staging" + ) + assert cgs == [{"client_groups": ["kb_upload"]}] + + # Mutate the cache if the cache implementation allows it + cgs[0]["client_groups"].pop(0) # The job requirements resolver does this + + # call #3. Confirm that the cache was not mutated + assert cc.lookup_job_resource_requirements( + "kb_uploadmethods", "import_reads_from_staging" + ) == [{"client_groups": ["kb_upload"]}] + + # check there was only one call to the cache + catalog.list_client_group_configs.assert_called_once_with( + { + "module_name": "kb_uploadmethods", + "function_name": "import_reads_from_staging", + } + ) + + +def test_cc_git_commit_version(catalog): + """Test to see the git commit cache is being used.""" + catalog_cache = CatalogCache(catalog=catalog) + catalog_git_return_1 = {"git_commit_hash": "1234"} + catalog_git_return_2 = {"git_commit_hash": "12345"} + catalog.get_module_version.return_value = catalog_git_return_1 + method_version_cache = catalog_cache.get_method_version_cache() + + # Test Cache is called on second call + version = catalog_cache.lookup_git_commit_version( + method="method1", service_ver="any" + ) + + # Test to make sure return_value is correct + assert version == catalog_git_return_1["git_commit_hash"] + catalog.get_module_version.assert_called_with( + {"module_name": "method1", "version": "any"} + ) + + # Test to make sure same commit is returned regardless of underlying catalog data + catalog.get_module_version.return_value = catalog_git_return_2 + version2 = catalog_cache.lookup_git_commit_version( + method="method1", service_ver="any" + ) + assert version2 == catalog_git_return_1["git_commit_hash"] + catalog.get_module_version.assert_called_with( + {"module_name": "method1", "version": "any"} + ) + + catalog_cache.lookup_git_commit_version(method="method1", service_ver="any") + assert catalog.get_module_version.call_count == 1 + catalog.get_module_version.assert_called_with( + {"module_name": "method1", "version": "any"} + ) + catalog_cache.lookup_git_commit_version( + method="method1", + ) + assert catalog.get_module_version.call_count == 2 + catalog.get_module_version.assert_called_with( + {"module_name": "method1", "version": "release"} + ) + + assert method_version_cache["method1"] == {"any": "1234", "release": "12345"} + + # Test None defaults to release case + catalog_cache.lookup_git_commit_version(method="method3", service_ver=None) + catalog.get_module_version.assert_called_with( + {"module_name": "method3", "version": "release"} + ) + + assert None not in catalog_cache.get_method_version_cache()["method3"] + assert catalog_cache.get_method_version_cache()["method3"]["release"] + catalog.get_module_version.assert_called_with( + {"module_name": "method3", "version": "release"} + ) + + # Test module_name = method.split(".")[0] and call count + call_count = catalog.get_module_version.call_count + catalog_cache.lookup_git_commit_version( + method="MEGAHIT.run_megahit", service_ver="dev" + ) + catalog.get_module_version.assert_called_with( + {"module_name": "MEGAHIT", "version": "dev"} + ) + assert catalog.get_module_version.call_count == call_count + 1 + + # Test that the catalog is not called, from cache now + catalog_cache.lookup_git_commit_version( + 
method="MEGAHIT.run_megahit", service_ver="dev" + ) + assert catalog.get_module_version.call_count == call_count + 1 + + # Test that a new catalog call is made once more + catalog_cache.lookup_git_commit_version( + method="MEGAHIT.run_megahit2", service_ver="dev" + ) + catalog.get_module_version.assert_called_with( + {"module_name": "MEGAHIT", "version": "dev"} + ) + + assert method_version_cache["MEGAHIT.run_megahit"] == {"dev": "12345"} + assert method_version_cache["MEGAHIT.run_megahit2"] == {"dev": "12345"} diff --git a/test/tests_for_utils/clients_test.py b/test/tests_for_utils/clients_test.py new file mode 100644 index 000000000..acde30020 --- /dev/null +++ b/test/tests_for_utils/clients_test.py @@ -0,0 +1,144 @@ +# This test only tests code that can be exercised without a network connection to services. +# That code is tested in integration tests. + +from pytest import raises +from unittest.mock import create_autospec + +from execution_engine2.authorization.workspaceauth import WorkspaceAuth +from execution_engine2.utils.clients import ( + UserClientSet, + get_user_client_set, + ClientSet, +) +from utils_shared.test_utils import assert_exception_correct +from utils_shared.mock_utils import get_client_mocks, ALL_CLIENTS +from installed_clients.WorkspaceClient import Workspace + +from execution_engine2.authorization.roles import AdminAuthUtil +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.utils.Condor import Condor +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient + +from installed_clients.authclient import KBaseAuth +from installed_clients.CatalogClient import Catalog + + +def test_get_user_client_set_fail(): + ws_err = "missing workspace-url in configuration" + get_user_client_set_fail(None, "foo", "bar", ValueError("cfg is required")) + get_user_client_set_fail({}, "foo", "bar", ValueError("cfg is required")) + get_user_client_set_fail({"a": "b"}, "foo", "bar", ValueError(ws_err)) + get_user_client_set_fail({"workspace-url": None}, "foo", "bar", ValueError(ws_err)) + get_user_client_set_fail( + {"workspace-url": " \t "}, "foo", "bar", ValueError(ws_err) + ) + get_user_client_set_fail( + {"workspace-url": "https://ws.com"}, + None, + "bar", + ValueError("user_id is required"), + ) + get_user_client_set_fail( + {"workspace-url": "https://ws.com"}, + " \t ", + "bar", + ValueError("user_id is required"), + ) + get_user_client_set_fail( + {"workspace-url": "https://ws.com"}, + "foo", + None, + ValueError("token is required"), + ) + get_user_client_set_fail( + {"workspace-url": "https://ws.com"}, + "foo", + " \t ", + ValueError("token is required"), + ) + + +def get_user_client_set_fail(cfg, user, token, expected): + with raises(Exception) as e: + get_user_client_set(cfg, user, token) + assert_exception_correct(e.value, expected) + + +def test_user_client_set_init_fail(): + ws = create_autospec(Workspace, spec_set=True, instance=True) + wsa = WorkspaceAuth("u", ws) + user_client_set_init_fail(None, "t", ws, wsa, ValueError("user_id is required")) + user_client_set_init_fail(" \t ", "t", ws, wsa, ValueError("user_id is required")) + user_client_set_init_fail("u", None, ws, wsa, ValueError("token is required")) + user_client_set_init_fail("u", " \t ", ws, wsa, ValueError("token is required")) + user_client_set_init_fail("u", "t", None, wsa, ValueError("workspace is required")) + user_client_set_init_fail( 
+ "u", "t", ws, None, ValueError("workspace_auth is required") + ) + + +def user_client_set_init_fail(user, token, ws_client, ws_auth, expected): + with raises(Exception) as e: + UserClientSet(user, token, ws_client, ws_auth) + assert_exception_correct(e.value, expected) + + +def test_client_set_init_fail(): + mocks = get_client_mocks(None, None, *ALL_CLIENTS) + a = mocks[KBaseAuth] + aa = mocks[AdminAuthUtil] + c = mocks[Condor] + ca = mocks[Catalog] + j = mocks[JobRequirementsResolver] + k = mocks[KafkaClient] + m = mocks[MongoUtil] + s = mocks[SlackClient] + n = None + + e = ValueError("auth cannot be a value that evaluates to false") + _client_set_init_fail(n, aa, c, ca, ca, j, k, m, s, e) + e = ValueError("auth_admin cannot be a value that evaluates to false") + _client_set_init_fail(a, n, c, ca, ca, j, k, m, s, e) + e = ValueError("condor cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, n, ca, ca, j, k, m, s, e) + e = ValueError("catalog cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, n, ca, j, k, m, s, e) + e = ValueError("catalog_no_auth cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, n, j, k, m, s, e) + e = ValueError("requirements_resolver cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, ca, n, k, m, s, e) + e = ValueError("kafka_client cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, ca, j, n, m, s, e) + e = ValueError("mongo_util cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, ca, j, k, n, s, e) + e = ValueError("slack_client cannot be a value that evaluates to false") + _client_set_init_fail(a, aa, c, ca, ca, j, k, m, n, e) + + +def _client_set_init_fail( + auth: KBaseAuth, + auth_admin: AdminAuthUtil, + condor: Condor, + catalog: Catalog, + catalog_no_auth: Catalog, + requirements_resolver: JobRequirementsResolver, + kafka_client: KafkaClient, + mongo_util: MongoUtil, + slack_client: SlackClient, + expected: Exception, +): + with raises(Exception) as got: + ClientSet( + auth, + auth_admin, + condor, + catalog, + catalog_no_auth, + requirements_resolver, + kafka_client, + mongo_util, + slack_client, + ) + assert_exception_correct(got.value, expected) diff --git a/test/tests_for_utils/job_requirements_resolver_test.py b/test/tests_for_utils/job_requirements_resolver_test.py new file mode 100644 index 000000000..4cd01c26f --- /dev/null +++ b/test/tests_for_utils/job_requirements_resolver_test.py @@ -0,0 +1,1124 @@ +""" +Unit tests for the job requirements resolver. 
+""" + +from enum import Enum +from io import StringIO +from pytest import raises +from unittest.mock import create_autospec +from execution_engine2.sdk.job_submission_parameters import JobRequirements +from execution_engine2.utils.job_requirements_resolver import ( + JobRequirementsResolver, + RequirementsType, +) +from execution_engine2.exceptions import IncorrectParamsException +from execution_engine2.utils.catalog_cache import CatalogCache +from utils_shared.test_utils import assert_exception_correct + + +def test_normalize_job_reqs_minimal(): + assert JobRequirementsResolver.normalize_job_reqs(None, "mysource") == {} + assert JobRequirementsResolver.normalize_job_reqs({}, "mysource") == {} + assert ( + JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": None, + "request_memory": None, + "request_disk": None, + "client_group": None, + "client_group_regex": None, + "debug_mode": None, + "expect_noop": " fooo ", + }, + "source", + ) + == {} + ) + assert ( + JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": " \t ", + "request_memory": " \t ", + "request_disk": " \t ", + "client_group": " \t ", + "client_group_regex": " \t ", + "debug_mode": " \t ", + "expect_noop": " fooo ", + }, + "source", + ) + == {} + ) + + +def test_normalize_job_reqs_minimal_require_all(): + assert JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": 1, + "request_memory": 1, + "request_disk": 1, + "client_group": "foo", + }, + "source", + True, + ) == { + "request_cpus": 1, + "request_memory": 1, + "request_disk": 1, + "client_group": "foo", + } + + +def test_normalize_job_reqs_maximal_ints(): + assert JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": 56, + "request_memory": 200, + "request_disk": 7000, + "client_group": " njs ", + "client_group_regex": 1, + "debug_mode": -1, + "expect_noop": 1, + }, + "mysource", + ) == { + "request_cpus": 56, + "request_memory": 200, + "request_disk": 7000, + "client_group": "njs", + "client_group_regex": True, + "debug_mode": True, + } + + +def test_normalize_job_reqs_maximal_strings(): + assert JobRequirementsResolver.normalize_job_reqs( + { + "request_cpus": " 56 ", + "request_memory": " 201 ", + "request_disk": " \t 7000 ", + "client_group": " njs ", + "client_group_regex": " False ", + "debug_mode": " true \t ", + "expect_noop": 1, + }, + "mysource", + ) == { + "request_cpus": 56, + "request_memory": 201, + "request_disk": 7000, + "client_group": "njs", + "client_group_regex": False, + "debug_mode": True, + } + + +def test_normalize_job_reqs_memory(): + for mem in [2000, "2000 ", " 2000M ", "2000MB"]: + assert JobRequirementsResolver.normalize_job_reqs( + {"request_memory": mem}, "s" + ) == {"request_memory": 2000} + + +def test_normalize_job_reqs_disk(): + for disk in [6000, "6000", " 6000GB "]: + assert JobRequirementsResolver.normalize_job_reqs( + {"request_disk": disk}, "s" + ) == {"request_disk": 6000} + + +def test_normalize_job_reqs_bools_true(): + for b in [True, 1, -1, 100, -100, " True ", " true"]: + assert JobRequirementsResolver.normalize_job_reqs( + {"client_group_regex": b, "debug_mode": b}, "s" + ) == {"client_group_regex": True, "debug_mode": True} + + +def test_normalize_job_reqs_bools_False(): + for b in [False, 0, " False ", " false"]: + assert JobRequirementsResolver.normalize_job_reqs( + {"client_group_regex": b, "debug_mode": b}, "s" + ) == {"client_group_regex": False, "debug_mode": False} + + +def test_normalize_job_reqs_fail_client_group(): + _normalize_job_reqs_fail( + {"client_group": []}, 
+ "src", + False, + IncorrectParamsException( + "Found illegal client group '[]' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"client_group": "njs=true"}, + "src2", + False, + IncorrectParamsException( + "Found illegal client group 'njs=true' in job requirements from src2" + ), + ) + + +def test_normalize_job_reqs_fail_cpu(): + _normalize_job_reqs_fail( + {"request_cpus": 8.4}, + "src3", + False, + IncorrectParamsException( + "Found illegal cpu request '8.4' in job requirements from src3" + ), + ) + _normalize_job_reqs_fail( + {"request_cpus": "26M"}, + "src4", + False, + IncorrectParamsException( + "Found illegal cpu request '26M' in job requirements from src4" + ), + ) + _normalize_job_reqs_fail( + {"request_cpus": ["26M"]}, + "src4.5", + False, + IncorrectParamsException( + "Found illegal cpu request '['26M']' in job requirements from src4.5" + ), + ) + + +def test_normalize_job_reqs_fail_mem(): + _normalize_job_reqs_fail( + {"request_memory": 3.2}, + "src5", + False, + IncorrectParamsException( + "Found illegal memory request '3.2' in job requirements from src5" + ), + ) + _normalize_job_reqs_fail( + {"request_memory": {}}, + "src5", + False, + IncorrectParamsException( + "Found illegal memory request '{}' in job requirements from src5" + ), + ) + _normalize_job_reqs_fail( + {"request_memory": "26G"}, + "src6", + False, + IncorrectParamsException( + "Found illegal memory request '26G' in job requirements from src6" + ), + ) + + +def test_normalize_job_reqs_fail_disk(): + _normalize_job_reqs_fail( + {"request_disk": 6.5}, + "src", + False, + IncorrectParamsException( + "Found illegal disk request '6.5' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"request_disk": set()}, + "src", + False, + IncorrectParamsException( + "Found illegal disk request 'set()' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"request_disk": "26M"}, + "src", + False, + IncorrectParamsException( + "Found illegal disk request '26M' in job requirements from src" + ), + ) + + +def test_normalize_job_reqs_fail_regex(): + _normalize_job_reqs_fail( + {"client_group_regex": 92.4}, + "src", + False, + IncorrectParamsException( + "Found illegal client group regex '92.4' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"client_group_regex": Enum}, + "src", + False, + IncorrectParamsException( + "Found illegal client group regex '' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"client_group_regex": "truthy"}, + "src", + False, + IncorrectParamsException( + "Found illegal client group regex 'truthy' in job requirements from src" + ), + ) + + +def test_normalize_job_reqs_fail_debug(): + _normalize_job_reqs_fail( + {"debug_mode": 9.5}, + "src", + False, + IncorrectParamsException( + "Found illegal debug mode '9.5' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"debug_mode": int}, + "src", + False, + IncorrectParamsException( + "Found illegal debug mode '' in job requirements from src" + ), + ) + _normalize_job_reqs_fail( + {"debug_mode": " yep "}, + "src", + False, + IncorrectParamsException( + "Found illegal debug mode ' yep ' in job requirements from src" + ), + ) + + +def test_normalize_job_reqs_fail_require_all(): + reqs_all = { + "request_cpus": 56, + "request_memory": 200, + "request_disk": 7000, + "client_group": "njs", + } + for k in ["request_cpus", "request_memory", "request_disk", "client_group"]: + r = dict(reqs_all) + del r[k] + _normalize_job_reqs_fail( + r, + "mysrc", + True, 
+ IncorrectParamsException(f"Missing {k} key in job requirements from mysrc"), + ) + + +def _normalize_job_reqs_fail(reqs, source, req_all_res, expected): + with raises(Exception) as got: + JobRequirementsResolver.normalize_job_reqs(reqs, source, req_all_res) + assert_exception_correct(got.value, expected) + + +def test_get_requirements_type_standard(): + grt = JobRequirementsResolver.get_requirements_type + assert grt() == RequirementsType.STANDARD + assert ( + grt(None, None, None, None, None, None, None, None, None) + == RequirementsType.STANDARD + ) + assert ( + grt(None, None, None, None, None, None, False, {}, False) + == RequirementsType.STANDARD + ) + + +def test_get_requirements_type_processing(): + grt = JobRequirementsResolver.get_requirements_type + assert grt(cpus=4) == RequirementsType.PROCESSING + assert grt(memory_MB=26) == RequirementsType.PROCESSING + assert grt(disk_GB=78) == RequirementsType.PROCESSING + assert grt(client_group="foo") == RequirementsType.PROCESSING + assert grt(client_group_regex=False) == RequirementsType.PROCESSING + assert grt(client_group_regex=True) == RequirementsType.PROCESSING + assert grt(ignore_concurrency_limits=True) == RequirementsType.PROCESSING + assert grt(scheduler_requirements={"a": "b"}) == RequirementsType.PROCESSING + assert grt(debug_mode=True) == RequirementsType.PROCESSING + + assert ( + grt( + cpus=4, + memory_MB=2, + disk_GB=8, + client_group="yay", + client_group_regex=True, + ignore_concurrency_limits=True, + debug_mode=True, + ) + == RequirementsType.PROCESSING + ) + + +def test_get_requirements_type_billing(): + grt = JobRequirementsResolver.get_requirements_type + assert grt(bill_to_user="foo") == RequirementsType.BILLING + + assert ( + grt( + cpus=4, + memory_MB=2, + disk_GB=8, + client_group="yay", + client_group_regex=True, + bill_to_user="can I buy you a drink?", + ignore_concurrency_limits=True, + debug_mode=True, + ) + == RequirementsType.BILLING + ) + + +def test_get_requirements_type_fail(): + # All the illegal requirements testing is delegated to a method outside the code + # unit under test, so we just do one test per input to be sure it's hooked up correctly + # and delegate more thorough testing to the unit tests for the called method. 
+ n = None + _grtf = _get_requirements_type_fail + _grtf(0, n, n, n, n, IncorrectParamsException("CPU count must be at least 1")) + _grtf(n, 0, n, n, n, IncorrectParamsException("memory in MB must be at least 1")) + _grtf( + n, n, 0, n, n, IncorrectParamsException("disk space in GB must be at least 1") + ) + _grtf( + n, + n, + n, + " \t ", + n, + IncorrectParamsException("Missing input parameter: client_group"), + ) + _grtf( + n, + n, + n, + n, + " \bfoo ", + IncorrectParamsException("bill_to_user contains control characters"), + ) + # note there are no invalid values for client_group_regex, ignore_concurrentcy_limits, + # and debug_mode + + +def _get_requirements_type_fail(cpus, mem, disk, cg, btu, expected): + with raises(Exception) as got: + JobRequirementsResolver.get_requirements_type( + cpus, mem, disk, cg, False, btu, False, False + ) + assert_exception_correct(got.value, expected) + + +def _get_simple_deploy_spec_file_obj(): + return StringIO( + """ + [execution_engine2] + request_cpus = 0 + request_memory = 2000M + request_disk = 100GB + + [DEFAULT] + default_client_group = cg2 + + [cg1] + request_cpus = 4 + request_memory = 2000M + request_disk = 100GB + + [cg2] + request_cpus = 8 + request_memory = 700 + request_disk = 32 + debug_mode = True + client_group_regex = false + """ + ) + + +# Note the constructor uses the normalization class method under the hood for normalizing +# the EE2 config file client groups. As such, we don't duplicate the testing of that method +# here other than some spot checks. If the constructor changes significantly more +# testing may be required. + + +def test_init(): + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(spec) + assert jrr.get_default_client_group() == "cg2" + assert jrr.get_override_client_group() is None + assert jrr.get_configured_client_groups() == set(["cg1", "cg2"]) + assert jrr.get_configured_client_group_spec("cg1") == { + "request_cpus": 4, + "request_memory": 2000, + "request_disk": 100, + "client_group": "cg1", + } + + assert jrr.get_configured_client_group_spec("cg2") == { + "request_cpus": 8, + "request_memory": 700, + "request_disk": 32, + "client_group": "cg2", + "debug_mode": True, + "client_group_regex": False, + } + + +def test_init_with_override(): + + spec = _get_simple_deploy_spec_file_obj() + jrr = JobRequirementsResolver(spec, " \t ") + assert jrr.get_override_client_group() is None + + spec = _get_simple_deploy_spec_file_obj() + jrr = JobRequirementsResolver(spec, "cg1") + assert jrr.get_override_client_group() == "cg1" + + +def test_init_fail_missing_input(): + _init_fail( + None, + None, + ValueError("cfgfile cannot be a value that evaluates to false"), + ) + _init_fail( + [], + None, + ValueError("cfgfile cannot be a value that evaluates to false"), + ) + + +def test_init_fail_no_override_in_config(): + + spec = _get_simple_deploy_spec_file_obj() + _init_fail( + spec, + "cg3", + ValueError("No deployment configuration entry for override client group 'cg3'"), + ) + + +def test_init_fail_default_config_error(): + + shared_spec = """ + [njs] + request_cpus = 4 + request_memory = 2000M + request_disk = 100GB + """ + + _init_fail( + StringIO(shared_spec), + None, + IncorrectParamsException( + "Missing input parameter: value for DEFAULT.default_client_group in deployment " + + "config file" + ), + ) + + spec = StringIO( + shared_spec + + """ + [DEFAULT] + foo = bar + """ + ) + _init_fail( + spec, + None, + IncorrectParamsException( + "Missing input parameter: value for 
DEFAULT.default_client_group in deployment " + + "config file" + ), + ) + + spec = StringIO( + shared_spec + + """ + [DEFAULT] + default_client_group = njrs + """ + ) + _init_fail( + spec, + None, + ValueError("No deployment configuration entry for default client group 'njrs'"), + ) + + +def test_init_fail_bad_config(): + + shared_spec = """ + [DEFAULT] + default_client_group = njs + """ + + spec = ( + shared_spec + + """ + [njs] + request_memory = 2000M + request_disk = 100GB + """ + ) + + _init_fail( + StringIO(spec), + None, + IncorrectParamsException( + "Missing request_cpus key in job requirements from section 'njs' of the " + + "deployment configuration" + ), + ) + + spec = ( + shared_spec + + """ + [njs] + request_cpus = 4 + request_disk = 100GB + """ + ) + + _init_fail( + StringIO(spec), + None, + IncorrectParamsException( + "Missing request_memory key in job requirements from section 'njs' of the " + + "deployment configuration" + ), + ) + + spec = ( + shared_spec + + """ + [njs] + request_cpus = 4 + request_memory = 2000M + """ + ) + + _init_fail( + StringIO(spec), + None, + IncorrectParamsException( + "Missing request_disk key in job requirements from section 'njs' of the " + + "deployment configuration" + ), + ) + + +def _init_fail(spec, override, expected): + with raises(Exception) as got: + JobRequirementsResolver(spec, override) + assert_exception_correct(got.value, expected) + + +def test_get_configured_client_group_spec_fail(): + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + + with raises(Exception) as got: + jrr.get_configured_client_group_spec("cg4") + assert_exception_correct( + got.value, ValueError("Client group 'cg4' is not configured") + ) + + +# Note that resolve_requirements uses the normalization class method and an argument checking +# method under the hood. As such, we don't duplicate the testing of those methods +# here other than some spot checks. If the method changes significantly more +# testing may be required. + + +def get_catalog_cache_mock(catalog_return=None): + """ + :param catalog_return: Set the lookup_job_resource_requirements return value + :return: A mocked instance of the CatalogCache + """ + catalog_cache = create_autospec(CatalogCache, spec_set=True, instance=True) + if catalog_return is not None: + catalog_cache.lookup_job_resource_requirements.return_value = catalog_return + return catalog_cache + + +def test_resolve_requirements_from_spec(): + """ + Resolve requirements when no user input and no catalog record is available. + """ + _resolve_requirements_from_spec([]) + _resolve_requirements_from_spec([{}]) + _resolve_requirements_from_spec([{"client_groups": []}]) + + +def _resolve_requirements_from_spec(catalog_return): + + catalog_cache = get_catalog_cache_mock(catalog_return) + spec = _get_simple_deploy_spec_file_obj() + jrr = JobRequirementsResolver(spec) + + assert jrr.resolve_requirements(" mod.meth ", catalog_cache) == JobRequirements( + 8, + 700, + 32, + "cg2", + client_group_regex=False, + debug_mode=True, + ) + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="mod", function_name="meth" + ) + + +def test_resolve_requirements_from_spec_with_override(): + """ + Test that an override ignores client group information from the catalog and deploy config. 
+ """ + + catalog_cache = get_catalog_cache_mock(catalog_return=[{"client_groups": ["cg2"]}]) + spec = _get_simple_deploy_spec_file_obj() + jrr = JobRequirementsResolver(spec, " cg1 ") + assert jrr.resolve_requirements( + " module2. some_meth ", catalog_cache + ) == JobRequirements( + 4, + 2000, + 100, + "cg1", + ) + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" + ) + + +def test_resolve_requirements_from_spec_with_override_and_user_client_group(): + """ + Test that a user providing a client group ignores client group information from all other + sources. + """ + + catalog_cache = get_catalog_cache_mock(catalog_return=[{"client_groups": ["cg2"]}]) + spec = _get_simple_deploy_spec_file_obj() + jrr = JobRequirementsResolver(spec, " cg2 ") + + assert jrr.resolve_requirements( + " module2. some_meth ", + client_group=" cg1", + catalog_cache=catalog_cache, + ) == JobRequirements( + 4, + 2000, + 100, + "cg1", + ) + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" + ) + + +def test_resolve_requirements_from_catalog_full_CSV(): + return_value = [ + { + "client_groups": [ + "cg1", + "request_cpus= 78", + " request_memory = 500MB", + "request_disk = 700GB", + "client_group_regex = False", + "debug_mode = true", + "foo=bar=whoop", # test that only one split occurs + "baz=bat", + ] + } + ] + + catalog_cache = get_catalog_cache_mock(return_value) + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(spec) + + assert jrr.resolve_requirements( + " module2. some_meth ", catalog_cache + ) == JobRequirements( + 78, + 500, + 700, + "cg1", + False, + None, + False, + {"foo": "bar=whoop", "baz": "bat"}, + True, + ) + + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" + ) + + +def test_resolve_requirements_from_catalog_partial_JSON(): + + return_value = [ + { + "client_groups": [ + '{"client_group": " cg1 "', + '" request_memory ": " 300M "', + '"exactlythesameshape": "asathingy"', + '"request_disk": 100000}', + ] + } + ] + catalog_cache = get_catalog_cache_mock(return_value) + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(spec) + + assert jrr.resolve_requirements( + " module2. some_meth ", catalog_cache + ) == JobRequirements( + 4, + 300, + 100000, + "cg1", + scheduler_requirements={"exactlythesameshape": "asathingy"}, + ) + + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" + ) + + +def test_resolve_requirements_from_user_full(): + _resolve_requirements_from_user_full(True) + _resolve_requirements_from_user_full(False) + + +def _resolve_requirements_from_user_full(bool_val): + + return_value = [ + { + "client_groups": [ + "cg2", + "request_cpus= 78", + " request_memory = 500MB", + "request_disk = 700GB", + "client_group_regex = False", + "debug_mode = true", + "foo=bar", + "baz=bat", + ] + } + ] + catalog_cache = get_catalog_cache_mock(return_value) + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(spec) + + assert jrr.resolve_requirements( + " module2. 
some_meth ", + catalog_cache, + 42, + 789, + 1, + "cg1", + bool_val, + "some_poor_sucker", + bool_val, + { + "foo": "Some of you may die", + "bar": "but that is a sacrifice I am willing to make", + }, + bool_val, + ) == JobRequirements( + 42, + 789, + 1, + "cg1", + bool_val, + "some_poor_sucker", + bool_val, + { + "foo": "Some of you may die", + "bar": "but that is a sacrifice I am willing to make", + "baz": "bat", + }, + bool_val, + ) + + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" + ) + + +def test_resolve_requirements_from_user_partial(): + """ + Gets requirements from the user, catalog, and the ee2 deploy config. + + Also tests that special keys are removed from the scheduler requirements. + """ + + return_value = [ + { + "client_groups": [ + "cg2", + "request_cpus= 78", + "request_disk = 700", + "client_group_regex = False", + "debug_mode = true", + "foo=bar", + "baz=bat", + ] + } + ] + catalog_cache = get_catalog_cache_mock(return_value) + + spec = _get_simple_deploy_spec_file_obj() + + jrr = JobRequirementsResolver(spec) + + assert jrr.resolve_requirements( + " module2. some_meth ", + cpus=42, + catalog_cache=catalog_cache, + client_group="cg1", + client_group_regex=True, + scheduler_requirements={ + "client_group": "foo", + "request_cpus": "78", + "request_memory": "800", + "request_disk": "700", + "client_group_regex": "False", + "debug_mode": "True", + "bill_to_user": "foo", + "ignore_concurrency_limits": "true", + "whee": "whoo", + }, + ) == JobRequirements( + 42, + 2000, + 700, + "cg1", + client_group_regex=True, + scheduler_requirements={"foo": "bar", "baz": "bat", "whee": "whoo"}, + debug_mode=True, + ) + + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="module2", function_name="some_meth" + ) + + +def test_resolve_requirements_fail_illegal_inputs(): + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + catalog_cache = get_catalog_cache_mock() + + _resolve_requirements_fail( + jrr, + catalog_cache, + None, + {}, + IncorrectParamsException( + "Unrecognized method: 'None'. Please input module_name.function_name" + ), + ) + _resolve_requirements_fail( + jrr, + catalog_cache, + "method", + {}, + IncorrectParamsException( + "Unrecognized method: 'method'. Please input module_name.function_name" + ), + ) + _resolve_requirements_fail( + jrr, + catalog_cache, + "mod1.mod2.method", + {}, + IncorrectParamsException( + "Unrecognized method: 'mod1.mod2.method'. 
Please input module_name.function_name" + ), + ) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {"cpus": 0}, + IncorrectParamsException("CPU count must be at least 1"), + ) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {"memory_MB": 0}, + IncorrectParamsException("memory in MB must be at least 1"), + ) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {"disk_GB": 0}, + IncorrectParamsException("disk space in GB must be at least 1"), + ) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {"client_group": " \t "}, + IncorrectParamsException("Missing input parameter: client_group"), + ) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {"bill_to_user": "\b"}, + IncorrectParamsException("bill_to_user contains control characters"), + ) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {"scheduler_requirements": {"a": None}}, + IncorrectParamsException( + "Missing input parameter: value for key 'a' in scheduler requirements structure" + ), + ) + + +def test_resolve_requirements_fail_catalog_multiple_entries(): + + return_value = [{"client_groups": ["cg2"]}, {}] + catalog_cache = get_catalog_cache_mock(return_value) + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {}, + ValueError( + "Unexpected result from the Catalog service: more than one client group " + + f"configuration found for method m.m {return_value}" + ), + ) + + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="m", function_name="m" + ) + + +def test_resolve_requirements_fail_catalog_bad_JSON(): + + return_value = [{"client_groups": ['{"foo": "bar", "baz":}']}] + catalog_cache = get_catalog_cache_mock(return_value) + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {}, + ValueError( + "Unable to parse JSON client group entry from catalog for method m.m" + ), + ) + + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="m", function_name="m" + ) + + +def test_resolve_requirements_fail_catalog_bad_CSV(): + + return_value = [{"client_groups": ["cg", "foo is bar"]}] + catalog_cache = get_catalog_cache_mock(return_value) + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {}, + ValueError( + "Malformed requirement. Format is =. " + + "Item is 'foo is bar' for catalog method m.m" + ), + ) + + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="m", function_name="m" + ) + + +def test_resolve_requirements_fail_catalog_normalize(): + + return_value = [{"client_groups": ["cg", "request_memory=72TB"]}] + catalog_cache = get_catalog_cache_mock(return_value) + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + catalog_cache, + " mod . 
meth ", + {}, + IncorrectParamsException( + "Found illegal memory request '72TB' in job requirements from catalog method mod.meth" + ), + ) + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="mod", function_name="meth" + ) + + +def test_resolve_requirements_fail_catalog_clientgroup(): + + return_value = [{"client_groups": ["cg", "request_memory=72"]}] + catalog_cache = get_catalog_cache_mock(return_value) + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + catalog_cache, + " mod . meth ", + {}, + IncorrectParamsException( + "Catalog specified illegal client group 'cg' for method mod.meth" + ), + ) + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="mod", function_name="meth" + ) + + +def test_resolve_requirements_fail_input_clientgroup(): + + catalog_cache = get_catalog_cache_mock([]) + + jrr = JobRequirementsResolver(_get_simple_deploy_spec_file_obj()) + _resolve_requirements_fail( + jrr, + catalog_cache, + "m.m", + {"client_group": "cb4"}, + IncorrectParamsException("No such clientgroup: cb4"), + ) + + catalog_cache.lookup_job_resource_requirements.assert_called_once_with( + module_name="m", function_name="m" + ) + + +def _resolve_requirements_fail(jrr, catalog_cache, method, kwargs, expected): + # Workaround to avoid passing catalog multiple times + with raises(Exception) as got: + jrr.resolve_requirements(method, catalog_cache, **kwargs) + assert_exception_correct(got.value, expected) diff --git a/test/tests_for_utils/user_info_test.py b/test/tests_for_utils/user_info_test.py new file mode 100644 index 000000000..42d7b54b7 --- /dev/null +++ b/test/tests_for_utils/user_info_test.py @@ -0,0 +1,61 @@ +from pytest import raises +from execution_engine2.utils.user_info import UserCreds +from execution_engine2.exceptions import IncorrectParamsException +from utils_shared.test_utils import assert_exception_correct + + +def test_user_creds_init_success(): + uc = UserCreds(" username ", " some token ") + assert uc.username == "username" + assert uc.token == "some token" + + +def test_user_creds_init_fail(): + _user_creds_init_fail( + None, "t", IncorrectParamsException("Missing input parameter: username") + ) + _user_creds_init_fail( + " \t ", "t", IncorrectParamsException("Missing input parameter: username") + ) + _user_creds_init_fail( + "u", None, IncorrectParamsException("Missing input parameter: token") + ) + _user_creds_init_fail( + "u", " \t ", IncorrectParamsException("Missing input parameter: token") + ) + + +def _user_creds_init_fail(username, token, expected): + with raises(Exception) as got: + UserCreds(username, token) + assert_exception_correct(got.value, expected) + + +def test_user_creds_eq(): + u1 = "u1" + u1a = "u1" + u2 = "u2" + t1 = "t1" + t1a = "t1" + t2 = "t2" + + assert UserCreds(u1, t1) == UserCreds(u1a, t1a) + assert UserCreds(u1, t1) != UserCreds(u1, t2) + assert UserCreds(u1, t1) != UserCreds(u2, t1) + assert UserCreds(u1, t1) != (u1, t1) + + +def test_user_creds_hash(): + # hashes will change from instance to instance of the python interpreter, and therefore + # tests can't be written that directly test the hash value. 
See + # https://docs.python.org/3/reference/datamodel.html#object.__hash__ + u1 = "u1" + u1a = "u1" + u2 = "u2" + t1 = "t1" + t1a = "t1" + t2 = "t2" + + assert hash(UserCreds(u1, t1)) == hash(UserCreds(u1a, t1a)) + assert hash(UserCreds(u1, t1)) != hash(UserCreds(u1, t2)) + assert hash(UserCreds(u1, t1)) != hash(UserCreds(u2, t1)) diff --git a/test/utils_shared/mock_utils.py b/test/utils_shared/mock_utils.py new file mode 100644 index 000000000..58cfbf71b --- /dev/null +++ b/test/utils_shared/mock_utils.py @@ -0,0 +1,71 @@ +from unittest.mock import create_autospec + +from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.utils.job_requirements_resolver import JobRequirementsResolver +from execution_engine2.utils.KafkaUtils import KafkaClient +from execution_engine2.utils.SlackUtils import SlackClient + +from installed_clients.authclient import KBaseAuth +from installed_clients.CatalogClient import Catalog + +from execution_engine2.authorization.roles import AdminAuthUtil +from execution_engine2.utils.Condor import Condor +from execution_engine2.sdk.EE2Constants import ADMIN_READ_ROLE, ADMIN_WRITE_ROLE +from execution_engine2.utils.clients import ClientSet + + +def _build_job_reqs(config, cfgfile, impls): + with open(cfgfile) as cf: + return JobRequirementsResolver(cf) + + +_CLASS_IMPLEMENTATION_BUILDERS = { + KBaseAuth: lambda config, cfgfile, impls: KBaseAuth( + auth_url=config["auth-url"] + "/api/legacy/KBase/Sessions/Login" + ), + AdminAuthUtil: lambda config, cfgfile, impls: AdminAuthUtil( + config["auth-url"], [ADMIN_READ_ROLE, ADMIN_WRITE_ROLE] + ), + Condor: lambda config, cfgfile, impls: Condor(config), + Catalog: lambda config, cfgfile, impls: Catalog(config["catalog-url"]), + JobRequirementsResolver: _build_job_reqs, + KafkaClient: lambda config, cfgfile, impls: KafkaClient(config["kafka-host"]), + MongoUtil: lambda config, cfgfile, impls: MongoUtil(config), + SlackClient: lambda config, cfgfile, impls: SlackClient( + config["slack-token"], debug=True, endpoint=config["ee2-url"] + ), +} + +ALL_CLIENTS = sorted(_CLASS_IMPLEMENTATION_BUILDERS.keys(), key=lambda x: x.__name__) + + +def get_client_mocks(config, config_path, *to_be_mocked): + """ + Create a client set containing a mix of mocks and real implementations as needed for + a test. + + config is the config dict from the ee2 section of the deploy.cfg. + config_path is the path to the configfile. + to_be_mocked is the classes in the client set that should be mocked, e.g. KBaseAuth, etc. + + Returns a dict of the class to the class's mock or implementation as specified in + the arguments. 
+ """ + ret = {} + for clazz in ALL_CLIENTS: + if clazz in to_be_mocked: + ret[clazz] = create_autospec(clazz, instance=True, spec_set=True) + else: + ret[clazz] = _CLASS_IMPLEMENTATION_BUILDERS[clazz](config, config_path, ret) + ret[ClientSet] = ClientSet( + ret[KBaseAuth], + ret[AdminAuthUtil], + ret[Condor], + ret[Catalog], # This one is for "catalog" + ret[Catalog], # This one is for "catalog_no_auth" + ret[JobRequirementsResolver], + ret[KafkaClient], + ret[MongoUtil], + ret[SlackClient], + ) + return ret diff --git a/test/utils_shared/producer.py b/test/utils_shared/producer.py index b90926da4..0ff2f5ace 100644 --- a/test/utils_shared/producer.py +++ b/test/utils_shared/producer.py @@ -22,11 +22,8 @@ def send_kafka_message(self, message, topic=DEFAULT_TOPIC): producer = Producer({"bootstrap.servers": self.server_address}) producer.produce(topic, str(message), callback=_delivery_report) producer.poll(2) - logging.info( - f"Successfully sent message to kafka at topic={topic} message={json.dumps(message)} server_address={self.server_address}" - ) except Exception as e: - logging.info( + logging.error( f"Failed to send message to kafka at topic={topic} message={json.dumps(message)} server_address={self.server_address}" ) raise Exception(e) diff --git a/test/utils_shared/test_utils.py b/test/utils_shared/test_utils.py index a49517f11..af5583791 100644 --- a/test/utils_shared/test_utils.py +++ b/test/utils_shared/test_utils.py @@ -1,17 +1,25 @@ import json +import logging import os.path +import socket +import time import uuid from configparser import ConfigParser +from contextlib import closing from datetime import datetime from typing import List, Dict import requests from dotenv import load_dotenv -from lib.execution_engine2.db.models.models import Job, JobInput, Meta -from lib.execution_engine2.db.models.models import Status -from lib.execution_engine2.exceptions import MalformedTimestampException -from lib.execution_engine2.utils.CondorTuples import CondorResources, JobInfo +from execution_engine2.db.models.models import Job, JobInput, Meta +from execution_engine2.db.models.models import Status +from execution_engine2.exceptions import MalformedTimestampException +from execution_engine2.utils.CondorTuples import JobInfo + +EE2_CONFIG_SECTION = "execution_engine2" +KB_DEPLOY_ENV = "KB_DEPLOYMENT_CONFIG" +DEFAULT_TEST_DEPLOY_CFG = "test/deploy.cfg" def bootstrap(): @@ -31,45 +39,94 @@ def get_example_job_as_dict( wsid: int = 123, authstrat: str = "kbaseworkspace", scheduler_id: str = None, + params: dict = None, + narrative_cell_info: dict = None, + source_ws_objects: list = None, + method_name: str = None, + app_id: str = None, ): job = ( get_example_job( - user=user, wsid=wsid, authstrat=authstrat, scheduler_id=scheduler_id + user=user, + wsid=wsid, + authstrat=authstrat, + scheduler_id=scheduler_id, + params=params, + narrative_cell_info=narrative_cell_info, + source_ws_objects=source_ws_objects, + method_name=method_name, + app_id=app_id, ) .to_mongo() .to_dict() ) - job["method"] = job["job_input"]["app_id"] + # Copy fields to match run_job signature + job_input = job["job_input"] + job["meta"] = job_input["narrative_cell_info"] + job["narrative_cell_info"] = job_input["narrative_cell_info"] + job["params"] = job_input["params"] + job["source_ws_objects"] = job_input["source_ws_objects"] + job["method"] = job["job_input"]["method"] job["app_id"] = job["job_input"]["app_id"] job["service_ver"] = job["job_input"]["service_ver"] return job +def get_example_job_input(wsid, params=None, 
method_name=None, app_id=None): + if params == None: + params = {} + + job_input = JobInput() + job_input.wsid = wsid + + job_input.method = method_name or "module.method" + job_input.params = params + job_input.service_ver = "dev" + job_input.app_id = app_id or "module/super_function" + job_input.source_ws_objects = ["1/2/3", "2/3/4", "3/5/6"] + + m = Meta() + m.cell_id = "ApplePie" + job_input.narrative_cell_info = m + + return job_input + + def get_example_job( user: str = "boris", wsid: int = 123, authstrat: str = "kbaseworkspace", + params: dict = None, scheduler_id: str = None, + narrative_cell_info: dict = None, + source_ws_objects: list = None, + method_name: str = None, + app_id: str = None, + status: str = None, ) -> Job: j = Job() j.user = user j.wsid = wsid - job_input = JobInput() - job_input.wsid = j.wsid - - job_input.method = "method" - job_input.requested_release = "requested_release" - job_input.params = {} - job_input.service_ver = "dev" - job_input.app_id = "super_module.super_function" + job_input = get_example_job_input( + params=params, wsid=wsid, method_name=method_name, app_id=app_id + ) - m = Meta() - m.cell_id = "ApplePie" - job_input.narrative_cell_info = m j.job_input = job_input j.status = "queued" j.authstrat = authstrat + if status: + j.status = status + + if params: + job_input.params = params + + if source_ws_objects: + job_input.source_ws_objects = source_ws_objects + + if narrative_cell_info: + job_input.narrative_cell_info = narrative_cell_info + if scheduler_id is None: scheduler_id = str(uuid.uuid4()) @@ -82,10 +139,14 @@ def get_example_job_as_dict_for_runjob( user=None, wsid=None, authstrat=None, scheduler_id=None ): job = get_example_job( - user=user, wsid=wsid, authstrat=authstrat, scheduler_id=scheduler_id + user=user, + wsid=wsid, + authstrat=authstrat, + scheduler_id=scheduler_id, + narrative_cell_info={}, ) job_dict = job.to_mongo().to_dict() - job_dict["method"] = job["job_input"]["app_id"] + job_dict["method"] = job["job_input"]["method"] job_dict["app_id"] = job["job_input"]["app_id"] job_dict["service_ver"] = job["job_input"]["service_ver"] return job_dict @@ -353,14 +414,16 @@ def get_sample_condor_info(job=None, error=None): return JobInfo(info=job, error=error) -def get_sample_job_params(method=None, wsid="123"): - if not method: - method = "default_method" - +def get_sample_job_params( + method="MEGAHIT.default_method", + wsid=123, + app_id="MEGAHIT/run_megahit", + parent_job_id="9998", +): job_params = { "wsid": wsid, "method": method, - "app_id": "MEGAHIT/run_megahit", + "app_id": app_id, "service_ver": "2.2.1", "params": [ { @@ -374,8 +437,117 @@ def get_sample_job_params(method=None, wsid="123"): } ], "job_input": {}, - "parent_job_id": "9998", + "parent_job_id": parent_job_id, "meta": {"tag": "dev", "token_id": "12345"}, } return job_params + + +def assert_exception_correct(got: Exception, expected: Exception): + assert got.args == expected.args + assert type(got) == type(expected) + + +def assert_close_to_now(time_): + """ + Checks that a timestamp in seconds since the epoch is within a second of the current time. 
+ """ + now_ms = time.time() + assert now_ms + 1 > time_ + assert now_ms - 1 < time_ + + +def find_free_port() -> int: + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +class TestException(Exception): + __test__ = False + + +def create_auth_user(auth_url, username, displayname): + ret = requests.post( + auth_url + "/testmode/api/V2/testmodeonly/user", + headers={"accept": "application/json"}, + json={"user": username, "display": displayname}, + ) + if not ret.ok: + ret.raise_for_status() + + +def create_auth_login_token(auth_url, username): + ret = requests.post( + auth_url + "/testmode/api/V2/testmodeonly/token", + headers={"accept": "application/json"}, + json={"user": username, "type": "Login"}, + ) + if not ret.ok: + ret.raise_for_status() + return ret.json()["token"] + + +def create_auth_role(auth_url, role, description): + ret = requests.post( + auth_url + "/testmode/api/V2/testmodeonly/customroles", + headers={"accept": "application/json"}, + json={"id": role, "desc": description}, + ) + if not ret.ok: + ret.raise_for_status() + + +def set_custom_roles(auth_url, user, roles): + ret = requests.put( + auth_url + "/testmode/api/V2/testmodeonly/userroles", + headers={"accept": "application/json"}, + json={"user": user, "customroles": roles}, + ) + if not ret.ok: + ret.raise_for_status() + + +def get_full_test_config() -> ConfigParser: + f""" + Gets the full configuration for ee2, including all sections of the config file. + + If the {KB_DEPLOY_ENV} environment variable is set, loads the configuration from there. + Otherwise, the repo's {DEFAULT_TEST_DEPLOY_CFG} file is used. + """ + config_file = os.environ.get(KB_DEPLOY_ENV, DEFAULT_TEST_DEPLOY_CFG) + logging.info(f"Loading config from {config_file}") + + config_parser = ConfigParser() + config_parser.read(config_file) + if config_parser[EE2_CONFIG_SECTION].get("mongo-in-docker-compose"): + config_parser[EE2_CONFIG_SECTION]["mongo-host"] = config_parser[ + EE2_CONFIG_SECTION + ]["mongo-in-docker-compose"] + return config_parser + + +def get_ee2_test_config() -> Dict[str, str]: + f""" + Gets the configuration for the ee2 service, e.g. the {EE2_CONFIG_SECTION} section of the + deploy.cfg file. + + If the {KB_DEPLOY_ENV} environment variable is set, loads the configuration from there. + Otherwise, the repo's {DEFAULT_TEST_DEPLOY_CFG} file is used. + """ + cp = get_full_test_config() + + cfg = {} + for nameval in cp.items(EE2_CONFIG_SECTION): + cfg[nameval[0]] = nameval[1] + + return cfg + + +CLIENT_GROUP_CONFIG = { + "module_name": "module_name", + "function_name": "function_name", + "client_groups": ["client_groups_go_here"], +} +MODULE_VERSION = {"git_commit_hash": 123} diff --git a/unit_testing_guidelines.md b/unit_testing_guidelines.md new file mode 100644 index 000000000..0130dedb5 --- /dev/null +++ b/unit_testing_guidelines.md @@ -0,0 +1,181 @@ +# Unit and Integration Testing guidelines + +This document briefly covers testing philosophy with regard to integration and unit tests, +especially for the Python language and in the context of developing a KBase core service like +the Execution Engine. + +## Unit versus Integration tests + +Unit tests cover one module, class, or function, called a code unit from here on out. For example, +a unit test file might cover the contents of `my_module.py` or more granularly `my_module.MyClass`. +Code outside the code unit should be excluded from the tests. 
The exception is "value classes" +which are classes which primarily hold data and whose behavior is based on that data. Other +classes required by the unit under test should be mocked out as far as possible and practical. + +This makes unit tests fast and easy to understand, as only the isolated code unit needs to be +comprehended in order to grasp test failures. + +In contrast an integration test tests that two or more code units work well together. This can +range from anything between testing two code units' interactions to api-to-DB tests for a server. +Integration tests are typically much much slower, much more complex, take much more setup code, +and are harder to understand. Due to this, it is advisable to minimze the number of integration +tests to the least possible to ensure the various code units work together correctly, and write +unit tests to cover as much code as possible. In the author's experience, it is usually not +difficult to write unit tests with 100% coverage for the code unit (although keep in mind +that 100% test coverage does not necessarily indicate quality tests). + +## Mocking dependencies + +As previously described, a unit test should only cover a single unit of code. What this means +is that complex dependencies (e.g. not simple helper functions that may be called from the +class, not value classes, etc.) need to be mocked out. We do this via inversion of control, or +dependency injection. That is, if a code unit needs another code unit as a dependency, the +dependency should *provided to* the code unit rather than *constructed by* the code unit. + +For example, consider a toy function that contacts the workspace service: + +```python +def get_object_name_from_id(url, token, ref): + ws = Workspace(url, token=token) + return ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][1] +``` + +Note that the same situation may arise in a class that needs to contact the workspace regularly and +constructs the client in its `__init__` method. + +This makes the function difficult to unit test, as if run as-is, it will contact the workspace +service. This means that to run the test the workspace service must be running and populated +with data, or a mock service must be running that can validate the call and return the expected +payload. + +Instead, we can rewrite the function (or class) with dependency injection: + +```python +def get_object_name_from_id(ws, ref): + return ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][1] +``` + +Now we can easily pass in a mock object for the `Workspace` depencency in a unit test: + +```python +def test_get_object_name_from_id_success(): + ws = create_autospec(Workspace, spec_set=True, instance=True) [1] + ws.get_object_info3.return_value = {'infos': [ [2] + [3, + 'my_name', + 'Some.Type-1.0', + '1/1/1T01:01:01+00:00', + 1, + 'someguy', + 8, + 'my_workspace', + '79054025255fb1a26e4bc422aef54eb4', 82, {}] + ]} + + assert get_object_name_from_id(ws, '8/3/1') == 'my_name' [3] + + ws.get_object_info3.assert_called_once_with({'objects': [{'ref': '8/3/1'}]}) [4] +``` + +In this test, we: +1. Create the mock object +2. Tell the mock object what to return if the `get_object_info3` method is called +3. Call the method with the mock object as an argument and `assert` that it returns the correct + result +4. Confirm that the mock was called correctly. + +No server, mock or otherwise, is required, nor is confusing and error-prone monkey patching. 
+ +If step 4 is omitted, any code that is run prior to the mock being called is ignored +by the tests as long as the mock is called and an error doesn't occur. Confirming the correct +call is required to test that any code that, for example, mutates the input arguments before +calling the mock with said mutated arguments works correctly. + +For more information on the Python mock standard library, see +https://docs.python.org/3/library/unittest.mock.html. + +For an example of mocks used in real code, see +[this EE2 test](https://github.com/kbase/execution_engine2/blob/e2c8086bd1f52b3ca488882c493aaaa9704626ad/test/tests_for_sdkmr/EE2StatusRange_test.py). + +## More on Dependency Injection + +Dependency Injection (DI), as we've seen, makes unit tests much easier, or even possible. There's +another benefit as well: modularity. DI makes it much easier to swap out modules, even at runtime, +to provide alternate implementations of the functionality. Imagine an application that requires an +authorization module with a large number of parameters: + +```python +class Application: + + def __init__(self, + auth_url, + auth_client_id, + auth_client_secret, + auth_protocol, + auth_cache_time, + # more Application parameters go here + ): + self.auth = SomeCompaniesAuthImplementation( + auth_url, auth_client_id, auth_client_secret, auth_protocol, auth_cache_time) +``` + +If we wish to support `SomeOtherCompaniesAuthImplementation`, determined at runtime, we may need +another batch of parameters to support that implementation as well as a parameter to tell +`Application` which authorization implementation to use. + +An implementation based on DI might look like: + +```python +class Application: + + def __init__(self, auth_implementation): + self.auth = auth_implementation +``` + +Where the interface of `auth_implementation` can be merely documented (e.g. duck typing) or +more rigorously defined with an [abstract base class](https://docs.python.org/3/library/abc.html) +and [type hints](https://docs.python.org/3/library/typing.html). + +In this way, code that interprets a configuration at run time can build whichever version of +the authentication module is required and pass it to `Application`. This makes `Application` +more modular, easier to test, easier to use, and simplifies the initialization. + +The drawback of DI is that it pushes the responsibility for building dependencies up the +software stack, making the user of the class have to write that code, although package authors +could provide helper methods. + +## `Mock()` versus `create_autospec` + +Those familiar with the Python mock library will be aware of the `Mock` class. In the examples +above, we use `create_autospec` to create the mock rather than creating a mock class directly. +The way `create_autospec` is used, with `spec_set=True` and `instance=True`, creates a mock object +based on the interface of the class being mocked, and unlike a regular mock, will not allow +reading or writing of an attribute that does not exist on the class being mocked (as well as +avoiding [other problems](https://docs.python.org/3/library/unittest.mock.html#auto-speccing)). +This prevents test false positives if the interface of the class changes but the tests are not +updated - a standard `Mock()` will allow method returns to be set and will record method calls +for methods that do not exist, but in the example above, the tests would fail if, for example, +`get_object_info3` was removed from the `Workspace` class.
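+ +As a minimal, self-contained sketch of that difference (a toy class, not code from this repo, +plus pytest's `raises` helper): + +```python +from unittest.mock import Mock, create_autospec +from pytest import raises + +class Toy: + def meth(self, x): + return x + +def test_autospec_catches_interface_drift(): + loose = Mock() + loose.no_such_method.return_value = 42 # silently accepted; a test could pass for the wrong reason + assert loose.no_such_method() == 42 + + strict = create_autospec(Toy, spec_set=True, instance=True) + with raises(AttributeError): + getattr(strict, 'no_such_method') # not part of Toy's interface, so the mock rejects it +```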
+ +The drawback of using `spec_set=True` is that autospeccing is unaware of any instance variables +(e.g. `self.foo = foo_arg` in a constructor). The unittest documentation suggests +a number of approaches to get around this problem, but in the author's opinion the least +bad option is to create getters (and setters for mutable instance variables) for any instance +variables that need to be exposed in the class's public interface. + +## External services + +The rule of thumb is not to mock external services, but instead create a wrapper around the +external service, mock that, and test the wrapper with integration tests. In some cases this +is relatively simple, but other cases are much more difficult. + +If the service is easy to set up and run locally, an integration test with a live service +is likely the best choice. Databases like MongoDB often fit this category as it is quick to +download and run a binary or Docker image. + +If the service is more difficult to run locally, a mock server might be employed to mock the +service responses. This is dangerous because if the service API changes, the test results will +contain false positives. An example is using a mock server in the +[KBase auth2](https://github.com/kbase/auth2) repo to mock identity provider services, which +cannot be installed locally and cannot be incorporated into automated testing without +enormous difficulty.
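+ +As a closing illustration, a wrapper of the kind described above might look like the sketch below +(the injected client and its `fetch` method are placeholders, not an API from this repo). Callers +accept the wrapper via dependency injection, so their unit tests can mock it with +`create_autospec`, while a small number of integration tests exercise the wrapper against a real +or locally running service: + +```python +class SomeServiceWrapper: + """Thin translation layer over an external service client.""" + + def __init__(self, client): + # the third-party client is injected, not constructed here + self._client = client + + def get_record(self, record_id): + # translate the service's response into the minimal structure callers need + raw = self._client.fetch(record_id) + return {'id': raw['id'], 'name': raw['name']} +```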