From ecc2535d2f625b2f5eb73022db025cff10f7aaa5 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 22 Sep 2021 11:58:55 -0500 Subject: [PATCH 01/32] Create 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 96 ++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 docs/adrs/004-SplitAndAggregate.md diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md new file mode 100644 index 000000000..5057f5254 --- /dev/null +++ b/docs/adrs/004-SplitAndAggregate.md @@ -0,0 +1,96 @@ +# Replace KBParallels with another solution to avoid Deadlocks + +Date: 2021-09-22 + +[Related ADR](https://github.com/kbase/execution_engine2/blob/develop/docs/adrs/rework-batch-analysis-architecture.md) + +## Intro +Sometimes a calculation requires too many resources from one node (walltime, memory, disk), so the calculation gets spread across multiple machines. +The final step of the app that uses KBParallels is to create a report. This step may use results from all of the computed jobs to create the final report. +In order to do this, some apps use a mechanism called KBParallel. The apps are listed at [DATAUP-111](https://kbase-jira.atlassian.net/browse/DATAUP-111) + +## The current implementation of Batch Analysis in [kb_BatchApp](https://github.com/kbaseapps/kb_BatchApp) at KBase has the following issues: + +* Current UI is not adequate: Users shouldn’t have to code in order to run batch analysis. Also it’s difficult to do so, even for those familiar with KBase code (have to find object names) +* Dependency on [KBParallel](https://github.com/kbaseapps/KBParallel): any changes to KBParallel could affect KB Batch and subsequently all other apps. +* Queue deadlocking: users have a max of 10 slots in the queue, with the current implementation one is taken up just to manage the jobs. Could lead to deadlock scenarios +* Missing the ability to be able to run, manage and track jobs and their subjobs +* No good way to test and hard to benchmark or measure performance +* Code is split more than is necessary + + +Background: + +## Author(s) + +@bio-boris, @mrcreosote + +## Status + +Pending + +## Alternatives Considered + +* Writing a new Batch App +* Implementing a Dynamic Service +* Implementing a standalone Narrative Widget +* Implementing a new (set) of EE2 endpoint(s) + +## Decision Outcome + +After much [discussion](https://docs.google.com/document/d/1PoiOas-hqgHONNzmCVupjmNCyvUvcned5YfZf-5CxsI/edit#) [and research](https://docs.google.com/spreadsheets/d/1FGecELaEBAWQ7ljXsR29h5RdtmYV_jMyLSiJAWcm4mE/edit#gid=492998772) the decision was to create a new endpoint (or set of endpoints) in EE2 for the purpose of running batch analysis workflows at KBase. + +## Consequences + +We will have to implement a new narrative UI, however this was work that would happen regardless due as we are looking to improve the UX for batch upload and analysis at KBase. + +As far as we have discussed, this decision only seems to have positive consequences. We will be able to streamline the code structure at KBase, make batch analysis easier to test and improve usability and job management. 
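As a rough sketch of the kind of client call such an endpoint could enable (a minimal sketch only: the `run_job_batch` method name, the client import path, and all parameter shapes here are assumptions for illustration, not a settled API):

```python
from installed_clients.execution_engine2Client import execution_engine2

# Hypothetical sketch: submit a whole batch of app runs in one call to a
# new EE2 endpoint, so no user queue slot is spent on a "manager" job.
# Method name, import path, and parameter shapes are illustrative only.
ee2 = execution_engine2("https://kbase.us/services/ee2", token="<auth token>")

reads_refs = ["12345/2/1", "12345/3/1"]  # placeholder workspace object refs
child_jobs = [
    {
        "method": "kb_Bowtie2.align_reads_to_assembly_app",
        "app_id": "kb_Bowtie2/align_reads_to_assembly_app",
        "params": [{"input_ref": ref}],
    }
    for ref in reads_refs  # one child job per input object
]

batch = ee2.run_job_batch(child_jobs, {"wsid": 12345})  # batch-level params
print(batch["batch_id"], batch["child_job_ids"])
```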
+ +Initial architecture sketches for what this could look like have been started [here](https://miro.com/app/board/o9J_kmb4y4Q=/?moveToWidget=3074457350208108487&cot=12) + +Still to be determined (not in scope of this ADR): +* What other endpoints we may need such as cancel batch +* Other implementation details that are TBD: + * Will the Narrative poll EE2 for updates on jobs or subscribe to Kafka? + * Creating sets of objects and what to do at the end of a batch run + * What to do about a set if a child task fails during processing + +## Pros and Cons of the Alternatives + +### Writing a new Batch App + +* `+` We have a potential first pass at implementation [here](https://github.com/bio-boris/simplebatch) +* `+` Removes dependency on KB Parallel +* `+` In current implementation, no longer uses up a slot in the queue (however this could easily morph into a con depending on how we manage set creation. If we need a reduce step this could require us to use multiple slots.) +* `+` Offers some improvement upon job management +* `-` Code is still split between locations +* `-` No good way to test and hard to benchmark or measure performance +* `-` Tracking and managing sub jobs is still a challenge + +### Implementing a Dynamic Service +* `+` No longer uses an app +* `+` Custom endpoint +* `+` No longer uses a slot in the queue +* `+` Can manage jobs more closely: have tables, more control over lifecycle, resource requests, canceling subjobs +* `-` Hard to test (secret variables, dynamic services in app-dev and prod have same catalog, so hard to tell which service is where; service wizard can cause problems) +* `-` Takes microservices approach to the extreme - if we have a service for this it makes more sense for it to live within EE2 rather than be further separated +* `-` Locked into KB-SDK +* `-` Coupled with other dynamic services and Rancher1. DevOps can't easily modify the deployment of these without changes to the service wizard. + +### Implementing a standalone Narrative Widget +* `+` No longer uses an app +* `+` No longer uses a slot in the queue +* `+` Can reuse FE UI logic to build widget / cell, drive the batch submission process, and keep track of job progresses +* `-` Have to maintain new cell type (possible code duplication), adds extra load to eventual narrative refactor +* `-` Tightly coupled to Narrative UI means that if we want to run batch outside of narrative or in any other capacity work will have to be duplicated +* `-` Job resource and queue management has to happen in the narrative +* `-` If the narrative is closed or reaped, job state management stops + +### Implementing a new (set) of EE2 endpoint(s) +* `+` No longer uses a slot in the queue, deadlock is no longer an issue +* `+` No longer uses an app: more logical paradigm, services are easier to test, scale, monitor +* `+` More control over job lifecycle: management, tables, resource requests, subjob cancellation +* `+` Possible to control job priority in queue + * Batch jobs could have lower priority, so individual jobs could also run concurrently. E.g. A user could start a batch of 1000 imports, and still run some analysis task that would be able to start and finish first. 
+ * There’s a possibility for users (and/or admins) to control job priority in queue after submission +* [Code cohesion](https://en.wikipedia.org/wiki/Cohesion_(computer_science)) From 3c9ae6f0815e6070122ae6b9c98163894a5285fb Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 30 Sep 2021 13:58:10 -0500 Subject: [PATCH 02/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 5057f5254..d7a57e0bf 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -17,7 +17,7 @@ In order to do this, some apps use a mechanism called KBParallel. The apps are l * Missing the ability to be able to run, manage and track jobs and their subjobs * No good way to test and hard to benchmark or measure performance * Code is split more than is necessary - +* UI doesn't properly display progress of batch jobs Background: @@ -31,14 +31,13 @@ Pending ## Alternatives Considered -* Writing a new Batch App -* Implementing a Dynamic Service -* Implementing a standalone Narrative Widget -* Implementing a new (set) of EE2 endpoint(s) +* Ignore most issues and just make apps that run kbparallels limited to N instances of kbparallels per user to avoid deadlocks +* Writing new ee2 endpoints to entirely handle batch execution +* Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split, aggregate and a batch endpoint ## Decision Outcome -After much [discussion](https://docs.google.com/document/d/1PoiOas-hqgHONNzmCVupjmNCyvUvcned5YfZf-5CxsI/edit#) [and research](https://docs.google.com/spreadsheets/d/1FGecELaEBAWQ7ljXsR29h5RdtmYV_jMyLSiJAWcm4mE/edit#gid=492998772) the decision was to create a new endpoint (or set of endpoints) in EE2 for the purpose of running batch analysis workflows at KBase. + ## Consequences From 312b9f9fa50d2350348d84b1fd00405bf4772386 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 5 Oct 2021 09:05:48 -0500 Subject: [PATCH 03/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 43 ++++++++++++------------------ 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index d7a57e0bf..a3f76e80b 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -32,49 +32,40 @@ Pending ## Alternatives Considered * Ignore most issues and just make apps that run kbparallels limited to N instances of kbparallels per user to avoid deadlocks -* Writing new ee2 endpoints to entirely handle batch execution -* Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split, aggregate and a batch endpoint +* Writing new ee2 endpoints to entirely handle batch execution and possibly use a DAG +* Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split and aggregate and an use an ee2 endpoint to create a DAG ## Decision Outcome - +For the first pass, we would likely limit the number of kbparallel runs +For the next pass, we would want to create a comprehensive generalized solution to submit,split and aggregate, with recipes or conveniences for common operations for creating sets, reports, or things of that nature. +We would also want to do a user study on what we want from the UI and which functionality we want, as the UI may inform the design of the backend system. 
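As a rough illustration of the "recipes or conveniences" idea, a possible shape for a generalized split/aggregate recipe is sketched below; every name in it is hypothetical and the EE2 submission call is stubbed out:

```python
from typing import Any, Callable, Dict, List

# Hypothetical sketch of a generalized split/aggregate recipe: a split
# step fans one parameter dict out into per-job parameters, and an
# aggregate step folds the job outputs back into one result (e.g. a set
# object or a report). Nothing here exists in EE2 today.
class SplitAggregateRecipe:
    def __init__(
        self,
        split: Callable[[Dict[str, Any]], List[Dict[str, Any]]],
        aggregate: Callable[[List[Any]], Any],
    ):
        self.split = split          # params -> one param dict per sub job
        self.aggregate = aggregate  # sub job outputs -> final result


def run_recipe(
    recipe: SplitAggregateRecipe,
    params: Dict[str, Any],
    submit_and_wait: Callable[[Dict[str, Any]], Any],
) -> Any:
    """Fan out with recipe.split, run each sub job, then fan back in."""
    # submit_and_wait stands in for whatever EE2 call runs one sub job and
    # blocks until it finishes; retry and error handling are omitted.
    results = [submit_and_wait(p) for p in recipe.split(params)]
    return recipe.aggregate(results)


# Example recipe: one job per assembly ref, outputs collected into a set.
set_recipe = SplitAggregateRecipe(
    split=lambda p: [{"assembly_ref": r} for r in p["assembly_refs"]],
    aggregate=lambda outs: {"items": outs},
)
```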
## Consequences -We will have to implement a new narrative UI, however this was work that would happen regardless due as we are looking to improve the UX for batch upload and analysis at KBase. - -As far as we have discussed, this decision only seems to have positive consequences. We will be able to streamline the code structure at KBase, make batch analysis easier to test and improve usability and job management. - -Initial architecture sketches for what this could look like have been started [here](https://miro.com/app/board/o9J_kmb4y4Q=/?moveToWidget=3074457350208108487&cot=12) +* We will have to roll out fixes in multiple stages +* We will have to implement a new narrative UI, however this was work that would happen regardless due as we are looking to improve the UX for batch upload and analysis at KBase. +* This will take significant time to further research and engineer the solutions Still to be determined (not in scope of this ADR): -* What other endpoints we may need such as cancel batch -* Other implementation details that are TBD: - * Will the Narrative poll EE2 for updates on jobs or subscribe to Kafka? - * Creating sets of objects and what to do at the end of a batch run - * What to do about a set if a child task fails during processing +* UI and how it relates to the bulk execution +* XSV Analysis and how it relates to the bulk execution ## Pros and Cons of the Alternatives -### Writing a new Batch App +### Limit multiple instances of kbparallels + +* `+` Simplest solution, quickest turnaround, fixes deadlock issue +* `-` UI still broken for batch analysis -* `+` We have a potential first pass at implementation [here](https://github.com/bio-boris/simplebatch) -* `+` Removes dependency on KB Parallel -* `+` In current implementation, no longer uses up a slot in the queue (however this could easily morph into a con depending on how we manage set creation. If we need a reduce step this could require us to use multiple slots.) -* `+` Offers some improvement upon job management -* `-` Code is still split between locations -* `-` No good way to test and hard to benchmark or measure performance -* `-` Tracking and managing sub jobs is still a challenge +## Seperate Queue for kbparallels apps -### Implementing a Dynamic Service +### Deprecate kbparallels, and write new ee2 endpoints to entirely handle batch execution and possibly use a DAG * `+` No longer uses an app * `+` Custom endpoint * `+` No longer uses a slot in the queue * `+` Can manage jobs more closely: have tables, more control over lifecycle, resource requests, canceling subjobs -* `-` Hard to test (secret variables, dynamic services in app-dev and prod have same catalog, so hard to tell which service is where; service wizard can cause problems) -* `-` Takes microservices approach to the extreme - if we have a service for this it makes more sense for it to live within EE2 rather than be further separated -* `-` Locked into KB-SDK -* `-` Coupled with other dynamic services and Rancher1. DevOps can't easily modify the deployment of these without changes to the service wizard. 
+* `-` Still deadlocks and uses a slot in the queue ### Implementing a standalone Narrative Widget * `+` No longer uses an app From f05daffd52ca812aabdf363f9bc1dc311fbee538 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 5 Oct 2021 10:40:42 -0500 Subject: [PATCH 04/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 100 +++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 27 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index a3f76e80b..d88b57218 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -4,6 +4,10 @@ Date: 2021-09-22 [Related ADR](https://github.com/kbase/execution_engine2/blob/develop/docs/adrs/rework-batch-analysis-architecture.md) +## Note +This ADR is more of a place to keep the current discussions we had at https://docs.google.com/document/d/1AWjayMoqCoGkpO9-tjXxEvO40yYnFtcECbdne5vTURo +Rather than to make a decision. There is still more planning, scoping and testing involved before we can fully design this system. + ## Intro Sometimes a calculation requires too many resources from one node (walltime, memory, disk), so the calculation gets spread across multiple machines. The final step of the app that uses KBParallels is to create a report. This step may use results from all of the computed jobs to create the final report. @@ -14,7 +18,8 @@ In order to do this, some apps use a mechanism called KBParallel. The apps are l * Current UI is not adequate: Users shouldn’t have to code in order to run batch analysis. Also it’s difficult to do so, even for those familiar with KBase code (have to find object names) * Dependency on [KBParallel](https://github.com/kbaseapps/KBParallel): any changes to KBParallel could affect KB Batch and subsequently all other apps. * Queue deadlocking: users have a max of 10 slots in the queue, with the current implementation one is taken up just to manage the jobs. Could lead to deadlock scenarios -* Missing the ability to be able to run, manage and track jobs and their subjobs +* KBP can spawn other KBP jobs. Batch jobs can spawn other batch jobs. +* Missing the ability to be able to run, manage (cancel) and track jobs and their subjobs along with the ability to specify resources differently between the main and sub jobs * No good way to test and hard to benchmark or measure performance * Code is split more than is necessary * UI doesn't properly display progress of batch jobs @@ -27,15 +32,16 @@ Background: ## Status -Pending +Needs more research ## Alternatives Considered * Ignore most issues and just make apps that run kbparallels limited to N instances of kbparallels per user to avoid deadlocks * Writing new ee2 endpoints to entirely handle batch execution and possibly use a DAG * Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split and aggregate and an use an ee2 endpoint to create a DAG +* Different DevOps solutions -## Decision Outcome +## Decision Outcome (pending more research to iron out more details) For the first pass, we would likely limit the number of kbparallel runs For the next pass, we would want to create a comprehensive generalized solution to submit,split and aggregate, with recipes or conveniences for common operations for creating sets, reports, or things of that nature. 
@@ -58,29 +64,69 @@ Still to be determined (not in scope of this ADR): * `+` Simplest solution, quickest turnaround, fixes deadlock issue * `-` UI still broken for batch analysis -## Seperate Queue for kbparallels apps +## DEADLOCK: Increase number of slots or Seperate Queue for kbparallels apps without 10 job limit +* `+` Simple solutions, quick turnarounds, fixes deadlock issue +* `-` UI still broken for batch analysis +* `-` A small amount of users can take over the entire system +* `-` The calculations done by the apps will interfere with other apps and cause crashes/failures + +## DEADLOCK: Modify KBP to do only local submission, Move the job to a machine with larger resources +* `+` Simple solutions, quick turnarounds, fixes deadlock issue, fixes UI issues + * `-` We have a limited number of larger resources machines +* `-` Continued dependency on deprecated KBP tools -### Deprecate kbparallels, and write new ee2 endpoints to entirely handle batch execution and possibly use a DAG -* `+` No longer uses an app -* `+` Custom endpoint -* `+` No longer uses a slot in the queue +### Deprecate kbparallels, and write new ee2 endpoints to entirely handle split and aggregate +* `+` No longer uses an app, No longer uses a slot in the queue +* `-` All jobs would have to change to stop using KBP in favor of these via the ee2 client, * `+` Can manage jobs more closely: have tables, more control over lifecycle, resource requests, canceling subjobs -* `-` Still deadlocks and uses a slot in the queue - -### Implementing a standalone Narrative Widget -* `+` No longer uses an app -* `+` No longer uses a slot in the queue -* `+` Can reuse FE UI logic to build widget / cell, drive the batch submission process, and keep track of job progresses -* `-` Have to maintain new cell type (possible code duplication), adds extra load to eventual narrative refactor -* `-` Tightly coupled to Narrative UI means that if we want to run batch outside of narrative or in any other capacity work will have to be duplicated -* `-` Job resource and queue management has to happen in the narrative -* `-` If the narrative is closed or reaped, job state management stops - -### Implementing a new (set) of EE2 endpoint(s) -* `+` No longer uses a slot in the queue, deadlock is no longer an issue -* `+` No longer uses an app: more logical paradigm, services are easier to test, scale, monitor -* `+` More control over job lifecycle: management, tables, resource requests, subjob cancellation -* `+` Possible to control job priority in queue - * Batch jobs could have lower priority, so individual jobs could also run concurrently. E.g. A user could start a batch of 1000 imports, and still run some analysis task that would be able to start and finish first. - * There’s a possibility for users (and/or admins) to control job priority in queue after submission -* [Code cohesion](https://en.wikipedia.org/wiki/Cohesion_(computer_science)) +* `-` Job monitoring thread would have to run in ee2 +* `-` Requires new data structures and models in ee2 to make sure that the relationship is preserved between jobs and sub jobs, and to make sure deadlocking does not occur +* `-` Requires storing the job outside of condor to prevent job submission, unless we can mark jobs as runnable or not via the HTCondor API* +* `-` The endpoint would need to know how to split up input files and aggregate them + + +### Deprecate KBP and instead break out apps into 3 parts + +* Fan out (FO) +* Process in parallel (PIP) +* Fan in (FI) +### Steps: +1. 
User launches job as normal
2. Possibly the job is marked as a FO job, which makes it easier for the UI to display the job correctly initially. Ideally this would be marked in the spec, but that might be a lot of work; it could potentially be marked in the catalog UI instead (e.g. along with the job requirements)
3. Job figures out what the PIP / sub jobs should be
4. Job sends the following info to EE2
* Its own job ID
* The parameters for each of the sub jobs
* The app of the FI job, e.g. kb_phylogenomics/build_microbial_speciestree_reduce
* EE2 starts the subjobs and associates them with the FO job (Probably need retry handling for the subjobs to deal with transient errors)
5. Whenever a subjob finishes, EE2 checks to see if all the subjobs are finished
* If true, EE2 starts the FI job, providing the outputs of the subjobs as a list to the reduce job
* When the FI job is finished, the job is done.
* The various jobs can communicate by storing temporary data in the caching service or in the Blobstore. If the latter is used, the FI job should clean up the Blobstore nodes when it is complete.
* Could make a helper app for this?
* What about workflow engines (WDL, Cromwell)? Are we reinventing the wheel here?
* Can new EE2 endpoints speed up or reduce the complexity of any of these steps?

### Notes about DAG
```
Your dag would
need to have (at least) a first job followed by a SUBDAG EXTERNAL.
Somewhere in the first job you'd generate a new dag workflow that
defines the N clusters followed by the N+1 job, which runs in the
subdag.

As for DAGMan support in the Python bindings, we do this in the
following two ways:

1) There is a htcondor.Submit.from_dag() option which takes the name
of a dag filename. You then submit the resulting object just like any
regular job.
2) We have a htcondor.dags library which can be used to
programmatically construct a DAG workflow in computer memory, then
write to a .dag file and submit using the function mentioned in 1)
above.
```

Between these there are several different ways to do what you want.
+There's a useful example here that shows the general workflow in the +bindings: https://htcondor.readthedocs.io/en/latest/apis/python-bindings/tutorials/DAG-Creation-And-Submission.html#Describing-the-DAG-using-htcondor.dags From 802a89bc210c7bc03cb11dda37e12276dfde084b Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 5 Oct 2021 10:42:13 -0500 Subject: [PATCH 05/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 87 +++++++++++++++--------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index d88b57218..81809bfa4 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -34,12 +34,7 @@ Background: Needs more research -## Alternatives Considered -* Ignore most issues and just make apps that run kbparallels limited to N instances of kbparallels per user to avoid deadlocks -* Writing new ee2 endpoints to entirely handle batch execution and possibly use a DAG -* Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split and aggregate and an use an ee2 endpoint to create a DAG -* Different DevOps solutions ## Decision Outcome (pending more research to iron out more details) @@ -47,43 +42,6 @@ For the first pass, we would likely limit the number of kbparallel runs For the next pass, we would want to create a comprehensive generalized solution to submit,split and aggregate, with recipes or conveniences for common operations for creating sets, reports, or things of that nature. We would also want to do a user study on what we want from the UI and which functionality we want, as the UI may inform the design of the backend system. -## Consequences - -* We will have to roll out fixes in multiple stages -* We will have to implement a new narrative UI, however this was work that would happen regardless due as we are looking to improve the UX for batch upload and analysis at KBase. 
-* This will take significant time to further research and engineer the solutions - -Still to be determined (not in scope of this ADR): -* UI and how it relates to the bulk execution -* XSV Analysis and how it relates to the bulk execution - -## Pros and Cons of the Alternatives - -### Limit multiple instances of kbparallels - -* `+` Simplest solution, quickest turnaround, fixes deadlock issue -* `-` UI still broken for batch analysis - -## DEADLOCK: Increase number of slots or Seperate Queue for kbparallels apps without 10 job limit -* `+` Simple solutions, quick turnarounds, fixes deadlock issue -* `-` UI still broken for batch analysis -* `-` A small amount of users can take over the entire system -* `-` The calculations done by the apps will interfere with other apps and cause crashes/failures - -## DEADLOCK: Modify KBP to do only local submission, Move the job to a machine with larger resources -* `+` Simple solutions, quick turnarounds, fixes deadlock issue, fixes UI issues - * `-` We have a limited number of larger resources machines -* `-` Continued dependency on deprecated KBP tools - -### Deprecate kbparallels, and write new ee2 endpoints to entirely handle split and aggregate -* `+` No longer uses an app, No longer uses a slot in the queue -* `-` All jobs would have to change to stop using KBP in favor of these via the ee2 client, -* `+` Can manage jobs more closely: have tables, more control over lifecycle, resource requests, canceling subjobs -* `-` Job monitoring thread would have to run in ee2 -* `-` Requires new data structures and models in ee2 to make sure that the relationship is preserved between jobs and sub jobs, and to make sure deadlocking does not occur -* `-` Requires storing the job outside of condor to prevent job submission, unless we can mark jobs as runnable or not via the HTCondor API* -* `-` The endpoint would need to know how to split up input files and aggregate them - ### Deprecate KBP and instead break out apps into 3 parts @@ -130,3 +88,48 @@ above. Between these there are several different ways to do what you want. There's a useful example here that shows the general workflow in the bindings: https://htcondor.readthedocs.io/en/latest/apis/python-bindings/tutorials/DAG-Creation-And-Submission.html#Describing-the-DAG-using-htcondor.dags + +## Consequences + +* We will have to roll out fixes in multiple stages +* We will have to implement a new narrative UI, however this was work that would happen regardless due as we are looking to improve the UX for batch upload and analysis at KBase. 
+* This will take significant time to further research and engineer the solutions + +Still to be determined (not in scope of this ADR): +* UI and how it relates to the bulk execution +* XSV Analysis and how it relates to the bulk execution + +## Alternatives Considered + +* Ignore most issues and just make apps that run kbparallels limited to N instances of kbparallels per user to avoid deadlocks +* Writing new ee2 endpoints to entirely handle batch execution and possibly use a DAG +* Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split and aggregate and an use an ee2 endpoint to create a DAG +* Different DevOps solutions + + +## Pros and Cons of the Alternatives + +### Limit multiple instances of kbparallels + +* `+` Simplest solution, quickest turnaround, fixes deadlock issue +* `-` UI still broken for batch analysis + +## DEADLOCK: Increase number of slots or Seperate Queue for kbparallels apps without 10 job limit +* `+` Simple solutions, quick turnarounds, fixes deadlock issue +* `-` UI still broken for batch analysis +* `-` A small amount of users can take over the entire system +* `-` The calculations done by the apps will interfere with other apps and cause crashes/failures + +## DEADLOCK: Modify KBP to do only local submission, Move the job to a machine with larger resources +* `+` Simple solutions, quick turnarounds, fixes deadlock issue, fixes UI issues + * `-` We have a limited number of larger resources machines +* `-` Continued dependency on deprecated KBP tools + +### Deprecate kbparallels, and write new ee2 endpoints to entirely handle split and aggregate +* `+` No longer uses an app, No longer uses a slot in the queue +* `-` All jobs would have to change to stop using KBP in favor of these via the ee2 client, +* `+` Can manage jobs more closely: have tables, more control over lifecycle, resource requests, canceling subjobs +* `-` Job monitoring thread would have to run in ee2 +* `-` Requires new data structures and models in ee2 to make sure that the relationship is preserved between jobs and sub jobs, and to make sure deadlocking does not occur +* `-` Requires storing the job outside of condor to prevent job submission, unless we can mark jobs as runnable or not via the HTCondor API* +* `-` The endpoint would need to know how to split up input files and aggregate them From 31c6c79e7ff5a81a17f7341f43b877125ca37d00 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 5 Oct 2021 10:42:45 -0500 Subject: [PATCH 06/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 81809bfa4..29bd80ed5 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -24,16 +24,12 @@ In order to do this, some apps use a mechanism called KBParallel. 
The apps are l * Code is split more than is necessary * UI doesn't properly display progress of batch jobs -Background: - ## Author(s) @bio-boris, @mrcreosote ## Status - -Needs more research - +Needs more planning, but current ideas are documented here ## Decision Outcome (pending more research to iron out more details) From 65a7d7e42799ac64d0cbc90c6d68addb31c276e8 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 5 Oct 2021 10:44:45 -0500 Subject: [PATCH 07/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 29bd80ed5..e41b17b45 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -7,6 +7,11 @@ Date: 2021-09-22 ## Note This ADR is more of a place to keep the current discussions we had at https://docs.google.com/document/d/1AWjayMoqCoGkpO9-tjXxEvO40yYnFtcECbdne5vTURo Rather than to make a decision. There is still more planning, scoping and testing involved before we can fully design this system. +Still to be determined (not in scope of this ADR): + + UI and how it relates to the bulk execution + XSV Analysis and how it relates to the bulk execution + ## Intro Sometimes a calculation requires too many resources from one node (walltime, memory, disk), so the calculation gets spread across multiple machines. @@ -34,8 +39,10 @@ Needs more planning, but current ideas are documented here ## Decision Outcome (pending more research to iron out more details) -For the first pass, we would likely limit the number of kbparallel runs +For the first pass, we would likely limit the number of kbparallel runs. + For the next pass, we would want to create a comprehensive generalized solution to submit,split and aggregate, with recipes or conveniences for common operations for creating sets, reports, or things of that nature. + We would also want to do a user study on what we want from the UI and which functionality we want, as the UI may inform the design of the backend system. @@ -44,6 +51,8 @@ We would also want to do a user study on what we want from the UI and which func * Fan out (FO) * Process in parallel (PIP) * Fan in (FI) + + ### Steps: 1. User launches job as normal 2. Possibly the job is marked as a FO job, Makes it easier for the UI to display the job correctly initially, Ideally would be marked in the spec, but this might be a lot of work Could potentially be marked in the catalog UI (e.g. along with the job requirements) @@ -61,10 +70,9 @@ We would also want to do a user study on what we want from the UI and which func * What about workflow engines (WDL, Cromwell)? Are we reinventing the wheel here? * Can new EE2 endpoints speed up or reduce the complexity of any of these steps? -### Notes about DAG +### Notes about DAG in ee2 Endpoints ``` -Your dag would -need to have (at least) a first job followed by a SUBDAG EXTERNAL. +Your dag would need to have (at least) a first job followed by a SUBDAG EXTERNAL. Somewhere in the first job you'd generate a new dag workflow that defines the N clusters followed by the N+1 job, which runs in the subdag. @@ -82,6 +90,7 @@ above. ``` Between these there are several different ways to do what you want. 
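As a concrete sketch of option 2) above (submitted via option 1)), the snippet below builds a split/process/aggregate workflow with `htcondor.dags`; the executables and the chunk count are placeholders, and a working HTCondor pool is assumed:

```python
from pathlib import Path

import htcondor
from htcondor import dags

# Placeholder submit descriptions for the three stages of the workflow.
split_job = htcondor.Submit({"executable": "split_input.sh"})
work_job = htcondor.Submit({"executable": "process_chunk.sh",
                            "arguments": "$(chunk)"})
reduce_job = htcondor.Submit({"executable": "aggregate_results.sh"})

dag = dags.DAG()
split_layer = dag.layer(name="split", submit_description=split_job)
# Fan out: N worker jobs run after the split job finishes...
work_layer = split_layer.child_layer(
    name="work",
    submit_description=work_job,
    vars=[{"chunk": str(i)} for i in range(10)],
)
# ...and fan in: one aggregate job runs after every worker finishes.
work_layer.child_layer(name="aggregate", submit_description=reduce_job)

dag_file = dags.write_dag(dag, Path("."))         # writes dagfile.dag
submit = htcondor.Submit.from_dag(str(dag_file))  # approach 1) above
htcondor.Schedd().submit(submit)
```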
+ There's a useful example here that shows the general workflow in the bindings: https://htcondor.readthedocs.io/en/latest/apis/python-bindings/tutorials/DAG-Creation-And-Submission.html#Describing-the-DAG-using-htcondor.dags From 78d1849d8dfb26af8f37b1f6ac9037b00295c85b Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 5 Oct 2021 10:45:21 -0500 Subject: [PATCH 08/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index e41b17b45..0a6b17379 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -7,10 +7,10 @@ Date: 2021-09-22 ## Note This ADR is more of a place to keep the current discussions we had at https://docs.google.com/document/d/1AWjayMoqCoGkpO9-tjXxEvO40yYnFtcECbdne5vTURo Rather than to make a decision. There is still more planning, scoping and testing involved before we can fully design this system. -Still to be determined (not in scope of this ADR): - UI and how it relates to the bulk execution - XSV Analysis and how it relates to the bulk execution +Still to be determined (not in scope of this ADR): +* UI and how it relates to the bulk execution +* XSV Analysis and how it relates to the bulk execution ## Intro @@ -111,7 +111,6 @@ Still to be determined (not in scope of this ADR): * Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split and aggregate and an use an ee2 endpoint to create a DAG * Different DevOps solutions - ## Pros and Cons of the Alternatives ### Limit multiple instances of kbparallels From eed6fbed88615dfb48d05752af197b1d5957b99d Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 15 Nov 2021 09:09:07 -0600 Subject: [PATCH 09/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 0a6b17379..4286e3d48 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -22,7 +22,7 @@ In order to do this, some apps use a mechanism called KBParallel. The apps are l * Current UI is not adequate: Users shouldn’t have to code in order to run batch analysis. Also it’s difficult to do so, even for those familiar with KBase code (have to find object names) * Dependency on [KBParallel](https://github.com/kbaseapps/KBParallel): any changes to KBParallel could affect KB Batch and subsequently all other apps. -* Queue deadlocking: users have a max of 10 slots in the queue, with the current implementation one is taken up just to manage the jobs. Could lead to deadlock scenarios +* Queue deadlocking: users have a max of 10 slots in the queue, with the current implementation one management job is created to manage the jobs that it submits. This could lead to deadlock scenarios, as there can be 10 management jobs waiting to submit computation jobs, but they cannot, as there all slots are being used up. * KBP can spawn other KBP jobs. Batch jobs can spawn other batch jobs. 
* Missing the ability to be able to run, manage (cancel) and track jobs and their subjobs along with the ability to specify resources differently between the main and sub jobs * No good way to test and hard to benchmark or measure performance From 60489bc162f380337c7c8c4ad6adcfe9bcad9452 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 15 Nov 2021 09:16:16 -0600 Subject: [PATCH 10/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 4286e3d48..59fbbd758 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -116,17 +116,17 @@ Still to be determined (not in scope of this ADR): ### Limit multiple instances of kbparallels * `+` Simplest solution, quickest turnaround, fixes deadlock issue -* `-` UI still broken for batch analysis +* `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs -## DEADLOCK: Increase number of slots or Seperate Queue for kbparallels apps without 10 job limit +### Increase number of slots or Seperate Queue for kbparallels apps without 10 job limit * `+` Simple solutions, quick turnarounds, fixes deadlock issue -* `-` UI still broken for batch analysis +* `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs * `-` A small amount of users can take over the entire system * `-` The calculations done by the apps will interfere with other apps and cause crashes/failures -## DEADLOCK: Modify KBP to do only local submission, Move the job to a machine with larger resources +### Modify KBP to do only local submission, Move the job to a machine with larger resources * `+` Simple solutions, quick turnarounds, fixes deadlock issue, fixes UI issues - * `-` We have a limited number of larger resources machines +* `-` We have a limited number of larger resources machines * `-` Continued dependency on deprecated KBP tools ### Deprecate kbparallels, and write new ee2 endpoints to entirely handle split and aggregate From 7cafa3d09eb468c7b80c245ced36f8fd92e833d7 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 15 Nov 2021 09:24:59 -0600 Subject: [PATCH 11/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 59fbbd758..6ae757577 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -16,7 +16,14 @@ Still to be determined (not in scope of this ADR): ## Intro Sometimes a calculation requires too many resources from one node (walltime, memory, disk), so the calculation gets spread across multiple machines. The final step of the app that uses KBParallels is to create a report. This step may use results from all of the computed jobs to create the final report. -In order to do this, some apps use a mechanism called KBParallel. The apps are listed at [DATAUP-111](https://kbase-jira.atlassian.net/browse/DATAUP-111) +In order to do this, some apps use a mechanism called KBParallel. The apps listed at [git](https://github.com/search?q=kbparallel&type=code) all use kbparallels. There are 7 known apps +* kb_Bowtie2 +* refseq_importer +* kb_concoct +* kb_phylogenomics +* kb_hisat2 +* kb_meta_decoder +* kb_Bwa. 
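For reference, the sketch below shows roughly how these apps drive KBParallel today (based on the common `run_batch` usage pattern; the task values are illustrative and the exact parameter set may vary by app and KBParallel version):

```python
from KBParallel.KBParallelClient import KBParallel


def run_alignments_in_parallel(callback_url, reads_refs, assembly_ref):
    """Illustrative fan-out of one alignment task per reads object."""
    # The callback_url is provided to every SDK app by the runtime.
    parallel_runner = KBParallel(callback_url)
    tasks = [
        {
            "module_name": "kb_Bowtie2",
            "function_name": "align_one_reads_to_assembly",  # hypothetical function
            "version": "release",
            "parameters": {"reads_ref": r, "assembly_ref": assembly_ref},
        }
        for r in reads_refs
    ]
    # The KBParallel job itself occupies one of the user's queue slots
    # while it farms the tasks out as separate jobs -- the root of the
    # deadlock issue described below.
    return parallel_runner.run_batch({
        "tasks": tasks,
        "runner": "parallel",
        "max_retries": 2,
    })
```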
## The current implementation of Batch Analysis in [kb_BatchApp](https://github.com/kbaseapps/kb_BatchApp) at KBase has the following issues: From 2de187ae46519b31379212f5215036f95df32b44 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 15 Nov 2021 09:25:17 -0600 Subject: [PATCH 12/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 6ae757577..e1c4fc234 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -23,7 +23,7 @@ In order to do this, some apps use a mechanism called KBParallel. The apps liste * kb_phylogenomics * kb_hisat2 * kb_meta_decoder -* kb_Bwa. +* kb_Bwa ## The current implementation of Batch Analysis in [kb_BatchApp](https://github.com/kbaseapps/kb_BatchApp) at KBase has the following issues: From c1e9f1527a75fa65de00284ba74a72581a61cbaa Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 15 Nov 2021 09:25:56 -0600 Subject: [PATCH 13/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index e1c4fc234..6caf88dce 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -16,7 +16,7 @@ Still to be determined (not in scope of this ADR): ## Intro Sometimes a calculation requires too many resources from one node (walltime, memory, disk), so the calculation gets spread across multiple machines. The final step of the app that uses KBParallels is to create a report. This step may use results from all of the computed jobs to create the final report. -In order to do this, some apps use a mechanism called KBParallel. The apps listed at [git](https://github.com/search?q=kbparallel&type=code) all use kbparallels. There are 7 known apps +In order to do this, the following apps use a mechanism called KBParallel * kb_Bowtie2 * refseq_importer * kb_concoct From 8e1596973de8151c6485d1a406308089d54fcbf8 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 15 Nov 2021 15:02:58 -0600 Subject: [PATCH 14/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 6caf88dce..56f038b3c 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -125,12 +125,21 @@ Still to be determined (not in scope of this ADR): * `+` Simplest solution, quickest turnaround, fixes deadlock issue * `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs -### Increase number of slots or Seperate Queue for kbparallels apps without 10 job limit -* `+` Simple solutions, quick turnarounds, fixes deadlock issue +### Increase number of slots per user > 10 +* `+` Simple solutions, quick turnarounds, fixes deadlock issue for small numbers of jobs. +* `-` Doesn't fix deadlock issue as the user can still submit more KBP jobs * `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs * `-` A small amount of users can take over the entire system * `-` The calculations done by the apps will interfere with other apps and cause crashes/failures +### Seperate Queue for kbparallels apps that may or may not have its own limit to running jobs. 
+* `+` Simple solutions, quick turnarounds, fixes deadlock issue +* `+` Requires minimum changes to ee2 and condor if condor supports this feature +* `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs +* `-` A small amount of users can take over the entire system unless the new queue has its own limit to running jobs then it prevents users from taking over. +* `-` The calculations done by the apps will interfere with other apps and cause crashes/failures + + ### Modify KBP to do only local submission, Move the job to a machine with larger resources * `+` Simple solutions, quick turnarounds, fixes deadlock issue, fixes UI issues * `-` We have a limited number of larger resources machines From db694914e511cd83358e644926316276ebbd816b Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 15 Nov 2021 18:27:46 -0600 Subject: [PATCH 15/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 56f038b3c..86eaee45b 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -136,7 +136,7 @@ Still to be determined (not in scope of this ADR): * `+` Simple solutions, quick turnarounds, fixes deadlock issue * `+` Requires minimum changes to ee2 and condor if condor supports this feature * `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs -* `-` A small amount of users can take over the entire system unless the new queue has its own limit to running jobs then it prevents users from taking over. +* `-` A small amount of users can take over the new queue unless the new queue has its own limit to running jobs then it prevents users from taking over. * `-` The calculations done by the apps will interfere with other apps and cause crashes/failures From 862baa760810b326581fdc9484ff767db74203f4 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Mon, 15 Nov 2021 18:36:53 -0600 Subject: [PATCH 16/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 86eaee45b..9b5b20dea 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -120,7 +120,7 @@ Still to be determined (not in scope of this ADR): ## Pros and Cons of the Alternatives -### Limit multiple instances of kbparallels +### Limit multiple instances of kbparallels by keeping a list of apps that use it * `+` Simplest solution, quickest turnaround, fixes deadlock issue * `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs @@ -129,15 +129,15 @@ Still to be determined (not in scope of this ADR): * `+` Simple solutions, quick turnarounds, fixes deadlock issue for small numbers of jobs. 
* `-` Doesn't fix deadlock issue as the user can still submit more KBP jobs * `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs -* `-` A small amount of users can take over the entire system -* `-` The calculations done by the apps will interfere with other apps and cause crashes/failures +* `-` A small amount of users can take over the entire system by being able to submit more than 10 jobs +* `-` > 10 nodes will be taken up by jobs that do little computation as each job gets its own node -### Seperate Queue for kbparallels apps that may or may not have its own limit to running jobs. -* `+` Simple solutions, quick turnarounds, fixes deadlock issue -* `+` Requires minimum changes to ee2 and condor if condor supports this feature +### Seperate Queue for kbparallels apps that allows multiple instances of apps that use kbparallels running on the same machine +* `+` Simple solutions, quick turnarounds, fixes deadlock issue , requires changes to ee2 using an allowlist and condor to make jobs in the new queue a consumable resource. * `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs -* `-` A small amount of users can take over the new queue unless the new queue has its own limit to running jobs then it prevents users from taking over. -* `-` The calculations done by the apps will interfere with other apps and cause crashes/failures +* `-` A small amount of users can take over the new queue unless the new queue has its own limit to running jobs then it prevents users from taking over. +* `-` The calculations done by the apps will interfere with other apps in the new queue still and cause crashes/failures since they are doing more than managing the jobs right now. This would be solved by not allowing multple instances of apps that use KBP, but then it would waste a lot of resources. + ### Modify KBP to do only local submission, Move the job to a machine with larger resources From 9fb7e18f92e14fc5eba8262f252b72bf4bfc26fd Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 16 Nov 2021 10:26:51 -0600 Subject: [PATCH 17/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index 9b5b20dea..f7819d349 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -130,15 +130,19 @@ Still to be determined (not in scope of this ADR): * `-` Doesn't fix deadlock issue as the user can still submit more KBP jobs * `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs * `-` A small amount of users can take over the entire system by being able to submit more than 10 jobs -* `-` > 10 nodes will be taken up by jobs that do little computation as each job gets its own node - -### Seperate Queue for kbparallels apps that allows multiple instances of apps that use kbparallels running on the same machine -* `+` Simple solutions, quick turnarounds, fixes deadlock issue , requires changes to ee2 using an allowlist and condor to make jobs in the new queue a consumable resource. -* `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs -* `-` A small amount of users can take over the new queue unless the new queue has its own limit to running jobs then it prevents users from taking over. 
-* `-` The calculations done by the apps will interfere with other apps in the new queue still and cause crashes/failures since they are doing more than managing the jobs right now. This would be solved by not allowing multple instances of apps that use KBP, but then it would waste a lot of resources. - - +* `-` > 10 nodes will continue be taken up by jobs that do little computation as each job gets its own node + +### LIMIT KBP jobs to a maximum of 5 active jobs per user +* `+` Simple solution requires ee2 to maintain list of KBP apps, and add a KBP_LIMIT to jobs from this list. [Condor](https://github.com/kbase/condor/pull/26) will need KBP_LIMIT Added +* `+` List of apps is not frequently updated +* `-` If a new app uses KBP and their app is not on the list, it won't be limited by the KBP_LIMIT unless the owner lets us know. +* `-` If an existing app no longer uses KBP, their app is still limited unless the owner lets us know. +* `-` Nodes will continue be taken up by jobs that do little computation as each job gets its own node. + +### LIMIT KBP jobs to a maximum of 5 active jobs per user + Seperate queue for kbparallels apps +* `+` Allows us to group up KBP jobs onto fewer machines, instead of giving them their entire node +* `-` Requires going through each app and understanding the worst case computational needs in order to set the estimated cpu and memory needs for each app +* `-` Apps can interfere with other innocent apps and take them down ### Modify KBP to do only local submission, Move the job to a machine with larger resources * `+` Simple solutions, quick turnarounds, fixes deadlock issue, fixes UI issues From f98620690d1fd325d13bac25f0e39761b4d98044 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 16 Nov 2021 10:30:42 -0600 Subject: [PATCH 18/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index f7819d349..a3126496c 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -143,7 +143,8 @@ Still to be determined (not in scope of this ADR): * `+` Allows us to group up KBP jobs onto fewer machines, instead of giving them their entire node * `-` Requires going through each app and understanding the worst case computational needs in order to set the estimated cpu and memory needs for each app * `-` Apps can interfere with other innocent apps and take them down - +* `-` Creating a new queue requires balancing between how many active KBP nodes there vs how many nodes are available for other NJS jobs. 
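A hedged sketch of how the KBP_LIMIT idea above could map onto HTCondor's built-in concurrency limits (the limit name, the default of 5, and the wrapper script are illustrative, not a deployed configuration):

```python
import htcondor

# With a pool configuration line such as
#   CONCURRENCY_LIMIT_DEFAULT_KBP = 5
# every limit named "kbp.<something>" allows at most 5 concurrently
# running jobs, so tagging each KBP job with a per-user token caps each
# user at 5 running KBP jobs without hard-coding a user list.
def submit_kbp_job(username: str, wrapper: str = "run_kbp_app.sh"):
    job = htcondor.Submit({
        "executable": wrapper,
        # EE2 would add this only for app_ids on the KBP allowlist
        "concurrency_limits": f"kbp.{username}",
    })
    return htcondor.Schedd().submit(job)
```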
+ ### Modify KBP to do only local submission, Move the job to a machine with larger resources * `+` Simple solutions, quick turnarounds, fixes deadlock issue, fixes UI issues * `-` We have a limited number of larger resources machines From 1b9ad89f123ee94838af5c778c182e297e8b4267 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Thu, 18 Nov 2021 15:11:24 -0600 Subject: [PATCH 19/32] Update 004-SplitAndAggregate.md --- docs/adrs/004-SplitAndAggregate.md | 49 +++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md index a3126496c..2fdf06591 100644 --- a/docs/adrs/004-SplitAndAggregate.md +++ b/docs/adrs/004-SplitAndAggregate.md @@ -117,13 +117,14 @@ Still to be determined (not in scope of this ADR): * Writing new ee2 endpoints to entirely handle batch execution and possibly use a DAG * Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split and aggregate and an use an ee2 endpoint to create a DAG * Different DevOps solutions +* Rewriting KBP or swapping it out for a lightweight alternative that has a subset of the KBP features ## Pros and Cons of the Alternatives -### Limit multiple instances of kbparallels by keeping a list of apps that use it - -* `+` Simplest solution, quickest turnaround, fixes deadlock issue -* `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs +### General Notes +* With the current implementation of KBP, Having a separate KBP queue with multiple machines can save a spot from a user's 10 job maximum for running more jobs, but takes up / wastes compute resources (especially if the nodes sit idle). The user still gets 10 jobs, but there are less spots for jobs to run overall in the system if we make another queue, as this requires taking up more compute nodes that are currently dedicated to the NJS queue. +* Without changing apps that use KBP, running multiple KBP apps on the same machine can interfere with each other and we want to avoid this. +* If we scrap KBP in favor of a "lightweight alternative" we can avoid some of the previous issues, if we modify all apps that use KBP to use a lightweight alternative. A lightweight alternative would have to guarantee that no computation besides job management occured, and then we could have the management jobs sit and wait for other jobs without interfering with other jobs on the system. ### Increase number of slots per user > 10 * `+` Simple solutions, quick turnarounds, fixes deadlock issue for small numbers of jobs. @@ -131,24 +132,58 @@ Still to be determined (not in scope of this ADR): * `-` Addresses only the deadlocking issue, UI still broken for regular runs and batch runs * `-` A small amount of users can take over the entire system by being able to submit more than 10 jobs * `-` > 10 nodes will continue be taken up by jobs that do little computation as each job gets its own node +* `-` Capacity is still wasted, as some KBP jobs sit around waiting for other jobs to run -### LIMIT KBP jobs to a maximum of 5 active jobs per user +### LIMIT KBP jobs to a maximum of N<10 active KBP jobs per user * `+` Simple solution requires ee2 to maintain list of KBP apps, and add a KBP_LIMIT to jobs from this list. 
[Condor](https://github.com/kbase/condor/pull/26) will need KBP_LIMIT Added * `+` List of apps is not frequently updated +* `+` Apps do not need to be modified * `-` If a new app uses KBP and their app is not on the list, it won't be limited by the KBP_LIMIT unless the owner lets us know. * `-` If an existing app no longer uses KBP, their app is still limited unless the owner lets us know. * `-` Nodes will continue be taken up by jobs that do little computation as each job gets its own node. +* `-` Users may not be able to effectively use up their 10 job spots +* `-` Capacity is still wasted, as some KBP jobs sit around waiting for other jobs to run -### LIMIT KBP jobs to a maximum of 5 active jobs per user + Seperate queue for kbparallels apps +### LIMIT KBP jobs to a maximum of N<10 active jobs per user + Seperate queue for kbparallels apps +* `+` Same pros as above * `+` Allows us to group up KBP jobs onto fewer machines, instead of giving them their entire node * `-` Requires going through each app and understanding the worst case computational needs in order to set the estimated cpu and memory needs for each app * `-` Apps can interfere with other innocent apps and take them down * `-` Creating a new queue requires balancing between how many active KBP nodes there vs how many nodes are available for other NJS jobs. +* `-` Capacity is still wasted, as some KBP jobs sit around waiting for other jobs to run + + +### Build KBP Lightweight Version + KBP Queue + + +#### Design of new verison +* All apps must be modified to use the new KBP lightweight version, which will: +* Can either modify KBP, or create a new tool/package to use instead of KBP + +1) Launch a management job called the *Job Manager* that sits in the KBP Queue, alongside other KBP jobs. 
+2) Launch a new NJS queue job called the *Setup Job* which will
+   1) Use the User Parameters and/or
+   2) Download the Data from the initial parameters (Optional)
+   3) The results of information gathered from the initial download and/or parameters will be sent to the *Job Manager*
+3) The *Job Manager* now has enough parameters to set up *Fan Out* Jobs, and will send out jobs and wait for them (and possibly retry them upon failure)
+4) Fan Out jobs download data and perform calculations, save them back to the system, and return references to the saved objects
+5) The *Job Manager* optionally launches a *Group/Reduce* job based on User Parameters and/or the results of *Fan Out* Jobs
+6) The *Group/Reduce* job downloads objects from the system, and creates a set or other grouping, and then saves the set back to the system
+7) The *Job Manager* sends the results back to a *Report* Job, which downloads the final data and uploads a report based on that data
+8) The *Job Manager* returns the reference to the results of the *Report Job*
+
+Pros/Cons
+* `-` Only addresses the deadlocking issue; UI still broken for regular runs and batch runs if we re-use KBP
+* `+` All KBP jobs can run on a small subset of machines
+* `+` Job Manager
+* `+` Minimal changes to ee2 required
+
-### Modify KBP to do only local submission, and move the job to a machine with larger resources
+### Modify Apps to do only local submission by removing KBP, and moving the job
 * `+` Simple solution, quick turnaround; fixes the deadlock issue and the UI issues
 * `-` We have a limited number of larger-resource machines
 * `-` Continued dependency on deprecated KBP tools
+* `-` App runs may take longer since fewer resources may be available to the app run

 ### Deprecate kbparallels, and write new ee2 endpoints to entirely handle split and aggregate
 * `+` No longer uses an app; no longer uses a slot in the queue

From bebaad7d03ea772ee8ea194353f4036d0446f19b Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Thu, 18 Nov 2021 15:12:26 -0600
Subject: [PATCH 20/32] Update 004-SplitAndAggregate.md

---
 docs/adrs/004-SplitAndAggregate.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md
index 2fdf06591..52622b383 100644
--- a/docs/adrs/004-SplitAndAggregate.md
+++ b/docs/adrs/004-SplitAndAggregate.md
@@ -162,9 +162,9 @@ Still to be determined (not in scope of this ADR):
 1) Launch a management job called the *Job Manager* that sits in the KBP Queue, alongside other KBP jobs.
 2) Launch a new NJS queue job called the *Setup Job* which will
-   1) Use the User Parameters and/or
-   2) Download the Data from the initial parameters (Optional)
-   3) The results of information gathered from the initial download and/or parameters will be sent to the *Job Manager*
+   * Use the User Parameters and/or
+   * Download the Data from the initial parameters (Optional)
+   * The results of information gathered from the initial download and/or parameters will be sent to the *Job Manager*
 3) The *Job Manager* now has enough parameters to set up *Fan Out* Jobs, and will send out jobs and wait for them (and possibly retry them upon failure)
 4) Fan Out jobs download data and perform calculations, save them back to the system, and return references to the saved objects
 5) The *Job Manager* optionally launches a *Group/Reduce* job based on User Parameters and/or the results of *Fan Out* Jobs

From 39eec885df428191f20199c2b7d2a25352bc0349 Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Thu, 18 Nov 2021 16:09:40 -0600
Subject: [PATCH 21/32] Update 004-SplitAndAggregate.md

---
 docs/adrs/004-SplitAndAggregate.md | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md
index 52622b383..bdd1905ca 100644
--- a/docs/adrs/004-SplitAndAggregate.md
+++ b/docs/adrs/004-SplitAndAggregate.md
@@ -160,13 +160,14 @@ Still to be determined (not in scope of this ADR):
 * All apps must be modified to use the new KBP lightweight version, which will work as follows:
 * We can either modify KBP or create a new tool/package to use instead of KBP

-1) Launch a management job called the *Job Manager* that sits in the KBP Queue, alongside other KBP jobs.
-2) Launch a new NJS queue job called the *Setup Job* which will
-   * Use the User Parameters and/or
-   * Download the Data from the initial parameters (Optional)
-   * The results of information gathered from the initial download and/or parameters will be sent to the *Job Manager*
+1) Launch a management job called the *Job Manager* that sits in the KBP Queue, alongside other KBP jobs. Other jobs are launched in the NJS queue.
+2) Launch the *Setup Job* which will
+   * Use the *User Parameters* and/or
+   * Optionally Download the Data from the *User Parameters* to figure out *Job Manager* parameters
+   * Use the results of information gathered from the initial download and/or *User Parameters*
+   * Generate final parameters to be sent to the *Job Manager* to launch *Fan Out* jobs
 3) The *Job Manager* now has enough parameters to set up *Fan Out* Jobs, and will send out jobs and wait for them (and possibly retry them upon failure)
-4) Fan Out jobs download data and perform calculations, save them back to the system, and return references to the saved objects
+4) *Fan Out* jobs download data and perform calculations, save them back to the system, and return references to the saved objects
 5) The *Job Manager* optionally launches a *Group/Reduce* job based on User Parameters and/or the results of *Fan Out* Jobs
 6) The *Group/Reduce* job downloads objects from the system, and creates a set or other grouping, and then saves the set back to the system
 7) The *Job Manager* sends the results back to a *Report* Job, which downloads the final data and uploads a report based on that data
@@ -174,9 +175,11 @@ Still to be determined (not in scope of this ADR):
 
 Pros/Cons
 * `-` Only addresses the deadlocking issue; UI still broken for regular runs and batch runs if we re-use KBP
-* `+` All KBP jobs can run on a small subset of machines
-* `+` Job Manager
-* `+` Minimal changes to ee2 required
+* `-` Have to rewrite all apps that use KBP to use this new paradigm
+* `-` Will have to upload data and download data more times as no computation is done in the *Job Manager*
+* `+` All KBP jobs can run on a small subset of machines, deadlock issue is fixed
+* `+` No changes to ee2 required
+
 ### Modify Apps to do only local submission by removing KBP, and moving the job

From 907cdca8819ce8e263c58ee9cfcfb98842d0ff55 Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Tue, 23 Nov 2021 10:23:21 -0600
Subject: [PATCH 22/32] Update 004-SplitAndAggregate.md

---
 docs/adrs/004-SplitAndAggregate.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md
index bdd1905ca..66724f911 100644
--- a/docs/adrs/004-SplitAndAggregate.md
+++ b/docs/adrs/004-SplitAndAggregate.md
@@ -189,10 +189,11 @@ Pros/Cons
 * `-` App runs may take longer since fewer resources may be available to the app run

 ### Deprecate kbparallels, and write new ee2 endpoints to entirely handle split and aggregate
+* See [ADR](https://github.com/kbase/execution_engine2/blob/develop/docs/adrs/rework-batch-analysis-architecture.md#implementing-a-new-set-of-ee2-endpoints)
 * `+` No longer uses an app; no longer uses a slot in the queue
 * `-` All jobs would have to change to stop using KBP in favor of these via the ee2 client
 * `+` Can manage jobs more closely: have tables, more control over lifecycle, resource requests, canceling subjobs
 * `-` Job monitoring thread would have to run in ee2
 * `-` Requires new data structures and models in ee2 to make sure that the relationship is preserved between jobs and sub jobs, and to make sure deadlocking does not occur
-* `-` Requires storing the job outside of condor to prevent job submission, unless we can mark jobs as runnable or not via the HTCondor API*
-* `-` The endpoint would need to know how to split up input files and aggregate them
+* `-` Requires storing the job outside of condor to prevent job submission, unless we can mark jobs as runnable or not via the HTCondor
API (this is because we do not want to use an HTCondor DAG to manage the running order; submitted jobs would otherwise start in random order, so marking jobs as runnable guarantees order)
+* `-` The endpoint would need to know how to split up input files and aggregate them (a significant undertaking)

From a8eba2e67d3b7265eb5dfe2a8fc1db7a3b925615 Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Tue, 23 Nov 2021 10:46:59 -0600
Subject: [PATCH 23/32] Update 004-SplitAndAggregate.md

---
 docs/adrs/004-SplitAndAggregate.md | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md
index 66724f911..eb458fd88 100644
--- a/docs/adrs/004-SplitAndAggregate.md
+++ b/docs/adrs/004-SplitAndAggregate.md
@@ -165,8 +165,8 @@ Still to be determined (not in scope of this ADR):
    * Use the *User Parameters* and/or
    * Optionally Download the Data from the *User Parameters* to figure out *Job Manager* parameters
    * Use the results of information gathered from the initial download and/or *User Parameters*
-   * Generate final parameters to be sent to the *Job Manager* to launch *Fan Out* jobs
-3) The *Job Manager* now has enough parameters to set up *Fan Out* Jobs, and will send out jobs and wait for them (and possibly retry them upon failure)
+   * Generate final parameters to be sent to the *Job Manager* to launch *Fan Out* jobs, or launch *Fan Out* jobs and return job ids
+3) The *Job Manager* now has enough parameters to launch and/or monitor *Fan Out* Jobs, and monitor/manage the jobs (and possibly retry them upon failure)
 4) *Fan Out* jobs download data and perform calculations, save them back to the system, and return references to the saved objects
 5) The *Job Manager* optionally launches a *Group/Reduce* job based on User Parameters and/or the results of *Fan Out* Jobs
@@ -174,12 +174,11 @@ Still to be determined (not in scope of this ADR):
 8) The *Job Manager* returns the reference to the results of the *Report Job*
 
 Pros/Cons
-* `-` Only addresses the deadlocking issue; UI still broken for regular runs and batch runs if we re-use KBP
-* `-` Have to rewrite all apps that use KBP to use this new paradigm
-* `-` Will have to upload data and download data more times as no computation is done in the *Job Manager*
+* `+` On an as-needed basis, would have to rewrite apps that use KBP to use this new paradigm
 * `+` All KBP jobs can run on a small subset of machines, deadlock issue is fixed
 * `+` No changes to ee2 required
+* `-` Will have to upload data and download data more times as no computation is done in the *Job Manager*
+* `-` Only addresses the deadlocking issue; UI still broken for regular runs and batch runs if we re-use KBP

 ### Modify Apps to do only local submission by removing KBP, and moving the job
@@ -189,10 +190,8 @@ Pros/Cons

 ### Deprecate kbparallels, and write new ee2 endpoints to entirely handle split and aggregate
 * See [ADR](https://github.com/kbase/execution_engine2/blob/develop/docs/adrs/rework-batch-analysis-architecture.md#implementing-a-new-set-of-ee2-endpoints)
 * `+` No longer uses an app; no longer uses a slot in the queue
-* `-` All jobs would have to change to stop using KBP in favor of these via the ee2 client
 * `+` Can manage jobs more closely: have tables, more control over lifecycle, resource requests, canceling subjobs
-* `-` Job monitoring thread would have to run in ee2
+* `+` Apps could change to
stop using KBP in favor of these via the ee2 client on an as-needed basis
 * `-` Requires new data structures and models in ee2 to make sure that the relationship is preserved between jobs and sub jobs, and to make sure deadlocking does not occur
 * `-` Requires storing the job outside of condor to prevent job submission, unless we can mark jobs as runnable or not via the HTCondor API (this is because we do not want to use an HTCondor DAG to manage the running order; submitted jobs would otherwise start in random order, so marking jobs as runnable guarantees order)
 * `-` The endpoint would need to know how to split up input files and aggregate them (a significant undertaking)

From 9a677532551fd5ad93984a2f729e9d703c303bef Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Tue, 23 Nov 2021 10:48:15 -0600
Subject: [PATCH 24/32] Update 004-SplitAndAggregate.md

---
 docs/adrs/004-SplitAndAggregate.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md
index eb458fd88..8840d6f54 100644
--- a/docs/adrs/004-SplitAndAggregate.md
+++ b/docs/adrs/004-SplitAndAggregate.md
@@ -146,6 +146,7 @@ Still to be determined (not in scope of this ADR):
 ### LIMIT KBP jobs to a maximum of N<10 active jobs per user + Separate queue for kbparallels apps
 * `+` Same pros as above
+* `+` Users will be able to more effectively use their 10 job slots
 * `+` Allows us to group up KBP jobs onto fewer machines, instead of giving them their entire node
 * `-` Requires going through each app and understanding the worst-case computational needs in order to set the estimated cpu and memory needs for each app
 * `-` Apps can interfere with other innocent apps and take them down

From 60cf061cb77d334f9cac17f160847c26bf69c806 Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Tue, 23 Nov 2021 10:49:32 -0600
Subject: [PATCH 25/32] Update 004-SplitAndAggregate.md

---
 docs/adrs/004-SplitAndAggregate.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md
index 8840d6f54..470449d62 100644
--- a/docs/adrs/004-SplitAndAggregate.md
+++ b/docs/adrs/004-SplitAndAggregate.md
@@ -103,7 +103,6 @@ bindings: https://htcondor.readthedocs.io/en/latest/apis/python-bindings/tutoria
 ## Consequences
 
-* We will have to roll out fixes in multiple stages
 * We will have to implement a new narrative UI, however this was work that would happen regardless as we are looking to improve the UX for batch upload and analysis at KBase.
* This will take significant time to further research and engineer the solutions

From 517ebeac3923905f5e63f5f4e1eb8ea11fa51e99 Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Tue, 23 Nov 2021 12:16:36 -0600
Subject: [PATCH 26/32] Update 004-SplitAndAggregate.md

---
 docs/adrs/004-SplitAndAggregate.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md
index 470449d62..692cc77f1 100644
--- a/docs/adrs/004-SplitAndAggregate.md
+++ b/docs/adrs/004-SplitAndAggregate.md
@@ -186,12 +186,3 @@ Pros/Cons
 * `-` We have a limited number of larger-resource machines
 * `-` Continued dependency on deprecated KBP tools
 * `-` App runs may take longer since fewer resources may be available to the app run
-
-### Deprecate kbparallels, and write new ee2 endpoints to entirely handle split and aggregate
-* See [ADR](https://github.com/kbase/execution_engine2/blob/develop/docs/adrs/rework-batch-analysis-architecture.md#implementing-a-new-set-of-ee2-endpoints)
-* `+` No longer uses an app; no longer uses a slot in the queue
-* `+` Can manage jobs more closely: have tables, more control over lifecycle, resource requests, canceling subjobs
-* `+` Apps could change to stop using KBP in favor of these via the ee2 client on an as-needed basis
-* `-` Requires new data structures and models in ee2 to make sure that the relationship is preserved between jobs and sub jobs, and to make sure deadlocking does not occur
-* `-` Requires storing the job outside of condor to prevent job submission, unless we can mark jobs as runnable or not via the HTCondor API (this is because we do not want to use an HTCondor DAG to manage the running order; submitted jobs would otherwise start in random order, so marking jobs as runnable guarantees order)
-* `-` The endpoint would need to know how to split up input files and aggregate them (a significant undertaking)

From 2189149171dac68438ef48ce4c54c292d5f9c20b Mon Sep 17 00:00:00 2001
From: bio-boris
Date: Tue, 23 Nov 2021 13:34:08 -0600
Subject: [PATCH 27/32] Update 004-SplitAndAggregate.md

---
 docs/adrs/004-SplitAndAggregate.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/adrs/004-SplitAndAggregate.md b/docs/adrs/004-SplitAndAggregate.md
index 692cc77f1..e1d5c1636 100644
--- a/docs/adrs/004-SplitAndAggregate.md
+++ b/docs/adrs/004-SplitAndAggregate.md
@@ -113,11 +113,11 @@ Still to be determined (not in scope of this ADR):
 * Ignore most issues and just make apps that run kbparallels limited to N instances of kbparallels per user to avoid deadlocks
-* Writing new ee2 endpoints to entirely handle batch execution and possibly use a DAG
 * Remove kbparallels and change apps to a collection of 2-3 apps that do submit, split and aggregate and use an ee2 endpoint to create a DAG
 * Different DevOps solutions
 * Rewriting KBP or swapping it out for a lightweight alternative that has a subset of the KBP features
+
 ## Pros and Cons of the Alternatives
@@ -157,27 +157,28 @@ Still to be determined (not in scope of this ADR):
 #### Design of new version
+
+
 * All apps must be modified to use the new KBP lightweight version, which will work as follows:
 * We can either modify KBP or create a new tool/package to use instead of KBP
+
 1) Launch a management job called the *Job Manager* that sits in the KBP Queue, alongside other KBP jobs. Other jobs are launched in the NJS queue.
 2) Launch the *Setup Job* which will
    * Use the *User Parameters* and/or
    * Optionally Download the Data from the *User Parameters* to figure out *Job Manager* parameters
    * Use the results of information gathered from the initial download and/or *User Parameters*
-   * Generate final parameters to be sent to the *Job Manager* to launch *Fan Out* jobs, or launch *Fan Out* jobs and return job ids
+   * Generate final parameters to be sent to the *Job Manager* to launch *Fan Out* jobs, or directly launch *Fan Out* jobs and return job ids
 3) The *Job Manager* now has enough parameters to launch and/or monitor *Fan Out* Jobs, and monitor/manage the jobs (and possibly retry them upon failure)
 4) *Fan Out* jobs download data and perform calculations, save them back to the system, and return references to the saved objects
-5) The *Job Manager* optionally launches a *Group/Reduce* job based on User Parameters and/or the results of *Fan Out* Jobs
-6) The *Group/Reduce* job downloads objects from the system, and creates a set or other grouping, and then saves the set back to the system
-7) The *Job Manager* sends the results back to a *Report* Job, which downloads the final data and uploads a report based on that data
+5) The *Job Manager* launches one *FanIn* job based on User Parameters and/or the results of *Fan Out* Jobs
+6) The *FanIn* (a.k.a. Group/Reduce/Report) job downloads objects from the system, and creates a set or other grouping, and then saves the object(s) back to the system. The final data and the report are uploaded back to the system
 8) The *Job Manager* returns the reference to the results of the *Report Job*
 
 Pros/Cons
-* `+` On an as-needed basis, would have to rewrite apps that use KBP to use this new paradigm
+
 * `+` All KBP jobs can run on a small subset of machines, deadlock issue is fixed
 * `+` No changes to ee2 required
 * `-` Only addresses the deadlocking issue; UI still broken for regular runs and batch runs if we re-use KBP
+* `-` On an as-needed basis, would have to rewrite apps that use KBP to use this new paradigm
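+
+A minimal sketch of the *Job Manager* loop described above; the `ee2.run_job`/`ee2.check_job` calls, method names, and result shapes are assumptions for illustration, not the real client API:
+
+```python
+import time
+
+def wait_for(ee2, job_id, poll=30):
+    """Block until a job reaches a terminal state and return that state."""
+    while True:
+        state = ee2.check_job(job_id)
+        if state["status"] in ("completed", "error", "terminated"):
+            return state
+        time.sleep(poll)
+
+def job_manager(ee2, user_params, max_retries=1):
+    # 2) Setup Job: derive per-shard Fan Out parameters from the user parameters
+    setup_id = ee2.run_job({"method": "myapp.setup", "params": user_params})
+    shard_params = wait_for(ee2, setup_id)["result"]["fan_out_params"]
+
+    # 3) Launch all Fan Out jobs up front so they run in parallel
+    pending = {ee2.run_job({"method": "myapp.fan_out", "params": p}): (p, 0)
+               for p in shard_params}
+    results = []
+    # 4) Monitor the jobs, retrying failed shards a bounded number of times
+    # (a real implementation would also surface permanently failed shards)
+    while pending:
+        for job_id, (params, tries) in list(pending.items()):
+            state = ee2.check_job(job_id)
+            if state["status"] == "completed":
+                results.append(state["result"])  # references to saved objects
+                del pending[job_id]
+            elif state["status"] in ("error", "terminated"):
+                del pending[job_id]
+                if tries < max_retries:
+                    retry = ee2.run_job({"method": "myapp.fan_out", "params": params})
+                    pending[retry] = (params, tries + 1)
+        time.sleep(30)  # the manager only waits; it does no computation itself
+
+    # 5-6) FanIn: group the shard outputs, save the set, and upload the report
+    fanin_id = ee2.run_job({"method": "myapp.fan_in", "params": {"shards": results}})
+    # 8) Hand the reference to the report results back to the caller
+    return wait_for(ee2, fanin_id)["result"]["report_ref"]
+```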

 ### Modify Apps to do only local submission by removing KBP, and moving the job

From f32c11f3a53f6061721b8cf5c9b68d07e039ad72 Mon Sep 17 00:00:00 2001
From: Gavin
Date: Fri, 17 Dec 2021 15:42:01 -0800
Subject: [PATCH 29/32] Fix tests on non-UTC systems

The tests pass with or without the changes in this commit in the docker
container, which is UTC, but fail without the changes on my laptop, which is
not UTC. The problem is that `datetime.utcnow()` is not timezone aware, so it
only works on UTC systems.
---
 lib/execution_engine2/db/MongoUtil.py   | 4 ++--
 test/tests_for_db/ee2_MongoUtil_test.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py
index 349b066bc..3588ada9b 100644
--- a/lib/execution_engine2/db/MongoUtil.py
+++ b/lib/execution_engine2/db/MongoUtil.py
@@ -3,7 +3,7 @@
 import time
 import traceback
 from contextlib import contextmanager
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import Dict, List
 from bson.objectid import ObjectId
 from mongoengine import connect, connection
@@ -285,7 +285,7 @@ def update_jobs_to_queued(
         bulk_update_scheduler_jobs = []
         bulk_update_created_to_queued = []
 
-        queue_time_now = datetime.utcnow().timestamp()
+        queue_time_now = datetime.now(tz=timezone.utc).timestamp()
         for job_id_pair in job_id_pairs:
             if job_id_pair.job_id is None:
                 raise ValueError(
diff --git a/test/tests_for_db/ee2_MongoUtil_test.py b/test/tests_for_db/ee2_MongoUtil_test.py
index 9591f16a1..e0719a5f1 100644
--- a/test/tests_for_db/ee2_MongoUtil_test.py
+++ b/test/tests_for_db/ee2_MongoUtil_test.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import unittest
-from datetime import datetime
+from datetime import datetime, timezone
 
 from bson.objectid import ObjectId
 
@@ -87,7 +87,7 @@ def test_update_jobs_enmasse(self):
         scheduler_ids = ["humpty", "dumpty", "alice"]
         jobs_to_update = list(map(JobIdPair, job_ids, scheduler_ids))
 
-        now_ms = datetime.utcnow().timestamp()
+        now_ms = datetime.now(tz=timezone.utc).timestamp()
 
         self.getMongoUtil().update_jobs_to_queued(jobs_to_update)
         job.reload()
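
A note on the timestamp expressions above and below: `datetime.timestamp()` interprets a naive datetime as local time, so `datetime.utcnow().timestamp()` is skewed by the local UTC offset on non-UTC machines, while `datetime.now(tz=timezone.utc).timestamp()` and plain `time.time()` agree on any machine; the next patch relies on that equivalence. A short, standard-library-only illustration:

```python
import time
from datetime import datetime, timezone

# Timezone-aware now() and time.time() both yield Unix epoch seconds and
# agree everywhere, which is why the next patch simplifies to time.time().
aware = datetime.now(tz=timezone.utc).timestamp()
assert abs(aware - time.time()) < 1.0

# The buggy pattern: .timestamp() treats the naive utcnow() result as
# *local* time, so on a non-UTC machine the result is off by the UTC offset.
naive = datetime.utcnow().timestamp()
skew_hours = round((naive - time.time()) / 3600)  # 0 only when local time is UTC
```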

From e7b51154a1da4e41515ce7aafbc6a7841f496f2a Mon Sep 17 00:00:00 2001
From: Gavin
Date: Sun, 19 Dec 2021 18:07:11 -0800
Subject: [PATCH 30/32] Switch to time.time()

---
 lib/execution_engine2/db/MongoUtil.py   | 3 +--
 test/tests_for_db/ee2_MongoUtil_test.py | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py
index 3588ada9b..88527d029 100644
--- a/lib/execution_engine2/db/MongoUtil.py
+++ b/lib/execution_engine2/db/MongoUtil.py
@@ -3,7 +3,6 @@
 import time
 import traceback
 from contextlib import contextmanager
-from datetime import datetime, timezone
 from typing import Dict, List
 from bson.objectid import ObjectId
 from mongoengine import connect, connection
@@ -285,7 +284,7 @@ def update_jobs_to_queued(
         bulk_update_scheduler_jobs = []
         bulk_update_created_to_queued = []
 
-        queue_time_now = datetime.now(tz=timezone.utc).timestamp()
+        queue_time_now = time.time()
         for job_id_pair in job_id_pairs:
             if job_id_pair.job_id is None:
                 raise ValueError(
diff --git a/test/tests_for_db/ee2_MongoUtil_test.py b/test/tests_for_db/ee2_MongoUtil_test.py
index e0719a5f1..c13388a8e 100644
--- a/test/tests_for_db/ee2_MongoUtil_test.py
+++ b/test/tests_for_db/ee2_MongoUtil_test.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 import logging
 import os
+import time
 import unittest
-from datetime import datetime, timezone
 
 from bson.objectid import ObjectId
 
@@ -87,8 +87,7 @@ def test_update_jobs_enmasse(self):
         scheduler_ids = ["humpty", "dumpty", "alice"]
         jobs_to_update = list(map(JobIdPair, job_ids, scheduler_ids))
 
-        now_ms = datetime.now(tz=timezone.utc).timestamp()
-
+        now_ms = time.time()
         self.getMongoUtil().update_jobs_to_queued(jobs_to_update)
         job.reload()
         job2.reload()

From 0465820b47fa623bca8cf891fd19a9359bc21168 Mon Sep 17 00:00:00 2001
From: MrCreosote
Date: Sun, 19 Dec 2021 20:36:12 -0800
Subject: [PATCH 31/32] DATAUP-679: Add MongoUtil method to update a single
 job to queued (#429)

* Add MongoUtil method to update a single job to queued

* run black
---
 lib/execution_engine2/db/MongoUtil.py   | 36 ++++++++++++++++++++++++
 test/tests_for_db/ee2_MongoUtil_test.py | 37 +++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py
index 88527d029..c7feb0f6a 100644
--- a/lib/execution_engine2/db/MongoUtil.py
+++ b/lib/execution_engine2/db/MongoUtil.py
@@ -271,6 +271,42 @@ def check_if_already_finished(job_status):
             return True
         return False
 
+    def update_job_to_queued(
+        self, job_id: str, scheduler_id: str, scheduler_type: str = "condor"
+    ) -> None:
+        f"""
+        * Updates a {Status.created.value} job to queued and sets scheduler state.
+        Always sets scheduler state, but will only update to queued if the job is in the
+        {Status.created.value} state.
+        :param job_id: the ID of the job.
+        :param scheduler_id: the scheduler's job ID for the job.
+        :param scheduler_type: The scheduler this job was queued in, default condor
+        """
+        if not job_id or not scheduler_id or not scheduler_type:
+            raise ValueError("None of the 3 arguments can be falsy")
+        # could also test that the job ID is a valid job ID rather than having mongo throw an
+        # error
+        mongo_collection = self.config["mongo-jobs-collection"]
+        queue_time_now = time.time()
+        with self.pymongo_client(mongo_collection) as pymongo_client:
+            ee2_jobs_col = pymongo_client[self.mongo_database][mongo_collection]
+            # should we check that the job was updated and do something if it wasn't?
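+            # Note: the status filter in the first update below makes the
+            # created -> queued flip a no-op for jobs that have already left the
+            # created state, while the second update further down still records
+            # the scheduler info unconditionally.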
+ ee2_jobs_col.update_one( + {"_id": ObjectId(job_id), "status": Status.created.value}, + {"$set": {"status": Status.queued.value, "queued": queue_time_now}}, + ) + # originally had a single query, but seems safer to always record the scheduler + # state no matter the state of the job + ee2_jobs_col.update_one( + {"_id": ObjectId(job_id)}, + { + "$set": { + "scheduler_id": scheduler_id, + "scheduler_type": scheduler_type, + } + }, + ) + def update_jobs_to_queued( self, job_id_pairs: List[JobIdPair], scheduler_type: str = "condor" ) -> None: diff --git a/test/tests_for_db/ee2_MongoUtil_test.py b/test/tests_for_db/ee2_MongoUtil_test.py index c13388a8e..2f6a819c5 100644 --- a/test/tests_for_db/ee2_MongoUtil_test.py +++ b/test/tests_for_db/ee2_MongoUtil_test.py @@ -3,6 +3,7 @@ import os import time import unittest +from pytest import raises from bson.objectid import ObjectId @@ -13,6 +14,8 @@ bootstrap, get_example_job, read_config_into_dict, + assert_exception_correct, + assert_close_to_now, ) from tests_for_db.mongo_test_helper import MongoTestHelper @@ -70,6 +73,40 @@ def test_insert_jobs(self): for i, retrieved_job in enumerate(retrieved_jobs): assert jobs_to_insert[i].to_json() == retrieved_job.to_json() + def test_update_job_to_queued_fail_with_bad_args(self): + jid = "aaaaaaaaaaaaaaaaaaaaaaaa" + err = ValueError("None of the 3 arguments can be falsy") + self.update_job_to_queued_fail(None, "sid", "sch", err) + self.update_job_to_queued_fail("", "sid", "sch", err) + self.update_job_to_queued_fail(jid, None, "sch", err) + self.update_job_to_queued_fail(jid, "", "sch", err) + self.update_job_to_queued_fail(jid, "sid", None, err) + self.update_job_to_queued_fail(jid, "sid", "", err) + + def update_job_to_queued_fail(self, job_id, schd_id, schd, expected): + with raises(Exception) as got: + self.getMongoUtil().update_job_to_queued(job_id, schd_id, schd) + assert_exception_correct(got.value, expected) + + def test_update_job_to_queued(self): + for state in Status: + j = get_example_job(status=state.value) + j.scheduler_id = None + j.save() + assert j.scheduler_id is None + + self.getMongoUtil().update_job_to_queued(j.id, "schdID", "condenast") + j.reload() + assert_close_to_now(j.updated) + assert j.scheduler_id == "schdID" + assert j.scheduler_type == "condenast" + if state == Status.created: + assert_close_to_now(j.queued) + assert j.status == Status.queued.value + else: + assert j.queued is None + assert j.status == state.value + def test_update_jobs_enmasse(self): """Check to see that created jobs get updated to queued""" for state in Status: From 18408b25510383b44025c9dde9cf4d4813977f52 Mon Sep 17 00:00:00 2001 From: MrCreosote Date: Sun, 19 Dec 2021 21:17:47 -0800 Subject: [PATCH 32/32] DATAUP-679: Update job to queued directly after submit (#430) * Update job to queued directly after submit If many jobs are submitted in a batch, the jobs can start running before being updated to queued, which means the jobs never get a queued timestamp. 
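
To make the failure mode concrete, here is a small, self-contained illustration; the dict stands in for the Mongo job record and the names are illustrative, not the real ee2 objects:

```python
import time

job = {"status": "created", "queued": None}

def update_to_queued(j):
    # mirrors the MongoUtil filter: only flips jobs still in 'created'
    if j["status"] == "created":
        j["status"] = "queued"
        j["queued"] = time.time()

# Batched ordering: a fast job starts running before the bulk update fires,
# so the created -> queued update matches nothing.
job["status"] = "running"
update_to_queued(job)
print(job["queued"])  # None: the missing timestamp this commit fixes

# Per-job ordering: update immediately after each submission.
job2 = {"status": "created", "queued": None}
update_to_queued(job2)  # runs before the scheduler can start the job
job2["status"] = "running"
print(job2["queued"] is not None)  # True
```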
* run black * Bump version, add release notes --- RELEASE_NOTES.md | 4 +++ kbase.yml | 2 +- lib/execution_engine2/db/MongoUtil.py | 8 +++-- .../execution_engine2Impl.py | 2 +- lib/execution_engine2/sdk/EE2Runjob.py | 32 ++++++++----------- test/tests_for_db/ee2_MongoUtil_test.py | 3 +- test/tests_for_sdkmr/EE2Runjob_test.py | 15 +++++---- 7 files changed, 35 insertions(+), 31 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 68376cfdd..750cb8cb9 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,10 @@ # execution_engine2 (ee2) release notes ========================================= +## 0.0.7 +* Fixed a bug that could cause missing `queued` timestamps if many jobs were submitted in a + batch + ## 0.0.6 * Release of MVP diff --git a/kbase.yml b/kbase.yml index 0cee4a309..15f81f845 100644 --- a/kbase.yml +++ b/kbase.yml @@ -8,7 +8,7 @@ service-language: python module-version: - 0.0.5 + 0.0.7 owners: [bsadkhin, tgu2, wjriehl, gaprice] diff --git a/lib/execution_engine2/db/MongoUtil.py b/lib/execution_engine2/db/MongoUtil.py index c7feb0f6a..f71d53cc1 100644 --- a/lib/execution_engine2/db/MongoUtil.py +++ b/lib/execution_engine2/db/MongoUtil.py @@ -3,7 +3,7 @@ import time import traceback from contextlib import contextmanager -from typing import Dict, List +from typing import Dict, List, NamedTuple from bson.objectid import ObjectId from mongoengine import connect, connection from pymongo import MongoClient, UpdateOne @@ -16,7 +16,11 @@ ) from lib.execution_engine2.utils.arg_processing import parse_bool -from execution_engine2.sdk.EE2Runjob import JobIdPair + + +class JobIdPair(NamedTuple): + job_id: str + scheduler_id: str class MongoUtil: diff --git a/lib/execution_engine2/execution_engine2Impl.py b/lib/execution_engine2/execution_engine2Impl.py index 5b6366de5..2be7e6813 100644 --- a/lib/execution_engine2/execution_engine2Impl.py +++ b/lib/execution_engine2/execution_engine2Impl.py @@ -28,7 +28,7 @@ class execution_engine2: # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa - VERSION = "0.0.5" + VERSION = "0.0.7" GIT_URL = "https://github.com/mrcreosote/execution_engine2.git" GIT_COMMIT_HASH = "2ad95ce47caa4f1e7b939651f2b1773840e67a8a" diff --git a/lib/execution_engine2/sdk/EE2Runjob.py b/lib/execution_engine2/sdk/EE2Runjob.py index ec6d3952c..2beed9a7a 100644 --- a/lib/execution_engine2/sdk/EE2Runjob.py +++ b/lib/execution_engine2/sdk/EE2Runjob.py @@ -75,11 +75,6 @@ class PreparedJobParams(NamedTuple): job_id: str -class JobIdPair(NamedTuple): - job_id: str - scheduler_id: str - - from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -312,17 +307,11 @@ def _run_multiple(self, runjob_params: List[Dict]): ).start() return job_ids - def _update_to_queued_multiple(self, job_ids, scheduler_ids): + def _finish_multiple_job_submission(self, job_ids): """ This is called during job submission. 
If a job is terminated during job submission, we have the chance to re-issue a termination and remove the job from the Job Queue """ - if len(job_ids) != len(scheduler_ids): - raise Exception( - "Need to provide the same amount of job ids and scheduler_ids" - ) - jobs_to_update = list(map(JobIdPair, job_ids, scheduler_ids)) - self.sdkmr.get_mongo_util().update_jobs_to_queued(jobs_to_update) jobs = self.sdkmr.get_mongo_util().get_jobs(job_ids) for job in jobs: @@ -377,14 +366,21 @@ def _submit_multiple(self, job_submission_params): ) raise RuntimeError(error_msg) condor_job_ids.append(condor_job_id) - - self.logger.error(f"It took {time.time() - begin} to submit jobs to condor") - # It took 4.836009502410889 to submit jobs to condor + # Previously the jobs were updated in a batch after submitting all jobs to condor. + # This led to issues where a large job count could result in jobs switching to + # running prior to all jobs being submitted and so the queued timestamp was + # never added to the job record. + self.sdkmr.get_mongo_util().update_job_to_queued(job_id, condor_job_id) + + self.logger.error( + f"It took {time.time() - begin} to submit jobs to condor and update to queued" + ) update_time = time.time() - self._update_to_queued_multiple(job_ids=job_ids, scheduler_ids=condor_job_ids) - # It took 1.9239885807037354 to update jobs - self.logger.error(f"It took {time.time() - update_time} to update jobs ") + self._finish_multiple_job_submission(job_ids=job_ids) + self.logger.error( + f"It took {time.time() - update_time} to finish job submission" + ) return job_ids diff --git a/test/tests_for_db/ee2_MongoUtil_test.py b/test/tests_for_db/ee2_MongoUtil_test.py index 2f6a819c5..32933e99b 100644 --- a/test/tests_for_db/ee2_MongoUtil_test.py +++ b/test/tests_for_db/ee2_MongoUtil_test.py @@ -7,9 +7,8 @@ from bson.objectid import ObjectId -from execution_engine2.db.MongoUtil import MongoUtil +from execution_engine2.db.MongoUtil import MongoUtil, JobIdPair from execution_engine2.db.models.models import Job, JobLog, Status -from execution_engine2.sdk.EE2Runjob import JobIdPair from test.utils_shared.test_utils import ( bootstrap, get_example_job, diff --git a/test/tests_for_sdkmr/EE2Runjob_test.py b/test/tests_for_sdkmr/EE2Runjob_test.py index 6dec768bf..5d2c42f85 100644 --- a/test/tests_for_sdkmr/EE2Runjob_test.py +++ b/test/tests_for_sdkmr/EE2Runjob_test.py @@ -27,7 +27,7 @@ AuthError, InvalidParameterForBatch, ) -from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions, JobIdPair +from execution_engine2.sdk.EE2Runjob import EE2RunJob, JobPermissions from execution_engine2.sdk.SDKMethodRunner import SDKMethodRunner from execution_engine2.sdk.job_submission_parameters import ( JobSubmissionParameters, @@ -908,12 +908,13 @@ def _check_common_mock_calls_batch( ) # update to queued state - child_job_pairs = [ - JobIdPair(_JOB_ID_1, _CLUSTER_1), - JobIdPair(_JOB_ID_2, _CLUSTER_2), - ] - mocks[MongoUtil].update_jobs_to_queued.assert_has_calls([call(child_job_pairs)]) - job_ids = [child_job_pair.job_id for child_job_pair in child_job_pairs] + mocks[MongoUtil].update_job_to_queued.assert_has_calls( + [ + call(_JOB_ID_1, _CLUSTER_1), + call(_JOB_ID_2, _CLUSTER_2), + ] + ) + job_ids = [_JOB_ID_1, _JOB_ID_2] mocks[MongoUtil].get_jobs.assert_has_calls([call(job_ids)]) if not terminated_during_submit: