Merge pull request #120 from valeriupredoi/dev_parrallel_ts

Batch timeseries analysis in slurm
valeriupredoi · Nov 28, 2023 · b086386 · b086386
2 parents 2f9c912 + 355c8c4
commit b086386
Show file tree

Hide file tree

Showing 12 changed files with 601 additions and 4 deletions.
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -41,6 +41,7 @@ jobs:
           analysis_level3_sameYear --help
           analysis_p2p --help
           analysis_timeseries --help
+          batch_timeseries --help
           bgcval2_make_report --help
           download_from_mass --help
       - shell: bash -l {0}

diff --git a/.gitignore b/.gitignore
@@ -21,3 +21,7 @@ local_test/BGC_data/valeriu
 mass_scripts
 CompareReports2
 .idea/workspace.xml
+*.iml
+.idea/inspectionProfiles/profiles_settings.xml
+.idea/misc.xml
+.idea/vcs.xml
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 [![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/)
 [![Github Actions Test](https://github.com/valeriupredoi/bgcval2/actions/workflows/run-tests.yml/badge.svg)](https://github.com/valeriupredoi/bgcval2/actions/workflows/run-tests.yml)
 
-![bgcval2logo](https://github.com/valeriupredoi/bgcval2/blob/main/doc/figures/BGCVal2-logo-2.png)
+![bgcval2logo](https://github.com/valeriupredoi/bgcval2/blob/main/doc/figures/bgcval2_logo_v_small.png)
 
 bgcval2
 =======
@@ -115,6 +115,7 @@ Executable name | What it does | Command
 `bgcval` | runs time series and point to point. | bgcval jobID
 `bgcval2_make_report` | makes the single model HTML report. | bgcval2_make_report jobID
 `analysis_compare` | runs comparison of multiple single jobs  | analysis_compare
+`batch_timeseries` | Submits single job time series analysis to slurm | batch_timeseries
 
 
 ### Checking out development branches
@@ -319,6 +320,50 @@ then the report will appear on the [JASMIN public facing page](https://gws-acces
 which is public facing but password protected.
 
 
+Batch time series analysis
+==========================
+
+The `batch_timeseries` tool can take an `analysis_compare` input yaml file,
+and instead of running the time series analysis for each job on
+the interactive shell terminal in series, it uses slurm to submit
+each job as an independent job. 
+
+On jasmin, users can run up to five jobs simultaneously,
+so this can significantly boost the speed of the analysis.
+
+The command to run it is:
+```
+batch_timeseries -y comparison_recipe.yml
+```
+
+This will submit a time-series analysis for each job, using a command which looks like this:
+```
+sbatch -J jobID --error=logs/jobID.err --output=logs/jobID.out lotus_timeseries.sh jobID kmf physics bgc
+```
+The output and error messages will be in the `logs` directory with the jobID as the file prefix.
+The job name on slurm will also be the jobID, so it's easy to tell which jobs are running.
+The analysis suites will be appended as a list to the end of the command.
+In order to reduce the chance of analysing the same jobID twice, `batch_timeseries`
+checks whether a job exists, either currently running or in the queue before submitting.
+If a jobID exists, it is not re-submitted. However, this means that
+if two versions of the same jobID are submitted one after the other
+with different suite lists (`kmf`, `physics`, `bgc`), then only the first 
+set of suites will be run. 
+
+There is also an optional flag `-d` or `--dry_run` to test `batch_timeseries`, 
+which outputs the submission command to screen but does not submit the jobs.
+
+Note that this task does not run the `analysis_compare` suite so it will 
+not generate the html report. However, the html report can be generated more quickly
+with the `-s` argument to skip the `analysis_timeseries` section 
+described above. 
+
+In addition, note that this will not run the `download_from_mass`
+script, so jobs added here will not be included in the automated download.
+However, these jobs are added for automated download when `analysis_compare` 
+is used. 
+
+
 Downloading data using MASS
 ===========================
 

diff --git a/bgcval2/analysis_timeseries.py b/bgcval2/analysis_timeseries.py
@@ -733,7 +733,8 @@ def applyLandMask1e3(nc, keys):
                 gridFile=av[name]['gridFile'],
                 clean=False,
             )
-
+    print("analysis_timeseries:\tINFO:\tEnd of the timeseries analysis", jobID, suites)
+
 
 def get_args():
     """Parse command line arguments. """

diff --git a/bgcval2/batch_timeseries.py b/bgcval2/batch_timeseries.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+#
+# Copyright 2015, Plymouth Marine Laboratory
+#
+# This file is part of the bgc-val library.
+#
+# bgc-val is free software: you can redistribute it and/or modify it
+# under the terms of the Revised Berkeley Software Distribution (BSD) 3-clause license.
+
+# bgc-val is distributed in the hope that it will be useful, but
+# without any warranty; without even the implied warranty of merchantability
+# or fitness for a particular purpose. See the revised BSD license for more details.
+# You should have received a copy of the revised BSD license along with bgc-val.
+# If not, see <http://opensource.org/licenses/BSD-3-Clause>.
+#
+# Address:
+# Plymouth Marine Laboratory
+# Prospect Place, The Hoe
+# Plymouth, PL1 3DH, UK
+#
+# Email:
+# [email protected]
+#
+"""
+.. module:: batch_timeseries
+   :platform: Unix
+   :synopsis: A script to submit slurm scripts time series.
+
+.. moduleauthor:: Lee de Mora <[email protected]>
+
+"""
+import argparse
+import subprocess
+import os
+import sys
+
+from getpass import getuser
+
+from bgcval2.analysis_compare import load_comparison_yml
+
+
def get_args(argv=None):
    """Parse command line arguments.

    Parameters
    ----------
    argv : list of str, optional
        Argument list to parse instead of the process command line.
        When ``None`` (the default) argparse falls back to
        ``sys.argv[1:]``, which is the behaviour existing callers rely on.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``compare_yml`` (list of str),
        ``config_file`` (str) and ``dry_run`` (bool).
    """
    # Default user configuration lives next to this module.
    default_config = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'default-bgcval2-config.yml',
    )

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-y',
                        '--compare_yml',
                        nargs='+',
                        type=str,
                        help='One or more Comparison Analysis configuration file, for examples see bgcval2 input_yml directory.',
                        required=True,
                        )

    parser.add_argument('-c',
                        '--config-file',
                        default=default_config,
                        help='User configuration file (for paths).',
                        required=False)

    # BooleanOptionalAction also registers the negated ``--no-dry_run`` flag.
    parser.add_argument('--dry_run',
                        '-d',
                        default=False,
                        help='When True: Do not submit the jobs to lotus.',
                        action=argparse.BooleanOptionalAction,
                        required=False)

    return parser.parse_args(argv)
+
+
def submits_lotus(compare_yml, config_user, dry_run=False):
    """Load a comparison yaml file and submit one slurm job per jobID.

    For every job listed in the comparison file an ``sbatch`` command is
    built that runs ``lotus_timeseries.sh`` with the jobID followed by the
    job's analysis suites. Jobs whose jobID is already the name of one of
    the current user's slurm jobs are skipped.

    Parameters
    ----------
    compare_yml : str
        Path of the comparison yaml file, read with ``load_comparison_yml``.
    config_user : str
        User configuration file. Currently unused here; kept so the call
        signature stays stable.
    dry_run : bool, optional
        When True the submission commands are printed but not executed.
    """
    # Load details from yml file
    details = load_comparison_yml(compare_yml)

    # list of job IDS
    jobs = details['jobs']

    # Names of this user's queued/running slurm jobs, one per line.
    # ``--format=%j`` prints only the job name and ``--noheader`` drops the
    # column title, so jobIDs can be compared exactly instead of searching
    # the whole squeue table for a substring (where e.g. 'u-ab12' would
    # also match 'u-ab123').
    user = getuser()
    squeue_out = subprocess.check_output(
        ['squeue', '--user=' + user, '--noheader', '--format=%j'],
        text=True,
    )
    queued_names = {
        line.strip() for line in squeue_out.splitlines() if line.strip()
    }

    # The --error/--output paths below live in ./logs; make sure it exists
    # before anything is submitted so slurm can open the log files.
    if not dry_run:
        os.makedirs('logs', exist_ok=True)

    for job in jobs:
        # Check whether there's already a job running for this jobID
        if job in queued_names:
            print("That job exists already: skipping", job)
            continue

        # Suites may be given as a whitespace separated string or a list.
        suites = details['suites'][job]
        if isinstance(suites, str):
            # split() without an argument ignores repeated/leading spaces,
            # so no empty suite names are passed to the batch script.
            suites = suites.split()

        # prepare the command
        command_txt = [
            'sbatch',
            '-J', job,
            ''.join(['--error=logs/', job, '.err']),
            ''.join(['--output=logs/', job, '.out']),
            'lotus_timeseries.sh', job,
        ]
        command_txt.extend(suites)

        if dry_run:
            print('Not submitting (dry-run):', ' '.join(command_txt))
            continue

        print('Submitting:', ' '.join(command_txt))
        # run() waits for sbatch to return, so its reply (the slurm job
        # number, or an error message) is captured and shown rather than
        # left unread in a pipe of an un-reaped child process.
        result = subprocess.run(
            command_txt,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=False,
        )
        if result.stdout:
            print(result.stdout.strip())
        if result.returncode != 0:
            # Keep going: one failed submission should not block the rest.
            print('Submission failed for', job,
                  '(sbatch exit code', str(result.returncode) + ')')
+
+
def main():
    """Run the main routine.

    Parses the command line, checks that every comparison yaml file
    exists, then submits the time series jobs of each file in turn.
    Exits with status 1 if any of the files cannot be found.
    """
    args = get_args()

    # This has a sensible default value.
    config_user = args.config_file

    # This shouldn't fail as it's a required argument.
    compare_ymls = args.compare_yml
    dry_run = args.dry_run

    # Validate every path before submitting anything, so a typo in a later
    # file name cannot leave the batch half submitted.
    missing = [path for path in compare_ymls if not os.path.isfile(path)]
    if missing:
        for path in missing:
            print(f"batch_timeseries: Could not find comparison config file {path}")
        sys.exit(1)

    for compare_yml in compare_ymls:
        print(f"batch_timeseries: Comparison config file {compare_yml}")
        submits_lotus(compare_yml, config_user, dry_run)
+
+
if __name__ == "__main__":
    # Absolute import: a relative import fails when this file is executed
    # directly as a script, because __main__ then has no parent package.
    from bgcval2._version import __version__
    print(f'BGCVal2: {__version__}')
    main()
+
diff --git a/bgcval2/timeseries/timeseriesAnalysis.py b/bgcval2/timeseries/timeseriesAnalysis.py
@@ -137,7 +137,7 @@ def loadModel(self):
         if self.debug: print("timeseriesAnalysis:\tloadModel.")
         ####
         # load and calculate the model info
-        if glob.glob(self.shelvefn):
+        if glob.glob(self.shelvefn+'*'): # shelve files have .bak .dat .dir files now
             sh = shOpen(self.shelvefn)
             print('seems fine:', self.shelvefn)
             sh = shOpen(self.shelvefn)

diff --git a/doc/figures/bgcval2_logo_v_small.png b/doc/figures/bgcval2_logo_v_small.png