Skip to content

Commit

Permalink
Merge branch 'refs/heads/devel'
Browse files Browse the repository at this point in the history
  • Loading branch information
sunset666 committed Apr 25, 2024
2 parents 66bf25b + 0f1b3d1 commit e43d07d
Show file tree
Hide file tree
Showing 21 changed files with 500 additions and 321 deletions.
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
extend-ignore = E203,E501,W503
max-line-length = 99
32 changes: 32 additions & 0 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: Linters

on:
push:
pull_request:
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 black isort
- name: Flake8 Lint
run: |
flake8 --ignore=E501,W503,E203 .
- name: Black Lint
run: |
black --line-length 99 --check --verbose .
- name: isort Lint
run: |
isort --profile black --check-only --diff .
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ celerybeat.pid

# Environments
.env
.envrc
.venv
env/
venv/
Expand Down
53 changes: 51 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,58 @@
# ingest-validation-tests

This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.
This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.

## Development process

### Branches

- Make new feature branches from `devel`.
- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory:

```
black --line-length 99 .
isort --profile black --multi-line 3 .
flake8
```

(Integrating black and potentially isort/flake8 with your editor may allow you to skip this step, see Setup section below.)

- Make PRs to `devel`. (This is the default branch.)
- The last reviewer to approve a PR should merge it. At the moment that is likely to be @jswelling .
- The last reviewer to approve a PR should merge it.

### Setup

- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:

```
python3.9 -m venv hm-ingest-validation-tests
source hm-ingest-validation-tests/bin/activate
```

- Run `pip install -r requirements-dev.txt`
- (optional) Integrate black with your editor.
- [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor.

### Testing

- If ingest-validation-tools is not already set up:

```
# Starting from ingest-validation-tests...
cd ..
git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
cd ingest-validation-tests
pip install -r ../ingest-validation-tools/requirements.txt
pip install -r ../ingest-validation-tools/requirements-dev.txt
```

- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run:

```
pip install -r <path-to-ingest-validation-tools>/requirements.txt
pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
```

- Run `test.sh`
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[tool.black]
line-length = 99

[tool.isort]
profile = "black"
multi_line_output = 3
11 changes: 11 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
black==23.12.1
flake8==7.0.0
git+https://github.com/hubmapconsortium/[email protected]#egg=hubmap-fastq-utils
imagecodecs>=2023.3.16
isort==5.13.2
jsonschema==4.4.0
pandas>=1.2.0
pytest==8.0.0
python-frontmatter>=1.0.0
tifffile==2020.10.1
xmlschema>=1.6
105 changes: 48 additions & 57 deletions src/ingest_validation_tests/codex_common_errors_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,20 @@ def _split_cycle_dir_string(cycle_str):
"""
Given a cycle-and-region directory name, split out the cycle and region numbers
"""
words = cycle_str.split('_')
words = cycle_str.split("_")
assert len(words) >= 2, f'Directory string "{cycle_str}" has unexpected form'
assert words[0].startswith('cyc'), (f'directory string "{cycle_str}" does'
' not start with "cyc"')
assert words[0].startswith("cyc"), (
f'directory string "{cycle_str}" does' ' not start with "cyc"'
)
try:
cyc_id = int(words[0][len('cyc'):])
cyc_id = int(words[0][len("cyc") :])
except ValueError:
raise AssertionError(f'Directory string "{cycle_str}" cycle number is'
' not an integer')
assert words[1].startswith('reg'), (f'Directory string "{cycle_str}" does'
' not include "_reg"')
raise AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer")
assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"'
try:
reg_id = int(words[1][len('reg'):])
reg_id = int(words[1][len("reg") :])
except ValueError:
raise AssertionError(f'Directory string "{cycle_str}" region number is'
' not an integer')
raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer")
return cyc_id, reg_id


Expand All @@ -47,115 +45,110 @@ class CodexCommonErrorsValidator(Validator):

description = "Test for common problems found in CODEX"
cost = 1.0
required = "codex"

def collect_errors(self, **kwargs) -> List[str]:
"""
Return the errors found by this validator
"""
del kwargs
if self.assay_type != 'CODEX':
if self.required not in self.contains and self.assay_type.lower() != self.required:
return [] # We only test CODEX data
rslts = []
for path in self.paths:
rslt = []
try:
# is the raw/src_ directory present?
prefix = None
if (path / 'raw').is_dir():
prefix = path / 'raw'
if (path / "raw").is_dir():
prefix = path / "raw"
else:
for candidate in path.glob('src_*'):
for candidate in path.glob("src_*"):
prefix = candidate
if prefix is None:
rslt.append('The raw/src_ subdirectory is missing?')
rslt.append("The raw/src_ subdirectory is missing?")
raise QuitNowException()

# Does dataset.json exist? If so, 'new CODEX' syntax rules
# are in effect
dataset_json_exists = False
any_dataset_json_exists = False
for candidate in path.glob('**/dataset.json'):
for candidate in path.glob("**/dataset.json"):
any_dataset_json_exists = True
if candidate == prefix / 'dataset.json':
if candidate == prefix / "dataset.json":
dataset_json_exists = True
if dataset_json_exists:
print('FOUND dataset.json; skipping further analysis')
print("FOUND dataset.json; skipping further analysis")
raise QuitNowException()
elif any_dataset_json_exists:
rslt.append(
'A dataset.json file exists but'
' is in the wrong place'
)
rslt.append("A dataset.json file exists but" " is in the wrong place")

# is the segmentation.json file on the right side?
found = False
right_place = False
for filepath in path.glob('*/[Ss]egmentation.json'):
for filepath in path.glob("*/[Ss]egmentation.json"):
rel_path = filepath.relative_to(path)
found = True
if str(rel_path).startswith(('raw', 'src_')):
if str(rel_path).startswith(("raw", "src_")):
right_place = True
if found:
if right_place:
pass
else:
rslt.append(
'The segmentation.json file is in the wrong subdirectory'
)
rslt.append("The segmentation.json file is in the wrong subdirectory")
else:
rslt.append('The segmentation.json file is missing or misplaced')
rslt.append("The segmentation.json file is missing or misplaced")

# Does the channelnames.txt file exist?
channelnames_txt_path = prefix / 'channelnames.txt'
channelnames_txt_path = prefix / "channelnames.txt"
if not channelnames_txt_path.is_file():
# sometimes we see this variant
channelnames_txt_path = prefix / 'channelNames.txt'
channelnames_txt_path = prefix / "channelNames.txt"
if not channelnames_txt_path.is_file():
rslt.append('channelnames.txt is missing')
rslt.append("channelnames.txt is missing")
raise QuitNowException()

# Parse channelnames.txt into a dataframe
try:
cn_df = pd.read_csv(str(channelnames_txt_path), header=None)
except Exception:
rslt.append(f'Unexpected error reading {channelnames_txt_path}')
rslt.append(f"Unexpected error reading {channelnames_txt_path}")
raise QuitNowException()
if len(cn_df.columns) != 1:
rslt.append(f'Unexpected format for {channelnames_txt_path}')
rslt.append(f"Unexpected format for {channelnames_txt_path}")
raise QuitNowException()

# Does the channelnames_report.csv file exist?
report_csv_path = prefix / 'channelnames_report.csv'
report_csv_path = prefix / "channelnames_report.csv"
if report_csv_path.is_file():
# Parse channelnames_report.csv into a dataframe
try:
rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None)
rpt_df = pd.read_csv(str(report_csv_path), sep=",", header=None)
except Exception:
rslt.append(f'Unexpected error reading {report_csv_path}')
rslt.append(f"Unexpected error reading {report_csv_path}")
raise QuitNowException()
if len(rpt_df) == len(cn_df) + 1:
# channelnames_report.csv appears to have a header
try:
rpt_df = pd.read_csv(str(report_csv_path), sep=',')
rpt_df = pd.read_csv(str(report_csv_path), sep=",")
except Exception:
rslt.append(f'Unexpected error reading {report_csv_path}')
rslt.append(f"Unexpected error reading {report_csv_path}")
raise QuitNowException()
if len(rpt_df.columns) != 2:
rslt.append(
f'Could not parse {report_csv_path}.'
' Is it a comma-separated table?'
f"Could not parse {report_csv_path}." " Is it a comma-separated table?"
)
raise QuitNowException()
col_0, col_1 = rpt_df.columns
rpt_df = rpt_df.rename(columns={col_0: 'Marker', col_1: 'Result'})
rpt_df = rpt_df.rename(columns={col_0: "Marker", col_1: "Result"})
# Do they match?
rpt_df['other'] = cn_df[0]
mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']]
rpt_df["other"] = cn_df[0]
mismatches_df = rpt_df[rpt_df["other"] != rpt_df["Marker"]]
if len(mismatches_df) != 0:
for idx, row in mismatches_df.iterrows():
rslt.append(
f'{channelnames_txt_path.name} does not'
' match channelnames_report.txt'
f"{channelnames_txt_path.name} does not"
" match channelnames_report.txt"
f' on line {idx}: {row["other"]} vs {row["Marker"]}'
)
raise QuitNowException()
Expand All @@ -164,7 +157,7 @@ def collect_errors(self, **kwargs) -> List[str]:

# Tabulate the cycle and region info
all_cycle_dirs = []
for glob_str in ['cyc*', 'Cyc*']:
for glob_str in ["cyc*", "Cyc*"]:
for pth in prefix.glob(glob_str):
if pth.is_dir():
all_cycle_dirs.append(str(pth.stem).lower())
Expand All @@ -189,28 +182,26 @@ def collect_errors(self, **kwargs) -> List[str]:
failures = []
# First cycle must be 1
if cycles[0] != 1:
failures.append('Cycle numbering does not start at 1')
failures.append("Cycle numbering does not start at 1")
# First region must be 1
if regions[0] != 1:
failures.append('Region numbering does not start at 1')
failures.append("Region numbering does not start at 1")
# Cycle range must be contiguous ints
if cycles != list(range(cycles[0], cycles[-1] + 1)):
failures.append('Cycle numbers are not contiguous')
failures.append("Cycle numbers are not contiguous")
# Region range must be contiguous ints
if regions != list(range(regions[0], regions[-1] + 1)):
failures.append('Region numbers are not contiguous')
failures.append("Region numbers are not contiguous")
# All cycle, region pairs must be present
if len(cycles) * len(regions) != total_entries:
failures.append('Not all cycle/region pairs are present')
failures.append("Not all cycle/region pairs are present")
# Total number of channels / total number of cycles must be integer,
# excluding any HandE channels
total_channel_count = len(cn_df)
h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')])
channels_per_cycle = (
total_channel_count - h_and_e_channel_count
) / len(cycles)
h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")])
channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles)
if channels_per_cycle != int(channels_per_cycle):
failures.append('The number of channels per cycle is not constant')
failures.append("The number of channels per cycle is not constant")
if failures:
rslt += failures
raise QuitNowException()
Expand Down
11 changes: 6 additions & 5 deletions src/ingest_validation_tests/codex_json_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,23 @@
class CodexJsonValidator(Validator):
description = "Check CODEX JSON against schema"
cost = 1.0
required = "codex"

def collect_errors(self, **kwargs) -> List[str]:
del kwargs
if 'codex' not in self.assay_type.lower():
return []
if self.required not in self.contains and self.assay_type.lower() != self.required:
return [] # We only test CODEX data

schema_path = Path(__file__).parent / 'codex_schema.json'
schema_path = Path(__file__).parent / "codex_schema.json"
schema = json.loads(schema_path.read_text())

rslt = []
for glob_expr in ['**/dataset.json']:
for glob_expr in ["**/dataset.json"]:
for path in self.paths:
for file in path.glob(glob_expr):
instance = json.loads(file.read_text())
try:
validate(instance=instance, schema=schema)
except Exception as e:
rslt.append(f'{file}: {e}')
rslt.append(f"{file}: {e}")
return rslt
6 changes: 3 additions & 3 deletions src/ingest_validation_tests/fastq_validator.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
from os import cpu_count
from typing import List

from ingest_validation_tools.plugin_validator import Validator
from fastq_validator_logic import FASTQValidatorLogic, _log
from ingest_validation_tools.plugin_validator import Validator


class FASTQValidator(Validator):
description = "Check FASTQ files for basic syntax and consistency."
cost = 15.0

def collect_errors(self, **kwargs) -> List[str]:
threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1
_log(f'Threading at {threads}')
threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1
_log(f"Threading at {threads}")
validator = FASTQValidatorLogic(verbose=True)
validator.validate_fastq_files_in_path(self.paths, threads)
return validator.errors
Loading

0 comments on commit e43d07d

Please sign in to comment.