Skip to content

Commit

Permalink
Merge branch 'refs/heads/devel'
Browse files Browse the repository at this point in the history
  • Loading branch information
sunset666 committed Apr 25, 2024
2 parents 66bf25b + 0f1b3d1 commit e43d07d
Show file tree
Hide file tree
Showing 21 changed files with 500 additions and 321 deletions.
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
extend-ignore = E203,E501,W503
max-line-length = 99
32 changes: 32 additions & 0 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: Linters

on:
push:
pull_request:
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 black isort
- name: Flake8 Lint
run: |
flake8 --ignore=E501,W503,E203 .
- name: Black Lint
run: |
black --line-length 99 --check --verbose .
- name: isort Lint
run: |
isort --profile black --check-only --diff .
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ celerybeat.pid

# Environments
.env
.envrc
.venv
env/
venv/
Expand Down
53 changes: 51 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,58 @@
# ingest-validation-tests

This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.
This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.

## Development process

### Branches

- Make new feature branches from `devel`.
- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory:

```
black --line-length 99 .
isort --profile black --multi-line 3 .
flake8
```

(Integrating black and potentially isort/flake8 with your editor may allow you to skip this step, see Setup section below.)

- Make PRs to `devel`. (This is the default branch.)
- The last reviewer to approve a PR should merge it. At the moment that is likely to be @jswelling .
- The last reviewer to approve a PR should merge it.

### Setup

- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:

```
python3.9 -m venv hm-ingest-validation-tests
source hm-ingest-validation-tests/bin/activate
```

- Run `pip install -r requirements-dev.txt`
- (optional) Integrate black with your editor.
- [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor.

### Testing

- If ingest-validation-tools is not already set up:

```
# Starting from ingest-validation-tests...
cd ..
git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
cd ingest-validation-tests
pip install -r ../ingest-validation-tools/requirements.txt
pip install -r ../ingest-validation-tools/requirements-dev.txt
```

- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run:

```
pip install -r <path-to-ingest-validation-tools>/requirements.txt
pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
```

- Run `test.sh`
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[tool.black]
line-length = 99

[tool.isort]
profile = "black"
multi_line_output = 3
11 changes: 11 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
black==23.12.1
flake8==7.0.0
git+https://github.com/hubmapconsortium/[email protected]#egg=hubmap-fastq-utils
imagecodecs>=2023.3.16
isort==5.13.2
jsonschema==4.4.0
pandas>=1.2.0
pytest==8.0.0
python-frontmatter>=1.0.0
tifffile==2020.10.1
xmlschema>=1.6
105 changes: 48 additions & 57 deletions src/ingest_validation_tests/codex_common_errors_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,20 @@ def _split_cycle_dir_string(cycle_str):
"""
Given a cycle-and-region directory name, split out the cycle and region numbers
"""
words = cycle_str.split('_')
words = cycle_str.split("_")
assert len(words) >= 2, f'Directory string "{cycle_str}" has unexpected form'
assert words[0].startswith('cyc'), (f'directory string "{cycle_str}" does'
' not start with "cyc"')
assert words[0].startswith("cyc"), (
f'directory string "{cycle_str}" does' ' not start with "cyc"'
)
try:
cyc_id = int(words[0][len('cyc'):])
cyc_id = int(words[0][len("cyc") :])
except ValueError:
raise AssertionError(f'Directory string "{cycle_str}" cycle number is'
' not an integer')
assert words[1].startswith('reg'), (f'Directory string "{cycle_str}" does'
' not include "_reg"')
raise AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer")
assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"'
try:
reg_id = int(words[1][len('reg'):])
reg_id = int(words[1][len("reg") :])
except ValueError:
raise AssertionError(f'Directory string "{cycle_str}" region number is'
' not an integer')
raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer")
return cyc_id, reg_id


Expand All @@ -47,115 +45,110 @@ class CodexCommonErrorsValidator(Validator):

description = "Test for common problems found in CODEX"
cost = 1.0
required = "codex"

def collect_errors(self, **kwargs) -> List[str]:
"""
Return the errors found by this validator
"""
del kwargs
if self.assay_type != 'CODEX':
if self.required not in self.contains and self.assay_type.lower() != self.required:
return [] # We only test CODEX data
rslts = []
for path in self.paths:
rslt = []
try:
# is the raw/src_ directory present?
prefix = None
if (path / 'raw').is_dir():
prefix = path / 'raw'
if (path / "raw").is_dir():
prefix = path / "raw"
else:
for candidate in path.glob('src_*'):
for candidate in path.glob("src_*"):
prefix = candidate
if prefix is None:
rslt.append('The raw/src_ subdirectory is missing?')
rslt.append("The raw/src_ subdirectory is missing?")
raise QuitNowException()

# Does dataset.json exist? If so, 'new CODEX' syntax rules
# are in effect
dataset_json_exists = False
any_dataset_json_exists = False
for candidate in path.glob('**/dataset.json'):
for candidate in path.glob("**/dataset.json"):
any_dataset_json_exists = True
if candidate == prefix / 'dataset.json':
if candidate == prefix / "dataset.json":
dataset_json_exists = True
if dataset_json_exists:
print('FOUND dataset.json; skipping further analysis')
print("FOUND dataset.json; skipping further analysis")
raise QuitNowException()
elif any_dataset_json_exists:
rslt.append(
'A dataset.json file exists but'
' is in the wrong place'
)
rslt.append("A dataset.json file exists but" " is in the wrong place")

# is the segmentation.json file on the right side?
found = False
right_place = False
for filepath in path.glob('*/[Ss]egmentation.json'):
for filepath in path.glob("*/[Ss]egmentation.json"):
rel_path = filepath.relative_to(path)
found = True
if str(rel_path).startswith(('raw', 'src_')):
if str(rel_path).startswith(("raw", "src_")):
right_place = True
if found:
if right_place:
pass
else:
rslt.append(
'The segmentation.json file is in the wrong subdirectory'
)
rslt.append("The segmentation.json file is in the wrong subdirectory")
else:
rslt.append('The segmentation.json file is missing or misplaced')
rslt.append("The segmentation.json file is missing or misplaced")

# Does the channelnames.txt file exist?
channelnames_txt_path = prefix / 'channelnames.txt'
channelnames_txt_path = prefix / "channelnames.txt"
if not channelnames_txt_path.is_file():
# sometimes we see this variant
channelnames_txt_path = prefix / 'channelNames.txt'
channelnames_txt_path = prefix / "channelNames.txt"
if not channelnames_txt_path.is_file():
rslt.append('channelnames.txt is missing')
rslt.append("channelnames.txt is missing")
raise QuitNowException()

# Parse channelnames.txt into a dataframe
try:
cn_df = pd.read_csv(str(channelnames_txt_path), header=None)
except Exception:
rslt.append(f'Unexpected error reading {channelnames_txt_path}')
rslt.append(f"Unexpected error reading {channelnames_txt_path}")
raise QuitNowException()
if len(cn_df.columns) != 1:
rslt.append(f'Unexpected format for {channelnames_txt_path}')
rslt.append(f"Unexpected format for {channelnames_txt_path}")
raise QuitNowException()

# Does the channelnames_report.csv file exist?
report_csv_path = prefix / 'channelnames_report.csv'
report_csv_path = prefix / "channelnames_report.csv"
if report_csv_path.is_file():
# Parse channelnames_report.csv into a dataframe
try:
rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None)
rpt_df = pd.read_csv(str(report_csv_path), sep=",", header=None)
except Exception:
rslt.append(f'Unexpected error reading {report_csv_path}')
rslt.append(f"Unexpected error reading {report_csv_path}")
raise QuitNowException()
if len(rpt_df) == len(cn_df) + 1:
# channelnames_report.csv appears to have a header
try:
rpt_df = pd.read_csv(str(report_csv_path), sep=',')
rpt_df = pd.read_csv(str(report_csv_path), sep=",")
except Exception:
rslt.append(f'Unexpected error reading {report_csv_path}')
rslt.append(f"Unexpected error reading {report_csv_path}")
raise QuitNowException()
if len(rpt_df.columns) != 2:
rslt.append(
f'Could not parse {report_csv_path}.'
' Is it a comma-separated table?'
f"Could not parse {report_csv_path}." " Is it a comma-separated table?"
)
raise QuitNowException()
col_0, col_1 = rpt_df.columns
rpt_df = rpt_df.rename(columns={col_0: 'Marker', col_1: 'Result'})
rpt_df = rpt_df.rename(columns={col_0: "Marker", col_1: "Result"})
# Do they match?
rpt_df['other'] = cn_df[0]
mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']]
rpt_df["other"] = cn_df[0]
mismatches_df = rpt_df[rpt_df["other"] != rpt_df["Marker"]]
if len(mismatches_df) != 0:
for idx, row in mismatches_df.iterrows():
rslt.append(
f'{channelnames_txt_path.name} does not'
' match channelnames_report.txt'
f"{channelnames_txt_path.name} does not"
" match channelnames_report.txt"
f' on line {idx}: {row["other"]} vs {row["Marker"]}'
)
raise QuitNowException()
Expand All @@ -164,7 +157,7 @@ def collect_errors(self, **kwargs) -> List[str]:

# Tabulate the cycle and region info
all_cycle_dirs = []
for glob_str in ['cyc*', 'Cyc*']:
for glob_str in ["cyc*", "Cyc*"]:
for pth in prefix.glob(glob_str):
if pth.is_dir():
all_cycle_dirs.append(str(pth.stem).lower())
Expand All @@ -189,28 +182,26 @@ def collect_errors(self, **kwargs) -> List[str]:
failures = []
# First cycle must be 1
if cycles[0] != 1:
failures.append('Cycle numbering does not start at 1')
failures.append("Cycle numbering does not start at 1")
# First region must be 1
if regions[0] != 1:
failures.append('Region numbering does not start at 1')
failures.append("Region numbering does not start at 1")
# Cycle range must be contiguous ints
if cycles != list(range(cycles[0], cycles[-1] + 1)):
failures.append('Cycle numbers are not contiguous')
failures.append("Cycle numbers are not contiguous")
# Region range must be contiguous ints
if regions != list(range(regions[0], regions[-1] + 1)):
failures.append('Region numbers are not contiguous')
failures.append("Region numbers are not contiguous")
# All cycle, region pairs must be present
if len(cycles) * len(regions) != total_entries:
failures.append('Not all cycle/region pairs are present')
failures.append("Not all cycle/region pairs are present")
# Total number of channels / total number of cycles must be integer,
# excluding any HandE channels
total_channel_count = len(cn_df)
h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')])
channels_per_cycle = (
total_channel_count - h_and_e_channel_count
) / len(cycles)
h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")])
channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles)
if channels_per_cycle != int(channels_per_cycle):
failures.append('The number of channels per cycle is not constant')
failures.append("The number of channels per cycle is not constant")
if failures:
rslt += failures
raise QuitNowException()
Expand Down
11 changes: 6 additions & 5 deletions src/ingest_validation_tests/codex_json_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,23 @@
class CodexJsonValidator(Validator):
description = "Check CODEX JSON against schema"
cost = 1.0
required = "codex"

def collect_errors(self, **kwargs) -> List[str]:
del kwargs
if 'codex' not in self.assay_type.lower():
return []
if self.required not in self.contains and self.assay_type.lower() != self.required:
return [] # We only test CODEX data

schema_path = Path(__file__).parent / 'codex_schema.json'
schema_path = Path(__file__).parent / "codex_schema.json"
schema = json.loads(schema_path.read_text())

rslt = []
for glob_expr in ['**/dataset.json']:
for glob_expr in ["**/dataset.json"]:
for path in self.paths:
for file in path.glob(glob_expr):
instance = json.loads(file.read_text())
try:
validate(instance=instance, schema=schema)
except Exception as e:
rslt.append(f'{file}: {e}')
rslt.append(f"{file}: {e}")
return rslt
6 changes: 3 additions & 3 deletions src/ingest_validation_tests/fastq_validator.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
from os import cpu_count
from typing import List

from ingest_validation_tools.plugin_validator import Validator
from fastq_validator_logic import FASTQValidatorLogic, _log
from ingest_validation_tools.plugin_validator import Validator


class FASTQValidator(Validator):
description = "Check FASTQ files for basic syntax and consistency."
cost = 15.0

def collect_errors(self, **kwargs) -> List[str]:
threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1
_log(f'Threading at {threads}')
threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1
_log(f"Threading at {threads}")
validator = FASTQValidatorLogic(verbose=True)
validator.validate_fastq_files_in_path(self.paths, threads)
return validator.errors
Loading

0 comments on commit e43d07d

Please sign in to comment.