diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..70fc30a
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+extend-ignore = E203,E501,W503
+max-line-length = 99
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
new file mode 100644
index 0000000..b4fb9d4
--- /dev/null
+++ b/.github/workflows/linting.yml
@@ -0,0 +1,32 @@
+name: Linters
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 black isort
+      - name: Flake8 Lint
+        run: |
+          flake8 --ignore=E501,W503,E203 .
+      - name: Black Lint
+        run: |
+          black --line-length 99 --check --verbose .
+      - name: isort Lint
+        run: |
+          isort --profile black --check-only --diff .
diff --git a/.gitignore b/.gitignore
index 916db7b..da9fde3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,6 +105,7 @@ celerybeat.pid
 
 # Environments
 .env
+.envrc
 .venv
 env/
 venv/
diff --git a/README.md b/README.md
index 2db444a..0cca7ba 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,58 @@
 # ingest-validation-tests
 
-This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools. 
+This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.
 
 ## Development process
 
+### Branches
+
 - Make new feature branches from `devel`.
+- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory:
+
+  ```
+  black --line-length 99 .
+  isort --profile black --multi-line 3 .
+  flake8
+  ```
+
+  (Integrating black and potentially isort/flake8 with your editor may allow you to skip this step; see the Setup section below.)
+
 - Make PRs to `devel`. (This is the default branch.)
-- The last reviewer to approve a PR should merge it. At the moment that is likely to be @jswelling .
+- The last reviewer to approve a PR should merge it.
+
+### Setup
+
+- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:
+
+  ```
+  python3.9 -m venv hm-ingest-validation-tests
+  source hm-ingest-validation-tests/bin/activate
+  ```
+
+- Run `pip install -r requirements-dev.txt`
+- (optional) Integrate black with your editor.
+  - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
+- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
+- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor.
+
+### Testing
+
+- If ingest-validation-tools is not already set up:
+
+  ```
+  # Starting from ingest-validation-tests...
+  cd ..
+ git clone https://github.com/hubmapconsortium/ingest-validation-tools.git + cd ingest-validation-tests + pip install -r ../ingest-validation-tools/requirements.txt + pip install -r ../ingest-validation-tools/requirements-dev.txt + ``` + +- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run: + + ``` + pip install -r /requirements.txt + pip install -r /requirements-dev.txt + ``` + +- Run `test.sh` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..93863fa --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[tool.black] +line-length = 99 + +[tool.isort] +profile = "black" +multi_line_output = 3 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..ba6f71b --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,11 @@ +black==23.12.1 +flake8==7.0.0 +git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils +imagecodecs>=2023.3.16 +isort==5.13.2 +jsonschema==4.4.0 +pandas>=1.2.0 +pytest==8.0.0 +python-frontmatter>=1.0.0 +tifffile==2020.10.1 +xmlschema>=1.6 diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index 391e9cc..06984e9 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -20,22 +20,20 @@ def _split_cycle_dir_string(cycle_str): """ Given a cycle-and-region directory name, split out the cyle and region numbers """ - words = cycle_str.split('_') + words = cycle_str.split("_") assert len(words) >= 2, f'Directory string "{cycle_str}" has unexpected form' - assert words[0].startswith('cyc'), (f'directory string "{cycle_str}" does' - ' not start with "cyc"') + assert words[0].startswith("cyc"), ( + f'directory string "{cycle_str}" does' ' not start with "cyc"' + ) try: - cyc_id = int(words[0][len('cyc'):]) + cyc_id = int(words[0][len("cyc") :]) except ValueError: - raise AssertionError(f'Directory string "{cycle_str}" cycle number is' - ' not an integer') - assert words[1].startswith('reg'), (f'Directory string "{cycle_str}" does' - ' not include "_reg"') + raise AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer") + assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"' try: - reg_id = int(words[1][len('reg'):]) + reg_id = int(words[1][len("reg") :]) except ValueError: - raise AssertionError(f'Directory string "{cycle_str}" region number is' - ' not an integer') + raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer") return cyc_id, reg_id @@ -47,13 +45,14 @@ class CodexCommonErrorsValidator(Validator): description = "Test for common problems found in CODEX" cost = 1.0 + required = "codex" def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ del kwargs - if self.assay_type != 'CODEX': + if self.required not in self.contains and self.assay_type.lower() != self.required: return [] # We only test CODEX data rslts = [] for path in self.paths: @@ -61,101 +60,95 @@ def collect_errors(self, **kwargs) -> List[str]: try: # is the raw/src_ directory present? 
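+                # raw/ wins when both conventions are present; if several
+                # src_* directories exist, the last glob match becomes the prefix.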
prefix = None - if (path / 'raw').is_dir(): - prefix = path / 'raw' + if (path / "raw").is_dir(): + prefix = path / "raw" else: - for candidate in path.glob('src_*'): + for candidate in path.glob("src_*"): prefix = candidate if prefix is None: - rslt.append('The raw/src_ subdirectory is missing?') + rslt.append("The raw/src_ subdirectory is missing?") raise QuitNowException() # Does dataset.json exist? If so, 'new CODEX' syntax rules # are in effect dataset_json_exists = False any_dataset_json_exists = False - for candidate in path.glob('**/dataset.json'): + for candidate in path.glob("**/dataset.json"): any_dataset_json_exists = True - if candidate == prefix / 'dataset.json': + if candidate == prefix / "dataset.json": dataset_json_exists = True if dataset_json_exists: - print('FOUND dataset.json; skipping further analysis') + print("FOUND dataset.json; skipping further analysis") raise QuitNowException() elif any_dataset_json_exists: - rslt.append( - 'A dataset.json file exists but' - ' is in the wrong place' - ) + rslt.append("A dataset.json file exists but" " is in the wrong place") # is the segmentation.json file on the right side? found = False right_place = False - for filepath in path.glob('*/[Ss]egmentation.json'): + for filepath in path.glob("*/[Ss]egmentation.json"): rel_path = filepath.relative_to(path) found = True - if str(rel_path).startswith(('raw', 'src_')): + if str(rel_path).startswith(("raw", "src_")): right_place = True if found: if right_place: pass else: - rslt.append( - 'The segmentation.json file is in the wrong subdirectory' - ) + rslt.append("The segmentation.json file is in the wrong subdirectory") else: - rslt.append('The segmentation.json file is missing or misplaced') + rslt.append("The segmentation.json file is missing or misplaced") # Does the channelnames.txt file exist? - channelnames_txt_path = prefix / 'channelnames.txt' + channelnames_txt_path = prefix / "channelnames.txt" if not channelnames_txt_path.is_file(): # sometimes we see this variant - channelnames_txt_path = prefix / 'channelNames.txt' + channelnames_txt_path = prefix / "channelNames.txt" if not channelnames_txt_path.is_file(): - rslt.append('channelnames.txt is missing') + rslt.append("channelnames.txt is missing") raise QuitNowException() # Parse channelnames.txt into a dataframe try: cn_df = pd.read_csv(str(channelnames_txt_path), header=None) except Exception: - rslt.append(f'Unexpected error reading {channelnames_txt_path}') + rslt.append(f"Unexpected error reading {channelnames_txt_path}") raise QuitNowException() if len(cn_df.columns) != 1: - rslt.append(f'Unexpected format for {channelnames_txt_path}') + rslt.append(f"Unexpected format for {channelnames_txt_path}") raise QuitNowException() # Does the channelnames_report.csv file exist? 
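+                # channelnames_report.csv is optional; when present, its first
+                # column must mirror channelnames.txt line by line.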
- report_csv_path = prefix / 'channelnames_report.csv' + report_csv_path = prefix / "channelnames_report.csv" if report_csv_path.is_file(): # Parse channelnames_report.txt into a dataframe try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None) + rpt_df = pd.read_csv(str(report_csv_path), sep=",", header=None) except Exception: - rslt.append(f'Unexpected error reading {report_csv_path}') + rslt.append(f"Unexpected error reading {report_csv_path}") raise QuitNowException() if len(rpt_df) == len(cn_df) + 1: # channelnames_report.csv appears to have a header try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',') + rpt_df = pd.read_csv(str(report_csv_path), sep=",") except Exception: - rslt.append(f'Unexpected error reading {report_csv_path}') + rslt.append(f"Unexpected error reading {report_csv_path}") raise QuitNowException() if len(rpt_df.columns) != 2: rslt.append( - f'Could not parse {report_csv_path}.' - ' Is it a comma-separated table?' + f"Could not parse {report_csv_path}." " Is it a comma-separated table?" ) raise QuitNowException() col_0, col_1 = rpt_df.columns - rpt_df = rpt_df.rename(columns={col_0: 'Marker', col_1: 'Result'}) + rpt_df = rpt_df.rename(columns={col_0: "Marker", col_1: "Result"}) # Do they match? - rpt_df['other'] = cn_df[0] - mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']] + rpt_df["other"] = cn_df[0] + mismatches_df = rpt_df[rpt_df["other"] != rpt_df["Marker"]] if len(mismatches_df) != 0: for idx, row in mismatches_df.iterrows(): rslt.append( - f'{channelnames_txt_path.name} does not' - ' match channelnames_report.txt' + f"{channelnames_txt_path.name} does not" + " match channelnames_report.txt" f' on line {idx}: {row["other"]} vs {row["Marker"]}' ) raise QuitNowException() @@ -164,7 +157,7 @@ def collect_errors(self, **kwargs) -> List[str]: # Tabulate the cycle and region info all_cycle_dirs = [] - for glob_str in ['cyc*', 'Cyc*']: + for glob_str in ["cyc*", "Cyc*"]: for pth in prefix.glob(glob_str): if pth.is_dir(): all_cycle_dirs.append(str(pth.stem).lower()) @@ -189,28 +182,26 @@ def collect_errors(self, **kwargs) -> List[str]: failures = [] # First cycle must be 1 if cycles[0] != 1: - failures.append('Cycle numbering does not start at 1') + failures.append("Cycle numbering does not start at 1") # First region must be 1 if regions[0] != 1: - failures.append('Region numbering does not start at 1') + failures.append("Region numbering does not start at 1") # Cycle range must be contiguous ints if cycles != list(range(cycles[0], cycles[-1] + 1)): - failures.append('Cycle numbers are not contiguous') + failures.append("Cycle numbers are not contiguous") # Region range must be contiguous ints if regions != list(range(regions[0], regions[-1] + 1)): - failures.append('Region numbers are not contiguous') + failures.append("Region numbers are not contiguous") # All cycle, region pairs must be present if len(cycles) * len(regions) != total_entries: - failures.append('Not all cycle/region pairs are present') + failures.append("Not all cycle/region pairs are present") # Total number of channels / total number of cycles must be integer, # excluding any HandE channels total_channel_count = len(cn_df) - h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')]) - channels_per_cycle = ( - total_channel_count - h_and_e_channel_count - ) / len(cycles) + h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")]) + channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles) if channels_per_cycle != 
int(channels_per_cycle): - failures.append('The number of channels per cycle is not constant') + failures.append("The number of channels per cycle is not constant") if failures: rslt += failures raise QuitNowException() diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py index 1e9e9ba..bd1717d 100644 --- a/src/ingest_validation_tests/codex_json_validator.py +++ b/src/ingest_validation_tests/codex_json_validator.py @@ -9,22 +9,23 @@ class CodexJsonValidator(Validator): description = "Check CODEX JSON against schema" cost = 1.0 + required = "codex" def collect_errors(self, **kwargs) -> List[str]: del kwargs - if 'codex' not in self.assay_type.lower(): - return [] + if self.required not in self.contains and self.assay_type.lower() != self.required: + return [] # We only test CODEX data - schema_path = Path(__file__).parent / 'codex_schema.json' + schema_path = Path(__file__).parent / "codex_schema.json" schema = json.loads(schema_path.read_text()) rslt = [] - for glob_expr in ['**/dataset.json']: + for glob_expr in ["**/dataset.json"]: for path in self.paths: for file in path.glob(glob_expr): instance = json.loads(file.read_text()) try: validate(instance=instance, schema=schema) except Exception as e: - rslt.append(f'{file}: {e}') + rslt.append(f"{file}: {e}") return rslt diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index 0a30e42..d31335f 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,8 +1,8 @@ from os import cpu_count from typing import List -from ingest_validation_tools.plugin_validator import Validator from fastq_validator_logic import FASTQValidatorLogic, _log +from ingest_validation_tools.plugin_validator import Validator class FASTQValidator(Validator): @@ -10,8 +10,8 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + _log(f"Threading at {threads}") validator = FASTQValidatorLogic(verbose=True) validator.validate_fastq_files_in_path(self.paths, threads) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 85eeb44..7ae516f 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -15,10 +15,7 @@ def is_valid_filename(filename: str) -> bool: def _open_fastq_file(file: Path) -> TextIO: - return ( - gzip.open(file, 'rt') if file.name.endswith('.gz') - else file.open() - ) + return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open() def _log(message: str) -> str: @@ -54,13 +51,13 @@ class FASTQValidatorLogic: contain the same number of symbols as letters in the sequence. 
""" - _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' + _FASTQ_LINE_2_VALID_CHARS = "ACGNT" def __init__(self, verbose=False): self.errors: List[str] = [] self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() - self._filename = '' + self._filename = "" self._line_number = 0 self._verbose = verbose @@ -80,7 +77,7 @@ def _format_error(self, error: str) -> str: return message def _validate_fastq_line_1(self, line: str) -> List[str]: - if not line or line[0] != '@': + if not line or line[0] != "@": return ["Line does not begin with '@'."] return [] @@ -89,45 +86,47 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = ''.join( - c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) + invalid_chars = "".join(c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] return [] def _validate_fastq_line_3(self, line: str) -> List[str]: - if not line or line[0] != '+': + if not line or line[0] != "+": return ["Line does not begin with '+'."] return [] def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] - invalid_chars = ''.join(c for c in line if not 33 <= ord(c) <= 126) + invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append("Line contains invalid quality character(s): " - f'"{invalid_chars}"') + errors.append("Line contains invalid quality character(s): " f'"{invalid_chars}"') if len(line) != self._line_2_length: - errors.append(f"Line contains {len(line)} characters which " - f"does not match line {self._last_line_2_number}'s " - f"{self._line_2_length} characters.") + errors.append( + f"Line contains {len(line)} characters which " + f"does not match line {self._last_line_2_number}'s " + f"{self._line_2_length} characters." + ) return errors - _VALIDATE_FASTQ_LINE_METHODS = {1: _validate_fastq_line_1, - 2: _validate_fastq_line_2, - 3: _validate_fastq_line_3, - 4: _validate_fastq_line_4} + _VALIDATE_FASTQ_LINE_METHODS = { + 1: _validate_fastq_line_1, + 2: _validate_fastq_line_2, + 3: _validate_fastq_line_3, + 4: _validate_fastq_line_4, + } def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ + validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = ( self._VALIDATE_FASTQ_LINE_METHODS[line_index] + ) - assert validator_method, \ - f"No validator method defined for record index {line_index}" + assert validator_method, f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -139,8 +138,8 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: for line_count, line in enumerate(fastq_data): self._line_number = line_count + 1 self.errors.extend( - self._format_error(error) for error in - self.validate_fastq_record(line.rstrip(), line_count) + self._format_error(error) + for error in self.validate_fastq_record(line.rstrip(), line_count) ) return line_count + 1 @@ -151,9 +150,9 @@ def validate_fastq_file(self, fastq_file: Path) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. 
-            self.errors.append(_log(
-                "Filename does not have proper format "
-                "and will not be processed"))
+            self.errors.append(
+                _log("Filename does not have proper format " "and will not be processed")
+            )
             return
 
         self._line_number = 0
@@ -163,13 +162,20 @@ def validate_fastq_file(self, fastq_file: Path) -> None:
             with _open_fastq_file(fastq_file) as fastq_data:
                 records_read = self.validate_fastq_stream(fastq_data)
         except gzip.BadGzipFile:
-            self.errors.append(
-                self._format_error(f"Bad gzip file: {fastq_file}."))
+            self.errors.append(self._format_error(f"Bad gzip file: {fastq_file}."))
             return
         except IOError:
-            self.errors.append(
-                self._format_error(f"Unable to open FASTQ data file {fastq_file}."))
+            self.errors.append(self._format_error(f"Unable to open FASTQ data file {fastq_file}."))
             return
+        except EOFError:
+            self.errors.append(self._format_error(f"EOF in FASTQ data file {fastq_file}."))
+            # records_read is never assigned if the stream ends early, so stop here.
+            return
+        except Exception as e:
+            self.errors.append(
+                self._format_error(f"Unexpected error: {e} on data file {fastq_file}.")
+            )
+            return
         self._file_record_counts[str(fastq_file)] = records_read
 
     def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None:
@@ -187,12 +190,17 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None:
                 for file in files:
                     file_list.append(Path(path / rel_path / file))
         try:
-            logging.info(f"Passing file list for paths {paths} to engine. File list: {file_list}.")
+            logging.info(
+                f"Passing file list for paths {paths} to engine. File list: {file_list}."
+            )
             pool = Pool(threads)
             engine = Engine(self)
             data_output = pool.imap_unordered(engine, file_list)
         except Exception as e:
             _log(f"Error {e}")
+            pool.close()
+            pool.join()
+            data_found_one.append(f"Error {e}")
         else:
             pool.close()
             pool.join()
@@ -214,14 +222,18 @@ def _find_duplicates(self, dirs_and_files):
                 files_per_path[filepath.name].append(data_path / sub_path)
         for filename, filepaths in files_per_path.items():
             if len(filepaths) > 1:
-                self.errors.append(_log(
-                    f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}."))  # noqa: E501
+                self.errors.append(
+                    _log(
+                        f"{filename} has been found multiple times during this validation. "
+                        f"Locations of duplicates: {filepaths}."
+                    )
+                )
 
     def _find_shared_prefixes(self, lock):
         # This pattern seeks out the string that includes the lane number (since
         # that is expected to be present to help anchor the prefix) that comes
         # before any of _I1, _I2, _R1, or _R2.
-        fastq_file_prefix_regex = re.compile(r'(.+_L\d+.*)_[IR][12][._]')
+        fastq_file_prefix_regex = re.compile(r"(.+_L\d+.*)_[IR][12][._]")
         for fastq_file, records_read in self._file_record_counts.items():
             match = fastq_file_prefix_regex.match(Path(fastq_file).name)
             with lock:
@@ -233,26 +245,31 @@ def _find_shared_prefixes(self, lock):
                             # Find a file we've validated already that matches this
                             # prefix.
                             extant_files = [
-                                str(Path(filepath).name) for filepath, record_count
-                                in self._file_record_counts.items()
-                                if record_count == extant_count and Path(filepath).name.startswith(filename_prefix)
+                                str(Path(filepath).name)
+                                for filepath, record_count in self._file_record_counts.items()
+                                if record_count == extant_count
+                                and Path(filepath).name.startswith(filename_prefix)
                             ]
                             # Based on how the dictionaries are created, there should
                             # always be at least one matching filename.
                             assert extant_files
-                            self.errors.append(_log(
-                                f"{Path(fastq_file).name} ({records_read} lines) "
-                                f"does not match length of {extant_files[0]} "
-                                f"({extant_count} lines)."))
+                            self.errors.append(
+                                _log(
+                                    f"{Path(fastq_file).name} ({records_read} lines) "
+                                    f"does not match length of {extant_files[0]} "
+                                    f"({extant_count} lines)."
+                                )
+                            )
                     else:
                         self._file_prefix_counts[filename_prefix] = records_read
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Validate FASTQ files.')
-    parser.add_argument('filepaths', type=Path, nargs='+',
-                        help="Files to validate for FASTQ syntax")
+    parser = argparse.ArgumentParser(description="Validate FASTQ files.")
+    parser.add_argument(
+        "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax"
+    )
     args = parser.parse_args()
 
     if isinstance(args.filepaths, List):
@@ -262,13 +279,11 @@ def main():
     elif isinstance(args.filepaths, str):
         filepaths = [Path(args.filepaths)]
     else:
-        raise Exception(
-            f"Validator init received base_paths arg as type {type(args.filepaths)}"
-        )
+        raise Exception(f"Validator init received base_paths arg as type {type(args.filepaths)}")
 
     validator = FASTQValidatorLogic(True)
     validator.validate_fastq_files_in_path(filepaths, Lock())
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py
index 58ad671..fe0a147 100644
--- a/src/ingest_validation_tests/gz_validator.py
+++ b/src/ingest_validation_tests/gz_validator.py
@@ -1,9 +1,9 @@
+import gzip
+import re
 from multiprocessing import Pool
 from os import cpu_count
-import re
 from typing import List
 
-import gzip
 from ingest_validation_tools.plugin_validator import Validator
 
@@ -46,6 +46,9 @@ def collect_errors(self, **kwargs) -> List[str]:
             data_output = pool.imap_unordered(engine, file_list)
         except Exception as e:
             _log(f"Error {e}")
+            pool.close()
+            pool.join()
+            data_output2.append(f"Error: {e}")
         else:
             pool.close()
             pool.join()
diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py
index 4c7ccc1..a3d705a 100644
--- a/src/ingest_validation_tests/publication_validator.py
+++ b/src/ingest_validation_tests/publication_validator.py
@@ -2,10 +2,11 @@
 Test for some common errors in the directory and file structure of publications.
 """
 
-from typing import List
-import re
 import json
+import re
 from pathlib import Path
+from typing import List
+
 import frontmatter
 from ingest_validation_tools.plugin_validator import Validator
 
@@ -15,50 +16,58 @@ class PublicationValidator(Validator):
     Test for some common errors in the directory
     and file structure of publications.
""" + description = "Test for common problems found in publications" cost = 1.0 - base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)' - url_re = r'[Uu][Rr][Ll]' + base_url_re = r"(\s*\{\{\s*base_url\s*\}\})/(.*)" + url_re = r"[Uu][Rr][Ll]" + required = "publication" def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ del kwargs - if self.assay_type != 'Publication': + if self.required not in self.contains and self.assay_type.lower() != self.required: return [] # We only test Publication data rslt = [] for path in self.paths: try: - vignette_path = path / 'vignettes' - assert vignette_path.is_dir(), 'vignettes not found or not a directory' - for this_vignette_path in vignette_path.glob('*'): - assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}" - " in vignettes") - this_vignette_all_paths = set(this_vignette_path.glob('*')) + vignette_path = path / "vignettes" + assert vignette_path.is_dir(), "vignettes not found or not a directory" + for this_vignette_path in vignette_path.glob("*"): + assert this_vignette_path.is_dir(), ( + f"Found the non-dir {this_vignette_path}" " in vignettes" + ) + this_vignette_all_paths = set(this_vignette_path.glob("*")) if not all(pth.is_file() for pth in this_vignette_all_paths): - raise AssertionError('Found a subdirectory in a vignette') + raise AssertionError("Found a subdirectory in a vignette") md_found = False vig_figures = [] - for md_path in this_vignette_path.glob('*.md'): + for md_path in this_vignette_path.glob("*.md"): if md_found: - raise AssertionError('A vignette has more than one markdown file') + raise AssertionError("A vignette has more than one markdown file") else: md_found = True vig_fm = frontmatter.loads(md_path.read_text()) - for key in ['name', 'figures']: - assert key in vig_fm.metadata, ('vignette markdown is incorrectly' - f' formatted or has no {key}') - for fig_dict in vig_fm.metadata['figures']: - assert 'file' in fig_dict, 'figure dict does not reference a file' - assert 'name' in fig_dict, 'figure dict does not provide a name' - vig_figures.append(fig_dict['file']) + for key in ["name", "figures"]: + assert key in vig_fm.metadata, ( + "vignette markdown is incorrectly" f" formatted or has no {key}" + ) + for fig_dict in vig_fm.metadata["figures"]: + assert "file" in fig_dict, "figure dict does not reference a file" + assert "name" in fig_dict, "figure dict does not provide a name" + vig_figures.append(fig_dict["file"]) this_vignette_all_paths.remove(md_path) for fname in vig_figures: - rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path)) + rslt.extend( + self.validate_vitessce_config(this_vignette_path / fname, path) + ) this_vignette_all_paths.remove(this_vignette_path / fname) - assert not this_vignette_all_paths, ('unexpected files in vignette:' - f' {list(str(elt) for elt in this_vignette_all_paths)}') + assert not this_vignette_all_paths, ( + "unexpected files in vignette:" + f" {list(str(elt) for elt in this_vignette_all_paths)}" + ) except AssertionError as excp: rslt.append(str(excp)) @@ -94,9 +103,10 @@ def validate_vitessce_config(self, json_path, path): match = re.match(self.base_url_re, val) if match: # it starts with {{ base_url }} extra_url = match.group(2) - data_path = path / 'data' / extra_url - assert data_path.exists(), ("expected data file" - f" {Path('data') / extra_url} is absent") + data_path = path / "data" / extra_url + assert data_path.exists(), ( + "expected data file" f" {Path('data') / extra_url} is absent" 
+ ) except AssertionError as excp: rslt.append(str(excp)) diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index db840c1..23b568a 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -37,13 +37,15 @@ class TiffValidator(Validator): cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 pool = Pool(threads) filenames_to_test = [] - for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: + for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: for path in self.paths: for file in path.glob(glob_expr): filenames_to_test.append(file) - return list(rslt for rslt in pool.imap_unordered(_check_tiff_file, - filenames_to_test) - if rslt is not None) + return list( + rslt + for rslt in pool.imap_unordered(_check_tiff_file, filenames_to_test) + if rslt is not None + ) diff --git a/tests/pytest_runner.py b/tests/pytest_runner.py index 0743dad..ac7e5a3 100644 --- a/tests/pytest_runner.py +++ b/tests/pytest_runner.py @@ -1,12 +1,15 @@ import sys from pathlib import Path + import pytest -class add_path(): + +class add_path: """ Add an element to sys.path using a context. Thanks to Eugene Yarmash https://stackoverflow.com/a/39855753 """ + def __init__(self, path): self.path = path @@ -22,16 +25,13 @@ def __exit__(self, exc_type, exc_value, traceback): def main(): if len(sys.argv) != 2: - sys.exit(f'usage: {sys.argv[0]} path-to-ingest-validation-tools') - tools_path = Path(sys.argv[1]).resolve() / 'src' - plugins_path = (Path(__file__).resolve().parent.parent - / 'src' - / 'ingest_validation_tests' - ) + sys.exit(f"usage: {sys.argv[0]} path-to-ingest-validation-tools") + tools_path = Path(sys.argv[1]).resolve() / "src" + plugins_path = Path(__file__).resolve().parent.parent / "src" / "ingest_validation_tests" with add_path(str(tools_path)): with add_path(str(plugins_path)): - sys.exit(pytest.main(['-vv'])) + sys.exit(pytest.main(["-vv"])) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tests/test_codex_common_errors_validator.py b/tests/test_codex_common_errors_validator.py index 23cd5e7..68d1d82 100644 --- a/tests/test_codex_common_errors_validator.py +++ b/tests/test_codex_common_errors_validator.py @@ -1,53 +1,84 @@ -from pathlib import Path import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_starts_list'), ( - ('test_data/fake_codex_tree_0.zip', ['Unexpected error reading']), - ('test_data/fake_codex_tree_1.zip', ['The segmentation.json file is in', - 'Unexpected error reading']), - ('test_data/fake_codex_tree_2.zip', ['The raw/src_ subdirectory is missing?']), - ('test_data/fake_codex_tree_3.zip', ['channelnames.txt is missing']), - ('test_data/fake_codex_tree_4.zip', ['Unexpected error reading']), - ('test_data/fake_codex_tree_5.zip', ['channelnames.txt does not match channelnames_report.txt' - ' on line 1: HLADR vs HLA-DR', - 'channelnames.txt does not match channelnames_report.txt' - ' on line 6: Empty vs Blank']), - ('test_data/fake_codex_tree_6.zip', ['Could not parse ']), - ('test_data/fake_codex_tree_7.zip', []), - ('test_data/fake_codex_tree_8.zip', ['Region numbers are not contiguous']), - ('test_data/fake_codex_tree_9.zip', ['Cycle numbers are not contiguous', - 'The number of channels per cycle is not constant']), - 
('test_data/fake_codex_tree_10.zip', ['Directory string "cyc0a3_reg001_211119_040351"' - ' cycle number is not an integer']), - ('test_data/fake_codex_tree_11.zip', ['Directory string "cyc003_reg0a1_211119_040351"' - ' region number is not an integer']), - ('test_data/fake_codex_tree_12.zip', ['Directory string "cyc002_rig001_211119_040351"' - ' does not include "_reg"']), - ('test_data/fake_codex_tree_13.zip', ['Cycle numbering does not start at 1']), - ('test_data/fake_codex_tree_14.zip', ['Region numbering does not start at 1']), - ('test_data/fake_codex_tree_15.zip', ['Not all cycle/region pairs are present', - 'The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_16.zip', []), - ('test_data/fake_codex_tree_17.zip', ['A dataset.json file exists but is in the wrong place', - 'Region numbering does not start at 1']), - ('test_data/fake_codex_tree_18.zip', ['The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_19.zip', []), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_starts_list"), + ( + ("test_data/fake_codex_tree_0.zip", ["Unexpected error reading"]), + ( + "test_data/fake_codex_tree_1.zip", + ["The segmentation.json file is in", "Unexpected error reading"], + ), + ("test_data/fake_codex_tree_2.zip", ["The raw/src_ subdirectory is missing?"]), + ("test_data/fake_codex_tree_3.zip", ["channelnames.txt is missing"]), + ("test_data/fake_codex_tree_4.zip", ["Unexpected error reading"]), + ( + "test_data/fake_codex_tree_5.zip", + [ + "channelnames.txt does not match channelnames_report.txt" + " on line 1: HLADR vs HLA-DR", + "channelnames.txt does not match channelnames_report.txt" + " on line 6: Empty vs Blank", + ], + ), + ("test_data/fake_codex_tree_6.zip", ["Could not parse "]), + ("test_data/fake_codex_tree_7.zip", []), + ("test_data/fake_codex_tree_8.zip", ["Region numbers are not contiguous"]), + ( + "test_data/fake_codex_tree_9.zip", + [ + "Cycle numbers are not contiguous", + "The number of channels per cycle is not constant", + ], + ), + ( + "test_data/fake_codex_tree_10.zip", + ['Directory string "cyc0a3_reg001_211119_040351"' " cycle number is not an integer"], + ), + ( + "test_data/fake_codex_tree_11.zip", + ['Directory string "cyc003_reg0a1_211119_040351"' " region number is not an integer"], + ), + ( + "test_data/fake_codex_tree_12.zip", + ['Directory string "cyc002_rig001_211119_040351"' ' does not include "_reg"'], + ), + ("test_data/fake_codex_tree_13.zip", ["Cycle numbering does not start at 1"]), + ("test_data/fake_codex_tree_14.zip", ["Region numbering does not start at 1"]), + ( + "test_data/fake_codex_tree_15.zip", + [ + "Not all cycle/region pairs are present", + "The number of channels per cycle is not constant", + ], + ), + ("test_data/fake_codex_tree_16.zip", []), + ( + "test_data/fake_codex_tree_17.zip", + [ + "A dataset.json file exists but is in the wrong place", + "Region numbering does not start at 1", + ], + ), + ("test_data/fake_codex_tree_18.zip", ["The number of channels per cycle is not constant"]), + ("test_data/fake_codex_tree_19.zip", []), + ), +) def test_codex_common_errors_validator(test_data_fname, msg_starts_list, tmp_path): from codex_common_errors_validator import CodexCommonErrorsValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], - 'CODEX' - ) + validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], 
"CODEX") errors = validator.collect_errors()[:] - print(f'ERRORS FOLLOW FOR {test_data_fname}') + print(f"ERRORS FOLLOW FOR {test_data_fname}") for err in errors: print(err) - print('ERRORS ABOVE') + print("ERRORS ABOVE") assert len(msg_starts_list) == len(errors) for err_str, expected_str in zip(errors, msg_starts_list): assert err_str.startswith(expected_str) diff --git a/tests/test_codex_json_validator.py b/tests/test_codex_json_validator.py index a2998ff..4539270 100644 --- a/tests/test_codex_json_validator.py +++ b/tests/test_codex_json_validator.py @@ -1,22 +1,27 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip', - [".*is not of type 'object'.*"]), - ('test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip', []), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ( + "test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip", + [".*is not of type 'object'.*"], + ), + ("test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip", []), + ), +) def test_codex_json_validator(test_data_fname, msg_re_list, tmp_path): from codex_json_validator import CodexJsonValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexJsonValidator(tmp_path / test_data_path.stem, - 'CODEX' - ) + validator = CodexJsonValidator(tmp_path / test_data_path.stem, "CODEX") errors = validator.collect_errors()[:] assert len(msg_re_list) == len(errors) for err_str, expected_re in zip(errors, msg_re_list): diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index a244cd8..31bd68a 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -1,36 +1,31 @@ -from multiprocessing import Lock +import gzip from pathlib import Path from typing import TextIO -import gzip import pytest -from src.ingest_validation_tests.fastq_validator_logic import \ - FASTQValidatorLogic +from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic -_GOOD_RECORDS = '''\ +_GOOD_RECORDS = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT NACTGACTGA + #FFFFFFFFF -''' +""" _GOOD_QUALITY_RECORD = ( - '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ' - r'[\]^_`abcdefghijklmnopqrstuvwxyz{|}~' + "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + r"[\]^_`abcdefghijklmnopqrstuvwxyz{|}~" ) _GOOD_SEQUENCE_FOR_QUALITY = ( - 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' - 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" ) def _open_output_file(filename: Path, use_gzip: bool) -> TextIO: - return ( - gzip.open(filename, 'wt') if use_gzip - else open(filename, 'wt') - ) + return gzip.open(filename, "wt") if use_gzip else open(filename, "wt") class TestFASTQValidatorLogic: @@ -46,33 +41,29 @@ def test_fastq_validator_no_files(self, fastq_validator, tmp_path): def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path): # Note that the filename ends in .gz, although it will not contain # compressed data. 
- test_file = tmp_path.joinpath('test.fastq.gz') + test_file = tmp_path.joinpath("test.fastq.gz") with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file) assert "Bad gzip file" in fastq_validator.errors[0] - def test_fastq_validator_unrecognized_file(self, fastq_validator, - tmp_path): - test_file = tmp_path.joinpath('test.txt') + def test_fastq_validator_unrecognized_file(self, fastq_validator, tmp_path): + test_file = tmp_path.joinpath("test.txt") with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file) - assert "Filename does not have proper format" in \ - fastq_validator.errors[0] + assert "Filename does not have proper format" in fastq_validator.errors[0] - def test_fastq_validator_empty_directory(self, fastq_validator, - tmp_path): + def test_fastq_validator_empty_directory(self, fastq_validator, tmp_path): fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # No files in path means no errors assert fastq_validator.errors == [] @pytest.mark.parametrize("use_gzip", [False, True]) def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): - test_file = tmp_path.joinpath('test.fastq.gz' if use_gzip - else 'test.fastq') + test_file = tmp_path.joinpath("test.fastq.gz" if use_gzip else "test.fastq") with _open_output_file(test_file, use_gzip) as output: output.write(_GOOD_RECORDS) @@ -80,9 +71,9 @@ def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): assert not fastq_validator.errors def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): - test_file = tmp_path.joinpath('test.fastq') + test_file = tmp_path.joinpath("test.fastq") with _open_output_file(test_file, False) as output: - output.write('ABCDEF') + output.write("ABCDEF") fastq_validator.validate_fastq_files_in_path([tmp_path], 2) @@ -91,56 +82,54 @@ def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): assert fastq_validator.errors def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path): - for subdirectory in ['a', 'b']: + for subdirectory in ["a", "b"]: subdirectory_path = tmp_path.joinpath(subdirectory) subdirectory_path.mkdir() - with _open_output_file(subdirectory_path.joinpath('test.fastq'), - False) as output: + with _open_output_file(subdirectory_path.joinpath("test.fastq"), False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - assert "test.fastq has been found multiple times" in \ - fastq_validator.errors[0] + assert "test.fastq has been found multiple times" in fastq_validator.errors[0] def test_fastq_validator_io_error(self, fastq_validator, tmp_path): - fake_path = tmp_path.joinpath('does-not-exist.fastq') + fake_path = tmp_path.joinpath("does-not-exist.fastq") fastq_validator.validate_fastq_file(fake_path) assert "Unable to open" in fastq_validator.errors[0] def test_fastq_validator_line_1_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('@SEQ_ID', 0) + result = fastq_validator.validate_fastq_record("@SEQ_ID", 0) assert not result def test_fastq_validator_line_1_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('*SEQ_ID', 0) + result = fastq_validator.validate_fastq_record("*SEQ_ID", 0) assert "does not begin with '@'" in result[0] def test_fastq_validator_line_1_empty(self, fastq_validator): - result = fastq_validator.validate_fastq_record('', 0) + result = 
fastq_validator.validate_fastq_record("", 0) assert "does not begin with '@'" in result[0] def test_fastq_validator_line_2_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('ACTGACTGACTGNNNN', 1) + result = fastq_validator.validate_fastq_record("ACTGACTGACTGNNNN", 1) assert not result def test_fastq_validator_line_2_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('ACTGACT$ACTGNNNN', 1) + result = fastq_validator.validate_fastq_record("ACTGACT$ACTGNNNN", 1) assert "contains invalid character(s): $" in result[0] def test_fastq_validator_line_3_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('+SEQ_ID', 2) + result = fastq_validator.validate_fastq_record("+SEQ_ID", 2) assert not result def test_fastq_validator_line_3_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('!SEQ_ID', 2) + result = fastq_validator.validate_fastq_record("!SEQ_ID", 2) assert "does not begin with '+'" in result[0] @@ -151,42 +140,41 @@ def test_fastq_validator_line_4_good(self, fastq_validator): assert not result def test_fastq_validator_line_4_bad(self, fastq_validator): - fastq_validator.validate_fastq_record('1234567', 1) - result = fastq_validator.validate_fastq_record('ABC !@#', 3) + fastq_validator.validate_fastq_record("1234567", 1) + result = fastq_validator.validate_fastq_record("ABC !@#", 3) assert 'contains invalid quality character(s): " "' in result[0] def test_fastq_validator_line_4_matching_length(self, fastq_validator): - fastq_validator.validate_fastq_record('1234567', 1) - result = fastq_validator.validate_fastq_record('ABCDEFG', 3) + fastq_validator.validate_fastq_record("1234567", 1) + result = fastq_validator.validate_fastq_record("ABCDEFG", 3) assert not result - def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, - tmp_path): - fastq_validator.validate_fastq_record('123456789ABCDEF', 1) - fastq_validator.validate_fastq_record('ABC', 3) + def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, tmp_path): + fastq_validator.validate_fastq_record("123456789ABCDEF", 1) + fastq_validator.validate_fastq_record("ABC", 3) - test_data = '''\ + test_data = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT NACTGACTGA + #FFFFFFFF -''' +""" - new_file = tmp_path.joinpath('test.fastq') + new_file = tmp_path.joinpath("test.fastq") with _open_output_file(new_file, False) as output: output.write(test_data) fastq_validator.validate_fastq_file(new_file) - assert "contains 9 characters which does not match line 2's 10" in \ - fastq_validator.errors[0] + assert ( + "contains 9 characters which does not match line 2's 10" in fastq_validator.errors[0] + ) - def test_fastq_validator_record_counts_good(self, fastq_validator, - tmp_path): + def test_fastq_validator_record_counts_good(self, fastq_validator, tmp_path): for filename in [ - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq', - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq' + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq", + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq", ]: new_file = tmp_path.joinpath(filename) with _open_output_file(new_file, False) as output: @@ -196,15 +184,14 @@ def test_fastq_validator_record_counts_good(self, fastq_validator, assert not fastq_validator.errors - def test_fastq_validator_record_counts_bad(self, fastq_validator, - tmp_path): - with _open_output_file(tmp_path.joinpath( - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq'), - 
False) as output: + def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path): + with _open_output_file( + tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq"), False + ) as output: output.write(_GOOD_RECORDS) - with _open_output_file(tmp_path.joinpath( - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq'), - False) as output: + with _open_output_file( + tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq"), False + ) as output: output.write(_GOOD_RECORDS) output.write(_GOOD_RECORDS) diff --git a/tests/test_gz_validator.py b/tests/test_gz_validator.py index 0c6c8b2..ad748c4 100644 --- a/tests/test_gz_validator.py +++ b/tests/test_gz_validator.py @@ -1,19 +1,24 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/fake_snrnaseq_tree_good.zip', []), - ('test_data/fake_snrnaseq_tree_bad.zip', ['.*text2.txt.gz is not a valid gzipped file']), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/fake_snrnaseq_tree_good.zip", []), + ("test_data/fake_snrnaseq_tree_bad.zip", [".*text2.txt.gz is not a valid gzipped file"]), + ), +) def test_gz_validator(test_data_fname, msg_re_list, tmp_path): from gz_validator import GZValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = GZValidator(tmp_path / test_data_path.stem, 'snRNAseq') + validator = GZValidator(tmp_path / test_data_path.stem, "snRNAseq") errors = validator.collect_errors(coreuse=4)[:] assert len(msg_re_list) == len(errors) for err_str, re_str in zip(errors, msg_re_list): diff --git a/tests/test_ome_tiff_validator.py b/tests/test_ome_tiff_validator.py index 09e2163..89ad198 100644 --- a/tests/test_ome_tiff_validator.py +++ b/tests/test_ome_tiff_validator.py @@ -1,20 +1,27 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/codex_tree_ometiff_bad.zip', - ['.*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*']), - ('test_data/codex_tree_ometiff_good.zip',[]), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ( + "test_data/codex_tree_ometiff_bad.zip", + [".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*"], + ), + ("test_data/codex_tree_ometiff_good.zip", []), + ), +) def test_ome_tiff_validator(test_data_fname, msg_re_list, tmp_path): from ome_tiff_validator import OmeTiffValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = OmeTiffValidator(tmp_path / test_data_path.stem, 'CODEX') + validator = OmeTiffValidator(tmp_path / test_data_path.stem, "CODEX") errors = validator.collect_errors(coreuse=4)[:] assert len(msg_re_list) == len(errors) for err_str, re_str in zip(errors, msg_re_list): diff --git a/tests/test_publication_validator.py b/tests/test_publication_validator.py index ebb768c..50c3a0e 100644 --- a/tests/test_publication_validator.py +++ b/tests/test_publication_validator.py @@ -1,34 +1,46 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/publication_tree_good.zip', []), - ('test_data/publication_tree_good_complex.zip', []), - 
('test_data/publication_tree_bad_complex.zip', - [ - 'expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent', - 'expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent', - 'expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent', - ]), - ('test_data/publication_tree_bad_1.zip', ['vignettes not found or not a directory']), - ('test_data/publication_tree_bad_2.zip', ['Found a subdirectory in a vignette']), - ('test_data/publication_tree_bad_3.zip', ['A vignette has more than one markdown file']), - ('test_data/publication_tree_bad_4.zip', ['figure dict does not provide a name']), - ('test_data/publication_tree_bad_5.zip', ['figure dict does not reference a file']), - ('test_data/publication_tree_bad_6.zip', ['unexpected files in vignette.*']), - ('test_data/publication_tree_bad_7.zip', ['expected data file' - ' data/codeluppi_2018_nature_methods.molecules.h5ad.zarr' - ' is absent']), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/publication_tree_good.zip", []), + ("test_data/publication_tree_good_complex.zip", []), + ( + "test_data/publication_tree_bad_complex.zip", + [ + "expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent", # noqa: E501 + "expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent", # noqa: E501 + "expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent", # noqa: E501 + ], + ), + ("test_data/publication_tree_bad_1.zip", ["vignettes not found or not a directory"]), + ("test_data/publication_tree_bad_2.zip", ["Found a subdirectory in a vignette"]), + ("test_data/publication_tree_bad_3.zip", ["A vignette has more than one markdown file"]), + ("test_data/publication_tree_bad_4.zip", ["figure dict does not provide a name"]), + ("test_data/publication_tree_bad_5.zip", ["figure dict does not reference a file"]), + ("test_data/publication_tree_bad_6.zip", ["unexpected files in vignette.*"]), + ( + "test_data/publication_tree_bad_7.zip", + [ + "expected data file" + " data/codeluppi_2018_nature_methods.molecules.h5ad.zarr" + " is absent" + ], + ), + ), +) def test_publication_validator(test_data_fname, msg_re_list, tmp_path): from publication_validator import PublicationValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = PublicationValidator(tmp_path / test_data_path.stem, 'Publication') + validator = PublicationValidator(tmp_path / test_data_path.stem, "Publication") errors = validator.collect_errors(coreuse=4)[:] print(f"errors: {errors}") matched_err_str_list = [] diff --git a/tests/test_tiff_validator.py b/tests/test_tiff_validator.py index c2d271b..4ebe66c 100644 --- a/tests/test_tiff_validator.py +++ b/tests/test_tiff_validator.py @@ -1,24 +1,32 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/tiff_tree_good.zip', []), - ('test_data/tiff_tree_bad.zip', [ - '.*notatiff.tif is not a valid TIFF file', - '.*notatiff.tiff is not a valid TIFF file', - '.*notatiff.TIFF is not a valid TIFF file', - '.*notatiff.TIF is not a valid TIFF file', - ]), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/tiff_tree_good.zip", []), + ( + "test_data/tiff_tree_bad.zip", + [ + ".*notatiff.tif is not a valid TIFF 
file", + ".*notatiff.tiff is not a valid TIFF file", + ".*notatiff.TIFF is not a valid TIFF file", + ".*notatiff.TIF is not a valid TIFF file", + ], + ), + ), +) def test_tiff_validator(test_data_fname, msg_re_list, tmp_path): from tiff_validator import TiffValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = TiffValidator(tmp_path / test_data_path.stem, 'codex') + validator = TiffValidator(tmp_path / test_data_path.stem, "codex") errors = validator.collect_errors(coreuse=4)[:] print(f"errors: {errors}") matched_err_str_list = []