From c51bf666d742f980a308705404e0310d3c42e8c7 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Fri, 12 Jan 2024 11:51:30 -0500
Subject: [PATCH 01/19] update to accommodate multi-assay--component assays
 passed from IVT in Validator.contains

---
 .../codex_common_errors_validator.py                 | 7 ++++++-
 src/ingest_validation_tests/codex_json_validator.py  | 8 ++++++--
 src/ingest_validation_tests/publication_validator.py | 9 +++++++--
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py
index 01bda98..74cedc3 100644
--- a/src/ingest_validation_tests/codex_common_errors_validator.py
+++ b/src/ingest_validation_tests/codex_common_errors_validator.py
@@ -45,11 +45,16 @@ class CodexCommonErrorsValidator(Validator):
     """
     description = "Test for common problems found in CODEX"
     cost = 1.0
+    required = "codex"
+
     def collect_errors(self, **kwargs) -> List[str]:
         """
         Return the errors found by this validator
         """
-        if self.assay_type != 'CODEX':
+        if (
+            self.required not in self.contains
+            and self.assay_type.lower() != self.required
+        ):
             return []  # We only test CODEX data
         rslt = []
         try:
diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py
index 17b4ee9..e75b9ba 100644
--- a/src/ingest_validation_tests/codex_json_validator.py
+++ b/src/ingest_validation_tests/codex_json_validator.py
@@ -10,10 +10,14 @@ class CodexJsonValidator(Validator):
     description = "Check CODEX JSON against schema"
     cost = 1.0
+    required = "codex"
 
     def collect_errors(self, **kwargs) -> List[str]:
-        if 'codex' not in self.assay_type.lower():
-            return []
+        if (
+            self.required not in self.contains
+            and self.assay_type.lower() != self.required
+        ):
+            return []  # We only test CODEX data
 
         schema_path = Path(__file__).parent / 'codex_schema.json'
         schema = json.loads(schema_path.read_text())
diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py
index 9399f2c..59cdadf 100644
--- a/src/ingest_validation_tests/publication_validator.py
+++ b/src/ingest_validation_tests/publication_validator.py
@@ -19,11 +19,16 @@ class PublicationValidator(Validator):
     cost = 1.0
     base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)'
     url_re = r'[Uu][Rr][Ll]'
+    required = "publications"
+
     def collect_errors(self, **kwargs) -> List[str]:
         """
         Return the errors found by this validator
         """
-        if self.assay_type != 'Publication':
+        if (
+            self.required not in self.contains
+            and self.assay_type.lower() != self.required
+        ):
             return []  # We only test Publication data
         rslt = []
         try:
@@ -86,7 +91,7 @@ def validate_vitessce_config(self, json_path):
         rslt = []
         with open(json_path) as f:
             dct = json.load(f)
-        for key, val in self.url_search_iter(dct):
+        for _, val in self.url_search_iter(dct):
             try:
                 match = re.match(self.base_url_re, val)
                 if match:  # it starts with {{ base_url }}

From f69bf7db12ead2fba59ce200b85c92dbfba4530d Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Tue, 23 Jan 2024 14:22:10 -0500
Subject: [PATCH 02/19] adding linting and formatting tools

---
 .flake8              |  3 +++
 pyproject.toml       |  7 +++++++
 requirements-dev.txt | 10 ++++++++++
 3 files changed, 20 insertions(+)
 create mode 100644 .flake8
 create mode 100644 pyproject.toml
 create mode 100644 requirements-dev.txt

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..ad98539
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 100
+extend-ignore = E203
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..72dd8d9
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,7 @@
+[tool.black]
+line-length = 99
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+src_paths = ["src/ingest-validation-tests"]
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..d5c087a
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,10 @@
+black==23.12.1
+flake8==7.0.0
+git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils
+imagecodecs>=2023.3.16
+isort==5.13.2
+jsonschema==4.4.0
+pandas>=1.2.0
+python-frontmatter>=1.0.0
+tifffile==2020.10.1
+xmlschema>=1.6

From 930fd015a671cdadd51520c8024d39ab8b4fe71d Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 11:35:10 -0500
Subject: [PATCH 03/19] ran black / flake8 / isort, fixed errors, updated
 .gitignore

---
 .gitignore                                    |   1 +
 .../codex_common_errors_validator.py          | 104 +++++++---------
 .../codex_json_validator.py                   |   8 +-
 .../fastq_validator.py                        |   6 +-
 .../fastq_validator_logic.py                  | 107 ++++++++--------
 src/ingest_validation_tests/gz_validator.py   |   4 +-
 .../publication_validator.py                  |  63 ++++++----
 src/ingest_validation_tests/tiff_validator.py |  12 +-
 tests/pytest_runner.py                        |  18 +--
 tests/test_codex_common_errors_validator.py   | 107 ++++++++++------
 tests/test_codex_json_validator.py            |  25 ++--
 tests/test_fastq_validator_logic.py           | 116 ++++++++----------
 tests/test_gz_validator.py                    |  19 +--
 tests/test_ome_tiff_validator.py              |  23 ++--
 tests/test_publication_validator.py           |  56 +++++----
 tests/test_tiff_validator.py                  |  32 +++--
 16 files changed, 381 insertions(+), 320 deletions(-)

diff --git a/.gitignore b/.gitignore
index 916db7b..da9fde3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,6 +105,7 @@ celerybeat.pid
 
 # Environments
 .env
+.envrc
 .venv
 env/
 venv/
diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py
index 391e9cc..5edbec8 100644
--- a/src/ingest_validation_tests/codex_common_errors_validator.py
+++ b/src/ingest_validation_tests/codex_common_errors_validator.py
@@ -20,22 +20,20 @@ def _split_cycle_dir_string(cycle_str):
     """
     Given a cycle-and-region directory name, split out the cyle and region numbers
     """
-    words = cycle_str.split('_')
+    words = cycle_str.split("_")
     assert len(words) >= 2, f'Directory string "{cycle_str}" has unexpected form'
-    assert words[0].startswith('cyc'), (f'directory string "{cycle_str}" does'
-                                        ' not start with "cyc"')
+    assert words[0].startswith("cyc"), (
+        f'directory string "{cycle_str}" does' ' not start with "cyc"'
+    )
     try:
-        cyc_id = int(words[0][len('cyc'):])
+        cyc_id = int(words[0][len("cyc") :])
     except ValueError:
-        raise AssertionError(f'Directory string "{cycle_str}" cycle number is'
-                             ' not an integer')
-    assert words[1].startswith('reg'), (f'Directory string "{cycle_str}" does'
-                                        ' not include "_reg"')
+        raise AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer")
+    assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"'
     try:
-        reg_id = int(words[1][len('reg'):])
+        reg_id = int(words[1][len("reg") :])
     except ValueError:
-        raise AssertionError(f'Directory string "{cycle_str}" region number is'
-                             ' not an integer')
+        raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer")
     return cyc_id, reg_id
@@ -53,7 +51,7 @@ def collect_errors(self, **kwargs) -> List[str]:
         Return the errors found by this validator
         """
         del kwargs
-        if self.assay_type != 'CODEX':
+        if self.assay_type != "CODEX":
             return []  # We only test CODEX data
         rslts = []
         for path in self.paths:
@@ -61,101 +59,95 @@ def collect_errors(self, **kwargs) -> List[str]:
             rslt = []
             try:
                 # is the raw/src_ directory present?
                 prefix = None
-                if (path / 'raw').is_dir():
-                    prefix = path / 'raw'
+                if (path / "raw").is_dir():
+                    prefix = path / "raw"
                 else:
-                    for candidate in path.glob('src_*'):
+                    for candidate in path.glob("src_*"):
                         prefix = candidate
                 if prefix is None:
-                    rslt.append('The raw/src_ subdirectory is missing?')
+                    rslt.append("The raw/src_ subdirectory is missing?")
                     raise QuitNowException()
 
                 # Does dataset.json exist? If so, 'new CODEX' syntax rules
                 # are in effect
                 dataset_json_exists = False
                 any_dataset_json_exists = False
-                for candidate in path.glob('**/dataset.json'):
+                for candidate in path.glob("**/dataset.json"):
                     any_dataset_json_exists = True
-                    if candidate == prefix / 'dataset.json':
+                    if candidate == prefix / "dataset.json":
                         dataset_json_exists = True
                 if dataset_json_exists:
-                    print('FOUND dataset.json; skipping further analysis')
+                    print("FOUND dataset.json; skipping further analysis")
                     raise QuitNowException()
                 elif any_dataset_json_exists:
-                    rslt.append(
-                        'A dataset.json file exists but'
-                        ' is in the wrong place'
-                    )
+                    rslt.append("A dataset.json file exists but" " is in the wrong place")
 
                 # is the segmentation.json file on the right side?
                 found = False
                 right_place = False
-                for filepath in path.glob('*/[Ss]egmentation.json'):
+                for filepath in path.glob("*/[Ss]egmentation.json"):
                     rel_path = filepath.relative_to(path)
                     found = True
-                    if str(rel_path).startswith(('raw', 'src_')):
+                    if str(rel_path).startswith(("raw", "src_")):
                         right_place = True
                 if found:
                     if right_place:
                         pass
                     else:
-                        rslt.append(
-                            'The segmentation.json file is in the wrong subdirectory'
-                        )
+                        rslt.append("The segmentation.json file is in the wrong subdirectory")
                 else:
-                    rslt.append('The segmentation.json file is missing or misplaced')
+                    rslt.append("The segmentation.json file is missing or misplaced")
 
                 # Does the channelnames.txt file exist?
-                channelnames_txt_path = prefix / 'channelnames.txt'
+                channelnames_txt_path = prefix / "channelnames.txt"
                 if not channelnames_txt_path.is_file():
                     # sometimes we see this variant
-                    channelnames_txt_path = prefix / 'channelNames.txt'
+                    channelnames_txt_path = prefix / "channelNames.txt"
                     if not channelnames_txt_path.is_file():
-                        rslt.append('channelnames.txt is missing')
+                        rslt.append("channelnames.txt is missing")
                         raise QuitNowException()
 
                 # Parse channelnames.txt into a dataframe
                 try:
                     cn_df = pd.read_csv(str(channelnames_txt_path), header=None)
                 except Exception:
-                    rslt.append(f'Unexpected error reading {channelnames_txt_path}')
+                    rslt.append(f"Unexpected error reading {channelnames_txt_path}")
                     raise QuitNowException()
                 if len(cn_df.columns) != 1:
-                    rslt.append(f'Unexpected format for {channelnames_txt_path}')
+                    rslt.append(f"Unexpected format for {channelnames_txt_path}")
                    raise QuitNowException()
 
                 # Does the channelnames_report.csv file exist?
-                report_csv_path = prefix / 'channelnames_report.csv'
+                report_csv_path = prefix / "channelnames_report.csv"
                 if report_csv_path.is_file():
                     # Parse channelnames_report.txt into a dataframe
                     try:
-                        rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None)
+                        rpt_df = pd.read_csv(str(report_csv_path), sep=",", header=None)
                     except Exception:
-                        rslt.append(f'Unexpected error reading {report_csv_path}')
+                        rslt.append(f"Unexpected error reading {report_csv_path}")
                         raise QuitNowException()
                     if len(rpt_df) == len(cn_df) + 1:
                         # channelnames_report.csv appears to have a header
                         try:
-                            rpt_df = pd.read_csv(str(report_csv_path), sep=',')
+                            rpt_df = pd.read_csv(str(report_csv_path), sep=",")
                         except Exception:
-                            rslt.append(f'Unexpected error reading {report_csv_path}')
+                            rslt.append(f"Unexpected error reading {report_csv_path}")
                             raise QuitNowException()
                     if len(rpt_df.columns) != 2:
                         rslt.append(
-                            f'Could not parse {report_csv_path}.'
-                            ' Is it a comma-separated table?'
+                            f"Could not parse {report_csv_path}." " Is it a comma-separated table?"
                         )
                         raise QuitNowException()
                     col_0, col_1 = rpt_df.columns
-                    rpt_df = rpt_df.rename(columns={col_0: 'Marker', col_1: 'Result'})
+                    rpt_df = rpt_df.rename(columns={col_0: "Marker", col_1: "Result"})
 
                     # Do they match?
-                    rpt_df['other'] = cn_df[0]
-                    mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']]
+                    rpt_df["other"] = cn_df[0]
+                    mismatches_df = rpt_df[rpt_df["other"] != rpt_df["Marker"]]
                     if len(mismatches_df) != 0:
                         for idx, row in mismatches_df.iterrows():
                             rslt.append(
-                                f'{channelnames_txt_path.name} does not'
-                                ' match channelnames_report.txt'
+                                f"{channelnames_txt_path.name} does not"
+                                " match channelnames_report.txt"
                                 f' on line {idx}: {row["other"]} vs {row["Marker"]}'
                             )
                         raise QuitNowException()
@@ -164,7 +156,7 @@ def collect_errors(self, **kwargs) -> List[str]:
 
                 # Tabulate the cycle and region info
                 all_cycle_dirs = []
-                for glob_str in ['cyc*', 'Cyc*']:
+                for glob_str in ["cyc*", "Cyc*"]:
                     for pth in prefix.glob(glob_str):
                         if pth.is_dir():
                             all_cycle_dirs.append(str(pth.stem).lower())
@@ -189,28 +181,26 @@ def collect_errors(self, **kwargs) -> List[str]:
                 failures = []
                 # First cycle must be 1
                 if cycles[0] != 1:
-                    failures.append('Cycle numbering does not start at 1')
+                    failures.append("Cycle numbering does not start at 1")
                 # First region must be 1
                 if regions[0] != 1:
-                    failures.append('Region numbering does not start at 1')
+                    failures.append("Region numbering does not start at 1")
                 # Cycle range must be contiguous ints
                 if cycles != list(range(cycles[0], cycles[-1] + 1)):
-                    failures.append('Cycle numbers are not contiguous')
+                    failures.append("Cycle numbers are not contiguous")
                 # Region range must be contiguous ints
                 if regions != list(range(regions[0], regions[-1] + 1)):
-                    failures.append('Region numbers are not contiguous')
+                    failures.append("Region numbers are not contiguous")
                 # All cycle, region pairs must be present
                 if len(cycles) * len(regions) != total_entries:
-                    failures.append('Not all cycle/region pairs are present')
+                    failures.append("Not all cycle/region pairs are present")
                 # Total number of channels / total number of cycles must be integer,
                 # excluding any HandE channels
                 total_channel_count = len(cn_df)
-                h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')])
-                channels_per_cycle = (
-                    total_channel_count - h_and_e_channel_count
-                ) / len(cycles)
+                h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")])
+                channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles)
                 if channels_per_cycle != int(channels_per_cycle):
-                    failures.append('The number of channels per cycle is not constant')
+                    failures.append("The number of channels per cycle is not constant")
                 if failures:
                     rslt += failures
                     raise QuitNowException()
diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py
index 1e9e9ba..41a6955 100644
--- a/src/ingest_validation_tests/codex_json_validator.py
+++ b/src/ingest_validation_tests/codex_json_validator.py
@@ -12,19 +12,19 @@ class CodexJsonValidator(Validator):
 
     def collect_errors(self, **kwargs) -> List[str]:
         del kwargs
-        if 'codex' not in self.assay_type.lower():
+        if "codex" not in self.assay_type.lower():
             return []
 
-        schema_path = Path(__file__).parent / 'codex_schema.json'
+        schema_path = Path(__file__).parent / "codex_schema.json"
         schema = json.loads(schema_path.read_text())
 
         rslt = []
-        for glob_expr in ['**/dataset.json']:
+        for glob_expr in ["**/dataset.json"]:
             for path in self.paths:
                 for file in path.glob(glob_expr):
                     instance = json.loads(file.read_text())
                     try:
                         validate(instance=instance, schema=schema)
                     except Exception as e:
-                        rslt.append(f'{file}: {e}')
+                        rslt.append(f"{file}: {e}")
 
         return rslt
diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py
index 0a30e42..d31335f 100644
--- a/src/ingest_validation_tests/fastq_validator.py
+++ b/src/ingest_validation_tests/fastq_validator.py
@@ -1,8 +1,8 @@
 from os import cpu_count
 from typing import List
 
-from ingest_validation_tools.plugin_validator import Validator
 from fastq_validator_logic import FASTQValidatorLogic, _log
+from ingest_validation_tools.plugin_validator import Validator
 
 
 class FASTQValidator(Validator):
@@ -10,8 +10,8 @@ class FASTQValidator(Validator):
     cost = 15.0
 
     def collect_errors(self, **kwargs) -> List[str]:
-        threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1
-        _log(f'Threading at {threads}')
+        threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1
+        _log(f"Threading at {threads}")
         validator = FASTQValidatorLogic(verbose=True)
         validator.validate_fastq_files_in_path(self.paths, threads)
         return validator.errors
diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py
index 85eeb44..8095b74 100644
--- a/src/ingest_validation_tests/fastq_validator_logic.py
+++ b/src/ingest_validation_tests/fastq_validator_logic.py
@@ -15,10 +15,7 @@ def is_valid_filename(filename: str) -> bool:
 
 
 def _open_fastq_file(file: Path) -> TextIO:
-    return (
-        gzip.open(file, 'rt') if file.name.endswith('.gz')
-        else file.open()
-    )
+    return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open()
 
 
 def _log(message: str) -> str:
@@ -54,13 +51,13 @@ class FASTQValidatorLogic:
     contain the same number of symbols as letters in the sequence.
""" - _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' + _FASTQ_LINE_2_VALID_CHARS = "ACGNT" def __init__(self, verbose=False): self.errors: List[str] = [] self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() - self._filename = '' + self._filename = "" self._line_number = 0 self._verbose = verbose @@ -80,7 +77,7 @@ def _format_error(self, error: str) -> str: return message def _validate_fastq_line_1(self, line: str) -> List[str]: - if not line or line[0] != '@': + if not line or line[0] != "@": return ["Line does not begin with '@'."] return [] @@ -89,45 +86,47 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = ''.join( - c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) + invalid_chars = "".join(c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] return [] def _validate_fastq_line_3(self, line: str) -> List[str]: - if not line or line[0] != '+': + if not line or line[0] != "+": return ["Line does not begin with '+'."] return [] def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] - invalid_chars = ''.join(c for c in line if not 33 <= ord(c) <= 126) + invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append("Line contains invalid quality character(s): " - f'"{invalid_chars}"') + errors.append("Line contains invalid quality character(s): " f'"{invalid_chars}"') if len(line) != self._line_2_length: - errors.append(f"Line contains {len(line)} characters which " - f"does not match line {self._last_line_2_number}'s " - f"{self._line_2_length} characters.") + errors.append( + f"Line contains {len(line)} characters which " + f"does not match line {self._last_line_2_number}'s " + f"{self._line_2_length} characters." + ) return errors - _VALIDATE_FASTQ_LINE_METHODS = {1: _validate_fastq_line_1, - 2: _validate_fastq_line_2, - 3: _validate_fastq_line_3, - 4: _validate_fastq_line_4} + _VALIDATE_FASTQ_LINE_METHODS = { + 1: _validate_fastq_line_1, + 2: _validate_fastq_line_2, + 3: _validate_fastq_line_3, + 4: _validate_fastq_line_4, + } def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ - self._VALIDATE_FASTQ_LINE_METHODS[line_index] + validator_method: Callable[ + [FASTQValidatorLogic, str], List[str] + ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index] - assert validator_method, \ - f"No validator method defined for record index {line_index}" + assert validator_method, f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -139,8 +138,8 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: for line_count, line in enumerate(fastq_data): self._line_number = line_count + 1 self.errors.extend( - self._format_error(error) for error in - self.validate_fastq_record(line.rstrip(), line_count) + self._format_error(error) + for error in self.validate_fastq_record(line.rstrip(), line_count) ) return line_count + 1 @@ -151,9 +150,9 @@ def validate_fastq_file(self, fastq_file: Path) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. 
-            self.errors.append(_log(
-                "Filename does not have proper format "
-                "and will not be processed"))
+            self.errors.append(
+                _log("Filename does not have proper format " "and will not be processed")
+            )
             return
 
         self._line_number = 0
@@ -163,12 +162,10 @@ def validate_fastq_file(self, fastq_file: Path) -> None:
             with _open_fastq_file(fastq_file) as fastq_data:
                 records_read = self.validate_fastq_stream(fastq_data)
         except gzip.BadGzipFile:
-            self.errors.append(
-                self._format_error(f"Bad gzip file: {fastq_file}."))
+            self.errors.append(self._format_error(f"Bad gzip file: {fastq_file}."))
             return
         except IOError:
-            self.errors.append(
-                self._format_error(f"Unable to open FASTQ data file {fastq_file}."))
+            self.errors.append(self._format_error(f"Unable to open FASTQ data file {fastq_file}."))
             return
 
         self._file_record_counts[str(fastq_file)] = records_read
@@ -187,7 +184,9 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None:
                 for file in files:
                     file_list.append(Path(path / rel_path / file))
         try:
-            logging.info(f"Passing file list for paths {paths} to engine. File list: {file_list}.")
+            logging.info(
+                f"Passing file list for paths {paths} to engine. File list: {file_list}."
+            )
             pool = Pool(threads)
             engine = Engine(self)
             data_output = pool.imap_unordered(engine, file_list)
@@ -214,14 +213,17 @@ def _find_duplicates(self, dirs_and_files):
                     files_per_path[filepath.name].append(data_path / sub_path)
         for filename, filepaths in files_per_path.items():
             if len(filepaths) > 1:
-                self.errors.append(_log(
-                    f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}."))  # noqa: E501
+                self.errors.append(
+                    _log(
+                        f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}."  # noqa: E501
+                    )
+                )
 
     def _find_shared_prefixes(self, lock):
         # This pattern seeks out the string that includes the lane number (since
         # that is expected to be present to help anchor the prefix) that comes
         # before any of _I1, _I2, _R1, or _R2.
-        fastq_file_prefix_regex = re.compile(r'(.+_L\d+.*)_[IR][12][._]')
+        fastq_file_prefix_regex = re.compile(r"(.+_L\d+.*)_[IR][12][._]")
         for fastq_file, records_read in self._file_record_counts.items():
             match = fastq_file_prefix_regex.match(Path(fastq_file).name)
             with lock:
@@ -233,26 +235,31 @@ def _find_shared_prefixes(self, lock):
                         # Find a file we've validated already that matches this
                         # prefix.
                         extant_files = [
-                            str(Path(filepath).name) for filepath, record_count
-                            in self._file_record_counts.items()
-                            if record_count == extant_count and Path(filepath).name.startswith(filename_prefix)
+                            str(Path(filepath).name)
+                            for filepath, record_count in self._file_record_counts.items()
+                            if record_count == extant_count
+                            and Path(filepath).name.startswith(filename_prefix)
                         ]
                         # Based on how the dictionaries are created, there should
                         # always be at least one matching filename.
                         assert extant_files
-                        self.errors.append(_log(
-                            f"{Path(fastq_file).name} ({records_read} lines) "
-                            f"does not match length of {extant_files[0]} "
-                            f"({extant_count} lines)."))
+                        self.errors.append(
+                            _log(
+                                f"{Path(fastq_file).name} ({records_read} lines) "
+                                f"does not match length of {extant_files[0]} "
+                                f"({extant_count} lines)."
+                            )
+                        )
                     else:
                         self._file_prefix_counts[filename_prefix] = records_read
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Validate FASTQ files.')
-    parser.add_argument('filepaths', type=Path, nargs='+',
-                        help="Files to validate for FASTQ syntax")
+    parser = argparse.ArgumentParser(description="Validate FASTQ files.")
+    parser.add_argument(
+        "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax"
+    )
     args = parser.parse_args()
 
     if isinstance(args.filepaths, List):
@@ -262,13 +269,11 @@ def main():
     elif isinstance(args.filepaths, str):
         filepaths = [Path(args.filepaths)]
     else:
-        raise Exception(
-            f"Validator init received base_paths arg as type {type(args.filepaths)}"
-        )
+        raise Exception(f"Validator init received base_paths arg as type {type(args.filepaths)}")
 
     validator = FASTQValidatorLogic(True)
     validator.validate_fastq_files_in_path(filepaths, Lock())
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py
index 58ad671..9bb0162 100644
--- a/src/ingest_validation_tests/gz_validator.py
+++ b/src/ingest_validation_tests/gz_validator.py
@@ -1,9 +1,9 @@
+import gzip
+import re
 from multiprocessing import Pool
 from os import cpu_count
-import re
 from typing import List
 
-import gzip
 from ingest_validation_tools.plugin_validator import Validator
diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py
index 4c7ccc1..528a2b5 100644
--- a/src/ingest_validation_tests/publication_validator.py
+++ b/src/ingest_validation_tests/publication_validator.py
@@ -2,10 +2,11 @@
 Test for some common errors in the directory and file structure of publications.
 """
-from typing import List
-import re
 import json
+import re
 from pathlib import Path
+from typing import List
+
 import frontmatter
 from ingest_validation_tools.plugin_validator import Validator
 
@@ -15,50 +16,57 @@ class PublicationValidator(Validator):
     Test for some common errors in the directory and file structure of publications.
     """
+
     description = "Test for common problems found in publications"
     cost = 1.0
-    base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)'
-    url_re = r'[Uu][Rr][Ll]'
+    base_url_re = r"(\s*\{\{\s*base_url\s*\}\})/(.*)"
+    url_re = r"[Uu][Rr][Ll]"
 
     def collect_errors(self, **kwargs) -> List[str]:
         """
         Return the errors found by this validator
         """
         del kwargs
-        if self.assay_type != 'Publication':
+        if self.assay_type != "Publication":
             return []  # We only test Publication data
         rslt = []
         for path in self.paths:
             try:
-                vignette_path = path / 'vignettes'
-                assert vignette_path.is_dir(), 'vignettes not found or not a directory'
-                for this_vignette_path in vignette_path.glob('*'):
-                    assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}"
-                                                         " in vignettes")
-                    this_vignette_all_paths = set(this_vignette_path.glob('*'))
+                vignette_path = path / "vignettes"
+                assert vignette_path.is_dir(), "vignettes not found or not a directory"
+                for this_vignette_path in vignette_path.glob("*"):
+                    assert this_vignette_path.is_dir(), (
+                        f"Found the non-dir {this_vignette_path}" " in vignettes"
+                    )
+                    this_vignette_all_paths = set(this_vignette_path.glob("*"))
                     if not all(pth.is_file() for pth in this_vignette_all_paths):
-                        raise AssertionError('Found a subdirectory in a vignette')
+                        raise AssertionError("Found a subdirectory in a vignette")
                     md_found = False
                     vig_figures = []
-                    for md_path in this_vignette_path.glob('*.md'):
+                    for md_path in this_vignette_path.glob("*.md"):
                         if md_found:
-                            raise AssertionError('A vignette has more than one markdown file')
+                            raise AssertionError("A vignette has more than one markdown file")
                         else:
                             md_found = True
                         vig_fm = frontmatter.loads(md_path.read_text())
-                        for key in ['name', 'figures']:
-                            assert key in vig_fm.metadata, ('vignette markdown is incorrectly'
-                                                            f' formatted or has no {key}')
-                        for fig_dict in vig_fm.metadata['figures']:
-                            assert 'file' in fig_dict, 'figure dict does not reference a file'
-                            assert 'name' in fig_dict, 'figure dict does not provide a name'
-                            vig_figures.append(fig_dict['file'])
+                        for key in ["name", "figures"]:
+                            assert key in vig_fm.metadata, (
+                                "vignette markdown is incorrectly" f" formatted or has no {key}"
+                            )
+                        for fig_dict in vig_fm.metadata["figures"]:
+                            assert "file" in fig_dict, "figure dict does not reference a file"
+                            assert "name" in fig_dict, "figure dict does not provide a name"
+                            vig_figures.append(fig_dict["file"])
                         this_vignette_all_paths.remove(md_path)
                     for fname in vig_figures:
-                        rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path))
+                        rslt.extend(
+                            self.validate_vitessce_config(this_vignette_path / fname, path)
+                        )
                         this_vignette_all_paths.remove(this_vignette_path / fname)
-                    assert not this_vignette_all_paths, ('unexpected files in vignette:'
-                                                         f' {list(str(elt) for elt in this_vignette_all_paths)}')
+                    assert not this_vignette_all_paths, (
+                        "unexpected files in vignette:"
+                        f" {list(str(elt) for elt in this_vignette_all_paths)}"
+                    )
 
             except AssertionError as excp:
                 rslt.append(str(excp))
@@ -94,9 +102,10 @@ def validate_vitessce_config(self, json_path, path):
                     match = re.match(self.base_url_re, val)
                     if match:  # it starts with {{ base_url }}
                         extra_url = match.group(2)
-                        data_path = path / 'data' / extra_url
-                        assert data_path.exists(), ("expected data file"
-                                                    f" {Path('data') / extra_url} is absent")
+                        data_path = path / "data" / extra_url
+                        assert data_path.exists(), (
+                            "expected data file" f" {Path('data') / extra_url} is absent"
+                        )
             except AssertionError as excp:
                 rslt.append(str(excp))
 
diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py
index db840c1..23b568a 100644
--- a/src/ingest_validation_tests/tiff_validator.py
+++ b/src/ingest_validation_tests/tiff_validator.py
@@ -37,13 +37,15 @@ class TiffValidator(Validator):
     cost = 1.0
 
     def collect_errors(self, **kwargs) -> List[str]:
-        threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1
+        threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1
         pool = Pool(threads)
         filenames_to_test = []
-        for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']:
+        for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]:
             for path in self.paths:
                 for file in path.glob(glob_expr):
                     filenames_to_test.append(file)
-        return list(rslt for rslt in pool.imap_unordered(_check_tiff_file,
-                                                         filenames_to_test)
-                    if rslt is not None)
+        return list(
+            rslt
+            for rslt in pool.imap_unordered(_check_tiff_file, filenames_to_test)
+            if rslt is not None
+        )
diff --git a/tests/pytest_runner.py b/tests/pytest_runner.py
index 0743dad..ac7e5a3 100644
--- a/tests/pytest_runner.py
+++ b/tests/pytest_runner.py
@@ -1,12 +1,15 @@
 import sys
 from pathlib import Path
+
 import pytest
 
-class add_path():
+
+class add_path:
     """
     Add an element to sys.path using a context.
     Thanks to Eugene Yarmash https://stackoverflow.com/a/39855753
     """
+
     def __init__(self, path):
         self.path = path
 
@@ -22,16 +25,13 @@ def __exit__(self, exc_type, exc_value, traceback):
 
 def main():
     if len(sys.argv) != 2:
-        sys.exit(f'usage: {sys.argv[0]} path-to-ingest-validation-tools')
-    tools_path = Path(sys.argv[1]).resolve() / 'src'
-    plugins_path = (Path(__file__).resolve().parent.parent
-                    / 'src'
-                    / 'ingest_validation_tests'
-                    )
+        sys.exit(f"usage: {sys.argv[0]} path-to-ingest-validation-tools")
+    tools_path = Path(sys.argv[1]).resolve() / "src"
+    plugins_path = Path(__file__).resolve().parent.parent / "src" / "ingest_validation_tests"
     with add_path(str(tools_path)):
         with add_path(str(plugins_path)):
-            sys.exit(pytest.main(['-vv']))
+            sys.exit(pytest.main(["-vv"]))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tests/test_codex_common_errors_validator.py b/tests/test_codex_common_errors_validator.py
index 23cd5e7..68d1d82 100644
--- a/tests/test_codex_common_errors_validator.py
+++ b/tests/test_codex_common_errors_validator.py
@@ -1,53 +1,84 @@
-from pathlib import Path
 import zipfile
+from pathlib import Path
 
 import pytest
 
-@pytest.mark.parametrize(('test_data_fname', 'msg_starts_list'), (
-    ('test_data/fake_codex_tree_0.zip', ['Unexpected error reading']),
-    ('test_data/fake_codex_tree_1.zip', ['The segmentation.json file is in',
-                                         'Unexpected error reading']),
-    ('test_data/fake_codex_tree_2.zip', ['The raw/src_ subdirectory is missing?']),
-    ('test_data/fake_codex_tree_3.zip', ['channelnames.txt is missing']),
-    ('test_data/fake_codex_tree_4.zip', ['Unexpected error reading']),
-    ('test_data/fake_codex_tree_5.zip', ['channelnames.txt does not match channelnames_report.txt'
-                                         ' on line 1: HLADR vs HLA-DR',
-                                         'channelnames.txt does not match channelnames_report.txt'
-                                         ' on line 6: Empty vs Blank']),
-    ('test_data/fake_codex_tree_6.zip', ['Could not parse ']),
-    ('test_data/fake_codex_tree_7.zip', []),
-    ('test_data/fake_codex_tree_8.zip', ['Region numbers are not contiguous']),
-    ('test_data/fake_codex_tree_9.zip', ['Cycle numbers are not contiguous',
-                                         'The number of channels per cycle is not constant']),
-    ('test_data/fake_codex_tree_10.zip', ['Directory string "cyc0a3_reg001_211119_040351"'
-                                          ' cycle number is not an integer']),
-    ('test_data/fake_codex_tree_11.zip', ['Directory string "cyc003_reg0a1_211119_040351"'
-                                          ' region number is not an integer']),
-    ('test_data/fake_codex_tree_12.zip', ['Directory string "cyc002_rig001_211119_040351"'
-                                          ' does not include "_reg"']),
-    ('test_data/fake_codex_tree_13.zip', ['Cycle numbering does not start at 1']),
-    ('test_data/fake_codex_tree_14.zip', ['Region numbering does not start at 1']),
-    ('test_data/fake_codex_tree_15.zip', ['Not all cycle/region pairs are present',
-                                          'The number of channels per cycle is not constant']),
-    ('test_data/fake_codex_tree_16.zip', []),
-    ('test_data/fake_codex_tree_17.zip', ['A dataset.json file exists but is in the wrong place',
-                                          'Region numbering does not start at 1']),
-    ('test_data/fake_codex_tree_18.zip', ['The number of channels per cycle is not constant']),
-    ('test_data/fake_codex_tree_19.zip', []),
-    ))
+
+@pytest.mark.parametrize(
+    ("test_data_fname", "msg_starts_list"),
+    (
+        ("test_data/fake_codex_tree_0.zip", ["Unexpected error reading"]),
+        (
+            "test_data/fake_codex_tree_1.zip",
+            ["The segmentation.json file is in", "Unexpected error reading"],
+        ),
+        ("test_data/fake_codex_tree_2.zip", ["The raw/src_ subdirectory is missing?"]),
+        ("test_data/fake_codex_tree_3.zip", ["channelnames.txt is missing"]),
+        ("test_data/fake_codex_tree_4.zip", ["Unexpected error reading"]),
+        (
+            "test_data/fake_codex_tree_5.zip",
+            [
+                "channelnames.txt does not match channelnames_report.txt"
+                " on line 1: HLADR vs HLA-DR",
+                "channelnames.txt does not match channelnames_report.txt"
+                " on line 6: Empty vs Blank",
+            ],
+        ),
+        ("test_data/fake_codex_tree_6.zip", ["Could not parse "]),
+        ("test_data/fake_codex_tree_7.zip", []),
+        ("test_data/fake_codex_tree_8.zip", ["Region numbers are not contiguous"]),
+        (
+            "test_data/fake_codex_tree_9.zip",
+            [
+                "Cycle numbers are not contiguous",
+                "The number of channels per cycle is not constant",
+            ],
+        ),
+        (
+            "test_data/fake_codex_tree_10.zip",
+            ['Directory string "cyc0a3_reg001_211119_040351"' " cycle number is not an integer"],
+        ),
+        (
+            "test_data/fake_codex_tree_11.zip",
+            ['Directory string "cyc003_reg0a1_211119_040351"' " region number is not an integer"],
+        ),
+        (
+            "test_data/fake_codex_tree_12.zip",
+            ['Directory string "cyc002_rig001_211119_040351"' ' does not include "_reg"'],
+        ),
+        ("test_data/fake_codex_tree_13.zip", ["Cycle numbering does not start at 1"]),
+        ("test_data/fake_codex_tree_14.zip", ["Region numbering does not start at 1"]),
+        (
+            "test_data/fake_codex_tree_15.zip",
+            [
+                "Not all cycle/region pairs are present",
+                "The number of channels per cycle is not constant",
+            ],
+        ),
+        ("test_data/fake_codex_tree_16.zip", []),
+        (
+            "test_data/fake_codex_tree_17.zip",
+            [
+                "A dataset.json file exists but is in the wrong place",
+                "Region numbering does not start at 1",
+            ],
+        ),
+        ("test_data/fake_codex_tree_18.zip", ["The number of channels per cycle is not constant"]),
+        ("test_data/fake_codex_tree_19.zip", []),
+    ),
+)
 def test_codex_common_errors_validator(test_data_fname, msg_starts_list, tmp_path):
     from codex_common_errors_validator import CodexCommonErrorsValidator
+
     test_data_path = Path(test_data_fname)
     zfile = zipfile.ZipFile(test_data_path)
     zfile.extractall(tmp_path)
-    validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)],
-                                           'CODEX'
-                                           )
+    validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], "CODEX")
     errors = validator.collect_errors()[:]
-    print(f'ERRORS FOLLOW FOR {test_data_fname}')
+    print(f"ERRORS FOLLOW FOR {test_data_fname}")
     for err in errors:
         print(err)
-    print('ERRORS ABOVE')
+    print("ERRORS ABOVE")
     assert len(msg_starts_list) == len(errors)
     for err_str, expected_str in zip(errors, msg_starts_list):
         assert err_str.startswith(expected_str)
diff --git a/tests/test_codex_json_validator.py b/tests/test_codex_json_validator.py
index a2998ff..4539270 100644
--- a/tests/test_codex_json_validator.py
+++ b/tests/test_codex_json_validator.py
@@ -1,22 +1,27 @@
-from pathlib import Path
-import zipfile
 import re
+import zipfile
+from pathlib import Path
 
 import pytest
 
-@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), (
-    ('test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip',
-     [".*is not of type 'object'.*"]),
-    ('test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip', []),
-    ))
+
+@pytest.mark.parametrize(
+    ("test_data_fname", "msg_re_list"),
+    (
+        (
+            "test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip",
+            [".*is not of type 'object'.*"],
+        ),
+        ("test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip", []),
+    ),
+)
 def test_codex_json_validator(test_data_fname, msg_re_list, tmp_path):
     from codex_json_validator import CodexJsonValidator
+
     test_data_path = Path(test_data_fname)
     zfile = zipfile.ZipFile(test_data_path)
     zfile.extractall(tmp_path)
-    validator = CodexJsonValidator(tmp_path / test_data_path.stem,
-                                   'CODEX'
-                                   )
+    validator = CodexJsonValidator(tmp_path / test_data_path.stem, "CODEX")
     errors = validator.collect_errors()[:]
     assert len(msg_re_list) == len(errors)
     for err_str, expected_re in zip(errors, msg_re_list):
diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py
index a244cd8..c554d6f 100644
--- a/tests/test_fastq_validator_logic.py
+++ b/tests/test_fastq_validator_logic.py
@@ -1,36 +1,30 @@
-from multiprocessing import Lock
+import gzip
 from pathlib import Path
 from typing import TextIO
-import gzip
 
 import pytest
+from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic
 
-from src.ingest_validation_tests.fastq_validator_logic import \
-    FASTQValidatorLogic
-
-_GOOD_RECORDS = '''\
+_GOOD_RECORDS = """\
 @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT
 NACTGACTGA
 +
 #FFFFFFFFF
-'''
+"""
 
 _GOOD_QUALITY_RECORD = (
-    '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-    r'[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'
+    "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    r"[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
 )
 
 _GOOD_SEQUENCE_FOR_QUALITY = (
-    'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
-    'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
+    "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+    "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
 )
 
 
 def _open_output_file(filename: Path, use_gzip: bool) -> TextIO:
-    return (
-        gzip.open(filename, 'wt') if use_gzip
-        else open(filename, 'wt')
-    )
+    return gzip.open(filename, "wt") if use_gzip else open(filename, "wt")
 
 
 class TestFASTQValidatorLogic:
@@ -46,33 +40,29 @@ def test_fastq_validator_no_files(self, fastq_validator, tmp_path):
 
     def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path):
         # Note that the filename ends in .gz, although it will not contain
         # compressed data.
-        test_file = tmp_path.joinpath('test.fastq.gz')
+        test_file = tmp_path.joinpath("test.fastq.gz")
         with _open_output_file(test_file, False) as output:
             output.write(_GOOD_RECORDS)
 
         fastq_validator.validate_fastq_file(test_file)
 
         assert "Bad gzip file" in fastq_validator.errors[0]
 
-    def test_fastq_validator_unrecognized_file(self, fastq_validator,
-                                               tmp_path):
-        test_file = tmp_path.joinpath('test.txt')
+    def test_fastq_validator_unrecognized_file(self, fastq_validator, tmp_path):
+        test_file = tmp_path.joinpath("test.txt")
         with _open_output_file(test_file, False) as output:
             output.write(_GOOD_RECORDS)
 
         fastq_validator.validate_fastq_file(test_file)
 
-        assert "Filename does not have proper format" in \
-            fastq_validator.errors[0]
+        assert "Filename does not have proper format" in fastq_validator.errors[0]
 
-    def test_fastq_validator_empty_directory(self, fastq_validator,
-                                             tmp_path):
+    def test_fastq_validator_empty_directory(self, fastq_validator, tmp_path):
         fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
         # No files in path means no errors
         assert fastq_validator.errors == []
 
     @pytest.mark.parametrize("use_gzip", [False, True])
     def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip):
-        test_file = tmp_path.joinpath('test.fastq.gz' if use_gzip
-                                      else 'test.fastq')
+        test_file = tmp_path.joinpath("test.fastq.gz" if use_gzip else "test.fastq")
         with _open_output_file(test_file, use_gzip) as output:
             output.write(_GOOD_RECORDS)
 
@@ -80,9 +70,9 @@ def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip):
         assert not fastq_validator.errors
 
     def test_fastq_validator_bad_file(self, fastq_validator, tmp_path):
-        test_file = tmp_path.joinpath('test.fastq')
+        test_file = tmp_path.joinpath("test.fastq")
         with _open_output_file(test_file, False) as output:
-            output.write('ABCDEF')
+            output.write("ABCDEF")
 
         fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
 
@@ -91,56 +81,54 @@ def test_fastq_validator_bad_file(self, fastq_validator, tmp_path):
         assert fastq_validator.errors
 
     def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path):
-        for subdirectory in ['a', 'b']:
+        for subdirectory in ["a", "b"]:
             subdirectory_path = tmp_path.joinpath(subdirectory)
             subdirectory_path.mkdir()
-            with _open_output_file(subdirectory_path.joinpath('test.fastq'),
-                                   False) as output:
+            with _open_output_file(subdirectory_path.joinpath("test.fastq"), False) as output:
                 output.write(_GOOD_RECORDS)
 
         fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
 
-        assert "test.fastq has been found multiple times" in \
-            fastq_validator.errors[0]
+        assert "test.fastq has been found multiple times" in fastq_validator.errors[0]
 
     def test_fastq_validator_io_error(self, fastq_validator, tmp_path):
-        fake_path = tmp_path.joinpath('does-not-exist.fastq')
+        fake_path = tmp_path.joinpath("does-not-exist.fastq")
 
         fastq_validator.validate_fastq_file(fake_path)
 
         assert "Unable to open" in fastq_validator.errors[0]
 
     def test_fastq_validator_line_1_good(self, fastq_validator):
-        result = fastq_validator.validate_fastq_record('@SEQ_ID', 0)
+        result = fastq_validator.validate_fastq_record("@SEQ_ID", 0)
 
         assert not result
 
     def test_fastq_validator_line_1_bad(self, fastq_validator):
-        result = fastq_validator.validate_fastq_record('*SEQ_ID', 0)
+        result = fastq_validator.validate_fastq_record("*SEQ_ID", 0)
 
         assert "does not begin with '@'" in result[0]
 
     def test_fastq_validator_line_1_empty(self, fastq_validator):
-        result = fastq_validator.validate_fastq_record('', 0)
+        result = fastq_validator.validate_fastq_record("", 0)
 
         assert "does not begin with '@'" in result[0]
 
     def test_fastq_validator_line_2_good(self, fastq_validator):
-        result = fastq_validator.validate_fastq_record('ACTGACTGACTGNNNN', 1)
+        result = fastq_validator.validate_fastq_record("ACTGACTGACTGNNNN", 1)
 
         assert not result
 
     def test_fastq_validator_line_2_bad(self, fastq_validator):
-        result = fastq_validator.validate_fastq_record('ACTGACT$ACTGNNNN', 1)
+        result = fastq_validator.validate_fastq_record("ACTGACT$ACTGNNNN", 1)
 
         assert "contains invalid character(s): $" in result[0]
 
     def test_fastq_validator_line_3_good(self, fastq_validator):
-        result = fastq_validator.validate_fastq_record('+SEQ_ID', 2)
+        result = fastq_validator.validate_fastq_record("+SEQ_ID", 2)
 
         assert not result
 
     def test_fastq_validator_line_3_bad(self, fastq_validator):
-        result = fastq_validator.validate_fastq_record('!SEQ_ID', 2)
+        result = fastq_validator.validate_fastq_record("!SEQ_ID", 2)
 
         assert "does not begin with '+'" in result[0]
 
@@ -151,42 +139,41 @@ def test_fastq_validator_line_4_good(self, fastq_validator):
         assert not result
 
     def test_fastq_validator_line_4_bad(self, fastq_validator):
-        fastq_validator.validate_fastq_record('1234567', 1)
-        result = fastq_validator.validate_fastq_record('ABC !@#', 3)
+        fastq_validator.validate_fastq_record("1234567", 1)
+        result = fastq_validator.validate_fastq_record("ABC !@#", 3)
 
         assert 'contains invalid quality character(s): " "' in result[0]
 
     def test_fastq_validator_line_4_matching_length(self, fastq_validator):
-        fastq_validator.validate_fastq_record('1234567', 1)
-        result = fastq_validator.validate_fastq_record('ABCDEFG', 3)
+        fastq_validator.validate_fastq_record("1234567", 1)
+        result = fastq_validator.validate_fastq_record("ABCDEFG", 3)
 
         assert not result
 
-    def test_fastq_validator_line_4_mismatched_length(self, fastq_validator,
-                                                      tmp_path):
-        fastq_validator.validate_fastq_record('123456789ABCDEF', 1)
-        fastq_validator.validate_fastq_record('ABC', 3)
+    def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, tmp_path):
+        fastq_validator.validate_fastq_record("123456789ABCDEF", 1)
+        fastq_validator.validate_fastq_record("ABC", 3)
 
-        test_data = '''\
+        test_data = """\
 @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT
 NACTGACTGA
 +
 #FFFFFFFF
-'''
+"""
 
-        new_file = tmp_path.joinpath('test.fastq')
+        new_file = tmp_path.joinpath("test.fastq")
         with _open_output_file(new_file, False) as output:
             output.write(test_data)
 
         fastq_validator.validate_fastq_file(new_file)
 
-        assert "contains 9 characters which does not match line 2's 10" in \
-            fastq_validator.errors[0]
+        assert (
+            "contains 9 characters which does not match line 2's 10" in fastq_validator.errors[0]
+        )
 
-    def test_fastq_validator_record_counts_good(self, fastq_validator,
-                                                tmp_path):
+    def test_fastq_validator_record_counts_good(self, fastq_validator, tmp_path):
         for filename in [
-            'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq',
-            'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq'
+            "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq",
+            "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq",
         ]:
             new_file = tmp_path.joinpath(filename)
             with _open_output_file(new_file, False) as output:
@@ -196,15 +183,14 @@ def test_fastq_validator_record_counts_good(self, fastq_validator,
 
         assert not fastq_validator.errors
 
-    def test_fastq_validator_record_counts_bad(self, fastq_validator,
-                                               tmp_path):
-        with _open_output_file(tmp_path.joinpath(
-                'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq'),
-                False) as output:
+    def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path):
+        with _open_output_file(
+            tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq"), False
+        ) as output:
             output.write(_GOOD_RECORDS)
-        with _open_output_file(tmp_path.joinpath(
-                'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq'),
-                False) as output:
+        with _open_output_file(
+            tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq"), False
+        ) as output:
             output.write(_GOOD_RECORDS)
             output.write(_GOOD_RECORDS)
diff --git a/tests/test_gz_validator.py b/tests/test_gz_validator.py
index 0c6c8b2..ad748c4 100644
--- a/tests/test_gz_validator.py
+++ b/tests/test_gz_validator.py
@@ -1,19 +1,24 @@
-from pathlib import Path
-import zipfile
 import re
+import zipfile
+from pathlib import Path
 
 import pytest
 
-@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), (
-    ('test_data/fake_snrnaseq_tree_good.zip', []),
-    ('test_data/fake_snrnaseq_tree_bad.zip', ['.*text2.txt.gz is not a valid gzipped file']),
-    ))
+
+@pytest.mark.parametrize(
+    ("test_data_fname", "msg_re_list"),
+    (
+        ("test_data/fake_snrnaseq_tree_good.zip", []),
+        ("test_data/fake_snrnaseq_tree_bad.zip", [".*text2.txt.gz is not a valid gzipped file"]),
+    ),
+)
 def test_gz_validator(test_data_fname, msg_re_list, tmp_path):
     from gz_validator import GZValidator
+
     test_data_path = Path(test_data_fname)
     zfile = zipfile.ZipFile(test_data_path)
     zfile.extractall(tmp_path)
-    validator = GZValidator(tmp_path / test_data_path.stem, 'snRNAseq')
+    validator = GZValidator(tmp_path / test_data_path.stem, "snRNAseq")
     errors = validator.collect_errors(coreuse=4)[:]
     assert len(msg_re_list) == len(errors)
     for err_str, re_str in zip(errors, msg_re_list):
diff --git a/tests/test_ome_tiff_validator.py b/tests/test_ome_tiff_validator.py
index 09e2163..89ad198 100644
--- a/tests/test_ome_tiff_validator.py
+++ b/tests/test_ome_tiff_validator.py
@@ -1,20 +1,27 @@
-from pathlib import Path
-import zipfile
 import re
+import zipfile
+from pathlib import Path
 
 import pytest
 
-@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), (
-    ('test_data/codex_tree_ometiff_bad.zip',
-     ['.*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*']),
-    ('test_data/codex_tree_ometiff_good.zip',[]),
-    ))
+
+@pytest.mark.parametrize(
+    ("test_data_fname", "msg_re_list"),
+    (
+        (
+            "test_data/codex_tree_ometiff_bad.zip",
+            [".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*"],
+        ),
+        ("test_data/codex_tree_ometiff_good.zip", []),
+    ),
+)
 def test_ome_tiff_validator(test_data_fname, msg_re_list, tmp_path):
     from ome_tiff_validator import OmeTiffValidator
+
     test_data_path = Path(test_data_fname)
     zfile = zipfile.ZipFile(test_data_path)
     zfile.extractall(tmp_path)
-    validator = OmeTiffValidator(tmp_path / test_data_path.stem, 'CODEX')
+    validator = OmeTiffValidator(tmp_path / test_data_path.stem, "CODEX")
     errors = validator.collect_errors(coreuse=4)[:]
     assert len(msg_re_list) == len(errors)
     for err_str, re_str in zip(errors, msg_re_list):
diff --git a/tests/test_publication_validator.py b/tests/test_publication_validator.py
index ebb768c..50c3a0e 100644
--- a/tests/test_publication_validator.py
+++ b/tests/test_publication_validator.py
@@ -1,34 +1,46 @@
-from pathlib import Path
-import zipfile
 import re
+import zipfile
+from pathlib import Path
 
 import pytest
 
-@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), (
-    ('test_data/publication_tree_good.zip', []),
-    ('test_data/publication_tree_good_complex.zip', []),
-    ('test_data/publication_tree_bad_complex.zip',
-     [
-      'expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent',
-      'expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent',
-      'expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent',
-     ]),
-    ('test_data/publication_tree_bad_1.zip', ['vignettes not found or not a directory']),
-    ('test_data/publication_tree_bad_2.zip', ['Found a subdirectory in a vignette']),
-    ('test_data/publication_tree_bad_3.zip', ['A vignette has more than one markdown file']),
-    ('test_data/publication_tree_bad_4.zip', ['figure dict does not provide a name']),
-    ('test_data/publication_tree_bad_5.zip', ['figure dict does not reference a file']),
-    ('test_data/publication_tree_bad_6.zip', ['unexpected files in vignette.*']),
-    ('test_data/publication_tree_bad_7.zip', ['expected data file'
-                                              ' data/codeluppi_2018_nature_methods.molecules.h5ad.zarr'
-                                              ' is absent']),
-    ))
+
+@pytest.mark.parametrize(
+    ("test_data_fname", "msg_re_list"),
+    (
+        ("test_data/publication_tree_good.zip", []),
+        ("test_data/publication_tree_good_complex.zip", []),
+        (
+            "test_data/publication_tree_bad_complex.zip",
+            [
+                "expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent",  # noqa: E501
+                "expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent",  # noqa: E501
+                "expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent",  # noqa: E501
+            ],
+        ),
+        ("test_data/publication_tree_bad_1.zip", ["vignettes not found or not a directory"]),
+        ("test_data/publication_tree_bad_2.zip", ["Found a subdirectory in a vignette"]),
+        ("test_data/publication_tree_bad_3.zip", ["A vignette has more than one markdown file"]),
+        ("test_data/publication_tree_bad_4.zip", ["figure dict does not provide a name"]),
+        ("test_data/publication_tree_bad_5.zip", ["figure dict does not reference a file"]),
+        ("test_data/publication_tree_bad_6.zip", ["unexpected files in vignette.*"]),
+        (
+            "test_data/publication_tree_bad_7.zip",
+            [
+                "expected data file"
+                " data/codeluppi_2018_nature_methods.molecules.h5ad.zarr"
+                " is absent"
+            ],
+        ),
+    ),
+)
 def test_publication_validator(test_data_fname, msg_re_list, tmp_path):
     from publication_validator import PublicationValidator
+
     test_data_path = Path(test_data_fname)
     zfile = zipfile.ZipFile(test_data_path)
     zfile.extractall(tmp_path)
-    validator = PublicationValidator(tmp_path / test_data_path.stem, 'Publication')
+    validator = PublicationValidator(tmp_path / test_data_path.stem, "Publication")
     errors = validator.collect_errors(coreuse=4)[:]
     print(f"errors: {errors}")
     matched_err_str_list = []
diff --git a/tests/test_tiff_validator.py b/tests/test_tiff_validator.py
index c2d271b..4ebe66c 100644
--- a/tests/test_tiff_validator.py
+++ b/tests/test_tiff_validator.py
@@ -1,24 +1,32 @@
-from pathlib import Path
-import zipfile
 import re
+import zipfile
+from pathlib import Path
 
 import pytest
 
-@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), (
-    ('test_data/tiff_tree_good.zip', []),
-    ('test_data/tiff_tree_bad.zip', [
-        '.*notatiff.tif is not a valid TIFF file',
-        '.*notatiff.tiff is not a valid TIFF file',
-        '.*notatiff.TIFF is not a valid TIFF file',
-        '.*notatiff.TIF is not a valid TIFF file',
-        ]),
-    ))
+
+@pytest.mark.parametrize(
+    ("test_data_fname", "msg_re_list"),
+    (
+        ("test_data/tiff_tree_good.zip", []),
+        (
+            "test_data/tiff_tree_bad.zip",
+            [
+                ".*notatiff.tif is not a valid TIFF file",
+                ".*notatiff.tiff is not a valid TIFF file",
+                ".*notatiff.TIFF is not a valid TIFF file",
+                ".*notatiff.TIF is not a valid TIFF file",
+            ],
+        ),
+    ),
+)
 def test_tiff_validator(test_data_fname, msg_re_list, tmp_path):
     from tiff_validator import TiffValidator
+
     test_data_path = Path(test_data_fname)
     zfile = zipfile.ZipFile(test_data_path)
     zfile.extractall(tmp_path)
-    validator = TiffValidator(tmp_path / test_data_path.stem, 'codex')
+    validator = TiffValidator(tmp_path / test_data_path.stem, "codex")
     errors = validator.collect_errors(coreuse=4)[:]
     print(f"errors: {errors}")
     matched_err_str_list = []

From 174aae05eca907b1f24589040b2a5e300838c704 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 12:40:50 -0500
Subject: [PATCH 04/19] testing black gh action

---
 .github/workflows/black.yml | 14 +++++++++++++
 README.md                   | 42 +++++++++++++++++++++++++++++++++++--
 requirements-dev.txt        |  2 +-
 3 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/black.yml

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
new file mode 100644
index 0000000..a4a0460
--- /dev/null
+++ b/.github/workflows/black.yml
@@ -0,0 +1,14 @@
+name: Lint
+
+on: [push, pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: psf/black@stable
+        with:
+          options: "--check --verbose --line-length 99"
+          src: "./src"
+          version: "~= 24.0"
diff --git a/README.md b/README.md
index 2db444a..033f5b4 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,47 @@
 # ingest-validation-tests
 
-This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.
+This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.
 
 ## Development process
 
+### Branches
+
 - Make new feature branches from `devel`.
 - Make PRs to `devel`. (This is the default branch.)
-- The last reviewer to approve a PR should merge it. At the moment that is likely to be @jswelling .
+- The last reviewer to approve a PR should merge it.
+
+### Setup
+
+- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:
+
+```
+python3.9 -m venv hm-ingest-validation-tests
+source hm-ingest-validation-tests/bin/activate
+```
+
+- Run `pip install -r requirements-dev.txt`
+- (optional) Integrate black with your editor.
+  - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
+  - If you choose not to integrate black with your editor, run the following from the base `ingest-validation-tests` directory before pushing code to GitHub: `black --line-length 99 .`
+
+### Testing
+
+- If ingest-validation-tools is not already set up:
+
+```
+# Starting from ingest-validation-tests...
+cd ..
+git clone https://github.com/hubmapconsortium/ingest-validation-tools.git +cd ingest-validation-tests +pip install -r ../ingest-validation-tools/requirements.txt +pip install -r ../ingest-validation-tools/requirements-dev.txt +``` + +- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run + +``` +pip install -r /requirements.txt +pip install -r /requirements-dev.txt +``` + +- Run `test.sh` diff --git a/requirements-dev.txt b/requirements-dev.txt index d5c087a..2cd9d49 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,10 +1,10 @@ black==23.12.1 -flake8==7.0.0 git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils imagecodecs>=2023.3.16 isort==5.13.2 jsonschema==4.4.0 pandas>=1.2.0 +pytest==8.0.0 python-frontmatter>=1.0.0 tifffile==2020.10.1 xmlschema>=1.6 From 0dd0cb643a841301b7f1acea6e7c1891bfa70af6 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 12:53:29 -0500 Subject: [PATCH 05/19] linting change --- src/ingest_validation_tests/fastq_validator_logic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 8095b74..440ceb4 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -122,9 +122,9 @@ def _validate_fastq_line_4(self, line: str) -> List[str]: def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[ - [FASTQValidatorLogic, str], List[str] - ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index] + validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = ( + self._VALIDATE_FASTQ_LINE_METHODS[line_index] + ) assert validator_method, f"No validator method defined for record index {line_index}" From 303e87d8179cceb5b310d0faa64f76726b213d69 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 13:01:40 -0500 Subject: [PATCH 06/19] testing update to dismiss error about node version --- .github/workflows/black.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index a4a0460..d8d7e0b 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -6,7 +6,7 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: psf/black@stable with: options: "--check --verbose --line-length 99" From edac78ced52c81068baa4504558d7bdd9e5eeeed Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 13:06:05 -0500 Subject: [PATCH 07/19] trying isort gh action --- .github/workflows/isort.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/workflows/isort.yml diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml new file mode 100644 index 0000000..7bfa84f --- /dev/null +++ b/.github/workflows/isort.yml @@ -0,0 +1,13 @@ +name: Run isort +on: + - push + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: isort/isort-action@v1 + with: + configuration: "--profile black" + requirements-files: "requirements.txt requirements-dev.txt" From 9d368d12c070eddb93dc35d96786b4a7740edd3f Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 13:49:30 -0500 Subject: [PATCH 08/19] testing isort 3 --- .github/workflows/isort.yml | 9 ++++++--- 1 file changed, 
6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index 7bfa84f..fdf5e4a 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -7,7 +7,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: isort/isort-action@v1
+      - uses: actions/setup-python@v5
         with:
-          configuration: "--profile black"
-          requirements-files: "requirements.txt requirements-dev.txt"
+          python-version: 3.9
+      - uses: isort/isort-action@master
+        with:
+          configuration: "--profile black"
+          requirementsFiles: "requirements.txt requirements-test.txt"

From 2f6f1d9cbc2070e63565625537acf7f013b97f2c Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:51:03 -0500
Subject: [PATCH 09/19] testing isort 4

---
 .github/workflows/isort.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index fdf5e4a..52f9cf5 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -12,5 +12,5 @@ jobs:
           python-version: 3.9
       - uses: isort/isort-action@master
         with:
-          configuration: "--profile black"
+          configuration: "--check-only --diff --profile black"
           requirementsFiles: "requirements.txt requirements-test.txt"

From 4f9c09fb12e3cd645a5aa8c925c0106bcacdcdf4 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:10:23 -0500
Subject: [PATCH 10/19] updated actions to run on PR; updated README.md

---
 .github/workflows/black.yml | 3 +--
 .github/workflows/isort.yml | 3 +--
 README.md                   | 5 ++++-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index d8d7e0b..7c02822 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -1,6 +1,5 @@
 name: Lint
-
-on: [push, pull_request]
+on: pull_request
 
 jobs:
   lint:
diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index 52f9cf5..db105a8 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -1,6 +1,5 @@
 name: Run isort
-on:
-  - push
+on: pull_request
 
 jobs:
   build:
diff --git a/README.md b/README.md
index 033f5b4..4bddaf1 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,10 @@ This repository contains plug-in tests for use during validation of submissions.
 ### Branches
 
 - Make new feature branches from `devel`.
+- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
+  - `black --line-length 99 .` (if you choose not to integrate black with your editor; see Setup section)
+  - `isort .`
+
 - Make PRs to `devel`. (This is the default branch.)
 - The last reviewer to approve a PR should merge it.
@@ -22,7 +26,6 @@ source hm-ingest-validation-tests/bin/activate
 ```
 
 - Run `pip install -r requirements-dev.txt`
 - (optional) Integrate black with your editor.
  - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
-- If you choose not to integrate black with your editor, run the following from the base `ingest-validation-tests` directory before pushing code to GitHub: `black --line-length 99 .`
 
 ### Testing
 
From a029b0061d251cba10393e0be7843e2a86fc9a00 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:35:56 -0500
Subject: [PATCH 11/19] cleanup

---
 .flake8    |  3 ---
 .gitignore |  1 +
 README.md  | 48 +++++++++++++++++++++++++++---------------------
 3 files changed, 28 insertions(+), 24 deletions(-)
 delete mode 100644 .flake8

diff --git a/.flake8 b/.flake8
deleted file mode 100644
index ad98539..0000000
--- a/.flake8
+++ /dev/null
@@ -1,3 +0,0 @@
-[flake8]
-max-line-length = 100
-extend-ignore = E203
diff --git a/.gitignore b/.gitignore
index da9fde3..79cb0cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .vscode/
+.flake8
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index 4bddaf1..b0f37ab 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,13 @@ This repository contains plug-in tests for use during validation of submissions.
 ### Branches
 
 - Make new feature branches from `devel`.
 - Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
-  - `black --line-length 99 .` (if you choose not to integrate black with your editor; see Setup section)
-  - `isort .`
+
+  ```
+  black --line-length 99 .
+  isort --profile black --multi-line 3 .
+  ```
+
+  (You can integrate black and potentially isort with your editor to skip this step, see Setup section below)
 
 - Make PRs to `devel`. (This is the default branch.)
 - The last reviewer to approve a PR should merge it.
@@ -18,33 +23,34 @@
 
 - Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:
 
-```
-python3.9 -m venv hm-ingest-validation-tests
-source hm-ingest-validation-tests/bin/activate
-```
+  ```
+  python3.9 -m venv hm-ingest-validation-tests
+  source hm-ingest-validation-tests/bin/activate
+  ```
 
 - Run `pip install -r requirements-dev.txt`
 - (optional) Integrate black with your editor.
   - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
+- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
 
 ### Testing
 
 - If ingest-validation-tools is not already set up:
 
  ```
  # Starting from ingest-validation-tests...
  cd ..
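+  # ingest-validation-tools is cloned as a sibling of ingest-validation-tests; the ../ paths below rely on that layout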
+ git clone https://github.com/hubmapconsortium/ingest-validation-tools.git + cd ingest-validation-tests + pip install -r ../ingest-validation-tools/requirements.txt + pip install -r ../ingest-validation-tools/requirements-dev.txt + ``` + +- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run: + + ``` + pip install -r /requirements.txt + pip install -r /requirements-dev.txt + ``` - Run `test.sh` From 9604ce7a38b7a20ebfa37c93c95bffbdca0ed045 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 14:42:24 -0500 Subject: [PATCH 12/19] cleanup --- .github/workflows/isort.yml | 4 ++-- pyproject.toml | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml index db105a8..2ef4557 100644 --- a/.github/workflows/isort.yml +++ b/.github/workflows/isort.yml @@ -11,5 +11,5 @@ jobs: python-version: 3.9 - uses: isort/isort-action@master with: - configuration: "--check-only --diff --profile black" - requirementsFiles: "requirements.txt requirements-test.txt" + configuration: "--check-only --diff --profile black" + requirementsFiles: "requirements.txt requirements-test.txt" diff --git a/pyproject.toml b/pyproject.toml index 72dd8d9..93863fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,4 +4,3 @@ line-length = 99 [tool.isort] profile = "black" multi_line_output = 3 -src_paths = ["src/ingest-validation-tests"] From e30f4263b8b9c44c98262fc2f8fce199b209b33c Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 14:45:11 -0500 Subject: [PATCH 13/19] oops forgot to update test file --- tests/test_fastq_validator_logic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index c554d6f..31bd68a 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -3,6 +3,7 @@ from typing import TextIO import pytest + from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic _GOOD_RECORDS = """\ From 99858a1dfca86648c83b1f4b9241bd11d59c8b5d Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 2 Feb 2024 13:46:17 -0500 Subject: [PATCH 14/19] simplifying workflows yml file, adding flake8 --- .flake8 | 3 +++ .github/workflows/black.yml | 13 ------------- .github/workflows/isort.yml | 15 --------------- .github/workflows/linting.yml | 32 ++++++++++++++++++++++++++++++++ .gitignore | 1 - README.md | 6 ++++-- requirements-dev.txt | 1 + 7 files changed, 40 insertions(+), 31 deletions(-) create mode 100644 .flake8 delete mode 100644 .github/workflows/black.yml delete mode 100644 .github/workflows/isort.yml create mode 100644 .github/workflows/linting.yml diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..70fc30a --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +extend-ignore = E203,E501,W503 +max-line-length = 99 diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml deleted file mode 100644 index 7c02822..0000000 --- a/.github/workflows/black.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: Lint -on: pull_request - -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: psf/black@stable - with: - options: "--check --verbose --line-length 99" - src: "./src" - version: "~= 24.0" diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml deleted file mode 100644 index 2ef4557..0000000 --- a/.github/workflows/isort.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: Run isort -on: 
pull_request - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: 3.9 - - uses: isort/isort-action@master - with: - configuration: "--check-only --diff --profile black" - requirementsFiles: "requirements.txt requirements-test.txt" diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 0000000..f34ebe8 --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,32 @@ +name: Linters + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 black isort + - name: Flake8 Lint + run: | + flake8 --ignore=E501,W503,E203 + - name: Black Lint + run: | + black --line-length 99 --check --verbose + - name: isort Lint + run: | + isort --profile black --check-only --diff diff --git a/.gitignore b/.gitignore index 79cb0cf..da9fde3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ .vscode/ -.flake8 # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index b0f37ab..0cca7ba 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,15 @@ This repository contains plug-in tests for use during validation of submissions. ### Branches - Make new feature branches from `devel`. -- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory: +- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory: ``` black --line-length 99 . isort --profile black --multi-line 3 . + flake8 ``` - (You can integrate black and potentially isort with your editor to skip this step, see Setup section below) + (Integrating black and potentially isort/flake8 with your editor may allow you to skip this step, see Setup section below.) - Make PRs to `devel`. (This is the default branch.) - The last reviewer to approve a PR should merge it. @@ -32,6 +33,7 @@ This repository contains plug-in tests for use during validation of submissions. - (optional) Integrate black with your editor. - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html) - (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor. +- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor. 
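
The three optional-integration bullets above mirror exactly what the new `linting.yml` workflow runs in CI. If a single local entry point is preferred, a helper along these lines would work; this is an illustrative sketch only, not a script that ships with the repository:

```python
#!/usr/bin/env python3
"""Run the same checks CI runs: flake8, black, and isort (illustrative sketch)."""
import subprocess
import sys

# Mirrors the commands in .github/workflows/linting.yml and the README.
CHECKS = [
    ["flake8", "--ignore=E501,W503,E203", "."],
    ["black", "--line-length", "99", "--check", "."],
    ["isort", "--profile", "black", "--check-only", "--diff", "."],
]


def main() -> int:
    # Each linter exits non-zero when it finds problems; run all three and report.
    failed = [cmd[0] for cmd in CHECKS if subprocess.run(cmd).returncode != 0]
    if failed:
        print(f"Failed checks: {', '.join(failed)}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
```

Every command and flag above matches the workflow and README; only the wrapper script itself is invented here.
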
### Testing diff --git a/requirements-dev.txt b/requirements-dev.txt index 2cd9d49..ba6f71b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ black==23.12.1 +flake8==7.0.0 git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils imagecodecs>=2023.3.16 isort==5.13.2 From 67436d4e59399b25fce5d76d159edc85a42b9ca2 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 2 Feb 2024 13:47:38 -0500 Subject: [PATCH 15/19] fixing mistake --- .github/workflows/linting.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index f34ebe8..b4fb9d4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -23,10 +23,10 @@ jobs: pip install flake8 black isort - name: Flake8 Lint run: | - flake8 --ignore=E501,W503,E203 + flake8 --ignore=E501,W503,E203 . - name: Black Lint run: | - black --line-length 99 --check --verbose + black --line-length 99 --check --verbose . - name: isort Lint run: | - isort --profile black --check-only --diff + isort --profile black --check-only --diff . From ee0ebc71bcbd4d9ed1dab3726b12637aefe5cccd Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 26 Feb 2024 16:15:30 -0500 Subject: [PATCH 16/19] fixing silly typo --- src/ingest_validation_tests/publication_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py index a68d549..a3d705a 100644 --- a/src/ingest_validation_tests/publication_validator.py +++ b/src/ingest_validation_tests/publication_validator.py @@ -21,7 +21,7 @@ class PublicationValidator(Validator): cost = 1.0 base_url_re = r"(\s*\{\{\s*base_url\s*\}\})/(.*)" url_re = r"[Uu][Rr][Ll]" - required = "publications" + required = "publication" def collect_errors(self, **kwargs) -> List[str]: """ From 92d305faf7bc3f0de7589074e9d206dd689efc55 Mon Sep 17 00:00:00 2001 From: David Betancur Date: Mon, 15 Apr 2024 12:24:30 -0400 Subject: [PATCH 17/19] Adding exception handling for different errors on Gzip files. Adding exception tasks to properly close multiprocessing pools. 
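
Condensed, the pattern these two diffs introduce: if handing work off to the pool raises, close and join the pool before recording the error, rather than leaving worker processes running. A standalone sketch of that shape (`engine` and `file_list` follow the naming in the diffs below; everything else is illustrative):

```python
from multiprocessing import Pool
from typing import Callable, Iterable, List


def run_validation_pool(engine: Callable, file_list: Iterable, threads: int) -> List[str]:
    """Illustrative sketch of the close/join-on-error pattern added here."""
    results: List[str] = []
    pool = Pool(threads)
    try:
        data_output = pool.imap_unordered(engine, file_list)
    except Exception as e:
        # New in this patch: shut the workers down cleanly on failure,
        # then surface the exception as a validation error.
        pool.close()
        pool.join()
        results.append(f"Error {e}")
    else:
        pool.close()
        pool.join()
        for output in data_output:  # each worker returns a list of error strings
            if output:
                results.extend(output)
    return results
```

One wrinkle the gz_validator.py hunk keeps: `data_output2.extend(f"Error: {e}")` extends the list with the string's individual characters, so `append` was presumably the intent.
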
--- src/ingest_validation_tests/fastq_validator_logic.py | 7 +++++++ src/ingest_validation_tests/gz_validator.py | 3 +++ 2 files changed, 10 insertions(+) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 440ceb4..2c87c91 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -167,6 +167,10 @@ def validate_fastq_file(self, fastq_file: Path) -> None: except IOError: self.errors.append(self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) return + except EOFError: + self.errors.append(self._format_error(f"EOF in FASTQ data file {fastq_file}.")) + except Exception as e: + self.errors.append(self._format_error(f"Unexpected error: {e} on data file {fastq_file}.")) self._file_record_counts[str(fastq_file)] = records_read def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: @@ -192,6 +196,9 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: data_output = pool.imap_unordered(engine, file_list) except Exception as e: _log(f"Error {e}") + pool.close() + pool.join() + data_found_one.append(f"Error {e}") else: pool.close() pool.join() diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index 9bb0162..fe0a147 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -46,6 +46,9 @@ def collect_errors(self, **kwargs) -> List[str]: data_output = pool.imap_unordered(engine, file_list) except Exception as e: _log(f"Error {e}") + pool.close() + pool.join() + data_output2.extend(f"Error: {e}") else: pool.close() pool.join() From fadcf183dbfca18a56ed8dfe1400de3aa435b755 Mon Sep 17 00:00:00 2001 From: David Betancur Date: Mon, 15 Apr 2024 12:29:19 -0400 Subject: [PATCH 18/19] Updating long line. --- src/ingest_validation_tests/fastq_validator_logic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 2c87c91..b786fc3 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -222,7 +222,8 @@ def _find_duplicates(self, dirs_and_files): if len(filepaths) > 1: self.errors.append( _log( - f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}." # noqa: E501 + f"{filename} has been found multiple times during this validation. " + f"Locations of duplicates: {filepaths}." 
# noqa: E501 ) ) From 30e25082c2b4dec13b86989cf62f1698c6327d85 Mon Sep 17 00:00:00 2001 From: David Betancur Date: Mon, 15 Apr 2024 12:30:50 -0400 Subject: [PATCH 19/19] Black linting --- src/ingest_validation_tests/fastq_validator_logic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index b786fc3..7ae516f 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -170,7 +170,9 @@ def validate_fastq_file(self, fastq_file: Path) -> None: except EOFError: self.errors.append(self._format_error(f"EOF in FASTQ data file {fastq_file}.")) except Exception as e: - self.errors.append(self._format_error(f"Unexpected error: {e} on data file {fastq_file}.")) + self.errors.append( + self._format_error(f"Unexpected error: {e} on data file {fastq_file}.") + ) self._file_record_counts[str(fastq_file)] = records_read def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None:
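
A closing note on the handler these last patches touch: gzip fails differently for a file that cannot be opened (or is not gzip at all) than for an archive cut off mid-stream, and the three except branches map onto exactly that split. A standalone illustration, not the repository's code, which also counts records and formats messages through `_format_error`:

```python
import gzip
from pathlib import Path
from typing import List


def scan_fastq_gz(fastq_file: Path) -> List[str]:
    """Sketch of why EOFError gets its own branch next to IOError."""
    errors: List[str] = []
    try:
        with gzip.open(fastq_file, "rt") as f:
            for _line in f:
                pass  # the real validator checks each FASTQ record line here
    except IOError:
        # Covers missing/unreadable files and gzip.BadGzipFile (an OSError subclass).
        errors.append(f"Unable to open FASTQ data file {fastq_file}.")
    except EOFError:
        # gzip raises EOFError when a .gz stream ends before its end-of-stream
        # marker, i.e. a truncated upload -- the case reported separately here.
        errors.append(f"EOF in FASTQ data file {fastq_file}.")
    except Exception as e:
        errors.append(f"Unexpected error: {e} on data file {fastq_file}.")
    return errors
```
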