From 53e886af56a8fb4f92d467d5dc4b09b3d646efbb Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Tue, 20 Oct 2020 10:12:12 -0400 Subject: [PATCH 01/63] Expose pipeline manifest "is_qa_qc" flag to Elasticsearch index --- src/ingest-pipeline/airflow/dags/utils.py | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/utils.py b/src/ingest-pipeline/airflow/dags/utils.py index d764fb39..16f45275 100644 --- a/src/ingest-pipeline/airflow/dags/utils.py +++ b/src/ingest-pipeline/airflow/dags/utils.py @@ -100,9 +100,12 @@ COMPILED_WORKFLOW_MAP: Optional[List[Tuple[Pattern, Pattern, str]]] = None +ManifestMatch = Tuple[bool, Optional[str], Optional[str], Optional[bool]] + + class FileMatcher(ABC): @abstractmethod - def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]: + def get_file_metadata(self, file_path: Path) -> ManifestMatch: """ :return: A 3-tuple: [0] bool, whether to add `file_path` to a downstream index @@ -113,20 +116,21 @@ def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optio class PipelineFileMatcher(FileMatcher): # (file/directory regex, description template, EDAM ontology term) - matchers: List[Tuple[Pattern, str, str]] + matchers: List[Tuple[Pattern, str, str, bool]] def __init__(self): self.matchers = [] @classmethod - def read_manifest(cls, pipeline_file_manifest: Path) -> Iterable[Tuple[Pattern, str, str]]: + def read_manifest(cls, pipeline_file_manifest: Path) -> Iterable[Tuple[Pattern, str, str, bool]]: with open(pipeline_file_manifest) as f: manifest = json.load(f) localized_assert_json_matches_schema(manifest, 'pipeline_file_manifest.yml') for annotation in manifest: pattern = re.compile(annotation['pattern']) - yield pattern, annotation['description'], annotation['edam_ontology_term'] + is_qa_qc = annotation.get('is_qa_qc', False) + yield pattern, annotation['description'], annotation['edam_ontology_term'], is_qa_qc @classmethod def create_from_files(cls, pipeline_file_manifests: Iterable[Path]): @@ -135,7 +139,7 @@ def create_from_files(cls, pipeline_file_manifests: Iterable[Path]): obj.matchers.extend(cls.read_manifest(manifest)) return obj - def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]: + def get_file_metadata(self, file_path: Path) -> ManifestMatch: """ Checks `file_path` against the list of patterns stored in this object. At the first match, return the associated description and ontology term. @@ -143,13 +147,13 @@ def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optio the "first-match" behavior is deliberate. """ path_str = fspath(file_path) - for pattern, description_template, ontology_term in self.matchers: + for pattern, description_template, ontology_term, is_qa_qc in self.matchers: # TODO: walrus operator m = pattern.search(path_str) if m: formatted_description = description_template.format_map(m.groupdict()) - return True, formatted_description, ontology_term - return False, None, None + return True, formatted_description, ontology_term, is_qa_qc + return False, None, None, None class DummyFileMatcher(FileMatcher): @@ -376,6 +380,7 @@ def get_file_metadata(root_dir: str, matcher: FileMatcher) -> List[Mapping[str, 'size': , 'description': , 'edam_term': , + 'is_qa_qc': , }, ... 
] @@ -389,7 +394,7 @@ def get_file_metadata(root_dir: str, matcher: FileMatcher) -> List[Mapping[str, for fn in fnames: full_path = dp / fn relative_path = full_path.relative_to(root_path) - add_to_index, description, ontology_term = matcher.get_file_metadata(relative_path) + add_to_index, description, ontology_term, is_qa_qc = matcher.get_file_metadata(relative_path) if add_to_index: # sha1sum disabled because of run time issues on large data collections #line = check_output([word.format(fname=full_path) @@ -402,6 +407,7 @@ def get_file_metadata(root_dir: str, matcher: FileMatcher) -> List[Mapping[str, 'size': getsize(full_path), 'description': description, 'edam_term': ontology_term, + 'is_qa_qc': is_qa_qc, #'sha1sum': cs, } ) From 0bf8b12ffdf5415a2469b5ff48d571c008fdf694 Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Tue, 20 Oct 2020 10:16:10 -0400 Subject: [PATCH 02/63] Add "is_qa_qc" to metadata schemas Not required in the pipeline_file_manifest schema; it defaults to False --- src/ingest-pipeline/schemata/file_info_schema.json | 3 +++ src/ingest-pipeline/schemata/file_info_schema.yml | 1 + src/ingest-pipeline/schemata/pipeline_file_manifest.json | 4 ++++ src/ingest-pipeline/schemata/pipeline_file_manifest.yml | 3 +++ 4 files changed, 11 insertions(+) diff --git a/src/ingest-pipeline/schemata/file_info_schema.json b/src/ingest-pipeline/schemata/file_info_schema.json index dd4b2112..b4ab7fad 100644 --- a/src/ingest-pipeline/schemata/file_info_schema.json +++ b/src/ingest-pipeline/schemata/file_info_schema.json @@ -24,6 +24,9 @@ }, "edam_term": { "type": "string" + }, + "is_qa_qc": { + "type": "boolean" } } }, diff --git a/src/ingest-pipeline/schemata/file_info_schema.yml b/src/ingest-pipeline/schemata/file_info_schema.yml index 3ee1a5d4..6af38210 100644 --- a/src/ingest-pipeline/schemata/file_info_schema.yml +++ b/src/ingest-pipeline/schemata/file_info_schema.yml @@ -15,6 +15,7 @@ 'size': {'type': 'integer', 'minimum': 0} 'description': {'type': 'string'} 'edam_term': {'type': 'string'} + 'is_qa_qc': { 'type': 'boolean' } #'sha1sum': {'type': 'string', 'pattern': '^[a-fA-F0-9]{40}$'} 'file_info': 'type': 'array' diff --git a/src/ingest-pipeline/schemata/pipeline_file_manifest.json b/src/ingest-pipeline/schemata/pipeline_file_manifest.json index b36ed263..542f6635 100644 --- a/src/ingest-pipeline/schemata/pipeline_file_manifest.json +++ b/src/ingest-pipeline/schemata/pipeline_file_manifest.json @@ -18,6 +18,10 @@ "edam_ontology_term": { "type": "string", "description": "Term in the EDAM ontology describing this pipeline output file" + }, + "is_qa_qc": { + "type": "boolean", + "description": "Whether this file is a QA/QC report" } }, "required": ["pattern", "description", "edam_ontology_term"] diff --git a/src/ingest-pipeline/schemata/pipeline_file_manifest.yml b/src/ingest-pipeline/schemata/pipeline_file_manifest.yml index ff464b2d..27d96952 100644 --- a/src/ingest-pipeline/schemata/pipeline_file_manifest.yml +++ b/src/ingest-pipeline/schemata/pipeline_file_manifest.yml @@ -15,4 +15,7 @@ 'edam_ontology_term': 'type': 'string' 'description': 'Term in the EDAM ontology describing this pipeline output file' + 'is_qa_qc': + 'type': 'boolean' + 'description': 'Whether this file is a QA/QC report' 'required': ['pattern', 'description', 'edam_ontology_term'] From 21eb367ede9c892a2b7e2f870bca2cdaaf108007 Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Tue, 20 Oct 2020 10:34:09 -0400 Subject: [PATCH 03/63] Fix return type of DummyFileMatcher.get_file_metadata --- 
src/ingest-pipeline/airflow/dags/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/utils.py b/src/ingest-pipeline/airflow/dags/utils.py index 16f45275..b56f6f12 100644 --- a/src/ingest-pipeline/airflow/dags/utils.py +++ b/src/ingest-pipeline/airflow/dags/utils.py @@ -161,8 +161,8 @@ class DummyFileMatcher(FileMatcher): Drop-in replacement for PipelineFileMatcher which allows everything and always provides empty descriptions and ontology terms. """ - def get_file_metadata(self, file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]: - return True, '', '' + def get_file_metadata(self, file_path: Path) -> ManifestMatch: + return True, '', '', False def find_pipeline_manifests(cwl_files: Iterable[Path]) -> List[Path]: From 457cdf82877ca87c18906ca14693f12273a14809 Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Wed, 21 Oct 2020 10:25:03 -0400 Subject: [PATCH 04/63] Bump salmon-rnaseq submodule to v1.5.4 --- src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq index 11e4b28c..bd013b61 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq +++ b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq @@ -1 +1 @@ -Subproject commit 11e4b28c8e22d856fb3e1f65231b6b31b556e150 +Subproject commit bd013b61984c2dff304249205f69bdbca3055100 From b93e6b602385a2026dd2fec27070ce6007170bb8 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Mon, 26 Oct 2020 20:39:13 -0400 Subject: [PATCH 05/63] tifffile.py was the only thing in thirdparty and it is no longer needed. --- build_number | 2 +- src/ingest-pipeline/md/thirdparty/__init__.py | 0 src/ingest-pipeline/md/thirdparty/tifffile.py | 12122 ---------------- 3 files changed, 1 insertion(+), 12123 deletions(-) delete mode 100644 src/ingest-pipeline/md/thirdparty/__init__.py delete mode 100644 src/ingest-pipeline/md/thirdparty/tifffile.py diff --git a/build_number b/build_number index bf2c10d2..cd5a7180 100644 --- a/build_number +++ b/build_number @@ -1 +1 @@ -477 +591 diff --git a/src/ingest-pipeline/md/thirdparty/__init__.py b/src/ingest-pipeline/md/thirdparty/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/ingest-pipeline/md/thirdparty/tifffile.py b/src/ingest-pipeline/md/thirdparty/tifffile.py deleted file mode 100644 index 56055552..00000000 --- a/src/ingest-pipeline/md/thirdparty/tifffile.py +++ /dev/null @@ -1,12122 +0,0 @@ -# -*- coding: utf-8 -*- -# tifffile.py - -# Copyright (c) 2008-2019, Christoph Gohlke -# Copyright (c) 2008-2019, The Regents of the University of California -# Produced at the Laboratory for Fluorescence Dynamics -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - -"""Read and write TIFF(r) files. - -Tifffile is a Python library to - -(1) store numpy arrays in TIFF (Tagged Image File Format) files, and -(2) read image and metadata from TIFF-like files used in bioimaging. - -Image and metadata can be read from TIFF, BigTIFF, OME-TIFF, STK, LSM, SGI, -NIHImage, ImageJ, MicroManager, FluoView, ScanImage, SEQ, GEL, SVS, SCN, SIS, -ZIF, QPI, NDPI, and GeoTIFF files. - -Numpy arrays can be written to TIFF, BigTIFF, and ImageJ hyperstack compatible -files in multi-page, memory-mappable, tiled, predicted, or compressed form. - -Only a subset of the TIFF specification is supported, mainly uncompressed and -losslessly compressed 1, 8, 16, 32 and 64-bit integer, 16, 32 and 64-bit float, -grayscale and RGB(A) images. -Specifically, reading slices of image data, CCITT and OJPEG compression, -chroma subsampling without JPEG compression, or IPTC and XMP metadata are not -implemented. - -TIFF(r), the Tagged Image File Format, is a trademark and under control of -Adobe Systems Incorporated. BigTIFF allows for files greater than 4 GB. -STK, LSM, FluoView, SGI, SEQ, GEL, and OME-TIFF, are custom extensions -defined by Molecular Devices (Universal Imaging Corporation), Carl Zeiss -MicroImaging, Olympus, Silicon Graphics International, Media Cybernetics, -Molecular Dynamics, and the Open Microscopy Environment consortium -respectively. - -For command line usage run ``python -m tifffile --help`` - -:Author: - `Christoph Gohlke `_ - -:Organization: - Laboratory for Fluorescence Dynamics, University of California, Irvine - -:License: 3-clause BSD - -:Version: 2019.7.26 - -Requirements ------------- -This release has been tested with the following requirements and dependencies -(other versions may work): - -* `CPython 2.7.16, 3.5.4, 3.6.8, 3.7.4, 64-bit `_ -* `Numpy 1.16.4 `_ -* `Imagecodecs 2019.5.22 `_ - (optional; used for encoding and decoding LZW, JPEG, etc.) -* `Matplotlib 3.1 `_ (optional; used for plotting) -* Python 2.7 requires 'futures', 'enum34', and 'pathlib'. - -Revisions ---------- -2019.7.26 - Pass 2869 tests. - Fix infinite loop reading more than two tags of same code in IFD. - Delay import of logging module. -2019.7.20 - Fix OME-XML detection for files created by Imaris. - Remove or replace assert statements. -2019.7.2 - Do not write SampleFormat tag for unsigned data types. - Write ByteCount tag values as SHORT or LONG if possible. - Allow to specify axes in FileSequence pattern via group names. - Add option to concurrently read FileSequence using threads. - Derive TiffSequence from FileSequence. - Use str(datetime.timedelta) to format Timer duration. - Use perf_counter for Timer if possible. -2019.6.18 - Fix reading planar RGB ImageJ files created by Bio-Formats. 
- Fix reading single-file, multi-image OME-TIFF without UUID. - Presume LSM stores uncompressed images contiguously per page. - Reformat some complex expressions. -2019.5.30 - Ignore invalid frames in OME-TIFF. - Set default subsampling to (2, 2) for RGB JPEG compression. - Fix reading and writing planar RGB JPEG compression. - Replace buffered_read with FileHandle.read_segments. - Include page or frame numbers in exceptions and warnings. - Add Timer class. -2019.5.22 - Add optional chroma subsampling for JPEG compression. - Enable writing PNG, JPEG, JPEGXR, and JPEG2000 compression (WIP). - Fix writing tiled images with WebP compression. - Improve handling GeoTIFF sparse files. -2019.3.18 - Fix regression decoding JPEG with RGB photometrics. - Fix reading OME-TIFF files with corrupted but unused pages. - Allow to load TiffFrame without specifying keyframe. - Calculate virtual TiffFrames for non-BigTIFF ScanImage files > 2GB. - Rename property is_chroma_subsampled to is_subsampled (breaking). - Make more attributes and methods private (WIP). -2019.3.8 - Fix MemoryError when RowsPerStrip > ImageLength. - Fix SyntaxWarning on Python 3.8. - Fail to decode JPEG to planar RGB (tentative). - Separate public from private test files (WIP). - Allow testing without data files or imagecodecs. -2019.2.22 - Use imagecodecs-lite as a fallback for imagecodecs. - Simplify reading numpy arrays from file. - Use TiffFrames when reading arrays from page sequences. - Support slices and iterators in TiffPageSeries sequence interface. - Auto-detect uniform series. - Use page hash to determine generic series. - Turn off page cache (tentative). - Pass through more parameters in imread. - Discontinue movie parameter in imread and TiffFile (breaking). - Discontinue bigsize parameter in imwrite (breaking). - Raise TiffFileError in case of issues with TIFF structure. - Return TiffFile.ome_metadata as XML (breaking). - Ignore OME series when last dimensions are not stored in TIFF pages. -2019.2.10 - Assemble IFDs in memory to speed-up writing on some slow media. - Handle discontinued arguments fastij, multifile_close, and pages. -2019.1.30 - Use black background in imshow. - Do not write datetime tag by default (breaking). - Fix OME-TIFF with SamplesPerPixel > 1. - Allow 64-bit IFD offsets for NDPI (files > 4GB still not supported). -2019.1.4 - Fix decoding deflate without imagecodecs. -2019.1.1 - Update copyright year. - Require imagecodecs >= 2018.12.16. - Do not use JPEG tables from keyframe. - Enable decoding large JPEG in NDPI. - Decode some old-style JPEG. - Reorder OME channel axis to match PlanarConfiguration storage. - Return tiled images as contiguous arrays. - Add decode_lzw proxy function for compatibility with old czifile module. - Use dedicated logger. -2018.11.28 - Make SubIFDs accessible as TiffPage.pages. - Make parsing of TiffSequence axes pattern optional (breaking). - Limit parsing of TiffSequence axes pattern to file names, not path names. - Do not interpolate in imshow if image dimensions <= 512, else use bilinear. - Use logging.warning instead of warnings.warn in many cases. - Fix numpy FutureWarning for out == 'memmap'. - Adjust ZSTD and WebP compression to libtiff-4.0.10 (WIP). - Decode old-style LZW with imagecodecs >= 2018.11.8. - Remove TiffFile.qptiff_metadata (QPI metadata are per page). - Do not use keyword arguments before variable positional arguments. - Make either all or none return statements in a function return expression. - Use pytest parametrize to generate tests. 
- Replace test classes with functions. -2018.11.6 - Rename imsave function to imwrite. - Readd Python implementations of packints, delta, and bitorder codecs. - Fix TiffFrame.compression AttributeError. -2018.10.18 - Rename tiffile package to tifffile. -2018.10.10 - Read ZIF, the Zoomable Image Format (WIP). - Decode YCbCr JPEG as RGB (tentative). - Improve restoration of incomplete tiles. - Allow to write grayscale with extrasamples without specifying planarconfig. - Enable decoding of PNG and JXR via imagecodecs. - Deprecate 32-bit platforms (too many memory errors during tests). -2018.9.27 - Read Olympus SIS (WIP). - Allow to write non-BigTIFF files up to ~4 GB (fix). - Fix parsing date and time fields in SEM metadata. - Detect some circular IFD references. - Enable WebP codecs via imagecodecs. - Add option to read TiffSequence from ZIP containers. - Remove TiffFile.isnative. - Move TIFF struct format constants out of TiffFile namespace. -2018.8.31 - Fix wrong TiffTag.valueoffset. - Towards reading Hamamatsu NDPI (WIP). - Enable PackBits compression of byte and bool arrays. - Fix parsing NULL terminated CZ_SEM strings. -2018.8.24 - Move tifffile.py and related modules into tiffile package. - Move usage examples to module docstring. - Enable multi-threading for compressed tiles and pages by default. - Add option to concurrently decode image tiles using threads. - Do not skip empty tiles (fix). - Read JPEG and J2K compressed strips and tiles. - Allow floating-point predictor on write. - Add option to specify subfiletype on write. - Depend on imagecodecs package instead of _tifffile, lzma, etc modules. - Remove reverse_bitorder, unpack_ints, and decode functions. - Use pytest instead of unittest. -2018.6.20 - Save RGBA with unassociated extrasample by default (breaking). - Add option to specify ExtraSamples values. -2018.6.17 (included with 0.15.1) - Towards reading JPEG and other compressions via imagecodecs package (WIP). - Read SampleFormat VOID as UINT. - Add function to validate TIFF using 'jhove -m TIFF-hul'. - Save bool arrays as bilevel TIFF. - Accept pathlib.Path as filenames. - Move 'software' argument from TiffWriter __init__ to save. - Raise DOS limit to 16 TB. - Lazy load LZMA and ZSTD compressors and decompressors. - Add option to save IJMetadata tags. - Return correct number of pages for truncated series (fix). - Move EXIF tags to TIFF.TAG as per TIFF/EP standard. -2018.2.18 - Always save RowsPerStrip and Resolution tags as required by TIFF standard. - Do not use badly typed ImageDescription. - Coherce bad ASCII string tags to bytes. - Tuning of __str__ functions. - Fix reading 'undefined' tag values. - Read and write ZSTD compressed data. - Use hexdump to print byte strings. - Determine TIFF byte order from data dtype in imsave. - Add option to specify RowsPerStrip for compressed strips. - Allow memory-map of arrays with non-native byte order. - Attempt to handle ScanImage <= 5.1 files. - Restore TiffPageSeries.pages sequence interface. - Use numpy.frombuffer instead of fromstring to read from binary data. - Parse GeoTIFF metadata. - Add option to apply horizontal differencing before compression. - Towards reading PerkinElmer QPI (QPTIFF, no test files). - Do not index out of bounds data in tifffile.c unpackbits and decodelzw. -2017.9.29 - Many backward incompatible changes improving speed and resource usage: - Add detail argument to __str__ function. Remove info functions. - Fix potential issue correcting offsets of large LSM files with positions. 
- Remove TiffFile sequence interface; use TiffFile.pages instead. - Do not make tag values available as TiffPage attributes. - Use str (not bytes) type for tag and metadata strings (WIP). - Use documented standard tag and value names (WIP). - Use enums for some documented TIFF tag values. - Remove 'memmap' and 'tmpfile' options; use out='memmap' instead. - Add option to specify output in asarray functions. - Add option to concurrently decode pages using threads. - Add TiffPage.asrgb function (WIP). - Do not apply colormap in asarray. - Remove 'colormapped', 'rgbonly', and 'scale_mdgel' options from asarray. - Consolidate metadata in TiffFile _metadata functions. - Remove non-tag metadata properties from TiffPage. - Add function to convert LSM to tiled BIN files. - Align image data in file. - Make TiffPage.dtype a numpy.dtype. - Add 'ndim' and 'size' properties to TiffPage and TiffPageSeries. - Allow imsave to write non-BigTIFF files up to ~4 GB. - Only read one page for shaped series if possible. - Add memmap function to create memory-mapped array stored in TIFF file. - Add option to save empty arrays to TIFF files. - Add option to save truncated TIFF files. - Allow single tile images to be saved contiguously. - Add optional movie mode for files with uniform pages. - Lazy load pages. - Use lightweight TiffFrame for IFDs sharing properties with key TiffPage. - Move module constants to 'TIFF' namespace (speed up module import). - Remove 'fastij' option from TiffFile. - Remove 'pages' parameter from TiffFile. - Remove TIFFfile alias. - Deprecate Python 2. - Require enum34 and futures packages on Python 2.7. - Remove Record class and return all metadata as dict instead. - Add functions to parse STK, MetaSeries, ScanImage, SVS, Pilatus metadata. - Read tags from EXIF and GPS IFDs. - Use pformat for tag and metadata values. - Fix reading some UIC tags. - Do not modify input array in imshow (fix). - Fix Python implementation of unpack_ints. -2017.5.23 - Write correct number of SampleFormat values (fix). - Use Adobe deflate code to write ZIP compressed files. - Add option to pass tag values as packed binary data for writing. - Defer tag validation to attribute access. - Use property instead of lazyattr decorator for simple expressions. -2017.3.17 - Write IFDs and tag values on word boundaries. - Read ScanImage metadata. - Remove is_rgb and is_indexed attributes from TiffFile. - Create files used by doctests. -2017.1.12 (included with scikit-image 0.14.x) - Read Zeiss SEM metadata. - Read OME-TIFF with invalid references to external files. - Rewrite C LZW decoder (5x faster). - Read corrupted LSM files missing EOI code in LZW stream. -2017.1.1 - ... - -Refer to the CHANGES file for older revisions. - -Notes ------ -The API is not stable yet and might change between revisions. - -Tested on little-endian platforms only. - -Python 2.7 and 32-bit versions are deprecated. - -Tifffile relies on the `imagecodecs `_ -package for encoding and decoding LZW, JPEG, and other compressed images. -The `imagecodecs-lite `_ package, -which is easier to build, can be used for decoding LZW compressed images -instead. - -Several TIFF-like formats do not strictly adhere to the TIFF6 specification, -some of which allow file or data sizes to exceed the 4 GB limit: - -* *BigTIFF* is identified by version number 43 and uses different file - header, IFD, and tag structures with 64-bit offsets. It adds more data types. - Tifffile can read and write BigTIFF files. 
-* *ImageJ* hyperstacks store all image data, which may exceed 4 GB, - contiguously after the first IFD. Files > 4 GB contain one IFD only. - The size (shape and dtype) of the up to 6-dimensional image data can be - determined from the ImageDescription tag of the first IFD, which is Latin-1 - encoded. Tifffile can read and write ImageJ hyperstacks. -* *OME-TIFF* stores up to 8-dimensional data in one or multiple TIFF of BigTIFF - files. The 8-bit UTF-8 encoded OME-XML metadata found in the ImageDescription - tag of the first IFD defines the position of TIFF IFDs in the high - dimensional data. Tifffile can read OME-TIFF files, except when the OME-XML - metadata is stored in a separate file. -* *LSM* stores all IFDs below 4 GB but wraps around 32-bit StripOffsets. - The StripOffsets of each series and position require separate unwrapping. - The StripByteCounts tag contains the number of bytes for the uncompressed - data. Tifffile can read large LSM files. -* *NDPI* uses some 64-bit offsets in the file header, IFD, and tag structures - and might require correcting 32-bit offsets found in tags. - JPEG compressed tiles with dimensions > 65536 are not readable with libjpeg. - Tifffile can read NDPI files < 4 GB and decompress large JPEG tiles using - the imagecodecs library on Windows. -* *ScanImage* optionally allows corrupt non-BigTIFF files > 2 GB. The values - of StripOffsets and StripByteCounts can be recovered using the constant - differences of the offsets of IFD and tag values throughout the file. - Tifffile can read such files on Python 3 if the image data is stored - contiguously in each page. -* *GeoTIFF* sparse files allow strip or tile offsets and byte counts to be 0. - Such segments are implicitly set to 0 or the NODATA value on reading. - Tifffile can read GeoTIFF sparse files. - -Other libraries for reading scientific TIFF files from Python: - -* `Python-bioformats `_ -* `Imread `_ -* `GDAL `_ -* `OpenSlide-python `_ -* `PyLibTiff `_ -* `SimpleITK `_ -* `PyLSM `_ -* `PyMca.TiffIO.py `_ (same as fabio.TiffIO) -* `BioImageXD.Readers `_ -* `CellCognition `_ -* `pymimage `_ -* `pytiff `_ -* `ScanImageTiffReaderPython - `_ -* `bigtiff `_ - -Some libraries are using tifffile to write OME-TIFF files: - -* `Zeiss Apeer OME-TIFF library - `_ -* `Allen Institute for Cell Science imageio - `_ - -References ----------- -1) TIFF 6.0 Specification and Supplements. Adobe Systems Incorporated. - https://www.adobe.io/open/standards/TIFF.html -2) TIFF File Format FAQ. https://www.awaresystems.be/imaging/tiff/faq.html -3) MetaMorph Stack (STK) Image File Format. - http://mdc.custhelp.com/app/answers/detail/a_id/18862 -4) Image File Format Description LSM 5/7 Release 6.0 (ZEN 2010). - Carl Zeiss MicroImaging GmbH. BioSciences. May 10, 2011 -5) The OME-TIFF format. - https://docs.openmicroscopy.org/ome-model/5.6.4/ome-tiff/ -6) UltraQuant(r) Version 6.0 for Windows Start-Up Guide. - http://www.ultralum.com/images%20ultralum/pdf/UQStart%20Up%20Guide.pdf -7) Micro-Manager File Formats. - https://micro-manager.org/wiki/Micro-Manager_File_Formats -8) Tags for TIFF and Related Specifications. Digital Preservation. - https://www.loc.gov/preservation/digital/formats/content/tiff_tags.shtml -9) ScanImage BigTiff Specification - ScanImage 2016. - http://scanimage.vidriotechnologies.com/display/SI2016/ - ScanImage+BigTiff+Specification -10) CIPA DC-008-2016: Exchangeable image file format for digital still cameras: - Exif Version 2.31. 
- http://www.cipa.jp/std/documents/e/DC-008-Translation-2016-E.pdf -11) ZIF, the Zoomable Image File format. http://zif.photo/ -12) GeoTIFF File Format https://www.gdal.org/frmt_gtiff.html - -Examples --------- -Save a 3D numpy array to a multi-page, 16-bit grayscale TIFF file: - ->>> data = numpy.random.randint(0, 2**12, (4, 301, 219), 'uint16') ->>> imwrite('temp.tif', data, photometric='minisblack') - -Read the whole image stack from the TIFF file as numpy array: - ->>> image_stack = imread('temp.tif') ->>> image_stack.shape -(4, 301, 219) ->>> image_stack.dtype -dtype('uint16') - -Read the image from first page in the TIFF file as numpy array: - ->>> image = imread('temp.tif', key=0) ->>> image.shape -(301, 219) - -Read images from a sequence of TIFF files as numpy array: - ->>> image_sequence = imread(['temp.tif', 'temp.tif']) ->>> image_sequence.shape -(2, 4, 301, 219) - -Save a numpy array to a single-page RGB TIFF file: - ->>> data = numpy.random.randint(0, 255, (256, 256, 3), 'uint8') ->>> imwrite('temp.tif', data, photometric='rgb') - -Save a floating-point array and metadata, using zlib compression: - ->>> data = numpy.random.rand(2, 5, 3, 301, 219).astype('float32') ->>> imwrite('temp.tif', data, compress=6, metadata={'axes': 'TZCYX'}) - -Save a volume with xyz voxel size 2.6755x2.6755x3.9474 µm^3 to an ImageJ file: - ->>> volume = numpy.random.randn(57*256*256).astype('float32') ->>> volume.shape = 1, 57, 1, 256, 256, 1 # dimensions in TZCYXS order ->>> imwrite('temp.tif', volume, imagej=True, resolution=(1./2.6755, 1./2.6755), -... metadata={'spacing': 3.947368, 'unit': 'um'}) - -Get the shape and dtype of the images stored in the TIFF file: - ->>> tif = TiffFile('temp.tif') ->>> len(tif.pages) # number of pages in the file -57 ->>> page = tif.pages[0] # get shape and dtype of the image in the first page ->>> page.shape -(256, 256) ->>> page.dtype -dtype('float32') ->>> page.axes -'YX' ->>> series = tif.series[0] # get shape and dtype of the first image series ->>> series.shape -(57, 256, 256) ->>> series.dtype -dtype('float32') ->>> series.axes -'ZYX' ->>> tif.close() - -Read hyperstack and metadata from the ImageJ file: - ->>> with TiffFile('temp.tif') as tif: -... imagej_hyperstack = tif.asarray() -... imagej_metadata = tif.imagej_metadata ->>> imagej_hyperstack.shape -(57, 256, 256) ->>> imagej_metadata['slices'] -57 - -Read the "XResolution" tag from the first page in the TIFF file: - ->>> with TiffFile('temp.tif') as tif: -... tag = tif.pages[0].tags['XResolution'] ->>> tag.value -(2000, 5351) ->>> tag.name -'XResolution' ->>> tag.code -282 ->>> tag.count -1 ->>> tag.dtype -'2I' ->>> tag.valueoffset -360 - -Read images from a selected range of pages: - ->>> image = imread('temp.tif', key=range(4, 40, 2)) ->>> image.shape -(18, 256, 256) - -Create an empty TIFF file and write to the memory-mapped numpy array: - ->>> memmap_image = memmap('temp.tif', shape=(256, 256), dtype='float32') ->>> memmap_image[255, 255] = 1.0 ->>> memmap_image.flush() ->>> memmap_image.shape, memmap_image.dtype -((256, 256), dtype('float32')) ->>> del memmap_image - -Memory-map image data of the first page in the TIFF file: - ->>> memmap_image = memmap('temp.tif', page=0) ->>> memmap_image[255, 255] -1.0 ->>> del memmap_image - -Successively append images to a BigTIFF file, which can exceed 4 GB: - ->>> data = numpy.random.randint(0, 255, (5, 2, 3, 301, 219), 'uint8') ->>> with TiffWriter('temp.tif', bigtiff=True) as tif: -... for i in range(data.shape[0]): -... 
tif.save(data[i], compress=6, photometric='minisblack') - -Iterate over pages and tags in the TIFF file and successively read images: - ->>> with TiffFile('temp.tif') as tif: -... image_stack = tif.asarray() -... for page in tif.pages: -... for tag in page.tags.values(): -... tag_name, tag_value = tag.name, tag.value -... image = page.asarray() - -Save two image series to a TIFF file: - ->>> data0 = numpy.random.randint(0, 255, (301, 219, 3), 'uint8') ->>> data1 = numpy.random.randint(0, 255, (5, 301, 219), 'uint16') ->>> with TiffWriter('temp.tif') as tif: -... tif.save(data0, compress=6, photometric='rgb') -... tif.save(data1, compress=6, photometric='minisblack', contiguous=False) - -Read the second image series from the TIFF file: - ->>> series1 = imread('temp.tif', series=1) ->>> series1.shape -(5, 301, 219) - -Read an image stack from a series of TIFF files with a file name pattern: - ->>> imwrite('temp_C001T001.tif', numpy.random.rand(64, 64)) ->>> imwrite('temp_C001T002.tif', numpy.random.rand(64, 64)) ->>> image_sequence = TiffSequence('temp_C001*.tif', pattern='axes') ->>> image_sequence.shape -(1, 2) ->>> image_sequence.axes -'CT' ->>> data = image_sequence.asarray() ->>> data.shape -(1, 2, 64, 64) - -""" - -from __future__ import division, print_function - -__version__ = '2019.7.26' -__docformat__ = 'restructuredtext en' -__all__ = ( - 'imwrite', - 'imread', - 'imshow', - 'memmap', - 'lsm2bin', - 'TiffFile', - 'TiffFileError', - 'TiffSequence', - 'TiffWriter', - 'TiffPage', - 'TiffPageSeries', - 'TiffFrame', - 'TiffTag', - 'TIFF', - # utility classes and functions used by oiffile, czifile, etc - 'FileHandle', - 'FileSequence', - 'Timer', - 'lazyattr', - 'natural_sorted', - 'stripnull', - 'transpose_axes', - 'squeeze_axes', - 'create_output', - 'repeat_nd', - 'format_size', - 'astype', - 'product', - 'xml2dict', - 'pformat', - 'str2bytes', - 'nullfunc', - 'update_kwargs', - 'parse_kwargs', - 'askopenfilename', - '_app_show', - # deprecated - 'imsave', - 'decode_lzw', - 'decodelzw', -) - -import sys -import os -import io -import re -import glob -import math -import time -import json -import enum -import struct -import pathlib -import warnings -import binascii -import datetime -import threading -import collections - -try: - from collections.abc import Iterable -except ImportError: - from collections import Iterable - -from concurrent.futures import ThreadPoolExecutor - -import numpy - -try: - import imagecodecs -except ImportError: - import zlib - - try: - import imagecodecs_lite as imagecodecs - except ImportError: - imagecodecs = None - -# delay import of mmap, pprint, fractions, xml, lxml, matplotlib, tkinter, -# logging, subprocess, multiprocessing, tempfile, zipfile, fnmatch - - -def imread(files, **kwargs): - """Return image data from TIFF file(s) as numpy array. - - Refer to the TiffFile and TiffSequence classes and their asarray - functions for documentation. - - Parameters - ---------- - files : str, binary stream, or sequence - File name, seekable binary stream, glob pattern, or sequence of - file names. - kwargs : dict - Parameters 'name', 'offset', 'size', 'multifile', and 'is_ome' - are passed to the TiffFile constructor. - The 'pattern' and 'ioworkers' parameters are passed to the - TiffSequence constructor. - Other parameters are passed to the asarray functions. - The first image series in the file is returned if no arguments are - provided. 
- - """ - kwargs_file = parse_kwargs( - kwargs, - 'is_ome', - 'multifile', - '_useframes', - 'name', - 'offset', - 'size', - # legacy - 'multifile_close', - 'fastij', - 'movie', - ) - kwargs_seq = parse_kwargs(kwargs, 'pattern', 'ioworkers') - - if kwargs.get('pages', None) is not None: - if kwargs.get('key', None) is not None: - raise TypeError( - "the 'pages' and 'key' arguments cannot be used together") - log_warning("imread: the 'pages' argument is deprecated") - kwargs['key'] = kwargs.pop('pages') - - if isinstance(files, basestring) and any(i in files for i in '?*'): - files = glob.glob(files) - if not files: - raise ValueError('no files found') - if not hasattr(files, 'seek') and len(files) == 1: - files = files[0] - - if isinstance(files, basestring) or hasattr(files, 'seek'): - with TiffFile(files, **kwargs_file) as tif: - return tif.asarray(**kwargs) - else: - with TiffSequence(files, **kwargs_seq) as imseq: - return imseq.asarray(**kwargs) - - -def imwrite(file, data=None, shape=None, dtype=None, **kwargs): - """Write numpy array to TIFF file. - - Refer to the TiffWriter class and its asarray function for documentation. - - A BigTIFF file is created if the data size in bytes is larger than 4 GB - minus 32 MB (for metadata), and 'bigtiff' is not specified, and 'imagej' - or 'truncate' are not enabled. - - Parameters - ---------- - file : str or binary stream - File name or writable binary stream, such as an open file or BytesIO. - data : array_like - Input image. The last dimensions are assumed to be image depth, - height, width, and samples. - If None, an empty array of the specified shape and dtype is - saved to file. - Unless 'byteorder' is specified in 'kwargs', the TIFF file byte order - is determined from the data's dtype or the dtype argument. - shape : tuple - If 'data' is None, shape of an empty array to save to the file. - dtype : numpy.dtype - If 'data' is None, datatype of an empty array to save to the file. - kwargs : dict - Parameters 'append', 'byteorder', 'bigtiff', and 'imagej', are passed - to the TiffWriter constructor. Other parameters are passed to the - TiffWriter.save function. - - Returns - ------- - offset, bytecount : tuple or None - If the image data are written contiguously, return offset and bytecount - of image data in the file. - - """ - tifargs = parse_kwargs(kwargs, 'append', 'bigtiff', 'byteorder', 'imagej') - if data is None: - dtype = numpy.dtype(dtype) - size = product(shape) * dtype.itemsize - byteorder = dtype.byteorder - else: - try: - size = data.nbytes - byteorder = data.dtype.byteorder - except Exception: - size = 0 - byteorder = None - bigsize = kwargs.pop('bigsize', 2**32 - 2**25) - if ( - 'bigtiff' not in tifargs - and size > bigsize - and not ( - tifargs.get('imagej', False) or tifargs.get('truncate', False) - ) - ): - tifargs['bigtiff'] = True - if 'byteorder' not in tifargs: - tifargs['byteorder'] = byteorder - - with TiffWriter(file, **tifargs) as tif: - return tif.save(data, shape, dtype, **kwargs) - - -def memmap(filename, shape=None, dtype=None, page=None, series=0, mode='r+', - **kwargs): - """Return memory-mapped numpy array stored in TIFF file. - - Memory-mapping requires data stored in native byte order, without tiling, - compression, predictors, etc. - If 'shape' and 'dtype' are provided, existing files will be overwritten or - appended to depending on the 'append' parameter. - Otherwise the image data of a specified page or series in an existing - file will be memory-mapped. 
By default, the image data of the first page - series is memory-mapped. - Call flush() to write any changes in the array to the file. - Raise ValueError if the image data in the file is not memory-mappable. - - Parameters - ---------- - filename : str - Name of the TIFF file which stores the array. - shape : tuple - Shape of the empty array. - dtype : numpy.dtype - Datatype of the empty array. - page : int - Index of the page which image data to memory-map. - series : int - Index of the page series which image data to memory-map. - mode : {'r+', 'r', 'c'} - The file open mode. Default is to open existing file for reading and - writing ('r+'). - kwargs : dict - Additional parameters passed to imwrite() or TiffFile(). - - """ - if shape is not None and dtype is not None: - # create a new, empty array - kwargs.update( - data=None, - shape=shape, - dtype=dtype, - returnoffset=True, - align=TIFF.ALLOCATIONGRANULARITY - ) - result = imwrite(filename, **kwargs) - if result is None: - # TODO: fail before creating file or writing data - raise ValueError('image data are not memory-mappable') - offset = result[0] - else: - # use existing file - with TiffFile(filename, **kwargs) as tif: - if page is not None: - page = tif.pages[page] - if not page.is_memmappable: - raise ValueError('image data are not memory-mappable') - offset, _ = page.is_contiguous - shape = page.shape - dtype = page.dtype - else: - series = tif.series[series] - if series.offset is None: - raise ValueError('image data are not memory-mappable') - shape = series.shape - dtype = series.dtype - offset = series.offset - dtype = tif.byteorder + dtype.char - return numpy.memmap(filename, dtype, mode, offset, shape, 'C') - - -class lazyattr(object): - """Attribute whose value is computed on first access.""" - - # TODO: help() doesn't work - __slots__ = ('func',) - - def __init__(self, func): - self.func = func - # self.__name__ = func.__name__ - # self.__doc__ = func.__doc__ - # self.lock = threading.RLock() - - def __get__(self, instance, owner): - # with self.lock: - if instance is None: - return self - try: - value = self.func(instance) - except AttributeError as exc: - raise RuntimeError(exc) - if value is NotImplemented: - return getattr(super(owner, instance), self.func.__name__) - setattr(instance, self.func.__name__, value) - return value - - -class TiffFileError(Exception): - """Exception to indicate invalid TIFF structure.""" - - -class TiffWriter(object): - """Write numpy arrays to TIFF file. - - TiffWriter instances must be closed using the 'close' method, which is - automatically called when using the 'with' context manager. - - TiffWriter instances are not thread-safe. - - TiffWriter's main purpose is saving nD numpy array's as TIFF, - not to create any possible TIFF format. Specifically, SubIFDs, ExifIFD, - and GPSIFD tags are not supported. - - """ - - def __init__(self, file, bigtiff=False, byteorder=None, append=False, - imagej=False): - """Open a TIFF file for writing. - - An empty TIFF file is created if the file does not exist, else the - file is overwritten with an empty TIFF file unless 'append' - is true. Use 'bigtiff=True' when creating files larger than 4 GB. - - Parameters - ---------- - file : str, binary stream, or FileHandle - File name or writable binary stream, such as an open file - or BytesIO. - bigtiff : bool - If True, the BigTIFF format is used. - byteorder : {'<', '>', '=', '|'} - The endianness of the data in the file. - By default, this is the system's native byte order. 
- append : bool - If True and 'file' is an existing standard TIFF file, image data - and tags are appended to the file. - Appending data may corrupt specifically formatted TIFF files - such as LSM, STK, ImageJ, or FluoView. - imagej : bool - If True, write an ImageJ hyperstack compatible file. - This format can handle data types uint8, uint16, or float32 and - data shapes up to 6 dimensions in TZCYXS order. - RGB images (S=3 or S=4) must be uint8. - ImageJ's default byte order is big-endian but this implementation - uses the system's native byte order by default. - ImageJ hyperstacks do not support BigTIFF or compression. - The ImageJ file format is undocumented. - When using compression, use ImageJ's Bio-Formats import function. - - """ - if append: - # determine if file is an existing TIFF file that can be extended - try: - with FileHandle(file, mode='rb', size=0) as fh: - pos = fh.tell() - try: - with TiffFile(fh) as tif: - if append != 'force' and not tif.is_appendable: - raise TiffFileError('cannot append to file' - ' containing metadata') - byteorder = tif.byteorder - bigtiff = tif.is_bigtiff - self._ifdoffset = tif.pages.next_page_offset - finally: - fh.seek(pos) - except (IOError, FileNotFoundError): - append = False - - if byteorder in (None, '=', '|'): - byteorder = '<' if sys.byteorder == 'little' else '>' - elif byteorder not in ('<', '>'): - raise ValueError('invalid byteorder %s' % byteorder) - if imagej and bigtiff: - warnings.warn('writing incompatible BigTIFF ImageJ') - - self._byteorder = byteorder - self._imagej = bool(imagej) - self._truncate = False - self._metadata = None - self._colormap = None - - self._descriptionoffset = 0 - self._descriptionlen = 0 - self._descriptionlenoffset = 0 - self._tags = None - self._shape = None # normalized shape of data in consecutive pages - self._datashape = None # shape of data in consecutive pages - self._datadtype = None # data type - self._dataoffset = None # offset to data - self._databytecounts = None # byte counts per plane - self._tagoffsets = None # strip or tile offset tag code - - if bigtiff: - self._bigtiff = True - self._offsetsize = 8 - self._tagsize = 20 - self._tagnoformat = 'Q' - self._offsetformat = 'Q' - self._valueformat = '8s' - else: - self._bigtiff = False - self._offsetsize = 4 - self._tagsize = 12 - self._tagnoformat = 'H' - self._offsetformat = 'I' - self._valueformat = '4s' - - if append: - self._fh = FileHandle(file, mode='r+b', size=0) - self._fh.seek(0, 2) - else: - self._fh = FileHandle(file, mode='wb', size=0) - self._fh.write({'<': b'II', '>': b'MM'}[byteorder]) - if bigtiff: - self._fh.write(struct.pack(byteorder + 'HHH', 43, 8, 0)) - else: - self._fh.write(struct.pack(byteorder + 'H', 42)) - # first IFD - self._ifdoffset = self._fh.tell() - self._fh.write(struct.pack(byteorder + self._offsetformat, 0)) - - def save(self, data=None, shape=None, dtype=None, returnoffset=False, - photometric=None, planarconfig=None, extrasamples=None, tile=None, - contiguous=True, align=16, truncate=False, compress=0, - rowsperstrip=None, predictor=False, subsampling=None, - colormap=None, description=None, datetime=None, resolution=None, - subfiletype=0, software='tifffile.py', metadata={}, - ijmetadata=None, extratags=()): - """Write numpy array and tags to TIFF file. - - The data shape's last dimensions are assumed to be image depth, - height (length), width, and samples. - If a colormap is provided, the data's dtype must be uint8 or uint16 - and the data values are indices into the last dimension of the - colormap. 
- If 'shape' and 'dtype' are specified, an empty array is saved. - This option cannot be used with compression or multiple tiles. - Image data are written uncompressed in one strip per plane by default. - Dimensions larger than 2 to 4 (depending on photometric mode, planar - configuration, and SGI mode) are flattened and saved as separate pages. - The SampleFormat and BitsPerSample tags are derived from the data type. - - Parameters - ---------- - data : numpy.ndarray or None - Input image array. - shape : tuple or None - Shape of the empty array to save. Used only if 'data' is None. - dtype : numpy.dtype or None - Datatype of the empty array to save. Used only if 'data' is None. - returnoffset : bool - If True and the image data in the file is memory-mappable, return - the offset and number of bytes of the image data in the file. - photometric : {'MINISBLACK', 'MINISWHITE', 'RGB', 'PALETTE', 'CFA'} - The color space of the image data. - By default, this setting is inferred from the data shape and the - value of colormap. - For CFA images, DNG tags must be specified in 'extratags'. - planarconfig : {'CONTIG', 'SEPARATE'} - Specifies if samples are stored interleaved or in separate planes. - By default, this setting is inferred from the data shape. - If this parameter is set, extra samples are used to store grayscale - images. - 'CONTIG': last dimension contains samples. - 'SEPARATE': third last dimension contains samples. - extrasamples : tuple of {'UNSPECIFIED', 'ASSOCALPHA', 'UNASSALPHA'} - Defines the interpretation of extra components in pixels. - 'UNSPECIFIED': no transparency information (default). - 'ASSOCALPHA': single, true transparency with pre-multiplied color. - 'UNASSALPHA': independent transparency masks. - tile : tuple of int - The shape ([depth,] length, width) of image tiles to write. - If None (default), image data are written in strips. - The tile length and width must be a multiple of 16. - If the tile depth is provided, the SGI ImageDepth and TileDepth - tags are used to save volume data. - Unless a single tile is used, tiles cannot be used to write - contiguous files. - Few software can read the SGI format, e.g. MeVisLab. - contiguous : bool - If True (default) and the data and parameters are compatible with - previous ones, if any, the image data are stored contiguously after - the previous one. In that case, 'photometric', 'planarconfig', - 'rowsperstrip', are ignored. Metadata such as 'description', - 'metadata', 'datetime', and 'extratags' are written to the first - page of a contiguous series only. - align : int - Byte boundary on which to align the image data in the file. - Default 16. Use mmap.ALLOCATIONGRANULARITY for memory-mapped data. - Following contiguous writes are not aligned. - truncate : bool - If True, only write the first page including shape metadata if - possible (uncompressed, contiguous, not tiled). - Other TIFF readers will only be able to read part of the data. - compress : int or str or (str, int) - If 0 (default), data are written uncompressed. - If 0-9, the level of ADOBE_DEFLATE compression. - If a str, one of TIFF.COMPESSORS, e.g. 'LZMA' or 'ZSTD'. - If a tuple, the first item is one of TIFF.COMPESSORS and the - second item is the compression level. - Compression cannot be used to write contiguous files. - Compressors may require certain data shapes, types or value ranges. - For example, JPEG requires grayscale or RGB(A), uint8 or 12-bit - uint16. JPEG compression is experimental. JPEG markers and TIFF - tags may not match. 
- rowsperstrip : int - The number of rows per strip. By default, strips will be ~64 KB - if compression is enabled, else rowsperstrip is set to the image - length. Bilevel images are always stored in one strip per plane. - predictor : bool - If True, apply horizontal differencing or floating-point predictor - before compression. - subsampling : {(1, 1), (2, 1), (2, 2), (4, 1)} - The horizontal and vertical subsampling factors used for the - chrominance components of images. The default is (2, 2). - Currently applies to JPEG compression of RGB images only. - Images will be stored in YCbCr colorspace. - Segment widths must be a multiple of the horizontal factor. - Segment lengths and rowsperstrip must be a multiple of the vertical - factor. - colormap : numpy.ndarray - RGB color values for the corresponding data value. - Must be of shape (3, 2**(data.itemsize*8)) and dtype uint16. - description : str - The subject of the image. Must be 7-bit ASCII. Cannot be used with - the ImageJ format. Saved with the first page only. - datetime : datetime, str, or bool - Date and time of image creation in '%Y:%m:%d %H:%M:%S' format or - datetime object. Else if True, the current date and time is used. - Saved with the first page only. - resolution : (float, float[, str]) or ((int, int), (int, int)[, str]) - X and Y resolutions in pixels per resolution unit as float or - rational numbers. A third, optional parameter specifies the - resolution unit, which must be None (default for ImageJ), - 'INCH' (default), or 'CENTIMETER'. - subfiletype : int - Bitfield to indicate the kind of data. Set bit 0 if the image - is a reduced-resolution version of another image. Set bit 1 if - the image is part of a multi-page image. Set bit 2 if the image - is transparency mask for another image (photometric must be - MASK, SamplesPerPixel and BitsPerSample must be 1). - software : str - Name of the software used to create the file. Must be 7-bit ASCII. - Saved with the first page only. - metadata : dict - Additional metadata to be saved along with shape information - in JSON or ImageJ formats in an ImageDescription tag. - If None, do not write a second ImageDescription tag. - Strings must be 7-bit ASCII. Saved with the first page only. - ijmetadata : dict - Additional metadata to be saved in application specific - IJMetadata and IJMetadataByteCounts tags. Refer to the - imagej_metadata_tag function for valid keys and values. - Saved with the first page only. - extratags : sequence of tuples - Additional tags as [(code, dtype, count, value, writeonce)]. - - code : int - The TIFF tag Id. - dtype : str - Data type of items in 'value' in Python struct format. - One of B, s, H, I, 2I, b, h, i, 2i, f, d, Q, or q. - count : int - Number of data values. Not used for string or byte string - values. - value : sequence - 'Count' values compatible with 'dtype'. - Byte strings must contain count values of dtype packed as - binary data. - writeonce : bool - If True, the tag is written to the first page only. 
- - """ - # TODO: refactor this function - fh = self._fh - byteorder = self._byteorder - - if data is None: - if compress: - raise ValueError('cannot save compressed empty file') - datashape = shape - datadtype = numpy.dtype(dtype).newbyteorder(byteorder) - datadtypechar = datadtype.char - else: - data = numpy.asarray(data, byteorder + data.dtype.char, 'C') - if data.size == 0: - raise ValueError('cannot save empty array') - datashape = data.shape - datadtype = data.dtype - datadtypechar = data.dtype.char - - returnoffset = returnoffset and datadtype.isnative - bilevel = datadtypechar == '?' - if bilevel: - index = -1 if datashape[-1] > 1 else -2 - datasize = product(datashape[:index]) - if datashape[index] % 8: - datasize *= datashape[index] // 8 + 1 - else: - datasize *= datashape[index] // 8 - else: - datasize = product(datashape) * datadtype.itemsize - - # just append contiguous data if possible - self._truncate = bool(truncate) - if self._datashape: - if ( - not contiguous - or self._datashape[1:] != datashape - or self._datadtype != datadtype - or (compress and self._tags) - or tile - or not numpy.array_equal(colormap, self._colormap) - ): - # incompatible shape, dtype, compression mode, or colormap - self._write_remaining_pages() - self._write_image_description() - self._truncate = False - self._descriptionoffset = 0 - self._descriptionlenoffset = 0 - self._datashape = None - self._colormap = None - if self._imagej: - raise ValueError( - 'ImageJ does not support non-contiguous data') - else: - # consecutive mode - self._datashape = (self._datashape[0] + 1,) + datashape - if not compress: - # write contiguous data, write IFDs/tags later - offset = fh.tell() - if data is None: - fh.write_empty(datasize) - else: - fh.write_array(data) - if returnoffset: - return offset, datasize - return None - - input_shape = datashape - tagnoformat = self._tagnoformat - valueformat = self._valueformat - offsetformat = self._offsetformat - offsetsize = self._offsetsize - tagsize = self._tagsize - - MINISBLACK = TIFF.PHOTOMETRIC.MINISBLACK - MINISWHITE = TIFF.PHOTOMETRIC.MINISWHITE - RGB = TIFF.PHOTOMETRIC.RGB - CFA = TIFF.PHOTOMETRIC.CFA - PALETTE = TIFF.PHOTOMETRIC.PALETTE - CONTIG = TIFF.PLANARCONFIG.CONTIG - SEPARATE = TIFF.PLANARCONFIG.SEPARATE - - # parse input - if photometric is not None: - photometric = enumarg(TIFF.PHOTOMETRIC, photometric) - if planarconfig: - planarconfig = enumarg(TIFF.PLANARCONFIG, planarconfig) - if extrasamples is None: - extrasamples_ = None - else: - extrasamples_ = tuple( - enumarg(TIFF.EXTRASAMPLE, es) for es in sequence(extrasamples) - ) - if not compress: - compress = False - compresstag = 1 - # TODO: support predictors without compression - predictor = False - predictortag = 1 - else: - if isinstance(compress, (tuple, list)): - compress, compresslevel = compress - elif isinstance(compress, int): - compress, compresslevel = 'ADOBE_DEFLATE', int(compress) - if not 0 <= compresslevel <= 9: - raise ValueError('invalid compression level %s' % compress) - else: - compresslevel = None - compress = compress.upper() - compresstag = enumarg(TIFF.COMPRESSION, compress) - - if predictor: - if compresstag == 7: - predictor = False # disable predictor for lossy compression - elif datadtype.kind in 'iu': - predictortag = 2 - predictor = TIFF.PREDICTORS[2] - elif datadtype.kind == 'f': - predictortag = 3 - predictor = TIFF.PREDICTORS[3] - else: - raise ValueError('cannot apply predictor to %s' % datadtype) - - # prepare ImageJ format - if self._imagej: - # if predictor or 
compress: - # warnings.warn( - # 'ImageJ cannot handle predictors or compression') - if description: - warnings.warn('not writing description to ImageJ file') - description = None - volume = False - if datadtypechar not in 'BHhf': - raise ValueError( - 'ImageJ does not support data type %s' % datadtypechar) - ijrgb = photometric == RGB if photometric else None - if datadtypechar not in 'B': - ijrgb = False - ijshape = imagej_shape(datashape, ijrgb) - if ijshape[-1] in (3, 4): - photometric = RGB - if datadtypechar not in 'B': - raise ValueError( - 'ImageJ does not support data type %s for RGB' - % datadtypechar) - elif photometric is None: - photometric = MINISBLACK - planarconfig = None - if planarconfig == SEPARATE: - raise ValueError('ImageJ does not support planar images') - planarconfig = CONTIG if ijrgb else None - - # verify colormap and indices - if colormap is not None: - if datadtypechar not in 'BH': - raise ValueError('invalid data dtype for palette mode') - colormap = numpy.asarray(colormap, dtype=byteorder + 'H') - if colormap.shape != (3, 2**(datadtype.itemsize * 8)): - raise ValueError('invalid color map shape') - self._colormap = colormap - - # verify tile shape - if tile: - tile = tuple(int(i) for i in tile[:3]) - volume = len(tile) == 3 - if ( - len(tile) < 2 - or tile[-1] % 16 - or tile[-2] % 16 - or any(i < 1 for i in tile) - ): - raise ValueError('invalid tile shape') - else: - tile = () - volume = False - - # normalize data shape to 5D or 6D, depending on volume: - # (pages, planar_samples, [depth,] height, width, contig_samples) - datashape = reshape_nd(datashape, 3 if photometric == RGB else 2) - shape = datashape - ndim = len(datashape) - - samplesperpixel = 1 - extrasamples = 0 - if volume and ndim < 3: - volume = False - if colormap is not None: - photometric = PALETTE - planarconfig = None - if photometric is None: - photometric = MINISBLACK - if bilevel: - photometric = MINISWHITE - elif planarconfig == CONTIG: - if ndim > 2 and shape[-1] in (3, 4): - photometric = RGB - elif planarconfig == SEPARATE: - if volume and ndim > 3 and shape[-4] in (3, 4): - photometric = RGB - elif ndim > 2 and shape[-3] in (3, 4): - photometric = RGB - elif ndim > 2 and shape[-1] in (3, 4): - photometric = RGB - elif self._imagej: - photometric = MINISBLACK - elif volume and ndim > 3 and shape[-4] in (3, 4): - photometric = RGB - elif ndim > 2 and shape[-3] in (3, 4): - photometric = RGB - if planarconfig and len(shape) <= (3 if volume else 2): - planarconfig = None - if photometric not in (0, 1, 3, 4): - photometric = MINISBLACK - if photometric == RGB: - if len(shape) < 3: - raise ValueError('not a RGB(A) image') - if len(shape) < 4: - volume = False - if planarconfig is None: - if shape[-1] in (3, 4): - planarconfig = CONTIG - elif shape[-4 if volume else -3] in (3, 4): - planarconfig = SEPARATE - elif shape[-1] > shape[-4 if volume else -3]: - planarconfig = SEPARATE - else: - planarconfig = CONTIG - if planarconfig == CONTIG: - datashape = (-1, 1) + shape[(-4 if volume else -3):] - samplesperpixel = datashape[-1] - else: - datashape = (-1,) + shape[(-4 if volume else -3):] + (1,) - samplesperpixel = datashape[1] - if samplesperpixel > 3: - extrasamples = samplesperpixel - 3 - elif photometric == CFA: - if len(shape) != 2: - raise ValueError('invalid CFA image') - volume = False - planarconfig = None - datashape = (-1, 1) + shape[-2:] + (1,) - if 50706 not in (et[0] for et in extratags): - raise ValueError('must specify DNG tags for CFA image') - elif planarconfig and 
len(shape) > (3 if volume else 2): - if planarconfig == CONTIG: - datashape = (-1, 1) + shape[(-4 if volume else -3):] - samplesperpixel = datashape[-1] - else: - datashape = (-1,) + shape[(-4 if volume else -3):] + (1,) - samplesperpixel = datashape[1] - extrasamples = samplesperpixel - 1 - else: - planarconfig = None - while len(shape) > 2 and shape[-1] == 1: - shape = shape[:-1] # remove trailing 1s - if len(shape) < 3: - volume = False - if extrasamples_ is None: - datashape = (-1, 1) + shape[(-3 if volume else -2):] + (1,) - else: - datashape = (-1, 1) + shape[(-4 if volume else -3):] - samplesperpixel = datashape[-1] - extrasamples = samplesperpixel - 1 - - if subfiletype & 0b100: - # FILETYPE_MASK - if not ( - bilevel - and samplesperpixel == 1 - and photometric in (0, 1, 4) - ): - raise ValueError('invalid SubfileType MASK') - photometric = TIFF.PHOTOMETRIC.MASK - - if bilevel: - bitspersample = 1 - elif compresstag == 7 and datadtype == 'uint16': - bitspersample = 12 # use 12-bit JPEG compression - else: - bitspersample = datadtype.itemsize * 8 - - # normalize shape to 6D - if len(datashape) not in (5, 6): - raise RuntimeError('len(datashape) not in (5, 6)') - if len(datashape) == 5: - datashape = datashape[:2] + (1,) + datashape[2:] - if datashape[0] == -1: - s0 = product(input_shape) // product(datashape[1:]) - datashape = (s0,) + datashape[1:] - shape = datashape - if data is not None: - data = data.reshape(shape) - - if photometric == PALETTE: - if ( - samplesperpixel != 1 - or extrasamples - or shape[1] != 1 - or shape[-1] != 1 - ): - raise ValueError('invalid data shape for palette mode') - - if photometric == RGB and samplesperpixel == 2: - raise ValueError('not a RGB image (samplesperpixel=2)') - - if bilevel: - if compresstag not in (1, 32773): - raise ValueError('cannot compress bilevel image') - if tile: - raise ValueError('cannot save tiled bilevel image') - if photometric not in (0, 1, 4): - raise ValueError('cannot save bilevel image as %s' - % str(photometric)) - datashape = list(datashape) - if datashape[-2] % 8: - datashape[-2] = datashape[-2] // 8 + 1 - else: - datashape[-2] = datashape[-2] // 8 - datashape = tuple(datashape) - if datasize != product(datashape): - raise RuntimeError('datasize != product(datashape)') - if data is not None: - data = numpy.packbits(data, axis=-2) - if datashape[-2] != data.shape[-2]: - raise RuntimeError('datashape[-2] != data.shape[-2]') - - tags = [] # list of (code, ifdentry, ifdvalue, writeonce) - - strip_or_tile = 'Tile' if tile else 'Strip' - tagbytecounts = TIFF.TAG_NAMES[strip_or_tile + 'ByteCounts'] - tagoffsets = TIFF.TAG_NAMES[strip_or_tile + 'Offsets'] - self._tagoffsets = tagoffsets - - def pack(fmt, *val): - return struct.pack(byteorder + fmt, *val) - - def addtag(code, dtype, count, value, writeonce=False): - # Compute ifdentry & ifdvalue bytes from code, dtype, count, value - # Append (code, ifdentry, ifdvalue, writeonce) to tags list - code = int(TIFF.TAG_NAMES.get(code, code)) - try: - tifftype = TIFF.DATA_DTYPES[dtype] - except KeyError: - raise ValueError('unknown dtype %s' % dtype) - rawcount = count - - if dtype == 's': - # strings; enforce 7-bit ASCII on unicode strings - value = bytestr(value, 'ascii') + b'\0' - count = rawcount = len(value) - rawcount = value.find(b'\0\0') - if rawcount < 0: - rawcount = count - else: - rawcount += 1 # length of string without buffer - value = (value,) - elif isinstance(value, bytes): - # packed binary data - dtsize = struct.calcsize(dtype) - if len(value) % dtsize: - raise 
ValueError('invalid packed binary data') - count = len(value) // dtsize - if len(dtype) > 1: - count *= int(dtype[:-1]) - dtype = dtype[-1] - ifdentry = [pack('HH', code, tifftype), - pack(offsetformat, rawcount)] - ifdvalue = None - if struct.calcsize(dtype) * count <= offsetsize: - # value(s) can be written directly - if isinstance(value, bytes): - ifdentry.append(pack(valueformat, value)) - elif count == 1: - if isinstance(value, (tuple, list, numpy.ndarray)): - value = value[0] - ifdentry.append(pack(valueformat, pack(dtype, value))) - else: - ifdentry.append(pack(valueformat, - pack(str(count) + dtype, *value))) - else: - # use offset to value(s) - ifdentry.append(pack(offsetformat, 0)) - if isinstance(value, bytes): - ifdvalue = value - elif isinstance(value, numpy.ndarray): - if value.size != count: - raise RuntimeError('value.size != count') - if value.dtype.char != dtype: - raise RuntimeError('value.dtype.char != dtype') - ifdvalue = value.tostring() - elif isinstance(value, (tuple, list)): - ifdvalue = pack(str(count) + dtype, *value) - else: - ifdvalue = pack(dtype, value) - tags.append((code, b''.join(ifdentry), ifdvalue, writeonce)) - - def rational(arg, max_denominator=1000000): - """"Return nominator and denominator from float or two integers.""" - from fractions import Fraction # delayed import - try: - f = Fraction.from_float(arg) - except TypeError: - f = Fraction(arg[0], arg[1]) - f = f.limit_denominator(max_denominator) - return f.numerator, f.denominator - - if description: - # user provided description - addtag('ImageDescription', 's', 0, description, writeonce=True) - - # write shape and metadata to ImageDescription - self._metadata = {} if not metadata else metadata.copy() - if self._imagej: - description = imagej_description( - input_shape, - shape[-1] in (3, 4), - self._colormap is not None, - **self._metadata) - elif metadata or metadata == {}: - if self._truncate: - self._metadata.update(truncated=True) - description = json_description(input_shape, **self._metadata) - # elif metadata is None and self._truncate: - # raise ValueError('cannot truncate without writing metadata') - else: - description = None - if description: - # add 64 bytes buffer - # the image description might be updated later with the final shape - description = str2bytes(description, 'ascii') - description += b'\0' * 64 - self._descriptionlen = len(description) - addtag('ImageDescription', 's', 0, description, writeonce=True) - - if software: - addtag('Software', 's', 0, software, writeonce=True) - if datetime: - if isinstance(datetime, str): - if len(datetime) != 19 or datetime[16] != ':': - raise ValueError('invalid datetime string') - else: - try: - datetime = datetime.strftime('%Y:%m:%d %H:%M:%S') - except AttributeError: - datetime = self._now().strftime('%Y:%m:%d %H:%M:%S') - addtag('DateTime', 's', 0, datetime, writeonce=True) - addtag('Compression', 'H', 1, compresstag) - if predictor: - addtag('Predictor', 'H', 1, predictortag) - addtag('ImageWidth', 'I', 1, shape[-2]) - addtag('ImageLength', 'I', 1, shape[-3]) - if tile: - addtag('TileWidth', 'I', 1, tile[-1]) - addtag('TileLength', 'I', 1, tile[-2]) - if volume: - addtag('ImageDepth', 'I', 1, shape[-4]) - addtag('TileDepth', 'I', 1, tile[0]) - addtag('NewSubfileType', 'I', 1, subfiletype) - if not bilevel and not datadtype.kind == 'u': - sampleformat = {'u': 1, 'i': 2, 'f': 3, 'c': 6}[datadtype.kind] - addtag('SampleFormat', 'H', samplesperpixel, - (sampleformat,) * samplesperpixel) - if colormap is not None: - addtag('ColorMap', 
'H', colormap.size, colormap) - addtag('SamplesPerPixel', 'H', 1, samplesperpixel) - if bilevel: - pass - elif planarconfig and samplesperpixel > 1: - addtag('PlanarConfiguration', 'H', 1, planarconfig.value) - addtag('BitsPerSample', 'H', samplesperpixel, - (bitspersample,) * samplesperpixel) - else: - addtag('BitsPerSample', 'H', 1, bitspersample) - if extrasamples: - if extrasamples_ is not None: - if extrasamples != len(extrasamples_): - raise ValueError('wrong number of extrasamples specified') - addtag('ExtraSamples', 'H', extrasamples, extrasamples_) - elif photometric == RGB and extrasamples == 1: - # Unassociated alpha channel - addtag('ExtraSamples', 'H', 1, 2) - else: - # Unspecified alpha channel - addtag('ExtraSamples', 'H', extrasamples, (0,) * extrasamples) - - if compresstag == 7 and photometric == RGB and planarconfig == 1: - # JPEG compression with subsampling. Store as YCbCr - # TODO: use JPEGTables for multiple tiles or strips - if subsampling is None: - subsampling = (2, 2) - elif subsampling not in ((1, 1), (2, 1), (2, 2), (4, 1)): - raise ValueError('invalid subsampling factors') - maxsampling = max(subsampling) * 8 - if tile and (tile[-1] % maxsampling or tile[-2] % maxsampling): - raise ValueError('tile shape not a multiple of %i' - % maxsampling) - if extrasamples > 1: - raise ValueError('JPEG subsampling requires RGB(A) images') - addtag('YCbCrSubSampling', 'H', 2, subsampling) - addtag('PhotometricInterpretation', 'H', 1, 6) # YCBCR - else: - if subsampling not in (None, (1, 1)): - log_warning('cannot apply subsampling') - subsampling = None - maxsampling = 1 - addtag('PhotometricInterpretation', 'H', 1, photometric.value) - if compresstag == 7: - addtag('YCbCrSubSampling', 'H', 2, (1, 1)) - - if resolution is not None: - addtag('XResolution', '2I', 1, rational(resolution[0])) - addtag('YResolution', '2I', 1, rational(resolution[1])) - if len(resolution) > 2: - unit = resolution[2] - unit = 1 if unit is None else enumarg(TIFF.RESUNIT, unit) - elif self._imagej: - unit = 1 - else: - unit = 2 - addtag('ResolutionUnit', 'H', 1, unit) - elif not self._imagej: - addtag('XResolution', '2I', 1, (1, 1)) - addtag('YResolution', '2I', 1, (1, 1)) - addtag('ResolutionUnit', 'H', 1, 1) - if ijmetadata: - for t in imagej_metadata_tag(ijmetadata, byteorder): - addtag(*t) - - def bytecount_format(bytecounts, compress=compress, size=offsetsize): - """Return bytecount format.""" - if len(bytecounts) == 1: - return {4: 'I', 8: 'Q'}[size] - bytecount = bytecounts[0] - if compress: - bytecount = bytecount * 10 - if bytecount < 2**16: - return 'H' - if bytecount < 2**32: - return 'I' - if size == 4: - return 'I' - return 'Q' - - contiguous = not compress - if tile: - # one chunk per tile per plane - if len(tile) == 3: - tiles = ( - (shape[2] + tile[0] - 1) // tile[0], - (shape[3] + tile[1] - 1) // tile[1], - (shape[4] + tile[2] - 1) // tile[2], - ) - else: - tiles = ( - (shape[3] + tile[0] - 1) // tile[0], - (shape[4] + tile[1] - 1) // tile[1], - ) - numtiles = product(tiles) * shape[1] - databytecounts = [ - product(tile) * shape[-1] * datadtype.itemsize] * numtiles - bytecountformat = bytecount_format(databytecounts) - addtag(tagbytecounts, bytecountformat, numtiles, databytecounts) - addtag(tagoffsets, offsetformat, numtiles, [0] * numtiles) - contiguous = contiguous and product(tiles) == 1 - if not contiguous: - # allocate tile buffer - chunk = numpy.empty(tile + (shape[-1],), dtype=datadtype) - bytecountformat = bytecountformat * numtiles - elif contiguous and (bilevel or 
rowsperstrip is None): - # one strip per plane - if bilevel: - databytecounts = [product(datashape[2:])] * shape[1] - else: - databytecounts = [ - product(datashape[2:]) * datadtype.itemsize] * shape[1] - bytecountformat = bytecount_format(databytecounts) - addtag(tagbytecounts, bytecountformat, shape[1], databytecounts) - addtag(tagoffsets, offsetformat, shape[1], [0] * shape[1]) - addtag('RowsPerStrip', 'I', 1, shape[-3]) - bytecountformat = bytecountformat * shape[1] - else: - # use rowsperstrip - rowsize = product(shape[-2:]) * datadtype.itemsize - if rowsperstrip is None: - # compress ~64 KB chunks by default - rowsperstrip = 65536 // rowsize if compress else shape[-3] - if rowsperstrip < 1: - rowsperstrip = maxsampling - elif rowsperstrip > shape[-3]: - rowsperstrip = shape[-3] - elif subsampling and rowsperstrip % maxsampling: - rowsperstrip = (math.ceil(rowsperstrip / maxsampling) * - maxsampling) - addtag('RowsPerStrip', 'I', 1, rowsperstrip) - - numstrips1 = (shape[-3] + rowsperstrip - 1) // rowsperstrip - numstrips = numstrips1 * shape[1] - # TODO: save bilevel data with rowsperstrip - stripsize = rowsperstrip * rowsize - databytecounts = [stripsize] * numstrips - stripsize -= rowsize * (numstrips1 * rowsperstrip - shape[-3]) - for i in range(numstrips1 - 1, numstrips, numstrips1): - databytecounts[i] = stripsize - bytecountformat = bytecount_format(databytecounts) - addtag(tagbytecounts, bytecountformat, numstrips, databytecounts) - addtag(tagoffsets, offsetformat, numstrips, [0] * numstrips) - bytecountformat = bytecountformat * numstrips - - if data is None and not contiguous: - raise ValueError('cannot write non-contiguous empty file') - - # add extra tags from user - for t in extratags: - addtag(*t) - - # define compress function - if compress: - compressor = TIFF.COMPESSORS[compresstag] - if predictor: - - def compress(data, compressor=compressor, level=compresslevel): - data = predictor(data, axis=-2) - return compressor(data, level) - - elif subsampling: - # JPEG with subsampling. 
Store RGB as YCbCr - # TODO: use JPEGTables for multiple tiles or strips - def compress(data, compressor=compressor, level=compresslevel, - subsampling=subsampling): - return compressor(data, level, subsampling=subsampling, - colorspace=2, outcolorspace=3) - - else: - - def compress(data, compressor=compressor, level=compresslevel): - return compressor(data, level) - - # TODO: check TIFFReadDirectoryCheckOrder warning in files containing - # multiple tags of same code - # the entries in an IFD must be sorted in ascending order by tag code - tags = sorted(tags, key=lambda x: x[0]) - - fhpos = fh.tell() - if ( - not (self._bigtiff or self._imagej) - and fhpos + datasize > 2**32 - 1 - ): - raise ValueError('data too large for standard TIFF file') - - # if not compressed or multi-tiled, write the first IFD and then - # all data contiguously; else, write all IFDs and data interleaved - for pageindex in range(1 if contiguous else shape[0]): - - ifdpos = fhpos - if ifdpos % 2: - # location of IFD must begin on a word boundary - fh.write(b'\0') - ifdpos += 1 - - # update pointer at ifdoffset - fh.seek(self._ifdoffset) - fh.write(pack(offsetformat, ifdpos)) - fh.seek(ifdpos) - - # create IFD in memory - if pageindex < 2: - ifd = io.BytesIO() - ifd.write(pack(tagnoformat, len(tags))) - tagoffset = ifd.tell() - ifd.write(b''.join(t[1] for t in tags)) - ifdoffset = ifd.tell() - ifd.write(pack(offsetformat, 0)) # offset to next IFD - # write tag values and patch offsets in ifdentries - for tagindex, tag in enumerate(tags): - offset = tagoffset + tagindex * tagsize + offsetsize + 4 - code = tag[0] - value = tag[2] - if value: - pos = ifd.tell() - if pos % 2: - # tag value is expected to begin on word boundary - ifd.write(b'\0') - pos += 1 - ifd.seek(offset) - ifd.write(pack(offsetformat, ifdpos + pos)) - ifd.seek(pos) - ifd.write(value) - if code == tagoffsets: - dataoffsetsoffset = offset, pos - elif code == tagbytecounts: - databytecountsoffset = offset, pos - elif code == 270 and value.endswith(b'\0\0\0\0'): - # image description buffer - self._descriptionoffset = ifdpos + pos - self._descriptionlenoffset = ( - ifdpos + tagoffset + tagindex * tagsize + 4) - elif code == tagoffsets: - dataoffsetsoffset = offset, None - elif code == tagbytecounts: - databytecountsoffset = offset, None - ifdsize = ifd.tell() - if ifdsize % 2: - ifd.write(b'\0') - ifdsize += 1 - - # write IFD later when strip/tile bytecounts and offsets are known - fh.seek(ifdsize, 1) - - # write image data - dataoffset = fh.tell() - skip = align - dataoffset % align - fh.seek(skip, 1) - dataoffset += skip - if contiguous: - if data is None: - fh.write_empty(datasize) - else: - fh.write_array(data) - elif tile: - # TODO: refactor this - # TODO: use multithreading and chunk buffer? 
- if data is None: - fh.write_empty(numtiles * databytecounts[0]) - elif len(tile) == 3: - stripindex = 0 - for plane in data[pageindex]: - for tz in range(tiles[0]): - for ty in range(tiles[1]): - for tx in range(tiles[2]): - c0 = min(tile[0], shape[2] - tz * tile[0]) - c1 = min(tile[1], shape[3] - ty * tile[1]) - c2 = min(tile[2], shape[4] - tx * tile[2]) - chunk[c0:, c1:, c2:] = 0 - chunk[:c0, :c1, :c2] = plane[ - tz * tile[0]: tz * tile[0] + c0, - ty * tile[1]: ty * tile[1] + c1, - tx * tile[2]: tx * tile[2] + c2, - ] - if compress: - t = compress(chunk) - fh.write(t) - databytecounts[stripindex] = len(t) - stripindex += 1 - else: - fh.write_array(chunk) - # fh.flush() - else: - stripindex = 0 - for plane in data[pageindex]: - for ty in range(tiles[0]): - for tx in range(tiles[1]): - c1 = min(tile[0], shape[3] - ty * tile[0]) - c2 = min(tile[1], shape[4] - tx * tile[1]) - chunk[c1:, c2:] = 0 - chunk[:c1, :c2] = plane[ - 0, - ty * tile[0]: ty * tile[0] + c1, - tx * tile[1]: tx * tile[1] + c2, - ] - if compress: - t = compress(chunk) - fh.write(t) - databytecounts[stripindex] = len(t) - stripindex += 1 - else: - fh.write_array(chunk) - # fh.flush() - elif compress: - # write one strip per rowsperstrip - if data.shape[2] != 1: - # not handling depth - raise RuntimeError('data.shape[2] != 1') - numstrips = (shape[-3] + rowsperstrip - 1) // rowsperstrip - stripindex = 0 - for plane in data[pageindex]: - for i in range(numstrips): - strip = plane[ - 0, - i * rowsperstrip: (i + 1) * rowsperstrip - ] - strip = compress(strip) - fh.write(strip) - databytecounts[stripindex] = len(strip) - stripindex += 1 - else: - fh.write_array(data[pageindex]) - - # update strip/tile offsets - offset, pos = dataoffsetsoffset - ifd.seek(offset) - if pos: - ifd.write(pack(offsetformat, ifdpos + pos)) - ifd.seek(pos) - offset = dataoffset - for size in databytecounts: - ifd.write(pack(offsetformat, offset)) - offset += size - else: - ifd.write(pack(offsetformat, dataoffset)) - - if compress: - # update strip/tile bytecounts - offset, pos = databytecountsoffset - ifd.seek(offset) - if pos: - ifd.write(pack(offsetformat, ifdpos + pos)) - ifd.seek(pos) - ifd.write(pack(bytecountformat, *databytecounts)) - - fhpos = fh.tell() - fh.seek(ifdpos) - fh.write(iogetbuffer(ifd)) - fh.flush() - fh.seek(fhpos) - - self._ifdoffset = ifdpos + ifdoffset - - # remove tags that should be written only once - if pageindex == 0: - tags = [tag for tag in tags if not tag[-1]] - - self._shape = shape - self._datashape = (1,) + input_shape - self._datadtype = datadtype - self._dataoffset = dataoffset - self._databytecounts = databytecounts - - if contiguous: - # write remaining IFDs/tags later - self._tags = tags - # return offset and size of image data - if returnoffset: - return dataoffset, sum(databytecounts) - return None - - def _write_remaining_pages(self): - """Write outstanding IFDs and tags to file.""" - if not self._tags or self._truncate: - return - - pageno = self._shape[0] * self._datashape[0] - 1 - if pageno < 1: - self._tags = None - self._datadtype = None - self._dataoffset = None - self._databytecounts = None - return - - fh = self._fh - fhpos = fh.tell() - if fhpos % 2: - fh.write(b'\0') - fhpos += 1 - - pack = struct.pack - offsetformat = self._byteorder + self._offsetformat - offsetsize = self._offsetsize - tagnoformat = self._byteorder + self._tagnoformat - tagsize = self._tagsize - dataoffset = self._dataoffset - pagedatasize = sum(self._databytecounts) - - # construct template IFD in memory - # need to patch 
offsets to next IFD and data before writing to file - ifd = io.BytesIO() - ifd.write(pack(tagnoformat, len(self._tags))) - tagoffset = ifd.tell() - ifd.write(b''.join(t[1] for t in self._tags)) - ifdoffset = ifd.tell() - ifd.write(pack(offsetformat, 0)) # offset to next IFD - # tag values - for tagindex, tag in enumerate(self._tags): - offset = tagoffset + tagindex * tagsize + offsetsize + 4 - code = tag[0] - value = tag[2] - if value: - pos = ifd.tell() - if pos % 2: - # tag value is expected to begin on word boundary - ifd.write(b'\0') - pos += 1 - ifd.seek(offset) - try: - ifd.write(pack(offsetformat, fhpos + pos)) - except Exception: # struct.error - if self._imagej: - warnings.warn('truncating ImageJ file') - self._truncate = True - return - raise ValueError('data too large for non-BigTIFF file') - ifd.seek(pos) - ifd.write(value) - if code == self._tagoffsets: - # save strip/tile offsets for later updates - dataoffsetsoffset = offset, pos - elif code == self._tagoffsets: - dataoffsetsoffset = offset, None - - ifdsize = ifd.tell() - if ifdsize % 2: - ifd.write(b'\0') - ifdsize += 1 - - # check if all IFDs fit in file - if not self._bigtiff and fhpos + ifdsize * pageno > 2**32 - 32: - if self._imagej: - warnings.warn('truncating ImageJ file') - self._truncate = True - return - raise ValueError('data too large for non-BigTIFF file') - - # assemble IFD chain in memory from IFD template - ifds = io.BytesIO(bytes(ifdsize * pageno)) - ifdpos = fhpos - for _ in range(pageno): - # update strip/tile offsets in IFD - dataoffset += pagedatasize # offset to image data - offset, pos = dataoffsetsoffset - ifd.seek(offset) - if pos: - ifd.write(pack(offsetformat, ifdpos + pos)) - ifd.seek(pos) - offset = dataoffset - for size in self._databytecounts: - ifd.write(pack(offsetformat, offset)) - offset += size - else: - ifd.write(pack(offsetformat, dataoffset)) - # update pointer at ifdoffset to point to next IFD in file - ifdpos += ifdsize - ifd.seek(ifdoffset) - ifd.write(pack(offsetformat, ifdpos)) - # write IFD entry - ifds.write(iogetbuffer(ifd)) - - # terminate IFD chain - ifdoffset += ifdsize * (pageno - 1) - ifds.seek(ifdoffset) - ifds.write(pack(offsetformat, 0)) - # write IFD chain to file - fh.write(iogetbuffer(ifds)) - # update file to point to new IFD chain - pos = fh.tell() - fh.seek(self._ifdoffset) - fh.write(pack(offsetformat, fhpos)) - fh.flush() - fh.seek(pos) - - self._ifdoffset = fhpos + ifdoffset - self._tags = None - self._datadtype = None - self._dataoffset = None - self._databytecounts = None - # do not reset _shape or _datashape - - def _write_image_description(self): - """Write metadata to ImageDescription tag.""" - if ( - not self._datashape - or self._datashape[0] == 1 - or self._descriptionoffset <= 0 - ): - return - - colormapped = self._colormap is not None - if self._imagej: - isrgb = self._shape[-1] in (3, 4) - description = imagej_description( - self._datashape, isrgb, colormapped, **self._metadata) - else: - description = json_description(self._datashape, **self._metadata) - - # rewrite description and its length to file - description = description.encode('utf-8') - description = description[:self._descriptionlen - 1] - pos = self._fh.tell() - self._fh.seek(self._descriptionoffset) - self._fh.write(description) - self._fh.seek(self._descriptionlenoffset) - self._fh.write(struct.pack(self._byteorder + self._offsetformat, - len(description) + 1)) - self._fh.seek(pos) - - self._descriptionoffset = 0 - self._descriptionlenoffset = 0 - self._descriptionlen = 0 - - def 
_now(self): - """Return current date and time.""" - return datetime.datetime.now() - - def close(self): - """Write remaining pages and close file handle.""" - if not self._truncate: - self._write_remaining_pages() - self._write_image_description() - self._fh.close() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - - -class TiffFile(object): - """Read image and metadata from TIFF file. - - TiffFile instances must be closed using the 'close' method, which is - automatically called when using the 'with' context manager. - - TiffFile instances are not thread-safe. - - Attributes - ---------- - pages : TiffPages - Sequence of TIFF pages in file. - series : list of TiffPageSeries - Sequences of closely related TIFF pages. These are computed - from OME, LSM, ImageJ, etc. metadata or based on similarity - of page properties such as shape, dtype, and compression. - is_flag : bool - If True, file is of a certain format. - Flags are: bigtiff, uniform, shaped, ome, imagej, stk, lsm, fluoview, - nih, vista, micromanager, metaseries, mdgel, mediacy, tvips, fei, - sem, scn, svs, scanimage, andor, epics, ndpi, pilatus, qpi. - - All attributes are read-only. - - """ - - def __init__(self, arg, name=None, offset=None, size=None, - multifile=True, _useframes=None, **kwargs): - """Initialize instance from file. - - Parameters - ---------- - arg : str or open file - Name of file or open file object. - The file objects are closed in TiffFile.close(). - name : str - Optional name of file in case 'arg' is a file handle. - offset : int - Optional start position of embedded file. By default, this is - the current file position. - size : int - Optional size of embedded file. By default, this is the number - of bytes from the 'offset' to the end of the file. - multifile : bool - If True (default), series may include pages from multiple files. - Currently applies to OME-TIFF only. - kwargs : bool - 'is_ome': If False, disable processing of OME-XML metadata. 
- - """ - if kwargs: - for key in ('movie', 'fastij', 'multifile_close'): - if key in kwargs: - del kwargs[key] - log_warning("TiffFile: the '%s' argument is ignored", key) - if 'pages' in kwargs: - raise TypeError( - "the TiffFile 'pages' argument is no longer supported.\n\n" - "Use TiffFile.asarray(key=[...]) to read image data " - "from specific pages.\n") - - for key, value in kwargs.items(): - if key[:3] == 'is_' and key[3:] in TIFF.FILE_FLAGS: - if value is not None and not value: - setattr(self, key, bool(value)) - else: - raise TypeError('unexpected keyword argument: %s' % key) - - fh = FileHandle(arg, mode='rb', name=name, offset=offset, size=size) - self._fh = fh - self._multifile = bool(multifile) - self._files = {fh.name: self} # cache of TiffFiles - try: - fh.seek(0) - header = fh.read(4) - try: - byteorder = {b'II': '<', b'MM': '>'}[header[:2]] - except KeyError: - raise TiffFileError('not a TIFF file') - - version = struct.unpack(byteorder + 'H', header[2:4])[0] - if version == 43: - # BigTiff - offsetsize, zero = struct.unpack(byteorder + 'HH', fh.read(4)) - if zero != 0 or offsetsize != 8: - raise TiffFileError('invalid BigTIFF file') - if byteorder == '>': - self.tiff = TIFF.BIG_BE - else: - self.tiff = TIFF.BIG_LE - elif version == 42: - # Classic TIFF - if byteorder == '>': - self.tiff = TIFF.CLASSIC_BE - elif kwargs.get('is_ndpi', False): - # NDPI uses 64 bit IFD offsets - # TODO: fix offsets in NDPI tags if file size > 4 GB - self.tiff = TIFF.NDPI_LE - else: - self.tiff = TIFF.CLASSIC_LE - else: - raise TiffFileError('invalid TIFF file') - - # file handle is at offset to offset to first page - self.pages = TiffPages(self) - - if self.is_lsm and ( - self.filehandle.size >= 2**32 - or self.pages[0].compression != 1 - or self.pages[1].compression != 1 - ): - self._lsm_load_pages() - elif self.is_scanimage and ( - not self.is_bigtiff and self.filehandle.size >= 2**31 - ): - self.pages._load_virtual_frames() - elif _useframes: - self.pages.useframes = True - - except Exception: - fh.close() - raise - - @property - def byteorder(self): - return self.tiff.byteorder - - @property - def is_bigtiff(self): - return self.tiff.version == 43 - - @property - def filehandle(self): - """Return file handle.""" - return self._fh - - @property - def filename(self): - """Return name of file handle.""" - return self._fh.name - - @lazyattr - def fstat(self): - """Return status of file handle as stat_result object.""" - try: - return os.fstat(self._fh.fileno()) - except Exception: # io.UnsupportedOperation - return None - - def close(self): - """Close open file handle(s).""" - for tif in self._files.values(): - tif.filehandle.close() - self._files = {} - - def asarray(self, key=None, series=None, out=None, validate=True, - maxworkers=None): - """Return image data from selected TIFF page(s) as numpy array. - - By default, the data from the first series is returned. - - Parameters - ---------- - key : int, slice, or sequence of indices - Defines which pages to return as array. - If None (default), data from a series (default 0) is returned. - If not None, data from the specified pages in the whole file - (if 'series' is None) or a specified series are returned as a - stacked array. - Requesting an array from multiple pages that are not compatible - wrt. shape, dtype, compression etc is undefined, i.e. may crash - or return incorrect values. - series : int or TiffPageSeries - Defines which series of pages to return as array. 
- out : numpy.ndarray, str, or file-like object - Buffer where image data will be saved. - If None (default), a new array will be created. - If numpy.ndarray, a writable array of compatible dtype and shape. - If 'memmap', directly memory-map the image data in the TIFF file - if possible; else create a memory-mapped array in a temporary file. - If str or open file, the file name or file object used to - create a memory-map to an array stored in a binary file on disk. - validate : bool - If True (default), validate various tags. - Passed to TiffPage.asarray(). - maxworkers : int or None - Maximum number of threads to concurrently get data from multiple - pages or compressed segments. - If None (default), up to half the CPU cores are used. - If 1, multi-threading is disabled. - Reading data from file is limited to a single thread. - Using multiple threads can significantly speed up this function - if the bottleneck is decoding compressed data, e.g. in case of - large LZW compressed LSM files or JPEG compressed tiled slides. - If the bottleneck is I/O or pure Python code, using multiple - threads might be detrimental. - - Returns - ------- - numpy.ndarray - Image data from the specified pages. - See the TiffPage.asarray function for operations that are - applied (or not) to the raw data stored in the file. - - """ - if not self.pages: - return numpy.array([]) - if key is None and series is None: - series = 0 - if series is None: - pages = self.pages - else: - try: - series = self.series[series] - except (KeyError, TypeError): - pass - pages = series.pages - - if key is None: - pass - elif series is None: - pages = self.pages._getlist(key) - elif isinstance(key, inttypes): - pages = [pages[key]] - elif isinstance(key, slice): - pages = pages[key] - elif isinstance(key, Iterable): - pages = [pages[k] for k in key] - else: - raise TypeError('key must be an int, slice, or sequence') - - if not pages: - raise ValueError('no pages selected') - - if key is None and series and series.offset: - typecode = self.byteorder + series.dtype.char - if ( - pages[0].is_memmappable - and isinstance(out, str) - and out == 'memmap' - ): - # direct mapping - result = self.filehandle.memmap_array( - typecode, series.shape, series.offset) - else: - # read into output - if out is not None: - out = create_output(out, series.shape, series.dtype) - self.filehandle.seek(series.offset) - result = self.filehandle.read_array( - typecode, product(series.shape), out=out) - elif len(pages) == 1: - result = pages[0].asarray(out=out, validate=validate, - maxworkers=maxworkers) - else: - result = stack_pages(pages, out=out, maxworkers=maxworkers) - - if result is None: - return None - - if key is None: - try: - result.shape = series.shape - except ValueError: - try: - log_warning('TiffFile.asarray: failed to reshape %s to %s', - result.shape, series.shape) - # try series of expected shapes - result.shape = (-1,) + series.shape - except ValueError: - # revert to generic shape - result.shape = (-1,) + pages[0].shape - elif len(pages) == 1: - result.shape = pages[0].shape - else: - result.shape = (-1,) + pages[0].shape - return result - - @lazyattr - def series(self): - """Return related pages as TiffPageSeries. - - Side effect: after calling this function, TiffFile.pages might contain - TiffPage and TiffFrame instances. 
- - """ - if not self.pages: - return [] - - useframes = self.pages.useframes - keyframe = self.pages.keyframe.index - series = [] - for name in ( - 'lsm', - 'ome', - 'imagej', - 'shaped', - 'fluoview', - 'sis', - 'uniform', - 'mdgel', - ): - if getattr(self, 'is_' + name, False): - series = getattr(self, '_series_' + name)() - break - self.pages.useframes = useframes - self.pages.keyframe = keyframe - if not series: - series = self._series_generic() - - # remove empty series, e.g. in MD Gel files - series = [s for s in series if product(s.shape) > 0] - - for i, s in enumerate(series): - s.index = i - return series - - def _series_generic(self): - """Return image series in file. - - A series is a sequence of TiffPages with the same hash. - - """ - pages = self.pages - pages._clear(False) - pages.useframes = False - if pages.cache: - pages._load() - - result = [] - keys = [] - series = {} - for page in pages: - if not page.shape or product(page.shape) == 0: - continue - key = page.hash - if key in series: - series[key].append(page) - else: - keys.append(key) - series[key] = [page] - - for key in keys: - pages = series[key] - page = pages[0] - shape = page.shape - axes = page.axes - if len(pages) > 1: - shape = (len(pages),) + shape - axes = 'I' + axes - result.append( - TiffPageSeries(pages, shape, page.dtype, axes, kind='Generic') - ) - - self.is_uniform = len(result) == 1 - return result - - def _series_uniform(self): - """Return all images in file as single series.""" - page = self.pages[0] - shape = page.shape - axes = page.axes - dtype = page.dtype - validate = not (page.is_scanimage or page.is_nih) - pages = self.pages._getlist(validate=validate) - lenpages = len(pages) - if lenpages > 1: - shape = (lenpages,) + shape - axes = 'I' + axes - if page.is_scanimage: - kind = 'ScanImage' - elif page.is_nih: - kind = 'NIHImage' - else: - kind = 'Uniform' - return [TiffPageSeries(pages, shape, dtype, axes, kind=kind)] - - def _series_shaped(self): - """Return image series in "shaped" file.""" - pages = self.pages - pages.useframes = True - lenpages = len(pages) - - def append_series(series, pages, axes, shape, reshape, name, - truncated): - page = pages[0] - if not axes: - shape = page.shape - axes = page.axes - if len(pages) > 1: - shape = (len(pages),) + shape - axes = 'Q' + axes - size = product(shape) - resize = product(reshape) - if page.is_contiguous and resize > size and resize % size == 0: - if truncated is None: - truncated = True - axes = 'Q' + axes - shape = (resize // size,) + shape - try: - axes = reshape_axes(axes, shape, reshape) - shape = reshape - except ValueError as exc: - log_warning('Shaped series: %s: %s', - exc.__class__.__name__, exc) - series.append( - TiffPageSeries(pages, shape, page.dtype, axes, - name=name, kind='Shaped', truncated=truncated) - ) - - keyframe = axes = shape = reshape = name = None - series = [] - index = 0 - while True: - if index >= lenpages: - break - # new keyframe; start of new series - pages.keyframe = index - keyframe = pages.keyframe - if not keyframe.is_shaped: - log_warning( - 'Shaped series: invalid metadata or corrupted file') - return None - # read metadata - axes = None - shape = None - metadata = json_description_metadata(keyframe.is_shaped) - name = metadata.get('name', '') - reshape = metadata['shape'] - truncated = metadata.get('truncated', None) - if 'axes' in metadata: - axes = metadata['axes'] - if len(axes) == len(reshape): - shape = reshape - else: - axes = '' - log_warning('Shaped series: axes do not match shape') - # skip 
pages if possible - spages = [keyframe] - size = product(reshape) - npages, mod = divmod(size, product(keyframe.shape)) - if mod: - log_warning( - 'Shaped series: series shape does not match page shape') - return None - if 1 < npages <= lenpages - index: - size *= keyframe._dtype.itemsize - if truncated: - npages = 1 - elif ( - keyframe.is_final - and keyframe.offset + size < pages[index + 1].offset - ): - truncated = False - else: - # need to read all pages for series - truncated = False - for j in range(index + 1, index + npages): - page = pages[j] - page.keyframe = keyframe - spages.append(page) - append_series(series, spages, axes, shape, reshape, name, - truncated) - index += npages - - self.is_uniform = len(series) == 1 - - return series - - def _series_imagej(self): - """Return image series in ImageJ file.""" - # ImageJ's dimension order is always TZCYXS - # TODO: fix loading of color, composite, or palette images - pages = self.pages - pages.useframes = True - pages.keyframe = 0 - page = pages[0] - ij = self.imagej_metadata - - def is_virtual(): - # ImageJ virtual hyperstacks store all image metadata in the first - # page and image data are stored contiguously before the second - # page, if any - if not page.is_final: - return False - images = ij.get('images', 0) - if images <= 1: - return False - offset, count = page.is_contiguous - if ( - count != product(page.shape) * page.bitspersample // 8 - or offset + count * images > self.filehandle.size - ): - raise ValueError() - # check that next page is stored after data - if len(pages) > 1 and offset + count * images > pages[1].offset: - return False - return True - - try: - isvirtual = is_virtual() - except ValueError: - log_warning('ImageJ series: invalid metadata or corrupted file') - return None - if isvirtual: - # no need to read other pages - pages = [page] - else: - pages = pages[:] - - images = ij.get('images', len(pages)) - frames = ij.get('frames', 1) - slices = ij.get('slices', 1) - channels = ij.get('channels', 1) - mode = ij.get('mode', None) - - shape = [] - axes = [] - if frames > 1: - shape.append(frames) - axes.append('T') - if slices > 1: - shape.append(slices) - axes.append('Z') - if channels > 1 and (page.photometric != 2 or mode != 'composite'): - shape.append(channels) - axes.append('C') - - remain = images // (product(shape) if shape else 1) - if remain > 1: - shape.append(remain) - axes.append('I') - - if page.axes[0] == 'S' and 'C' in axes: - # planar storage, S == C, saved by Bio-Formats - shape.extend(page.shape[1:]) - axes.extend(page.axes[1:]) - elif page.axes[0] == 'I': - # contiguous multiple images - shape.extend(page.shape[1:]) - axes.extend(page.axes[1:]) - elif page.axes[:2] == 'SI': - # color-mapped contiguous multiple images - shape = page.shape[0:1] + tuple(shape) + page.shape[2:] - axes = list(page.axes[0]) + axes + list(page.axes[2:]) - else: - shape.extend(page.shape) - axes.extend(page.axes) - - truncated = ( - isvirtual - and len(self.pages) == 1 - and page.is_contiguous[1] != ( - product(shape) * page.bitspersample // 8) - ) - - self.is_uniform = True - - return [ - TiffPageSeries(pages, shape, page.dtype, axes, - kind='ImageJ', truncated=truncated) - ] - - def _series_fluoview(self): - """Return image series in FluoView file.""" - pages = self.pages._getlist(validate=False) - - mm = self.fluoview_metadata - mmhd = list(reversed(mm['Dimensions'])) - axes = ''.join(TIFF.MM_DIMENSIONS.get(i[0].upper(), 'Q') - for i in mmhd if i[1] > 1) - shape = tuple(int(i[1]) for i in mmhd if i[1] > 1) - 
self.is_uniform = True - return [ - TiffPageSeries(pages, shape, pages[0].dtype, axes, - name=mm['ImageName'], kind='FluoView') - ] - - def _series_mdgel(self): - """Return image series in MD Gel file.""" - # only a single page, scaled according to metadata in second page - self.pages.useframes = False - self.pages.keyframe = 0 - md = self.mdgel_metadata - if md['FileTag'] in (2, 128): - dtype = numpy.dtype('float32') - scale = md['ScalePixel'] - scale = scale[0] / scale[1] # rational - if md['FileTag'] == 2: - # squary root data format - def transform(a): - return a.astype('float32')**2 * scale - else: - def transform(a): - return a.astype('float32') * scale - else: - transform = None - page = self.pages[0] - self.is_uniform = False - return [ - TiffPageSeries([page], page.shape, dtype, page.axes, - transform=transform, kind='MDGel') - ] - - def _series_sis(self): - """Return image series in Olympus SIS file.""" - pages = self.pages._getlist(validate=False) - page = pages[0] - lenpages = len(pages) - md = self.sis_metadata - if 'shape' in md and 'axes' in md: - shape = md['shape'] + page.shape - axes = md['axes'] + page.axes - elif lenpages == 1: - shape = page.shape - axes = page.axes - else: - shape = (lenpages,) + page.shape - axes = 'I' + page.axes - self.is_uniform = True - return [ - TiffPageSeries(pages, shape, page.dtype, axes, kind='SIS') - ] - - def _series_ome(self): - """Return image series in OME-TIFF file(s).""" - from xml.etree import cElementTree as etree # delayed import - omexml = self.pages[0].description - try: - root = etree.fromstring(omexml) - except etree.ParseError as exc: - # TODO: test badly encoded OME-XML - log_warning('OME series: %s: %s', exc.__class__.__name__, exc) - try: - # might work on Python 2 - omexml = omexml.decode('utf-8', 'ignore').encode('utf-8') - root = etree.fromstring(omexml) - except Exception: - return None - - self.pages.cache = True - self.pages.useframes = True - self.pages.keyframe = 0 - self.pages._load(keyframe=None) - - root_uuid = root.attrib.get('UUID', None) - self._files = {root_uuid: self} - dirname = self._fh.dirname - modulo = {} - series = [] - for element in root: - if element.tag.endswith('BinaryOnly'): - # TODO: load OME-XML from master or companion file - log_warning('OME series: not an ome-tiff master file') - break - if element.tag.endswith('StructuredAnnotations'): - for annot in element: - if not annot.attrib.get('Namespace', - '').endswith('modulo'): - continue - for value in annot: - for modul in value: - for along in modul: - if not along.tag[:-1].endswith('Along'): - continue - axis = along.tag[-1] - newaxis = along.attrib.get('Type', 'other') - newaxis = TIFF.AXES_LABELS[newaxis] - if 'Start' in along.attrib: - step = float(along.attrib.get('Step', 1)) - start = float(along.attrib['Start']) - stop = float(along.attrib['End']) + step - labels = numpy.arange(start, stop, step) - else: - labels = [ - label.text - for label in along - if label.tag.endswith('Label') - ] - modulo[axis] = (newaxis, labels) - - if not element.tag.endswith('Image'): - continue - - attr = element.attrib - name = attr.get('Name', None) - - for pixels in element: - if not pixels.tag.endswith('Pixels'): - continue - attr = pixels.attrib - # dtype = attr.get('PixelType', None) - axes = ''.join(reversed(attr['DimensionOrder'])) - shape = idxshape = [int(attr['Size' + ax]) for ax in axes] - size = product(shape[:-2]) - ifds = None - spp = 1 # samples per pixel - for data in pixels: - if data.tag.endswith('Channel'): - attr = data.attrib - if 
ifds is None: - spp = int(attr.get('SamplesPerPixel', spp)) - ifds = [None] * (size // spp) - if spp > 1: - # correct channel dimension for spp - idxshape = [ - shape[i] // spp if ax == 'C' else shape[i] - for i, ax in enumerate(axes)] - elif int(attr.get('SamplesPerPixel', 1)) != spp: - raise ValueError('OME series: cannot handle ' - 'differing SamplesPerPixel') - continue - if ifds is None: - ifds = [None] * (size // spp) - if not data.tag.endswith('TiffData'): - continue - attr = data.attrib - ifd = int(attr.get('IFD', 0)) - num = int(attr.get('NumPlanes', 1 if 'IFD' in attr else 0)) - num = int(attr.get('PlaneCount', num)) - idx = [int(attr.get('First' + ax, 0)) for ax in axes[:-2]] - try: - idx = numpy.ravel_multi_index(idx, idxshape[:-2]) - except ValueError: - # ImageJ produces invalid ome-xml when cropping - log_warning('OME series: invalid TiffData index') - continue - for uuid in data: - if not uuid.tag.endswith('UUID'): - continue - if root_uuid is None and uuid.text is not None: - # no global UUID, use this file - root_uuid = uuid.text - self._files[root_uuid] = self._files[None] - elif uuid.text not in self._files: - if not self._multifile: - # abort reading multifile OME series - # and fall back to generic series - return [] - fname = uuid.attrib['FileName'] - try: - tif = TiffFile(os.path.join(dirname, fname)) - tif.pages.cache = True - tif.pages.useframes = True - tif.pages.keyframe = 0 - tif.pages._load(keyframe=None) - except (IOError, FileNotFoundError, ValueError): - log_warning( - "OME series: failed to read '%s'", fname) - break - self._files[uuid.text] = tif - tif.close() - pages = self._files[uuid.text].pages - try: - for i in range(num if num else len(pages)): - ifds[idx + i] = pages[ifd + i] - except IndexError: - log_warning('OME series: index out of range') - # only process first UUID - break - else: - pages = self.pages - try: - for i in range(num if num else - min(len(pages), len(ifds))): - ifds[idx + i] = pages[ifd + i] - except IndexError: - log_warning('OME series: index out of range') - - if all(i is None for i in ifds): - # skip images without data - continue - - # find a keyframe - keyframe = None - for i in ifds: - # try find a TiffPage - if i and i == i.keyframe: - keyframe = i - break - if keyframe is None: - # reload a TiffPage from file - for i, keyframe in enumerate(ifds): - if keyframe: - keyframe.parent.pages.keyframe = keyframe.index - keyframe = keyframe.parent.pages[keyframe.index] - ifds[i] = keyframe - break - - # move channel axis to match PlanarConfiguration storage - # TODO: is this a bug or a inconsistency in the OME spec? - if spp > 1: - if keyframe.planarconfig == 1 and axes[-1] != 'C': - i = axes.index('C') - axes = axes[:i] + axes[i + 1:] + axes[i: i + 1] - shape = shape[:i] + shape[i + 1:] + shape[i: i + 1] - - # FIXME: this implementation assumes the last dimensions are - # stored in TIFF pages. Apparently that is not always the case. - # For now, verify that shapes of keyframe and series match - # If not, skip series. 
- if keyframe.shape != tuple(shape[-len(keyframe.shape):]): - log_warning('OME series: incompatible page shape %s; ' - 'expected %s', keyframe.shape, - tuple(shape[-len(keyframe.shape):])) - del ifds - continue - - # set a keyframe on all IFDs - for i in ifds: - if i is not None: - try: - i.keyframe = keyframe - except RuntimeError as exception: - log_warning('OME series: %s', str(exception)) - - series.append( - TiffPageSeries(ifds, shape, keyframe.dtype, axes, - parent=self, name=name, kind='OME') - ) - del ifds - - for serie in series: - shape = list(serie.shape) - for axis, (newaxis, labels) in modulo.items(): - i = serie.axes.index(axis) - size = len(labels) - if shape[i] == size: - serie.axes = serie.axes.replace(axis, newaxis, 1) - else: - shape[i] //= size - shape.insert(i + 1, size) - serie.axes = serie.axes.replace(axis, axis + newaxis, 1) - serie.shape = tuple(shape) - - # squeeze dimensions - for serie in series: - serie.shape, serie.axes = squeeze_axes(serie.shape, serie.axes) - self.is_uniform = len(series) == 1 - return series - - def _series_lsm(self): - """Return main and thumbnail series in LSM file.""" - lsmi = self.lsm_metadata - axes = TIFF.CZ_LSMINFO_SCANTYPE[lsmi['ScanType']] - if self.pages[0].photometric == 2: # RGB; more than one channel - axes = axes.replace('C', '').replace('XY', 'XYC') - if lsmi.get('DimensionP', 0) > 1: - axes += 'P' - if lsmi.get('DimensionM', 0) > 1: - axes += 'M' - axes = axes[::-1] - shape = tuple(int(lsmi[TIFF.CZ_LSMINFO_DIMENSIONS[i]]) for i in axes) - name = lsmi.get('Name', '') - pages = self.pages._getlist(slice(0, None, 2), validate=False) - dtype = pages[0].dtype - series = [ - TiffPageSeries(pages, shape, dtype, axes, name=name, kind='LSM') - ] - - if self.pages[1].is_reduced: - pages = self.pages._getlist(slice(1, None, 2), validate=False) - dtype = pages[0].dtype - cp = 1 - i = 0 - while cp < len(pages) and i < len(shape) - 2: - cp *= shape[i] - i += 1 - shape = shape[:i] + pages[0].shape - axes = axes[:i] + 'CYX' - series.append( - TiffPageSeries(pages, shape, dtype, axes, name=name, - kind='LSMreduced') - ) - - self.is_uniform = False - return series - - def _lsm_load_pages(self): - """Load and fix all pages from LSM file.""" - # cache all pages to preserve corrected values - pages = self.pages - pages.cache = True - pages.useframes = True - # use first and second page as keyframes - pages.keyframe = 1 - pages.keyframe = 0 - # load remaining pages as frames - pages._load(keyframe=None) - # fix offsets and bytecounts first - # TODO: fix multiple conversions between lists and tuples - self._lsm_fix_strip_offsets() - self._lsm_fix_strip_bytecounts() - # assign keyframes for data and thumbnail series - keyframe = pages[0] - for page in pages[::2]: - page.keyframe = keyframe - keyframe = pages[1] - for page in pages[1::2]: - page.keyframe = keyframe - - def _lsm_fix_strip_offsets(self): - """Unwrap strip offsets for LSM files greater than 4 GB. - - Each series and position require separate unwrapping (undocumented). 
- - """ - if self.filehandle.size < 2**32: - return - - pages = self.pages - npages = len(pages) - series = self.series[0] - axes = series.axes - - # find positions - positions = 1 - for i in 0, 1: - if series.axes[i] in 'PM': - positions *= series.shape[i] - - # make time axis first - if positions > 1: - ntimes = 0 - for i in 1, 2: - if axes[i] == 'T': - ntimes = series.shape[i] - break - if ntimes: - div, mod = divmod(npages, 2 * positions * ntimes) - if mod != 0: - raise RuntimeError('mod != 0') - shape = (positions, ntimes, div, 2) - indices = numpy.arange(product(shape)).reshape(shape) - indices = numpy.moveaxis(indices, 1, 0) - else: - indices = numpy.arange(npages).reshape(-1, 2) - - # images of reduced page might be stored first - if pages[0]._offsetscounts[0][0] > pages[1]._offsetscounts[0][0]: - indices = indices[..., ::-1] - - # unwrap offsets - wrap = 0 - previousoffset = 0 - for i in indices.flat: - page = pages[int(i)] - dataoffsets = [] - for currentoffset in page._offsetscounts[0]: - if currentoffset < previousoffset: - wrap += 2**32 - dataoffsets.append(currentoffset + wrap) - previousoffset = currentoffset - page._offsetscounts = tuple(dataoffsets), page._offsetscounts[1] - - def _lsm_fix_strip_bytecounts(self): - """Set databytecounts to size of compressed data. - - The StripByteCounts tag in LSM files contains the number of bytes - for the uncompressed data. - - """ - pages = self.pages - if pages[0].compression == 1: - return - # sort pages by first strip offset - pages = sorted(pages, key=lambda p: p._offsetscounts[0][0]) - npages = len(pages) - 1 - for i, page in enumerate(pages): - if page.index % 2: - continue - offsets, bytecounts = page._offsetscounts - if i < npages: - lastoffset = pages[i + 1]._offsetscounts[0][0] - else: - # LZW compressed strips might be longer than uncompressed - lastoffset = min(offsets[-1] + 2 * bytecounts[-1], - self._fh.size) - bytecounts = list(bytecounts) - for j in range(len(bytecounts) - 1): - bytecounts[j] = offsets[j + 1] - offsets[j] - bytecounts[-1] = lastoffset - offsets[-1] - page._offsetscounts = offsets, tuple(bytecounts) - - def __getattr__(self, name): - """Return 'is_flag' attributes from first page.""" - if name[3:] in TIFF.FILE_FLAGS: - if not self.pages: - return False - value = bool(getattr(self.pages[0], name)) - setattr(self, name, value) - return value - raise AttributeError("'%s' object has no attribute '%s'" - % (self.__class__.__name__, name)) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - - def __str__(self, detail=0, width=79): - """Return string containing information about file. - - The detail parameter specifies the level of detail returned: - - 0: file only. - 1: all series, first page of series and its tags. - 2: large tag values and file metadata. - 3: all pages. 
- - """ - info = [ - "TiffFile '%s'", - format_size(self._fh.size), - '' - if byteorder_isnative(self.tiff.byteorder) - else {'<': 'little-endian', - '>': 'big-endian'}[self.tiff.byteorder] - ] - if self.is_bigtiff: - info.append('BigTiff') - info.append(' '.join(f.lower() for f in self.flags)) - if len(self.pages) > 1: - info.append('%i Pages' % len(self.pages)) - if len(self.series) > 1: - info.append('%i Series' % len(self.series)) - if len(self._files) > 1: - info.append('%i Files' % (len(self._files))) - info = ' '.join(info) - info = info.replace(' ', ' ').replace(' ', ' ') - info = info % snipstr(self._fh.name, max(12, width + 2 - len(info))) - if detail <= 0: - return info - info = [info] - info.append('\n'.join(str(s) for s in self.series)) - if detail >= 3: - info.extend( - ( - TiffPage.__str__(p, detail=detail, width=width) - for p in self.pages - if p is not None - ) - ) - elif self.series: - info.extend( - ( - TiffPage.__str__(s.pages[0], detail=detail, width=width) - for s in self.series - if s.pages[0] is not None - ) - ) - elif self.pages and self.pages[0]: - info.append( - TiffPage.__str__(self.pages[0], detail=detail, width=width) - ) - if detail >= 2: - for name in sorted(self.flags): - if hasattr(self, name + '_metadata'): - m = getattr(self, name + '_metadata') - if m: - info.append( - '%s_METADATA\n%s' - % (name.upper(), - pformat(m, width=width, height=detail * 12)) - ) - return '\n\n'.join(info).replace('\n\n\n', '\n\n') - - @lazyattr - def flags(self): - """Return set of file flags.""" - return set( - name.lower() - for name in sorted(TIFF.FILE_FLAGS) - if getattr(self, 'is_' + name) - ) - - @lazyattr - def is_mdgel(self): - """File has MD Gel format.""" - # TODO: this likely reads the second page from file - try: - ismdgel = self.pages[0].is_mdgel or self.pages[1].is_mdgel - if ismdgel: - self.is_uniform = False - return ismdgel - except IndexError: - return False - - @lazyattr - def is_uniform(self): - """Return if file contains a uniform series of pages.""" - # the hashes of IFDs 0, 7, and -1 are the same - pages = self.pages - page = pages[0] - if page.is_scanimage or page.is_nih: - return True - try: - useframes = pages.useframes - pages.useframes = False - h = page.hash - for i in (1, 7, -1): - if pages[i].aspage().hash != h: - return False - except IndexError: - return False - finally: - pages.useframes = useframes - return True - - @property - def is_appendable(self): - """Return if pages can be appended to file without corrupting.""" - # TODO: check other formats - return not ( - self.is_lsm - or self.is_stk - or self.is_imagej - or self.is_fluoview - or self.is_micromanager - ) - - @lazyattr - def shaped_metadata(self): - """Return tifffile metadata from JSON descriptions as dicts.""" - if not self.is_shaped: - return None - return tuple( - json_description_metadata(s.pages[0].is_shaped) - for s in self.series - if s.kind.lower() == 'shaped' - ) - - @property - def ome_metadata(self): - """Return OME XML.""" - if not self.is_ome: - return None - # return xml2dict(self.pages[0].description)['OME'] - return self.pages[0].description - - @property - def lsm_metadata(self): - """Return LSM metadata from CZ_LSMINFO tag as dict.""" - if not self.is_lsm: - return None - return self.pages[0].tags['CZ_LSMINFO'].value - - @lazyattr - def stk_metadata(self): - """Return STK metadata from UIC tags as dict.""" - if not self.is_stk: - return None - page = self.pages[0] - tags = page.tags - result = {} - result['NumberPlanes'] = tags['UIC2tag'].count - if 
page.description: - result['PlaneDescriptions'] = page.description.split('\0') - # result['plane_descriptions'] = stk_description_metadata( - # page.image_description) - if 'UIC1tag' in tags: - result.update(tags['UIC1tag'].value) - if 'UIC3tag' in tags: - result.update(tags['UIC3tag'].value) # wavelengths - if 'UIC4tag' in tags: - result.update(tags['UIC4tag'].value) # override uic1 tags - uic2tag = tags['UIC2tag'].value - result['ZDistance'] = uic2tag['ZDistance'] - result['TimeCreated'] = uic2tag['TimeCreated'] - result['TimeModified'] = uic2tag['TimeModified'] - try: - result['DatetimeCreated'] = numpy.array( - [julian_datetime(*dt) for dt in - zip(uic2tag['DateCreated'], uic2tag['TimeCreated'])], - dtype='datetime64[ns]') - result['DatetimeModified'] = numpy.array( - [julian_datetime(*dt) for dt in - zip(uic2tag['DateModified'], uic2tag['TimeModified'])], - dtype='datetime64[ns]') - except ValueError as exc: - log_warning('STK metadata: %s: %s', exc.__class__.__name__, exc) - return result - - @lazyattr - def imagej_metadata(self): - """Return consolidated ImageJ metadata as dict.""" - if not self.is_imagej: - return None - page = self.pages[0] - result = imagej_description_metadata(page.is_imagej) - if 'IJMetadata' in page.tags: - try: - result.update(page.tags['IJMetadata'].value) - except Exception: - pass - return result - - @lazyattr - def fluoview_metadata(self): - """Return consolidated FluoView metadata as dict.""" - if not self.is_fluoview: - return None - result = {} - page = self.pages[0] - result.update(page.tags['MM_Header'].value) - # TODO: read stamps from all pages - result['Stamp'] = page.tags['MM_Stamp'].value - # skip parsing image description; not reliable - # try: - # t = fluoview_description_metadata(page.image_description) - # if t is not None: - # result['ImageDescription'] = t - # except Exception as exc: - # log_warning('FluoView metadata: ' - # 'failed to parse image description (%s)', str(exc)) - return result - - @lazyattr - def nih_metadata(self): - """Return NIH Image metadata from NIHImageHeader tag as dict.""" - if not self.is_nih: - return None - return self.pages[0].tags['NIHImageHeader'].value - - @lazyattr - def fei_metadata(self): - """Return FEI metadata from SFEG or HELIOS tags as dict.""" - if not self.is_fei: - return None - tags = self.pages[0].tags - if 'FEI_SFEG' in tags: - return tags['FEI_SFEG'].value - if 'FEI_HELIOS' in tags: - return tags['FEI_HELIOS'].value - return None - - @property - def sem_metadata(self): - """Return SEM metadata from CZ_SEM tag as dict.""" - if not self.is_sem: - return None - return self.pages[0].tags['CZ_SEM'].value - - @lazyattr - def sis_metadata(self): - """Return Olympus SIS metadata from SIS and INI tags as dict.""" - if not self.is_sis: - return None - tags = self.pages[0].tags - result = {} - try: - result.update(tags['OlympusINI'].value) - except Exception: - pass - try: - result.update(tags['OlympusSIS'].value) - except Exception: - pass - return result - - @lazyattr - def mdgel_metadata(self): - """Return consolidated metadata from MD GEL tags as dict.""" - for page in self.pages[:2]: - if 'MDFileTag' in page.tags: - tags = page.tags - break - else: - return None - result = {} - for code in range(33445, 33453): - name = TIFF.TAGS[code] - if name not in tags: - continue - result[name[2:]] = tags[name].value - return result - - @property - def andor_metadata(self): - """Return Andor tags as dict.""" - return self.pages[0].andor_tags - - @property - def epics_metadata(self): - """Return EPICS 
areaDetector tags as dict.""" - return self.pages[0].epics_tags - - @property - def tvips_metadata(self): - """Return TVIPS tag as dict.""" - if not self.is_tvips: - return None - return self.pages[0].tags['TVIPS'].value - - @lazyattr - def metaseries_metadata(self): - """Return MetaSeries metadata from image description as dict.""" - if not self.is_metaseries: - return None - return metaseries_description_metadata(self.pages[0].description) - - @lazyattr - def pilatus_metadata(self): - """Return Pilatus metadata from image description as dict.""" - if not self.is_pilatus: - return None - return pilatus_description_metadata(self.pages[0].description) - - @lazyattr - def micromanager_metadata(self): - """Return consolidated MicroManager metadata as dict.""" - if not self.is_micromanager: - return None - # from file header - result = read_micromanager_metadata(self._fh) - # from tag - result.update(self.pages[0].tags['MicroManagerMetadata'].value) - return result - - @lazyattr - def scanimage_metadata(self): - """Return ScanImage non-varying frame and ROI metadata as dict.""" - if not self.is_scanimage: - return None - result = {} - try: - framedata, roidata = read_scanimage_metadata(self._fh) - result['FrameData'] = framedata - result.update(roidata) - except ValueError: - pass - # TODO: scanimage_artist_metadata - try: - result['Description'] = scanimage_description_metadata( - self.pages[0].description) - except Exception as exc: - log_warning('ScanImage metadata: %s: %s', - exc.__class__.__name__, exc) - return result - - @property - def geotiff_metadata(self): - """Return GeoTIFF metadata from first page as dict.""" - if not self.is_geotiff: - return None - return self.pages[0].geotiff_tags - - -class TiffPages(object): - """Sequence of TIFF image file directories (IFD chain). - - Instances of TiffPages have a state (cache, keyframe, etc.) and are not - thread-safe. - - """ - - def __init__(self, parent): - """Initialize instance and read first TiffPage from file. - - If parent is a TiffFile, the file position must be at an offset to an - offset to a TiffPage. If parent is a TiffPage, page offsets are read - from the SubIFDs tag. 
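# Illustrative sketch of the two construction paths described above; both are
# normally reached through TiffFile rather than by creating TiffPages
# directly. 'example.tif' is a placeholder filename, and the import path may
# differ for a vendored copy of this module.
from tifffile import TiffFile

with TiffFile('example.tif') as tif:
    main_chain = tif.pages          # TiffPages built from the IFD chain
    sub_pages = tif.pages[0].pages  # TiffPages built from the SubIFDs tag
                                    # (an empty tuple if the tag is absent)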
- - """ - self.parent = None - self.pages = [] # cache of TiffPages, TiffFrames, or their offsets - self._indexed = False # True if offsets to all pages were read - self._cached = False # True if all pages were read into cache - self._tiffpage = TiffPage # class used for reading pages - self._keyframe = None # current page that is used as keyframe - self._cache = False # do not cache frames or pages (if not keyframe) - self._nextpageoffset = None - - if isinstance(parent, TiffFile): - # read offset to first page from current file position - self.parent = parent - fh = parent.filehandle - self._nextpageoffset = fh.tell() - offset = struct.unpack(parent.tiff.ifdoffsetformat, - fh.read(parent.tiff.ifdoffsetsize))[0] - elif 'SubIFDs' not in parent.tags: - self._indexed = True - return - else: - # use offsets from SubIFDs tag - self.parent = parent.parent - fh = self.parent.filehandle - offsets = parent.tags['SubIFDs'].value - offset = offsets[0] - - if offset == 0: - log_warning('TiffPages: file contains no pages') - self._indexed = True - return - if offset >= fh.size: - log_warning('TiffPages: invalid page offset (%i)', offset) - self._indexed = True - return - - # read and cache first page - fh.seek(offset) - page = TiffPage(self.parent, index=0) - self.pages.append(page) - self._keyframe = page - if self._nextpageoffset is None: - # offsets from SubIFDs tag - self.pages.extend(offsets[1:]) - self._indexed = True - self._cached = True - - @property - def cache(self): - """Return if pages/frames are currently being cached.""" - return self._cache - - @cache.setter - def cache(self, value): - """Enable or disable caching of pages/frames. Clear cache if False.""" - value = bool(value) - if self._cache and not value: - self._clear() - self._cache = value - - @property - def useframes(self): - """Return if currently using TiffFrame (True) or TiffPage (False).""" - return self._tiffpage == TiffFrame and TiffFrame is not TiffPage - - @useframes.setter - def useframes(self, value): - """Set to use TiffFrame (True) or TiffPage (False).""" - self._tiffpage = TiffFrame if value else TiffPage - - @property - def keyframe(self): - """Return current keyframe.""" - return self._keyframe - - @keyframe.setter - def keyframe(self, index): - """Set current keyframe. 
Load TiffPage from file if necessary.""" - index = int(index) - if index < 0: - index %= len(self) - if self._keyframe.index == index: - return - if index == 0: - self._keyframe = self.pages[0] - return - if self._indexed or index < len(self.pages): - page = self.pages[index] - if isinstance(page, TiffPage): - self._keyframe = page - return - if isinstance(page, TiffFrame): - # remove existing TiffFrame - self.pages[index] = page.offset - # load TiffPage from file - tiffpage = self._tiffpage - self._tiffpage = TiffPage - try: - self._keyframe = self._getitem(index) - finally: - self._tiffpage = tiffpage - # always cache keyframes - self.pages[index] = self._keyframe - - @property - def next_page_offset(self): - """Return offset where offset to a new page can be stored.""" - if not self._indexed: - self._seek(-1) - return self._nextpageoffset - - def _load(self, keyframe=True): - """Read all remaining pages from file.""" - if self._cached: - return - pages = self.pages - if not pages: - return - if not self._indexed: - self._seek(-1) - if not self._cache: - return - fh = self.parent.filehandle - if keyframe is not None: - keyframe = self._keyframe - for i, page in enumerate(pages): - if isinstance(page, inttypes): - fh.seek(page) - page = self._tiffpage(self.parent, index=i, keyframe=keyframe) - pages[i] = page - self._cached = True - - def _load_virtual_frames(self): - """Calculate virtual TiffFrames.""" - pages = self.pages - try: - if sys.version_info[0] == 2: - raise ValueError('not supported on Python 2') - if len(pages) > 1: - raise ValueError('pages already loaded') - page = pages[0] - bytecounts = page._offsetscounts[1] - if len(bytecounts) != 1: - raise ValueError('data not contiguous') - self._seek(4) - delta = pages[2] - pages[1] - if pages[3] - pages[2] != delta or pages[4] - pages[3] != delta: - raise ValueError('page offsets not equidistant') - page1 = self._getitem(1, validate=page.hash) - offsetoffset = page1._offsetscounts[0][0] - page1.offset - if offsetoffset < 0 or offsetoffset > delta: - raise ValueError('page offsets not equidistant') - pages = [page, page1] - filesize = self.parent.filehandle.size - delta - for index, offset in enumerate(range(page1.offset + delta, - filesize, delta)): - offsets = [offset + offsetoffset] - offset = offset if offset < 2**31 else None - pages.append( - TiffFrame( - parent=page.parent, - index=index + 2, - offset=None, - offsets=offsets, - bytecounts=bytecounts, - keyframe=page) - ) - self.pages = pages - self._cache = True - self._cached = True - self._indexed = True - except Exception as exc: - log_warning( - 'TiffPages: failed to load virtual frames: %s', str(exc)) - - def _clear(self, fully=True): - """Delete all but first page from cache. 
Set keyframe to first page.""" - pages = self.pages - if not pages: - return - self._keyframe = pages[0] - if fully: - # delete all but first TiffPage/TiffFrame - for i, page in enumerate(pages[1:]): - if not isinstance(page, inttypes) and page.offset is not None: - pages[i + 1] = page.offset - elif TiffFrame is not TiffPage: - # delete only TiffFrames - for i, page in enumerate(pages): - if isinstance(page, TiffFrame) and page.offset is not None: - pages[i] = page.offset - self._cached = False - - def _seek(self, index, maxpages=None): - """Seek file to offset of page specified by index.""" - pages = self.pages - lenpages = len(pages) - if lenpages == 0: - raise IndexError('index out of range') - - fh = self.parent.filehandle - if fh.closed: - raise ValueError('seek of closed file') - - if self._indexed or 0 <= index < lenpages: - page = pages[index] - offset = page if isinstance(page, inttypes) else page.offset - fh.seek(offset) - return - - tiff = self.parent.tiff - offsetformat = tiff.ifdoffsetformat - offsetsize = tiff.ifdoffsetsize - tagnoformat = tiff.tagnoformat - tagnosize = tiff.tagnosize - tagsize = tiff.tagsize - unpack = struct.unpack - - page = pages[-1] - offset = page if isinstance(page, inttypes) else page.offset - - if maxpages is None: - maxpages = 2**22 - while lenpages < maxpages: - # read offsets to pages from file until index is reached - fh.seek(offset) - # skip tags - try: - tagno = unpack(tagnoformat, fh.read(tagnosize))[0] - if tagno > 4096: - raise TiffFileError( - 'suspicious number of tags: %i' % tagno) - except Exception: - log_warning('TiffPages: corrupted tag list of page %i @ %i', - lenpages, offset) - del pages[-1] - lenpages -= 1 - self._indexed = True - break - self._nextpageoffset = offset + tagnosize + tagno * tagsize - fh.seek(self._nextpageoffset) - - # read offset to next page - offset = unpack(offsetformat, fh.read(offsetsize))[0] - if offset == 0: - self._indexed = True - break - if offset >= fh.size: - log_warning('TiffPages: invalid page offset (%i)', offset) - self._indexed = True - break - - pages.append(offset) - lenpages += 1 - if 0 <= index < lenpages: - break - - # detect some circular references - if lenpages == 100: - for p in pages[:-1]: - if offset == (p if isinstance(p, inttypes) else p.offset): - raise TiffFileError('invalid circular IFD reference') - - if index >= lenpages: - raise IndexError('index out of range') - - page = pages[index] - fh.seek(page if isinstance(page, inttypes) else page.offset) - - def _getlist(self, key=None, useframes=True, validate=True): - """Return specified pages as list of TiffPages or TiffFrames. - - The first item is a TiffPage, and is used as a keyframe for - following TiffFrames. 
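# Illustrative sketch of the keyframe behaviour described above, assuming an
# open TiffFile `tif` with at least four pages.
selected = tif.pages._getlist(range(4))
# selected[0] is a TiffPage and serves as the keyframe; the remaining items
# are typically TiffFrames that reuse its shape, dtype and tag metadata.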
- - """ - getitem = self._getitem - _useframes = self.useframes - - if key is None: - key = iter(range(len(self))) - elif isinstance(key, Iterable): - key = iter(key) - elif isinstance(key, slice): - start, stop, _ = key.indices(2**31 - 1) - if not self._indexed and max(stop, start) > len(self.pages): - self._seek(-1) - key = iter(range(*key.indices(len(self.pages)))) - elif isinstance(key, inttypes): - # return single TiffPage - self.useframes = False - if key == 0: - return [self.pages[key]] - try: - return [getitem(key)] - finally: - self.useframes = _useframes - else: - raise TypeError('key must be an integer, slice, or iterable') - - # use first page as keyframe - keyframe = self._keyframe - self.keyframe = next(key) - if validate: - validate = self._keyframe.hash - if useframes: - self.useframes = True - try: - pages = [getitem(i, validate) for i in key] - pages.insert(0, self._keyframe) - finally: - # restore state - self._keyframe = keyframe - if useframes: - self.useframes = _useframes - - return pages - - def _getitem(self, key, validate=False): - """Return specified page from cache or file.""" - key = int(key) - pages = self.pages - - if key < 0: - key %= len(self) - elif self._indexed and key >= len(pages): - raise IndexError( - 'index %i out of range(%i)' % (key, len(pages))) - - if key < len(pages): - page = pages[key] - if self._cache: - if not isinstance(page, inttypes): - if validate and validate != page.hash: - raise RuntimeError('page hash mismatch') - return page - elif isinstance(page, (TiffPage, self._tiffpage)): - if validate and validate != page.hash: - raise RuntimeError('page hash mismatch') - return page - - self._seek(key) - page = self._tiffpage(self.parent, index=key, keyframe=self._keyframe) - if validate and validate != page.hash: - raise RuntimeError('page hash mismatch') - if self._cache: - pages[key] = page - return page - - def __getitem__(self, key): - """Return specified page(s).""" - pages = self.pages - getitem = self._getitem - - if isinstance(key, inttypes): - if key == 0: - return pages[key] - return getitem(key) - - if isinstance(key, slice): - start, stop, _ = key.indices(2**31 - 1) - if not self._indexed and max(stop, start) > len(pages): - self._seek(-1) - return [getitem(i) for i in range(*key.indices(len(pages)))] - - if isinstance(key, Iterable): - return [getitem(k) for k in key] - - raise TypeError('key must be an integer, slice, or iterable') - - def __iter__(self): - """Return iterator over all pages.""" - i = 0 - while True: - try: - yield self._getitem(i) - i += 1 - except IndexError: - break - if self._cache: - self._cached = True - - def __bool__(self): - """Return True if file contains any pages.""" - return len(self.pages) > 0 - - def __len__(self): - """Return number of pages in file.""" - if not self._indexed: - self._seek(-1) - return len(self.pages) - - -class TiffPage(object): - """TIFF image file directory (IFD). - - Attributes - ---------- - index : int - Index of page in file. - dtype : numpy.dtype or None - Data type (native byte order) of the image in IFD. - shape : tuple - Dimensions of the image in IFD. - axes : str - Axes label codes: - 'X' width, 'Y' height, 'S' sample, 'I' image series|page|plane, - 'Z' depth, 'C' color|em-wavelength|channel, 'E' ex-wavelength|lambda, - 'T' time, 'R' region|tile, 'A' angle, 'P' phase, 'H' lifetime, - 'L' exposure, 'V' event, 'Q' unknown, '_' missing - tags : dict - Dictionary of tags in IFD. {tag.name: TiffTag} - colormap : numpy.ndarray - Color look up table, if exists. 
- - All attributes are read-only. - - Notes - ----- - The internal, normalized '_shape' attribute is 6 dimensional: - - 0 : number planes/images (stk, ij). - 1 : planar samplesperpixel. - 2 : imagedepth Z (sgi). - 3 : imagelength Y. - 4 : imagewidth X. - 5 : contig samplesperpixel. - - """ - - # default properties; will be updated from tags - subfiletype = 0 - imagewidth = 0 - imagelength = 0 - imagedepth = 1 - tilewidth = 0 - tilelength = 0 - tiledepth = 1 - bitspersample = 1 - samplesperpixel = 1 - sampleformat = 1 - rowsperstrip = 2**32 - 1 - compression = 1 - planarconfig = 1 - fillorder = 1 - photometric = 0 - predictor = 1 - extrasamples = 1 - colormap = None - software = '' - description = '' - description1 = '' - nodata = 0 - - def __init__(self, parent, index, keyframe=None): - """Initialize instance from file. - - The file handle position must be at offset to a valid IFD. - - """ - self.parent = parent - self.index = index - self.shape = () - self._shape = () - self.dtype = None - self._dtype = None - self.axes = '' - self.tags = tags = {} - self.dataoffsets = () - self.databytecounts = () - - tiff = parent.tiff - - # read TIFF IFD structure and its tags from file - fh = parent.filehandle - self.offset = fh.tell() # offset to this IFD - try: - tagno = struct.unpack( - tiff.tagnoformat, fh.read(tiff.tagnosize))[0] - if tagno > 4096: - raise TiffFileError('TiffPage %i: suspicious number of tags' - % self.index) - except Exception: - raise TiffFileError( - 'TiffPage %i: corrupted tag list at offset %i' - % (self.index, self.offset)) - - tagoffset = self.offset + tiff.tagnosize # fh.tell() - tagsize = tiff.tagsize - tagindex = -tagsize - - data = fh.read(tagsize * tagno) - - for _ in range(tagno): - tagindex += tagsize - try: - tag = TiffTag(parent, data[tagindex: tagindex + tagsize], - tagoffset + tagindex) - except TiffFileError as exc: - log_warning('TiffPage %i: %s: %s', self.index, - exc.__class__.__name__, exc) - continue - tagname = tag.name - if tagname not in tags: - name = tagname - tags[name] = tag - else: - # some files contain multiple tags with same code - # e.g. 
MicroManager files contain two ImageDescription tags - i = 1 - while i < 32: - name = '%s%i' % (tagname, i) - if name not in tags: - tags[name] = tag - break - i += 1 - else: - log_warning("TiffPage %i: suspicious number of '%s' tags", - self.index, tagname) - name = TIFF.TAG_ATTRIBUTES.get(name, '') - if name: - if name[:3] in 'sof des' and not isinstance(tag.value, str): - pass # wrong string type for software, description - else: - setattr(self, name, tag.value) - - if not tags: - return # found in FIBICS - - if 'SubfileType' in tags and self.subfiletype == 0: - sft = tags['SubfileType'].value - if sft == 2: - self.subfiletype = 0b1 # reduced image - elif sft == 3: - self.subfiletype = 0b10 # multi-page - - # consolidate private tags; remove them from self.tags - if self.is_andor: - self.andor_tags - elif self.is_epics: - self.epics_tags - # elif self.is_ndpi: - # self.ndpi_tags - - if self.is_sis and 'GPSTag' in tags: - # TODO: can't change tag.name - tags['OlympusSIS2'] = tags['GPSTag'] - del tags['GPSTag'] - - if self.is_lsm or (self.index and self.parent.is_lsm): - # correct non standard LSM bitspersample tags - tags['BitsPerSample']._fix_lsm_bitspersample(self) - if self.compression == 1 and self.predictor != 1: - # work around bug in LSM510 software - self.predictor = 1 - - if self.is_vista or (self.index and self.parent.is_vista): - # ISS Vista writes wrong ImageDepth tag - self.imagedepth = 1 - - if self.is_stk and 'UIC1tag' in tags and not tags['UIC1tag'].value: - # read UIC1tag now that plane count is known - uic1tag = tags['UIC1tag'] - fh.seek(uic1tag.valueoffset) - tags['UIC1tag'].value = read_uic1tag( - fh, tiff.byteorder, uic1tag.dtype, - uic1tag.count, None, tags['UIC2tag'].count) - - if 'IJMetadata' in tags: - # decode IJMetadata tag - try: - tags['IJMetadata'].value = imagej_metadata( - tags['IJMetadata'].value, - tags['IJMetadataByteCounts'].value, - tiff.byteorder) - except Exception as exc: - log_warning('TiffPage %i: %s: %s', self.index, - exc.__class__.__name__, exc) - - if 'BitsPerSample' in tags: - tag = tags['BitsPerSample'] - if tag.count == 1: - self.bitspersample = tag.value - else: - # LSM might list more items than samplesperpixel - value = tag.value[:self.samplesperpixel] - if any(v - value[0] for v in value): - self.bitspersample = value - else: - self.bitspersample = value[0] - - if 'SampleFormat' in tags: - tag = tags['SampleFormat'] - if tag.count == 1: - self.sampleformat = tag.value - else: - value = tag.value[:self.samplesperpixel] - if any(v - value[0] for v in value): - self.sampleformat = value - else: - self.sampleformat = value[0] - - if 'TileWidth' in tags: - self.rowsperstrip = None - elif 'ImageLength' in tags: - if 'RowsPerStrip' not in tags or tags['RowsPerStrip'].count > 1: - self.rowsperstrip = self.imagelength - self.rowsperstrip = min(self.rowsperstrip, self.imagelength) - # self.stripsperimage = int(math.floor( - # float(self.imagelength + self.rowsperstrip - 1) / - # self.rowsperstrip)) - - # determine dtype - dtype = self.sampleformat, self.bitspersample - dtype = TIFF.SAMPLE_DTYPES.get(dtype, None) - if dtype is not None: - dtype = numpy.dtype(dtype) - self.dtype = self._dtype = dtype - - # determine shape of data - imagelength = self.imagelength - imagewidth = self.imagewidth - imagedepth = self.imagedepth - samplesperpixel = self.samplesperpixel - - if self.is_stk: - if imagedepth != 1: - raise ValueError('STK imagedepth must be 1') - uictag = tags['UIC2tag'].value - planes = tags['UIC2tag'].count - if self.planarconfig == 1: - 
self._shape = ( - planes, - 1, - 1, - imagelength, - imagewidth, - samplesperpixel, - ) - if samplesperpixel == 1: - self.shape = (planes, imagelength, imagewidth) - self.axes = 'YX' - else: - self.shape = ( - planes, - imagelength, - imagewidth, - samplesperpixel, - ) - self.axes = 'YXS' - else: - self._shape = ( - planes, - samplesperpixel, - 1, - imagelength, - imagewidth, - 1, - ) - if samplesperpixel == 1: - self.shape = (planes, imagelength, imagewidth) - self.axes = 'YX' - else: - self.shape = ( - planes, - samplesperpixel, - imagelength, - imagewidth, - ) - self.axes = 'SYX' - # detect type of series - if planes == 1: - self.shape = self.shape[1:] - elif numpy.all(uictag['ZDistance'] != 0): - self.axes = 'Z' + self.axes - elif numpy.all(numpy.diff(uictag['TimeCreated']) != 0): - self.axes = 'T' + self.axes - else: - self.axes = 'I' + self.axes - elif self.photometric == 2 or samplesperpixel > 1: # PHOTOMETRIC.RGB - if self.planarconfig == 1: - self._shape = ( - 1, - 1, - imagedepth, - imagelength, - imagewidth, - samplesperpixel, - ) - if imagedepth == 1: - self.shape = (imagelength, imagewidth, samplesperpixel) - self.axes = 'YXS' - else: - self.shape = ( - imagedepth, - imagelength, - imagewidth, - samplesperpixel, - ) - self.axes = 'ZYXS' - else: - self._shape = ( - 1, - samplesperpixel, - imagedepth, - imagelength, - imagewidth, - 1, - ) - if imagedepth == 1: - self.shape = (samplesperpixel, imagelength, imagewidth) - self.axes = 'SYX' - else: - self.shape = ( - samplesperpixel, - imagedepth, - imagelength, - imagewidth, - ) - self.axes = 'SZYX' - else: - self._shape = (1, 1, imagedepth, imagelength, imagewidth, 1) - if imagedepth == 1: - self.shape = (imagelength, imagewidth) - self.axes = 'YX' - else: - self.shape = (imagedepth, imagelength, imagewidth) - self.axes = 'ZYX' - - # dataoffsets and databytecounts - if 'TileOffsets' in tags: - self.dataoffsets = tags['TileOffsets'].value - elif 'StripOffsets' in tags: - self.dataoffsets = tags['StripOffsets'].value - if 'TileByteCounts' in tags: - self.databytecounts = tags['TileByteCounts'].value - elif 'StripByteCounts' in tags: - self.databytecounts = tags['StripByteCounts'].value - else: - self.databytecounts = ( - product(self.shape) * (self.bitspersample // 8),) - if self.compression != 1: - log_warning('TiffPage %i: ByteCounts tag is missing', - self.index) - - if 'GDAL_NODATA' in tags: - try: - pytype = type(dtype.type(0).item()) - self.nodata = pytype(tags['GDAL_NODATA'].value) - except Exception: - pass - - @lazyattr - def decode(self): - """Decode single tile or strip.""" - raise NotImplementedError() - # TODO: retun function to decode single strips or tiles - - def asarray(self, out=None, squeeze=True, lock=None, reopen=True, - maxsize=None, maxworkers=None, validate=True): - """Read image data from file and return as numpy array. - - Raise ValueError if format is unsupported. - - Parameters - ---------- - out : numpy.ndarray, str, or file-like object - Buffer where image data will be saved. - If None (default), a new array will be created. - If numpy.ndarray, a writable array of compatible dtype and shape. - If 'memmap', directly memory-map the image data in the TIFF file - if possible; else create a memory-mapped array in a temporary file. - If str or open file, the file name or file object used to - create a memory-map to an array stored in a binary file on disk. - squeeze : bool - If True (default), all length-1 dimensions (except X and Y) are - squeezed out from the array. 
- If False, the shape of the returned array might be different from - the page.shape. - lock : {RLock, NullContext} - A reentrant lock used to synchronize seeks and reads from file. - If None (default), the lock of the parent's filehandle is used. - reopen : bool - If True (default) and the parent file handle is closed, the file - is temporarily re-opened and closed if no exception occurs. - maxsize: int - Maximum size of data before a ValueError is raised. - Can be used to catch DOS. Default: 16 TB. - maxworkers : int or None - Maximum number of threads to concurrently decode compressed - segments. If None (default), up to half the CPU cores are used. - See remarks in TiffFile.asarray. - validate : bool - If True (default), validate various parameters. - If None, only validate parameters and return None. - - Returns - ------- - numpy.ndarray - Numpy array of decompressed, depredicted, and unpacked image data - read from Strip/Tile Offsets/ByteCounts, formatted according to - shape and dtype metadata found in tags and parameters. - Photometric conversion, pre-multiplied alpha, orientation, and - colorimetry corrections are not applied. Specifically, CMYK images - are not converted to RGB, MinIsWhite images are not inverted, - and color palettes are not applied. An exception are YCbCr JPEG - compressed images, which will be converted to RGB. - - """ - # properties from TiffPage or TiffFrame - fh = self.parent.filehandle - byteorder = self.parent.tiff.byteorder - offsets, bytecounts = self._offsetscounts - self_ = self - self = self.keyframe # self or keyframe - - if not self._shape or product(self._shape) == 0: - return None - - tags = self.tags - - if validate or validate is None: - if maxsize is None: - maxsize = 2**44 - if maxsize and product(self._shape) > maxsize: - raise ValueError('TiffPage %i: data are too large %s' - % (self.index, str(self._shape))) - if self.dtype is None: - raise ValueError( - 'TiffPage %i: data type not supported: %s%i' - % (self.index, self.sampleformat, self.bitspersample)) - if self.compression not in TIFF.DECOMPESSORS: - raise ValueError('TiffPage %i: cannot decompress %s' - % (self.index, self.compression.name)) - if 'SampleFormat' in tags: - tag = tags['SampleFormat'] - if ( - tag.count != 1 - and any(i - tag.value[0] for i in tag.value) - ): - raise ValueError( - 'TiffPage %i: sample formats do not match %s' - % (self.index, tag.value)) - if self.is_subsampled and (self.compression not in (6, 7) - or self.planarconfig == 2): - raise NotImplementedError( - 'TiffPage %i: chroma subsampling not supported' - % self.index) - if validate is None: - return None - - lock = fh.lock if lock is None else lock - with lock: - closed = fh.closed - if closed: - if reopen: - fh.open() - else: - raise IOError('TiffPage %i: file handle is closed' - % self.index) - - dtype = self._dtype - shape = self._shape - imagewidth = self.imagewidth - imagelength = self.imagelength - imagedepth = self.imagedepth - bitspersample = self.bitspersample - typecode = byteorder + dtype.char - lsb2msb = self.fillorder == 2 - istiled = self.is_tiled - - if istiled: - tilewidth = self.tilewidth - tilelength = self.tilelength - tiledepth = self.tiledepth - tw = (imagewidth + tilewidth - 1) // tilewidth - tl = (imagelength + tilelength - 1) // tilelength - td = (imagedepth + tiledepth - 1) // tiledepth - tiledshape = (td, tl, tw) - tileshape = (tiledepth, tilelength, tilewidth, shape[-1]) - runlen = tilewidth - else: - runlen = imagewidth - - if self.planarconfig == 1: - runlen *= 
self.samplesperpixel - - if isinstance(out, str) and out == 'memmap' and self.is_memmappable: - # direct memory map array in file - with lock: - result = fh.memmap_array(typecode, shape, offset=offsets[0]) - elif self.is_contiguous: - # read contiguous bytes to array - if out is not None: - out = create_output(out, shape, dtype) - with lock: - fh.seek(offsets[0]) - result = fh.read_array(typecode, product(shape), out=out) - if lsb2msb: - bitorder_decode(result, out=result) - else: - # decompress, unpack,... individual strips or tiles - result = create_output(out, shape, dtype) - - decompress = TIFF.DECOMPESSORS[self.compression] - - if self.compression in (6, 7): # COMPRESSION.JPEG - colorspace = None - outcolorspace = None - jpegtables = None - if lsb2msb: - log_warning('TiffPage %i: disabling LSB2MSB for JPEG', - self.index) - lsb2msb = False - if 'JPEGTables' in tags: - # load JPEGTables from TiffFrame - jpegtables = self_._gettags({347}, lock=lock)[0][1].value - # TODO: obtain table from OJPEG tags - # elif ('JPEGInterchangeFormat' in tags and - # 'JPEGInterchangeFormatLength' in tags and - # tags['JPEGInterchangeFormat'].value != offsets[0]): - # fh.seek(tags['JPEGInterchangeFormat'].value) - # fh.read(tags['JPEGInterchangeFormatLength'].value) - if 'ExtraSamples' in tags: - pass - elif self.photometric == 6: - # YCBCR -> RGB - outcolorspace = 'RGB' - elif self.photometric == 2: - if self.planarconfig == 1: - colorspace = outcolorspace = 'RGB' - else: - outcolorspace = TIFF.PHOTOMETRIC(self.photometric).name - if istiled: - heightwidth = tilelength, tilewidth - else: - heightwidth = imagelength, imagewidth - - def decompress(data, bitspersample=bitspersample, - jpegtables=jpegtables, colorspace=colorspace, - outcolorspace=outcolorspace, shape=heightwidth, - out=None, _decompress=decompress): - return _decompress(data, bitspersample, jpegtables, - colorspace, outcolorspace, shape, out) - - def unpack(data): - return data.reshape(-1) - - elif bitspersample in (8, 16, 32, 64, 128): - if (bitspersample * runlen) % 8: - raise ValueError( - 'TiffPage %i: data and sample size mismatch' - % self.index) - if self.predictor == 3: # PREDICTOR.FLOATINGPOINT - # the floating-point horizontal differencing decoder - # needs the raw byte order - typecode = dtype.char - - def unpack(data, typecode=typecode, out=None): - try: - # read only numpy array - return numpy.frombuffer(data, typecode) - except ValueError: - # strips may be missing EOI - # log_warning('TiffPage.asarray: ...') - bps = bitspersample // 8 - xlen = (len(data) // bps) * bps - return numpy.frombuffer(data[:xlen], typecode) - - elif isinstance(bitspersample, tuple): - - def unpack(data, out=None): - return unpack_rgb(data, typecode, bitspersample) - - else: - - def unpack(data, out=None): - return packints_decode(data, typecode, bitspersample, - runlen) - - # TODO: store decode function for future use - # TODO: unify tile and strip decoding - if istiled: - unpredict = TIFF.UNPREDICTORS[self.predictor] - - def decode(tile, tileindex, tileshape=tileshape, - tiledshape=tiledshape, lsb2msb=lsb2msb, - decompress=decompress, unpack=unpack, - unpredict=unpredict, nodata=self.nodata, - out=result[0]): - return tile_decode(tile, tileindex, tileshape, tiledshape, - lsb2msb, decompress, unpack, unpredict, - nodata, out) - - tileiter = fh.read_segments(offsets, bytecounts, lock) - - if self.compression == 1 or len(offsets) < 3: - maxworkers = 1 - elif maxworkers is None or maxworkers < 1: - import multiprocessing - maxworkers = 
max(multiprocessing.cpu_count() // 2, 1) - - if maxworkers < 2: - for i, tile in enumerate(tileiter): - decode(tile, i) - else: - # decode first tile un-threaded to catch exceptions - decode(next(tileiter), 0) - with ThreadPoolExecutor(maxworkers) as executor: - executor.map(decode, tileiter, range(1, len(offsets))) - - else: - stripsize = self.rowsperstrip * self.imagewidth - if self.planarconfig == 1: - stripsize *= self.samplesperpixel - outsize = stripsize * self.dtype.itemsize - result = result.reshape(-1) - index = 0 - for strip in fh.read_segments(offsets, bytecounts, lock): - if strip is None: - result[index:index + stripsize] = self.nodata - index += stripsize - continue - if lsb2msb: - strip = bitorder_decode(strip, out=strip) - strip = decompress(strip, out=outsize) - strip = unpack(strip) - size = min(result.size, strip.size, stripsize, - result.size - index) - result[index:index + size] = strip[:size] - del strip - index += size - - result.shape = self._shape - - if self.predictor != 1 and not (istiled and not self.is_contiguous): - unpredict = TIFF.UNPREDICTORS[self.predictor] - result = unpredict(result, axis=-2, out=result) - - if squeeze: - try: - result.shape = self.shape - except ValueError: - log_warning('TiffPage %i: failed to reshape %s to %s', - self.index, result.shape, self.shape) - - if closed: - # TODO: file should remain open if an exception occurred above - fh.close() - return result - - def asrgb(self, uint8=False, alpha=None, colormap=None, - dmin=None, dmax=None, **kwargs): - """Return image data as RGB(A). - - Work in progress. - - """ - data = self.asarray(**kwargs) - self = self.keyframe # self or keyframe - photometric = self.photometric - PHOTOMETRIC = TIFF.PHOTOMETRIC - - if photometric == PHOTOMETRIC.PALETTE: - colormap = self.colormap - if ( - colormap.shape[1] < 2**self.bitspersample - or self.dtype.char not in 'BH' - ): - raise ValueError('TiffPage %i: cannot apply colormap' - % self.index) - if uint8: - if colormap.max() > 255: - colormap >>= 8 - colormap = colormap.astype('uint8') - if 'S' in self.axes: - data = data[..., 0] if self.planarconfig == 1 else data[0] - data = apply_colormap(data, colormap) - - elif photometric == PHOTOMETRIC.RGB: - if 'ExtraSamples' in self.tags: - if alpha is None: - alpha = TIFF.EXTRASAMPLE - extrasamples = self.extrasamples - if self.tags['ExtraSamples'].count == 1: - extrasamples = (extrasamples,) - for i, exs in enumerate(extrasamples): - if exs in alpha: - if self.planarconfig == 1: - data = data[..., [0, 1, 2, 3 + i]] - else: - data = data[:, [0, 1, 2, 3 + i]] - break - else: - if self.planarconfig == 1: - data = data[..., :3] - else: - data = data[:, :3] - # TODO: convert to uint8? 
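# Note on the ExtraSamples handling above: the loop keeps R, G, B plus the
# first extra sample whose ExtraSamples value is in `alpha`
# (TIFF.EXTRASAMPLE when no alpha argument is given); if none of the extra
# samples match, only the three colour channels are kept.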
- - elif photometric == PHOTOMETRIC.MINISBLACK: - raise NotImplementedError() - elif photometric == PHOTOMETRIC.MINISWHITE: - raise NotImplementedError() - elif photometric == PHOTOMETRIC.SEPARATED: - raise NotImplementedError() - else: - raise NotImplementedError() - return data - - def _gettags(self, codes=None, lock=None): - """Return list of (code, TiffTag).""" - tags = [] - for tag in self.tags.values(): - code = tag.code - if not codes or code in codes: - tags.append((code, tag)) - return tags - - def aspage(self): - """Return self.""" - return self - - @property - def keyframe(self): - """Return keyframe, self.""" - return self - - @keyframe.setter - def keyframe(self, index): - """Set keyframe, NOP.""" - return - - @lazyattr - def pages(self): - """Return sequence of sub-pages (SubIFDs).""" - if 'SubIFDs' not in self.tags: - return tuple() - return TiffPages(self) - - @property - def hash(self): - """Return checksum to identify pages in same series.""" - return hash( - self._shape - + ( - self.tilewidth, - self.tilelength, - self.tiledepth, - self.bitspersample, - self.fillorder, - self.predictor, - self.extrasamples, - self.photometric, - self.compression, - self.planarconfig, - ) - ) - - @lazyattr - def _offsetscounts(self): - """Return simplified offsets and bytecounts.""" - if self.is_contiguous: - offset, bytecount = self.is_contiguous - return ((offset,), (bytecount,)) - return self.dataoffsets, self.databytecounts - - @lazyattr - def is_contiguous(self): - """Return offset and size of contiguous data, else None. - - Excludes prediction and fill_order. - - """ - if self.compression != 1 or self.bitspersample not in (8, 16, 32, 64): - return None - if 'TileWidth' in self.tags: - if ( - self.imagewidth != self.tilewidth - or self.imagelength % self.tilelength - or self.tilewidth % 16 - or self.tilelength % 16 - ): - return None - if ( - 'ImageDepth' in self.tags - and 'TileDepth' in self.tags - and (self.imagelength != self.tilelength - or self.imagedepth % self.tiledepth) - ): - return None - offsets = self.dataoffsets - bytecounts = self.databytecounts - if len(offsets) == 1: - return offsets[0], bytecounts[0] - if self.is_stk or self.is_lsm: - return offsets[0], sum(bytecounts) - if all( - bytecounts[i] != 0 and offsets[i] + bytecounts[i] == offsets[i + 1] - for i in range(len(offsets) - 1) - ): - return offsets[0], sum(bytecounts) - return None - - @lazyattr - def is_final(self): - """Return if page's image data are stored in final form. - - Excludes byte-swapping. - - """ - return ( - self.is_contiguous - and self.fillorder == 1 - and self.predictor == 1 - and not self.is_subsampled - ) - - @lazyattr - def is_memmappable(self): - """Return if page's image data in file can be memory-mapped.""" - return ( - self.parent.filehandle.is_file - and self.is_final - # and (self.bitspersample == 8 or self.parent.isnative) - # aligned? 
- and self.is_contiguous[0] % self.dtype.itemsize == 0 - ) - - def __str__(self, detail=0, width=79): - """Return string containing information about page.""" - if self.keyframe != self: - return TiffFrame.__str__(self, detail, width) - attr = '' - for name in ('memmappable', 'final', 'contiguous'): - attr = getattr(self, 'is_' + name) - if attr: - attr = name.upper() - break - - def tostr(name, skip=1): - obj = getattr(self, name) - try: - value = getattr(obj, 'name') - except AttributeError: - return '' - if obj != skip: - return value - return '' - - info = ' '.join( - s.lower() - for s in ( - 'x'.join(str(i) for i in self.shape), - '%s%s' - % ( - TIFF.SAMPLEFORMAT(self.sampleformat).name, - self.bitspersample, - ), - ' '.join( - i - for i in ( - TIFF.PHOTOMETRIC(self.photometric).name, - 'REDUCED' if self.is_reduced else '', - 'MASK' if self.is_mask else '', - 'TILED' if self.is_tiled else '', - tostr('compression'), - tostr('planarconfig'), - tostr('predictor'), - tostr('fillorder'), - ) - + tuple(f.upper() for f in self.flags) - + (attr,) - if i - ), - ) - if s - ) - info = 'TiffPage %i @%i %s' % (self.index, self.offset, info) - if detail <= 0: - return info - info = [info] - tags = self.tags - tlines = [] - vlines = [] - for tag in sorted(tags.values(), key=lambda x: x.code): - value = tag.__str__(width=width + 1) - tlines.append(value[:width].strip()) - if detail > 1 and len(value) > width: - name = tag.name.upper() - if detail <= 2 and ('COUNTS' in name or 'OFFSETS' in name): - value = pformat(tag.value, width=width, height=detail * 4) - else: - value = pformat(tag.value, width=width, height=detail * 12) - vlines.append('%s\n%s' % (tag.name, value)) - info.append('\n'.join(tlines)) - if detail > 1: - info.append('\n\n'.join(vlines)) - for name in ('ndpi',): - name = name + '_tags' - attr = getattr(self, name, False) - if attr: - info.append('%s\n%s' % (name.upper(), pformat(attr))) - if detail > 3: - try: - info.append( - 'DATA\n%s' - % pformat(self.asarray(), width=width, height=detail * 8) - ) - except Exception: - pass - return '\n\n'.join(info) - - @lazyattr - def flags(self): - """Return set of flags.""" - return set( - name.lower() - for name in sorted(TIFF.FILE_FLAGS) - if getattr(self, 'is_' + name) - ) - - @property - def ndim(self): - """Return number of array dimensions.""" - return len(self.shape) - - @property - def size(self): - """Return number of elements in array.""" - return product(self.shape) - - @lazyattr - def andor_tags(self): - """Return consolidated metadata from Andor tags as dict. - - Remove Andor tags from self.tags. - - """ - if not self.is_andor: - return None - tags = self.tags - result = {'Id': tags['AndorId'].value} - for tag in list(self.tags.values()): - code = tag.code - if not 4864 < code < 5031: - continue - value = tag.value - name = tag.name[5:] if len(tag.name) > 5 else tag.name - result[name] = value - del tags[tag.name] - return result - - @lazyattr - def epics_tags(self): - """Return consolidated metadata from EPICS areaDetector tags as dict. - - Remove areaDetector tags from self.tags. 
- - """ - if not self.is_epics: - return None - result = {} - tags = self.tags - for tag in list(self.tags.values()): - code = tag.code - if not 65000 <= code < 65500: - continue - value = tag.value - if code == 65000: - result['timeStamp'] = datetime.datetime.fromtimestamp( - float(value)) - elif code == 65001: - result['uniqueID'] = int(value) - elif code == 65002: - result['epicsTSSec'] = int(value) - elif code == 65003: - result['epicsTSNsec'] = int(value) - else: - key, value = value.split(':', 1) - result[key] = astype(value) - del tags[tag.name] - return result - - @lazyattr - def ndpi_tags(self): - """Return consolidated metadata from Hamamatsu NDPI as dict.""" - if not self.is_ndpi: - return None - tags = self.tags - result = {} - for name in ('Make', 'Model', 'Software'): - result[name] = tags[name].value - for code, name in TIFF.NDPI_TAGS.items(): - code = str(code) - if code in tags: - result[name] = tags[code].value - # del tags[code] - return result - - @lazyattr - def geotiff_tags(self): - """Return consolidated metadata from GeoTIFF tags as dict.""" - if not self.is_geotiff: - return None - tags = self.tags - - gkd = tags['GeoKeyDirectoryTag'].value - if gkd[0] != 1: - log_warning('GeoTIFF tags: invalid GeoKeyDirectoryTag') - return {} - - result = { - 'KeyDirectoryVersion': gkd[0], - 'KeyRevision': gkd[1], - 'KeyRevisionMinor': gkd[2], - # 'NumberOfKeys': gkd[3], - } - # deltags = ['GeoKeyDirectoryTag'] - geokeys = TIFF.GEO_KEYS - geocodes = TIFF.GEO_CODES - for index in range(gkd[3]): - try: - keyid, tagid, count, offset = gkd[4 + index * 4: index * 4 + 8] - except Exception as exception: - log_warning('GeoTIFF tags: %s', str(exception)) - continue - keyid = geokeys.get(keyid, keyid) - if tagid == 0: - value = offset - else: - tagname = TIFF.TAGS[tagid] - # deltags.append(tagname) - try: - value = tags[tagname].value[offset: offset + count] - except KeyError: - log_warning('GeoTIFF tags: %s not found', tagname) - continue - if tagid == 34737 and count > 1 and value[-1] == '|': - value = value[:-1] - value = value if count > 1 else value[0] - if keyid in geocodes: - try: - value = geocodes[keyid](value) - except Exception: - pass - result[keyid] = value - - if 'IntergraphMatrixTag' in tags: - value = tags['IntergraphMatrixTag'].value - value = numpy.array(value) - if len(value) == 16: - value = value.reshape((4, 4)).tolist() - result['IntergraphMatrix'] = value - if 'ModelPixelScaleTag' in tags: - value = numpy.array(tags['ModelPixelScaleTag'].value).tolist() - result['ModelPixelScale'] = value - if 'ModelTiepointTag' in tags: - value = tags['ModelTiepointTag'].value - value = numpy.array(value).reshape((-1, 6)).squeeze().tolist() - result['ModelTiepoint'] = value - if 'ModelTransformationTag' in tags: - value = tags['ModelTransformationTag'].value - value = numpy.array(value).reshape((4, 4)).tolist() - result['ModelTransformation'] = value - # if 'ModelPixelScaleTag' in tags and 'ModelTiepointTag' in tags: - # sx, sy, sz = tags['ModelPixelScaleTag'].value - # tiepoints = tags['ModelTiepointTag'].value - # transforms = [] - # for tp in range(0, len(tiepoints), 6): - # i, j, k, x, y, z = tiepoints[tp:tp+6] - # transforms.append([ - # [sx, 0.0, 0.0, x - i * sx], - # [0.0, -sy, 0.0, y + j * sy], - # [0.0, 0.0, sz, z - k * sz], - # [0.0, 0.0, 0.0, 1.0]]) - # if len(tiepoints) == 6: - # transforms = transforms[0] - # result['ModelTransformation'] = transforms - - if 'RPCCoefficientTag' in tags: - rpcc = tags['RPCCoefficientTag'].value - result['RPCCoefficient'] = { - 
'ERR_BIAS': rpcc[0], - 'ERR_RAND': rpcc[1], - 'LINE_OFF': rpcc[2], - 'SAMP_OFF': rpcc[3], - 'LAT_OFF': rpcc[4], - 'LONG_OFF': rpcc[5], - 'HEIGHT_OFF': rpcc[6], - 'LINE_SCALE': rpcc[7], - 'SAMP_SCALE': rpcc[8], - 'LAT_SCALE': rpcc[9], - 'LONG_SCALE': rpcc[10], - 'HEIGHT_SCALE': rpcc[11], - 'LINE_NUM_COEFF': rpcc[12:33], - 'LINE_DEN_COEFF ': rpcc[33:53], - 'SAMP_NUM_COEFF': rpcc[53:73], - 'SAMP_DEN_COEFF': rpcc[73:], - } - - return result - - @property - def is_reduced(self): - """Page is reduced image of another image.""" - return self.subfiletype & 0b1 - - @property - def is_multipage(self): - """Page is part of multi-page image.""" - return self.subfiletype & 0b10 - - @property - def is_mask(self): - """Page is transparency mask for another image.""" - return self.subfiletype & 0b100 - - @property - def is_mrc(self): - """Page is part of Mixed Raster Content.""" - return self.subfiletype & 0b1000 - - @property - def is_tiled(self): - """Page contains tiled image.""" - return 'TileWidth' in self.tags - - @property - def is_subsampled(self): - """Page contains chroma subsampled image.""" - if 'YCbCrSubSampling' in self.tags: - return self.tags['YCbCrSubSampling'].value != (1, 1) - return ( - self.compression == 7 - and self.planarconfig == 1 - and self.photometric in (2, 6) - ) - - @lazyattr - def is_imagej(self): - """Return ImageJ description if exists, else None.""" - for description in (self.description, self.description1): - if not description: - return None - if description[:7] == 'ImageJ=': - return description - return None - - @lazyattr - def is_shaped(self): - """Return description containing array shape if exists, else None.""" - for description in (self.description, self.description1): - if not description: - return None - if description[:1] == '{' and '"shape":' in description: - return description - if description[:6] == 'shape=': - return description - return None - - @property - def is_mdgel(self): - """Page contains MDFileTag tag.""" - return 'MDFileTag' in self.tags - - @property - def is_mediacy(self): - """Page contains Media Cybernetics Id tag.""" - return ( - 'MC_Id' in self.tags and self.tags['MC_Id'].value[:7] == b'MC TIFF' - ) - - @property - def is_stk(self): - """Page contains UIC2Tag tag.""" - return 'UIC2tag' in self.tags - - @property - def is_lsm(self): - """Page contains CZ_LSMINFO tag.""" - return 'CZ_LSMINFO' in self.tags - - @property - def is_fluoview(self): - """Page contains FluoView MM_STAMP tag.""" - return 'MM_Stamp' in self.tags - - @property - def is_nih(self): - """Page contains NIH image header.""" - return 'NIHImageHeader' in self.tags - - @property - def is_sgi(self): - """Page contains SGI image and tile depth tags.""" - return 'ImageDepth' in self.tags and 'TileDepth' in self.tags - - @property - def is_vista(self): - """Software tag is 'ISS Vista'.""" - return self.software == 'ISS Vista' - - @property - def is_metaseries(self): - """Page contains MDS MetaSeries metadata in ImageDescription tag.""" - if self.index > 1 or self.software != 'MetaSeries': - return False - d = self.description - return d.startswith('') and d.endswith('') - - @property - def is_ome(self): - """Page contains OME-XML in ImageDescription tag.""" - if self.index > 1 or not self.description: - return False - d = self.description - return ((d[:13] == '') - - @property - def is_scn(self): - """Page contains Leica SCN XML in ImageDescription tag.""" - if self.index > 1 or not self.description: - return False - d = self.description - return d[:14] == '' - - @property - def 
is_micromanager(self): - """Page contains Micro-Manager metadata.""" - return 'MicroManagerMetadata' in self.tags - - @property - def is_andor(self): - """Page contains Andor Technology tags.""" - return 'AndorId' in self.tags - - @property - def is_pilatus(self): - """Page contains Pilatus tags.""" - return self.software[:8] == 'TVX TIFF' and self.description[:2] == '# ' - - @property - def is_epics(self): - """Page contains EPICS areaDetector tags.""" - return ( - self.description == 'EPICS areaDetector' - or self.software == 'EPICS areaDetector' - ) - - @property - def is_tvips(self): - """Page contains TVIPS metadata.""" - return 'TVIPS' in self.tags - - @property - def is_fei(self): - """Page contains SFEG or HELIOS metadata.""" - return 'FEI_SFEG' in self.tags or 'FEI_HELIOS' in self.tags - - @property - def is_sem(self): - """Page contains Zeiss SEM metadata.""" - return 'CZ_SEM' in self.tags - - @property - def is_svs(self): - """Page contains Aperio metadata.""" - return self.description[:20] == 'Aperio Image Library' - - @property - def is_scanimage(self): - """Page contains ScanImage metadata.""" - return ( - self.description[:12] == 'state.config' - or self.software[:22] == 'SI.LINE_FORMAT_VERSION' - or 'scanimage.SI' in self.description[-256:] - ) - - @property - def is_qpi(self): - """Page contains PerkinElmer tissue images metadata.""" - # The ImageDescription tag contains XML with a top-level - # element - return self.software[:15] == 'PerkinElmer-QPI' - - @property - def is_geotiff(self): - """Page contains GeoTIFF metadata.""" - return 'GeoKeyDirectoryTag' in self.tags - - @property - def is_sis(self): - """Page contains Olympus SIS metadata.""" - return 'OlympusSIS' in self.tags or 'OlympusINI' in self.tags - - @lazyattr # must not be property; tag 65420 is later removed - def is_ndpi(self): - """Page contains NDPI metadata.""" - return '65420' in self.tags and 'Make' in self.tags - - -class TiffFrame(object): - """Lightweight TIFF image file directory (IFD). - - Only a limited number of tag values are read from file, e.g. StripOffsets, - and StripByteCounts. Other tag values are assumed to be identical with a - specified TiffPage instance, the keyframe. - - TiffFrame is intended to reduce resource usage and speed up reading image - data from file, not for introspection of metadata. - - Not compatible with Python 2. - - """ - - __slots__ = 'index', 'parent', 'offset', '_offsetscounts', '_keyframe' - - is_mdgel = False - pages = None - tags = {} - - def __init__(self, parent, index, offset=None, keyframe=None, - offsets=None, bytecounts=None): - """Initialize TiffFrame from file or values. - - The file handle position must be at the offset to a valid IFD. 
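# Illustrative use of frames via TiffPages rather than direct construction,
# assuming an open, multi-page TiffFile `tif`.
tif.pages.useframes = True    # subsequent lookups typically yield TiffFrames
frame = tif.pages[1]          # shares tif.pages[0] as its keyframe
data = frame.asarray()        # decoded using the keyframe's shape/dtype/tags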
- - """ - self._keyframe = None - self.parent = parent - self.index = index - self.offset = offset - - if offsets is not None: - # initialize "virtual frame" from offsets and bytecounts - self._offsetscounts = offsets, bytecounts - self._keyframe = keyframe - return - - if offset is None: - self.offset = parent.filehandle.tell() - else: - parent.filehandle.seek(offset) - - if keyframe is None: - tags = {273, 279, 324, 325} - elif keyframe.is_contiguous: - tags = {256, 273, 324} - else: - tags = {256, 273, 279, 324, 325} - - dataoffsets = databytecounts = [] - - for code, tag in self._gettags(tags): - if code == 273 or code == 324: - dataoffsets = tag.value - elif code == 279 or code == 325: - databytecounts = tag.value - elif code == 256 and keyframe.imagewidth != tag.value: - raise RuntimeError( - 'TiffFrame %i: incompatible keyframe' % index) - # elif code == 270: - # tagname = tag.name - # if tagname not in tags: - # tags[tagname] = bytes2str(tag.value) - # elif 'ImageDescription1' not in tags: - # tags['ImageDescription1'] = bytes2str(tag.value) - # else: - # tags[tag.name] = tag.value - - if not dataoffsets: - log_warning('TiffFrame %i: missing required tags', index) - - self._offsetscounts = dataoffsets, databytecounts - - if keyframe is not None: - self.keyframe = keyframe - - def _gettags(self, codes=None, lock=None): - """Return list of (code, TiffTag) from file.""" - fh = self.parent.filehandle - tiff = self.parent.tiff - unpack = struct.unpack - lock = NullContext() if lock is None else lock - tags = [] - - with lock: - fh.seek(self.offset) - try: - tagno = unpack(tiff.tagnoformat, fh.read(tiff.tagnosize))[0] - if tagno > 4096: - raise TiffFileError( - 'TiffFrame %i: suspicious number of tags' % self.index) - except Exception: - raise TiffFileError( - 'TiffFrame %i: corrupted page list at offset %i' - % (self.index, self.offset)) - - tagoffset = self.offset + tiff.tagnosize # fh.tell() - tagsize = tiff.tagsize - tagindex = -tagsize - codeformat = tiff.tagformat1[:2] - tagbytes = fh.read(tagsize * tagno) - - for _ in range(tagno): - tagindex += tagsize - code = unpack(codeformat, tagbytes[tagindex: tagindex + 2])[0] - if codes and code not in codes: - continue - try: - tag = TiffTag(self.parent, - tagbytes[tagindex: tagindex + tagsize], - tagoffset + tagindex) - except TiffFileError as exc: - log_warning('TiffFrame %i: %s: %s', - self.index, exc.__class__.__name__, exc) - continue - tags.append((code, tag)) - - return tags - - def aspage(self): - """Return TiffPage from file.""" - if self.offset is None: - raise ValueError( - 'TiffFrame %i: cannot return virtual frame as page' - % self.index) - self.parent.filehandle.seek(self.offset) - return TiffPage(self.parent, index=self.index) - - def asarray(self, *args, **kwargs): - """Read image data from file and return as numpy array.""" - # TODO: fix TypeError on Python 2 - # "TypeError: unbound method asarray() must be called with TiffPage - # instance as first argument (got TiffFrame instance instead)" - if self._keyframe is None: - raise RuntimeError('TiffFrame %i: keyframe not set' % self.index) - kwargs['validate'] = False - return TiffPage.asarray(self, *args, **kwargs) - - def asrgb(self, *args, **kwargs): - """Read image data from file and return RGB image as numpy array.""" - if self._keyframe is None: - raise RuntimeError('TiffFrame %i: keyframe not set' % self.index) - kwargs['validate'] = False - return TiffPage.asrgb(self, *args, **kwargs) - - @property - def keyframe(self): - """Return keyframe.""" - return 
self._keyframe - - @keyframe.setter - def keyframe(self, keyframe): - """Set keyframe.""" - if self._keyframe == keyframe: - return - if self._keyframe is not None: - raise RuntimeError( - 'TiffFrame %i: cannot reset keyframe' % self.index) - if len(self._offsetscounts[0]) != len(keyframe.dataoffsets): - raise RuntimeError( - 'TiffFrame %i: incompatible keyframe' % self.index) - if keyframe.is_tiled: - pass - if keyframe.is_contiguous: - self._offsetscounts = ( - (self._offsetscounts[0][0], ), - (keyframe.is_contiguous[1], ), - ) - self._keyframe = keyframe - - @property - def is_contiguous(self): - """Return offset and size of contiguous data, else None.""" - if self._keyframe is None: - raise RuntimeError('TiffFrame %i: keyframe not set' % self.index) - if self._keyframe.is_contiguous: - return self._offsetscounts[0][0], self._keyframe.is_contiguous[1] - return None - - @property - def is_memmappable(self): - """Return if page's image data in file can be memory-mapped.""" - if self._keyframe is None: - raise RuntimeError('TiffFrame %i: keyframe not set' % self.index) - return self._keyframe.is_memmappable - - @property - def hash(self): - """Return checksum to identify pages in same series.""" - if self._keyframe is None: - raise RuntimeError('TiffFrame %i: keyframe not set' % self.index) - return self._keyframe.hash - - def __getattr__(self, name): - """Return attribute from keyframe.""" - if name in TIFF.FRAME_ATTRS: - return getattr(self._keyframe, name) - # this error could be raised because an AttributeError was - # raised inside a @property function - raise AttributeError("'%s' object has no attribute '%s'" - % (self.__class__.__name__, name)) - - def __str__(self, detail=0, width=79): - """Return string containing information about frame.""" - if self._keyframe is None: - info = '' - kf = None - else: - info = ' '.join(s for s in ('x'.join(str(i) for i in self.shape), - str(self.dtype))) - kf = TiffPage.__str__(self._keyframe, width=width - 11) - if detail > 3: - of, bc = self._offsetscounts - of = pformat(of, width=width - 9, height=detail - 3) - bc = pformat(bc, width=width - 13, height=detail - 3) - info = '\n Keyframe %s\n Offsets %s\n Bytecounts %s' % (kf, of, bc) - return 'TiffFrame %i @%s %s' % (self.index, self.offset, info) - - -class TiffTag(object): - """TIFF tag structure. - - Attributes - ---------- - name : string - Name of tag. - code : int - Decimal code of tag. - dtype : str - Datatype of tag data. One of TIFF DATA_FORMATS. - count : int - Number of values. - value : various types - Tag data as Python object. - ImageSourceData : int - Location of value in file. - - All attributes are read-only. 
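# Illustrative access pattern, assuming an open TiffFile `tif`.
tag = tif.pages[0].tags['ImageWidth']  # TiffTag looked up by name
width = tag.value                      # decoded value, an integer here
code, dtype = tag.code, tag.dtype      # 256 and a TIFF.DATA_FORMATS string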
- - """ - - __slots__ = ('code', 'count', 'dtype', 'value', 'valueoffset') - - def __init__(self, parent, tagheader, tagoffset): - """Initialize instance from tag header.""" - fh = parent.filehandle - tiff = parent.tiff - byteorder = tiff.byteorder - offsetsize = tiff.offsetsize - unpack = struct.unpack - - self.valueoffset = tagoffset + offsetsize + 4 - code, type_ = unpack(tiff.tagformat1, tagheader[:4]) - count, value = unpack(tiff.tagformat2, tagheader[4:]) - - try: - dtype = TIFF.DATA_FORMATS[type_] - except KeyError: - raise TiffFileError('unknown tag data type %i' % type_) - - fmt = '%s%i%s' % (byteorder, count * int(dtype[0]), dtype[1]) - size = struct.calcsize(fmt) - if size > offsetsize or code in TIFF.TAG_READERS: - self.valueoffset = offset = unpack(tiff.offsetformat, value)[0] - if offset < 8 or offset > fh.size - size: - raise TiffFileError('invalid tag value offset') - # if offset % 2: - # log_warning('TiffTag: value does not begin on word boundary') - fh.seek(offset) - if code in TIFF.TAG_READERS: - readfunc = TIFF.TAG_READERS[code] - value = readfunc(fh, byteorder, dtype, count, offsetsize) - elif type_ == 7 or (count > 1 and dtype[-1] == 'B'): - value = read_bytes(fh, byteorder, dtype, count, offsetsize) - elif code in TIFF.TAGS or dtype[-1] == 's': - value = unpack(fmt, fh.read(size)) - else: - value = read_numpy(fh, byteorder, dtype, count, offsetsize) - elif dtype[-1] == 'B' or type_ == 7: - value = value[:size] - else: - value = unpack(fmt, value[:size]) - - process = ( - code not in TIFF.TAG_READERS - and code not in TIFF.TAG_TUPLE - and type_ != 7 - ) - if process and dtype[-1] == 's' and isinstance(value[0], bytes): - # TIFF ASCII fields can contain multiple strings, - # each terminated with a NUL - value = value[0] - try: - value = bytes2str(stripascii(value).strip()) - except UnicodeDecodeError: - # TODO: this doesn't work on Python 2 - log_warning( - 'TiffTag %i: coercing invalid ASCII to bytes', code) - dtype = '1B' - else: - if code in TIFF.TAG_ENUM: - t = TIFF.TAG_ENUM[code] - try: - value = tuple(t(v) for v in value) - except ValueError as exc: - log_warning('TiffTag %i: %s', code, str(exc)) - if process: - if len(value) == 1: - value = value[0] - - self.code = code - self.dtype = dtype - self.count = count - self.value = value - - @property - def name(self): - """Return name of tag from TIFF.TAGS registry.""" - try: - return TIFF.TAGS[self.code] - except KeyError: - return str(self.code) - - def _fix_lsm_bitspersample(self, parent): - """Correct LSM bitspersample tag. - - Old LSM writers may use a separate region for two 16-bit values, - although they fit into the tag value element of the tag. 
- - """ - if self.code != 258 or self.count != 2: - return - # TODO: test this case; need example file - log_warning('TiffTag %i: correcting LSM bitspersample tag', self.code) - value = struct.pack(' 0 and offsets[0] > 0: - if lock is None: - lock = self._lock - with lock: - self.seek(offsets[0]) - yield self._fh.read(bytecounts[0]) - else: - yield None - return - - if lock is None: - lock = self._lock - if buffersize is None: - buffersize = 2**26 # 64 MB - - seek = self.seek - read = self._fh.read - index = 0 - while index < length: - segments = [] - with lock: - size = 0 - while size < buffersize and index < length: - offset = offsets[index] - bytecount = bytecounts[index] - if offset > 0 and bytecount > 0: - seek(offset) - segments.append(read(bytecount)) - # buffer = bytearray(bytecount) - # n = fh.readinto(buffer) - # data.append(buffer[:n]) - size += bytecount - else: - segments.append(None) - index += 1 - for segment in segments: - yield segment - - def read_record(self, dtype, shape=1, byteorder=None): - """Return numpy record from file.""" - rec = numpy.rec - try: - record = rec.fromfile(self._fh, dtype, shape, byteorder=byteorder) - except Exception: - dtype = numpy.dtype(dtype) - if shape is None: - shape = self._size // dtype.itemsize - size = product(sequence(shape)) * dtype.itemsize - data = self._fh.read(size) - record = rec.fromstring(data, dtype, shape, byteorder=byteorder) - return record[0] if shape == 1 else record - - def write_empty(self, size): - """Append size bytes to file. Position must be at end of file.""" - if size < 1: - return - self._fh.seek(size - 1, 1) - self._fh.write(b'\x00') - - def write_array(self, data): - """Write numpy array to binary file.""" - try: - data.tofile(self._fh) - except Exception: - # BytesIO - self._fh.write(data.tostring()) - - def tell(self): - """Return file's current position.""" - return self._fh.tell() - self._offset - - def seek(self, offset, whence=0): - """Set file's current position.""" - if self._offset: - if whence == 0: - self._fh.seek(self._offset + offset, whence) - return - if whence == 2 and self._size > 0: - self._fh.seek(self._offset + self._size + offset, 0) - return - self._fh.seek(offset, whence) - - def close(self): - """Close file.""" - if self._close and self._fh: - self._fh.close() - self._fh = None - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - - def __getattr__(self, name): - """Return attribute from underlying file object.""" - if self._offset: - warnings.warn( - "FileHandle: '%s' not implemented for embedded files" % name) - return getattr(self._fh, name) - - @property - def name(self): - return self._name - - @property - def dirname(self): - return self._dir - - @property - def path(self): - return os.path.join(self._dir, self._name) - - @property - def size(self): - return self._size - - @property - def closed(self): - return self._fh is None - - @property - def lock(self): - return self._lock - - @lock.setter - def lock(self, value): - self._lock = threading.RLock() if value else NullContext() - - -class NullContext(object): - """Null context manager. - - >>> with NullContext(): - ... 
pass - - """ - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - pass - - -class OpenFileCache(object): - """Keep files open.""" - - __slots__ = ('files', 'past', 'lock', 'size') - - def __init__(self, size, lock=None): - """Initialize open file cache.""" - self.past = [] # FIFO of opened files - self.files = {} # refcounts of opened files - self.lock = NullContext() if lock is None else lock - self.size = int(size) - - def open(self, filehandle): - """Re-open file if necessary.""" - with self.lock: - if filehandle in self.files: - self.files[filehandle] += 1 - elif filehandle.closed: - filehandle.open() - self.files[filehandle] = 1 - self.past.append(filehandle) - - def close(self, filehandle): - """Close openend file if no longer used.""" - with self.lock: - if filehandle in self.files: - self.files[filehandle] -= 1 - # trim the file cache - index = 0 - size = len(self.past) - while size > self.size and index < size: - filehandle = self.past[index] - if self.files[filehandle] == 0: - filehandle.close() - del self.files[filehandle] - del self.past[index] - size -= 1 - else: - index += 1 - - def clear(self): - """Close all opened files if not in use.""" - with self.lock: - for filehandle, refcount in list(self.files.items()): - if refcount == 0: - filehandle.close() - del self.files[filehandle] - del self.past[self.past.index(filehandle)] - - -class Timer(object): - """Stopwatch for timing execution speed.""" - - __slots__ = ('started', 'stopped', 'duration') - - try: - clock = time.perf_counter - except AttributeError: - clock = time.clock - - def __init__(self, message='', end=' '): - """Initialize timer and print message.""" - if message: - print_(message, end=end, flush=True) - self.duration = 0 - self.started = self.stopped = Timer.clock() - - def start(self, message='', end=' '): - """Start timer and return current time.""" - if message: - print_(message, end=end, flush=True) - self.duration = 0 - self.started = self.stopped = Timer.clock() - return self.started - - def stop(self, message='', end=' '): - """Return duration of timer till start.""" - self.stopped = Timer.clock() - if message: - print_(message, end=end, flush=True) - self.duration = self.stopped - self.started - return self.duration - - def print(self, message='', end=None): - """Print duration from timer start till last stop or now.""" - msg = str(self) - if message: - print_(message, end=' ') - print_(msg, end=end, flush=True) - - def __str__(self): - """Return duration from timer start till last stop or now as string.""" - if self.duration <= 0: - # not stopped - duration = Timer.clock() - self.started - else: - duration = self.duration - s = str(datetime.timedelta(seconds=duration)) - i = 0 - while i < len(s) and s[i:i + 2] in '0:0010203040506070809': - i += 1 - return '%s s' % s[i:] - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.print() - - -class LazyConst(object): - """Class whose attributes are computed on first access from its methods.""" - - def __init__(self, cls): - self._cls = cls - self.__doc__ = getattr(cls, '__doc__') - - def __getattr__(self, name): - func = getattr(self._cls, name) - if not callable(func): - return func - try: - value = func() - except TypeError: - # Python 2 unbound method - value = func.__func__() - setattr(self, name, value) - return value - - -@LazyConst -class TIFF(object): - """Namespace for module constants.""" - - def CLASSIC_LE(): - class ClassicTiffLe(object): - __slots__ = [] 
-            version = 42
-            byteorder = '<'
-            offsetsize = 4
-            offsetformat = '= 32768
-            32781: 'ImageID',
-            32931: 'WangTag1',
-            32932: 'WangAnnotation',
-            32933: 'WangTag3',
-            32934: 'WangTag4',
-            32953: 'ImageReferencePoints',
-            32954: 'RegionXformTackPoint',
-            32955: 'WarpQuadrilateral',
-            32956: 'AffineTransformMat',
-            32995: 'Matteing',
-            32996: 'DataType',  # use SampleFormat
-            32997: 'ImageDepth',
-            32998: 'TileDepth',
-            33300: 'ImageFullWidth',
-            33301: 'ImageFullLength',
-            33302: 'TextureFormat',
-            33303: 'TextureWrapModes',
-            33304: 'FieldOfViewCotangent',
-            33305: 'MatrixWorldToScreen',
-            33306: 'MatrixWorldToCamera',
-            33405: 'Model2',
-            33421: 'CFARepeatPatternDim',
-            33422: 'CFAPattern',
-            33423: 'BatteryLevel',
-            33424: 'KodakIFD',
-            33434: 'ExposureTime',
-            33437: 'FNumber',
-            33432: 'Copyright',
-            33445: 'MDFileTag',
-            33446: 'MDScalePixel',
-            33447: 'MDColorTable',
-            33448: 'MDLabName',
-            33449: 'MDSampleInfo',
-            33450: 'MDPrepDate',
-            33451: 'MDPrepTime',
-            33452: 'MDFileUnits',
-            33471: 'OlympusINI',
-            33550: 'ModelPixelScaleTag',
-            33560: 'OlympusSIS',  # see also 33471 and 34853
-            33589: 'AdventScale',
-            33590: 'AdventRevision',
-            33628: 'UIC1tag',  # Metamorph Universal Imaging Corp STK
-            33629: 'UIC2tag',
-            33630: 'UIC3tag',
-            33631: 'UIC4tag',
-            33723: 'IPTCNAA',
-            33858: 'ExtendedTagsOffset',  # DEFF points IFD with private tags
-            33918: 'IntergraphPacketData',  # INGRPacketDataTag
-            33919: 'IntergraphFlagRegisters',  # INGRFlagRegisters
-            33920: 'IntergraphMatrixTag',  # IrasBTransformationMatrix
-            33921: 'INGRReserved',
-            33922: 'ModelTiepointTag',
-            33923: 'LeicaMagic',
-            34016: 'Site',  # 34016..34032 ANSI IT8 TIFF/IT
-            34017: 'ColorSequence',
-            34018: 'IT8Header',
-            34019: 'RasterPadding',
-            34020: 'BitsPerRunLength',
-            34021: 'BitsPerExtendedRunLength',
-            34022: 'ColorTable',
-            34023: 'ImageColorIndicator',
-            34024: 'BackgroundColorIndicator',
-            34025: 'ImageColorValue',
-            34026: 'BackgroundColorValue',
-            34027: 'PixelIntensityRange',
-            34028: 'TransparencyIndicator',
-            34029: 'ColorCharacterization',
-            34030: 'HCUsage',
-            34031: 'TrapIndicator',
-            34032: 'CMYKEquivalent',
-            34118: 'CZ_SEM',  # Zeiss SEM
-            34152: 'AFCP_IPTC',
-            34232: 'PixelMagicJBIGOptions',  # EXIF, also TI FrameCount
-            34263: 'JPLCartoIFD',
-            34122: 'IPLAB',  # number of images
-            34264: 'ModelTransformationTag',
-            34306: 'WB_GRGBLevels',  # Leaf MOS
-            34310: 'LeafData',
-            34361: 'MM_Header',
-            34362: 'MM_Stamp',
-            34363: 'MM_Unknown',
-            34377: 'ImageResources',  # Photoshop
-            34386: 'MM_UserBlock',
-            34412: 'CZ_LSMINFO',
-            34665: 'ExifTag',
-            34675: 'InterColorProfile',  # ICCProfile
-            34680: 'FEI_SFEG',  #
-            34682: 'FEI_HELIOS',  #
-            34683: 'FEI_TITAN',  #
-            34687: 'FXExtensions',
-            34688: 'MultiProfiles',
-            34689: 'SharedData',
-            34690: 'T88Options',
-            34710: 'MarCCD',  # offset to MarCCD header
-            34732: 'ImageLayer',
-            34735: 'GeoKeyDirectoryTag',
-            34736: 'GeoDoubleParamsTag',
-            34737: 'GeoAsciiParamsTag',
-            34750: 'JBIGOptions',
-            34821: 'PIXTIFF',  # ?
Pixel Translations Inc - 34850: 'ExposureProgram', - 34852: 'SpectralSensitivity', - 34853: 'GPSTag', # GPSIFD also OlympusSIS2 - 34855: 'ISOSpeedRatings', - 34856: 'OECF', - 34857: 'Interlace', - 34858: 'TimeZoneOffset', - 34859: 'SelfTimerMode', - 34864: 'SensitivityType', - 34865: 'StandardOutputSensitivity', - 34866: 'RecommendedExposureIndex', - 34867: 'ISOSpeed', - 34868: 'ISOSpeedLatitudeyyy', - 34869: 'ISOSpeedLatitudezzz', - 34908: 'HylaFAXFaxRecvParams', - 34909: 'HylaFAXFaxSubAddress', - 34910: 'HylaFAXFaxRecvTime', - 34911: 'FaxDcs', - 34929: 'FedexEDR', - 34954: 'LeafSubIFD', - 34959: 'Aphelion1', - 34960: 'Aphelion2', - 34961: 'AphelionInternal', # ADCIS - 36864: 'ExifVersion', - 36867: 'DateTimeOriginal', - 36868: 'DateTimeDigitized', - 36873: 'GooglePlusUploadCode', - 36880: 'OffsetTime', - 36881: 'OffsetTimeOriginal', - 36882: 'OffsetTimeDigitized', - # TODO: Pilatus/CHESS/TV6 36864..37120 conflicting with Exif tags - # 36864: 'TVX ?', - # 36865: 'TVX_NumExposure', - # 36866: 'TVX_NumBackground', - # 36867: 'TVX_ExposureTime', - # 36868: 'TVX_BackgroundTime', - # 36870: 'TVX ?', - # 36873: 'TVX_SubBpp', - # 36874: 'TVX_SubWide', - # 36875: 'TVX_SubHigh', - # 36876: 'TVX_BlackLevel', - # 36877: 'TVX_DarkCurrent', - # 36878: 'TVX_ReadNoise', - # 36879: 'TVX_DarkCurrentNoise', - # 36880: 'TVX_BeamMonitor', - # 37120: 'TVX_UserVariables', # A/D values - 37121: 'ComponentsConfiguration', - 37122: 'CompressedBitsPerPixel', - 37377: 'ShutterSpeedValue', - 37378: 'ApertureValue', - 37379: 'BrightnessValue', - 37380: 'ExposureBiasValue', - 37381: 'MaxApertureValue', - 37382: 'SubjectDistance', - 37383: 'MeteringMode', - 37384: 'LightSource', - 37385: 'Flash', - 37386: 'FocalLength', - 37387: 'FlashEnergy_', # 37387 - 37388: 'SpatialFrequencyResponse_', # 37388 - 37389: 'Noise', - 37390: 'FocalPlaneXResolution', - 37391: 'FocalPlaneYResolution', - 37392: 'FocalPlaneResolutionUnit', - 37393: 'ImageNumber', - 37394: 'SecurityClassification', - 37395: 'ImageHistory', - 37396: 'SubjectLocation', - 37397: 'ExposureIndex', - 37398: 'TIFFEPStandardID', - 37399: 'SensingMethod', - 37434: 'CIP3DataFile', - 37435: 'CIP3Sheet', - 37436: 'CIP3Side', - 37439: 'StoNits', - 37500: 'MakerNote', - 37510: 'UserComment', - 37520: 'SubsecTime', - 37521: 'SubsecTimeOriginal', - 37522: 'SubsecTimeDigitized', - 37679: 'MODIText', # Microsoft Office Document Imaging - 37680: 'MODIOLEPropertySetStorage', - 37681: 'MODIPositioning', - 37706: 'TVIPS', # offset to TemData structure - 37707: 'TVIPS1', - 37708: 'TVIPS2', # same TemData structure as undefined - 37724: 'ImageSourceData', # Photoshop - 37888: 'Temperature', - 37889: 'Humidity', - 37890: 'Pressure', - 37891: 'WaterDepth', - 37892: 'Acceleration', - 37893: 'CameraElevationAngle', - 40001: 'MC_IpWinScal', # Media Cybernetics - # 40001: 'RecipName', # MS FAX - 40002: 'RecipNumber', - 40003: 'SenderName', - 40004: 'Routing', - 40005: 'CallerId', - 40006: 'TSID', - 40007: 'CSID', - 40008: 'FaxTime', - 40100: 'MC_IdOld', - 40106: 'MC_Unknown', - 40965: 'InteroperabilityTag', # InteropOffset - 40091: 'XPTitle', - 40092: 'XPComment', - 40093: 'XPAuthor', - 40094: 'XPKeywords', - 40095: 'XPSubject', - 40960: 'FlashpixVersion', - 40961: 'ColorSpace', - 40962: 'PixelXDimension', - 40963: 'PixelYDimension', - 40964: 'RelatedSoundFile', - 40976: 'SamsungRawPointersOffset', - 40977: 'SamsungRawPointersLength', - 41217: 'SamsungRawByteOrder', - 41218: 'SamsungRawUnknown', - 41483: 'FlashEnergy', - 41484: 'SpatialFrequencyResponse', - 41485: 'Noise_', # 37389 - 
41486: 'FocalPlaneXResolution_', # 37390 - 41487: 'FocalPlaneYResolution_', # 37391 - 41488: 'FocalPlaneResolutionUnit_', # 37392 - 41489: 'ImageNumber_', # 37393 - 41490: 'SecurityClassification_', # 37394 - 41491: 'ImageHistory_', # 37395 - 41492: 'SubjectLocation_', # 37395 - 41493: 'ExposureIndex_ ', # 37397 - 41494: 'TIFF-EPStandardID', - 41495: 'SensingMethod_', # 37399 - 41728: 'FileSource', - 41729: 'SceneType', - 41730: 'CFAPattern_', # 33422 - 41985: 'CustomRendered', - 41986: 'ExposureMode', - 41987: 'WhiteBalance', - 41988: 'DigitalZoomRatio', - 41989: 'FocalLengthIn35mmFilm', - 41990: 'SceneCaptureType', - 41991: 'GainControl', - 41992: 'Contrast', - 41993: 'Saturation', - 41994: 'Sharpness', - 41995: 'DeviceSettingDescription', - 41996: 'SubjectDistanceRange', - 42016: 'ImageUniqueID', - 42032: 'CameraOwnerName', - 42033: 'BodySerialNumber', - 42034: 'LensSpecification', - 42035: 'LensMake', - 42036: 'LensModel', - 42037: 'LensSerialNumber', - 42112: 'GDAL_METADATA', - 42113: 'GDAL_NODATA', - 42240: 'Gamma', - 43314: 'NIHImageHeader', - 44992: 'ExpandSoftware', - 44993: 'ExpandLens', - 44994: 'ExpandFilm', - 44995: 'ExpandFilterLens', - 44996: 'ExpandScanner', - 44997: 'ExpandFlashLamp', - 48129: 'PixelFormat', # HDP and WDP - 48130: 'Transformation', - 48131: 'Uncompressed', - 48132: 'ImageType', - 48256: 'ImageWidth_', # 256 - 48257: 'ImageHeight_', - 48258: 'WidthResolution', - 48259: 'HeightResolution', - 48320: 'ImageOffset', - 48321: 'ImageByteCount', - 48322: 'AlphaOffset', - 48323: 'AlphaByteCount', - 48324: 'ImageDataDiscard', - 48325: 'AlphaDataDiscard', - 50003: 'KodakAPP3', - 50215: 'OceScanjobDescription', - 50216: 'OceApplicationSelector', - 50217: 'OceIdentificationNumber', - 50218: 'OceImageLogicCharacteristics', - 50255: 'Annotations', - 50288: 'MC_Id', # Media Cybernetics - 50289: 'MC_XYPosition', - 50290: 'MC_ZPosition', - 50291: 'MC_XYCalibration', - 50292: 'MC_LensCharacteristics', - 50293: 'MC_ChannelName', - 50294: 'MC_ExcitationWavelength', - 50295: 'MC_TimeStamp', - 50296: 'MC_FrameProperties', - 50341: 'PrintImageMatching', - 50495: 'PCO_RAW', # TODO: PCO CamWare - 50547: 'OriginalFileName', - 50560: 'USPTO_OriginalContentType', # US Patent Office - 50561: 'USPTO_RotationCode', - 50648: 'CR2Unknown1', - 50649: 'CR2Unknown2', - 50656: 'CR2CFAPattern', - 50674: 'LercParameters', # ESGI 50674 .. 50677 - 50706: 'DNGVersion', # DNG 50706 .. 
51112 - 50707: 'DNGBackwardVersion', - 50708: 'UniqueCameraModel', - 50709: 'LocalizedCameraModel', - 50710: 'CFAPlaneColor', - 50711: 'CFALayout', - 50712: 'LinearizationTable', - 50713: 'BlackLevelRepeatDim', - 50714: 'BlackLevel', - 50715: 'BlackLevelDeltaH', - 50716: 'BlackLevelDeltaV', - 50717: 'WhiteLevel', - 50718: 'DefaultScale', - 50719: 'DefaultCropOrigin', - 50720: 'DefaultCropSize', - 50721: 'ColorMatrix1', - 50722: 'ColorMatrix2', - 50723: 'CameraCalibration1', - 50724: 'CameraCalibration2', - 50725: 'ReductionMatrix1', - 50726: 'ReductionMatrix2', - 50727: 'AnalogBalance', - 50728: 'AsShotNeutral', - 50729: 'AsShotWhiteXY', - 50730: 'BaselineExposure', - 50731: 'BaselineNoise', - 50732: 'BaselineSharpness', - 50733: 'BayerGreenSplit', - 50734: 'LinearResponseLimit', - 50735: 'CameraSerialNumber', - 50736: 'LensInfo', - 50737: 'ChromaBlurRadius', - 50738: 'AntiAliasStrength', - 50739: 'ShadowScale', - 50740: 'DNGPrivateData', - 50741: 'MakerNoteSafety', - 50752: 'RawImageSegmentation', - 50778: 'CalibrationIlluminant1', - 50779: 'CalibrationIlluminant2', - 50780: 'BestQualityScale', - 50781: 'RawDataUniqueID', - 50784: 'AliasLayerMetadata', - 50827: 'OriginalRawFileName', - 50828: 'OriginalRawFileData', - 50829: 'ActiveArea', - 50830: 'MaskedAreas', - 50831: 'AsShotICCProfile', - 50832: 'AsShotPreProfileMatrix', - 50833: 'CurrentICCProfile', - 50834: 'CurrentPreProfileMatrix', - 50838: 'IJMetadataByteCounts', - 50839: 'IJMetadata', - 50844: 'RPCCoefficientTag', - 50879: 'ColorimetricReference', - 50885: 'SRawType', - 50898: 'PanasonicTitle', - 50899: 'PanasonicTitle2', - 50908: 'RSID', # DGIWG - 50909: 'GEO_METADATA', # DGIWG XML - 50931: 'CameraCalibrationSignature', - 50932: 'ProfileCalibrationSignature', - 50933: 'ProfileIFD', - 50934: 'AsShotProfileName', - 50935: 'NoiseReductionApplied', - 50936: 'ProfileName', - 50937: 'ProfileHueSatMapDims', - 50938: 'ProfileHueSatMapData1', - 50939: 'ProfileHueSatMapData2', - 50940: 'ProfileToneCurve', - 50941: 'ProfileEmbedPolicy', - 50942: 'ProfileCopyright', - 50964: 'ForwardMatrix1', - 50965: 'ForwardMatrix2', - 50966: 'PreviewApplicationName', - 50967: 'PreviewApplicationVersion', - 50968: 'PreviewSettingsName', - 50969: 'PreviewSettingsDigest', - 50970: 'PreviewColorSpace', - 50971: 'PreviewDateTime', - 50972: 'RawImageDigest', - 50973: 'OriginalRawFileDigest', - 50974: 'SubTileBlockSize', - 50975: 'RowInterleaveFactor', - 50981: 'ProfileLookTableDims', - 50982: 'ProfileLookTableData', - 51008: 'OpcodeList1', - 51009: 'OpcodeList2', - 51022: 'OpcodeList3', - 51023: 'FibicsXML', # - 51041: 'NoiseProfile', - 51043: 'TimeCodes', - 51044: 'FrameRate', - 51058: 'TStop', - 51081: 'ReelName', - 51089: 'OriginalDefaultFinalSize', - 51090: 'OriginalBestQualitySize', - 51091: 'OriginalDefaultCropSize', - 51105: 'CameraLabel', - 51107: 'ProfileHueSatMapEncoding', - 51108: 'ProfileLookTableEncoding', - 51109: 'BaselineExposureOffset', - 51110: 'DefaultBlackRender', - 51111: 'NewRawImageDigest', - 51112: 'RawToPreviewGain', - 51125: 'DefaultUserCrop', - 51123: 'MicroManagerMetadata', - 51159: 'ZIFmetadata', # Objective Pathology Services - 51160: 'ZIFannotations', # Objective Pathology Services - 59932: 'Padding', - 59933: 'OffsetSchema', - # Reusable Tags 65000-65535 - # 65000: Dimap_Document XML - # 65000-65112: Photoshop Camera RAW EXIF tags - # 65000: 'OwnerName', - # 65001: 'SerialNumber', - # 65002: 'Lens', - # 65024: 'KDC_IFD', - # 65100: 'RawFile', - # 65101: 'Converter', - # 65102: 'WhiteBalance', - # 65105: 'Exposure', - # 65106: 
'Shadows', - # 65107: 'Brightness', - # 65108: 'Contrast', - # 65109: 'Saturation', - # 65110: 'Sharpness', - # 65111: 'Smoothness', - # 65112: 'MoireFilter', - 65200: 'FlexXML', - } - - def TAG_NAMES(): - return {v: c for c, v in TIFF.TAGS.items()} - - def TAG_READERS(): - # Map TIFF tag codes to import functions - return { - 320: read_colormap, - # 700: read_bytes, # read_utf8, - # 34377: read_bytes, - 33723: read_bytes, - # 34675: read_bytes, - 33628: read_uic1tag, # Universal Imaging Corp STK - 33629: read_uic2tag, - 33630: read_uic3tag, - 33631: read_uic4tag, - 34118: read_cz_sem, # Carl Zeiss SEM - 34361: read_mm_header, # Olympus FluoView - 34362: read_mm_stamp, - 34363: read_numpy, # MM_Unknown - 34386: read_numpy, # MM_UserBlock - 34412: read_cz_lsminfo, # Carl Zeiss LSM - 34680: read_fei_metadata, # S-FEG - 34682: read_fei_metadata, # Helios NanoLab - 37706: read_tvips_header, # TVIPS EMMENU - 37724: read_bytes, # ImageSourceData - 33923: read_bytes, # read_leica_magic - 43314: read_nih_image_header, - # 40001: read_bytes, - 40100: read_bytes, - 50288: read_bytes, - 50296: read_bytes, - 50839: read_bytes, - 51123: read_json, - 33471: read_sis_ini, - 33560: read_sis, - 34665: read_exif_ifd, - 34853: read_gps_ifd, # conflicts with OlympusSIS - 40965: read_interoperability_ifd, - } - - def TAG_TUPLE(): - # Tags whose values must be stored as tuples - return frozenset((273, 279, 324, 325, 330, 530, 531, 34736)) - - def TAG_ATTRIBUTES(): - # Map tag codes to TiffPage attribute names - return { - 'ImageWidth': 'imagewidth', - 'ImageLength': 'imagelength', - 'BitsPerSample': 'bitspersample', - 'Compression': 'compression', - 'PlanarConfiguration': 'planarconfig', - 'FillOrder': 'fillorder', - 'PhotometricInterpretation': 'photometric', - 'ColorMap': 'colormap', - 'ImageDescription': 'description', - 'ImageDescription1': 'description1', - 'SamplesPerPixel': 'samplesperpixel', - 'RowsPerStrip': 'rowsperstrip', - 'Software': 'software', - 'Predictor': 'predictor', - 'TileWidth': 'tilewidth', - 'TileLength': 'tilelength', - 'ExtraSamples': 'extrasamples', - 'SampleFormat': 'sampleformat', - 'ImageDepth': 'imagedepth', - 'TileDepth': 'tiledepth', - 'NewSubfileType': 'subfiletype', - } - - def TAG_ENUM(): - return { - # 254: TIFF.FILETYPE, - 255: TIFF.OFILETYPE, - 259: TIFF.COMPRESSION, - 262: TIFF.PHOTOMETRIC, - 263: TIFF.THRESHHOLD, - 266: TIFF.FILLORDER, - 274: TIFF.ORIENTATION, - 284: TIFF.PLANARCONFIG, - 290: TIFF.GRAYRESPONSEUNIT, - # 292: TIFF.GROUP3OPT, - # 293: TIFF.GROUP4OPT, - 296: TIFF.RESUNIT, - 300: TIFF.COLORRESPONSEUNIT, - 317: TIFF.PREDICTOR, - 338: TIFF.EXTRASAMPLE, - 339: TIFF.SAMPLEFORMAT, - # 512: TIFF.JPEGPROC, - # 531: TIFF.YCBCRPOSITION, - } - - def FILETYPE(): - class FILETYPE(enum.IntFlag): - # Python 3.6 only - UNDEFINED = 0 - REDUCEDIMAGE = 1 - PAGE = 2 - MASK = 4 - - return FILETYPE - - def OFILETYPE(): - class OFILETYPE(enum.IntEnum): - UNDEFINED = 0 - IMAGE = 1 - REDUCEDIMAGE = 2 - PAGE = 3 - - return OFILETYPE - - def COMPRESSION(): - class COMPRESSION(enum.IntEnum): - NONE = 1 # Uncompressed - CCITTRLE = 2 # CCITT 1D - CCITT_T4 = 3 # 'T4/Group 3 Fax', - CCITT_T6 = 4 # 'T6/Group 4 Fax', - LZW = 5 - OJPEG = 6 # old-style JPEG - JPEG = 7 - ADOBE_DEFLATE = 8 - JBIG_BW = 9 - JBIG_COLOR = 10 - JPEG_99 = 99 - KODAK_262 = 262 - NEXT = 32766 - SONY_ARW = 32767 - PACKED_RAW = 32769 - SAMSUNG_SRW = 32770 - CCIRLEW = 32771 - SAMSUNG_SRW2 = 32772 - PACKBITS = 32773 - THUNDERSCAN = 32809 - IT8CTPAD = 32895 - IT8LW = 32896 - IT8MP = 32897 - IT8BL = 32898 - PIXARFILM = 32908 
- PIXARLOG = 32909 - DEFLATE = 32946 - DCS = 32947 - APERIO_JP2000_YCBC = 33003 # Leica Aperio - APERIO_JP2000_RGB = 33005 # Leica Aperio - JBIG = 34661 - SGILOG = 34676 - SGILOG24 = 34677 - JPEG2000 = 34712 - NIKON_NEF = 34713 - JBIG2 = 34715 - MDI_BINARY = 34718 # Microsoft Document Imaging - MDI_PROGRESSIVE = 34719 # Microsoft Document Imaging - MDI_VECTOR = 34720 # Microsoft Document Imaging - LERC = 34887 # ESRI Lerc - JPEG_LOSSY = 34892 - LZMA = 34925 - ZSTD_DEPRECATED = 34926 - WEBP_DEPRECATED = 34927 - PNG = 34933 # Objective Pathology Services - JPEGXR = 34934 # Objective Pathology Services - ZSTD = 50000 - WEBP = 50001 - PIXTIFF = 50013 - KODAK_DCR = 65000 - PENTAX_PEF = 65535 - # def __bool__(self): return self != 1 # Python 3.6+ only - - return COMPRESSION - - def PHOTOMETRIC(): - class PHOTOMETRIC(enum.IntEnum): - MINISWHITE = 0 - MINISBLACK = 1 - RGB = 2 - PALETTE = 3 - MASK = 4 - SEPARATED = 5 # CMYK - YCBCR = 6 - CIELAB = 8 - ICCLAB = 9 - ITULAB = 10 - CFA = 32803 # Color Filter Array - LOGL = 32844 - LOGLUV = 32845 - LINEAR_RAW = 34892 - - return PHOTOMETRIC - - def THRESHHOLD(): - class THRESHHOLD(enum.IntEnum): - BILEVEL = 1 - HALFTONE = 2 - ERRORDIFFUSE = 3 - - return THRESHHOLD - - def FILLORDER(): - class FILLORDER(enum.IntEnum): - MSB2LSB = 1 - LSB2MSB = 2 - - return FILLORDER - - def ORIENTATION(): - class ORIENTATION(enum.IntEnum): - TOPLEFT = 1 - TOPRIGHT = 2 - BOTRIGHT = 3 - BOTLEFT = 4 - LEFTTOP = 5 - RIGHTTOP = 6 - RIGHTBOT = 7 - LEFTBOT = 8 - - return ORIENTATION - - def PLANARCONFIG(): - class PLANARCONFIG(enum.IntEnum): - CONTIG = 1 - SEPARATE = 2 - - return PLANARCONFIG - - def GRAYRESPONSEUNIT(): - class GRAYRESPONSEUNIT(enum.IntEnum): - _10S = 1 - _100S = 2 - _1000S = 3 - _10000S = 4 - _100000S = 5 - - return GRAYRESPONSEUNIT - - def GROUP4OPT(): - class GROUP4OPT(enum.IntEnum): - UNCOMPRESSED = 2 - - return GROUP4OPT - - def RESUNIT(): - class RESUNIT(enum.IntEnum): - NONE = 1 - INCH = 2 - CENTIMETER = 3 - # def __bool__(self): return self != 1 # Python 3.6 only - - return RESUNIT - - def COLORRESPONSEUNIT(): - class COLORRESPONSEUNIT(enum.IntEnum): - _10S = 1 - _100S = 2 - _1000S = 3 - _10000S = 4 - _100000S = 5 - - return COLORRESPONSEUNIT - - def PREDICTOR(): - class PREDICTOR(enum.IntEnum): - NONE = 1 - HORIZONTAL = 2 - FLOATINGPOINT = 3 - # def __bool__(self): return self != 1 # Python 3.6 only - - return PREDICTOR - - def EXTRASAMPLE(): - class EXTRASAMPLE(enum.IntEnum): - UNSPECIFIED = 0 - ASSOCALPHA = 1 - UNASSALPHA = 2 - - return EXTRASAMPLE - - def SAMPLEFORMAT(): - class SAMPLEFORMAT(enum.IntEnum): - UINT = 1 - INT = 2 - IEEEFP = 3 - VOID = 4 - COMPLEXINT = 5 - COMPLEXIEEEFP = 6 - - return SAMPLEFORMAT - - def DATATYPES(): - class DATATYPES(enum.IntEnum): - NOTYPE = 0 - BYTE = 1 - ASCII = 2 - SHORT = 3 - LONG = 4 - RATIONAL = 5 - SBYTE = 6 - UNDEFINED = 7 - SSHORT = 8 - SLONG = 9 - SRATIONAL = 10 - FLOAT = 11 - DOUBLE = 12 - IFD = 13 - UNICODE = 14 - COMPLEX = 15 - LONG8 = 16 - SLONG8 = 17 - IFD8 = 18 - - return DATATYPES - - def DATA_FORMATS(): - # Map TIFF DATATYPES to Python struct formats - return { - 1: '1B', # BYTE 8-bit unsigned integer. - 2: '1s', # ASCII 8-bit byte that contains a 7-bit ASCII code; - # the last byte must be NULL (binary zero). - 3: '1H', # SHORT 16-bit (2-byte) unsigned integer - 4: '1I', # LONG 32-bit (4-byte) unsigned integer. - 5: '2I', # RATIONAL Two LONGs: the first represents the numerator - # of a fraction; the second, the denominator. - 6: '1b', # SBYTE An 8-bit signed (twos-complement) integer. 
- 7: '1B', # UNDEFINED An 8-bit byte that may contain anything, - # depending on the definition of the field. - 8: '1h', # SSHORT A 16-bit (2-byte) signed (twos-complement) - # integer. - 9: '1i', # SLONG A 32-bit (4-byte) signed (twos-complement) - # integer. - 10: '2i', # SRATIONAL Two SLONGs: the first represents the - # numerator of a fraction, the second the denominator. - 11: '1f', # FLOAT Single precision (4-byte) IEEE format. - 12: '1d', # DOUBLE Double precision (8-byte) IEEE format. - 13: '1I', # IFD unsigned 4 byte IFD offset. - # 14: '', # UNICODE - # 15: '', # COMPLEX - 16: '1Q', # LONG8 unsigned 8 byte integer (BigTiff) - 17: '1q', # SLONG8 signed 8 byte integer (BigTiff) - 18: '1Q', # IFD8 unsigned 8 byte IFD offset (BigTiff) - } - - def DATA_DTYPES(): - # Map numpy dtypes to TIFF DATATYPES - return { - 'B': 1, - 's': 2, - 'H': 3, - 'I': 4, - '2I': 5, - 'b': 6, - 'h': 8, - 'i': 9, - '2i': 10, - 'f': 11, - 'd': 12, - 'Q': 16, - 'q': 17, - } - - def SAMPLE_DTYPES(): - # Map TIFF SampleFormats and BitsPerSample to numpy dtype - return { - # UINT - (1, 1): '?', # bitmap - (1, 2): 'B', - (1, 3): 'B', - (1, 4): 'B', - (1, 5): 'B', - (1, 6): 'B', - (1, 7): 'B', - (1, 8): 'B', - (1, 9): 'H', - (1, 10): 'H', - (1, 11): 'H', - (1, 12): 'H', - (1, 13): 'H', - (1, 14): 'H', - (1, 15): 'H', - (1, 16): 'H', - (1, 17): 'I', - (1, 18): 'I', - (1, 19): 'I', - (1, 20): 'I', - (1, 21): 'I', - (1, 22): 'I', - (1, 23): 'I', - (1, 24): 'I', - (1, 25): 'I', - (1, 26): 'I', - (1, 27): 'I', - (1, 28): 'I', - (1, 29): 'I', - (1, 30): 'I', - (1, 31): 'I', - (1, 32): 'I', - (1, 64): 'Q', - # VOID : treat as UINT - (4, 1): '?', # bitmap - (4, 2): 'B', - (4, 3): 'B', - (4, 4): 'B', - (4, 5): 'B', - (4, 6): 'B', - (4, 7): 'B', - (4, 8): 'B', - (4, 9): 'H', - (4, 10): 'H', - (4, 11): 'H', - (4, 12): 'H', - (4, 13): 'H', - (4, 14): 'H', - (4, 15): 'H', - (4, 16): 'H', - (4, 17): 'I', - (4, 18): 'I', - (4, 19): 'I', - (4, 20): 'I', - (4, 21): 'I', - (4, 22): 'I', - (4, 23): 'I', - (4, 24): 'I', - (4, 25): 'I', - (4, 26): 'I', - (4, 27): 'I', - (4, 28): 'I', - (4, 29): 'I', - (4, 30): 'I', - (4, 31): 'I', - (4, 32): 'I', - (4, 64): 'Q', - # INT - (2, 8): 'b', - (2, 16): 'h', - (2, 32): 'i', - (2, 64): 'q', - # IEEEFP : 24 bit not supported by numpy - (3, 16): 'e', - # (3, 24): '', # - (3, 32): 'f', - (3, 64): 'd', - # COMPLEXIEEEFP - (6, 64): 'F', - (6, 128): 'D', - # RGB565 - (1, (5, 6, 5)): 'B', - # COMPLEXINT : not supported by numpy - } - - def PREDICTORS(): - # Map PREDICTOR to predictor encode functions - if imagecodecs is None: - return { - None: identityfunc, - 1: identityfunc, - 2: delta_encode, - } - return { - None: imagecodecs.none_encode, - 1: imagecodecs.none_encode, - 2: imagecodecs.delta_encode, - 3: imagecodecs.floatpred_encode, - } - - def UNPREDICTORS(): - # Map PREDICTOR to predictor decode functions - if imagecodecs is None: - return { - None: identityfunc, - 1: identityfunc, - 2: delta_decode, - } - return { - None: imagecodecs.none_decode, - 1: imagecodecs.none_decode, - 2: imagecodecs.delta_decode, - 3: imagecodecs.floatpred_decode, - } - - def COMPESSORS(): - # Map COMPRESSION to compress functions - if hasattr(imagecodecs, 'zlib_encode'): - return { - None: imagecodecs.none_encode, - 1: imagecodecs.none_encode, - 7: imagecodecs.jpeg_encode, - 8: imagecodecs.zlib_encode, - 32946: imagecodecs.zlib_encode, - 32773: imagecodecs.packbits_encode, - 34712: imagecodecs.j2k_encode, - 34925: imagecodecs.lzma_encode, - 34933: imagecodecs.png_encode, - 34934: imagecodecs.jxr_encode, - 50000: 
imagecodecs.zstd_encode, - 50001: imagecodecs.webp_encode, - } - - def zlib_encode(data, level=6, out=None): - """Compress Zlib DEFLATE.""" - return zlib.compress(data, level) - - if imagecodecs is None: - return { - None: identityfunc, - 1: identityfunc, - 8: zlib_encode, - 32946: zlib_encode, - # 34925: lzma.compress - } - - return { - None: imagecodecs.none_encode, - 1: imagecodecs.none_encode, - 8: zlib_encode, - 32946: zlib_encode, - 32773: imagecodecs.packbits_encode, - } - - def DECOMPESSORS(): - # Map COMPRESSION to decompress functions - if hasattr(imagecodecs, 'zlib_decode'): - return { - None: imagecodecs.none_decode, - 1: imagecodecs.none_decode, - 5: imagecodecs.lzw_decode, - 6: imagecodecs.jpeg_decode, - 7: imagecodecs.jpeg_decode, - 8: imagecodecs.zlib_decode, - 32946: imagecodecs.zlib_decode, - 32773: imagecodecs.packbits_decode, - # 34892: imagecodecs.jpeg_decode, # DNG lossy - 34925: imagecodecs.lzma_decode, - 34926: imagecodecs.zstd_decode, # deprecated - 34927: imagecodecs.webp_decode, # deprecated - 33003: imagecodecs.j2k_decode, - 33005: imagecodecs.j2k_decode, - 34712: imagecodecs.j2k_decode, - 34933: imagecodecs.png_decode, - 34934: imagecodecs.jxr_decode, - 50000: imagecodecs.zstd_decode, - 50001: imagecodecs.webp_decode, - } - - def zlib_decode(data, out=None): - """Decompress Zlib DEFLATE.""" - return zlib.decompress(data) - - if imagecodecs is None: - return { - None: identityfunc, - 1: identityfunc, - 8: zlib_decode, - 32946: zlib_decode, - # 34925: lzma.decompress - } - - return { - None: imagecodecs.none_decode, - 1: imagecodecs.none_decode, - 5: imagecodecs.lzw_decode, - 8: zlib_decode, - 32946: zlib_decode, - 32773: imagecodecs.packbits_decode, - } - - def FRAME_ATTRS(): - # Attributes that a TiffFrame shares with its keyframe - return { - 'shape', - 'ndim', - 'size', - 'dtype', - 'axes', - 'is_final', - } - - def FILE_FLAGS(): - # TiffFile and TiffPage 'is_\*' attributes - exclude = { - 'reduced', - 'mask', - 'final', - 'memmappable', - 'contiguous', - 'tiled', - 'subsampled', - } - return set( - a[3:] - for a in dir(TiffPage) - if a[:3] == 'is_' and a[3:] not in exclude - ) - - def FILE_EXTENSIONS(): - # TIFF file extensions - return ( - 'tif', 'tiff', 'ome.tif', 'lsm', 'stk', 'qpi', 'pcoraw', - 'gel', 'seq', 'svs', 'zif', 'ndpi', 'bif', 'tf8', 'tf2', 'btf', - ) - - def FILEOPEN_FILTER(): - # String for use in Windows File Open box - return [ - ('%s files' % ext.upper(), '*.%s' % ext) - for ext in TIFF.FILE_EXTENSIONS - ] + [('allfiles', '*')] - - def AXES_LABELS(): - # TODO: is there a standard for character axes labels? - axes = { - 'X': 'width', - 'Y': 'height', - 'Z': 'depth', - 'S': 'sample', # rgb(a) - 'I': 'series', # general sequence, plane, page, IFD - 'T': 'time', - 'C': 'channel', # color, emission wavelength - 'A': 'angle', - 'P': 'phase', # formerly F # P is Position in LSM! - 'R': 'tile', # region, point, mosaic - 'H': 'lifetime', # histogram - 'E': 'lambda', # excitation wavelength - 'L': 'exposure', # lux - 'V': 'event', - 'Q': 'other', - 'M': 'mosaic', # LSM 6 - } - axes.update(dict((v, k) for k, v in axes.items())) - return axes - - def NDPI_TAGS(): - # 65420 - 65458 Private Hamamatsu NDPI tags - tags = dict((code, str(code)) for code in range(65420, 65459)) - tags.update({ - 65420: 'FileFormat', - 65421: 'Magnification', # SourceLens - 65422: 'XOffsetFromSlideCentre', - 65423: 'YOffsetFromSlideCentre', - 65424: 'ZOffsetFromSlideCentre', - 65427: 'UserLabel', - 65428: 'AuthCode', # ? 
- 65442: 'ScannerSerialNumber', - 65449: 'Comments', - 65447: 'BlankLanes', - 65434: 'Fluorescence', - }) - return tags - - def EXIF_TAGS(): - tags = { - # 65000 - 65112 Photoshop Camera RAW EXIF tags - 65000: 'OwnerName', - 65001: 'SerialNumber', - 65002: 'Lens', - 65100: 'RawFile', - 65101: 'Converter', - 65102: 'WhiteBalance', - 65105: 'Exposure', - 65106: 'Shadows', - 65107: 'Brightness', - 65108: 'Contrast', - 65109: 'Saturation', - 65110: 'Sharpness', - 65111: 'Smoothness', - 65112: 'MoireFilter', - } - tags.update(TIFF.TAGS) - return tags - - def GPS_TAGS(): - return { - 0: 'GPSVersionID', - 1: 'GPSLatitudeRef', - 2: 'GPSLatitude', - 3: 'GPSLongitudeRef', - 4: 'GPSLongitude', - 5: 'GPSAltitudeRef', - 6: 'GPSAltitude', - 7: 'GPSTimeStamp', - 8: 'GPSSatellites', - 9: 'GPSStatus', - 10: 'GPSMeasureMode', - 11: 'GPSDOP', - 12: 'GPSSpeedRef', - 13: 'GPSSpeed', - 14: 'GPSTrackRef', - 15: 'GPSTrack', - 16: 'GPSImgDirectionRef', - 17: 'GPSImgDirection', - 18: 'GPSMapDatum', - 19: 'GPSDestLatitudeRef', - 20: 'GPSDestLatitude', - 21: 'GPSDestLongitudeRef', - 22: 'GPSDestLongitude', - 23: 'GPSDestBearingRef', - 24: 'GPSDestBearing', - 25: 'GPSDestDistanceRef', - 26: 'GPSDestDistance', - 27: 'GPSProcessingMethod', - 28: 'GPSAreaInformation', - 29: 'GPSDateStamp', - 30: 'GPSDifferential', - 31: 'GPSHPositioningError', - } - - def IOP_TAGS(): - return { - 1: 'InteroperabilityIndex', - 2: 'InteroperabilityVersion', - 4096: 'RelatedImageFileFormat', - 4097: 'RelatedImageWidth', - 4098: 'RelatedImageLength', - } - - def GEO_KEYS(): - return { - 1024: 'GTModelTypeGeoKey', - 1025: 'GTRasterTypeGeoKey', - 1026: 'GTCitationGeoKey', - 2048: 'GeographicTypeGeoKey', - 2049: 'GeogCitationGeoKey', - 2050: 'GeogGeodeticDatumGeoKey', - 2051: 'GeogPrimeMeridianGeoKey', - 2052: 'GeogLinearUnitsGeoKey', - 2053: 'GeogLinearUnitSizeGeoKey', - 2054: 'GeogAngularUnitsGeoKey', - 2055: 'GeogAngularUnitsSizeGeoKey', - 2056: 'GeogEllipsoidGeoKey', - 2057: 'GeogSemiMajorAxisGeoKey', - 2058: 'GeogSemiMinorAxisGeoKey', - 2059: 'GeogInvFlatteningGeoKey', - 2060: 'GeogAzimuthUnitsGeoKey', - 2061: 'GeogPrimeMeridianLongGeoKey', - 2062: 'GeogTOWGS84GeoKey', - 3059: 'ProjLinearUnitsInterpCorrectGeoKey', # GDAL - 3072: 'ProjectedCSTypeGeoKey', - 3073: 'PCSCitationGeoKey', - 3074: 'ProjectionGeoKey', - 3075: 'ProjCoordTransGeoKey', - 3076: 'ProjLinearUnitsGeoKey', - 3077: 'ProjLinearUnitSizeGeoKey', - 3078: 'ProjStdParallel1GeoKey', - 3079: 'ProjStdParallel2GeoKey', - 3080: 'ProjNatOriginLongGeoKey', - 3081: 'ProjNatOriginLatGeoKey', - 3082: 'ProjFalseEastingGeoKey', - 3083: 'ProjFalseNorthingGeoKey', - 3084: 'ProjFalseOriginLongGeoKey', - 3085: 'ProjFalseOriginLatGeoKey', - 3086: 'ProjFalseOriginEastingGeoKey', - 3087: 'ProjFalseOriginNorthingGeoKey', - 3088: 'ProjCenterLongGeoKey', - 3089: 'ProjCenterLatGeoKey', - 3090: 'ProjCenterEastingGeoKey', - 3091: 'ProjFalseOriginNorthingGeoKey', - 3092: 'ProjScaleAtNatOriginGeoKey', - 3093: 'ProjScaleAtCenterGeoKey', - 3094: 'ProjAzimuthAngleGeoKey', - 3095: 'ProjStraightVertPoleLongGeoKey', - 3096: 'ProjRectifiedGridAngleGeoKey', - 4096: 'VerticalCSTypeGeoKey', - 4097: 'VerticalCitationGeoKey', - 4098: 'VerticalDatumGeoKey', - 4099: 'VerticalUnitsGeoKey', - } - - def GEO_CODES(): - try: - from .tifffile_geodb import GEO_CODES # delayed import - except (ImportError, ValueError): - try: - from tifffile_geodb import GEO_CODES # delayed import - except (ImportError, ValueError): - GEO_CODES = {} - return GEO_CODES - - def CZ_LSMINFO(): - return [ - ('MagicNumber', 'u4'), - ('StructureSize', 
'i4'), - ('DimensionX', 'i4'), - ('DimensionY', 'i4'), - ('DimensionZ', 'i4'), - ('DimensionChannels', 'i4'), - ('DimensionTime', 'i4'), - ('DataType', 'i4'), # DATATYPES - ('ThumbnailX', 'i4'), - ('ThumbnailY', 'i4'), - ('VoxelSizeX', 'f8'), - ('VoxelSizeY', 'f8'), - ('VoxelSizeZ', 'f8'), - ('OriginX', 'f8'), - ('OriginY', 'f8'), - ('OriginZ', 'f8'), - ('ScanType', 'u2'), - ('SpectralScan', 'u2'), - ('TypeOfData', 'u4'), # TYPEOFDATA - ('OffsetVectorOverlay', 'u4'), - ('OffsetInputLut', 'u4'), - ('OffsetOutputLut', 'u4'), - ('OffsetChannelColors', 'u4'), - ('TimeIntervall', 'f8'), - ('OffsetChannelDataTypes', 'u4'), - ('OffsetScanInformation', 'u4'), # SCANINFO - ('OffsetKsData', 'u4'), - ('OffsetTimeStamps', 'u4'), - ('OffsetEventList', 'u4'), - ('OffsetRoi', 'u4'), - ('OffsetBleachRoi', 'u4'), - ('OffsetNextRecording', 'u4'), - # LSM 2.0 ends here - ('DisplayAspectX', 'f8'), - ('DisplayAspectY', 'f8'), - ('DisplayAspectZ', 'f8'), - ('DisplayAspectTime', 'f8'), - ('OffsetMeanOfRoisOverlay', 'u4'), - ('OffsetTopoIsolineOverlay', 'u4'), - ('OffsetTopoProfileOverlay', 'u4'), - ('OffsetLinescanOverlay', 'u4'), - ('ToolbarFlags', 'u4'), - ('OffsetChannelWavelength', 'u4'), - ('OffsetChannelFactors', 'u4'), - ('ObjectiveSphereCorrection', 'f8'), - ('OffsetUnmixParameters', 'u4'), - # LSM 3.2, 4.0 end here - ('OffsetAcquisitionParameters', 'u4'), - ('OffsetCharacteristics', 'u4'), - ('OffsetPalette', 'u4'), - ('TimeDifferenceX', 'f8'), - ('TimeDifferenceY', 'f8'), - ('TimeDifferenceZ', 'f8'), - ('InternalUse1', 'u4'), - ('DimensionP', 'i4'), - ('DimensionM', 'i4'), - ('DimensionsReserved', '16i4'), - ('OffsetTilePositions', 'u4'), - ('', '9u4'), # Reserved - ('OffsetPositions', 'u4'), - # ('', '21u4'), # must be 0 - ] - - def CZ_LSMINFO_READERS(): - # Import functions for CZ_LSMINFO sub-records - # TODO: read more CZ_LSMINFO sub-records - return { - 'ScanInformation': read_lsm_scaninfo, - 'TimeStamps': read_lsm_timestamps, - 'EventList': read_lsm_eventlist, - 'ChannelColors': read_lsm_channelcolors, - 'Positions': read_lsm_floatpairs, - 'TilePositions': read_lsm_floatpairs, - 'VectorOverlay': None, - 'InputLut': None, - 'OutputLut': None, - 'TimeIntervall': None, - 'ChannelDataTypes': None, - 'KsData': None, - 'Roi': None, - 'BleachRoi': None, - 'NextRecording': None, - 'MeanOfRoisOverlay': None, - 'TopoIsolineOverlay': None, - 'TopoProfileOverlay': None, - 'ChannelWavelength': None, - 'SphereCorrection': None, - 'ChannelFactors': None, - 'UnmixParameters': None, - 'AcquisitionParameters': None, - 'Characteristics': None, - } - - def CZ_LSMINFO_SCANTYPE(): - # Map CZ_LSMINFO.ScanType to dimension order - return { - 0: 'XYZCT', # 'Stack' normal x-y-z-scan - 1: 'XYZCT', # 'Z-Scan' x-z-plane Y=1 - 2: 'XYZCT', # 'Line' - 3: 'XYTCZ', # 'Time Series Plane' time series x-y XYCTZ ? 
Z=1 - 4: 'XYZTC', # 'Time Series z-Scan' time series x-z - 5: 'XYTCZ', # 'Time Series Mean-of-ROIs' - 6: 'XYZTC', # 'Time Series Stack' time series x-y-z - 7: 'XYCTZ', # Spline Scan - 8: 'XYCZT', # Spline Plane x-z - 9: 'XYTCZ', # Time Series Spline Plane x-z - 10: 'XYZCT', # 'Time Series Point' point mode - } - - def CZ_LSMINFO_DIMENSIONS(): - # Map dimension codes to CZ_LSMINFO attribute - return { - 'X': 'DimensionX', - 'Y': 'DimensionY', - 'Z': 'DimensionZ', - 'C': 'DimensionChannels', - 'T': 'DimensionTime', - 'P': 'DimensionP', - 'M': 'DimensionM', - } - - def CZ_LSMINFO_DATATYPES(): - # Description of CZ_LSMINFO.DataType - return { - 0: 'varying data types', - 1: '8 bit unsigned integer', - 2: '12 bit unsigned integer', - 5: '32 bit float', - } - - def CZ_LSMINFO_TYPEOFDATA(): - # Description of CZ_LSMINFO.TypeOfData - return { - 0: 'Original scan data', - 1: 'Calculated data', - 2: '3D reconstruction', - 3: 'Topography height map', - } - - def CZ_LSMINFO_SCANINFO_ARRAYS(): - return { - 0x20000000: 'Tracks', - 0x30000000: 'Lasers', - 0x60000000: 'DetectionChannels', - 0x80000000: 'IlluminationChannels', - 0xA0000000: 'BeamSplitters', - 0xC0000000: 'DataChannels', - 0x11000000: 'Timers', - 0x13000000: 'Markers', - } - - def CZ_LSMINFO_SCANINFO_STRUCTS(): - return { - # 0x10000000: 'Recording', - 0x40000000: 'Track', - 0x50000000: 'Laser', - 0x70000000: 'DetectionChannel', - 0x90000000: 'IlluminationChannel', - 0xB0000000: 'BeamSplitter', - 0xD0000000: 'DataChannel', - 0x12000000: 'Timer', - 0x14000000: 'Marker', - } - - def CZ_LSMINFO_SCANINFO_ATTRIBUTES(): - return { - # Recording - 0x10000001: 'Name', - 0x10000002: 'Description', - 0x10000003: 'Notes', - 0x10000004: 'Objective', - 0x10000005: 'ProcessingSummary', - 0x10000006: 'SpecialScanMode', - 0x10000007: 'ScanType', - 0x10000008: 'ScanMode', - 0x10000009: 'NumberOfStacks', - 0x1000000A: 'LinesPerPlane', - 0x1000000B: 'SamplesPerLine', - 0x1000000C: 'PlanesPerVolume', - 0x1000000D: 'ImagesWidth', - 0x1000000E: 'ImagesHeight', - 0x1000000F: 'ImagesNumberPlanes', - 0x10000010: 'ImagesNumberStacks', - 0x10000011: 'ImagesNumberChannels', - 0x10000012: 'LinscanXySize', - 0x10000013: 'ScanDirection', - 0x10000014: 'TimeSeries', - 0x10000015: 'OriginalScanData', - 0x10000016: 'ZoomX', - 0x10000017: 'ZoomY', - 0x10000018: 'ZoomZ', - 0x10000019: 'Sample0X', - 0x1000001A: 'Sample0Y', - 0x1000001B: 'Sample0Z', - 0x1000001C: 'SampleSpacing', - 0x1000001D: 'LineSpacing', - 0x1000001E: 'PlaneSpacing', - 0x1000001F: 'PlaneWidth', - 0x10000020: 'PlaneHeight', - 0x10000021: 'VolumeDepth', - 0x10000023: 'Nutation', - 0x10000034: 'Rotation', - 0x10000035: 'Precession', - 0x10000036: 'Sample0time', - 0x10000037: 'StartScanTriggerIn', - 0x10000038: 'StartScanTriggerOut', - 0x10000039: 'StartScanEvent', - 0x10000040: 'StartScanTime', - 0x10000041: 'StopScanTriggerIn', - 0x10000042: 'StopScanTriggerOut', - 0x10000043: 'StopScanEvent', - 0x10000044: 'StopScanTime', - 0x10000045: 'UseRois', - 0x10000046: 'UseReducedMemoryRois', - 0x10000047: 'User', - 0x10000048: 'UseBcCorrection', - 0x10000049: 'PositionBcCorrection1', - 0x10000050: 'PositionBcCorrection2', - 0x10000051: 'InterpolationY', - 0x10000052: 'CameraBinning', - 0x10000053: 'CameraSupersampling', - 0x10000054: 'CameraFrameWidth', - 0x10000055: 'CameraFrameHeight', - 0x10000056: 'CameraOffsetX', - 0x10000057: 'CameraOffsetY', - 0x10000059: 'RtBinning', - 0x1000005A: 'RtFrameWidth', - 0x1000005B: 'RtFrameHeight', - 0x1000005C: 'RtRegionWidth', - 0x1000005D: 'RtRegionHeight', - 0x1000005E: 
'RtOffsetX', - 0x1000005F: 'RtOffsetY', - 0x10000060: 'RtZoom', - 0x10000061: 'RtLinePeriod', - 0x10000062: 'Prescan', - 0x10000063: 'ScanDirectionZ', - # Track - 0x40000001: 'MultiplexType', # 0 After Line; 1 After Frame - 0x40000002: 'MultiplexOrder', - 0x40000003: 'SamplingMode', # 0 Sample; 1 Line Avg; 2 Frame Avg - 0x40000004: 'SamplingMethod', # 1 Mean; 2 Sum - 0x40000005: 'SamplingNumber', - 0x40000006: 'Acquire', - 0x40000007: 'SampleObservationTime', - 0x4000000B: 'TimeBetweenStacks', - 0x4000000C: 'Name', - 0x4000000D: 'Collimator1Name', - 0x4000000E: 'Collimator1Position', - 0x4000000F: 'Collimator2Name', - 0x40000010: 'Collimator2Position', - 0x40000011: 'IsBleachTrack', - 0x40000012: 'IsBleachAfterScanNumber', - 0x40000013: 'BleachScanNumber', - 0x40000014: 'TriggerIn', - 0x40000015: 'TriggerOut', - 0x40000016: 'IsRatioTrack', - 0x40000017: 'BleachCount', - 0x40000018: 'SpiCenterWavelength', - 0x40000019: 'PixelTime', - 0x40000021: 'CondensorFrontlens', - 0x40000023: 'FieldStopValue', - 0x40000024: 'IdCondensorAperture', - 0x40000025: 'CondensorAperture', - 0x40000026: 'IdCondensorRevolver', - 0x40000027: 'CondensorFilter', - 0x40000028: 'IdTransmissionFilter1', - 0x40000029: 'IdTransmission1', - 0x40000030: 'IdTransmissionFilter2', - 0x40000031: 'IdTransmission2', - 0x40000032: 'RepeatBleach', - 0x40000033: 'EnableSpotBleachPos', - 0x40000034: 'SpotBleachPosx', - 0x40000035: 'SpotBleachPosy', - 0x40000036: 'SpotBleachPosz', - 0x40000037: 'IdTubelens', - 0x40000038: 'IdTubelensPosition', - 0x40000039: 'TransmittedLight', - 0x4000003A: 'ReflectedLight', - 0x4000003B: 'SimultanGrabAndBleach', - 0x4000003C: 'BleachPixelTime', - # Laser - 0x50000001: 'Name', - 0x50000002: 'Acquire', - 0x50000003: 'Power', - # DetectionChannel - 0x70000001: 'IntegrationMode', - 0x70000002: 'SpecialMode', - 0x70000003: 'DetectorGainFirst', - 0x70000004: 'DetectorGainLast', - 0x70000005: 'AmplifierGainFirst', - 0x70000006: 'AmplifierGainLast', - 0x70000007: 'AmplifierOffsFirst', - 0x70000008: 'AmplifierOffsLast', - 0x70000009: 'PinholeDiameter', - 0x7000000A: 'CountingTrigger', - 0x7000000B: 'Acquire', - 0x7000000C: 'PointDetectorName', - 0x7000000D: 'AmplifierName', - 0x7000000E: 'PinholeName', - 0x7000000F: 'FilterSetName', - 0x70000010: 'FilterName', - 0x70000013: 'IntegratorName', - 0x70000014: 'ChannelName', - 0x70000015: 'DetectorGainBc1', - 0x70000016: 'DetectorGainBc2', - 0x70000017: 'AmplifierGainBc1', - 0x70000018: 'AmplifierGainBc2', - 0x70000019: 'AmplifierOffsetBc1', - 0x70000020: 'AmplifierOffsetBc2', - 0x70000021: 'SpectralScanChannels', - 0x70000022: 'SpiWavelengthStart', - 0x70000023: 'SpiWavelengthStop', - 0x70000026: 'DyeName', - 0x70000027: 'DyeFolder', - # IlluminationChannel - 0x90000001: 'Name', - 0x90000002: 'Power', - 0x90000003: 'Wavelength', - 0x90000004: 'Aquire', - 0x90000005: 'DetchannelName', - 0x90000006: 'PowerBc1', - 0x90000007: 'PowerBc2', - # BeamSplitter - 0xB0000001: 'FilterSet', - 0xB0000002: 'Filter', - 0xB0000003: 'Name', - # DataChannel - 0xD0000001: 'Name', - 0xD0000003: 'Acquire', - 0xD0000004: 'Color', - 0xD0000005: 'SampleType', - 0xD0000006: 'BitsPerSample', - 0xD0000007: 'RatioType', - 0xD0000008: 'RatioTrack1', - 0xD0000009: 'RatioTrack2', - 0xD000000A: 'RatioChannel1', - 0xD000000B: 'RatioChannel2', - 0xD000000C: 'RatioConst1', - 0xD000000D: 'RatioConst2', - 0xD000000E: 'RatioConst3', - 0xD000000F: 'RatioConst4', - 0xD0000010: 'RatioConst5', - 0xD0000011: 'RatioConst6', - 0xD0000012: 'RatioFirstImages1', - 0xD0000013: 'RatioFirstImages2', - 
0xD0000014: 'DyeName', - 0xD0000015: 'DyeFolder', - 0xD0000016: 'Spectrum', - 0xD0000017: 'Acquire', - # Timer - 0x12000001: 'Name', - 0x12000002: 'Description', - 0x12000003: 'Interval', - 0x12000004: 'TriggerIn', - 0x12000005: 'TriggerOut', - 0x12000006: 'ActivationTime', - 0x12000007: 'ActivationNumber', - # Marker - 0x14000001: 'Name', - 0x14000002: 'Description', - 0x14000003: 'TriggerIn', - 0x14000004: 'TriggerOut', - } - - def NIH_IMAGE_HEADER(): - return [ - ('FileID', 'a8'), - ('nLines', 'i2'), - ('PixelsPerLine', 'i2'), - ('Version', 'i2'), - ('OldLutMode', 'i2'), - ('OldnColors', 'i2'), - ('Colors', 'u1', (3, 32)), - ('OldColorStart', 'i2'), - ('ColorWidth', 'i2'), - ('ExtraColors', 'u2', (6, 3)), - ('nExtraColors', 'i2'), - ('ForegroundIndex', 'i2'), - ('BackgroundIndex', 'i2'), - ('XScale', 'f8'), - ('Unused2', 'i2'), - ('Unused3', 'i2'), - ('UnitsID', 'i2'), # NIH_UNITS_TYPE - ('p1', [('x', 'i2'), ('y', 'i2')]), - ('p2', [('x', 'i2'), ('y', 'i2')]), - ('CurveFitType', 'i2'), # NIH_CURVEFIT_TYPE - ('nCoefficients', 'i2'), - ('Coeff', 'f8', 6), - ('UMsize', 'u1'), - ('UM', 'a15'), - ('UnusedBoolean', 'u1'), - ('BinaryPic', 'b1'), - ('SliceStart', 'i2'), - ('SliceEnd', 'i2'), - ('ScaleMagnification', 'f4'), - ('nSlices', 'i2'), - ('SliceSpacing', 'f4'), - ('CurrentSlice', 'i2'), - ('FrameInterval', 'f4'), - ('PixelAspectRatio', 'f4'), - ('ColorStart', 'i2'), - ('ColorEnd', 'i2'), - ('nColors', 'i2'), - ('Fill1', '3u2'), - ('Fill2', '3u2'), - ('Table', 'u1'), # NIH_COLORTABLE_TYPE - ('LutMode', 'u1'), # NIH_LUTMODE_TYPE - ('InvertedTable', 'b1'), - ('ZeroClip', 'b1'), - ('XUnitSize', 'u1'), - ('XUnit', 'a11'), - ('StackType', 'i2'), # NIH_STACKTYPE_TYPE - # ('UnusedBytes', 'u1', 200) - ] - - def NIH_COLORTABLE_TYPE(): - return ( - 'CustomTable', - 'AppleDefault', - 'Pseudo20', - 'Pseudo32', - 'Rainbow', - 'Fire1', - 'Fire2', - 'Ice', - 'Grays', - 'Spectrum', - ) - - def NIH_LUTMODE_TYPE(): - return ( - 'PseudoColor', - 'OldAppleDefault', - 'OldSpectrum', - 'GrayScale', - 'ColorLut', - 'CustomGrayscale', - ) - - def NIH_CURVEFIT_TYPE(): - return ( - 'StraightLine', - 'Poly2', - 'Poly3', - 'Poly4', - 'Poly5', - 'ExpoFit', - 'PowerFit', - 'LogFit', - 'RodbardFit', - 'SpareFit1', - 'Uncalibrated', - 'UncalibratedOD', - ) - - def NIH_UNITS_TYPE(): - return ( - 'Nanometers', - 'Micrometers', - 'Millimeters', - 'Centimeters', - 'Meters', - 'Kilometers', - 'Inches', - 'Feet', - 'Miles', - 'Pixels', - 'OtherUnits', - ) - - def TVIPS_HEADER_V1(): - # TVIPS TemData structure from EMMENU Help file - return [ - ('Version', 'i4'), - ('CommentV1', 'a80'), - ('HighTension', 'i4'), - ('SphericalAberration', 'i4'), - ('IlluminationAperture', 'i4'), - ('Magnification', 'i4'), - ('PostMagnification', 'i4'), - ('FocalLength', 'i4'), - ('Defocus', 'i4'), - ('Astigmatism', 'i4'), - ('AstigmatismDirection', 'i4'), - ('BiprismVoltage', 'i4'), - ('SpecimenTiltAngle', 'i4'), - ('SpecimenTiltDirection', 'i4'), - ('IlluminationTiltDirection', 'i4'), - ('IlluminationTiltAngle', 'i4'), - ('ImageMode', 'i4'), - ('EnergySpread', 'i4'), - ('ChromaticAberration', 'i4'), - ('ShutterType', 'i4'), - ('DefocusSpread', 'i4'), - ('CcdNumber', 'i4'), - ('CcdSize', 'i4'), - ('OffsetXV1', 'i4'), - ('OffsetYV1', 'i4'), - ('PhysicalPixelSize', 'i4'), - ('Binning', 'i4'), - ('ReadoutSpeed', 'i4'), - ('GainV1', 'i4'), - ('SensitivityV1', 'i4'), - ('ExposureTimeV1', 'i4'), - ('FlatCorrected', 'i4'), - ('DeadPxCorrected', 'i4'), - ('ImageMean', 'i4'), - ('ImageStd', 'i4'), - ('DisplacementX', 'i4'), - ('DisplacementY', 'i4'), - 
('DateV1', 'i4'), - ('TimeV1', 'i4'), - ('ImageMin', 'i4'), - ('ImageMax', 'i4'), - ('ImageStatisticsQuality', 'i4'), - ] - - def TVIPS_HEADER_V2(): - return [ - ('ImageName', 'V160'), # utf16 - ('ImageFolder', 'V160'), - ('ImageSizeX', 'i4'), - ('ImageSizeY', 'i4'), - ('ImageSizeZ', 'i4'), - ('ImageSizeE', 'i4'), - ('ImageDataType', 'i4'), - ('Date', 'i4'), - ('Time', 'i4'), - ('Comment', 'V1024'), - ('ImageHistory', 'V1024'), - ('Scaling', '16f4'), - ('ImageStatistics', '16c16'), - ('ImageType', 'i4'), - ('ImageDisplaType', 'i4'), - ('PixelSizeX', 'f4'), # distance between two px in x, [nm] - ('PixelSizeY', 'f4'), # distance between two px in y, [nm] - ('ImageDistanceZ', 'f4'), - ('ImageDistanceE', 'f4'), - ('ImageMisc', '32f4'), - ('TemType', 'V160'), - ('TemHighTension', 'f4'), - ('TemAberrations', '32f4'), - ('TemEnergy', '32f4'), - ('TemMode', 'i4'), - ('TemMagnification', 'f4'), - ('TemMagnificationCorrection', 'f4'), - ('PostMagnification', 'f4'), - ('TemStageType', 'i4'), - ('TemStagePosition', '5f4'), # x, y, z, a, b - ('TemImageShift', '2f4'), - ('TemBeamShift', '2f4'), - ('TemBeamTilt', '2f4'), - ('TilingParameters', '7f4'), # 0: tiling? 1:x 2:y 3: max x - # 4: max y 5: overlap x 6: overlap y - ('TemIllumination', '3f4'), # 0: spotsize 1: intensity - ('TemShutter', 'i4'), - ('TemMisc', '32f4'), - ('CameraType', 'V160'), - ('PhysicalPixelSizeX', 'f4'), - ('PhysicalPixelSizeY', 'f4'), - ('OffsetX', 'i4'), - ('OffsetY', 'i4'), - ('BinningX', 'i4'), - ('BinningY', 'i4'), - ('ExposureTime', 'f4'), - ('Gain', 'f4'), - ('ReadoutRate', 'f4'), - ('FlatfieldDescription', 'V160'), - ('Sensitivity', 'f4'), - ('Dose', 'f4'), - ('CamMisc', '32f4'), - ('FeiMicroscopeInformation', 'V1024'), - ('FeiSpecimenInformation', 'V1024'), - ('Magic', 'u4'), - ] - - def MM_HEADER(): - # Olympus FluoView MM_Header - MM_DIMENSION = [ - ('Name', 'a16'), - ('Size', 'i4'), - ('Origin', 'f8'), - ('Resolution', 'f8'), - ('Unit', 'a64'), - ] - return [ - ('HeaderFlag', 'i2'), - ('ImageType', 'u1'), - ('ImageName', 'a257'), - ('OffsetData', 'u4'), - ('PaletteSize', 'i4'), - ('OffsetPalette0', 'u4'), - ('OffsetPalette1', 'u4'), - ('CommentSize', 'i4'), - ('OffsetComment', 'u4'), - ('Dimensions', MM_DIMENSION, 10), - ('OffsetPosition', 'u4'), - ('MapType', 'i2'), - ('MapMin', 'f8'), - ('MapMax', 'f8'), - ('MinValue', 'f8'), - ('MaxValue', 'f8'), - ('OffsetMap', 'u4'), - ('Gamma', 'f8'), - ('Offset', 'f8'), - ('GrayChannel', MM_DIMENSION), - ('OffsetThumbnail', 'u4'), - ('VoiceField', 'i4'), - ('OffsetVoiceField', 'u4'), - ] - - def MM_DIMENSIONS(): - # Map FluoView MM_Header.Dimensions to axes characters - return { - 'X': 'X', - 'Y': 'Y', - 'Z': 'Z', - 'T': 'T', - 'CH': 'C', - 'WAVELENGTH': 'C', - 'TIME': 'T', - 'XY': 'R', - 'EVENT': 'V', - 'EXPOSURE': 'L', - } - - def UIC_TAGS(): - # Map Universal Imaging Corporation MetaMorph internal tag ids to - # name and type - from fractions import Fraction # delayed import - return [ - ('AutoScale', int), - ('MinScale', int), - ('MaxScale', int), - ('SpatialCalibration', int), - ('XCalibration', Fraction), - ('YCalibration', Fraction), - ('CalibrationUnits', str), - ('Name', str), - ('ThreshState', int), - ('ThreshStateRed', int), - ('tagid_10', None), # undefined - ('ThreshStateGreen', int), - ('ThreshStateBlue', int), - ('ThreshStateLo', int), - ('ThreshStateHi', int), - ('Zoom', int), - ('CreateTime', julian_datetime), - ('LastSavedTime', julian_datetime), - ('currentBuffer', int), - ('grayFit', None), - ('grayPointCount', None), - ('grayX', Fraction), - ('grayY', 
Fraction), - ('grayMin', Fraction), - ('grayMax', Fraction), - ('grayUnitName', str), - ('StandardLUT', int), - ('wavelength', int), - ('StagePosition', '(%i,2,2)u4'), # N xy positions as fract - ('CameraChipOffset', '(%i,2,2)u4'), # N xy offsets as fract - ('OverlayMask', None), - ('OverlayCompress', None), - ('Overlay', None), - ('SpecialOverlayMask', None), - ('SpecialOverlayCompress', None), - ('SpecialOverlay', None), - ('ImageProperty', read_uic_image_property), - ('StageLabel', '%ip'), # N str - ('AutoScaleLoInfo', Fraction), - ('AutoScaleHiInfo', Fraction), - ('AbsoluteZ', '(%i,2)u4'), # N fractions - ('AbsoluteZValid', '(%i,)u4'), # N long - ('Gamma', 'I'), # 'I' uses offset - ('GammaRed', 'I'), - ('GammaGreen', 'I'), - ('GammaBlue', 'I'), - ('CameraBin', '2I'), - ('NewLUT', int), - ('ImagePropertyEx', None), - ('PlaneProperty', int), - ('UserLutTable', '(256,3)u1'), - ('RedAutoScaleInfo', int), - ('RedAutoScaleLoInfo', Fraction), - ('RedAutoScaleHiInfo', Fraction), - ('RedMinScaleInfo', int), - ('RedMaxScaleInfo', int), - ('GreenAutoScaleInfo', int), - ('GreenAutoScaleLoInfo', Fraction), - ('GreenAutoScaleHiInfo', Fraction), - ('GreenMinScaleInfo', int), - ('GreenMaxScaleInfo', int), - ('BlueAutoScaleInfo', int), - ('BlueAutoScaleLoInfo', Fraction), - ('BlueAutoScaleHiInfo', Fraction), - ('BlueMinScaleInfo', int), - ('BlueMaxScaleInfo', int), - # ('OverlayPlaneColor', read_uic_overlay_plane_color), - ] - - def PILATUS_HEADER(): - # PILATUS CBF Header Specification, Version 1.4 - # Map key to [value_indices], type - return { - 'Detector': ([slice(1, None)], str), - 'Pixel_size': ([1, 4], float), - 'Silicon': ([3], float), - 'Exposure_time': ([1], float), - 'Exposure_period': ([1], float), - 'Tau': ([1], float), - 'Count_cutoff': ([1], int), - 'Threshold_setting': ([1], float), - 'Gain_setting': ([1, 2], str), - 'N_excluded_pixels': ([1], int), - 'Excluded_pixels': ([1], str), - 'Flat_field': ([1], str), - 'Trim_file': ([1], str), - 'Image_path': ([1], str), - # optional - 'Wavelength': ([1], float), - 'Energy_range': ([1, 2], float), - 'Detector_distance': ([1], float), - 'Detector_Voffset': ([1], float), - 'Beam_xy': ([1, 2], float), - 'Flux': ([1], str), - 'Filter_transmission': ([1], float), - 'Start_angle': ([1], float), - 'Angle_increment': ([1], float), - 'Detector_2theta': ([1], float), - 'Polarization': ([1], float), - 'Alpha': ([1], float), - 'Kappa': ([1], float), - 'Phi': ([1], float), - 'Phi_increment': ([1], float), - 'Chi': ([1], float), - 'Chi_increment': ([1], float), - 'Oscillation_axis': ([slice(1, None)], str), - 'N_oscillations': ([1], int), - 'Start_position': ([1], float), - 'Position_increment': ([1], float), - 'Shutter_time': ([1], float), - 'Omega': ([1], float), - 'Omega_increment': ([1], float), - } - - def ALLOCATIONGRANULARITY(): - # alignment for writing contiguous data to TIFF - import mmap # delayed import - return mmap.ALLOCATIONGRANULARITY - - -def read_tags(fh, byteorder, offsetsize, tagnames, customtags=None, - maxifds=None): - """Read tags from chain of IFDs and return as list of dicts. - - The file handle position must be at a valid IFD header. 
- - """ - if offsetsize == 4: - offsetformat = byteorder + 'I' - tagnosize = 2 - tagnoformat = byteorder + 'H' - tagsize = 12 - tagformat1 = byteorder + 'HH' - tagformat2 = byteorder + 'I4s' - elif offsetsize == 8: - offsetformat = byteorder + 'Q' - tagnosize = 8 - tagnoformat = byteorder + 'Q' - tagsize = 20 - tagformat1 = byteorder + 'HH' - tagformat2 = byteorder + 'Q8s' - else: - raise ValueError('invalid offset size') - - if customtags is None: - customtags = {} - if maxifds is None: - maxifds = 2**32 - - result = [] - unpack = struct.unpack - offset = fh.tell() - while len(result) < maxifds: - # loop over IFDs - try: - tagno = unpack(tagnoformat, fh.read(tagnosize))[0] - if tagno > 4096: - raise TiffFileError('suspicious number of tags') - except Exception: - log_warning('read_tags: corrupted tag list at offset %i', offset) - break - - tags = {} - data = fh.read(tagsize * tagno) - pos = fh.tell() - index = 0 - for _ in range(tagno): - code, type_ = unpack(tagformat1, data[index:index + 4]) - count, value = unpack(tagformat2, data[index + 4: index + tagsize]) - index += tagsize - name = tagnames.get(code, str(code)) - try: - dtype = TIFF.DATA_FORMATS[type_] - except KeyError: - raise TiffFileError('unknown tag data type %i' % type_) - - fmt = '%s%i%s' % (byteorder, count * int(dtype[0]), dtype[1]) - size = struct.calcsize(fmt) - if size > offsetsize or code in customtags: - offset = unpack(offsetformat, value)[0] - if offset < 8 or offset > fh.size - size: - raise TiffFileError('invalid tag value offset %i' % offset) - fh.seek(offset) - if code in customtags: - readfunc = customtags[code][1] - value = readfunc(fh, byteorder, dtype, count, offsetsize) - elif type_ == 7 or (count > 1 and dtype[-1] == 'B'): - value = read_bytes(fh, byteorder, dtype, count, offsetsize) - elif code in tagnames or dtype[-1] == 's': - value = unpack(fmt, fh.read(size)) - else: - value = read_numpy(fh, byteorder, dtype, count, offsetsize) - elif dtype[-1] == 'B' or type_ == 7: - value = value[:size] - else: - value = unpack(fmt, value[:size]) - - if code not in customtags and code not in TIFF.TAG_TUPLE: - if len(value) == 1: - value = value[0] - if type_ != 7 and dtype[-1] == 's' and isinstance(value, bytes): - # TIFF ASCII fields can contain multiple strings, - # each terminated with a NUL - try: - value = bytes2str(stripascii(value).strip()) - except UnicodeDecodeError: - log_warning( - 'read_tags: coercing invalid ASCII to bytes (tag %i)', - code) - - tags[name] = value - - result.append(tags) - # read offset to next page - fh.seek(pos) - offset = unpack(offsetformat, fh.read(offsetsize))[0] - if offset == 0: - break - if offset >= fh.size: - log_warning('read_tags: invalid page offset (%i)', offset) - break - fh.seek(offset) - - if result and maxifds == 1: - result = result[0] - return result - - -def read_exif_ifd(fh, byteorder, dtype, count, offsetsize): - """Read EXIF tags from file and return as dict.""" - exif = read_tags(fh, byteorder, offsetsize, TIFF.EXIF_TAGS, maxifds=1) - for name in ('ExifVersion', 'FlashpixVersion'): - try: - exif[name] = bytes2str(exif[name]) - except Exception: - pass - if 'UserComment' in exif: - idcode = exif['UserComment'][:8] - try: - if idcode == b'ASCII\x00\x00\x00': - exif['UserComment'] = bytes2str(exif['UserComment'][8:]) - elif idcode == b'UNICODE\x00': - exif['UserComment'] = exif['UserComment'][8:].decode('utf-16') - except Exception: - pass - return exif - - -def read_gps_ifd(fh, byteorder, dtype, count, offsetsize): - """Read GPS tags from file and return as 
dict.""" - return read_tags(fh, byteorder, offsetsize, TIFF.GPS_TAGS, maxifds=1) - - -def read_interoperability_ifd(fh, byteorder, dtype, count, offsetsize): - """Read Interoperability tags from file and return as dict.""" - tag_names = {1: 'InteroperabilityIndex'} - return read_tags(fh, byteorder, offsetsize, tag_names, maxifds=1) - - -def read_bytes(fh, byteorder, dtype, count, offsetsize): - """Read tag data from file and return as byte string.""" - dtype = 'B' if dtype[-1] == 's' else byteorder + dtype[-1] - count *= numpy.dtype(dtype).itemsize - data = fh.read(count) - if len(data) != count: - log_warning('read_bytes: failed to read all bytes (%i < %i)', - len(data), count) - return data - - -def read_utf8(fh, byteorder, dtype, count, offsetsize): - """Read tag data from file and return as unicode string.""" - return fh.read(count).decode('utf-8') - - -def read_numpy(fh, byteorder, dtype, count, offsetsize): - """Read tag data from file and return as numpy array.""" - dtype = 'b' if dtype[-1] == 's' else byteorder + dtype[-1] - return fh.read_array(dtype, count) - - -def read_colormap(fh, byteorder, dtype, count, offsetsize): - """Read ColorMap data from file and return as numpy array.""" - cmap = fh.read_array(byteorder + dtype[-1], count) - cmap.shape = (3, -1) - return cmap - - -def read_json(fh, byteorder, dtype, count, offsetsize): - """Read JSON tag data from file and return as object.""" - data = fh.read(count) - try: - return json.loads(unicode(stripnull(data), 'utf-8')) - except ValueError: - log_warning('read_json: invalid JSON') - - -def read_mm_header(fh, byteorder, dtype, count, offsetsize): - """Read FluoView mm_header tag from file and return as dict.""" - mmh = fh.read_record(TIFF.MM_HEADER, byteorder=byteorder) - mmh = recarray2dict(mmh) - mmh['Dimensions'] = [ - (bytes2str(d[0]).strip(), d[1], d[2], d[3], bytes2str(d[4]).strip()) - for d in mmh['Dimensions']] - d = mmh['GrayChannel'] - mmh['GrayChannel'] = ( - bytes2str(d[0]).strip(), d[1], d[2], d[3], bytes2str(d[4]).strip()) - return mmh - - -def read_mm_stamp(fh, byteorder, dtype, count, offsetsize): - """Read FluoView mm_stamp tag from file and return as numpy.ndarray.""" - return fh.read_array(byteorder + 'f8', 8) - - -def read_uic1tag(fh, byteorder, dtype, count, offsetsize, planecount=None): - """Read MetaMorph STK UIC1Tag from file and return as dict. - - Return empty dictionary if planecount is unknown. - - """ - if dtype not in ('2I', '1I') or byteorder != '<': - raise ValueError('invalid UIC1Tag') - result = {} - if dtype == '2I': - # pre MetaMorph 2.5 (not tested) - values = fh.read_array(' structure_size: - break - lsminfo.append((name, dtype)) - else: - lsminfo = TIFF.CZ_LSMINFO - - lsminfo = fh.read_record(lsminfo, byteorder=byteorder) - lsminfo = recarray2dict(lsminfo) - - # read LSM info subrecords at offsets - for name, reader in TIFF.CZ_LSMINFO_READERS.items(): - if reader is None: - continue - offset = lsminfo.get('Offset' + name, 0) - if offset < 8: - continue - fh.seek(offset) - try: - lsminfo[name] = reader(fh) - except ValueError: - pass - return lsminfo - - -def read_lsm_floatpairs(fh): - """Read LSM sequence of float pairs from file and return as list.""" - size = struct.unpack(' 0: - esize, etime, etype = struct.unpack(' 4: - size = struct.unpack(' 1 else {} - return frame_data, roi_data - - -def read_micromanager_metadata(fh): - """Read MicroManager non-TIFF settings from open file and return as dict. - - The settings can be used to read image data without parsing the TIFF file. 
- - Raise ValueError if the file does not contain valid MicroManager metadata. - - """ - fh.seek(0) - try: - byteorder = {b'II': '<', b'MM': '>'}[fh.read(2)] - except IndexError: - raise ValueError('not a MicroManager TIFF file') - - result = {} - fh.seek(8) - ( - index_header, - index_offset, - display_header, - display_offset, - comments_header, - comments_offset, - summary_header, - summary_length - ) = struct.unpack(byteorder + 'IIIIIIII', fh.read(32)) - - if summary_header != 2355492: - raise ValueError('invalid MicroManager summary header') - result['Summary'] = read_json(fh, byteorder, None, summary_length, None) - - if index_header != 54773648: - raise ValueError('invalid MicroManager index header') - fh.seek(index_offset) - header, count = struct.unpack(byteorder + 'II', fh.read(8)) - if header != 3453623: - raise ValueError('invalid MicroManager index header') - data = struct.unpack(byteorder + 'IIIII' * count, fh.read(20 * count)) - result['IndexMap'] = { - 'Channel': data[::5], - 'Slice': data[1::5], - 'Frame': data[2::5], - 'Position': data[3::5], - 'Offset': data[4::5], - } - - if display_header != 483765892: - raise ValueError('invalid MicroManager display header') - fh.seek(display_offset) - header, count = struct.unpack(byteorder + 'II', fh.read(8)) - if header != 347834724: - raise ValueError('invalid MicroManager display header') - result['DisplaySettings'] = read_json(fh, byteorder, None, count, None) - - if comments_header != 99384722: - raise ValueError('invalid MicroManager comments header') - fh.seek(comments_offset) - header, count = struct.unpack(byteorder + 'II', fh.read(8)) - if header != 84720485: - raise ValueError('invalid MicroManager comments header') - result['Comments'] = read_json(fh, byteorder, None, count, None) - - return result - - -def read_metaseries_catalog(fh): - """Read MetaSeries non-TIFF hint catalog from file. - - Raise ValueError if the file does not contain a valid hint catalog. - - """ - # TODO: implement read_metaseries_catalog - raise NotImplementedError() - - -def imagej_metadata_tag(metadata, byteorder): - """Return IJMetadata and IJMetadataByteCounts tags from metadata dict. - - The tags can be passed to the TiffWriter.save function as extratags. - - The metadata dict may contain the following keys and values: - - Info : str - Human-readable information as string. - Labels : sequence of str - Human-readable labels for each channel. - Ranges : sequence of doubles - Lower and upper values for each channel. - LUTs : sequence of (3, 256) uint8 ndarrays - Color palettes for each channel. - Plot : bytes - Undocumented ImageJ internal format. - ROI: bytes - Undocumented ImageJ internal region of interest format. - Overlays : bytes - Undocumented ImageJ internal format. 
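# Usage sketch for imagej_metadata_tag() declared above; the import path is an
# assumption (the surrounding module is referred to here as `tifffile`) and
# the metadata values are synthetic.
from tifffile import imagej_metadata_tag   # assumed import path

ijtags = imagej_metadata_tag(
    {'Info': 'synthetic example', 'Labels': ['ch0', 'ch1']}, '<')
for code, dtype, count, value, writeonce in ijtags:
    # 50839 and 50838 are the IJMetadata and IJMetadataByteCounts tag codes
    print(code, dtype, count)
# per the docstring, both tuples can be passed as extratags to TiffWriter.save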
- - """ - header = [{'>': b'IJIJ', '<': b'JIJI'}[byteorder]] - bytecounts = [0] - body = [] - - def _string(data, byteorder): - return data.encode('utf-16' + {'>': 'be', '<': 'le'}[byteorder]) - - def _doubles(data, byteorder): - return struct.pack(byteorder + ('d' * len(data)), *data) - - def _ndarray(data, byteorder): - return data.tobytes() - - def _bytes(data, byteorder): - return data - - metadata_types = ( - ('Info', b'info', 1, _string), - ('Labels', b'labl', None, _string), - ('Ranges', b'rang', 1, _doubles), - ('LUTs', b'luts', None, _ndarray), - ('Plot', b'plot', 1, _bytes), - ('ROI', b'roi ', 1, _bytes), - ('Overlays', b'over', None, _bytes), - ) - - for key, mtype, count, func in metadata_types: - if key.lower() in metadata: - key = key.lower() - elif key not in metadata: - continue - if byteorder == '<': - mtype = mtype[::-1] - values = metadata[key] - if count is None: - count = len(values) - else: - values = [values] - header.append(mtype + struct.pack(byteorder + 'I', count)) - for value in values: - data = func(value, byteorder) - body.append(data) - bytecounts.append(len(data)) - - if not body: - return () - body = b''.join(body) - header = b''.join(header) - data = header + body - bytecounts[0] = len(header) - bytecounts = struct.pack(byteorder + ('I' * len(bytecounts)), *bytecounts) - return ( - (50839, 'B', len(data), data, True), - (50838, 'I', len(bytecounts) // 4, bytecounts, True) - ) - - -def imagej_metadata(data, bytecounts, byteorder): - """Return IJMetadata tag value as dict. - - The 'Info' string can have multiple formats, e.g. OIF or ScanImage, - that might be parsed into dicts using the matlabstr2py or - oiffile.SettingsFile functions. - - """ - - def _string(data, byteorder): - return data.decode('utf-16' + {'>': 'be', '<': 'le'}[byteorder]) - - def _doubles(data, byteorder): - return struct.unpack(byteorder + ('d' * (len(data) // 8)), data) - - def _lut(data, byteorder): - return numpy.frombuffer(data, 'uint8').reshape(-1, 256) - - def _bytes(data, byteorder): - return data - - # big-endian - metadata_types = { - b'info': ('Info', _string), - b'labl': ('Labels', _string), - b'rang': ('Ranges', _doubles), - b'luts': ('LUTs', _lut), - b'plot': ('Plots', _bytes), - b'roi ': ('ROI', _bytes), - b'over': ('Overlays', _bytes), - } - # little-endian - metadata_types.update({k[::-1]: v for k, v in metadata_types.items()}) - - if not bytecounts: - raise ValueError('no ImageJ metadata') - - if not data[:4] in (b'IJIJ', b'JIJI'): - raise ValueError('invalid ImageJ metadata') - - header_size = bytecounts[0] - if header_size < 12 or header_size > 804: - raise ValueError('invalid ImageJ metadata header size') - - ntypes = (header_size - 4) // 8 - header = struct.unpack(byteorder + '4sI' * ntypes, data[4: 4 + ntypes * 8]) - pos = 4 + ntypes * 8 - counter = 0 - result = {} - for mtype, count in zip(header[::2], header[1::2]): - values = [] - name, func = metadata_types.get(mtype, (bytes2str(mtype), read_bytes)) - for _ in range(count): - counter += 1 - pos1 = pos + bytecounts[counter] - values.append(func(data[pos:pos1], byteorder)) - pos = pos1 - result[name.strip()] = values[0] if count == 1 else values - return result - - -def imagej_description_metadata(description): - """Return metatata from ImageJ image description as dict. - - Raise ValueError if not a valid ImageJ description. 
- - >>> description = 'ImageJ=1.11a\\nimages=510\\nhyperstack=true\\n' - >>> imagej_description_metadata(description) # doctest: +SKIP - {'ImageJ': '1.11a', 'images': 510, 'hyperstack': True} - - """ - - def _bool(val): - return {'true': True, 'false': False}[val.lower()] - - result = {} - for line in description.splitlines(): - try: - key, val = line.split('=') - except Exception: - continue - key = key.strip() - val = val.strip() - for dtype in (int, float, _bool): - try: - val = dtype(val) - break - except Exception: - pass - result[key] = val - - if 'ImageJ' not in result: - raise ValueError('not a ImageJ image description') - return result - - -def imagej_description(shape, rgb=None, colormaped=False, version=None, - hyperstack=None, mode=None, loop=None, **kwargs): - """Return ImageJ image description from data shape. - - ImageJ can handle up to 6 dimensions in order TZCYXS. - - >>> imagej_description((51, 5, 2, 196, 171)) # doctest: +SKIP - ImageJ=1.11a - images=510 - channels=2 - slices=5 - frames=51 - hyperstack=true - mode=grayscale - loop=false - - """ - if colormaped: - raise NotImplementedError('ImageJ colormapping not supported') - if version is None: - version = '1.11a' - shape = imagej_shape(shape, rgb=rgb) - rgb = shape[-1] in (3, 4) - - result = ['ImageJ=%s' % version] - append = [] - result.append('images=%i' % product(shape[:-3])) - if hyperstack is None: - hyperstack = True - append.append('hyperstack=true') - else: - append.append('hyperstack=%s' % bool(hyperstack)) - if shape[2] > 1: - result.append('channels=%i' % shape[2]) - if mode is None and not rgb: - mode = 'grayscale' - if hyperstack and mode: - append.append('mode=%s' % mode) - if shape[1] > 1: - result.append('slices=%i' % shape[1]) - if shape[0] > 1: - result.append('frames=%i' % shape[0]) - if loop is None: - append.append('loop=false') - if loop is not None: - append.append('loop=%s' % bool(loop)) - for key, value in kwargs.items(): - append.append('%s=%s' % (key.lower(), value)) - - return '\n'.join(result + append + ['']) - - -def imagej_shape(shape, rgb=None): - """Return shape normalized to 6D ImageJ hyperstack TZCYXS. - - Raise ValueError if not a valid ImageJ hyperstack shape. - - >>> imagej_shape((2, 3, 4, 5, 3), False) - (2, 3, 4, 5, 3, 1) - - """ - shape = tuple(int(i) for i in shape) - ndim = len(shape) - if 1 > ndim > 6: - raise ValueError('invalid ImageJ hyperstack: not 2 to 6 dimensional') - if rgb is None: - rgb = shape[-1] in (3, 4) and ndim > 2 - if rgb and shape[-1] not in (3, 4): - raise ValueError('invalid ImageJ hyperstack: not a RGB image') - if not rgb and ndim == 6 and shape[-1] != 1: - raise ValueError('invalid ImageJ hyperstack: not a non-RGB image') - if rgb or shape[-1] == 1: - return (1, ) * (6 - ndim) + shape - return (1, ) * (5 - ndim) + shape + (1,) - - -def json_description(shape, **metadata): - """Return JSON image description from data shape and other metadata. - - Return UTF-8 encoded JSON. - - >>> json_description((256, 256, 3), axes='YXS') # doctest: +SKIP - b'{"shape": [256, 256, 3], "axes": "YXS"}' - - """ - metadata.update(shape=shape) - return json.dumps(metadata) # .encode('utf-8') - - -def json_description_metadata(description): - """Return metatata from JSON formated image description as dict. - - Raise ValuError if description is of unknown format. 
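# Usage sketch for the ImageJ description helpers defined above, assuming they
# are importable from the surrounding module (import path is an assumption).
from tifffile import imagej_description, imagej_description_metadata  # assumed

descr = imagej_description((51, 5, 2, 196, 171))    # a TZCYX stack
print(descr.splitlines()[0])                        # ImageJ=1.11a
meta = imagej_description_metadata(descr)
print(meta['channels'], meta['slices'], meta['frames'], meta['hyperstack'])
# 2 5 51 True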
- - >>> description = '{"shape": [256, 256, 3], "axes": "YXS"}' - >>> json_description_metadata(description) # doctest: +SKIP - {'shape': [256, 256, 3], 'axes': 'YXS'} - >>> json_description_metadata('shape=(256, 256, 3)') - {'shape': (256, 256, 3)} - - """ - if description[:6] == 'shape=': - # old-style 'shaped' description; not JSON - shape = tuple(int(i) for i in description[7:-1].split(',')) - return dict(shape=shape) - if description[:1] == '{' and description[-1:] == '}': - # JSON description - return json.loads(description) - raise ValueError('invalid JSON image description', description) - - -def fluoview_description_metadata(description, ignoresections=None): - """Return metatata from FluoView image description as dict. - - The FluoView image description format is unspecified. Expect failures. - - >>> descr = ('[Intensity Mapping]\\nMap Ch0: Range=00000 to 02047\\n' - ... '[Intensity Mapping End]') - >>> fluoview_description_metadata(descr) - {'Intensity Mapping': {'Map Ch0: Range': '00000 to 02047'}} - - """ - if not description.startswith('['): - raise ValueError('invalid FluoView image description') - if ignoresections is None: - ignoresections = {'Region Info (Fields)', 'Protocol Description'} - - result = {} - sections = [result] - comment = False - for line in description.splitlines(): - if not comment: - line = line.strip() - if not line: - continue - if line[0] == '[': - if line[-5:] == ' End]': - # close section - del sections[-1] - section = sections[-1] - name = line[1:-5] - if comment: - section[name] = '\n'.join(section[name]) - if name[:4] == 'LUT ': - a = numpy.array(section[name], dtype='uint8') - a.shape = -1, 3 - section[name] = a - continue - # new section - comment = False - name = line[1:-1] - if name[:4] == 'LUT ': - section = [] - elif name in ignoresections: - section = [] - comment = True - else: - section = {} - sections.append(section) - result[name] = section - continue - # add entry - if comment: - section.append(line) - continue - line = line.split('=', 1) - if len(line) == 1: - section[line[0].strip()] = None - continue - key, value = line - if key[:4] == 'RGB ': - section.extend(int(rgb) for rgb in value.split()) - else: - section[key.strip()] = astype(value.strip()) - return result - - -def pilatus_description_metadata(description): - """Return metatata from Pilatus image description as dict. - - Return metadata from Pilatus pixel array detectors by Dectris, created - by camserver or TVX software. 
- - >>> pilatus_description_metadata('# Pixel_size 172e-6 m x 172e-6 m') - {'Pixel_size': (0.000172, 0.000172)} - - """ - result = {} - if not description.startswith('# '): - return result - for c in '#:=,()': - description = description.replace(c, ' ') - for line in description.split('\n'): - if line[:2] != ' ': - continue - line = line.split() - name = line[0] - if line[0] not in TIFF.PILATUS_HEADER: - try: - result['DateTime'] = datetime.datetime.strptime( - ' '.join(line), '%Y-%m-%dT%H %M %S.%f') - except Exception: - result[name] = ' '.join(line[1:]) - continue - indices, dtype = TIFF.PILATUS_HEADER[line[0]] - if isinstance(indices[0], slice): - # assumes one slice - values = line[indices[0]] - else: - values = [line[i] for i in indices] - if dtype is float and values[0] == 'not': - values = ['NaN'] - values = tuple(dtype(v) for v in values) - if dtype == str: - values = ' '.join(values) - elif len(values) == 1: - values = values[0] - result[name] = values - return result - - -def svs_description_metadata(description): - """Return metatata from Aperio image description as dict. - - The Aperio image description format is unspecified. Expect failures. - - >>> svs_description_metadata('Aperio Image Library v1.0') - {'Aperio Image Library': 'v1.0'} - - """ - if not description.startswith('Aperio Image Library '): - raise ValueError('invalid Aperio image description') - result = {} - lines = description.split('\n') - key, value = lines[0].strip().rsplit(None, 1) # 'Aperio Image Library' - result[key.strip()] = value.strip() - if len(lines) == 1: - return result - items = lines[1].split('|') - result[''] = items[0].strip() # TODO: parse this? - for item in items[1:]: - key, value = item.split(' = ') - result[key.strip()] = astype(value.strip()) - return result - - -def stk_description_metadata(description): - """Return metadata from MetaMorph image description as list of dict. - - The MetaMorph image description format is unspecified. Expect failures. 
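# Usage sketch for svs_description_metadata() above (assumed importable from
# the surrounding module); the Aperio-style description string is synthetic.
from tifffile import svs_description_metadata   # assumed import path

descr = ('Aperio Image Library v10.0.51\n'
         '46000x32914 [0,100 44416x32893] (256x256) JPEG/RGB Q=30'
         '|AppMag = 20|MPP = 0.4990')
meta = svs_description_metadata(descr)
print(meta['Aperio Image Library'])   # v10.0.51
print(meta['AppMag'], meta['MPP'])    # 20 0.499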
- - """ - description = description.strip() - if not description: - return [] - try: - description = bytes2str(description) - except UnicodeDecodeError as exc: - log_warning('stk_description_metadata: %s: %s', - exc.__class__.__name__, exc) - return [] - result = [] - for plane in description.split('\x00'): - d = {} - for line in plane.split('\r\n'): - line = line.split(':', 1) - if len(line) > 1: - name, value = line - d[name.strip()] = astype(value.strip()) - else: - value = line[0].strip() - if value: - if '' in d: - d[''].append(value) - else: - d[''] = [value] - result.append(d) - return result - - -def metaseries_description_metadata(description): - """Return metatata from MetaSeries image description as dict.""" - if not description.startswith(''): - raise ValueError('invalid MetaSeries image description') - - from xml.etree import cElementTree as etree # delayed import - - root = etree.fromstring(description) - types = { - 'float': float, - 'int': int, - 'bool': lambda x: asbool(x, 'on', 'off'), - } - - def parse(root, result): - # recursive - for child in root: - attrib = child.attrib - if not attrib: - result[child.tag] = parse(child, {}) - continue - if 'id' in attrib: - i = attrib['id'] - t = attrib['type'] - v = attrib['value'] - if t in types: - result[i] = types[t](v) - else: - result[i] = v - return result - - adict = parse(root, {}) - if 'Description' in adict: - adict['Description'] = adict['Description'].replace(' ', '\n') - return adict - - -def scanimage_description_metadata(description): - """Return metatata from ScanImage image description as dict.""" - return matlabstr2py(description) - - -def scanimage_artist_metadata(artist): - """Return metatata from ScanImage artist tag as dict.""" - try: - return json.loads(artist) - except ValueError as exc: - log_warning('scanimage_artist_metadata: %s: %s', - exc.__class__.__name__, exc) - - -def olympusini_metadata(inistr): - """Return OlympusSIS metadata from INI string. - - No documentation is available. 
- - """ - - def keyindex(key): - # split key into name and index - index = 0 - i = len(key.rstrip('0123456789')) - if i < len(key): - index = int(key[i:]) - 1 - key = key[:i] - return key, index - - result = {} - bands = [] - zpos = None - tpos = None - for line in inistr.splitlines(): - line = line.strip() - if line == '' or line[0] == ';': - continue - if line[0] == '[' and line[-1] == ']': - section_name = line[1:-1] - result[section_name] = section = {} - if section_name == 'Dimension': - result['axes'] = axes = [] - result['shape'] = shape = [] - elif section_name == 'ASD': - result[section_name] = [] - elif section_name == 'Z': - if 'Dimension' in result: - result[section_name]['ZPos'] = zpos = [] - elif section_name == 'Time': - if 'Dimension' in result: - result[section_name]['TimePos'] = tpos = [] - elif section_name == 'Band': - nbands = result['Dimension']['Band'] - bands = [{'LUT': []} for i in range(nbands)] - result[section_name] = bands - iband = 0 - else: - key, value = line.split('=') - if value.strip() == '': - value = None - elif ',' in value: - value = tuple(astype(v) for v in value.split(',')) - else: - value = astype(value) - - if section_name == 'Dimension': - section[key] = value - axes.append(key) - shape.append(value) - elif section_name == 'ASD': - if key == 'Count': - result['ASD'] = [{}] * value - else: - key, index = keyindex(key) - result['ASD'][index][key] = value - elif section_name == 'Band': - if key[:3] == 'LUT': - lut = bands[iband]['LUT'] - value = struct.pack(' 1: - axes.append(sisaxes.get(x, x[0].upper())) - shape.append(i) - result['axes'] = ''.join(axes) - result['shape'] = tuple(shape) - try: - result['Z']['ZPos'] = numpy.array( - result['Z']['ZPos'][:result['Dimension']['Z']], 'float64') - except Exception: - pass - try: - result['Time']['TimePos'] = numpy.array( - result['Time']['TimePos'][:result['Dimension']['Time']], 'int32') - except Exception: - pass - for band in bands: - band['LUT'] = numpy.array(band['LUT'], 'uint8') - return result - - -def tile_decode(tile, tileindex, tileshape, tiledshape, - lsb2msb, decompress, unpack, unpredict, nodata, out): - """Decode tile segment bytes into 5D output array.""" - _, imagedepth, imagelength, imagewidth, _ = out.shape - tileddepth, tiledlength, tiledwidth = tiledshape - tiledepth, tilelength, tilewidth, samples = tileshape - tilesize = tiledepth * tilelength * tilewidth * samples - pl = tileindex // (tiledwidth * tiledlength * tileddepth) - td = (tileindex // (tiledwidth * tiledlength)) % tileddepth * tiledepth - tl = (tileindex // tiledwidth) % tiledlength * tilelength - tw = tileindex % tiledwidth * tilewidth - - if tile is None: - out[pl, - td: td + tiledepth, - tl: tl + tilelength, - tw: tw + tilewidth] = nodata - return - - if lsb2msb: - tile = bitorder_decode(tile, out=tile) - tile = decompress(tile) - tile = unpack(tile) - # decompression / unpacking might return too many bytes - tile = tile[:tilesize] - try: - # complete tile according to TIFF specification - tile.shape = tileshape - except ValueError: - # tile fills remaining space; found in some JPEG compressed slides - s = ( - min(imagedepth - td, tiledepth), - min(imagelength - tl, tilelength), - min(imagewidth - tw, tilewidth), - samples, - ) - try: - tile.shape = s - except ValueError: - # incomplete tile; see gdal issue #1179 - log_warning('tile_decode: incomplete tile %s %s', - tile.shape, tileshape) - t = numpy.zeros(tilesize, tile.dtype) - s = min(tile.size, tilesize) - t[:s] = tile[:s] - tile = t.reshape(tileshape) - tile = 
unpredict(tile, axis=-2, out=tile) - out[pl, - td: td + tiledepth, - tl: tl + tilelength, - tw: tw + tilewidth] = tile[:imagedepth - td, - :imagelength - tl, - :imagewidth - tw] - - -def unpack_rgb(data, dtype=None, bitspersample=None, rescale=True): - """Return array from byte string containing packed samples. - - Use to unpack RGB565 or RGB555 to RGB888 format. - - Parameters - ---------- - data : byte str - The data to be decoded. Samples in each pixel are stored consecutively. - Pixels are aligned to 8, 16, or 32 bit boundaries. - dtype : numpy.dtype - The sample data type. The byteorder applies also to the data stream. - bitspersample : tuple - Number of bits for each sample in a pixel. - rescale : bool - Upscale samples to the number of bits in dtype. - - Returns - ------- - numpy.ndarray - Flattened array of unpacked samples of native dtype. - - Examples - -------- - >>> data = struct.pack('BBBB', 0x21, 0x08, 0xff, 0xff) - >>> print(unpack_rgb(data, '>> print(unpack_rgb(data, '>> print(unpack_rgb(data, '= bits) - data = numpy.frombuffer(data, dtype.byteorder + dt) - result = numpy.empty((data.size, len(bitspersample)), dtype.char) - for i, bps in enumerate(bitspersample): - t = data >> int(numpy.sum(bitspersample[i + 1:])) - t &= int('0b' + '1' * bps, 2) - if rescale: - o = ((dtype.itemsize * 8) // bps + 1) * bps - if o > data.dtype.itemsize * 8: - t = t.astype('I') - t *= (2**o - 1) // (2**bps - 1) - t //= 2**(o - (dtype.itemsize * 8)) - result[:, i] = t - return result.reshape(-1) - - -def delta_encode(data, axis=-1, out=None): - """Encode Delta.""" - if isinstance(data, (bytes, bytearray)): - data = numpy.frombuffer(data, dtype='u1') - diff = numpy.diff(data, axis=0) - return numpy.insert(diff, 0, data[0]).tobytes() - - dtype = data.dtype - if dtype.kind == 'f': - data = data.view('u%i' % dtype.itemsize) - - diff = numpy.diff(data, axis=axis) - key = [slice(None)] * data.ndim - key[axis] = 0 - diff = numpy.insert(diff, 0, data[tuple(key)], axis=axis) - - if dtype.kind == 'f': - return diff.view(dtype) - return diff - - -def delta_decode(data, axis=-1, out=None): - """Decode Delta.""" - if out is not None and not out.flags.writeable: - out = None - if isinstance(data, (bytes, bytearray)): - data = numpy.frombuffer(data, dtype='u1') - return numpy.cumsum(data, axis=0, dtype='u1', out=out).tobytes() - if data.dtype.kind == 'f': - view = data.view('u%i' % data.dtype.itemsize) - view = numpy.cumsum(view, axis=axis, dtype=view.dtype) - return view.view(data.dtype) - return numpy.cumsum(data, axis=axis, dtype=data.dtype, out=out) - - -def bitorder_decode(data, out=None, _bitorder=[]): - """Reverse bits in each byte of byte string or numpy array. - - Decode data where pixels with lower column values are stored in the - lower-order bits of the bytes (TIFF FillOrder is LSB2MSB). - - Parameters - ---------- - data : byte string or ndarray - The data to be bit reversed. If byte string, a new bit-reversed byte - string is returned. Numpy arrays are bit-reversed in-place. 
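# A standalone sketch, not part of the original module, of the horizontal
# differencing (TIFF predictor 2) round trip implemented by delta_encode()
# and delta_decode() above; the sample row is synthetic.
import numpy

row = numpy.array([10, 12, 15, 15, 20], dtype='uint8')
encoded = numpy.insert(numpy.diff(row), 0, row[0])   # keep first sample, store differences
decoded = numpy.cumsum(encoded, dtype='uint8')       # cumulative sum restores the row
print(encoded)                                       # [10  2  3  0  5]
print(numpy.array_equal(decoded, row))               # True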
- - Examples - -------- - >>> bitorder_decode(b'\\x01\\x64') - b'\\x80&' - >>> data = numpy.array([1, 666], dtype='uint16') - >>> bitorder_decode(data) - >>> data - array([ 128, 16473], dtype=uint16) - - """ - if not _bitorder: - _bitorder.append( - b'\x00\x80@\xc0 \xa0`\xe0\x10\x90P\xd00\xb0p\xf0\x08\x88H\xc8(' - b'\xa8h\xe8\x18\x98X\xd88\xb8x\xf8\x04\x84D\xc4$\xa4d\xe4\x14' - b'\x94T\xd44\xb4t\xf4\x0c\x8cL\xcc,\xacl\xec\x1c\x9c\\\xdc<\xbc|' - b'\xfc\x02\x82B\xc2"\xa2b\xe2\x12\x92R\xd22\xb2r\xf2\n\x8aJ\xca*' - b'\xaaj\xea\x1a\x9aZ\xda:\xbaz\xfa\x06\x86F\xc6&\xa6f\xe6\x16' - b'\x96V\xd66\xb6v\xf6\x0e\x8eN\xce.\xaen\xee\x1e\x9e^\xde>\xbe~' - b'\xfe\x01\x81A\xc1!\xa1a\xe1\x11\x91Q\xd11\xb1q\xf1\t\x89I\xc9)' - b'\xa9i\xe9\x19\x99Y\xd99\xb9y\xf9\x05\x85E\xc5%\xa5e\xe5\x15' - b'\x95U\xd55\xb5u\xf5\r\x8dM\xcd-\xadm\xed\x1d\x9d]\xdd=\xbd}' - b'\xfd\x03\x83C\xc3#\xa3c\xe3\x13\x93S\xd33\xb3s\xf3\x0b\x8bK' - b'\xcb+\xabk\xeb\x1b\x9b[\xdb;\xbb{\xfb\x07\x87G\xc7\'\xa7g\xe7' - b'\x17\x97W\xd77\xb7w\xf7\x0f\x8fO\xcf/\xafo\xef\x1f\x9f_' - b'\xdf?\xbf\x7f\xff' - ) - _bitorder.append(numpy.frombuffer(_bitorder[0], dtype='uint8')) - try: - view = data.view('uint8') - numpy.take(_bitorder[1], view, out=view) - return data - except AttributeError: - return data.translate(_bitorder[0]) - except ValueError: - raise NotImplementedError('slices of arrays not supported') - return None - - -def packints_decode(data, dtype, numbits, runlen=0, out=None): - """Decompress byte string to array of integers. - - This implementation only handles itemsizes 1, 8, 16, 32, and 64 bits. - Install the imagecodecs package for decoding other integer sizes. - - Parameters - ---------- - data : byte str - Data to decompress. - dtype : numpy.dtype or str - A numpy boolean or integer type. - numbits : int - Number of bits per integer. - runlen : int - Number of consecutive integers, after which to start at next byte. - - Examples - -------- - >>> packints_decode(b'a', 'B', 1) - array([0, 1, 1, 0, 0, 0, 0, 1], dtype=uint8) - - """ - if numbits == 1: # bitarray - data = numpy.frombuffer(data, '|B') - data = numpy.unpackbits(data) - if runlen % 8: - data = data.reshape(-1, runlen + (8 - runlen % 8)) - data = data[:, :runlen].reshape(-1) - return data.astype(dtype) - if numbits in (8, 16, 32, 64): - return numpy.frombuffer(data, dtype) - raise NotImplementedError( - 'unpacking %s-bit integers to %s not supported' - % (numbits, numpy.dtype(dtype))) - - -if imagecodecs is not None: - bitorder_decode = imagecodecs.bitorder_decode # noqa - packints_decode = imagecodecs.packints_decode # noqa - - -def apply_colormap(image, colormap, contig=True): - """Return palette-colored image. - - The image values are used to index the colormap on axis 1. The returned - image is of shape image.shape+colormap.shape[0] and dtype colormap.dtype. - - Parameters - ---------- - image : numpy.ndarray - Indexes into the colormap. - colormap : numpy.ndarray - RGB lookup table aka palette of shape (3, 2**bits_per_sample). - contig : bool - If True, return a contiguous array. 
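# A standalone sketch, not part of the original module, of the 1-bit path in
# packints_decode() above: numpy.unpackbits expands each byte into eight bits
# and runlen trims per-row padding when a row is not a multiple of 8 bits.
import numpy

data = numpy.frombuffer(b'a', dtype='|B')        # 0x61 == 0b01100001
bits = numpy.unpackbits(data)
print(bits)                                      # [0 1 1 0 0 0 0 1]

runlen = 6                                       # e.g. 6 valid bits per row
padded = bits.reshape(-1, runlen + (8 - runlen % 8))
print(padded[:, :runlen].reshape(-1))            # [0 1 1 0 0 0]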
- - Examples - -------- - >>> image = numpy.arange(256, dtype='uint8') - >>> colormap = numpy.vstack([image, image, image]).astype('uint16') * 256 - >>> apply_colormap(image, colormap)[-1] - array([65280, 65280, 65280], dtype=uint16) - - """ - image = numpy.take(colormap, image, axis=1) - image = numpy.rollaxis(image, 0, image.ndim) - if contig: - image = numpy.ascontiguousarray(image) - return image - - -def parse_filenames(files, pattern): - """Return shape and axes from sequence of file names matching pattern. - - >>> parse_filenames(['c1001.ext', 'c2002.ext'], - ... r'([^\\d])(\\d)(?P\\d+)\\.ext') - ('ct', (2, 2), [(1, 1), (2, 2)], (1, 1)) - - """ - if not pattern: - raise ValueError('invalid pattern') - pattern = re.compile(pattern, re.IGNORECASE | re.VERBOSE) - - def parse(fname, pattern=pattern): - """Return axes and indices from file name.""" - fname = os.path.split(fname)[-1] - axes = [] - indices = [] - groupindex = {v: k for k, v in pattern.groupindex.items()} - match = pattern.search(fname) - if not match: - raise ValueError('pattern does not match file name') - ax = None - for i, m in enumerate(match.groups()): - if m is None: - continue - if m[0].isalpha(): - if ax is not None: - raise ValueError('invalid pattern') - ax = m - elif m[0].isdigit(): - if i + 1 in groupindex: - ax = groupindex[i + 1] - else: - ax = 'Q' if ax is None else ax - axes.append(ax[0]) - indices.append(int(m)) - ax = None - return ''.join(axes), tuple(indices) - - axes = None - indices = [] - for fname in files: - ax, idx = parse(fname) - if axes is None: - axes = ax - elif axes != ax: - raise ValueError('axes do not match within image sequence') - indices.append(idx) - shape = tuple(numpy.max(indices, axis=0)) - startindex = tuple(numpy.min(indices, axis=0)) - shape = tuple(i - j + 1 for i, j in zip(shape, startindex)) - # if product(shape) != len(files): - # raise VaueError('files are missing') - return axes, shape, indices, startindex - - -def reorient(image, orientation): - """Return reoriented view of image array. - - Parameters - ---------- - image : numpy.ndarray - Non-squeezed output of asarray() functions. - Axes -3 and -2 must be image length and width respectively. - orientation : int or str - One of TIFF.ORIENTATION names or values. - - """ - orient = TIFF.ORIENTATION - orientation = enumarg(orient, orientation) - - if orientation == orient.TOPLEFT: - return image - if orientation == orient.TOPRIGHT: - return image[..., ::-1, :] - if orientation == orient.BOTLEFT: - return image[..., ::-1, :, :] - if orientation == orient.BOTRIGHT: - return image[..., ::-1, ::-1, :] - if orientation == orient.LEFTTOP: - return numpy.swapaxes(image, -3, -2) - if orientation == orient.RIGHTTOP: - return numpy.swapaxes(image, -3, -2)[..., ::-1, :] - if orientation == orient.RIGHTBOT: - return numpy.swapaxes(image, -3, -2)[..., ::-1, :, :] - if orientation == orient.LEFTBOT: - return numpy.swapaxes(image, -3, -2)[..., ::-1, ::-1, :] - return image - - -def repeat_nd(a, repeats): - """Return read-only view into input array with elements repeated. - - Zoom nD image by integer factors using nearest neighbor interpolation - (box filter). - - Parameters - ---------- - a : array_like - Input array. - repeats : sequence of int - The number of repetitions to apply along each dimension of input array. 
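# A standalone sketch, not part of the original module, of the axis flips used
# by reorient() above on a small (length, width, samples) array; axes -3 and -2
# are image length and width.
import numpy

image = numpy.arange(6).reshape(2, 3, 1)    # 2 rows, 3 columns, 1 sample
topright = image[..., ::-1, :]              # mirror left-right (ORIENTATION.TOPRIGHT)
botleft = image[..., ::-1, :, :]            # mirror top-bottom (ORIENTATION.BOTLEFT)
lefttop = numpy.swapaxes(image, -3, -2)     # swap rows and columns (ORIENTATION.LEFTTOP)
print(topright[..., 0])                     # [[2 1 0] [5 4 3]]
print(botleft[..., 0])                      # [[3 4 5] [0 1 2]]
print(lefttop.shape)                        # (3, 2, 1)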
- - Examples - -------- - >>> repeat_nd([[1, 2], [3, 4]], (2, 2)) - array([[1, 1, 2, 2], - [1, 1, 2, 2], - [3, 3, 4, 4], - [3, 3, 4, 4]]) - - """ - a = numpy.asarray(a) - reshape = [] - shape = [] - strides = [] - for i, j, k in zip(a.strides, a.shape, repeats): - shape.extend((j, k)) - strides.extend((i, 0)) - reshape.append(j * k) - return numpy.lib.stride_tricks.as_strided( - a, shape, strides, writeable=False).reshape(reshape) - - -def reshape_nd(data_or_shape, ndim): - """Return image array or shape with at least ndim dimensions. - - Prepend 1s to image shape as necessary. - - >>> reshape_nd(numpy.empty(0), 1).shape - (0,) - >>> reshape_nd(numpy.empty(1), 2).shape - (1, 1) - >>> reshape_nd(numpy.empty((2, 3)), 3).shape - (1, 2, 3) - >>> reshape_nd(numpy.empty((3, 4, 5)), 3).shape - (3, 4, 5) - >>> reshape_nd((2, 3), 3) - (1, 2, 3) - - """ - is_shape = isinstance(data_or_shape, tuple) - shape = data_or_shape if is_shape else data_or_shape.shape - if len(shape) >= ndim: - return data_or_shape - shape = (1,) * (ndim - len(shape)) + shape - return shape if is_shape else data_or_shape.reshape(shape) - - -def squeeze_axes(shape, axes, skip=None): - """Return shape and axes with single-dimensional entries removed. - - Remove unused dimensions unless their axes are listed in 'skip'. - - >>> squeeze_axes((5, 1, 2, 1, 1), 'TZYXC') - ((5, 2, 1), 'TYX') - - """ - if len(shape) != len(axes): - raise ValueError('dimensions of axes and shape do not match') - if skip is None: - skip = 'XY' - shape, axes = zip(*(i for i in zip(shape, axes) - if i[0] > 1 or i[1] in skip)) - return tuple(shape), ''.join(axes) - - -def transpose_axes(image, axes, asaxes=None): - """Return image with its axes permuted to match specified axes. - - A view is returned if possible. - - >>> transpose_axes(numpy.zeros((2, 3, 4, 5)), 'TYXC', asaxes='CTZYX').shape - (5, 2, 1, 3, 4) - - """ - for ax in axes: - if ax not in asaxes: - raise ValueError('unknown axis %s' % ax) - # add missing axes to image - if asaxes is None: - asaxes = 'CTZYX' - shape = image.shape - for ax in reversed(asaxes): - if ax not in axes: - axes = ax + axes - shape = (1,) + shape - image = image.reshape(shape) - # transpose axes - image = image.transpose([axes.index(ax) for ax in asaxes]) - return image - - -def reshape_axes(axes, shape, newshape, unknown=None): - """Return axes matching new shape. - - By default, unknown dimensions are labelled 'Q'. - - >>> reshape_axes('YXS', (219, 301, 1), (219, 301)) - 'YX' - >>> reshape_axes('IYX', (12, 219, 301), (3, 4, 219, 1, 301, 1)) - 'QQYQXQ' - - """ - shape = tuple(shape) - newshape = tuple(newshape) - if len(axes) != len(shape): - raise ValueError('axes do not match shape') - - size = product(shape) - newsize = product(newshape) - if size != newsize: - raise ValueError('cannot reshape %s to %s' % (shape, newshape)) - if not axes or not newshape: - return '' - - lendiff = max(0, len(shape) - len(newshape)) - if lendiff: - newshape = newshape + (1,) * lendiff - - i = len(shape) - 1 - prodns = 1 - prods = 1 - result = [] - for ns in newshape[:: -1]: - prodns *= ns - while i > 0 and shape[i] == 1 and ns != 1: - i -= 1 - if ns == shape[i] and prodns == prods * shape[i]: - prods *= shape[i] - result.append(axes[i]) - i -= 1 - elif unknown: - result.append(unknown) - else: - unknown = 'Q' - result.append(unknown) - - return ''.join(reversed(result[lendiff:])) - - -def stack_pages(pages, out=None, maxworkers=None, **kwargs): - """Read data from sequence of TiffPage and stack them vertically. 
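# Usage sketch for the axes helpers defined above, assuming they are importable
# from the surrounding module (import path is an assumption).
import numpy
from tifffile import squeeze_axes, transpose_axes, reshape_axes  # assumed

print(squeeze_axes((5, 1, 2, 1, 1), 'TZYXC'))
# ((5, 2, 1), 'TYX')                        singleton axes dropped, X and Y kept
data = numpy.zeros((2, 3, 4, 5))            # axes 'TYXC'
print(transpose_axes(data, 'TYXC', asaxes='CTZYX').shape)
# (5, 2, 1, 3, 4)                           missing Z axis inserted as length 1
print(reshape_axes('IYX', (12, 219, 301), (3, 4, 219, 1, 301, 1)))
# 'QQYQXQ'                                  unknown split dimensions labelled 'Q'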
- - Additional parameters are passsed to the TiffPage.asarray function. - - """ - npages = len(pages) - if npages == 0: - raise ValueError('no pages') - - if npages == 1: - kwargs['maxworkers'] = maxworkers - return pages[0].asarray(out=out, **kwargs) - - page0 = next(p for p in pages if p is not None).keyframe - shape = (npages,) + page0.shape - dtype = page0.dtype - out = create_output(out, shape, dtype) - - if maxworkers is None or maxworkers < 1: - import multiprocessing - maxworkers = max(multiprocessing.cpu_count() // 2, 1) - - if maxworkers == 1: - kwargs['maxworkers'] = 1 - elif npages < 3: - kwargs['maxworkers'] = maxworkers - maxworkers = 1 - elif page0.compression > 1 and len(page0.dataoffsets) > 2: - kwargs['maxworkers'] = min(maxworkers, len(page0.dataoffsets)) - maxworkers = max(maxworkers - kwargs['maxworkers'], 1) - else: - kwargs['maxworkers'] = 1 - - page0.parent.filehandle.lock = maxworkers > 1 - - filecache = OpenFileCache(size=max(4, maxworkers), - lock=page0.parent.filehandle.lock) - - def func(page, index, out=out, filecache=filecache, validate=0, - kwargs=kwargs): - """Read, decode, and copy page data.""" - if page is not None: - filecache.open(page.parent.filehandle) - out[index] = page.asarray(lock=filecache.lock, reopen=False, - validate=False, **kwargs) - filecache.close(page.parent.filehandle) - - if maxworkers < 2: - for i, page in enumerate(pages): - func(page, i) - else: - # TODO: add exception handling - # read first page un-threaded to catch exceptions - func(page0, 0, validate=True) - with ThreadPoolExecutor(maxworkers) as executor: - executor.map(func, pages[1:], range(1, npages)) - - filecache.clear() - page0.parent.filehandle.lock = None - return out - - -def create_output(out, shape, dtype, mode='w+', suffix=None): - """Return numpy array where image data of shape and dtype can be copied. - - The 'out' parameter may have the following values or types: - - None - An empty array of shape and dtype is created and returned. - numpy.ndarray - An existing writable array of compatible dtype and shape. A view of - the same array is returned after verification. - 'memmap' or 'memmap:tempdir' - A memory-map to an array stored in a temporary binary file on disk - is created and returned. - str or open file - The file name or file object used to create a memory-map to an array - stored in a binary file on disk. The created memory-mapped array is - returned. - - """ - if out is None: - return numpy.zeros(shape, dtype) - if isinstance(out, str) and out[:6] == 'memmap': - import tempfile - tempdir = out[7:] if len(out) > 7 else None - if suffix is None: - suffix = '.memmap' - with tempfile.NamedTemporaryFile(dir=tempdir, suffix=suffix) as fh: - return numpy.memmap(fh, shape=shape, dtype=dtype, mode=mode) - if isinstance(out, numpy.ndarray): - if product(shape) != product(out.shape): - raise ValueError('incompatible output shape') - if not numpy.can_cast(dtype, out.dtype): - raise ValueError('incompatible output dtype') - return out.reshape(shape) - if isinstance(out, pathlib.Path): - out = str(out) - return numpy.memmap(out, shape=shape, dtype=dtype, mode=mode) - - -def matlabstr2py(string): - """Return Python object from Matlab string representation. - - Return str, bool, int, float, list (Matlab arrays or cells), or - dict (Matlab structures) types. - - Use to access ScanImage metadata. 
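# Usage sketch for create_output() above (assumed importable from the
# surrounding module), showing the three kinds of 'out' arguments it accepts.
import numpy
from tifffile import create_output   # assumed import path

a = create_output(None, (4, 5), 'uint16')        # new zero-filled array
b = create_output('memmap', (4, 5), 'uint16')    # memory-map in a temporary file
existing = numpy.empty(20, 'uint16')
c = create_output(existing, (4, 5), 'uint16')    # reshaped view of an existing array
print(type(a).__name__, type(b).__name__, c.shape)   # ndarray memmap (4, 5)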
- - >>> matlabstr2py('1') - 1 - >>> matlabstr2py("['x y z' true false; 1 2.0 -3e4; NaN Inf @class]") - [['x y z', True, False], [1, 2.0, -30000.0], [nan, inf, '@class']] - >>> d = matlabstr2py("SI.hChannels.channelType = {'stripe' 'stripe'}\\n" - ... "SI.hChannels.channelsActive = 2") - >>> d['SI.hChannels.channelType'] - ['stripe', 'stripe'] - - """ - # TODO: handle invalid input - # TODO: review unboxing of multidimensional arrays - - def lex(s): - # return sequence of tokens from matlab string representation - tokens = ['['] - while True: - t, i = next_token(s) - if t is None: - break - if t == ';': - tokens.extend((']', '[')) - elif t == '[': - tokens.extend(('[', '[')) - elif t == ']': - tokens.extend((']', ']')) - else: - tokens.append(t) - s = s[i:] - tokens.append(']') - return tokens - - def next_token(s): - # return next token in matlab string - length = len(s) - if length == 0: - return None, 0 - i = 0 - while i < length and s[i] == ' ': - i += 1 - if i == length: - return None, i - if s[i] in '{[;]}': - return s[i], i + 1 - if s[i] == "'": - j = i + 1 - while j < length and s[j] != "'": - j += 1 - return s[i: j + 1], j + 1 - if s[i] == '<': - j = i + 1 - while j < length and s[j] != '>': - j += 1 - return s[i: j + 1], j + 1 - j = i - while j < length and not s[j] in ' {[;]}': - j += 1 - return s[i:j], j - - def value(s, fail=False): - # return Python value of token - s = s.strip() - if not s: - return s - if len(s) == 1: - try: - return int(s) - except Exception: - if fail: - raise ValueError() - return s - if s[0] == "'": - if fail and s[-1] != "'" or "'" in s[1:-1]: - raise ValueError() - return s[1:-1] - if s[0] == '<': - if fail and s[-1] != '>' or '<' in s[1:-1]: - raise ValueError() - return s - if fail and any(i in s for i in " ';[]{}"): - raise ValueError() - if s[0] == '@': - return s - if s in ('true', 'True'): - return True - if s in ('false', 'False'): - return False - if s[:6] == 'zeros(': - return numpy.zeros([int(i) for i in s[6:-1].split(',')]).tolist() - if s[:5] == 'ones(': - return numpy.ones([int(i) for i in s[5:-1].split(',')]).tolist() - if '.' in s or 'e' in s: - try: - return float(s) - except Exception: - pass - try: - return int(s) - except Exception: - pass - try: - return float(s) # nan, inf - except Exception: - if fail: - raise ValueError() - return s - - def parse(s): - # return Python value from string representation of Matlab value - s = s.strip() - try: - return value(s, fail=True) - except ValueError: - pass - result = add2 = [] - levels = [add2] - for t in lex(s): - if t in '[{': - add2 = [] - levels.append(add2) - elif t in ']}': - x = levels.pop() - if len(x) == 1 and isinstance(x[0], (list, str)): - x = x[0] - add2 = levels[-1] - add2.append(x) - else: - add2.append(value(t)) - if len(result) == 1 and isinstance(result[0], (list, str)): - result = result[0] - return result - - if '\r' in string or '\n' in string: - # structure - d = {} - for line in string.splitlines(): - line = line.strip() - if not line or line[0] == '%': - continue - k, v = line.split('=', 1) - k = k.strip() - if any(c in k for c in " ';[]{}<>"): - continue - d[k] = parse(v) - return d - return parse(string) - - -def stripnull(string, null=b'\x00'): - """Return string truncated at first null character. - - Clean NULL terminated C strings. For unicode strings use null='\\0'. 
- - >>> stripnull(b'string\\x00') - b'string' - >>> stripnull('string\\x00', null='\\0') - 'string' - - """ - i = string.find(null) - return string if (i < 0) else string[:i] - - -def stripascii(string): - """Return string truncated at last byte that is 7-bit ASCII. - - Clean NULL separated and terminated TIFF strings. - - >>> stripascii(b'string\\x00string\\n\\x01\\x00') - b'string\\x00string\\n' - >>> stripascii(b'\\x00') - b'' - - """ - # TODO: pythonize this - i = len(string) - while i: - i -= 1 - if 8 < byte2int(string[i]) < 127: - break - else: - i = -1 - return string[: i + 1] - - -def asbool(value, true=(b'true', u'true'), false=(b'false', u'false')): - """Return string as bool if possible, else raise TypeError. - - >>> asbool(b' False ') - False - - """ - value = value.strip().lower() - if value in true: # might raise UnicodeWarning/BytesWarning - return True - if value in false: - return False - raise TypeError() - - -def astype(value, types=None): - """Return argument as one of types if possible. - - >>> astype('42') - 42 - >>> astype('3.14') - 3.14 - >>> astype('True') - True - >>> astype(b'Neee-Wom') - 'Neee-Wom' - - """ - if types is None: - types = int, float, asbool, bytes2str - for typ in types: - try: - return typ(value) - except (ValueError, AttributeError, TypeError, UnicodeEncodeError): - pass - return value - - -def format_size(size, threshold=1536): - """Return file size as string from byte size. - - >>> format_size(1234) - '1234 B' - >>> format_size(12345678901) - '11.50 GiB' - - """ - if size < threshold: - return "%i B" % size - for unit in ('KiB', 'MiB', 'GiB', 'TiB', 'PiB'): - size /= 1024.0 - if size < threshold: - return "%.2f %s" % (size, unit) - return 'ginormous' - - -def identityfunc(arg, *args, **kwargs): - """Single argument identity function. - - >>> identityfunc('arg') - 'arg' - - """ - return arg - - -def nullfunc(*args, **kwargs): - """Null function. - - >>> nullfunc('arg', kwarg='kwarg') - - """ - return - - -def sequence(value): - """Return tuple containing value if value is not a tuple or list. - - >>> sequence(1) - (1,) - >>> sequence([1]) - [1] - >>> sequence('ab') - ('ab',) - - """ - return value if isinstance(value, (tuple, list)) else (value,) - - -def product(iterable): - """Return product of sequence of numbers. - - Equivalent of functools.reduce(operator.mul, iterable, 1). - Multiplying numpy integers might overflow. - - >>> product([2**8, 2**30]) - 274877906944 - >>> product([]) - 1 - - """ - prod = 1 - for i in iterable: - prod *= i - return prod - - -def natural_sorted(iterable): - """Return human sorted list of strings. - - E.g. for sorting file names. - - >>> natural_sorted(['f1', 'f2', 'f10']) - ['f1', 'f2', 'f10'] - - """ - - def sortkey(x): - return [(int(c) if c.isdigit() else c) for c in re.split(numbers, x)] - - numbers = re.compile(r'(\d+)') - return sorted(iterable, key=sortkey) - - -def excel_datetime(timestamp, epoch=None): - """Return datetime object from timestamp in Excel serial format. - - Convert LSM time stamps. - - >>> excel_datetime(40237.029999999795) - datetime.datetime(2010, 2, 28, 0, 43, 11, 999982) - - """ - if epoch is None: - epoch = datetime.datetime.fromordinal(693594) - return epoch + datetime.timedelta(timestamp) - - -def julian_datetime(julianday, milisecond=0): - """Return datetime from days since 1/1/4713 BC and ms since midnight. - - Convert Julian dates according to MetaMorph. 
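# A standalone sketch, not part of the original module, of the natural-sort
# key used by natural_sorted() above: digit runs compare as integers,
# everything else as text.
import re

def natural_key(s, _numbers=re.compile(r'(\d+)')):
    return [int(part) if part.isdigit() else part
            for part in _numbers.split(s)]

names = ['page10.tif', 'page2.tif', 'page1.tif']
print(sorted(names, key=natural_key))
# ['page1.tif', 'page2.tif', 'page10.tif']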
- - >>> julian_datetime(2451576, 54362783) - datetime.datetime(2000, 2, 2, 15, 6, 2, 783) - - """ - if julianday <= 1721423: - # no datetime before year 1 - return None - - a = julianday + 1 - if a > 2299160: - alpha = math.trunc((a - 1867216.25) / 36524.25) - a += 1 + alpha - alpha // 4 - b = a + (1524 if a > 1721423 else 1158) - c = math.trunc((b - 122.1) / 365.25) - d = math.trunc(365.25 * c) - e = math.trunc((b - d) / 30.6001) - - day = b - d - math.trunc(30.6001 * e) - month = e - (1 if e < 13.5 else 13) - year = c - (4716 if month > 2.5 else 4715) - - hour, milisecond = divmod(milisecond, 1000 * 60 * 60) - minute, milisecond = divmod(milisecond, 1000 * 60) - second, milisecond = divmod(milisecond, 1000) - - return datetime.datetime(year, month, day, - hour, minute, second, milisecond) - - -def byteorder_isnative(byteorder): - """Return if byteorder matches the system's byteorder. - - >>> byteorder_isnative('=') - True - - """ - if byteorder in ('=', sys.byteorder): - return True - keys = {'big': '>', 'little': '<'} - return keys.get(byteorder, byteorder) == keys[sys.byteorder] - - -def recarray2dict(recarray): - """Return numpy.recarray as dict.""" - # TODO: subarrays - result = {} - for descr, value in zip(recarray.dtype.descr, recarray): - name, dtype = descr[:2] - if dtype[1] == 'S': - value = bytes2str(stripnull(value)) - elif value.ndim < 2: - value = value.tolist() - result[name] = value - return result - - -def xml2dict(xml, sanitize=True, prefix=None): - """Return XML as dict. - - >>> xml2dict('1') - {'root': {'key': 1, 'attr': 'name'}} - - """ - from xml.etree import cElementTree as etree # delayed import - - at = tx = '' - if prefix: - at, tx = prefix - - def astype(value): - # return value as int, float, bool, or str - for t in (int, float, asbool): - try: - return t(value) - except Exception: - pass - return value - - def etree2dict(t): - # adapted from https://stackoverflow.com/a/10077069/453463 - key = t.tag - if sanitize: - key = key.rsplit('}', 1)[-1] - d = {key: {} if t.attrib else None} - children = list(t) - if children: - dd = collections.defaultdict(list) - for dc in map(etree2dict, children): - for k, v in dc.items(): - dd[k].append(astype(v)) - d = {key: {k: astype(v[0]) if len(v) == 1 else astype(v) - for k, v in dd.items()}} - if t.attrib: - d[key].update((at + k, astype(v)) for k, v in t.attrib.items()) - if t.text: - text = t.text.strip() - if children or t.attrib: - if text: - d[key][tx + 'value'] = astype(text) - else: - d[key] = astype(text) - return d - - return etree2dict(etree.fromstring(xml)) - - -def hexdump(bytestr, width=75, height=24, snipat=-2, modulo=2, ellipsis=None): - """Return hexdump representation of byte string. - - >>> hexdump(binascii.unhexlify('49492a00080000000e00fe0004000100')) - '49 49 2a 00 08 00 00 00 0e 00 fe 00 04 00 01 00 II*.............' 
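# A simplified standalone sketch, not part of the original module, of the
# recursive element-to-dict conversion performed by xml2dict() above; it skips
# the attribute prefixing and the float/bool coercion of the original.
from xml.etree import ElementTree as etree

def element2dict(elem):
    d = {child.tag: element2dict(child) for child in elem}
    d.update(elem.attrib)                 # attributes merge into the dict
    if not d:
        text = (elem.text or '').strip()
        try:
            return int(text)              # coerce simple scalar text
        except ValueError:
            return text
    return d

root = etree.fromstring('<root attr="name"><key>1</key></root>')
print({root.tag: element2dict(root)})     # {'root': {'key': 1, 'attr': 'name'}}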
- - """ - size = len(bytestr) - if size < 1 or width < 2 or height < 1: - return '' - if height == 1: - addr = b'' - bytesperline = min(modulo * (((width - len(addr)) // 4) // modulo), - size) - if bytesperline < 1: - return '' - nlines = 1 - else: - addr = b'%%0%ix: ' % len(b'%x' % size) - bytesperline = min(modulo * (((width - len(addr % 1)) // 4) // modulo), - size) - if bytesperline < 1: - return '' - width = 3 * bytesperline + len(addr % 1) - nlines = (size - 1) // bytesperline + 1 - - if snipat is None or snipat == 1: - snipat = height - elif 0 < abs(snipat) < 1: - snipat = int(math.floor(height * snipat)) - if snipat < 0: - snipat += height - - if height == 1 or nlines == 1: - blocks = [(0, bytestr[:bytesperline])] - addr = b'' - height = 1 - width = 3 * bytesperline - elif height is None or nlines <= height: - blocks = [(0, bytestr)] - elif snipat <= 0: - start = bytesperline * (nlines - height) - blocks = [(start, bytestr[start:])] # (start, None) - elif snipat >= height or height < 3: - end = bytesperline * height - blocks = [(0, bytestr[:end])] # (end, None) - else: - end1 = bytesperline * snipat - end2 = bytesperline * (height - snipat - 1) - blocks = [ - (0, bytestr[:end1]), - (size - end1 - end2, None), - (size - end2, bytestr[size - end2:]), - ] - - ellipsis = b'...' if ellipsis is None else str2bytes(ellipsis) - result = [] - for start, bytestr in blocks: - if bytestr is None: - result.append(ellipsis) # 'skip %i bytes' % start) - continue - hexstr = binascii.hexlify(bytestr) - strstr = re.sub(br'[^\x20-\x7f]', b'.', bytestr) - for i in range(0, len(bytestr), bytesperline): - h = hexstr[2 * i: 2 * i + bytesperline * 2] - r = (addr % (i + start)) if height > 1 else addr - r += b' '.join(h[i: i + 2] for i in range(0, 2 * bytesperline, 2)) - r += b' ' * (width - len(r)) - r += strstr[i: i + bytesperline] - result.append(r) - result = b'\n'.join(result) - if sys.version_info[0] > 2: - result = result.decode('ascii') - return result - - -def isprintable(string): - """Return if all characters in string are printable. 
- - >>> isprintable('abc') - True - >>> isprintable(b'\01') - False - - """ - string = string.strip() - if not string: - return True - if sys.version_info[0] > 2: - try: - return string.isprintable() - except Exception: - pass - try: - return string.decode('utf-8').isprintable() - except Exception: - pass - else: - if string.isalnum(): - return True - printable = ('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRST' - 'UVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c') - return all(c in printable for c in string) - - -def clean_whitespace(string, compact=False): - """Return string with compressed whitespace.""" - for a, b in ( - ('\r\n', '\n'), - ('\r', '\n'), - ('\n\n', '\n'), - ('\t', ' '), - (' ', ' ') - ): - string = string.replace(a, b) - if compact: - for a, b in ( - ('\n', ' '), - ('[ ', '['), - (' ', ' '), - (' ', ' '), - (' ', ' ') - ): - string = string.replace(a, b) - return string.strip() - - -def pformat_xml(xml): - """Return pretty formatted XML.""" - try: - from lxml import etree # delayed import - - if not isinstance(xml, bytes): - xml = xml.encode('utf-8') - xml = etree.parse(io.BytesIO(xml)) - xml = etree.tostring(xml, pretty_print=True, xml_declaration=True, - encoding=xml.docinfo.encoding) - xml = bytes2str(xml) - except Exception: - if isinstance(xml, bytes): - xml = bytes2str(xml) - xml = xml.replace('><', '>\n<') - return xml.replace(' ', ' ').replace('\t', ' ') - - -def pformat(arg, width=79, height=24, compact=True): - """Return pretty formatted representation of object as string. - - Whitespace might be altered. - - """ - if height is None or height < 1: - height = 1024 - if width is None or width < 1: - width = 256 - - npopt = numpy.get_printoptions() - numpy.set_printoptions(threshold=100, linewidth=width) - - if isinstance(arg, basestring): - if arg[:5].lower() in (' height: - arg = '\n'.join(argl[:height // 2] + ['...'] + argl[-height // 2:]) - return arg - - -def snipstr(string, width=79, snipat=None, ellipsis='...'): - """Return string cut to specified length. - - >>> snipstr('abcdefghijklmnop', 8) - 'abc...op' - - """ - if snipat is None: - snipat = 0.5 - if ellipsis is None: - if isinstance(string, bytes): - ellipsis = b'...' - else: - ellipsis = u'\u2026' # does not print on win-py3.5 - esize = len(ellipsis) - - splitlines = string.splitlines() - # TODO: finish and test multiline snip - - result = [] - for line in splitlines: - if line is None: - result.append(ellipsis) - continue - linelen = len(line) - if linelen <= width: - result.append(string) - continue - - split = snipat - if split is None or split == 1: - split = linelen - elif 0 < abs(split) < 1: - split = int(math.floor(linelen * split)) - if split < 0: - split += linelen - if split < 0: - split = 0 - - if esize == 0 or width < esize + 1: - if split <= 0: - result.append(string[-width:]) - else: - result.append(string[:width]) - elif split <= 0: - result.append(ellipsis + string[esize - width:]) - elif split >= linelen or width < esize + 4: - result.append(string[:width - esize] + ellipsis) - else: - splitlen = linelen - width + esize - end1 = split - splitlen // 2 - end2 = end1 + splitlen - result.append(string[:end1] + ellipsis + string[end2:]) - - if isinstance(string, bytes): - return b'\n'.join(result) - return '\n'.join(result) - - -def enumarg(enum, arg): - """Return enum member from its name or value. 
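# A standalone sketch, not part of the original module, of the middle-snipping
# behaviour illustrated by the snipstr() doctest above; snip_middle() is a
# hypothetical, simplified helper and not the original implementation.
def snip_middle(s, width=79, ellipsis='...'):
    if len(s) <= width:
        return s
    keep = width - len(ellipsis)
    head = (keep + 1) // 2
    return s[:head] + ellipsis + s[len(s) - (keep - head):]

print(snip_middle('abcdefghijklmnop', 8))   # abc...op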
- - >>> enumarg(TIFF.PHOTOMETRIC, 2) - - >>> enumarg(TIFF.PHOTOMETRIC, 'RGB') - - - """ - try: - return enum(arg) - except Exception: - try: - return enum[arg.upper()] - except Exception: - raise ValueError('invalid argument %s' % arg) - - -def parse_kwargs(kwargs, *keys, **keyvalues): - """Return dict with keys from keys|keyvals and values from kwargs|keyvals. - - Existing keys are deleted from kwargs. - - >>> kwargs = {'one': 1, 'two': 2, 'four': 4} - >>> kwargs2 = parse_kwargs(kwargs, 'two', 'three', four=None, five=5) - >>> kwargs == {'one': 1} - True - >>> kwargs2 == {'two': 2, 'four': 4, 'five': 5} - True - - """ - result = {} - for key in keys: - if key in kwargs: - result[key] = kwargs[key] - del kwargs[key] - for key, value in keyvalues.items(): - if key in kwargs: - result[key] = kwargs[key] - del kwargs[key] - else: - result[key] = value - return result - - -def update_kwargs(kwargs, **keyvalues): - """Update dict with keys and values if keys do not already exist. - - >>> kwargs = {'one': 1, } - >>> update_kwargs(kwargs, one=None, two=2) - >>> kwargs == {'one': 1, 'two': 2} - True - - """ - for key, value in keyvalues.items(): - if key not in kwargs: - kwargs[key] = value - - -def log_warning(msg, *args, **kwargs): - """Log message with level WARNING.""" - import logging - logging.getLogger(__name__).warning(msg, *args, **kwargs) - - -def validate_jhove(filename, jhove=None, ignore=None): - """Validate TIFF file using jhove -m TIFF-hul. - - Raise ValueError if jhove outputs an error message unless the message - contains one of the strings in 'ignore'. - - JHOVE does not support bigtiff or more than 50 IFDs. - - See `JHOVE TIFF-hul Module `_ - - """ - import subprocess - if ignore is None: - ignore = ['More than 50 IFDs'] - if jhove is None: - jhove = 'jhove' - out = subprocess.check_output([jhove, filename, '-m', 'TIFF-hul']) - if b'ErrorMessage: ' in out: - for line in out.splitlines(): - line = line.strip() - if line.startswith(b'ErrorMessage: '): - error = line[14:].decode('utf-8') - for i in ignore: - if i in error: - break - else: - raise ValueError(error) - break - - -def lsm2bin(lsmfile, binfile=None, tile=None, verbose=True): - """Convert [MP]TZCYX LSM file to series of BIN files. - - One BIN file containing 'ZCYX' data are created for each position, time, - and tile. The position, time, and tile indices are encoded at the end - of the filenames. - - """ - verbose = print_ if verbose else nullfunc - - if tile is None: - tile = (256, 256) - - if binfile is None: - binfile = lsmfile - elif binfile.lower() == 'none': - binfile = None - if binfile: - binfile += '_(z%ic%iy%ix%i)_m%%ip%%it%%03iy%%ix%%i.bin' - - verbose('\nOpening LSM file... 
', end='', flush=True) - timer = Timer() - - with TiffFile(lsmfile) as lsm: - if not lsm.is_lsm: - verbose('\n', lsm, flush=True) - raise ValueError('not a LSM file') - series = lsm.series[0] # first series contains the image data - shape = series.shape - axes = series.axes - dtype = series.dtype - size = product(shape) * dtype.itemsize - - verbose(timer) - # verbose(lsm, flush=True) - verbose('Image\n axes: %s\n shape: %s\n dtype: %s\n size: %s' - % (axes, shape, dtype, format_size(size)), flush=True) - if not series.axes.endswith('TZCYX'): - raise ValueError('not a *TZCYX LSM file') - - verbose('Copying image from LSM to BIN files', end='', flush=True) - timer.start() - tiles = shape[-2] // tile[-2], shape[-1] // tile[-1] - if binfile: - binfile = binfile % (shape[-4], shape[-3], tile[0], tile[1]) - shape = (1,) * (7 - len(shape)) + shape - # cache for ZCYX stacks and output files - data = numpy.empty(shape[3:], dtype=dtype) - out = numpy.empty((shape[-4], shape[-3], tile[0], tile[1]), - dtype=dtype) - # iterate over Tiff pages containing data - pages = iter(series.pages) - for m in range(shape[0]): # mosaic axis - for p in range(shape[1]): # position axis - for t in range(shape[2]): # time axis - for z in range(shape[3]): # z slices - data[z] = next(pages).asarray() - for y in range(tiles[0]): # tile y - for x in range(tiles[1]): # tile x - out[:] = data[ - ..., - y * tile[0]: (y + 1) * tile[0], - x * tile[1]: (x + 1) * tile[1] - ] - if binfile: - out.tofile(binfile % (m, p, t, y, x)) - verbose('.', end='', flush=True) - verbose(timer, flush=True) - - -def imshow(data, photometric=None, planarconfig=None, bitspersample=None, - interpolation=None, cmap=None, vmin=None, vmax=None, - figure=None, title=None, dpi=96, subplot=None, maxdim=None, - **kwargs): - """Plot n-dimensional images using matplotlib.pyplot. - - Return figure, subplot and plot axis. - Requires pyplot already imported C{from matplotlib import pyplot}. - - Parameters - ---------- - data : nd array - The image data. - photometric : {'MINISWHITE', 'MINISBLACK', 'RGB', or 'PALETTE'} - The color space of the image data. - planarconfig : {'CONTIG' or 'SEPARATE'} - Defines how components of each pixel are stored. - bitspersample : int - Number of bits per channel in integer RGB images. - interpolation : str - The image interpolation method used in matplotlib.imshow. By default, - 'nearest' will be used for image dimensions <= 512, else 'bilinear'. - cmap : str or matplotlib.colors.Colormap - The colormap maps non-RGBA scalar data to colors. - vmin, vmax : scalar - Data range covered by the colormap. By default, the complete - range of the data is covered. - figure : matplotlib.figure.Figure - Matplotlib figure to use for plotting. - title : str - Window and subplot title. - subplot : int - A matplotlib.pyplot.subplot axis. - maxdim : int - Maximum image width and length. - kwargs : dict - Additional arguments for matplotlib.pyplot.imshow. 
- - """ - # TODO: rewrite detection of isrgb, iscontig - # TODO: use planarconfig - if photometric is None: - photometric = 'RGB' - if maxdim is None: - maxdim = 2**16 - isrgb = photometric in ('RGB', 'YCBCR') # 'PALETTE', 'YCBCR' - - if data.dtype == 'float16': - data = data.astype('float32') - - if data.dtype.kind == 'b': - isrgb = False - - if isrgb and not ( - data.shape[-1] in (3, 4) - or (data.ndim > 2 and data.shape[-3] in (3, 4)) - ): - isrgb = False - photometric = 'MINISBLACK' - - data = data.squeeze() - if photometric in ('MINISWHITE', 'MINISBLACK', None): - data = reshape_nd(data, 2) - else: - data = reshape_nd(data, 3) - - dims = data.ndim - if dims < 2: - raise ValueError('not an image') - if dims == 2: - dims = 0 - isrgb = False - else: - if isrgb and data.shape[-3] in (3, 4): - data = numpy.swapaxes(data, -3, -2) - data = numpy.swapaxes(data, -2, -1) - elif not isrgb and ( - data.shape[-1] < data.shape[-2] // 8 - and data.shape[-1] < data.shape[-3] // 8 - ): - data = numpy.swapaxes(data, -3, -1) - data = numpy.swapaxes(data, -2, -1) - isrgb = isrgb and data.shape[-1] in (3, 4) - dims -= 3 if isrgb else 2 - - if interpolation is None: - threshold = 512 - elif isinstance(interpolation, int): - threshold = interpolation - else: - threshold = 0 - - if isrgb: - data = data[..., :maxdim, :maxdim, :maxdim] - if threshold: - if data.shape[-2] > threshold or data.shape[-3] > threshold: - interpolation = 'bilinear' - else: - interpolation = 'nearest' - else: - data = data[..., :maxdim, :maxdim] - if threshold: - if data.shape[-1] > threshold or data.shape[-2] > threshold: - interpolation = 'bilinear' - else: - interpolation = 'nearest' - - if photometric == 'PALETTE' and isrgb: - datamax = data.max() - if datamax > 255: - data = data >> 8 # possible precision loss - data = data.astype('B') - elif data.dtype.kind in 'ui': - if not (isrgb and data.dtype.itemsize <= 1) or bitspersample is None: - try: - bitspersample = int(math.ceil(math.log(data.max(), 2))) - except Exception: - bitspersample = data.dtype.itemsize * 8 - elif not isinstance(bitspersample, inttypes): - # bitspersample can be tuple, e.g. 
(5, 6, 5) - bitspersample = data.dtype.itemsize * 8 - datamax = 2**bitspersample - if isrgb: - if bitspersample < 8: - data = data << (8 - bitspersample) - elif bitspersample > 8: - data = data >> (bitspersample - 8) # precision loss - data = data.astype('B') - elif data.dtype.kind == 'f': - datamax = data.max() - if isrgb and datamax > 1.0: - if data.dtype.char == 'd': - data = data.astype('f') - data /= datamax - else: - data = data / datamax - elif data.dtype.kind == 'b': - datamax = 1 - elif data.dtype.kind == 'c': - data = numpy.absolute(data) - datamax = data.max() - - if isrgb: - vmin = 0 - else: - if vmax is None: - vmax = datamax - if vmin is None: - if data.dtype.kind == 'i': - dtmin = numpy.iinfo(data.dtype).min - vmin = numpy.min(data) - if vmin == dtmin: - vmin = numpy.min(data[data > dtmin]) - elif data.dtype.kind == 'f': - dtmin = numpy.finfo(data.dtype).min - vmin = numpy.min(data) - if vmin == dtmin: - vmin = numpy.min(data[data > dtmin]) - else: - vmin = 0 - - pyplot = sys.modules['matplotlib.pyplot'] - - if figure is None: - pyplot.rc('font', family='sans-serif', weight='normal', size=8) - figure = pyplot.figure(dpi=dpi, figsize=(10.3, 6.3), frameon=True, - facecolor='1.0', edgecolor='w') - try: - figure.canvas.manager.window.title(title) - except Exception: - pass - size = len(title.splitlines()) if title else 1 - pyplot.subplots_adjust( - bottom=0.03 * (dims + 2), - top=0.98 - size * 0.03, - left=0.1, - right=0.95, - hspace=0.05, - wspace=0.0) - if subplot is None: - subplot = 111 - subplot = pyplot.subplot(subplot) - subplot.set_facecolor((0, 0, 0)) - - if title: - try: - title = unicode(title, 'Windows-1252') - except TypeError: - pass - pyplot.title(title, size=11) - - if cmap is None: - if data.dtype.char == '?': - cmap = 'gray' - elif data.dtype.kind in 'buf' or vmin == 0: - cmap = 'viridis' - else: - cmap = 'coolwarm' - if photometric == 'MINISWHITE': - cmap += '_r' - - image = pyplot.imshow(numpy.atleast_2d(data[(0,) * dims].squeeze()), - vmin=vmin, vmax=vmax, cmap=cmap, - interpolation=interpolation, **kwargs) - - if not isrgb: - pyplot.colorbar() # panchor=(0.55, 0.5), fraction=0.05 - - def format_coord(x, y): - # callback function to format coordinate display in toolbar - x = int(x + 0.5) - y = int(y + 0.5) - try: - if dims: - return '%s @ %s [%4i, %4i]' % ( - curaxdat[1][y, x], current, y, x) - return '%s @ [%4i, %4i]' % (data[y, x], y, x) - except IndexError: - return '' - - def none(event): - return '' - - subplot.format_coord = format_coord - image.get_cursor_data = none - image.format_cursor_data = none - - if dims: - current = list((0,) * dims) - curaxdat = [0, data[tuple(current)].squeeze()] - sliders = [pyplot.Slider( - pyplot.axes([0.125, 0.03 * (axis + 1), 0.725, 0.025]), - 'Dimension %i' % axis, 0, data.shape[axis] - 1, 0, facecolor='0.5', - valfmt='%%.0f [%i]' % data.shape[axis]) for axis in range(dims)] - for slider in sliders: - slider.drawon = False - - def set_image(current, sliders=sliders, data=data): - # change image and redraw canvas - curaxdat[1] = data[tuple(current)].squeeze() - image.set_data(curaxdat[1]) - for ctrl, index in zip(sliders, current): - ctrl.eventson = False - ctrl.set_val(index) - ctrl.eventson = True - figure.canvas.draw() - - def on_changed(index, axis, data=data, current=current): - # callback function for slider change event - index = int(round(index)) - curaxdat[0] = axis - if index == current[axis]: - return - if index >= data.shape[axis]: - index = 0 - elif index < 0: - index = data.shape[axis] - 1 - current[axis] 
= index - set_image(current) - - def on_keypressed(event, data=data, current=current): - # callback function for key press event - key = event.key - axis = curaxdat[0] - if str(key) in '0123456789': - on_changed(key, axis) - elif key == 'right': - on_changed(current[axis] + 1, axis) - elif key == 'left': - on_changed(current[axis] - 1, axis) - elif key == 'up': - curaxdat[0] = 0 if axis == len(data.shape) - 1 else axis + 1 - elif key == 'down': - curaxdat[0] = len(data.shape) - 1 if axis == 0 else axis - 1 - elif key == 'end': - on_changed(data.shape[axis] - 1, axis) - elif key == 'home': - on_changed(0, axis) - - figure.canvas.mpl_connect('key_press_event', on_keypressed) - for axis, ctrl in enumerate(sliders): - ctrl.on_changed(lambda k, a=axis: on_changed(k, a)) - - return figure, subplot, image - - -def _app_show(): - """Block the GUI. For use as skimage plugin.""" - pyplot = sys.modules['matplotlib.pyplot'] - pyplot.show() - - -def askopenfilename(**kwargs): - """Return file name(s) from Tkinter's file open dialog.""" - try: - from Tkinter import Tk - import tkFileDialog as filedialog - except ImportError: - from tkinter import Tk, filedialog - root = Tk() - root.withdraw() - root.update() - filenames = filedialog.askopenfilename(**kwargs) - root.destroy() - return filenames - - -def main(): - """Tifffile command line usage main function.""" - import optparse # TODO: use argparse - import logging - - logging.getLogger(__name__).setLevel(logging.INFO) - - parser = optparse.OptionParser( - usage='usage: %prog [options] path', - description='Display image data in TIFF files.', - version='%%prog %s' % __version__, prog='tifffile') - opt = parser.add_option - opt('-p', '--page', dest='page', type='int', default=-1, - help='display single page') - opt('-s', '--series', dest='series', type='int', default=-1, - help='display series of pages of same shape') - opt('--nomultifile', dest='nomultifile', action='store_true', - default=False, help='do not read OME series from multiple files') - opt('--noplots', dest='noplots', type='int', default=10, - help='maximum number of plots') - opt('--interpol', dest='interpol', metavar='INTERPOL', default=None, - help='image interpolation method') - opt('--dpi', dest='dpi', type='int', default=96, - help='plot resolution') - opt('--vmin', dest='vmin', type='int', default=None, - help='minimum value for colormapping') - opt('--vmax', dest='vmax', type='int', default=None, - help='maximum value for colormapping') - opt('--debug', dest='debug', action='store_true', default=False, - help='raise exception on failures') - opt('--doctest', dest='doctest', action='store_true', default=False, - help='runs the docstring examples') - opt('-v', '--detail', dest='detail', type='int', default=2) - opt('-q', '--quiet', dest='quiet', action='store_true') - - settings, path = parser.parse_args() - path = ' '.join(path) - - if settings.doctest: - import doctest - if sys.version_info < (3, 6): - print('Doctests work with Python >=3.6 only') - return 0 - doctest.testmod(optionflags=doctest.ELLIPSIS) - return 0 - if not path: - path = askopenfilename(title='Select a TIFF file', - filetypes=TIFF.FILEOPEN_FILTER) - if not path: - parser.error('No file specified') - - if any(i in path for i in '?*'): - path = glob.glob(path) - if not path: - print('No files match the pattern') - return 0 - # TODO: handle image sequences - path = path[0] - - if not settings.quiet: - print_('\nReading TIFF header:', end=' ', flush=True) - timer = Timer() - try: - tif = TiffFile(path, multifile=not 
settings.nomultifile) - except Exception as exc: - if settings.debug: - raise - print('\n\n%s: %s' % (exc.__class__.__name__, exc)) - sys.exit(0) - - if not settings.quiet: - print(timer) - - if tif.is_ome: - settings.norgb = True - - images = [] - if settings.noplots > 0: - if not settings.quiet: - print_('Reading image data: ', end=' ', flush=True) - - def notnone(x): - return next(i for i in x if i is not None) - - timer.start() - try: - if settings.page >= 0: - images = [(tif.asarray(key=settings.page), - tif[settings.page], None)] - elif settings.series >= 0: - images = [(tif.asarray(series=settings.series), - notnone(tif.series[settings.series]._pages), - tif.series[settings.series])] - else: - for i, s in enumerate(tif.series[:settings.noplots]): - try: - images.append((tif.asarray(series=i), - notnone(s._pages), - tif.series[i])) - except Exception as exc: - images.append((None, notnone(s.pages), None)) - if settings.debug: - raise - print('\nSeries %i failed with %s: %s... ' - % (i, exc.__class__.__name__, exc), end='') - except Exception as exc: - if settings.debug: - raise - print('%s: %s' % (exc.__class__.__name__, exc)) - - if not settings.quiet: - print(timer) - - if not settings.quiet: - print_('Generating report:', end=' ', flush=True) - timer.start() - info = TiffFile.__str__(tif, detail=int(settings.detail)) - print(timer) - print() - print(info) - print() - tif.close() - - if images and settings.noplots > 0: - try: - import matplotlib - matplotlib.use('TkAgg') - from matplotlib import pyplot - - except ImportError as exc: - log_warning('tifffile.main: %s: %s', exc.__class__.__name__, exc) - else: - for img, page, series in images: - if img is None: - continue - vmin, vmax = settings.vmin, settings.vmax - if page.keyframe.nodata: - try: - vmin = numpy.min(img[img > page.keyframe.nodata]) - except ValueError: - pass - if tif.is_stk: - try: - vmin = tif.stk_metadata['MinScale'] - vmax = tif.stk_metadata['MaxScale'] - except KeyError: - pass - else: - if vmax <= vmin: - vmin, vmax = settings.vmin, settings.vmax - if series: - title = '%s\n%s\n%s' % (str(tif), str(page), str(series)) - else: - title = '%s\n %s' % (str(tif), str(page)) - photometric = 'MINISBLACK' - if page.photometric not in (3,): - photometric = TIFF.PHOTOMETRIC(page.photometric).name - imshow(img, title=title, vmin=vmin, vmax=vmax, - bitspersample=page.bitspersample, - photometric=photometric, - interpolation=settings.interpol, - dpi=settings.dpi) - pyplot.show() - return 0 - - -if sys.version_info[0] == 2: - inttypes = int, long, numpy.integer # noqa - - def print_(*args, **kwargs): - """Print function with flush support.""" - flush = kwargs.pop('flush', False) - print(*args, **kwargs) - if flush: - sys.stdout.flush() - - def bytes2str(b, encoding=None, errors=None): - """Return string from bytes.""" - return b - - def str2bytes(s, encoding=None): - """Return bytes from string.""" - return s - - def bytestr(s, encoding='cp1252'): - """Return byte string from unicode string, else pass through.""" - return s.encode(encoding) if isinstance(s, unicode) else s - - def byte2int(b): - """Return value of byte as int.""" - return ord(b) - - def iogetbuffer(bio): - """Return contents of BytesIO buffer.""" - return bio.getvalue() - - class FileNotFoundError(IOError): - """FileNotFoundError exception for Python 2.""" - - TiffFrame = TiffPage # noqa -else: - inttypes = int, numpy.integer - basestring = str, bytes - unicode = str - print_ = print - - def bytes2str(b, encoding=None, errors='strict'): - """Return unicode 
string from encoded bytes.""" - if encoding is not None: - return b.decode(encoding, errors) - try: - return b.decode('utf-8', errors) - except UnicodeDecodeError: - return b.decode('cp1252', errors) - - def str2bytes(s, encoding='cp1252'): - """Return bytes from unicode string.""" - return s.encode(encoding) - - def bytestr(s, encoding='cp1252'): - """Return byte string from unicode string, else pass through.""" - return s.encode(encoding) if isinstance(s, str) else s - - def byte2int(b): - """Return value of byte as int.""" - return b - - def iogetbuffer(bio): - """Return view over BytesIO buffer.""" - return bio.getbuffer() - - -# deprecated - -def decodelzw(encoded): - """Decompress LZW encoded byte string.""" - warnings.warn( - 'The decodelzw function was removed from the tifffile package.\n' - 'Use the lzw_decode function from the imagecodecs package instead.') - return imagecodecs.lzw_decode(encoded) - - -decode_lzw = decodelzw -imsave = imwrite - -if __name__ == '__main__': - sys.exit(main()) From 7cb403a286a58c59ff3d4c0cc1abc43c89978fc4 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Mon, 26 Oct 2020 20:41:55 -0400 Subject: [PATCH 06/63] Replace thirdparty version of tifffile with requirements.txt --- .../md/data_file_types/ome_tiff_metadata_file.py | 2 +- .../md/data_file_types/scn_tiff_metadata_file.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ingest-pipeline/md/data_file_types/ome_tiff_metadata_file.py b/src/ingest-pipeline/md/data_file_types/ome_tiff_metadata_file.py index a6641a2b..71a463cc 100755 --- a/src/ingest-pipeline/md/data_file_types/ome_tiff_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/ome_tiff_metadata_file.py @@ -2,7 +2,7 @@ import xmltodict from metadata_file import MetadataFile -from thirdparty import tifffile +import tifffile class OMETiffMetadataFile(MetadataFile): """A metadata file type for OME-Tiff files""" diff --git a/src/ingest-pipeline/md/data_file_types/scn_tiff_metadata_file.py b/src/ingest-pipeline/md/data_file_types/scn_tiff_metadata_file.py index 7b275d73..65ef995e 100755 --- a/src/ingest-pipeline/md/data_file_types/scn_tiff_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/scn_tiff_metadata_file.py @@ -2,7 +2,7 @@ import xmltodict from metadata_file import MetadataFile -from thirdparty import tifffile +import tifffile class ScnTiffMetadataFile(MetadataFile): """A metadata file type for Scn-Tiff files""" From 5a5a14d4553115fa38f14b239ca765fcd224beac Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Mon, 26 Oct 2020 20:52:36 -0400 Subject: [PATCH 07/63] This reactivates the call to ingest-validation-tools in metadatatsv_metadata_file --- .gitmodules | 3 +++ .../metadatatsv_metadata_file.py | 19 +++++++++---------- src/ingest-pipeline/requirements.txt | 14 +++++++++++--- .../submodules/ingest-validation-tools | 2 +- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/.gitmodules b/.gitmodules index a2198239..534aa6fa 100644 --- a/.gitmodules +++ b/.gitmodules @@ -28,3 +28,6 @@ [submodule "src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims"] path = src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims url = https://github.com/hubmapconsortium/ome-tiff-pyramid +[submodule "src/ingest-pipeline/submodules/ingest-validation-tests"] + path = src/ingest-pipeline/submodules/ingest-validation-tests + url = git@github.com:hubmapconsortium/ingest-validation-tests.git diff --git a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py 
b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py index 705c8961..a92b0c98 100755 --- a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py @@ -14,16 +14,15 @@ class MetadataTSVMetadataFile(MetadataFile): category_name = 'METADATATSV'; def collect_metadata(self): -# print('validating {} as metadata.tsv'.format(self.path)) -# dirpath = Path(os.path.dirname(self.path)) -# submission = ingest_validation_tools_submission.Submission(directory_path=dirpath, -# ignore_files=os.path.basename(self.path)) -# report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors()) -# if report.errors: -# # Scan reports an error result -# with open('ingest_validation_tools_report.txt', 'w') as f: -# f.write(report.as_text()) -# raise MetadataError('{} failed ingest validation test'.format(self.path)) + print('validating {} as metadata.tsv'.format(self.path)) + dirpath = Path(os.path.dirname(self.path)) + submission = ingest_validation_tools_submission.Submission(directory_path=dirpath) + report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors()) + if report.errors: + # Scan reports an error result + with open('ingest_validation_tools_report.txt', 'w') as f: + f.write(report.as_text()) + raise MetadataError('{} failed ingest validation test'.format(self.path)) print('parsing metadatatsv from {}'.format(self.path)) md = [] with open(self.path, 'rU', newline='') as f: diff --git a/src/ingest-pipeline/requirements.txt b/src/ingest-pipeline/requirements.txt index 24560b33..dc2312d1 100644 --- a/src/ingest-pipeline/requirements.txt +++ b/src/ingest-pipeline/requirements.txt @@ -1,16 +1,24 @@ git+git://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons prov==1.5.1 pylibczi>=1.1.1 +tifffile>=2020.10.1 xmltodict>=0.12.0 -pyimzml>=1.3.0 +#pyimzml>=1.3.0 +pyimzml>=1.2.6 airflow-multi-dagrun>=1.2 jsonschema==3.2.0 fastjsonschema==2.14.2 requests>=1.2 jsonref-ap==0.3.dev0 -tableschema>=1.15.0 -goodtables>=2.4.9 PyYAML>=5.3.1 rdflib==4.2.2 rdflib-jsonld==0.4.0 git+git://github.com/hubmapconsortium/cwltool.git@docker-gpu#egg=cwltool +# We need the dependencies of ingest-validation tools, but relative paths don't work +# -r ${CWD}/submodules/ingest-validation-tools/requirements.txt +#jsonschema==3.2.0 +#pyyaml>=5.3.1 +tableschema==1.15.0 +goodtables==2.4.9 +globus-cli==1.12.0 +yattag==1.14.0 diff --git a/src/ingest-pipeline/submodules/ingest-validation-tools b/src/ingest-pipeline/submodules/ingest-validation-tools index 4f951e1a..7302226c 160000 --- a/src/ingest-pipeline/submodules/ingest-validation-tools +++ b/src/ingest-pipeline/submodules/ingest-validation-tools @@ -1 +1 @@ -Subproject commit 4f951e1a25e0e8d579721112ed6cd965f0eed551 +Subproject commit 7302226c72e3637d7f62e3f67f81ef247d2f926b From 9eecc2cf7054bd643a500548e967a24f9bd9670f Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Wed, 9 Dec 2020 12:11:01 -0500 Subject: [PATCH 08/63] Update codex-pipeline submodule to v1.6.8 --- src/ingest-pipeline/airflow/dags/cwl/codex-pipeline | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/airflow/dags/cwl/codex-pipeline b/src/ingest-pipeline/airflow/dags/cwl/codex-pipeline index 221f7c42..b007e431 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/codex-pipeline +++ b/src/ingest-pipeline/airflow/dags/cwl/codex-pipeline @@ -1 +1 @@ -Subproject commit 221f7c4215e926f3e816bac60812007601d45e5d +Subproject commit 
b007e431acbb76cea3833f8596a2e27e21660c08 From 86ce9764e45fba6e44cbf1fa49dc37590a5d22e1 Mon Sep 17 00:00:00 2001 From: Hubmap Hive Date: Thu, 10 Dec 2020 13:57:01 -0500 Subject: [PATCH 09/63] Bump ingest-validation-tools to head of master --- src/ingest-pipeline/submodules/ingest-validation-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/submodules/ingest-validation-tools b/src/ingest-pipeline/submodules/ingest-validation-tools index 7302226c..61c7c1d9 160000 --- a/src/ingest-pipeline/submodules/ingest-validation-tools +++ b/src/ingest-pipeline/submodules/ingest-validation-tools @@ -1 +1 @@ -Subproject commit 7302226c72e3637d7f62e3f67f81ef247d2f926b +Subproject commit 61c7c1d99925d7041e6169e4aca51c4f3908a5d6 From f34ef5520097c83245efad26443fe6a7e8e0c02e Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Tue, 15 Dec 2020 15:24:07 -0500 Subject: [PATCH 10/63] devtest_data_collection learns to accept optional ome.tiff files --- .../md/data_collection_types/devtest_data_collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/md/data_collection_types/devtest_data_collection.py b/src/ingest-pipeline/md/data_collection_types/devtest_data_collection.py index 4e405f12..957f0d18 100755 --- a/src/ingest-pipeline/md/data_collection_types/devtest_data_collection.py +++ b/src/ingest-pipeline/md/data_collection_types/devtest_data_collection.py @@ -19,7 +19,7 @@ class DEVTESTDataCollection(DataCollection): expected_files = [('test.yml', "YAML"), ] - optional_files = [] + optional_files = [('*.ome.tiff', 'OME_TIFF')] @classmethod def test_match(cls, path): From cf744726836b66a7efc4b86783a87c47e03bdd4b Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Tue, 15 Dec 2020 15:27:57 -0500 Subject: [PATCH 11/63] Update branch of commit ingest-validation-tools --- src/ingest-pipeline/submodules/ingest-validation-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/submodules/ingest-validation-tools b/src/ingest-pipeline/submodules/ingest-validation-tools index 7302226c..f3d718e6 160000 --- a/src/ingest-pipeline/submodules/ingest-validation-tools +++ b/src/ingest-pipeline/submodules/ingest-validation-tools @@ -1 +1 @@ -Subproject commit 7302226c72e3637d7f62e3f67f81ef247d2f926b +Subproject commit f3d718e6e107d767875c39855ccd26819a92524e From 2ebddacdc139ad069395556523e5bbaa1bed39b1 Mon Sep 17 00:00:00 2001 From: Hubmap Hive Date: Wed, 16 Dec 2020 13:43:47 -0500 Subject: [PATCH 12/63] Temporarily un-set tifffile version. WIP. 
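Until a working pin is settled on, whichever tifffile release pip resolves is the one the OME/SCN metadata readers import (they now use a plain `import tifffile`). A rough sanity check for the installed copy — the .ome.tiff path below is only a placeholder, not a real fixture:

    import tifffile
    print(tifffile.__version__)

    # open any OME-TIFF on hand; 'example.ome.tiff' is a stand-in path
    with tifffile.TiffFile('example.ome.tiff') as tif:
        print(tif.is_ome, len(tif.pages))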
--- src/ingest-pipeline/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ingest-pipeline/requirements.txt b/src/ingest-pipeline/requirements.txt index dc2312d1..2562f0f0 100644 --- a/src/ingest-pipeline/requirements.txt +++ b/src/ingest-pipeline/requirements.txt @@ -1,7 +1,8 @@ git+git://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons prov==1.5.1 pylibczi>=1.1.1 -tifffile>=2020.10.1 +#tifffile==2020.12.8 +tifffile xmltodict>=0.12.0 #pyimzml>=1.3.0 pyimzml>=1.2.6 From 6311365443e4d936a328cb3c56f3617f1c9fa735 Mon Sep 17 00:00:00 2001 From: Hubmap Hive Date: Wed, 16 Dec 2020 14:31:27 -0500 Subject: [PATCH 13/63] Bump ingest-validation-tools to master HEAD --- src/ingest-pipeline/submodules/ingest-validation-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/submodules/ingest-validation-tools b/src/ingest-pipeline/submodules/ingest-validation-tools index 61c7c1d9..86a8ecc5 160000 --- a/src/ingest-pipeline/submodules/ingest-validation-tools +++ b/src/ingest-pipeline/submodules/ingest-validation-tools @@ -1 +1 @@ -Subproject commit 61c7c1d99925d7041e6169e4aca51c4f3908a5d6 +Subproject commit 86a8ecc514f9bd32b792400543dca0a28f06ef1e From a43c2b715e032f2fc76aaaa76bbe8bf91641c6b8 Mon Sep 17 00:00:00 2001 From: Hubmap Hive Date: Wed, 16 Dec 2020 14:33:35 -0500 Subject: [PATCH 14/63] Bump portal-containers to master HEAD --- src/ingest-pipeline/airflow/dags/cwl/portal-containers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/airflow/dags/cwl/portal-containers b/src/ingest-pipeline/airflow/dags/cwl/portal-containers index fb191032..6a89ea66 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/portal-containers +++ b/src/ingest-pipeline/airflow/dags/cwl/portal-containers @@ -1 +1 @@ -Subproject commit fb1910324fc796ff4b7d4e643de27ff2861e7d8c +Subproject commit 6a89ea661169f8210ea785ed9edefac0857e0475 From a88ae113abbb02f7d82d2a33d63e863a41cc4d86 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Mon, 21 Dec 2020 16:12:08 -0500 Subject: [PATCH 15/63] Plugins: Add initial auth plugin for Globus. 
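The backend drives a standard Globus Auth authorization-code flow through globus_sdk and flask_login. Roughly, outside of Airflow, the token exchange the plugin performs looks like the sketch below; CLIENT_ID, CLIENT_SECRET, the redirect URL, and the callback code are all placeholders:

    import globus_sdk

    CLIENT_ID = '<app client id>'          # placeholder
    CLIENT_SECRET = '<app client secret>'  # placeholder

    client = globus_sdk.ConfidentialAppAuthClient(CLIENT_ID, CLIENT_SECRET)
    client.oauth2_start_flow('https://ingest.example.org/oauth2callback')  # placeholder redirect

    # step 1: send the browser to Globus Auth
    print(client.oauth2_get_authorize_url())

    # step 2: Globus redirects back with ?code=...; trade it for tokens
    code = '<code from the callback request>'  # placeholder
    tokens = client.oauth2_exchange_code_for_tokens(code)
    auth_token = tokens.by_resource_server['auth.globus.org']['access_token']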
--- .../airflow/plugins/globus_auth/__init__.py | 0 .../plugins/globus_auth/globus_auth.py | 166 ++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 src/ingest-pipeline/airflow/plugins/globus_auth/__init__.py create mode 100644 src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py diff --git a/src/ingest-pipeline/airflow/plugins/globus_auth/__init__.py b/src/ingest-pipeline/airflow/plugins/globus_auth/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py b/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py new file mode 100644 index 00000000..c74b76fa --- /dev/null +++ b/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py @@ -0,0 +1,166 @@ +import flask_login + +import globus_sdk + +# Need to expose these downstream +# flake8: noqa: F401 +from flask_login import current_user, logout_user, login_required, login_user + +from flask import url_for, redirect, request + +from flask_oauthlib.client import OAuth + +from airflow import models, configuration +from airflow.utils.db import provide_session +from airflow.utils.log.logging_mixin import LoggingMixin + +log = LoggingMixin().log + + +def get_config_param(param): + return str(configuration.conf.get('globus', param)) + + +class GlobusUser(models.User): + + def __init__(self, user): + self.user = user + + @property + def is_active(self): + """Required by flask_login""" + return True + + @property + def is_authenticated(self): + """Required by flask_login""" + return True + + @property + def is_anonymous(self): + """Required by flask_login""" + return False + + def get_id(self): + """Returns the current user id as required by flask_login""" + return self.user.get_id() + + def data_profiling(self): + """Provides access to data profiling tools""" + return True + + def is_superuser(self): + """Access all the things""" + return True + + +class AuthenticationError(Exception): + pass + + +class GlobusAuthBackend(object): + + def __init__(self): + # self.globus_host = get_config_param('host') + self.login_manager = flask_login.LoginManager() + self.login_manager.login_view = 'airflow.login' + self.flask_app = None + self.globus_oauth = None + self.api_rev = None + + def init_app(self, flask_app): + self.flask_app = flask_app + + self.login_manager.init_app(self.flask_app) + + self.globus_oauth = globus_sdk.ConfidentialAppAuthClient( + get_config_param('APP_CLIENT_ID'), get_config_param('APP_CLIENT_SECRET')) + + self.login_manager.user_loader(self.load_user) + + self.flask_app.add_url_rule(get_config_param('oauth_callback_route'), + 'globus_oauth_callback', + self.oauth_callback) + + def login(self, request): + log.debug('Redirecting user to Globus login') + return self.globus_oauth.oauth2_start_flow(url_for( + 'globus_oauth_callback', + _external=True)) + + def get_globus_user_profile_info(self, globus_token): + resp = self.globus_oauth.oauth2_userinfo() + + if not resp or resp.status != 200: + raise AuthenticationError( + 'Failed to fetch user profile, status ({0})'.format( + resp.status if resp else 'None')) + + return resp['name'], resp['email'] + + @provide_session + def load_user(self, userid, session=None): + if not userid or userid == 'None': + return None + + user = session.query(models.User).filter( + models.User.id == int(userid)).first() + return GlobusUser(user) + + @provide_session + def oauth_callback(self, session=None): + log.debug('Globus OAuth callback called') + + next_url = request.args.get('state') 
or url_for('admin.index') + + resp = self.globus_oauth.authorized_response() + + try: + # If there's no "code" query string parameter, we're in this route + # starting a Globus Auth login flow. + # Redirect out to Globus Auth + if 'code' not in request.args: + auth_uri = self.globus_oauth.oauth2_get_authorize_url(additional_params={ + "scope": "openid profile email urn:globus:auth:scope:transfer.api.globus.org:all urn:globus:auth:scope:auth.globus.org:view_identities urn:globus:auth:scope:nexus.api.globus.org:groups"}) + return redirect(auth_uri) + # If we do have a "code" param, we're coming back from Globus Auth + # and can start the process of exchanging an auth code for a token. + else: + code = request.args.get('code') + tokens = self.globus_oauth.oauth2_exchange_code_for_tokens(code) + + globus_token = tokens.by_resource_server['auth.globus.org']['access_token'] + + username, email = self.get_globus_user_profile_info(globus_token) + + # store the resulting tokens in the session + session.update( + tokens=tokens.by_resource_server, + is_authenticated=True + ) + user = session.query(models.User).filter( + models.User.username == username).first() + + if not user: + user = models.User( + username=username, + email=email, + is_superuser=False) + + session.merge(user) + session.commit() + login_user(GlobusUser(user)) + session.commit() + + return redirect(next_url) + except AuthenticationError: + return redirect(url_for('airflow.noaccess')) + + + + +login_manager = GlobusAuthBackend() + + +def login(self, request): + return login_manager.login(request) \ No newline at end of file From ac338b5a23f640c92a6f87dba2b94e36b2d4a4a1 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Tue, 5 Jan 2021 10:24:35 -0500 Subject: [PATCH 16/63] entrypoint.sh update was cached but not committed --- docker/ingest-pipeline/entrypoint.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/ingest-pipeline/entrypoint.sh b/docker/ingest-pipeline/entrypoint.sh index 172b5171..d7533d78 100644 --- a/docker/ingest-pipeline/entrypoint.sh +++ b/docker/ingest-pipeline/entrypoint.sh @@ -9,13 +9,15 @@ export PATH=/home/airflow/.local/bin:$PATH # Install custom python package if requirements.txt is present pip install --upgrade pip -pip install --user 'apache-airflow[celery,crypto,postgres,redis,ssh]' +pip install --user flask-admin +pip install --user 'apache-airflow[celery,crypto,postgres,redis,ssh]<2.0.0' if [[ -e "/requirements.txt" ]]; then $(command -v pip) install --user -r /requirements.txt fi # Global defaults and back-compat export AIRFLOW__CORE__FERNET_KEY=`python -c 'from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)'` +export AIRFLOW__WEBSERVER__SECRET_KEY=`openssl rand -hex 30` # Load DAGs examples (default: Yes) if [[ -z "$AIRFLOW__CORE__LOAD_EXAMPLES" && "${LOAD_EX:=n}" == n ]]; then From 0f24f494434e06fca581ecceb8066703f2312ff4 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Tue, 5 Jan 2021 14:02:31 -0500 Subject: [PATCH 17/63] General: Add flask oauth --- src/ingest-pipeline/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ingest-pipeline/requirements.txt b/src/ingest-pipeline/requirements.txt index 24560b33..b5b2ed66 100644 --- a/src/ingest-pipeline/requirements.txt +++ b/src/ingest-pipeline/requirements.txt @@ -13,4 +13,5 @@ goodtables>=2.4.9 PyYAML>=5.3.1 rdflib==4.2.2 rdflib-jsonld==0.4.0 +Flask-OAuth==0.12 git+git://github.com/hubmapconsortium/cwltool.git@docker-gpu#egg=cwltool From 
7db0456c4c773c9b04e116aa1fced1263ec11965 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Tue, 5 Jan 2021 14:23:46 -0500 Subject: [PATCH 18/63] General: Flask-OAuthlib != Flask-OAuth. Also add globus_sdk --- src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims | 2 +- src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline | 2 +- src/ingest-pipeline/requirements.txt | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims b/src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims index 5224e3ab..bed64eb3 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims +++ b/src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims @@ -1 +1 @@ -Subproject commit 5224e3ab90a5de2666764e1061acb2df2c590ec5 +Subproject commit bed64eb34f697c04cddc8ee085871d79523932ff diff --git a/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline b/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline index 8471d6e6..53415520 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline +++ b/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline @@ -1 +1 @@ -Subproject commit 8471d6e68c990d84106527d36f74ead3ab9c09b3 +Subproject commit 53415520f94ac0d9a1fae89af0f6e8250240723a diff --git a/src/ingest-pipeline/requirements.txt b/src/ingest-pipeline/requirements.txt index b5b2ed66..70cd46b2 100644 --- a/src/ingest-pipeline/requirements.txt +++ b/src/ingest-pipeline/requirements.txt @@ -13,5 +13,6 @@ goodtables>=2.4.9 PyYAML>=5.3.1 rdflib==4.2.2 rdflib-jsonld==0.4.0 -Flask-OAuth==0.12 +Flask-OAuthlib==0.9.6 +globus-sdk==1.10.0 git+git://github.com/hubmapconsortium/cwltool.git@docker-gpu#egg=cwltool From c7f63a24bb3ccdca653acc3a1b5918b53f32b33c Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Tue, 5 Jan 2021 16:38:23 -0500 Subject: [PATCH 19/63] General: Set authenticate to True --- docker/ingest-pipeline/config/airflow.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/ingest-pipeline/config/airflow.cfg b/docker/ingest-pipeline/config/airflow.cfg index 6af0c6ee..4959645e 100644 --- a/docker/ingest-pipeline/config/airflow.cfg +++ b/docker/ingest-pipeline/config/airflow.cfg @@ -352,7 +352,7 @@ expose_stacktrace = True # Set to true to turn on authentication: # https://airflow.apache.org/security.html#web-authentication -authenticate = False +authenticate = True # Filter the list of dags by owner name (requires authentication to be enabled) filter_by_owner = False From 258380e45a7b5b572f32aa8fd6a20569e5155af5 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Wed, 6 Jan 2021 13:26:34 -0500 Subject: [PATCH 20/63] General: Add globus authentication to the config for webserver auth. --- docker/ingest-pipeline/config/airflow.cfg | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker/ingest-pipeline/config/airflow.cfg b/docker/ingest-pipeline/config/airflow.cfg index 4959645e..13fd694e 100644 --- a/docker/ingest-pipeline/config/airflow.cfg +++ b/docker/ingest-pipeline/config/airflow.cfg @@ -354,6 +354,8 @@ expose_stacktrace = True # https://airflow.apache.org/security.html#web-authentication authenticate = True +auth_backend = globus_auth.globus_auth + # Filter the list of dags by owner name (requires authentication to be enabled) filter_by_owner = False @@ -1053,3 +1055,8 @@ fs_group = # The worker pods will be given these static labels, as well as some additional dynamic labels # to identify the task. 
# Should be supplied in the format: ``key = value`` + +[globus] +app_client_id = +app_client_secret = +oauth_callback_route = /oauth2callback \ No newline at end of file From a89823cf5ee619ff9e7ebe51fb30892fb23a9733 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Mon, 11 Jan 2021 16:25:25 -0500 Subject: [PATCH 21/63] General: Attempt to fix this --- .../plugins/globus_auth/globus_auth.py | 62 ++++++++----------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py b/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py index c74b76fa..87c89e70 100644 --- a/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py +++ b/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py @@ -78,53 +78,23 @@ def init_app(self, flask_app): self.login_manager.user_loader(self.load_user) - self.flask_app.add_url_rule(get_config_param('oauth_callback_route'), - 'globus_oauth_callback', - self.oauth_callback) - - def login(self, request): - log.debug('Redirecting user to Globus login') - return self.globus_oauth.oauth2_start_flow(url_for( - 'globus_oauth_callback', - _external=True)) - - def get_globus_user_profile_info(self, globus_token): - resp = self.globus_oauth.oauth2_userinfo() - - if not resp or resp.status != 200: - raise AuthenticationError( - 'Failed to fetch user profile, status ({0})'.format( - resp.status if resp else 'None')) - - return resp['name'], resp['email'] + self.flask_app.add_url_rule('/login', + 'login', + self.login) @provide_session - def load_user(self, userid, session=None): - if not userid or userid == 'None': - return None - - user = session.query(models.User).filter( - models.User.id == int(userid)).first() - return GlobusUser(user) - - @provide_session - def oauth_callback(self, session=None): - log.debug('Globus OAuth callback called') + def login(self, request=None, session=None): + log.debug('Redirecting user to Globus login') - next_url = request.args.get('state') or url_for('admin.index') + redirect_url = url_for('login', _external=True) - resp = self.globus_oauth.authorized_response() + self.globus_oauth.oauth2_start_flow(redirect_url) try: - # If there's no "code" query string parameter, we're in this route - # starting a Globus Auth login flow. - # Redirect out to Globus Auth if 'code' not in request.args: auth_uri = self.globus_oauth.oauth2_get_authorize_url(additional_params={ "scope": "openid profile email urn:globus:auth:scope:transfer.api.globus.org:all urn:globus:auth:scope:auth.globus.org:view_identities urn:globus:auth:scope:nexus.api.globus.org:groups"}) return redirect(auth_uri) - # If we do have a "code" param, we're coming back from Globus Auth - # and can start the process of exchanging an auth code for a token. 
else: code = request.args.get('code') tokens = self.globus_oauth.oauth2_exchange_code_for_tokens(code) @@ -152,12 +122,30 @@ def oauth_callback(self, session=None): login_user(GlobusUser(user)) session.commit() + next_url = request.args.get('state') or url_for('admin.index') return redirect(next_url) except AuthenticationError: return redirect(url_for('airflow.noaccess')) + def get_globus_user_profile_info(self, globus_token): + resp = self.globus_oauth.oauth2_userinfo() + + if not resp or resp.status != 200: + raise AuthenticationError( + 'Failed to fetch user profile, status ({0})'.format( + resp.status if resp else 'None')) + return resp['name'], resp['email'] + + @provide_session + def load_user(self, userid, session=None): + if not userid or userid == 'None': + return None + + user = session.query(models.User).filter( + models.User.id == int(userid)).first() + return GlobusUser(user) login_manager = GlobusAuthBackend() From af5fa52257968aac7764bf965eb6dde125322b6c Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Mon, 11 Jan 2021 18:04:11 -0500 Subject: [PATCH 22/63] Updates on the path to handling contributor and antibody .tsv files. WIP. --- .../generic_metadatatsv_data_collection.py | 23 +++++++++++++++---- .../metadatatsv_data_collection.py | 2 +- src/ingest-pipeline/md/metadata_extract.py | 2 +- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/ingest-pipeline/md/data_collection_types/generic_metadatatsv_data_collection.py b/src/ingest-pipeline/md/data_collection_types/generic_metadatatsv_data_collection.py index 891835c3..1274c1c5 100755 --- a/src/ingest-pipeline/md/data_collection_types/generic_metadatatsv_data_collection.py +++ b/src/ingest-pipeline/md/data_collection_types/generic_metadatatsv_data_collection.py @@ -16,7 +16,7 @@ class GenericMetadataTSVDataCollection(DataCollection): category_name = 'GENERICMETADATATSV'; - match_priority = 0.1 # >= 0.0; higher is better + match_priority = 2.0 # >= 0.0; higher is better top_target = None dir_regex = None @@ -80,11 +80,26 @@ def collect_metadata(self): rslt[os.path.relpath(fpath, self.topdir)] = this_md fname = os.path.basename(fpath) if 'metadata' in fname and fname.endswith('.tsv'): - assert isinstance(this_md, list), 'metadata...tsv did not produce a list' - cl.extend(this_md) + assert isinstance(this_md, list), 'metadata.tsv did not produce a list' + rec_list = this_md + for rec in rec_list: + for key in ['assay_type', 'data_path', 'contributors_path']: + assert key in rec, ('metadata.tsv does not have a' + '"{}" column'.format(key)) + this_dict = {'metadata': rec} + for sub_key, dict_key in [('contributors_path', 'contributors'), + ('antibodies_path', 'antibodies')]: + if sub_key in rec: + assert rec[sub_key].endswith('.tsv') + sub_path = os.path.join(os.path.dirname(fpath), + rec[sub_key]) + sub_parser = md_type_tbl['METADATATSV'](sub_path) + sub_md = sub_parser.collect_metadata() + this_dict[dict_key] = sub_md + cl.append(this_dict) rslt['components'] = cl - rslt['collectiontype'] = 'single_metadatatsv' + rslt['collectiontype'] = 'generic_metadatatsv' return rslt def basic_filter_metadata(self, raw_metadata): diff --git a/src/ingest-pipeline/md/data_collection_types/metadatatsv_data_collection.py b/src/ingest-pipeline/md/data_collection_types/metadatatsv_data_collection.py index a5332a9d..24f1397c 100755 --- a/src/ingest-pipeline/md/data_collection_types/metadatatsv_data_collection.py +++ b/src/ingest-pipeline/md/data_collection_types/metadatatsv_data_collection.py @@ -72,7 +72,7 @@ def 
collect_metadata(self): rslt[os.path.relpath(fpath, self.topdir)] = this_md fname = os.path.basename(fpath) if 'metadata' in fname and fname.endswith('.tsv'): - assert isinstance(this_md, list), 'metadata...tsv did not produce a list' + assert isinstance(this_md, list), 'metadata.tsv did not produce a list' cl.extend(this_md) rslt['components'] = cl diff --git a/src/ingest-pipeline/md/metadata_extract.py b/src/ingest-pipeline/md/metadata_extract.py index 7db77cbe..93c8958f 100755 --- a/src/ingest-pipeline/md/metadata_extract.py +++ b/src/ingest-pipeline/md/metadata_extract.py @@ -35,7 +35,7 @@ def scan(target_dir, out_fname, schema_fname, yaml_flag=False): for collection_type in _KNOWN_DATA_COLLECTION_TYPES: if collection_type.test_match(target_dir): - #print('collector match: ', collection_type.category_name) + print('collector match: ', collection_type.category_name) collector = collection_type(target_dir) metadata = collector.filter_metadata(collector.collect_metadata()) #print('collector: ', repr(collector)) From d445b85332eb856cf9447256384fb829d46879a7 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Tue, 12 Jan 2021 13:34:11 -0500 Subject: [PATCH 23/63] General: Have to integrate with AuthHelper to use some of their special functions. --- .../plugins/globus_auth/globus_auth.py | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py b/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py index 87c89e70..a7363ec4 100644 --- a/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py +++ b/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py @@ -14,6 +14,8 @@ from airflow.utils.db import provide_session from airflow.utils.log.logging_mixin import LoggingMixin +from hubmap_commons.hm_auth import AuthHelper + log = LoggingMixin().log @@ -67,8 +69,12 @@ def __init__(self): self.flask_app = None self.globus_oauth = None self.api_rev = None + self.authHelper = None def init_app(self, flask_app): + client_id = get_config_param('APP_CLIENT_ID') + client_secret = get_config_param('APP_CLIENT_SECRET') + self.flask_app = flask_app self.login_manager.init_app(self.flask_app) @@ -82,8 +88,13 @@ def init_app(self, flask_app): 'login', self.login) + if not AuthHelper.isInitialized(): + self.authHelper = AuthHelper.create(clientId=client_id, clientSecret=client_secret) + else: + self.authHelper = AuthHelper.instance() + @provide_session - def login(self, request=None, session=None): + def login(self, session=None): log.debug('Redirecting user to Globus login') redirect_url = url_for('login', _external=True) @@ -99,15 +110,8 @@ def login(self, request=None, session=None): code = request.args.get('code') tokens = self.globus_oauth.oauth2_exchange_code_for_tokens(code) - globus_token = tokens.by_resource_server['auth.globus.org']['access_token'] + username, email = self.get_globus_user_profile_info(tokens.by_resource_server['auth.globus.org']['access_token']) - username, email = self.get_globus_user_profile_info(globus_token) - - # store the resulting tokens in the session - session.update( - tokens=tokens.by_resource_server, - is_authenticated=True - ) user = session.query(models.User).filter( models.User.username == username).first() @@ -124,19 +128,14 @@ def login(self, request=None, session=None): next_url = request.args.get('state') or url_for('admin.index') return redirect(next_url) - except AuthenticationError: + except Exception as e: + log.error(e) return 
redirect(url_for('airflow.noaccess')) - def get_globus_user_profile_info(self, globus_token): - resp = self.globus_oauth.oauth2_userinfo() - - if not resp or resp.status != 200: - raise AuthenticationError( - 'Failed to fetch user profile, status ({0})'.format( - resp.status if resp else 'None')) - - return resp['name'], resp['email'] + def get_globus_user_profile_info(self, token): + userInfo = self.authHelper.getUserInfo(token) + return userInfo['name'], userInfo['email'] @provide_session def load_user(self, userid, session=None): From dfb5a83706e91bef0a92f4e7ba94e8deed553142 Mon Sep 17 00:00:00 2001 From: Hubmap Hive Date: Wed, 13 Jan 2021 17:18:57 -0500 Subject: [PATCH 24/63] Added a specific TSV metadata type; modified generic_metadatatsv_data_collection to be more of a catch-all. --- .../airflow/dags/workflow_map.yml | 3 +++ .../md/data_collection_types/__init__.py | 14 ++++++++----- .../generic_metadatatsv_data_collection.py | 2 +- .../md/data_file_types/__init__.py | 3 ++- .../metadatatsv_metadata_file.py | 5 ++++- .../md/data_file_types/tsv_metadata_file.py | 20 +++++++++++++++++++ src/ingest-pipeline/md/metadata_extract.py | 2 +- 7 files changed, 40 insertions(+), 9 deletions(-) create mode 100755 src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py diff --git a/src/ingest-pipeline/airflow/dags/workflow_map.yml b/src/ingest-pipeline/airflow/dags/workflow_map.yml index eaff858a..3cbe5cc6 100644 --- a/src/ingest-pipeline/airflow/dags/workflow_map.yml +++ b/src/ingest-pipeline/airflow/dags/workflow_map.yml @@ -8,6 +8,9 @@ workflow_map: - 'collection_type': 'codex' 'assay_type': 'CODEX' 'workflow': 'codex_cytokit' + - 'collection_type': '.*' + 'assay_type': 'CODEX' + 'workflow': 'codex_cytokit' - 'collection_type': '.*' 'assay_type': 'MxIF' 'workflow': 'ometiff_pyramid' diff --git a/src/ingest-pipeline/md/data_collection_types/__init__.py b/src/ingest-pipeline/md/data_collection_types/__init__.py index 6ba001d9..72bbe002 100644 --- a/src/ingest-pipeline/md/data_collection_types/__init__.py +++ b/src/ingest-pipeline/md/data_collection_types/__init__.py @@ -6,8 +6,12 @@ from .metadatatsv_data_collection import MetadataTSVDataCollection from .generic_metadatatsv_data_collection import GenericMetadataTSVDataCollection -__all__ = [MetadataTSVDataCollection, - IMSDataCollection, RNASEQ10XDataCollection, - StanfordCODEXDataCollection, - AkoyaCODEXDataCollection, DEVTESTDataCollection, - GenericMetadataTSVDataCollection] +__all__ = [ + MetadataTSVDataCollection, + IMSDataCollection, + RNASEQ10XDataCollection, + StanfordCODEXDataCollection, + AkoyaCODEXDataCollection, + DEVTESTDataCollection, + GenericMetadataTSVDataCollection +] diff --git a/src/ingest-pipeline/md/data_collection_types/generic_metadatatsv_data_collection.py b/src/ingest-pipeline/md/data_collection_types/generic_metadatatsv_data_collection.py index 1274c1c5..37355b1b 100755 --- a/src/ingest-pipeline/md/data_collection_types/generic_metadatatsv_data_collection.py +++ b/src/ingest-pipeline/md/data_collection_types/generic_metadatatsv_data_collection.py @@ -93,7 +93,7 @@ def collect_metadata(self): assert rec[sub_key].endswith('.tsv') sub_path = os.path.join(os.path.dirname(fpath), rec[sub_key]) - sub_parser = md_type_tbl['METADATATSV'](sub_path) + sub_parser = md_type_tbl['TSV'](sub_path) sub_md = sub_parser.collect_metadata() this_dict[dict_key] = sub_md cl.append(this_dict) diff --git a/src/ingest-pipeline/md/data_file_types/__init__.py b/src/ingest-pipeline/md/data_file_types/__init__.py index e28cb59a..93146c1a 
100644 --- a/src/ingest-pipeline/md/data_file_types/__init__.py +++ b/src/ingest-pipeline/md/data_file_types/__init__.py @@ -11,9 +11,10 @@ from .imzml_metadata_file import ImzMLMetadataFile from .fastq_metadata_file import FASTQMetadataFile from .csv_metadata_file import CSVMetadataFile +from .tsv_metadata_file import TSVMetadataFile from .metadatatsv_metadata_file import MetadataTSVMetadataFile __all__ = ["IgnoreMetadataFile", "YamlMetadataFile", "JSONMetadataFile", "TxtTformMetadataFile", "MtxTformMetadataFile", "CZIMetadataFile", "OMETiffMetadataFile", "ScnTiffMetadataFile", "ImzMLMetadataFile", "FASTQMetadataFile", "FalseJSONMetadataFile", "TxtWordListMetadataFile", - "CSVMetadataFile", "MetadataTSVMetadataFile"] + "CSVMetadataFile", "TSVMetadataFile", "MetadataTSVMetadataFile"] diff --git a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py index a92b0c98..81a27611 100755 --- a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py @@ -16,7 +16,10 @@ class MetadataTSVMetadataFile(MetadataFile): def collect_metadata(self): print('validating {} as metadata.tsv'.format(self.path)) dirpath = Path(os.path.dirname(self.path)) - submission = ingest_validation_tools_submission.Submission(directory_path=dirpath) + ignore_globs = [os.path.basename(self.path), 'extras'] + submission = ingest_validation_tools_submission.Submission(directory_path=dirpath, + dataset_ignore_globs=ignore_globs, + submission_ignore_globs='*') report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors()) if report.errors: # Scan reports an error result diff --git a/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py new file mode 100755 index 00000000..82d93135 --- /dev/null +++ b/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py @@ -0,0 +1,20 @@ +#! /usr/bin/env python + +from metadata_file import MetadataFile +import csv +from pprint import pprint + +class TSVMetadataFile(MetadataFile): + """ + A metadata file type specialized for tsv files, since the csv sniffer often fails + """ + category_name = 'TSV'; + + def collect_metadata(self): + print('parsing csv from %s' % self.path) + md = [] + with open(self.path, 'rU', newline='') as f: + reader = csv.DictReader(f, delimiter='\t') + for row in reader: + md.append({k : v for k, v in row.items()}) + return md diff --git a/src/ingest-pipeline/md/metadata_extract.py b/src/ingest-pipeline/md/metadata_extract.py index 93c8958f..b3265045 100755 --- a/src/ingest-pipeline/md/metadata_extract.py +++ b/src/ingest-pipeline/md/metadata_extract.py @@ -83,4 +83,4 @@ def main(myargv=None): sys.exit(f'{type(e).__name__}: {e}') if __name__ == '__main__': - main() \ No newline at end of file + main() From 9fed60464df13ca95bc11797ca30e296d72dbccf Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Wed, 13 Jan 2021 19:16:04 -0500 Subject: [PATCH 25/63] require 'ascii' codex when reading .tsv files --- .../md/data_file_types/tsv_metadata_file.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py index 82d93135..e3906d03 100755 --- a/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py @@ -1,6 +1,7 @@ #! 
/usr/bin/env python from metadata_file import MetadataFile +from type_base import MetadataError import csv from pprint import pprint @@ -13,8 +14,11 @@ class TSVMetadataFile(MetadataFile): def collect_metadata(self): print('parsing csv from %s' % self.path) md = [] - with open(self.path, 'rU', newline='') as f: - reader = csv.DictReader(f, delimiter='\t') - for row in reader: - md.append({k : v for k, v in row.items()}) + try: + with open(self.path, 'rU', newline='', encoding='ascii') as f: + reader = csv.DictReader(f, delimiter='\t') + for row in reader: + md.append({k : v for k, v in row.items()}) + except UnicodeDecodeError as e: + raise MetadataError(str(e) + f'in {self.path}') return md From 561dfe430aa6ef2511d75c4e970c592909429c94 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Thu, 14 Jan 2021 15:49:32 -0500 Subject: [PATCH 26/63] refactor tsv parsing, fix assay_type from scan_and_begin_processing --- .../airflow/dags/scan_and_begin_processing.py | 11 ++++++---- .../metadatatsv_metadata_file.py | 20 +++---------------- .../md/data_file_types/tsv_metadata_file.py | 9 ++++++++- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/scan_and_begin_processing.py b/src/ingest-pipeline/airflow/dags/scan_and_begin_processing.py index e7018988..d35c94dc 100644 --- a/src/ingest-pipeline/airflow/dags/scan_and_begin_processing.py +++ b/src/ingest-pipeline/airflow/dags/scan_and_begin_processing.py @@ -99,10 +99,13 @@ def send_status_msg(**kwargs): value=(scanned_md['collectiontype'] if 'collectiontype' in scanned_md else None)) - kwargs['ti'].xcom_push(key='assay_type', - value=(scanned_md['assay_type'] - if 'assay_type' in scanned_md - else None)) + if 'assay_type' in scanned_md: + assay_type = scanned_md['assay_type'] + elif 'metadata' in scanned_md and 'assay_type' in scanned_md['metadata']: + assay_type = scanned_md['metadata']['assay_type'] + else: + assay_type = None + kwargs['ti'].xcom_push(key='assay_type', value=assay_type) else: for op in retcode_ops: if retcode_dct[op]: diff --git a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py index 81a27611..e89bf6d2 100755 --- a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py @@ -3,11 +3,11 @@ import csv import os from pathlib import Path -from metadata_file import MetadataFile +from .tsv_metadata_file import TSVMetadataFile from type_base import MetadataError from submodules import ingest_validation_tools_submission, ingest_validation_tools_error_report -class MetadataTSVMetadataFile(MetadataFile): +class MetadataTSVMetadataFile(TSVMetadataFile): """ A metadata file type for the specialized metadata.tsv files used to store submission info """ @@ -27,20 +27,6 @@ def collect_metadata(self): f.write(report.as_text()) raise MetadataError('{} failed ingest validation test'.format(self.path)) print('parsing metadatatsv from {}'.format(self.path)) - md = [] - with open(self.path, 'rU', newline='') as f: - dialect = csv.Sniffer().sniff(f.read(256)) - f.seek(0) - reader = csv.DictReader(f, dialect=dialect) - for row in reader: - dct = {k : v for k, v in row.items()} - dct['_from_metadatatsv'] = True - md.append(dct) + md = super(self, MetadataTSVMetadataFile).collect_metadata() - # Scan for the common error of bad keys/values due to missing delimiters - for row in md: - if any(k in [None, ''] for k in row) or any(v is None for v 
in row.values()): - raise MetadataError('{} has empty keys or values. Delimiter error?' - .format(self.path)) - return md diff --git a/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py index e3906d03..54c18f93 100755 --- a/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/tsv_metadata_file.py @@ -12,7 +12,7 @@ class TSVMetadataFile(MetadataFile): category_name = 'TSV'; def collect_metadata(self): - print('parsing csv from %s' % self.path) + print('parsing tsv from %s' % self.path) md = [] try: with open(self.path, 'rU', newline='', encoding='ascii') as f: @@ -21,4 +21,11 @@ def collect_metadata(self): md.append({k : v for k, v in row.items()}) except UnicodeDecodeError as e: raise MetadataError(str(e) + f'in {self.path}') + + # Scan for the common error of bad keys/values due to missing delimiters + for row in md: + if any(k in [None, ''] for k in row) or any(v is None for v in row.values()): + raise MetadataError('{} has empty keys or values. Delimiter error?' + .format(self.path)) + return md From bf6584fa8fdfd21577692e3160e82f182baa6a30 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Thu, 14 Jan 2021 18:56:28 -0500 Subject: [PATCH 27/63] syntax debugging. WIP. --- .../md/data_file_types/metadatatsv_metadata_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py index e89bf6d2..97d8724b 100755 --- a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py @@ -27,6 +27,6 @@ def collect_metadata(self): f.write(report.as_text()) raise MetadataError('{} failed ingest validation test'.format(self.path)) print('parsing metadatatsv from {}'.format(self.path)) - md = super(self, MetadataTSVMetadataFile).collect_metadata() + md = super().collect_metadata() return md From 137230a608824b7a1a22b4e1a35ab6d57dc960b4 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Fri, 15 Jan 2021 18:19:42 -0500 Subject: [PATCH 28/63] Added commented-out offline=True to ingest-validation-tools call --- .../md/data_file_types/metadatatsv_metadata_file.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py index 97d8724b..b1aefec0 100755 --- a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py @@ -17,9 +17,14 @@ def collect_metadata(self): print('validating {} as metadata.tsv'.format(self.path)) dirpath = Path(os.path.dirname(self.path)) ignore_globs = [os.path.basename(self.path), 'extras'] + # + # Uncomment offline=True below to avoid validating orcid_id URLs &etc + # submission = ingest_validation_tools_submission.Submission(directory_path=dirpath, dataset_ignore_globs=ignore_globs, - submission_ignore_globs='*') + submission_ignore_globs='*', + #offline=True + ) report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors()) if report.errors: # Scan reports an error result From 360bc1e07f6ea82f8137c17c5f4c2d496b24a24e Mon Sep 17 00:00:00 2001 From: Hubmap Hive Date: Sat, 16 Jan 2021 00:58:58 -0500 Subject: [PATCH 29/63] offline=True, add_notes=False sent to ingest-validation-tools --- 
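An aside on the TSV handling above: a minimal standalone sketch (not part of any patch) of the failure mode the "empty keys or values. Delimiter error?" guard in TSVMetadataFile.collect_metadata is meant to catch. When a tab is missing (or the wrong delimiter is in play), csv.DictReader pads short rows with None values and files extra fields under a None key. The column names here are made up for illustration.

    import csv
    import io

    def looks_like_delimiter_error(rows):
        # Same heuristic as the guard above: empty/None keys or None values
        # usually mean the delimiter guess was wrong.
        return any(
            any(k in (None, '') for k in row) or any(v is None for v in row.values())
            for row in rows
        )

    good = io.StringIO("assay_type\tdonor\ttissue\ncodex\tABC123\tkidney\n")
    bad = io.StringIO("assay_type\tdonor\ttissue\ncodex ABC123\tkidney\n")  # space where a tab belongs

    print(looks_like_delimiter_error(list(csv.DictReader(good, delimiter='\t'))))  # False
    print(looks_like_delimiter_error(list(csv.DictReader(bad, delimiter='\t'))))   # True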
.../md/data_file_types/metadatatsv_metadata_file.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py index b1aefec0..477c99f5 100755 --- a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py @@ -23,11 +23,12 @@ def collect_metadata(self): submission = ingest_validation_tools_submission.Submission(directory_path=dirpath, dataset_ignore_globs=ignore_globs, submission_ignore_globs='*', - #offline=True + #offline=True, + add_notes=False ) - report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors()) - if report.errors: + if submission.get_errors(): # Scan reports an error result + report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors()) with open('ingest_validation_tools_report.txt', 'w') as f: f.write(report.as_text()) raise MetadataError('{} failed ingest validation test'.format(self.path)) From c94d5d8f324f27584df9e96035e90e2b57350b66 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Wed, 20 Jan 2021 16:19:36 -0500 Subject: [PATCH 30/63] Clean up requirements.txt a bit. --- src/ingest-pipeline/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ingest-pipeline/requirements.txt b/src/ingest-pipeline/requirements.txt index 2562f0f0..89120e88 100644 --- a/src/ingest-pipeline/requirements.txt +++ b/src/ingest-pipeline/requirements.txt @@ -4,7 +4,6 @@ pylibczi>=1.1.1 #tifffile==2020.12.8 tifffile xmltodict>=0.12.0 -#pyimzml>=1.3.0 pyimzml>=1.2.6 airflow-multi-dagrun>=1.2 jsonschema==3.2.0 From de4f1bd06d02ab2f8f1235d0903642cdf3ddc70a Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Wed, 20 Jan 2021 16:22:23 -0500 Subject: [PATCH 31/63] set ingest-validation-tools to master HEAD, to separate those updates from this. --- src/ingest-pipeline/submodules/ingest-validation-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/submodules/ingest-validation-tools b/src/ingest-pipeline/submodules/ingest-validation-tools index 86a8ecc5..1f5f9e16 160000 --- a/src/ingest-pipeline/submodules/ingest-validation-tools +++ b/src/ingest-pipeline/submodules/ingest-validation-tools @@ -1 +1 @@ -Subproject commit 86a8ecc514f9bd32b792400543dca0a28f06ef1e +Subproject commit 1f5f9e16b40a7138894bec42c4022ff9fe572b6b From 0b8bca5ff1cb9655214aae8f1c6615d1eb58f641 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Sat, 23 Jan 2021 00:59:59 -0500 Subject: [PATCH 32/63] Add path to plugin tests to the call to ingest-validaiton-tools. WIP. 
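For reference while the Submission call keeps growing, a hedged sketch (not code from any patch) of the shape patches 28 and 29 arrive at: only build the ErrorReport, and only write the report file, when get_errors() returns something. The module names and keyword arguments are the ones visible in the diffs; the wrapper function, its parameters, and the assumption that src/ingest-pipeline is on sys.path (so the submodules package resolves) are mine.

    from submodules import (ingest_validation_tools_submission,
                            ingest_validation_tools_error_report)

    def validate_upload_dir(dirpath, ignore_globs,
                            report_path='ingest_validation_tools_report.txt'):
        # Mirrors MetadataTSVMetadataFile.collect_metadata after patch 29.
        submission = ingest_validation_tools_submission.Submission(
            directory_path=dirpath,
            dataset_ignore_globs=ignore_globs,
            submission_ignore_globs='*',
            add_notes=False,
        )
        errors = submission.get_errors()
        if not errors:
            return True
        report = ingest_validation_tools_error_report.ErrorReport(errors)
        with open(report_path, 'w') as f:
            f.write(report.as_text())  # human-readable summary, as the metadata scan writes
        return False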
--- .../md/data_file_types/metadatatsv_metadata_file.py | 6 +++++- src/ingest-pipeline/submodules/__init__.py | 9 +++++++-- src/ingest-pipeline/submodules/ingest-validation-tests | 1 + 3 files changed, 13 insertions(+), 3 deletions(-) create mode 160000 src/ingest-pipeline/submodules/ingest-validation-tests diff --git a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py index 477c99f5..21aeee9f 100755 --- a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py @@ -5,7 +5,9 @@ from pathlib import Path from .tsv_metadata_file import TSVMetadataFile from type_base import MetadataError -from submodules import ingest_validation_tools_submission, ingest_validation_tools_error_report +from submodules import (ingest_validation_tools_submission, + ingest_validation_tools_error_report, + ingest_validation_tests) class MetadataTSVMetadataFile(TSVMetadataFile): """ @@ -17,12 +19,14 @@ def collect_metadata(self): print('validating {} as metadata.tsv'.format(self.path)) dirpath = Path(os.path.dirname(self.path)) ignore_globs = [os.path.basename(self.path), 'extras'] + plugin_path = [path for path in ingest_validation_tests.__path__][0] # # Uncomment offline=True below to avoid validating orcid_id URLs &etc # submission = ingest_validation_tools_submission.Submission(directory_path=dirpath, dataset_ignore_globs=ignore_globs, submission_ignore_globs='*', + plugin_path=plugin_path, #offline=True, add_notes=False ) diff --git a/src/ingest-pipeline/submodules/__init__.py b/src/ingest-pipeline/submodules/__init__.py index 18f3d683..53d546c3 100644 --- a/src/ingest-pipeline/submodules/__init__.py +++ b/src/ingest-pipeline/submodules/__init__.py @@ -4,11 +4,16 @@ sys.path.append(os.path.join(os.path.dirname(__file__), 'ingest-validation-tools', 'src')) +sys.path.append(os.path.join(os.path.dirname(__file__), + 'ingest-validation-tests', 'src')) ingest_validation_tools_submission = import_module('ingest_validation_tools.submission') ingest_validation_tools_error_report = import_module('ingest_validation_tools.error_report') ingest_validation_tools_validation_utils = import_module('ingest_validation_tools.validation_utils') -__all__ = ["ingest_validation_tools_validation_utils", "ingest_validation_tools_submission", - "ingest_validation_tools_error_report"] +ingest_validation_tests = import_module('ingest_validation_tests') +__all__ = ["ingest_validation_tools_validation_utils", + "ingest_validation_tools_submission", + "ingest_validation_tools_error_report", + "ingest_validation_tests"] diff --git a/src/ingest-pipeline/submodules/ingest-validation-tests b/src/ingest-pipeline/submodules/ingest-validation-tests new file mode 160000 index 00000000..f8222a57 --- /dev/null +++ b/src/ingest-pipeline/submodules/ingest-validation-tests @@ -0,0 +1 @@ +Subproject commit f8222a57313f5aff9a731694de3d0d1b3f3440f8 From 07b7cdbd20ccbd3684cb70f7069ada5870d7aaf8 Mon Sep 17 00:00:00 2001 From: Hubmap Hive Date: Tue, 26 Jan 2021 16:53:06 -0500 Subject: [PATCH 33/63] Fix named argument typo in metadatatsv_metadata_file --- .../md/data_file_types/metadatatsv_metadata_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py index 21aeee9f..1a46d10d 100755 --- 
a/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py +++ b/src/ingest-pipeline/md/data_file_types/metadatatsv_metadata_file.py @@ -26,7 +26,7 @@ def collect_metadata(self): submission = ingest_validation_tools_submission.Submission(directory_path=dirpath, dataset_ignore_globs=ignore_globs, submission_ignore_globs='*', - plugin_path=plugin_path, + plugin_directory=plugin_path, #offline=True, add_notes=False ) From dd4f8aebb1ceb008cfa4fc51992c8c17893928f6 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Wed, 27 Jan 2021 17:48:42 -0500 Subject: [PATCH 34/63] Added a schema for validation test input JSON --- .../schemata/validation_test_schema.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 src/ingest-pipeline/schemata/validation_test_schema.yml diff --git a/src/ingest-pipeline/schemata/validation_test_schema.yml b/src/ingest-pipeline/schemata/validation_test_schema.yml new file mode 100644 index 00000000..9bd25e06 --- /dev/null +++ b/src/ingest-pipeline/schemata/validation_test_schema.yml @@ -0,0 +1,17 @@ +'$schema': 'http://json-schema.org/schema#' +'$id': 'http://schemata.hubmapconsortium.org/validation_test_schema.yml' +'title': 'validation_test metadata schema' +'description': 'validation_test metadata schema' + +'allOf': [{'$ref': '#/definitions/validation_test_metadata'}] + +'definitions': + + 'validation_test_metadata': + 'type': 'object' + 'properties': + 'uuid': + 'type': 'string' + 'description': 'a dataset uuid or DOI' + 'required': ['uuid'] + From 22417c00dfa78d19fd3bddf2a6045bd0e3ea46f5 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Wed, 27 Jan 2021 17:51:47 -0500 Subject: [PATCH 35/63] Bump ingest-validation-tools submodule to current master HEAD, 9ceef0e --- src/ingest-pipeline/submodules/ingest-validation-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/submodules/ingest-validation-tools b/src/ingest-pipeline/submodules/ingest-validation-tools index 1f5f9e16..9ceef0e4 160000 --- a/src/ingest-pipeline/submodules/ingest-validation-tools +++ b/src/ingest-pipeline/submodules/ingest-validation-tools @@ -1 +1 @@ -Subproject commit 1f5f9e16b40a7138894bec42c4022ff9fe572b6b +Subproject commit 9ceef0e4f914806bdd3e163d278151746bbd8d93 From 83e9a7edca9c8660d9389e0bdda9d2ee4780fa4a Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Thu, 28 Jan 2021 00:42:24 -0500 Subject: [PATCH 36/63] First pass validation test dag. WIP. 
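The new DAG asserts its dag_run conf against validation_test_schema.yml from patch 34. A standalone illustration (not part of any patch) of what an accepted payload looks like: the schema dict restates the YAML's core constraint in Python, jsonschema is already pinned in requirements.txt, and the uuid value is a placeholder.

    import jsonschema

    validation_test_schema = {
        'type': 'object',
        'properties': {
            'uuid': {'type': 'string',
                     'description': 'a dataset uuid or DOI'},
        },
        'required': ['uuid'],
    }

    jsonschema.validate({'uuid': '0123456789abcdef0123456789abcdef'}, validation_test_schema)  # passes
    # jsonschema.validate({}, validation_test_schema)  # ValidationError: 'uuid' is a required property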
--- .../airflow/dags/validation_test.py | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 src/ingest-pipeline/airflow/dags/validation_test.py diff --git a/src/ingest-pipeline/airflow/dags/validation_test.py b/src/ingest-pipeline/airflow/dags/validation_test.py new file mode 100644 index 00000000..b5912220 --- /dev/null +++ b/src/ingest-pipeline/airflow/dags/validation_test.py @@ -0,0 +1,135 @@ +import sys +import os +import yaml +import json +import ast +from pathlib import Path +from pprint import pprint +from datetime import datetime, timedelta + +from airflow import DAG +from airflow.configuration import conf as airflow_conf +from airflow.operators.bash_operator import BashOperator +from airflow.operators.python_operator import PythonOperator +from airflow.operators.python_operator import BranchPythonOperator +from airflow.operators.dummy_operator import DummyOperator +from airflow.operators.dagrun_operator import TriggerDagRunOperator, DagRunOrder +from airflow.operators.multi_dagrun import TriggerMultiDagRunOperator +from airflow.hooks.http_hook import HttpHook + +from hubmap_operators.flex_multi_dag_run import FlexMultiDagRunOperator +import utils + +from utils import localized_assert_json_matches_schema as assert_json_matches_schema + + +def get_src_path(**kwargs): + rslt = airflow_conf.as_dict()['connections']['SRC_PATH'] + return rslt.strip("'").strip('"') + + +# Following are defaults which can be overridden later on +default_args = { + 'owner': 'hubmap', + 'depends_on_past': False, + 'start_date': datetime(2019, 1, 1), + 'email': ['joel.welling@gmail.com'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(minutes=1), + 'xcom_push': True, + 'queue': utils.map_queue_name('general') +} + + +with DAG('validation_test', + schedule_interval=None, + is_paused_upon_creation=False, + default_args=default_args, + user_defined_macros={'get_src_path' : get_src_path} + ) as dag: + + def find_uuid(**kwargs): + try: + assert_json_matches_schema(kwargs['dag_run'].conf, + 'validation_test_schema.yml') + except AssertionError as e: + print('invalid metadata follows:') + pprint(kwargs['dag_run'].conf) + raise + + uuid = kwargs['dag_run'].conf['uuid'] + my_callable = lambda **kwargs: uuid + rslt=utils.pythonop_get_dataset_state(dataset_uuid_callable=my_callable, + http_conn_id='ingest_api_connection', + **kwargs) + if not rslt: + raise AirflowException(f'Invalid uuid/doi for group: {uuid}') + print('rslt:') + pprint(rslt) + assert 'dataset' in rslt, f"Status for {uuid} has no dataset entry" + ds_rslt = rslt['dataset'] + + for key in ['status', 'uuid', 'data_types', 'local_directory_full_path']: + assert key in ds_rslt, f"Dataset status for {uuid} has no {key}" + + if not ds_rslt['status'] in ['New', 'Invalid']: + raise AirflowException(f'Dataset {uuid} is not New or Invalid') + + dt = ds_rslt['data_types'] + if isinstance(dt, str) and dt.startswith('[') and dt.endswith(']'): + dt = ast.literal_eval(dt) + print(f'parsed dt: {dt}') + if isinstance(dt, list): + if dt: + if len(dt) == 1: + filtered_data_types = [dt[0]] + else: + filtered_data_types = [tuple(dt)] + else: + raise AirflowException(f'Dataset data_types for {uuid} is empty') + else: + filtered_data_types = [dt] + + lz_path = ds_rslt['local_directory_full_path'] + uuid = ds_rslt['uuid'] # in case the original 'uuid' was actually a DOI + print(f'Finished uuid {uuid}') + print(f'filtered data types: {filtered_data_types}') + print(f'lz path: {lz_path}') + 
kwargs['ti'].xcom_push(key='assay_type', value=filtered_data_types) + kwargs['ti'].xcom_push(key='lz_path', value=lz_path) + kwargs['ti'].xcom_push(key='uuid', value=uuid) + + t_find_uuid = PythonOperator( + task_id='find_uuid', + python_callable=find_uuid, + provide_context=True, + op_kwargs={ + 'crypt_auth_tok' : utils.encrypt_tok(airflow_conf.as_dict() + ['connections']['APP_CLIENT_SECRET']).decode(), + } + ) + + + t_run_md_extract = BashOperator( + task_id='run_md_extract', + bash_command=""" \ + lz_dir="{{ti.xcom_pull(task_ids='find_uuid', key='lz_path')}}" \ + src_dir="{{get_src_path()}}/md" ; \ + top_dir="{{get_src_path()}}" ; \ + cd "$lz_dir" ; \ + env PYTHONPATH=${PYTHONPATH}:$top_dir \ + python $src_dir/metadata_extract.py --out /dev/null "$lz_dir" \ + > session.log 2> error.log ; \ + echo $? ; \ + if [ -s error.log ] ; \ + then echo 'ERROR!' `cat error.log` >> session.log ; \ + else rm error.log ; \ + fi + """ + ) + + + (dag >> t_find_uuid >> t_run_md_extract) + From 121674efe84da6606acc5c0cf7c3b6f345ad90c1 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Thu, 28 Jan 2021 16:14:49 -0500 Subject: [PATCH 37/63] General: Update app.cfg.example with the needed vars --- src/ingest-pipeline/instance/app.cfg.example | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ingest-pipeline/instance/app.cfg.example b/src/ingest-pipeline/instance/app.cfg.example index bb8bde89..6927bca3 100644 --- a/src/ingest-pipeline/instance/app.cfg.example +++ b/src/ingest-pipeline/instance/app.cfg.example @@ -34,3 +34,8 @@ OUTPUT_GROUP_NAME = 'dataaccessgroup' # Optional template for use in customizing queue names, for better Celery sharing #QUEUE_NAME_TEMPLATE = '{}-test' +[globus] +app_client_id = +app_client_secret = +oauth_callback_route = /oauth2callback + From e89c55e1e59d485fc114423e4a8a76bd5cdfc3f8 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Thu, 28 Jan 2021 19:27:53 -0500 Subject: [PATCH 38/63] Airflow was being included by regenerate_venv; moved to requirements.txt and added version constraint --- src/ingest-pipeline/misc/tools/regenerate_venv.sh | 1 - src/ingest-pipeline/requirements.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/misc/tools/regenerate_venv.sh b/src/ingest-pipeline/misc/tools/regenerate_venv.sh index ff11326d..b2e3bca5 100644 --- a/src/ingest-pipeline/misc/tools/regenerate_venv.sh +++ b/src/ingest-pipeline/misc/tools/regenerate_venv.sh @@ -38,7 +38,6 @@ scl enable rh-python36 bash <=1.1.1 tifffile xmltodict>=0.12.0 pyimzml>=1.2.6 +apache-airflow[celery,crypto,postgres,redis,ssh]<2.0.0 airflow-multi-dagrun>=1.2 jsonschema==3.2.0 fastjsonschema==2.14.2 From d6acb09cf1d54faf7ce76fb45489d7555d9cbac9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 29 Jan 2021 13:37:43 -0500 Subject: [PATCH 39/63] Update portal-containers --- src/ingest-pipeline/airflow/dags/cwl/portal-containers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/airflow/dags/cwl/portal-containers b/src/ingest-pipeline/airflow/dags/cwl/portal-containers index 6a89ea66..f6190efc 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/portal-containers +++ b/src/ingest-pipeline/airflow/dags/cwl/portal-containers @@ -1 +1 @@ -Subproject commit 6a89ea661169f8210ea785ed9edefac0857e0475 +Subproject commit f6190efc24b11543da13e4a6f20db43bb16d69be From baa9d165b847e5002cdca0f849135999877c3624 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 29 Jan 2021 13:38:56 -0500 Subject: [PATCH 40/63] Swap out sprm-to-json for anndata. 
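Before the CODEX changes, a standalone sketch (not part of any patch) of the data_types normalization that find_uuid performs in the validation_test DAG above: the ingest API can hand back either a real list or a stringified list, and a multi-assay dataset is collapsed to a single tuple key.

    import ast

    def normalize_data_types(dt):
        # The ingest API sometimes returns data_types as a stringified list.
        if isinstance(dt, str) and dt.startswith('[') and dt.endswith(']'):
            dt = ast.literal_eval(dt)
        if isinstance(dt, list):
            if not dt:
                raise ValueError('data_types is empty')  # the DAG raises AirflowException here
            # Single assay stays a plain string; multi-assay becomes one tuple key.
            return [dt[0]] if len(dt) == 1 else [tuple(dt)]
        return [dt]

    print(normalize_data_types("['codex']"))                 # ['codex']
    print(normalize_data_types("['SNAREseq', 'ATACseq']"))   # [('SNAREseq', 'ATACseq')]
    print(normalize_data_types('codex'))                     # ['codex']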
--- src/ingest-pipeline/airflow/dags/codex_cytokit.py | 2 +- src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq | 1 + src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) create mode 160000 src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq create mode 160000 src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq diff --git a/src/ingest-pipeline/airflow/dags/codex_cytokit.py b/src/ingest-pipeline/airflow/dags/codex_cytokit.py index 49b30e47..acf86eb5 100644 --- a/src/ingest-pipeline/airflow/dags/codex_cytokit.py +++ b/src/ingest-pipeline/airflow/dags/codex_cytokit.py @@ -54,7 +54,7 @@ cwl_workflows = get_absolute_workflows( Path(pipeline_name, 'pipeline.cwl'), Path('portal-containers', 'ome-tiff-offsets.cwl'), - Path('portal-containers', 'sprm-to-json.cwl'), + Path('portal-containers', 'sprm-to-anndata.cwl'), ) def build_dataset_name(**kwargs): diff --git a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq new file mode 160000 index 00000000..cbfa5a74 --- /dev/null +++ b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq @@ -0,0 +1 @@ +Subproject commit cbfa5a745d542a78246a492de037a1a13df4e3a6 diff --git a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq new file mode 160000 index 00000000..536d6ed4 --- /dev/null +++ b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq @@ -0,0 +1 @@ +Subproject commit 536d6ed4bc639b93a8d8bc1c1a694b848d4d6f32 From 80d9b9c9271013a9702c171966436a73a20dc1aa Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Mon, 1 Feb 2021 11:20:25 -0500 Subject: [PATCH 41/63] General: Set the appropriate callback route. --- src/ingest-pipeline/instance/app.cfg.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/instance/app.cfg.example b/src/ingest-pipeline/instance/app.cfg.example index 6927bca3..d29e8c55 100644 --- a/src/ingest-pipeline/instance/app.cfg.example +++ b/src/ingest-pipeline/instance/app.cfg.example @@ -37,5 +37,5 @@ OUTPUT_GROUP_NAME = 'dataaccessgroup' [globus] app_client_id = app_client_secret = -oauth_callback_route = /oauth2callback +oauth_callback_route = /login From bbc335b438e8393df90e6f3c9ca6650758150722 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 1 Feb 2021 15:56:40 -0500 Subject: [PATCH 42/63] Add json back. 
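Alongside the CODEX work, patches 37 and 41 add a [globus] block to app.cfg.example. A quick standalone check (not part of any patch) that a deployed app.cfg carries the new keys: the file is INI-style, so configparser reads it directly; the path below is an assumption about where the instance directory sits in a checkout.

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read('src/ingest-pipeline/instance/app.cfg')  # adjust to your local instance dir
    for key in ('app_client_id', 'app_client_secret', 'oauth_callback_route'):
        print(key, '=', cfg.get('globus', key, fallback='<missing>'))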
--- src/ingest-pipeline/airflow/dags/codex_cytokit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ingest-pipeline/airflow/dags/codex_cytokit.py b/src/ingest-pipeline/airflow/dags/codex_cytokit.py index acf86eb5..93c639af 100644 --- a/src/ingest-pipeline/airflow/dags/codex_cytokit.py +++ b/src/ingest-pipeline/airflow/dags/codex_cytokit.py @@ -55,6 +55,7 @@ Path(pipeline_name, 'pipeline.cwl'), Path('portal-containers', 'ome-tiff-offsets.cwl'), Path('portal-containers', 'sprm-to-anndata.cwl'), + Path('portal-containers', 'sprm-to-json.cwl'), ) def build_dataset_name(**kwargs): From 9772a85ca1c483bab973d8ad403034cbdf3b2718 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Mon, 1 Feb 2021 16:32:03 -0500 Subject: [PATCH 43/63] General: Add scheme variable so as to not break localhost deployments --- .../airflow/plugins/globus_auth/globus_auth.py | 4 ++-- src/ingest-pipeline/instance/app.cfg.example | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py b/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py index a7363ec4..3fbead02 100644 --- a/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py +++ b/src/ingest-pipeline/airflow/plugins/globus_auth/globus_auth.py @@ -97,7 +97,7 @@ def init_app(self, flask_app): def login(self, session=None): log.debug('Redirecting user to Globus login') - redirect_url = url_for('login', _external=True) + redirect_url = url_for('login', _external=True, _scheme=get_config_param('scheme')) self.globus_oauth.oauth2_start_flow(redirect_url) @@ -126,7 +126,7 @@ def login(self, session=None): login_user(GlobusUser(user)) session.commit() - next_url = request.args.get('state') or url_for('admin.index') + next_url = url_for('admin.index') return redirect(next_url) except Exception as e: log.error(e) diff --git a/src/ingest-pipeline/instance/app.cfg.example b/src/ingest-pipeline/instance/app.cfg.example index d29e8c55..2a2168b5 100644 --- a/src/ingest-pipeline/instance/app.cfg.example +++ b/src/ingest-pipeline/instance/app.cfg.example @@ -38,4 +38,6 @@ OUTPUT_GROUP_NAME = 'dataaccessgroup' app_client_id = app_client_secret = oauth_callback_route = /login +# If you are running localhost, set this to http. Otherwise, set this to https. +scheme = https From 80ec4dc25996a6c0af06af5ccc94d140deb7278e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 1 Feb 2021 20:00:18 -0500 Subject: [PATCH 44/63] Add steps. 
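"Add steps." is terse, so a hedged sketch of the bookkeeping behind the diff below: get_absolute_workflows returns an ordered list and each build_cwltool_cmdN in codex_cytokit.py selects its CWL file by position, which is why patch 42 and this one shuffle the sprm entries. The stage 1 and 4 indices are explicit in the diffs; the stage 2 and 3 indices are inferred, not shown here.

    from pathlib import Path

    # Assumed mapping of pipeline stages to cwl_workflows entries once this patch
    # lands; the first entry is Path(pipeline_name, 'pipeline.cwl') in the DAG.
    CWL_BY_STAGE = {
        1: Path('pipeline.cwl'),                               # cwl_workflows[0], main pipeline
        2: Path('portal-containers', 'ome-tiff-offsets.cwl'),  # cwl_workflows[1] (inferred)
        3: Path('portal-containers', 'sprm-to-json.cwl'),      # cwl_workflows[2] (inferred)
        4: Path('portal-containers', 'sprm-to-anndata.cwl'),   # cwl_workflows[3], new stage 4,
                                                               # run on cwl_out/sprm_outputs
    }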
--- .../airflow/dags/codex_cytokit.py | 57 ++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/codex_cytokit.py b/src/ingest-pipeline/airflow/dags/codex_cytokit.py index 93c639af..8e1867e5 100644 --- a/src/ingest-pipeline/airflow/dags/codex_cytokit.py +++ b/src/ingest-pipeline/airflow/dags/codex_cytokit.py @@ -54,8 +54,8 @@ cwl_workflows = get_absolute_workflows( Path(pipeline_name, 'pipeline.cwl'), Path('portal-containers', 'ome-tiff-offsets.cwl'), - Path('portal-containers', 'sprm-to-anndata.cwl'), Path('portal-containers', 'sprm-to-json.cwl'), + Path('portal-containers', 'sprm-to-anndata.cwl'), ) def build_dataset_name(**kwargs): @@ -215,10 +215,60 @@ def build_cwltool_cmd3(**kwargs): task_id='maybe_keep_cwl3', python_callable=utils.pythonop_maybe_keep, provide_context=True, - op_kwargs = {'next_op' : 'move_data', + op_kwargs = {'next_op' : 'prepare_cwl4', 'bail_op' : 'set_dataset_error', 'test_op' : 'pipeline_exec_cwl3'} ) + + prepare_cwl4 = DummyOperator( + task_id='prepare_cwl4' + ) + + def build_cwltool_cmd4(**kwargs): + ctx = kwargs['dag_run'].conf + run_id = kwargs['run_id'] + tmpdir = utils.get_tmp_dir_path(run_id) + print('tmpdir: ', tmpdir) + parent_data_dir = ctx['parent_lz_path'] + print('parent_data_dir: ', parent_data_dir) + data_dir = tmpdir / 'cwl_out' # This stage reads input from stage 1 + print('data_dir: ', data_dir) + + command = [ + *get_cwltool_base_cmd(tmpdir), + cwl_workflows[3], + '--input_dir', + data_dir / 'sprm_outputs', + ] + + return join_quote_command_str(command) + + + t_build_cmd4 = PythonOperator( + task_id='build_cmd4', + python_callable=build_cwltool_cmd3, + provide_context=True, + ) + + + t_pipeline_exec_cwl4 = BashOperator( + task_id='pipeline_exec_cwl4', + bash_command=""" \ + tmp_dir={{tmp_dir_path(run_id)}} ; \ + cd ${tmp_dir}/cwl_out ; \ + {{ti.xcom_pull(task_ids='build_cmd4')}} >> ${tmp_dir}/session.log 2>&1 ; \ + echo $? 
+ """ + ) + + t_maybe_keep_cwl4 = BranchPythonOperator( + task_id='maybe_keep_cwl4', + python_callable=utils.pythonop_maybe_keep, + provide_context=True, + op_kwargs = {'next_op' : 'move_data', + 'bail_op' : 'set_dataset_error', + 'test_op' : 'pipeline_exec_cwl4'} + ) t_send_create_dataset = PythonOperator( @@ -266,6 +316,7 @@ def build_cwltool_cmd3(**kwargs): 'pipeline_exec_cwl1', 'pipeline_exec_cwl2', 'pipeline_exec_cwl3', + 'pipeline_exec_cwl4', 'move_data', ], cwl_workflows=cwl_workflows, @@ -288,10 +339,12 @@ def build_cwltool_cmd3(**kwargs): >> prepare_cwl1 >> t_build_cmd1 >> t_pipeline_exec_cwl1 >> t_maybe_keep_cwl1 >> prepare_cwl2 >> t_build_cmd2 >> t_pipeline_exec_cwl2 >> t_maybe_keep_cwl2 >> prepare_cwl3 >> t_build_cmd3 >> t_pipeline_exec_cwl3 >> t_maybe_keep_cwl3 + >> prepare_cwl4 >> t_build_cmd4 >> t_pipeline_exec_cwl4 >> t_maybe_keep_cwl4 >> t_move_data >> t_expand_symlinks >> t_send_status >> t_join) t_maybe_keep_cwl1 >> t_set_dataset_error t_maybe_keep_cwl2 >> t_set_dataset_error t_maybe_keep_cwl3 >> t_set_dataset_error + t_maybe_keep_cwl4 >> t_set_dataset_error t_set_dataset_error >> t_join t_join >> t_cleanup_tmpdir From 81f3e676dfa28d5cbbf4f023237744fba827c7f2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 1 Feb 2021 20:02:12 -0500 Subject: [PATCH 45/63] Small fix --- src/ingest-pipeline/airflow/dags/codex_cytokit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/airflow/dags/codex_cytokit.py b/src/ingest-pipeline/airflow/dags/codex_cytokit.py index 8e1867e5..b289709a 100644 --- a/src/ingest-pipeline/airflow/dags/codex_cytokit.py +++ b/src/ingest-pipeline/airflow/dags/codex_cytokit.py @@ -246,7 +246,7 @@ def build_cwltool_cmd4(**kwargs): t_build_cmd4 = PythonOperator( task_id='build_cmd4', - python_callable=build_cwltool_cmd3, + python_callable=build_cwltool_cmd4, provide_context=True, ) From 0e6a44123aa524e99f259455e811cce15f5800d2 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Tue, 2 Feb 2021 09:20:22 -0500 Subject: [PATCH 46/63] General: Add the webserver configuration options. --- src/ingest-pipeline/instance/app.cfg.example | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ingest-pipeline/instance/app.cfg.example b/src/ingest-pipeline/instance/app.cfg.example index 2a2168b5..0b5ff258 100644 --- a/src/ingest-pipeline/instance/app.cfg.example +++ b/src/ingest-pipeline/instance/app.cfg.example @@ -41,3 +41,6 @@ oauth_callback_route = /login # If you are running localhost, set this to http. Otherwise, set this to https. 
scheme = https +[webserver] +authenticate = True +auth_backend = globus_auth.globus_auth From e127e8cd2c35cb338f68875d7d991d73e9796485 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Tue, 2 Feb 2021 14:00:31 -0500 Subject: [PATCH 47/63] Config: Remove the authenticate var from app.cfg and remove other configs from airflow.cfg --- docker/ingest-pipeline/config/airflow.cfg | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/docker/ingest-pipeline/config/airflow.cfg b/docker/ingest-pipeline/config/airflow.cfg index 13fd694e..78dc8507 100644 --- a/docker/ingest-pipeline/config/airflow.cfg +++ b/docker/ingest-pipeline/config/airflow.cfg @@ -354,7 +354,7 @@ expose_stacktrace = True # https://airflow.apache.org/security.html#web-authentication authenticate = True -auth_backend = globus_auth.globus_auth +#auth_backend = # Filter the list of dags by owner name (requires authentication to be enabled) filter_by_owner = False @@ -1054,9 +1054,4 @@ fs_group = # The Key-value pairs to be given to worker pods. # The worker pods will be given these static labels, as well as some additional dynamic labels # to identify the task. -# Should be supplied in the format: ``key = value`` - -[globus] -app_client_id = -app_client_secret = -oauth_callback_route = /oauth2callback \ No newline at end of file +# Should be supplied in the format: ``key = value`` \ No newline at end of file From 55332ff190b90604b9ce70404706b2efbc589c7c Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Tue, 2 Feb 2021 14:00:37 -0500 Subject: [PATCH 48/63] Config: Remove the authenticate var from app.cfg and remove other configs from airflow.cfg --- src/ingest-pipeline/instance/app.cfg.example | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ingest-pipeline/instance/app.cfg.example b/src/ingest-pipeline/instance/app.cfg.example index 0b5ff258..a790e061 100644 --- a/src/ingest-pipeline/instance/app.cfg.example +++ b/src/ingest-pipeline/instance/app.cfg.example @@ -42,5 +42,4 @@ oauth_callback_route = /login scheme = https [webserver] -authenticate = True auth_backend = globus_auth.globus_auth From 8beb5fb521ef102ead6d6b12327c1c443ac9a9c0 Mon Sep 17 00:00:00 2001 From: Juan Puerto Date: Tue, 2 Feb 2021 14:27:36 -0500 Subject: [PATCH 49/63] Fix submodule --- src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims | 2 +- src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims b/src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims index bed64eb3..5224e3ab 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims +++ b/src/ingest-pipeline/airflow/dags/cwl/ome-tiff-pyramid-ims @@ -1 +1 @@ -Subproject commit bed64eb34f697c04cddc8ee085871d79523932ff +Subproject commit 5224e3ab90a5de2666764e1061acb2df2c590ec5 diff --git a/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline b/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline index 53415520..8471d6e6 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline +++ b/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline @@ -1 +1 @@ -Subproject commit 53415520f94ac0d9a1fae89af0f6e8250240723a +Subproject commit 8471d6e68c990d84106527d36f74ead3ab9c09b3 From 45ad5c04ad9b50f6ba66f2562afa426e3b920644 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Tue, 2 Feb 2021 18:33:15 -0500 Subject: [PATCH 50/63] Cleaning up submodule problems arising from earlier merge, step 1 --- .gitmodules | 8 ++++++++ 1 file 
changed, 8 insertions(+) diff --git a/.gitmodules b/.gitmodules index 534aa6fa..dbd276ca 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,6 +10,14 @@ [submodule "src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq"] path = src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq url = git@github.com:hubmapconsortium/salmon-rnaseq.git + +[submodule "src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq"] + path = src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq + url = git@github.com:hubmapconsortium/salmon-rnaseq.git + +[submodule "src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq"] + path = src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq + url = git@github.com:hubmapconsortium/salmon-rnaseq.git [submodule "src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline"] path = src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline url = git@github.com:hubmapconsortium/sc-atac-seq-pipeline.git From 42d3b60b7e00181e942e8fac3e57f0ff3101c818 Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Tue, 2 Feb 2021 18:45:19 -0500 Subject: [PATCH 51/63] Cleaning up submodule problems arising from earlier merge, step 2 --- .gitmodules | 8 -------- src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq | 1 - .../airflow/dags/cwl/salmon-rnaseq-snareseq | 1 - 3 files changed, 10 deletions(-) delete mode 160000 src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq delete mode 160000 src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq diff --git a/.gitmodules b/.gitmodules index dbd276ca..534aa6fa 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,14 +10,6 @@ [submodule "src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq"] path = src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq url = git@github.com:hubmapconsortium/salmon-rnaseq.git - -[submodule "src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq"] - path = src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq - url = git@github.com:hubmapconsortium/salmon-rnaseq.git - -[submodule "src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq"] - path = src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq - url = git@github.com:hubmapconsortium/salmon-rnaseq.git [submodule "src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline"] path = src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline url = git@github.com:hubmapconsortium/sc-atac-seq-pipeline.git diff --git a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq deleted file mode 160000 index cbfa5a74..00000000 --- a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-sciseq +++ /dev/null @@ -1 +0,0 @@ -Subproject commit cbfa5a745d542a78246a492de037a1a13df4e3a6 diff --git a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq deleted file mode 160000 index 536d6ed4..00000000 --- a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq-snareseq +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 536d6ed4bc639b93a8d8bc1c1a694b848d4d6f32 From 06c9b1ff3fe7c9b22e1fdc44036cbe13951fedfc Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Tue, 26 Jan 2021 14:53:43 -0500 Subject: [PATCH 52/63] Add consolidated salmon_rnaseq.py, remove (non-bulk) others Bulk processing is different enough to leave as its own DAG. 
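The consolidation below relies on a DAG-factory pattern: one generate_salmon_rnaseq_dag function plus a table of (dag_id, pipeline name, assay, dataset type), with each result assigned into globals() so that Airflow's DagBag, which collects DAG objects from module-level globals, still sees four separate DAGs. A stripped-down standalone sketch (not part of the patch) of that pattern:

    from datetime import datetime

    from airflow import DAG
    from airflow.operators.dummy_operator import DummyOperator

    def make_dag(dag_id: str, assay: str) -> DAG:
        # Stand-in for generate_salmon_rnaseq_dag: the real factory builds the full
        # task graph; a single placeholder task is enough to show the shape.
        with DAG(dag_id, schedule_interval=None, start_date=datetime(2019, 1, 1)) as dag:
            DummyOperator(task_id=f'noop_{assay}')
        return dag

    for dag_id, assay in [('salmon_rnaseq_10x', '10x'), ('salmon_rnaseq_sciseq', 'sciseq')]:
        globals()[dag_id] = make_dag(dag_id, assay)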
--- .../airflow/dags/salmon_rnaseq.py | 294 ++++++++++++++++++ .../airflow/dags/salmon_rnaseq_10x.py | 242 -------------- .../airflow/dags/salmon_rnaseq_sciseq.py | 243 --------------- .../airflow/dags/salmon_rnaseq_snareseq.py | 235 -------------- 4 files changed, 294 insertions(+), 720 deletions(-) create mode 100644 src/ingest-pipeline/airflow/dags/salmon_rnaseq.py delete mode 100644 src/ingest-pipeline/airflow/dags/salmon_rnaseq_10x.py delete mode 100644 src/ingest-pipeline/airflow/dags/salmon_rnaseq_sciseq.py delete mode 100644 src/ingest-pipeline/airflow/dags/salmon_rnaseq_snareseq.py diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py new file mode 100644 index 00000000..7e1ca6ba --- /dev/null +++ b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py @@ -0,0 +1,294 @@ +from pathlib import Path +from datetime import datetime, timedelta +from typing import List, Tuple + +from airflow import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.operators.python_operator import PythonOperator +from airflow.operators.python_operator import BranchPythonOperator +from airflow.operators.dummy_operator import DummyOperator +from hubmap_operators.common_operators import ( + LogInfoOperator, + JoinOperator, + CreateTmpDirOperator, + CleanupTmpDirOperator, + SetDatasetProcessingOperator, + MoveDataOperator, +) + +import utils +from utils import ( + get_absolute_workflows, + get_cwltool_base_cmd, + get_dataset_uuid, + get_parent_dataset_uuid, + get_uuid_for_error, + join_quote_command_str, + make_send_status_msg_function, +) + +# to be used by the CWL worker +THREADS = 6 + + +def generate_salmon_rnaseq_dag( + dag_id: str, + pipeline_name: str, + assay: str, + dataset_type: str, +) -> DAG: + default_args = { + "owner": "hubmap", + "depends_on_past": False, + "start_date": datetime(2019, 1, 1), + "email": ["joel.welling@gmail.com"], + "email_on_failure": False, + "email_on_retry": False, + "retries": 1, + "retry_delay": timedelta(minutes=1), + "xcom_push": True, + "queue": utils.map_queue_name("general"), + "on_failure_callback": utils.create_dataset_state_error_callback(get_uuid_for_error), + } + + with DAG( + dag_id, + schedule_interval=None, + is_paused_upon_creation=False, + default_args=default_args, + max_active_runs=4, + user_defined_macros={"tmp_dir_path": utils.get_tmp_dir_path}, + ) as dag: + + cwl_workflows = get_absolute_workflows( + Path("salmon-rnaseq", "pipeline.cwl"), + Path("portal-containers", "h5ad-to-arrow.cwl"), + ) + + def build_dataset_name(**kwargs): + id_l = kwargs["dag_run"].conf["parent_submission_id"] + inner_str = id_l if isinstance(id_l, str) else "_".join(id_l) + return f"{dag.dag_id}__{inner_str}__{pipeline_name}" + + prepare_cwl1 = DummyOperator(task_id="prepare_cwl1") + + prepare_cwl2 = DummyOperator(task_id="prepare_cwl2") + + def build_cwltool_cmd1(**kwargs): + ctx = kwargs["dag_run"].conf + run_id = kwargs["run_id"] + tmpdir = utils.get_tmp_dir_path(run_id) + print("tmpdir: ", tmpdir) + + data_dirs = ctx["parent_lz_path"] + data_dirs = [data_dirs] if isinstance(data_dirs, str) else data_dirs + print("data_dirs: ", data_dirs) + + command = [ + *get_cwltool_base_cmd(tmpdir), + "--relax-path-checks", + "--debug", + "--outdir", + tmpdir / "cwl_out", + "--parallel", + cwl_workflows[0], + "--assay", + assay, + "--threads", + THREADS, + ] + for data_dir in data_dirs: + command.append("--fastq_dir") + command.append(data_dir) + + return join_quote_command_str(command) + + def 
build_cwltool_cmd2(**kwargs): + ctx = kwargs["dag_run"].conf + run_id = kwargs["run_id"] + tmpdir = utils.get_tmp_dir_path(run_id) + print("tmpdir: ", tmpdir) + data_dir = ctx["parent_lz_path"] + print("data_dir: ", data_dir) + + command = [ + *get_cwltool_base_cmd(tmpdir), + cwl_workflows[1], + "--input_dir", + ".", + ] + + return join_quote_command_str(command) + + t_build_cmd1 = PythonOperator( + task_id="build_cmd1", + python_callable=build_cwltool_cmd1, + provide_context=True, + ) + + t_build_cmd2 = PythonOperator( + task_id="build_cmd2", + python_callable=build_cwltool_cmd2, + provide_context=True, + ) + + t_pipeline_exec = BashOperator( + task_id="pipeline_exec", + bash_command=""" \ + tmp_dir={{tmp_dir_path(run_id)}} ; \ + {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \ + echo $? + """, + ) + + t_make_arrow1 = BashOperator( + task_id="make_arrow1", + bash_command=""" \ + tmp_dir={{tmp_dir_path(run_id)}} ; \ + ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ + cd "$tmp_dir"/cwl_out/cluster-marker-genes ; \ + {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ + echo $? + """, + ) + + t_maybe_keep_cwl1 = BranchPythonOperator( + task_id="maybe_keep_cwl1", + python_callable=utils.pythonop_maybe_keep, + provide_context=True, + op_kwargs={ + "next_op": "move_files", + "bail_op": "set_dataset_error", + "test_op": "pipeline_exec", + }, + ) + + t_maybe_keep_cwl2 = BranchPythonOperator( + task_id="maybe_keep_cwl2", + python_callable=utils.pythonop_maybe_keep, + provide_context=True, + op_kwargs={ + "next_op": "move_data", + "bail_op": "set_dataset_error", + "test_op": "make_arrow1", + }, + ) + + t_send_create_dataset = PythonOperator( + task_id="send_create_dataset", + python_callable=utils.pythonop_send_create_dataset, + provide_context=True, + op_kwargs={ + "parent_dataset_uuid_callable": get_parent_dataset_uuid, + "http_conn_id": "ingest_api_connection", + "endpoint": "/datasets/derived", + "dataset_name_callable": build_dataset_name, + "dataset_types": [dataset_type], + }, + ) + + t_set_dataset_error = PythonOperator( + task_id="set_dataset_error", + python_callable=utils.pythonop_set_dataset_state, + provide_context=True, + trigger_rule="all_done", + op_kwargs={ + "dataset_uuid_callable": get_dataset_uuid, + "http_conn_id": "ingest_api_connection", + "endpoint": "/datasets/status", + "ds_state": "Error", + "message": "An error occurred in {}".format(pipeline_name), + }, + ) + + t_move_files = BashOperator( + task_id="move_files", + bash_command=""" + tmp_dir={{tmp_dir_path(run_id)}} ; \ + cd "$tmp_dir"/cwl_out ; \ + mkdir cluster-marker-genes ; \ + mv cluster_marker_genes.h5ad cluster-marker-genes ; \ + echo $? 
+ """, + ) + + send_status_msg = make_send_status_msg_function( + dag_file=__file__, + retcode_ops=["pipeline_exec", "move_data", "make_arrow1"], + cwl_workflows=cwl_workflows, + ) + t_send_status = PythonOperator( + task_id="send_status_msg", + python_callable=send_status_msg, + provide_context=True, + ) + + t_log_info = LogInfoOperator(task_id="log_info") + t_join = JoinOperator(task_id="join") + t_create_tmpdir = CreateTmpDirOperator(task_id="create_tmpdir") + t_cleanup_tmpdir = CleanupTmpDirOperator(task_id="cleanup_tmpdir") + t_set_dataset_processing = SetDatasetProcessingOperator(task_id="set_dataset_processing") + t_move_data = MoveDataOperator(task_id="move_data") + + ( + dag + >> t_log_info + >> t_create_tmpdir + >> t_send_create_dataset + >> t_set_dataset_processing + >> prepare_cwl1 + >> t_build_cmd1 + >> t_pipeline_exec + >> t_maybe_keep_cwl1 + >> t_move_files + >> prepare_cwl2 + >> t_build_cmd2 + >> t_make_arrow1 + >> t_maybe_keep_cwl2 + >> t_move_data + >> t_send_status + >> t_join + ) + t_maybe_keep_cwl1 >> t_set_dataset_error + t_maybe_keep_cwl2 >> t_set_dataset_error + t_set_dataset_error >> t_join + t_join >> t_cleanup_tmpdir + + return dag + + +# dag_id, pipeline name, assay given to pipeline via --assay, dataset type +salmon_dag_data: List[Tuple[str, str, str, str]] = [ + ( + "salmon_rnaseq_10x", + "salmon-rnaseq", + "10x", + "salmon_rnaseq_10x", + ), + ( + "salmon_rnaseq_sciseq", + "salmon-rnaseq-sciseq", + "sciseq", + "salmon_rnaseq_sciseq", + ), + ( + "salmon_rnaseq_slideseq", + "salmon-rnaseq-slideseq", + "slideseq", + "salmon_rnaseq_slideseq", + ), + ( + "salmon_rnaseq_snareseq", + "salmon-rnaseq-snareseq", + "snareseq", + "salmon_rnaseq_snareseq", + ), +] + +for dag_id, pipeline_name, assay, dataset_type in salmon_dag_data: + globals()[dag_id] = generate_salmon_rnaseq_dag( + dag_id=dag_id, + pipeline_name=pipeline_name, + assay=assay, + dataset_type=dataset_type, + ) diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq_10x.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq_10x.py deleted file mode 100644 index 7fbac44a..00000000 --- a/src/ingest-pipeline/airflow/dags/salmon_rnaseq_10x.py +++ /dev/null @@ -1,242 +0,0 @@ -from pathlib import Path -from datetime import datetime, timedelta - -from airflow import DAG -from airflow.operators.bash_operator import BashOperator -from airflow.operators.python_operator import PythonOperator -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator -from hubmap_operators.common_operators import ( - LogInfoOperator, - JoinOperator, - CreateTmpDirOperator, - CleanupTmpDirOperator, - SetDatasetProcessingOperator, - MoveDataOperator -) - -import utils -from utils import ( - get_absolute_workflows, - get_cwltool_base_cmd, - get_dataset_uuid, - get_parent_dataset_uuid, - get_uuid_for_error, - join_quote_command_str, - make_send_status_msg_function, -) - -THREADS = 6 # to be used by the CWL worker - - -default_args = { - 'owner': 'hubmap', - 'depends_on_past': False, - 'start_date': datetime(2019, 1, 1), - 'email': ['joel.welling@gmail.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - 'retry_delay': timedelta(minutes=1), - 'xcom_push': True, - 'queue': utils.map_queue_name('general'), - 'on_failure_callback': utils.create_dataset_state_error_callback(get_uuid_for_error) -} - - -with DAG('salmon_rnaseq_10x', - schedule_interval=None, - is_paused_upon_creation=False, - default_args=default_args, - max_active_runs=4, - 
user_defined_macros={'tmp_dir_path' : utils.get_tmp_dir_path} - ) as dag: - - pipeline_name = 'salmon-rnaseq' - cwl_workflows = get_absolute_workflows( - Path(pipeline_name, 'pipeline.cwl'), - Path('portal-containers', 'h5ad-to-arrow.cwl'), - ) - - def build_dataset_name(**kwargs): - return '{}__{}__{}'.format(dag.dag_id, - kwargs['dag_run'].conf['parent_submission_id'], - pipeline_name) - -# prepare_cwl1 = PythonOperator( -# python_callable=utils.clone_or_update_pipeline, -# task_id='clone_or_update_cwl1', -# op_kwargs={'pipeline_name': cwl_workflow1} -# ) - -# prepare_cwl2 = PythonOperator( -# python_callable=utils.clone_or_update_pipeline, -# task_id='clone_or_update_cwl2', -# op_kwargs={'pipeline_name': cwl_workflow2} -# ) - - prepare_cwl1 = DummyOperator( - task_id='prepare_cwl1' - ) - - prepare_cwl2 = DummyOperator( - task_id='prepare_cwl2' - ) - - def build_cwltool_cmd1(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - '--debug', - '--outdir', - tmpdir / 'cwl_out', - '--parallel', - cwl_workflows[0], - '--assay', - '10x', - '--fastq_dir', - data_dir, - '--threads', - THREADS, - ] - - return join_quote_command_str(command) - - def build_cwltool_cmd2(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - cwl_workflows[1], - '--input_dir', - '.', - ] - - return join_quote_command_str(command) - - t_build_cmd1 = PythonOperator( - task_id='build_cmd1', - python_callable=build_cwltool_cmd1, - provide_context=True, - ) - - t_build_cmd2 = PythonOperator( - task_id='build_cmd2', - python_callable=build_cwltool_cmd2, - provide_context=True, - ) - - t_pipeline_exec = BashOperator( - task_id='pipeline_exec', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \ - echo $? - """ - ) - - t_make_arrow1 = BashOperator( - task_id='make_arrow1', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ - cd "$tmp_dir"/cwl_out/cluster-marker-genes ; \ - {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ - echo $? 
- """ - ) - - t_maybe_keep_cwl1 = BranchPythonOperator( - task_id='maybe_keep_cwl1', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs = {'next_op' : 'move_files', - 'bail_op' : 'set_dataset_error', - 'test_op' : 'pipeline_exec'} - ) - - t_maybe_keep_cwl2 = BranchPythonOperator( - task_id='maybe_keep_cwl2', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs = {'next_op' : 'move_data', - 'bail_op' : 'set_dataset_error', - 'test_op' : 'make_arrow1'} - ) - - t_send_create_dataset = PythonOperator( - task_id='send_create_dataset', - python_callable=utils.pythonop_send_create_dataset, - provide_context=True, - op_kwargs = {'parent_dataset_uuid_callable' : get_parent_dataset_uuid, - 'http_conn_id' : 'ingest_api_connection', - 'endpoint' : '/datasets/derived', - 'dataset_name_callable' : build_dataset_name, - "dataset_types":["salmon_rnaseq_10x"] - } - ) - - t_set_dataset_error = PythonOperator( - task_id='set_dataset_error', - python_callable=utils.pythonop_set_dataset_state, - provide_context=True, - trigger_rule='all_done', - op_kwargs = {'dataset_uuid_callable' : get_dataset_uuid, - 'http_conn_id' : 'ingest_api_connection', - 'endpoint' : '/datasets/status', - 'ds_state' : 'Error', - 'message' : 'An error occurred in {}'.format(pipeline_name) - } - ) - - t_move_files = BashOperator( - task_id='move_files', - bash_command=""" - tmp_dir={{tmp_dir_path(run_id)}} ; \ - cd "$tmp_dir"/cwl_out ; \ - mkdir cluster-marker-genes ; \ - mv cluster_marker_genes.h5ad cluster-marker-genes ; \ - echo $? - """ - ) - - send_status_msg = make_send_status_msg_function( - dag_file=__file__, - retcode_ops=['pipeline_exec', 'move_data', 'make_arrow1'], - cwl_workflows=cwl_workflows, - ) - t_send_status = PythonOperator( - task_id='send_status_msg', - python_callable=send_status_msg, - provide_context=True - ) - - - t_log_info = LogInfoOperator(task_id='log_info') - t_join = JoinOperator(task_id='join') - t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir') - t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir') - t_set_dataset_processing = SetDatasetProcessingOperator(task_id='set_dataset_processing') - t_move_data = MoveDataOperator(task_id='move_data') - - (dag >> t_log_info >> t_create_tmpdir - >> t_send_create_dataset >> t_set_dataset_processing - >> prepare_cwl1 >> t_build_cmd1 >> t_pipeline_exec >> t_maybe_keep_cwl1 - >> t_move_files - >> prepare_cwl2 >> t_build_cmd2 >> t_make_arrow1 >> t_maybe_keep_cwl2 - >> t_move_data >> t_send_status >> t_join) - t_maybe_keep_cwl1 >> t_set_dataset_error - t_maybe_keep_cwl2 >> t_set_dataset_error - t_set_dataset_error >> t_join - t_join >> t_cleanup_tmpdir diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq_sciseq.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq_sciseq.py deleted file mode 100644 index 7c639652..00000000 --- a/src/ingest-pipeline/airflow/dags/salmon_rnaseq_sciseq.py +++ /dev/null @@ -1,243 +0,0 @@ -from pathlib import Path -from datetime import datetime, timedelta - -from airflow import DAG -from airflow.operators.bash_operator import BashOperator -from airflow.operators.python_operator import PythonOperator -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator -from hubmap_operators.common_operators import ( - LogInfoOperator, - JoinOperator, - CreateTmpDirOperator, - CleanupTmpDirOperator, - SetDatasetProcessingOperator, - MoveDataOperator -) - -import utils -from utils import ( - 
get_absolute_workflows, - get_cwltool_base_cmd, - get_dataset_uuid, - get_parent_dataset_uuid, - get_uuid_for_error, - join_quote_command_str, - make_send_status_msg_function, -) - -THREADS = 6 # to be used by the CWL worker - - -default_args = { - 'owner': 'hubmap', - 'depends_on_past': False, - 'start_date': datetime(2019, 1, 1), - 'email': ['joel.welling@gmail.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - 'retry_delay': timedelta(minutes=1), - 'xcom_push': True, - 'queue': utils.map_queue_name('general'), - 'on_failure_callback': utils.create_dataset_state_error_callback(get_uuid_for_error) -} - - -with DAG('salmon_rnaseq_sciseq', - schedule_interval=None, - is_paused_upon_creation=False, - default_args=default_args, - max_active_runs=4, - user_defined_macros={'tmp_dir_path' : utils.get_tmp_dir_path} - ) as dag: - - pipeline_name = 'salmon-rnaseq-sciseq' - cwl_workflows = get_absolute_workflows( - Path('salmon-rnaseq', 'pipeline.cwl'), - Path('portal-containers', 'h5ad-to-arrow.cwl'), - ) - - def build_dataset_name(**kwargs): - id_l = kwargs['dag_run'].conf['parent_submission_id'] - inner_str = id_l if isinstance(id_l, str) else '_'.join(id_l) - return f'{dag.dag_id}__{inner_str}__{pipeline_name}' - -# prepare_cwl1 = PythonOperator( -# python_callable=utils.clone_or_update_pipeline, -# task_id='clone_or_update_cwl1', -# op_kwargs={'pipeline_name': cwl_workflow1} -# ) - -# prepare_cwl2 = PythonOperator( -# python_callable=utils.clone_or_update_pipeline, -# task_id='clone_or_update_cwl2', -# op_kwargs={'pipeline_name': cwl_workflow2} -# ) - - prepare_cwl1 = DummyOperator( - task_id='prepare_cwl1' - ) - - prepare_cwl2 = DummyOperator( - task_id='prepare_cwl2' - ) - - def build_cwltool_cmd1(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - '--relax-path-checks', - '--debug', - '--outdir', - tmpdir / 'cwl_out', - '--parallel', - cwl_workflows[0], - '--assay', - 'sciseq', - '--fastq_dir', - data_dir, - '--threads', - THREADS, - ] - - return join_quote_command_str(command) - - def build_cwltool_cmd2(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - cwl_workflows[1], - '--input_dir', - '.', - ] - - return join_quote_command_str(command) - - t_build_cmd1 = PythonOperator( - task_id='build_cmd1', - python_callable=build_cwltool_cmd1, - provide_context=True, - ) - - t_build_cmd2 = PythonOperator( - task_id='build_cmd2', - python_callable=build_cwltool_cmd2, - provide_context=True, - ) - - t_pipeline_exec = BashOperator( - task_id='pipeline_exec', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \ - echo $? - """ - ) - - t_make_arrow1 = BashOperator( - task_id='make_arrow1', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ - cd "$tmp_dir"/cwl_out/cluster-marker-genes ; \ - {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ - echo $? 
- """ - ) - - t_maybe_keep_cwl1 = BranchPythonOperator( - task_id='maybe_keep_cwl1', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs = {'next_op' : 'move_files', - 'bail_op' : 'set_dataset_error', - 'test_op' : 'pipeline_exec'} - ) - - t_maybe_keep_cwl2 = BranchPythonOperator( - task_id='maybe_keep_cwl2', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs = {'next_op' : 'move_data', - 'bail_op' : 'set_dataset_error', - 'test_op' : 'make_arrow1'} - ) - - t_send_create_dataset = PythonOperator( - task_id='send_create_dataset', - python_callable=utils.pythonop_send_create_dataset, - provide_context=True, - op_kwargs = {'parent_dataset_uuid_callable' : get_parent_dataset_uuid, - 'http_conn_id' : 'ingest_api_connection', - 'endpoint' : '/datasets/derived', - 'dataset_name_callable' : build_dataset_name, - "dataset_types":["salmon_rnaseq_sciseq"] - } - ) - - t_set_dataset_error = PythonOperator( - task_id='set_dataset_error', - python_callable=utils.pythonop_set_dataset_state, - provide_context=True, - trigger_rule='all_done', - op_kwargs = {'dataset_uuid_callable' : get_dataset_uuid, - 'http_conn_id' : 'ingest_api_connection', - 'endpoint' : '/datasets/status', - 'ds_state' : 'Error', - 'message' : 'An error occurred in {}'.format(pipeline_name) - } - ) - - t_move_files = BashOperator( - task_id='move_files', - bash_command=""" - tmp_dir={{tmp_dir_path(run_id)}} ; \ - cd "$tmp_dir"/cwl_out ; \ - mkdir cluster-marker-genes ; \ - mv cluster_marker_genes.h5ad cluster-marker-genes ; \ - echo $? - """ - ) - - send_status_msg = make_send_status_msg_function( - dag_file=__file__, - retcode_ops=['pipeline_exec', 'move_data', 'make_arrow1'], - cwl_workflows=cwl_workflows, - ) - t_send_status = PythonOperator( - task_id='send_status_msg', - python_callable=send_status_msg, - provide_context=True - ) - - - t_log_info = LogInfoOperator(task_id='log_info') - t_join = JoinOperator(task_id='join') - t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir') - t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir') - t_set_dataset_processing = SetDatasetProcessingOperator(task_id='set_dataset_processing') - t_move_data = MoveDataOperator(task_id='move_data') - - (dag >> t_log_info >> t_create_tmpdir - >> t_send_create_dataset >> t_set_dataset_processing - >> prepare_cwl1 >> t_build_cmd1 >> t_pipeline_exec >> t_maybe_keep_cwl1 - >> t_move_files - >> prepare_cwl2 >> t_build_cmd2 >> t_make_arrow1 >> t_maybe_keep_cwl2 - >> t_move_data >> t_send_status >> t_join) - t_maybe_keep_cwl1 >> t_set_dataset_error - t_maybe_keep_cwl2 >> t_set_dataset_error - t_set_dataset_error >> t_join - t_join >> t_cleanup_tmpdir diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq_snareseq.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq_snareseq.py deleted file mode 100644 index 0491d1c4..00000000 --- a/src/ingest-pipeline/airflow/dags/salmon_rnaseq_snareseq.py +++ /dev/null @@ -1,235 +0,0 @@ -from pathlib import Path -from datetime import datetime, timedelta - -from airflow import DAG -from airflow.operators.bash_operator import BashOperator -from airflow.operators.python_operator import PythonOperator -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator -from hubmap_operators.common_operators import ( - LogInfoOperator, - JoinOperator, - CreateTmpDirOperator, - CleanupTmpDirOperator, - SetDatasetProcessingOperator, - MoveDataOperator -) - -import utils -from utils import ( 
- get_absolute_workflows, - get_cwltool_base_cmd, - get_dataset_uuid, - get_parent_dataset_uuid, - get_uuid_for_error, - join_quote_command_str, - make_send_status_msg_function, -) - -THREADS = 6 # to be used by the CWL worker - - -default_args = { - 'owner': 'hubmap', - 'depends_on_past': False, - 'start_date': datetime(2019, 1, 1), - 'email': ['joel.welling@gmail.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - 'retry_delay': timedelta(minutes=1), - 'xcom_push': True, - 'queue': utils.map_queue_name('general'), - 'on_failure_callback': utils.create_dataset_state_error_callback(get_uuid_for_error) -} - - -with DAG('salmon_rnaseq_snareseq', - schedule_interval=None, - is_paused_upon_creation=False, - default_args=default_args, - max_active_runs=4, - user_defined_macros={'tmp_dir_path' : utils.get_tmp_dir_path} - ) as dag: - - pipeline_name = 'salmon-rnaseq-snareseq' - cwl_workflows = get_absolute_workflows( - Path('salmon-rnaseq', 'pipeline.cwl'), - Path('portal-containers', 'h5ad-to-arrow.cwl'), - ) - - - def build_dataset_name(**kwargs): - id_l = kwargs['dag_run'].conf['parent_submission_id'] - inner_str = id_l if isinstance(id_l, str) else '_'.join(id_l) - return f'{dag.dag_id}__{inner_str}__{pipeline_name}' - - prepare_cwl1 = DummyOperator( - task_id='prepare_cwl1' - ) - - prepare_cwl2 = DummyOperator( - task_id='prepare_cwl2' - ) - - def build_cwltool_cmd1(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - - data_dirs = ctx['parent_lz_path'] - data_dirs = [data_dirs] if isinstance(data_dirs, str) else data_dirs - print('data_dirs: ', data_dirs) - - command = [ - *get_cwltool_base_cmd(tmpdir), - '--debug', - '--outdir', - tmpdir / 'cwl_out', - '--parallel', - cwl_workflows[0], - '--assay', - 'snareseq', - '--threads', - THREADS, - ] - for data_dir in data_dirs: - command.append('--fastq_dir') - command.append(data_dir) - - return join_quote_command_str(command) - - - def build_cwltool_cmd2(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - cwl_workflows[1], - '--input_dir', - '.', - ] - - return join_quote_command_str(command) - - t_build_cmd1 = PythonOperator( - task_id='build_cmd1', - python_callable=build_cwltool_cmd1, - provide_context=True, - ) - - t_build_cmd2 = PythonOperator( - task_id='build_cmd2', - python_callable=build_cwltool_cmd2, - provide_context=True, - ) - - t_pipeline_exec = BashOperator( - task_id='pipeline_exec', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \ - echo $? - """ - ) - - t_make_arrow1 = BashOperator( - task_id='make_arrow1', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ - cd "$tmp_dir"/cwl_out/cluster-marker-genes ; \ - {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ - echo $? 
- """ - ) - - t_maybe_keep_cwl1 = BranchPythonOperator( - task_id='maybe_keep_cwl1', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs = {'next_op' : 'move_files', - 'bail_op' : 'set_dataset_error', - 'test_op' : 'pipeline_exec'} - ) - - t_maybe_keep_cwl2 = BranchPythonOperator( - task_id='maybe_keep_cwl2', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs = {'next_op' : 'move_data', - 'bail_op' : 'set_dataset_error', - 'test_op' : 'make_arrow1'} - ) - - t_send_create_dataset = PythonOperator( - task_id='send_create_dataset', - python_callable=utils.pythonop_send_create_dataset, - provide_context=True, - op_kwargs = {'parent_dataset_uuid_callable' : get_parent_dataset_uuid, - 'http_conn_id' : 'ingest_api_connection', - 'endpoint' : '/datasets/derived', - 'dataset_name_callable' : build_dataset_name, - "dataset_types":["salmon_rnaseq_snareseq"] - } - ) - - t_set_dataset_error = PythonOperator( - task_id='set_dataset_error', - python_callable=utils.pythonop_set_dataset_state, - provide_context=True, - trigger_rule='all_done', - op_kwargs = {'dataset_uuid_callable' : get_dataset_uuid, - 'http_conn_id' : 'ingest_api_connection', - 'endpoint' : '/datasets/status', - 'ds_state' : 'Error', - 'message' : 'An error occurred in {}'.format(pipeline_name) - } - ) - - t_move_files = BashOperator( - task_id='move_files', - bash_command=""" - tmp_dir={{tmp_dir_path(run_id)}} ; \ - cd "$tmp_dir"/cwl_out ; \ - mkdir cluster-marker-genes ; \ - mv cluster_marker_genes.h5ad cluster-marker-genes ; \ - echo $? - """ - ) - - send_status_msg = make_send_status_msg_function( - dag_file=__file__, - retcode_ops=['pipeline_exec', 'move_data', 'make_arrow1'], - cwl_workflows=cwl_workflows, - ) - t_send_status = PythonOperator( - task_id='send_status_msg', - python_callable=send_status_msg, - provide_context=True - ) - - - t_log_info = LogInfoOperator(task_id='log_info') - t_join = JoinOperator(task_id='join') - t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir') - t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir') - t_set_dataset_processing = SetDatasetProcessingOperator(task_id='set_dataset_processing') - t_move_data = MoveDataOperator(task_id='move_data') - - (dag >> t_log_info >> t_create_tmpdir - >> t_send_create_dataset >> t_set_dataset_processing - >> prepare_cwl1 >> t_build_cmd1 >> t_pipeline_exec >> t_maybe_keep_cwl1 - >> t_move_files - >> prepare_cwl2 >> t_build_cmd2 >> t_make_arrow1 >> t_maybe_keep_cwl2 - >> t_move_data >> t_send_status >> t_join) - t_maybe_keep_cwl1 >> t_set_dataset_error - t_maybe_keep_cwl2 >> t_set_dataset_error - t_set_dataset_error >> t_join - t_join >> t_cleanup_tmpdir From 447428007f199d9991145d5317b0ccbb1404b33e Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Tue, 26 Jan 2021 14:54:30 -0500 Subject: [PATCH 53/63] Bump salmon-rnaseq submodule to v2.0.3 --- src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq index bd013b61..ae382730 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq +++ b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq @@ -1 +1 @@ -Subproject commit bd013b61984c2dff304249205f69bdbca3055100 +Subproject commit ae382730ac636276f02012ce56b4ef235ff08530 From f8efc5d99e56c9b003ce3178774d524608d3920a Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Tue, 26 Jan 2021 14:59:58 -0500 Subject: [PATCH 54/63] 
salmon_rnaseq DAG: generalize post-pipeline UI processing * Don't move any pipeline output files * Make a separate 'hubmap_ui' directory for the output of anything Ilan needs to run after the main pipeline completes * Update the UI data conversion pipeline call for new paths --- .../airflow/dags/salmon_rnaseq.py | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py index 7e1ca6ba..73eee50d 100644 --- a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py +++ b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py @@ -115,7 +115,9 @@ def build_cwltool_cmd2(**kwargs): *get_cwltool_base_cmd(tmpdir), cwl_workflows[1], "--input_dir", - ".", + # This pipeline invocation runs in a 'hubmap_ui' subdirectory, + # so use the parent directory as input + "..", ] return join_quote_command_str(command) @@ -141,12 +143,14 @@ def build_cwltool_cmd2(**kwargs): """, ) - t_make_arrow1 = BashOperator( - task_id="make_arrow1", + t_convert_for_ui = BashOperator( + task_id="convert_for_ui", bash_command=""" \ tmp_dir={{tmp_dir_path(run_id)}} ; \ ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ - cd "$tmp_dir"/cwl_out/cluster-marker-genes ; \ + cd "$tmp_dir"/cwl_out ; \ + mkdir hubmap_ui ; \ + cd hubmap_ui ; \ {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ echo $? """, @@ -157,7 +161,7 @@ def build_cwltool_cmd2(**kwargs): python_callable=utils.pythonop_maybe_keep, provide_context=True, op_kwargs={ - "next_op": "move_files", + "next_op": "prepare_cwl2", "bail_op": "set_dataset_error", "test_op": "pipeline_exec", }, @@ -170,7 +174,7 @@ def build_cwltool_cmd2(**kwargs): op_kwargs={ "next_op": "move_data", "bail_op": "set_dataset_error", - "test_op": "make_arrow1", + "test_op": "t_convert_for_ui", }, ) @@ -201,17 +205,6 @@ def build_cwltool_cmd2(**kwargs): }, ) - t_move_files = BashOperator( - task_id="move_files", - bash_command=""" - tmp_dir={{tmp_dir_path(run_id)}} ; \ - cd "$tmp_dir"/cwl_out ; \ - mkdir cluster-marker-genes ; \ - mv cluster_marker_genes.h5ad cluster-marker-genes ; \ - echo $? 
- """, - ) - send_status_msg = make_send_status_msg_function( dag_file=__file__, retcode_ops=["pipeline_exec", "move_data", "make_arrow1"], @@ -240,10 +233,9 @@ def build_cwltool_cmd2(**kwargs): >> t_build_cmd1 >> t_pipeline_exec >> t_maybe_keep_cwl1 - >> t_move_files >> prepare_cwl2 >> t_build_cmd2 - >> t_make_arrow1 + >> t_convert_for_ui >> t_maybe_keep_cwl2 >> t_move_data >> t_send_status From 0c2429d6f78d8e796aed9e9957c2004f4d12b3be Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Tue, 26 Jan 2021 15:41:51 -0500 Subject: [PATCH 55/63] Remove some boilerplate with new get_salmon_dag_params function --- .../airflow/dags/salmon_rnaseq.py | 46 +++++++++---------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py index 73eee50d..34f12047 100644 --- a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py +++ b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py @@ -1,19 +1,18 @@ -from pathlib import Path from datetime import datetime, timedelta +from pathlib import Path from typing import List, Tuple from airflow import DAG from airflow.operators.bash_operator import BashOperator -from airflow.operators.python_operator import PythonOperator -from airflow.operators.python_operator import BranchPythonOperator from airflow.operators.dummy_operator import DummyOperator +from airflow.operators.python_operator import BranchPythonOperator, PythonOperator from hubmap_operators.common_operators import ( - LogInfoOperator, - JoinOperator, - CreateTmpDirOperator, CleanupTmpDirOperator, - SetDatasetProcessingOperator, + CreateTmpDirOperator, + JoinOperator, + LogInfoOperator, MoveDataOperator, + SetDatasetProcessingOperator, ) import utils @@ -249,32 +248,29 @@ def build_cwltool_cmd2(**kwargs): return dag +def get_salmon_dag_params(assay: str) -> Tuple[str, str, str, str]: + # TODO: restructure assay names, pipeline names, etc.; this repetition + # is for backward compatibility + return ( + f"salmon_rnaseq_{assay}", + f"salmon-rnaseq-{assay}", + assay, + f"salmon_rnaseq_{assay}", + ) + + # dag_id, pipeline name, assay given to pipeline via --assay, dataset type salmon_dag_data: List[Tuple[str, str, str, str]] = [ + # 10X is special because it was first; no "10x" label in the pipeline name ( "salmon_rnaseq_10x", "salmon-rnaseq", "10x", "salmon_rnaseq_10x", ), - ( - "salmon_rnaseq_sciseq", - "salmon-rnaseq-sciseq", - "sciseq", - "salmon_rnaseq_sciseq", - ), - ( - "salmon_rnaseq_slideseq", - "salmon-rnaseq-slideseq", - "slideseq", - "salmon_rnaseq_slideseq", - ), - ( - "salmon_rnaseq_snareseq", - "salmon-rnaseq-snareseq", - "snareseq", - "salmon_rnaseq_snareseq", - ), + get_salmon_dag_params("sciseq"), + get_salmon_dag_params("slideseq"), + get_salmon_dag_params("snareseq"), ] for dag_id, pipeline_name, assay, dataset_type in salmon_dag_data: From 23d2b79d9633dfc8e05cefc9ccc597849d15398d Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Wed, 3 Feb 2021 10:29:21 -0500 Subject: [PATCH 56/63] utils.py: add SequencingDagParameters namedtuple Makes things a little safer in terms of parameter ordering in sequencing DAG generation; it's nicer to refer to fields by name everywhere. 
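
For illustration, a sketch of the intended usage (not part of the diff below;
the field values match the snareseq parameters that salmon_rnaseq.py builds in
a later commit): a DAG generator reads the fields by name instead of unpacking
a four-element tuple by position.

    from utils import SequencingDagParameters

    params = SequencingDagParameters(
        dag_id="salmon_rnaseq_snareseq",
        pipeline_name="salmon-rnaseq-snareseq",
        assay="snareseq",
        dataset_type="salmon_rnaseq_snareseq",
    )
    # Named access: reordering the definition can no longer silently
    # swap, for example, the assay and the dataset type.
    assert params.assay == "snareseq"
    assert params.dataset_type == "salmon_rnaseq_snareseq"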
--- src/ingest-pipeline/airflow/dags/utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/ingest-pipeline/airflow/dags/utils.py b/src/ingest-pipeline/airflow/dags/utils.py index b56f6f12..1a9bde5c 100644 --- a/src/ingest-pipeline/airflow/dags/utils.py +++ b/src/ingest-pipeline/airflow/dags/utils.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections import namedtuple from functools import lru_cache import json from os import environ, fspath, walk @@ -99,6 +100,17 @@ WORKFLOW_MAP_SCHEMA = 'workflow_map_schema.yml' COMPILED_WORKFLOW_MAP: Optional[List[Tuple[Pattern, Pattern, str]]] = None +# Parameters used to generate scRNA and scATAC analysis DAGs; these +# are the only fields which differ between assays and DAGs +SequencingDagParameters = namedtuple( + 'SequencingDagParameters', + [ + 'dag_id', + 'pipeline_name', + 'assay', + 'dataset_type', + ], +) ManifestMatch = Tuple[bool, Optional[str], Optional[str], Optional[bool]] From 7c513190865922ef929e9b54d406a017c580a777 Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Wed, 3 Feb 2021 10:30:57 -0500 Subject: [PATCH 57/63] salmon_rnaseq.py: use SequencingDagParameters instead of plain tuples --- .../airflow/dags/salmon_rnaseq.py | 52 ++++++++----------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py index 34f12047..99bcb181 100644 --- a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py +++ b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py @@ -17,6 +17,7 @@ import utils from utils import ( + SequencingDagParameters, get_absolute_workflows, get_cwltool_base_cmd, get_dataset_uuid, @@ -30,12 +31,7 @@ THREADS = 6 -def generate_salmon_rnaseq_dag( - dag_id: str, - pipeline_name: str, - assay: str, - dataset_type: str, -) -> DAG: +def generate_salmon_rnaseq_dag(params: SequencingDagParameters) -> DAG: default_args = { "owner": "hubmap", "depends_on_past": False, @@ -51,7 +47,7 @@ def generate_salmon_rnaseq_dag( } with DAG( - dag_id, + params.dag_id, schedule_interval=None, is_paused_upon_creation=False, default_args=default_args, @@ -67,7 +63,7 @@ def generate_salmon_rnaseq_dag( def build_dataset_name(**kwargs): id_l = kwargs["dag_run"].conf["parent_submission_id"] inner_str = id_l if isinstance(id_l, str) else "_".join(id_l) - return f"{dag.dag_id}__{inner_str}__{pipeline_name}" + return f"{dag.dag_id}__{inner_str}__{params.pipeline_name}" prepare_cwl1 = DummyOperator(task_id="prepare_cwl1") @@ -92,7 +88,7 @@ def build_cwltool_cmd1(**kwargs): "--parallel", cwl_workflows[0], "--assay", - assay, + params.assay, "--threads", THREADS, ] @@ -186,7 +182,7 @@ def build_cwltool_cmd2(**kwargs): "http_conn_id": "ingest_api_connection", "endpoint": "/datasets/derived", "dataset_name_callable": build_dataset_name, - "dataset_types": [dataset_type], + "dataset_types": [params.dataset_type], }, ) @@ -200,7 +196,7 @@ def build_cwltool_cmd2(**kwargs): "http_conn_id": "ingest_api_connection", "endpoint": "/datasets/status", "ds_state": "Error", - "message": "An error occurred in {}".format(pipeline_name), + "message": f"An error occurred in {params.pipeline_name}", }, ) @@ -248,35 +244,29 @@ def build_cwltool_cmd2(**kwargs): return dag -def get_salmon_dag_params(assay: str) -> Tuple[str, str, str, str]: +def get_salmon_dag_params(assay: str) -> SequencingDagParameters: # TODO: restructure assay names, pipeline names, etc.; this repetition # is for backward compatibility - return ( - 
f"salmon_rnaseq_{assay}", - f"salmon-rnaseq-{assay}", - assay, - f"salmon_rnaseq_{assay}", + return SequencingDagParameters( + dag_id=f"salmon_rnaseq_{assay}", + pipeline_name=f"salmon-rnaseq-{assay}", + assay=assay, + dataset_type=f"salmon_rnaseq_{assay}", ) -# dag_id, pipeline name, assay given to pipeline via --assay, dataset type -salmon_dag_data: List[Tuple[str, str, str, str]] = [ +salmon_dag_params: List[SequencingDagParameters] = [ # 10X is special because it was first; no "10x" label in the pipeline name - ( - "salmon_rnaseq_10x", - "salmon-rnaseq", - "10x", - "salmon_rnaseq_10x", + SequencingDagParameters( + dag_id="salmon_rnaseq_10x", + pipeline_name="salmon-rnaseq", + assay="10x", + dataset_type="salmon_rnaseq_10x", ), get_salmon_dag_params("sciseq"), get_salmon_dag_params("slideseq"), get_salmon_dag_params("snareseq"), ] -for dag_id, pipeline_name, assay, dataset_type in salmon_dag_data: - globals()[dag_id] = generate_salmon_rnaseq_dag( - dag_id=dag_id, - pipeline_name=pipeline_name, - assay=assay, - dataset_type=dataset_type, - ) +for params in salmon_dag_params: + globals()[params.dag_id] = generate_salmon_rnaseq_dag(params) From e89cbebbf2821246a9cfb0feb3f1855b3f469eaa Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Wed, 3 Feb 2021 10:33:30 -0500 Subject: [PATCH 58/63] Add sc_atac_seq.py DAG generator, remove assay-specific DAGs --- .../airflow/dags/sc_atac_seq.py | 261 ++++++++++++++++++ .../airflow/dags/sc_atac_seq_sci.py | 219 --------------- .../airflow/dags/sc_atac_seq_sn.py | 223 --------------- .../airflow/dags/sc_atac_seq_snare.py | 222 --------------- 4 files changed, 261 insertions(+), 664 deletions(-) create mode 100644 src/ingest-pipeline/airflow/dags/sc_atac_seq.py delete mode 100644 src/ingest-pipeline/airflow/dags/sc_atac_seq_sci.py delete mode 100644 src/ingest-pipeline/airflow/dags/sc_atac_seq_sn.py delete mode 100644 src/ingest-pipeline/airflow/dags/sc_atac_seq_snare.py diff --git a/src/ingest-pipeline/airflow/dags/sc_atac_seq.py b/src/ingest-pipeline/airflow/dags/sc_atac_seq.py new file mode 100644 index 00000000..bb8dfae8 --- /dev/null +++ b/src/ingest-pipeline/airflow/dags/sc_atac_seq.py @@ -0,0 +1,261 @@ +from datetime import datetime, timedelta +from pathlib import Path +from typing import List, Tuple + +from airflow import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.operators.dummy_operator import DummyOperator +from airflow.operators.python_operator import BranchPythonOperator, PythonOperator +from hubmap_operators.common_operators import ( + CleanupTmpDirOperator, + CreateTmpDirOperator, + JoinOperator, + LogInfoOperator, + MoveDataOperator, + SetDatasetProcessingOperator, +) + +import utils +from utils import ( + SequencingDagParameters, + get_absolute_workflows, + get_cwltool_base_cmd, + get_dataset_uuid, + get_parent_dataset_uuid, + get_uuid_for_error, + join_quote_command_str, + make_send_status_msg_function, +) + +# to be used by the CWL worker +THREADS = 6 + + +def generate_atac_seq_dag(params: SequencingDagParameters) -> DAG: + default_args = { + "owner": "hubmap", + "depends_on_past": False, + "start_date": datetime(2019, 1, 1), + "email": ["joel.welling@gmail.com"], + "email_on_failure": False, + "email_on_retry": False, + "retries": 1, + "retry_delay": timedelta(minutes=1), + "xcom_push": True, + "queue": utils.map_queue_name("general"), + "on_failure_callback": utils.create_dataset_state_error_callback(get_uuid_for_error), + } + + with DAG( + params.dag_id, + schedule_interval=None, + 
is_paused_upon_creation=False, + default_args=default_args, + max_active_runs=4, + user_defined_macros={"tmp_dir_path": utils.get_tmp_dir_path}, + ) as dag: + cwl_workflows = get_absolute_workflows( + Path("sc-atac-seq-pipeline", "create_snap_and_analyze.cwl"), + Path("portal-containers", "scatac-csv-to-arrow.cwl"), + ) + + def build_dataset_name(**kwargs): + id_l = kwargs["dag_run"].conf["parent_submission_id"] + inner_str = id_l if isinstance(id_l, str) else "_".join(id_l) + return f"{dag.dag_id}__{inner_str}__{params.pipeline_name}" + + prepare_cwl1 = DummyOperator(task_id="prepare_cwl1") + + prepare_cwl2 = DummyOperator(task_id="prepare_cwl2") + + def build_cwltool_cmd1(**kwargs): + ctx = kwargs["dag_run"].conf + run_id = kwargs["run_id"] + tmpdir = utils.get_tmp_dir_path(run_id) + print("tmpdir: ", tmpdir) + data_dirs = ctx["parent_lz_path"] + data_dirs = [data_dirs] if isinstance(data_dirs, str) else data_dirs + print("data_dirs: ", data_dirs) + + command = [ + *get_cwltool_base_cmd(tmpdir), + "--assay", + params.assay, + "--outdir", + tmpdir / "cwl_out", + "--parallel", + cwl_workflows[0], + "--threads", + THREADS, + ] + for data_dir in data_dirs: + command.append("--sequence_directory") + command.append(data_dir) + + return join_quote_command_str(command) + + def build_cwltool_cmd2(**kwargs): + ctx = kwargs["dag_run"].conf + run_id = kwargs["run_id"] + tmpdir = utils.get_tmp_dir_path(run_id) + print("tmpdir: ", tmpdir) + data_dir = ctx["parent_lz_path"] + print("data_dir: ", data_dir) + + command = [ + *get_cwltool_base_cmd(tmpdir), + cwl_workflows[1], + "--input_dir", + ".", + ] + + return join_quote_command_str(command) + + t_build_cmd1 = PythonOperator( + task_id="build_cmd1", + python_callable=build_cwltool_cmd1, + provide_context=True, + ) + + t_build_cmd2 = PythonOperator( + task_id="build_cmd2", + python_callable=build_cwltool_cmd2, + provide_context=True, + ) + + t_pipeline_exec = BashOperator( + task_id="pipeline_exec", + bash_command=""" \ + tmp_dir={{tmp_dir_path(run_id)}} ; \ + {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \ + echo $? + """, + ) + + t_make_arrow1 = BashOperator( + task_id="make_arrow1", + bash_command=""" \ + tmp_dir={{tmp_dir_path(run_id)}} ; \ + ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ + cd "$tmp_dir"/cwl_out ; \ + {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ + echo $? 
+ """, + ) + + t_maybe_keep_cwl1 = BranchPythonOperator( + task_id="maybe_keep_cwl1", + python_callable=utils.pythonop_maybe_keep, + provide_context=True, + op_kwargs={ + "next_op": "prepare_cwl2", + "bail_op": "set_dataset_error", + "test_op": "pipeline_exec", + }, + ) + + t_maybe_keep_cwl2 = BranchPythonOperator( + task_id="maybe_keep_cwl2", + python_callable=utils.pythonop_maybe_keep, + provide_context=True, + op_kwargs={ + "next_op": "move_data", + "bail_op": "set_dataset_error", + "test_op": "make_arrow1", + }, + ) + + t_send_create_dataset = PythonOperator( + task_id="send_create_dataset", + python_callable=utils.pythonop_send_create_dataset, + provide_context=True, + op_kwargs={ + "parent_dataset_uuid_callable": get_parent_dataset_uuid, + "http_conn_id": "ingest_api_connection", + "endpoint": "/datasets/derived", + "dataset_name_callable": build_dataset_name, + "dataset_types": [params.dataset_type], + }, + ) + + t_set_dataset_error = PythonOperator( + task_id="set_dataset_error", + python_callable=utils.pythonop_set_dataset_state, + provide_context=True, + trigger_rule="all_done", + op_kwargs={ + "dataset_uuid_callable": get_dataset_uuid, + "http_conn_id": "ingest_api_connection", + "endpoint": "/datasets/status", + "ds_state": "Error", + "message": f"An error occurred in {params.pipeline_name}", + }, + ) + + send_status_msg = make_send_status_msg_function( + dag_file=__file__, + retcode_ops=["pipeline_exec", "move_data", "make_arrow1"], + cwl_workflows=cwl_workflows, + ) + t_send_status = PythonOperator( + task_id="send_status_msg", + python_callable=send_status_msg, + provide_context=True, + ) + + t_log_info = LogInfoOperator(task_id="log_info") + t_join = JoinOperator(task_id="join") + t_create_tmpdir = CreateTmpDirOperator(task_id="create_tmpdir") + t_cleanup_tmpdir = CleanupTmpDirOperator(task_id="cleanup_tmpdir") + t_set_dataset_processing = SetDatasetProcessingOperator(task_id="set_dataset_processing") + t_move_data = MoveDataOperator(task_id="move_data") + + ( + dag + >> t_log_info + >> t_create_tmpdir + >> t_send_create_dataset + >> t_set_dataset_processing + >> prepare_cwl1 + >> t_build_cmd1 + >> t_pipeline_exec + >> t_maybe_keep_cwl1 + >> prepare_cwl2 + >> t_build_cmd2 + >> t_make_arrow1 + >> t_maybe_keep_cwl2 + >> t_move_data + >> t_send_status + >> t_join + ) + t_maybe_keep_cwl1 >> t_set_dataset_error + t_maybe_keep_cwl2 >> t_set_dataset_error + t_set_dataset_error >> t_join + t_join >> t_cleanup_tmpdir + + return dag + + +atacseq_dag_data: List[SequencingDagParameters] = [ + SequencingDagParameters( + dag_id="sc_atac_seq_sci", + pipeline_name="sci-atac-seq-pipeline", + assay="sciseq", + dataset_type="sc_atac_seq_sci", + ), + SequencingDagParameters( + dag_id="sc_atac_seq_snare", + pipeline_name="sc-atac-seq-pipeline", + assay="snareseq", + dataset_type="sc_atac_seq_snare", + ), + SequencingDagParameters( + dag_id="sc_atac_seq_sn", + pipeline_name="sn-atac-seq-pipeline", + assay="snseq", + dataset_type="sn_atac_seq", + ), +] + +for params in atacseq_dag_data: + globals()[params.dag_id] = generate_atac_seq_dag(params) diff --git a/src/ingest-pipeline/airflow/dags/sc_atac_seq_sci.py b/src/ingest-pipeline/airflow/dags/sc_atac_seq_sci.py deleted file mode 100644 index ea41b1b0..00000000 --- a/src/ingest-pipeline/airflow/dags/sc_atac_seq_sci.py +++ /dev/null @@ -1,219 +0,0 @@ -from pathlib import Path -from datetime import datetime, timedelta - -from airflow import DAG -from airflow.operators.bash_operator import BashOperator -from airflow.operators.python_operator import 
PythonOperator -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator -from hubmap_operators.common_operators import ( - LogInfoOperator, - JoinOperator, - CreateTmpDirOperator, - CleanupTmpDirOperator, - SetDatasetProcessingOperator, - MoveDataOperator, -) - -import utils -from utils import ( - get_absolute_workflows, - get_cwltool_base_cmd, - get_dataset_uuid, - get_parent_dataset_uuid, - get_uuid_for_error, - join_quote_command_str, - make_send_status_msg_function, -) - -THREADS = 6 # to be used by the CWL worker - -default_args = { - 'owner': 'hubmap', - 'depends_on_past': False, - 'start_date': datetime(2019, 1, 1), - 'email': ['joel.welling@gmail.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - 'retry_delay': timedelta(minutes=1), - 'xcom_push': True, - 'queue': utils.map_queue_name('general'), - 'on_failure_callback': utils.create_dataset_state_error_callback(get_uuid_for_error), -} - -with DAG( - 'sc_atac_seq_sci', - schedule_interval=None, - is_paused_upon_creation=False, - default_args=default_args, - max_active_runs=4, - user_defined_macros={'tmp_dir_path': utils.get_tmp_dir_path}, -) as dag: - pipeline_name = 'sci-atac-seq-pipeline' - cwl_workflows = get_absolute_workflows( - Path(pipeline_name, 'create_snap_and_analyze.cwl'), - Path('portal-containers', 'scatac-csv-to-arrow.cwl'), - ) - - def build_dataset_name(**kwargs): - return '{}__{}__{}'.format(dag.dag_id, - kwargs['dag_run'].conf['parent_submission_id'], - pipeline_name - ) - - prepare_cwl1 = DummyOperator( - task_id='prepare_cwl1' - ) - - prepare_cwl2 = DummyOperator( - task_id='prepare_cwl2' - ) - - def build_cwltool_cmd1(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - '--outdir', - tmpdir / 'cwl_out', - '--parallel', - cwl_workflows[0], - '--sequence_directory', - data_dir, - '--threads', - THREADS, - ] - - return join_quote_command_str(command) - - def build_cwltool_cmd2(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - cwl_workflows[1], - '--input_dir', - '.', - ] - - return join_quote_command_str(command) - - t_build_cmd1 = PythonOperator( - task_id='build_cmd1', - python_callable=build_cwltool_cmd1, - provide_context=True, - ) - - t_build_cmd2 = PythonOperator( - task_id='build_cmd2', - python_callable=build_cwltool_cmd2, - provide_context=True, - ) - - t_pipeline_exec = BashOperator( - task_id='pipeline_exec', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \ - echo $? - """, - ) - - t_make_arrow1 = BashOperator( - task_id='make_arrow1', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ - cd "$tmp_dir"/cwl_out ; \ - {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ - echo $? 
- """ - ) - - t_maybe_keep_cwl1 = BranchPythonOperator( - task_id='maybe_keep_cwl1', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs={ - 'next_op': 'prepare_cwl2', - 'bail_op': 'set_dataset_error', - 'test_op': 'pipeline_exec', - }, - ) - - t_maybe_keep_cwl2 = BranchPythonOperator( - task_id='maybe_keep_cwl2', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs={ - 'next_op': 'move_data', - 'bail_op': 'set_dataset_error', - 'test_op': 'make_arrow1', - }, - ) - - t_send_create_dataset = PythonOperator( - task_id='send_create_dataset', - python_callable=utils.pythonop_send_create_dataset, - provide_context=True, - op_kwargs={ - 'parent_dataset_uuid_callable': get_parent_dataset_uuid, - 'http_conn_id': 'ingest_api_connection', - 'endpoint': '/datasets/derived', - 'dataset_name_callable': build_dataset_name, - "dataset_types": ["sc_atac_seq_sci"], - }, - ) - - t_set_dataset_error = PythonOperator( - task_id='set_dataset_error', - python_callable=utils.pythonop_set_dataset_state, - provide_context=True, - trigger_rule='all_done', - op_kwargs={ - 'dataset_uuid_callable': get_dataset_uuid, - 'http_conn_id': 'ingest_api_connection', - 'endpoint': '/datasets/status', - 'ds_state': 'Error', - 'message': 'An error occurred in {}'.format(pipeline_name), - }, - ) - - send_status_msg = make_send_status_msg_function( - dag_file=__file__, - retcode_ops=['pipeline_exec', 'move_data', 'make_arrow1'], - cwl_workflows=cwl_workflows, - ) - t_send_status = PythonOperator( - task_id='send_status_msg', - python_callable=send_status_msg, - provide_context=True, - ) - - t_log_info = LogInfoOperator(task_id='log_info') - t_join = JoinOperator(task_id='join') - t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir') - t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir') - t_set_dataset_processing = SetDatasetProcessingOperator(task_id='set_dataset_processing') - t_move_data = MoveDataOperator(task_id='move_data') - - (dag >> t_log_info >> t_create_tmpdir - >> t_send_create_dataset >> t_set_dataset_processing - >> prepare_cwl1 >> t_build_cmd1 >> t_pipeline_exec >> t_maybe_keep_cwl1 - >> prepare_cwl2 >> t_build_cmd2 >> t_make_arrow1 >> t_maybe_keep_cwl2 - >> t_move_data >> t_send_status >> t_join) - t_maybe_keep_cwl1 >> t_set_dataset_error - t_maybe_keep_cwl2 >> t_set_dataset_error - t_set_dataset_error >> t_join - t_join >> t_cleanup_tmpdir diff --git a/src/ingest-pipeline/airflow/dags/sc_atac_seq_sn.py b/src/ingest-pipeline/airflow/dags/sc_atac_seq_sn.py deleted file mode 100644 index 31e21676..00000000 --- a/src/ingest-pipeline/airflow/dags/sc_atac_seq_sn.py +++ /dev/null @@ -1,223 +0,0 @@ -from pathlib import Path -from datetime import datetime, timedelta - -from airflow import DAG -from airflow.operators.bash_operator import BashOperator -from airflow.operators.python_operator import PythonOperator -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator -from airflow.hooks.http_hook import HttpHook -from hubmap_operators.common_operators import ( - LogInfoOperator, - JoinOperator, - CreateTmpDirOperator, - CleanupTmpDirOperator, - SetDatasetProcessingOperator, - MoveDataOperator, -) - -import utils -from utils import ( - decrypt_tok, - find_pipeline_manifests, - get_cwltool_base_cmd, - get_absolute_workflows, - get_dataset_uuid, - get_parent_dataset_uuid, - get_uuid_for_error, - join_quote_command_str, - localized_assert_json_matches_schema as 
assert_json_matches_schema, - make_send_status_msg_function, -) - -THREADS = 6 # to be used by the CWL worker - -default_args = { - 'owner': 'hubmap', - 'depends_on_past': False, - 'start_date': datetime(2019, 1, 1), - 'email': ['joel.welling@gmail.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - 'retry_delay': timedelta(minutes=1), - 'xcom_push': True, - 'queue': utils.map_queue_name('general'), - 'on_failure_callback': utils.create_dataset_state_error_callback(get_uuid_for_error), -} - -with DAG( - 'sc_atac_seq_sn', - schedule_interval=None, - is_paused_upon_creation=False, - default_args=default_args, - max_active_runs=4, - user_defined_macros={'tmp_dir_path': utils.get_tmp_dir_path}, -) as dag: - pipeline_name = 'sn-atac-seq-pipeline' - cwl_workflows = get_absolute_workflows( - Path(pipeline_name, 'create_snap_and_analyze.cwl'), - Path('portal-containers', 'scatac-csv-to-arrow.cwl'), - ) - - def build_dataset_name(**kwargs): - return '{}__{}__{}'.format(dag.dag_id, - kwargs['dag_run'].conf['parent_submission_id'], - pipeline_name - ) - - prepare_cwl1 = DummyOperator( - task_id='prepare_cwl1' - ) - - prepare_cwl2 = DummyOperator( - task_id='prepare_cwl2' - ) - - def build_cwltool_cmd1(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - '--outdir', - os.path.join(tmpdir, 'cwl_out'), - '--parallel', - cwl_workflows[0], - '--sequence_directory', - data_dir, - '--threads', - THREADS, - ] - - return join_quote_command_str(command) - - def build_cwltool_cmd2(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - cwl_workflows[1], - '--input_dir', - '.', - ] - - return join_quote_command_str(command) - - t_build_cmd1 = PythonOperator( - task_id='build_cmd1', - python_callable=build_cwltool_cmd1, - provide_context=True, - ) - - t_build_cmd2 = PythonOperator( - task_id='build_cmd2', - python_callable=build_cwltool_cmd2, - provide_context=True, - ) - - t_pipeline_exec = BashOperator( - task_id='pipeline_exec', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \ - echo $? - """, - ) - - t_make_arrow1 = BashOperator( - task_id='make_arrow1', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ - cd "$tmp_dir"/cwl_out ; \ - {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ - echo $? 
- """ - ) - - t_maybe_keep_cwl1 = BranchPythonOperator( - task_id='maybe_keep_cwl1', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs={ - 'next_op': 'prepare_cwl2', - 'bail_op': 'set_dataset_error', - 'test_op': 'pipeline_exec', - }, - ) - - t_maybe_keep_cwl2 = BranchPythonOperator( - task_id='maybe_keep_cwl2', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs={ - 'next_op': 'move_data', - 'bail_op': 'set_dataset_error', - 'test_op': 'make_arrow1', - }, - ) - - t_send_create_dataset = PythonOperator( - task_id='send_create_dataset', - python_callable=utils.pythonop_send_create_dataset, - provide_context=True, - op_kwargs={ - 'parent_dataset_uuid_callable': get_parent_dataset_uuid, - 'http_conn_id': 'ingest_api_connection', - 'endpoint': '/datasets/derived', - 'dataset_name_callable': build_dataset_name, - "dataset_types": ["sn_atac_seq"], - }, - ) - - t_set_dataset_error = PythonOperator( - task_id='set_dataset_error', - python_callable=utils.pythonop_set_dataset_state, - provide_context=True, - trigger_rule='all_done', - op_kwargs={ - 'dataset_uuid_callable': get_dataset_uuid, - 'http_conn_id': 'ingest_api_connection', - 'endpoint': '/datasets/status', - 'ds_state': 'Error', - 'message': 'An error occurred in {}'.format(pipeline_name), - }, - ) - - send_status_msg = make_send_status_msg_function( - dag_file=__file__, - retcode_ops=['pipeline_exec', 'move_data', 'make_arrow1'], - cwl_workflows=cwl_workflows, - ) - t_send_status = PythonOperator( - task_id='send_status_msg', - python_callable=send_status_msg, - provide_context=True, - ) - - t_log_info = LogInfoOperator(task_id='log_info') - t_join = JoinOperator(task_id='join') - t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir') - t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir') - t_set_dataset_processing = SetDatasetProcessingOperator(task_id='set_dataset_processing') - t_move_data = MoveDataOperator(task_id='move_data') - - (dag >> t_log_info >> t_create_tmpdir - >> t_send_create_dataset >> t_set_dataset_processing - >> prepare_cwl1 >> t_build_cmd1 >> t_pipeline_exec >> t_maybe_keep_cwl1 - >> prepare_cwl2 >> t_build_cmd2 >> t_make_arrow1 >> t_maybe_keep_cwl2 - >> t_move_data >> t_send_status >> t_join) - t_maybe_keep_cwl1 >> t_set_dataset_error - t_maybe_keep_cwl2 >> t_set_dataset_error - t_set_dataset_error >> t_join - t_join >> t_cleanup_tmpdir diff --git a/src/ingest-pipeline/airflow/dags/sc_atac_seq_snare.py b/src/ingest-pipeline/airflow/dags/sc_atac_seq_snare.py deleted file mode 100644 index 62d7c00b..00000000 --- a/src/ingest-pipeline/airflow/dags/sc_atac_seq_snare.py +++ /dev/null @@ -1,222 +0,0 @@ -from pathlib import Path -from datetime import datetime, timedelta - -from airflow import DAG -from airflow.operators.bash_operator import BashOperator -from airflow.operators.python_operator import PythonOperator -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator -from hubmap_operators.common_operators import ( - LogInfoOperator, - JoinOperator, - CreateTmpDirOperator, - CleanupTmpDirOperator, - SetDatasetProcessingOperator, - MoveDataOperator, -) - -import utils -from utils import ( - get_absolute_workflows, - get_cwltool_base_cmd, - get_dataset_uuid, - get_parent_dataset_uuid, - get_uuid_for_error, - join_quote_command_str, - make_send_status_msg_function, -) - -THREADS = 6 # to be used by the CWL worker - -default_args = { - 'owner': 'hubmap', - 
'depends_on_past': False, - 'start_date': datetime(2019, 1, 1), - 'email': ['joel.welling@gmail.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - 'retry_delay': timedelta(minutes=1), - 'xcom_push': True, - 'queue': utils.map_queue_name('general'), - 'on_failure_callback': utils.create_dataset_state_error_callback(get_uuid_for_error), -} - -with DAG( - 'sc_atac_seq_snare', - schedule_interval=None, - is_paused_upon_creation=False, - default_args=default_args, - max_active_runs=4, - user_defined_macros={'tmp_dir_path': utils.get_tmp_dir_path}, -) as dag: - pipeline_name = 'sc-atac-seq-pipeline' - cwl_workflows = get_absolute_workflows( - Path(pipeline_name, 'create_snap_and_analyze.cwl'), - Path('portal-containers', 'scatac-csv-to-arrow.cwl'), - ) - - - def build_dataset_name(**kwargs): - id_l = kwargs['dag_run'].conf['parent_submission_id'] - inner_str = id_l if isinstance(id_l, str) else '_'.join(id_l) - return f'{dag.dag_id}__{inner_str}__{pipeline_name}' - - - prepare_cwl1 = DummyOperator( - task_id='prepare_cwl1' - ) - - prepare_cwl2 = DummyOperator( - task_id='prepare_cwl2' - ) - - def build_cwltool_cmd1(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dirs = ctx['parent_lz_path'] - data_dirs = [data_dirs] if isinstance(data_dirs, str) else data_dirs - print('data_dirs: ', data_dirs) - - command = [ - *get_cwltool_base_cmd(tmpdir), - '--outdir', - tmpdir / 'cwl_out', - '--parallel', - cwl_workflows[0], - '--threads', - THREADS, - ] - for data_dir in data_dirs: - command.append('--sequence_directory') - command.append(data_dir) - - return join_quote_command_str(command) - - def build_cwltool_cmd2(**kwargs): - ctx = kwargs['dag_run'].conf - run_id = kwargs['run_id'] - tmpdir = utils.get_tmp_dir_path(run_id) - print('tmpdir: ', tmpdir) - data_dir = ctx['parent_lz_path'] - print('data_dir: ', data_dir) - - command = [ - *get_cwltool_base_cmd(tmpdir), - cwl_workflows[1], - '--input_dir', - '.', - ] - - return join_quote_command_str(command) - - t_build_cmd1 = PythonOperator( - task_id='build_cmd1', - python_callable=build_cwltool_cmd1, - provide_context=True, - ) - - t_build_cmd2 = PythonOperator( - task_id='build_cmd2', - python_callable=build_cwltool_cmd2, - provide_context=True, - ) - - t_pipeline_exec = BashOperator( - task_id='pipeline_exec', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - {{ti.xcom_pull(task_ids='build_cmd1')}} > $tmp_dir/session.log 2>&1 ; \ - echo $? - """, - ) - - t_make_arrow1 = BashOperator( - task_id='make_arrow1', - bash_command=""" \ - tmp_dir={{tmp_dir_path(run_id)}} ; \ - ds_dir="{{ti.xcom_pull(task_ids="send_create_dataset")}}" ; \ - cd "$tmp_dir"/cwl_out ; \ - {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \ - echo $? 
- """ - ) - - t_maybe_keep_cwl1 = BranchPythonOperator( - task_id='maybe_keep_cwl1', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs={ - 'next_op': 'prepare_cwl2', - 'bail_op': 'set_dataset_error', - 'test_op': 'pipeline_exec', - }, - ) - - t_maybe_keep_cwl2 = BranchPythonOperator( - task_id='maybe_keep_cwl2', - python_callable=utils.pythonop_maybe_keep, - provide_context=True, - op_kwargs={ - 'next_op': 'move_data', - 'bail_op': 'set_dataset_error', - 'test_op': 'make_arrow1', - }, - ) - - t_send_create_dataset = PythonOperator( - task_id='send_create_dataset', - python_callable=utils.pythonop_send_create_dataset, - provide_context=True, - op_kwargs={ - 'parent_dataset_uuid_callable': get_parent_dataset_uuid, - 'http_conn_id': 'ingest_api_connection', - 'endpoint': '/datasets/derived', - 'dataset_name_callable': build_dataset_name, - "dataset_types": ["sc_atac_seq_snare"], - }, - ) - - t_set_dataset_error = PythonOperator( - task_id='set_dataset_error', - python_callable=utils.pythonop_set_dataset_state, - provide_context=True, - trigger_rule='all_done', - op_kwargs={ - 'dataset_uuid_callable': get_dataset_uuid, - 'http_conn_id': 'ingest_api_connection', - 'endpoint': '/datasets/status', - 'ds_state': 'Error', - 'message': 'An error occurred in {}'.format(pipeline_name), - }, - ) - - send_status_msg = make_send_status_msg_function( - dag_file=__file__, - retcode_ops=['pipeline_exec', 'move_data', 'make_arrow1'], - cwl_workflows=cwl_workflows, - ) - t_send_status = PythonOperator( - task_id='send_status_msg', - python_callable=send_status_msg, - provide_context=True, - ) - - t_log_info = LogInfoOperator(task_id='log_info') - t_join = JoinOperator(task_id='join') - t_create_tmpdir = CreateTmpDirOperator(task_id='create_tmpdir') - t_cleanup_tmpdir = CleanupTmpDirOperator(task_id='cleanup_tmpdir') - t_set_dataset_processing = SetDatasetProcessingOperator(task_id='set_dataset_processing') - t_move_data = MoveDataOperator(task_id='move_data') - - (dag >> t_log_info >> t_create_tmpdir - >> t_send_create_dataset >> t_set_dataset_processing - >> prepare_cwl1 >> t_build_cmd1 >> t_pipeline_exec >> t_maybe_keep_cwl1 - >> prepare_cwl2 >> t_build_cmd2 >> t_make_arrow1 >> t_maybe_keep_cwl2 - >> t_move_data >> t_send_status >> t_join) - t_maybe_keep_cwl1 >> t_set_dataset_error - t_maybe_keep_cwl2 >> t_set_dataset_error - t_set_dataset_error >> t_join - t_join >> t_cleanup_tmpdir From 58ee67d083050ddb8c792966ed3bf0156192eb7e Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Wed, 3 Feb 2021 10:35:03 -0500 Subject: [PATCH 59/63] Remove sci-atac-seq-pipeline, sn-atac-seq-pipeline submodules --- src/ingest-pipeline/airflow/dags/cwl/sci-atac-seq-pipeline | 1 - src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline | 1 - 2 files changed, 2 deletions(-) delete mode 160000 src/ingest-pipeline/airflow/dags/cwl/sci-atac-seq-pipeline delete mode 160000 src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline diff --git a/src/ingest-pipeline/airflow/dags/cwl/sci-atac-seq-pipeline b/src/ingest-pipeline/airflow/dags/cwl/sci-atac-seq-pipeline deleted file mode 160000 index dba3c89e..00000000 --- a/src/ingest-pipeline/airflow/dags/cwl/sci-atac-seq-pipeline +++ /dev/null @@ -1 +0,0 @@ -Subproject commit dba3c89eea0702116995a095789c4aff777e5ec7 diff --git a/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline b/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline deleted file mode 160000 index 8471d6e6..00000000 --- a/src/ingest-pipeline/airflow/dags/cwl/sn-atac-seq-pipeline 
+++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8471d6e68c990d84106527d36f74ead3ab9c09b3 From bec97aa33810c9797a798eaa031b6c2a42eb5e9e Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Wed, 3 Feb 2021 10:37:02 -0500 Subject: [PATCH 60/63] Remove bulk-atac-seq submodule, use common version in bulk_atacseq DAG --- src/ingest-pipeline/airflow/dags/bulk_atacseq.py | 2 +- src/ingest-pipeline/airflow/dags/cwl/bulk-atac-seq | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 160000 src/ingest-pipeline/airflow/dags/cwl/bulk-atac-seq diff --git a/src/ingest-pipeline/airflow/dags/bulk_atacseq.py b/src/ingest-pipeline/airflow/dags/bulk_atacseq.py index 75e82713..c6bc6b28 100644 --- a/src/ingest-pipeline/airflow/dags/bulk_atacseq.py +++ b/src/ingest-pipeline/airflow/dags/bulk_atacseq.py @@ -52,7 +52,7 @@ ) as dag: pipeline_name = 'bulk-atac-seq' cwl_workflows = get_absolute_workflows( - Path(pipeline_name, 'bulk-atac-seq-pipeline.cwl'), + Path('sc-atac-seq-pipeline', 'bulk-atac-seq-pipeline.cwl'), ) def build_dataset_name(**kwargs): diff --git a/src/ingest-pipeline/airflow/dags/cwl/bulk-atac-seq b/src/ingest-pipeline/airflow/dags/cwl/bulk-atac-seq deleted file mode 160000 index 302f1f3c..00000000 --- a/src/ingest-pipeline/airflow/dags/cwl/bulk-atac-seq +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 302f1f3c019b74b85a4decc56c0793726e99c191 From 5d436158ed25202e774fbe40ece23163f8e9e3c7 Mon Sep 17 00:00:00 2001 From: Matt Ruffalo Date: Fri, 5 Feb 2021 11:02:10 -0500 Subject: [PATCH 61/63] Update sc-atac-seq-pipeline to v1.3 --- src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline b/src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline index c08d43fe..ffcc84c3 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline +++ b/src/ingest-pipeline/airflow/dags/cwl/sc-atac-seq-pipeline @@ -1 +1 @@ -Subproject commit c08d43fe2801551e5bab22872256bcc94e0c6201 +Subproject commit ffcc84c3376c4bd10738e22b7daec6a0aaa37108 From 63f25cf6ec5b7083e648646b9fdc54c100ba9182 Mon Sep 17 00:00:00 2001 From: Hubmap Hive Date: Sat, 6 Feb 2021 21:57:02 -0500 Subject: [PATCH 62/63] Working version. 
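
Replace the bash-based metadata-extraction task with a Python callable that
runs ingest-validation-tools directly on the dataset's landing-zone directory
and writes a validation_report.txt there. A condensed sketch of the flow added
below (lz_path and uuid are placeholders; the real values are pulled from XCom
in the task, and the submodule imports assume SRC_PATH is on sys.path):

    from pathlib import Path
    from submodules import (ingest_validation_tools_submission,
                            ingest_validation_tools_error_report,
                            ingest_validation_tests)

    lz_path = "/path/to/dataset"   # placeholder; real value comes from XCom
    uuid = "dataset-uuid"          # placeholder; real value comes from XCom
    plugin_path = list(ingest_validation_tests.__path__)[0]

    ignore_globs = [uuid, 'extras', '*metadata.tsv', 'validation_report.txt']
    submission = ingest_validation_tools_submission.Submission(
        directory_path=Path(lz_path),
        dataset_ignore_globs=ignore_globs,
        submission_ignore_globs='*',
        plugin_directory=plugin_path,
        add_notes=False,
    )
    # Scan reports an error result and persist it alongside the data
    report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors())
    (Path(lz_path) / 'validation_report.txt').write_text(report.as_text())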
--- .../airflow/dags/validation_test.py | 62 ++++++++++++------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/validation_test.py b/src/ingest-pipeline/airflow/dags/validation_test.py index b5912220..d8a4a989 100644 --- a/src/ingest-pipeline/airflow/dags/validation_test.py +++ b/src/ingest-pipeline/airflow/dags/validation_test.py @@ -16,17 +16,18 @@ from airflow.operators.dagrun_operator import TriggerDagRunOperator, DagRunOrder from airflow.operators.multi_dagrun import TriggerMultiDagRunOperator from airflow.hooks.http_hook import HttpHook +from airflow.exceptions import AirflowException from hubmap_operators.flex_multi_dag_run import FlexMultiDagRunOperator -import utils +import utils from utils import localized_assert_json_matches_schema as assert_json_matches_schema - -def get_src_path(**kwargs): - rslt = airflow_conf.as_dict()['connections']['SRC_PATH'] - return rslt.strip("'").strip('"') - +sys.path.append(airflow_conf.as_dict()['connections']['SRC_PATH'].strip("'").strip('"')) +from submodules import (ingest_validation_tools_submission, + ingest_validation_tools_error_report, + ingest_validation_tests) +sys.path.pop() # Following are defaults which can be overridden later on default_args = { @@ -47,7 +48,6 @@ def get_src_path(**kwargs): schedule_interval=None, is_paused_upon_creation=False, default_args=default_args, - user_defined_macros={'get_src_path' : get_src_path} ) as dag: def find_uuid(**kwargs): @@ -111,24 +111,38 @@ def find_uuid(**kwargs): } ) - - t_run_md_extract = BashOperator( + + def run_md_extract(**kwargs): + assay_type = kwargs['ti'].xcom_pull(key='assay_type') + lz_path = kwargs['ti'].xcom_pull(key='lz_path') + uuid = kwargs['ti'].xcom_pull(key='uuid') + plugin_path = [path for path in ingest_validation_tests.__path__][0] + + ignore_globs = [uuid, 'extras', '*metadata.tsv', 'validation_report.txt'] + # + # Uncomment offline=True below to avoid validating orcid_id URLs &etc + # + submission = ingest_validation_tools_submission.Submission(directory_path=Path(lz_path), + dataset_ignore_globs=ignore_globs, + submission_ignore_globs='*', + plugin_directory=plugin_path, + #offline=True, + add_notes=False + ) + # Scan reports an error result + report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors()) + with open(os.path.join(lz_path, 'validation_report.txt'), 'w') as f: + f.write(report.as_text()) + + + t_run_md_extract = PythonOperator( task_id='run_md_extract', - bash_command=""" \ - lz_dir="{{ti.xcom_pull(task_ids='find_uuid', key='lz_path')}}" \ - src_dir="{{get_src_path()}}/md" ; \ - top_dir="{{get_src_path()}}" ; \ - cd "$lz_dir" ; \ - env PYTHONPATH=${PYTHONPATH}:$top_dir \ - python $src_dir/metadata_extract.py --out /dev/null "$lz_dir" \ - > session.log 2> error.log ; \ - echo $? ; \ - if [ -s error.log ] ; \ - then echo 'ERROR!' 
`cat error.log` >> session.log ; \ - else rm error.log ; \ - fi - """ - ) + python_callable=run_md_extract, + provide_context=True, + op_kwargs={'crypt_auth_tok' : utils.encrypt_tok(airflow_conf.as_dict() + ['connections']['APP_CLIENT_SECRET']).decode(), + } + ) (dag >> t_find_uuid >> t_run_md_extract) From 1ecff86025bfdfc4e3061fb9aba7ef4450c1a78b Mon Sep 17 00:00:00 2001 From: Joel Welling Date: Mon, 15 Feb 2021 17:29:19 -0500 Subject: [PATCH 63/63] Flake8 and a variable name change --- .../airflow/dags/validation_test.py | 104 +++++++++--------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/src/ingest-pipeline/airflow/dags/validation_test.py b/src/ingest-pipeline/airflow/dags/validation_test.py index d8a4a989..21d3cba6 100644 --- a/src/ingest-pipeline/airflow/dags/validation_test.py +++ b/src/ingest-pipeline/airflow/dags/validation_test.py @@ -1,7 +1,5 @@ import sys import os -import yaml -import json import ast from pathlib import Path from pprint import pprint @@ -9,22 +7,17 @@ from airflow import DAG from airflow.configuration import conf as airflow_conf -from airflow.operators.bash_operator import BashOperator from airflow.operators.python_operator import PythonOperator -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator -from airflow.operators.dagrun_operator import TriggerDagRunOperator, DagRunOrder -from airflow.operators.multi_dagrun import TriggerMultiDagRunOperator -from airflow.hooks.http_hook import HttpHook from airflow.exceptions import AirflowException -from hubmap_operators.flex_multi_dag_run import FlexMultiDagRunOperator - import utils -from utils import localized_assert_json_matches_schema as assert_json_matches_schema +from utils import ( + localized_assert_json_matches_schema as assert_json_matches_schema + ) -sys.path.append(airflow_conf.as_dict()['connections']['SRC_PATH'].strip("'").strip('"')) -from submodules import (ingest_validation_tools_submission, +sys.path.append(airflow_conf.as_dict()['connections']['SRC_PATH'] + .strip("'").strip('"')) +from submodules import (ingest_validation_tools_submission, # noqa E402 ingest_validation_tools_error_report, ingest_validation_tests) sys.path.pop() @@ -44,9 +37,9 @@ } -with DAG('validation_test', - schedule_interval=None, - is_paused_upon_creation=False, +with DAG('validation_test', + schedule_interval=None, + is_paused_upon_creation=False, default_args=default_args, ) as dag: @@ -54,16 +47,21 @@ def find_uuid(**kwargs): try: assert_json_matches_schema(kwargs['dag_run'].conf, 'validation_test_schema.yml') - except AssertionError as e: + except AssertionError: print('invalid metadata follows:') pprint(kwargs['dag_run'].conf) raise - + uuid = kwargs['dag_run'].conf['uuid'] - my_callable = lambda **kwargs: uuid - rslt=utils.pythonop_get_dataset_state(dataset_uuid_callable=my_callable, - http_conn_id='ingest_api_connection', - **kwargs) + + def my_callable(**kwargs): + return uuid + + rslt = utils.pythonop_get_dataset_state( + dataset_uuid_callable=my_callable, + http_conn_id='ingest_api_connection', + **kwargs + ) if not rslt: raise AirflowException(f'Invalid uuid/doi for group: {uuid}') print('rslt:') @@ -71,7 +69,8 @@ def find_uuid(**kwargs): assert 'dataset' in rslt, f"Status for {uuid} has no dataset entry" ds_rslt = rslt['dataset'] - for key in ['status', 'uuid', 'data_types', 'local_directory_full_path']: + for key in ['status', 'uuid', 'data_types', + 'local_directory_full_path']: assert key in ds_rslt, f"Dataset status for 
{uuid} has no {key}" if not ds_rslt['status'] in ['New', 'Invalid']: @@ -88,12 +87,13 @@ def find_uuid(**kwargs): else: filtered_data_types = [tuple(dt)] else: - raise AirflowException(f'Dataset data_types for {uuid} is empty') + raise AirflowException(f'Dataset data_types for {uuid}' + ' is empty') else: filtered_data_types = [dt] lz_path = ds_rslt['local_directory_full_path'] - uuid = ds_rslt['uuid'] # in case the original 'uuid' was actually a DOI + uuid = ds_rslt['uuid'] # 'uuid' may actually be a DOI print(f'Finished uuid {uuid}') print(f'filtered data types: {filtered_data_types}') print(f'lz path: {lz_path}') @@ -106,44 +106,50 @@ def find_uuid(**kwargs): python_callable=find_uuid, provide_context=True, op_kwargs={ - 'crypt_auth_tok' : utils.encrypt_tok(airflow_conf.as_dict() - ['connections']['APP_CLIENT_SECRET']).decode(), + 'crypt_auth_tok': ( + utils.encrypt_tok(airflow_conf.as_dict() + ['connections']['APP_CLIENT_SECRET']) + .decode() + ), } ) - - def run_md_extract(**kwargs): - assay_type = kwargs['ti'].xcom_pull(key='assay_type') + def run_validation(**kwargs): lz_path = kwargs['ti'].xcom_pull(key='lz_path') uuid = kwargs['ti'].xcom_pull(key='uuid') plugin_path = [path for path in ingest_validation_tests.__path__][0] - ignore_globs = [uuid, 'extras', '*metadata.tsv', 'validation_report.txt'] + ignore_globs = [uuid, 'extras', '*metadata.tsv', + 'validation_report.txt'] # # Uncomment offline=True below to avoid validating orcid_id URLs &etc # - submission = ingest_validation_tools_submission.Submission(directory_path=Path(lz_path), - dataset_ignore_globs=ignore_globs, - submission_ignore_globs='*', - plugin_directory=plugin_path, - #offline=True, - add_notes=False - ) + submission = ingest_validation_tools_submission.Submission( + directory_path=Path(lz_path), + dataset_ignore_globs=ignore_globs, + submission_ignore_globs='*', + plugin_directory=plugin_path, + #offline=True, # noqa E265 + add_notes=False + ) # Scan reports an error result - report = ingest_validation_tools_error_report.ErrorReport(submission.get_errors()) + report = ingest_validation_tools_error_report.ErrorReport( + submission.get_errors() + ) with open(os.path.join(lz_path, 'validation_report.txt'), 'w') as f: f.write(report.as_text()) - - t_run_md_extract = PythonOperator( - task_id='run_md_extract', - python_callable=run_md_extract, + t_run_validation = PythonOperator( + task_id='run_validation', + python_callable=run_validation, provide_context=True, - op_kwargs={'crypt_auth_tok' : utils.encrypt_tok(airflow_conf.as_dict() - ['connections']['APP_CLIENT_SECRET']).decode(), - } + op_kwargs={ + 'crypt_auth_tok': ( + utils.encrypt_tok(airflow_conf.as_dict() + ['connections']['APP_CLIENT_SECRET']) + .decode() + ), + } ) - - (dag >> t_find_uuid >> t_run_md_extract) - + (dag >> t_find_uuid >> t_run_validation)