Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Segmentation mask file size validator #62

Draft
wants to merge 15 commits into
base: devel
Choose a base branch
from
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,7 @@ dmypy.json

# Pyre type checker
.pyre/

# PyCharm
.idea/

Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from pathlib import Path
from typing import List, Optional, Union

import tifffile
import xmlschema
from ingest_validation_tools.plugin_validator import Validator
from utils import GetParentData


def get_ometiff_size(file) -> Union[str, dict]:
    """Read the physical pixel sizes recorded in an OME-TIFF file.

    The OME-XML metadata embedded in the TIFF is parsed and validated against
    its schema, then the ``PhysicalSize*`` attributes of the ``Pixels``
    element of the first ``Image`` are extracted.

    Args:
        file: Path (or anything ``tifffile.TiffFile`` accepts) of the file to read.

    Returns:
        On success, a dict with the keys ``X``, ``XUnits``, ``Y``, ``YUnits``,
        ``Z`` and ``ZUnits``; a value is ``None`` when the matching attribute
        is absent. On any failure, a human-readable error string. This
        function never raises, so callers must check the return type.
    """
    try:
        # The context manager guarantees the file handle is released even
        # when metadata parsing fails.
        with tifffile.TiffFile(file) as tf:
            ome_xml = tf.ome_metadata
        xml_document = xmlschema.XmlDocument(ome_xml)
        schema = xml_document.schema
        if schema and not schema.is_valid(xml_document):
            return f"{file} is not a valid OME.TIFF file"
        # Kept inside the try block: a missing schema, a missing "Image"
        # element or a missing "Pixels" element must be reported as an
        # invalid file rather than escape as an unhandled exception.
        pixels = schema.to_dict(xml_document).get("Image")[0].get("Pixels")
        return {
            "X": pixels.get("@PhysicalSizeX"),
            "XUnits": pixels.get("@PhysicalSizeXUnits"),
            "Y": pixels.get("@PhysicalSizeY"),
            "YUnits": pixels.get("@PhysicalSizeYUnits"),
            "Z": pixels.get("@PhysicalSizeZ"),
            "ZUnits": pixels.get("@PhysicalSizeZUnits"),
        }
    except Exception as excp:
        # Deliberately broad: the contract is "error string, never raise".
        return f"{file} is not a valid OME.TIFF file: {excp}"


class ImageSizeValidator(Validator):
    """Check that a segmentation mask has the same physical size as its parent image.

    The validator only runs for segmentation-mask uploads. For every metadata
    row it locates the OME-TIFF in the dataset directory and the OME-TIFF in
    the parent dataset directory, then compares their physical pixel sizes.
    """

    description = "Check dataset and parent image size so they can be matched in the visualization"
    cost = 1.0
    version = "1.0"
    required = "segmentation_mask"
    files_to_find = [
        "**/*.ome.tif",
        "**/*.ome.tiff",
        "**/*.OME.TIFF",
        "**/*.OME.TIF",
    ]

    def _find_images(self, root: Path) -> List[Path]:
        """Return the files under *root* matching any ``files_to_find`` pattern."""
        matches = (file for glob_expr in self.files_to_find for file in root.glob(glob_expr))
        # dict.fromkeys drops duplicates while keeping discovery order, in
        # case one file matches more than one pattern.
        return list(dict.fromkeys(matches))

    def collect_errors(self, **kwargs) -> List[Optional[str]]:
        """Compare mask and parent image sizes for every metadata row.

        Returns:
            A list of error messages when problems were found, ``[None]``
            when at least one pair of files was compared and no problem was
            found, and ``[]`` when nothing was tested.
        """
        del kwargs
        if self.required not in self.contains and self.assay_type.lower() != self.required:
            return []  # We only test Segmentation Masks
        files_tested = False
        output: List[Optional[str]] = []
        for row in self.metadata_tsv.rows:
            data_path = Path(row["data_path"])
            if not data_path.is_absolute():
                data_path = Path(self.paths[0]).parent / data_path

            # Review note (resolved): a segmentation mask is only valid when
            # the parent dataset consists of exactly one image, so both the
            # mask directory and the parent directory must hold exactly one
            # OME-TIFF. Explicit checks are used instead of ``assert`` so the
            # validation still runs under ``python -O``.
            filenames_to_test = self._find_images(data_path)
            if len(filenames_to_test) != 1:
                output.append("Too many or too few files Mask")
                continue

            # Resolve the parent directory once per row; each lookup costs
            # HTTP round trips.
            parent_path = GetParentData(
                row["parent_dataset_id"], self.globus_token, self.app_context
            ).get_path()
            if not parent_path:
                # An empty path would otherwise glob the current directory.
                output.append(f"Unable to locate parent dataset {row['parent_dataset_id']}")
                continue

            parent_filenames_to_test = self._find_images(Path(parent_path))
            if len(parent_filenames_to_test) != 1:
                output.append("Too many or too few files Base Images")
                continue

            files_tested = True
            segmentation_mask_size = get_ometiff_size(filenames_to_test[0])
            base_image_size = get_ometiff_size(parent_filenames_to_test[0])
            # get_ometiff_size reports failures as strings; surface those
            # directly instead of comparing them as if they were sizes.
            read_errors = [
                size for size in (segmentation_mask_size, base_image_size) if isinstance(size, str)
            ]
            if read_errors:
                output.extend(read_errors)
            elif segmentation_mask_size != base_image_size:
                output.append("Files and base image size do not match")

        if output:
            return output
        if files_tested:
            return [None]
        return []
38 changes: 38 additions & 0 deletions src/ingest_validation_tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import requests


class GetParentData:
    """Look up the filesystem path of a parent dataset through the web services.

    ``app_context`` is read for the keys ``uuid_url``, ``ingest_url`` and
    (optionally) ``request_headers``.
    """

    def __init__(self, hubmap_id, globus_token, app_context):
        self.hubmap_id = hubmap_id
        self.token = globus_token
        self.app_context = app_context
        # Filled in by the UUID lookup; stays None until it succeeds.
        self.uuid = None

    def _build_headers(self) -> dict:
        """Return a fresh header dict carrying the bearer token.

        A copy is made so the ``request_headers`` dict stored in
        ``app_context`` is never mutated.
        """
        headers = dict(self.app_context.get("request_headers") or {})
        headers["Authorization"] = "Bearer " + self.token
        return headers

    def __get_uuid(self) -> None:
        """Resolve ``hubmap_id`` to a UUID, storing it in ``self.uuid`` (None on failure)."""
        url = self.app_context.get("uuid_url") + self.hubmap_id
        try:
            response = requests.get(url, headers=self._build_headers())
            response.raise_for_status()
            self.uuid = response.json().get("uuid")
        except requests.exceptions.RequestException as err:
            # RequestException also covers connection failures and timeouts,
            # not only HTTP error statuses.
            self.uuid = None
            print(f"Error: {err}")

    def get_path(self) -> str:
        """Return the absolute path of the dataset, or ``""`` when it cannot be resolved."""
        self.__get_uuid()
        if self.uuid is None:
            return ""
        url = (
            self.app_context.get("ingest_url")
            + "datasets/"
            + self.uuid
            + "/file-system-abs-path"
        )
        try:
            response = requests.get(url, headers=self._build_headers())
            response.raise_for_status()
            # Fall back to "" so the declared ``str`` return type always holds.
            return response.json().get("path") or ""
        except requests.exceptions.RequestException as err:
            print(f"Error: {err}")
        return ""
Loading