From b15fe6bc4f1b8b7f3261cc1493abfaa5f43796f6 Mon Sep 17 00:00:00 2001 From: Artem Sokolov Date: Fri, 21 Jan 2022 16:59:25 -0500 Subject: [PATCH] Fixed hdf5; check for number of channels (#33) * Fixed HDF5 parsing * *_intensity -> intensity_* to account for regionprops update * Function to peek the number of channels * Comparing number of channels between image and markers.csv * Incorporated review suggestions --- ParseInput.py | 2 +- SingleCellDataExtraction.py | 102 +++++++++++++++++------------------- 2 files changed, 49 insertions(+), 55 deletions(-) diff --git a/ParseInput.py b/ParseInput.py index 2b3f12b..56177a5 100644 --- a/ParseInput.py +++ b/ParseInput.py @@ -38,7 +38,7 @@ def ParseInputDataExtract(): #Create a dictionary object to pass to the next function dict = {'masks': args.masks, 'image': args.image,\ 'channel_names': args.channel_names,'output':args.output, - 'intensity_props': set(args.intensity_props if args.intensity_props is not None else []).union(["mean_intensity"]), + 'intensity_props': set(args.intensity_props if args.intensity_props is not None else []).union(["intensity_mean"]), 'mask_props': args.mask_props, } #Print the dictionary object diff --git a/SingleCellDataExtraction.py b/SingleCellDataExtraction.py index ce2aebe..0eceeac 100644 --- a/SingleCellDataExtraction.py +++ b/SingleCellDataExtraction.py @@ -8,6 +8,7 @@ import numpy as np import os import skimage.measure as measure +import tifffile from pathlib import Path @@ -19,10 +20,10 @@ def gini_index(mask, intensity): cumx = np.cumsum(sorted_x, dtype=float) return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n -def median_intensity(mask, intensity): +def intensity_median(mask, intensity): return np.median(intensity[mask]) -def MaskChannel(mask_loaded, image_loaded_z, intensity_props=["mean_intensity"]): +def MaskChannel(mask_loaded, image_loaded_z, intensity_props=["intensity_mean"]): """Function for quantifying a single channel image Returns a table with CellID according to the mask and the mean pixel intensity @@ -73,6 +74,25 @@ def MaskIDs(mask, mask_props=None): return dat +def n_channels(image): + """Returns the number of channel in the input image. Supports [OME]TIFF and HDF5.""" + + image_path = Path(image) + + if image_path.suffix in ['.tiff', '.tif', '.btf']: + s = tifffile.TiffFile(image).series[0] + ndim = len(s.shape) + if ndim == 2: return 1 + elif ndim == 3: return min(s.shape) + else: raise Exception('mcquant supports only 2D/3D images.') + + elif image_path.suffix in ['.h5', '.hdf5']: + f = h5py.File(image, 'r') + dat_name = list(f.keys())[0] + return f[dat_name].shape[3] + + else: + raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only') def PrepareData(image,z): """Function for preparing input for maskzstack function. Connecting function @@ -81,38 +101,26 @@ def PrepareData(image,z): image_path = Path(image) #Check to see if image tif(f) - if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf': - #Check to see if the image is ome.tif(f) - if image.endswith(('.ome.tif','.ome.tiff')): - #Read the image - image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') - #print('OME TIF(F) found') - else: - #Read the image - image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') - #print('TIF(F) found') - # Remove extra axis - #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4])) + if image_path.suffix in ['.tiff', '.tif', '.btf']: + image_loaded_z = tifffile.imread(image, key=z) #Check to see if image is hdf5 - elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5': + elif image_path.suffix in ['.h5', '.hdf5']: #Read the image - f = h5py.File(image,'r+') + f = h5py.File(image,'r') #Get the dataset name from the h5 file dat_name = list(f.keys())[0] - ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data' - #Get the image data - image_loaded = np.array(f[dat_name]) - #Remove the first axis (ilastik convention) - image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3])) - ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be - ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1) + #Retrieve the z^th channel + image_loaded_z = f[dat_name][0,:,:,z] + + else: + raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only') #Return the objects return image_loaded_z -def MaskZstack(masks_loaded,image,channel_names_loaded, mask_props=None, intensity_props=["mean_intensity"]): +def MaskZstack(masks_loaded,image,channel_names_loaded, mask_props=None, intensity_props=["intensity_mean"]): """This function will extract the stats for each cell mask through each channel in the input image @@ -166,10 +174,10 @@ def col_sort(x): mask_dict = {} # Mean intensity is default property, stored without suffix mask_dict.update( - zip(channel_names_loaded, [x["mean_intensity"] for x in dict_of_chan[nm]]) + zip(channel_names_loaded, [x["intensity_mean"] for x in dict_of_chan[nm]]) ) # All other properties are suffixed with their names - for prop_n in set(dict_of_chan[nm][0].keys()).difference(["mean_intensity"]): + for prop_n in set(dict_of_chan[nm][0].keys()).difference(["intensity_mean"]): mask_dict.update( zip([f"{n}_{prop_n}" for n in channel_names_loaded], [x[prop_n] for x in dict_of_chan[nm]]) ) @@ -181,42 +189,31 @@ def col_sort(x): # Return the dict of dataframes for each mask return dict_of_chan -def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["mean_intensity"]): +def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]): """Function for extracting single cell information from input path containing single-cell masks, z_stack path, and channel_names path.""" #Create pathlib object for output output = Path(output) - #Check if header available - #sniffer = csv.Sniffer() - #sniffer.has_header(open(channel_names).readline()) - #If header not available - #if not sniffer: - #If header available - #channel_names_loaded = pd.read_csv(channel_names) - #channel_names_loaded_list = list(channel_names_loaded.marker_name) - #else: - #print("negative") - #old one column version - #channel_names_loaded = pd.read_csv(channel_names,header=None) - #Add a column index for ease - #channel_names_loaded.columns = ["marker"] - #channel_names_loaded = list(channel_names_loaded.marker.values) - #Read csv channel names channel_names_loaded = pd.read_csv(channel_names) - #Check for size of columns - if channel_names_loaded.shape[1] > 1: + #Check for the presence of `marker_name` column + if 'marker_name' in channel_names_loaded: #Get the marker_name column if more than one column (CyCIF structure) channel_names_loaded_list = list(channel_names_loaded.marker_name) - else: - #old one column version -- re-read the csv file and add column name + #Consider the old one-marker-per-line plain text format + elif channel_names_loaded.shape[1] == 1: + #re-read the csv file and add column name channel_names_loaded = pd.read_csv(channel_names, header = None) - #Add a column index for ease and for standardization - channel_names_loaded.columns = ["marker"] - channel_names_loaded_list = list(channel_names_loaded.marker) + channel_names_loaded_list = list(channel_names_loaded.iloc[:,0]) + else: + raise Exception('%s must contain the marker_name column'%channel_names) + #Contrast against the number of markers in the image + if len(channel_names_loaded_list) != n_channels(image): + raise Exception("The number of channels in %s doesn't match the image"%channel_names) + #Check for unique marker names -- create new list to store new names channel_names_loaded_checked = [] for idx,val in enumerate(channel_names_loaded_list): @@ -228,9 +225,6 @@ def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intens #Otherwise, leave channel name channel_names_loaded_checked.append(val) - #Clear small memory amount by clearing old channel names - channel_names_loaded, channel_names_loaded_list = None, None - #Read the masks masks_loaded = {} #iterate through mask paths and read images to add to dictionary object @@ -255,7 +249,7 @@ def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intens ) -def MultiExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["mean_intensity"]): +def MultiExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]): """Function for iterating over a list of z_stacks and output locations to export single-cell data from image masks"""