# Antibody-metadata helpers defined locally in bin/convert_to_ometiff.py.
# They replace the calls that previously went through the external
# `antibodies_tsv_util` package (imported as `antb_tools`).
#
# The script entry point uses them in this order:
#   find_antibodies_meta -> sort_by_cycle -> get_ch_info_from_antibodies_meta
#   -> create_original_channel_names_df -> replace_provider_ch_names_with_antb

TIFF_FILE_NAMING_PATTERN = re.compile(r"^R\d{3}_X(\d{3})_Y(\d{3})\.tif")

# Strict form: 32 hex characters immediately followed by "antibodies.tsv".
# find_antibodies_meta matches with the looser pattern defined just below.
metadata_filename_pattern = re.compile(r"^[0-9A-Fa-f]{32}antibodies\.tsv$")

# Any run of letters, digits, "-" or "_" followed by "antibodies.tsv".
_ANTIBODIES_TSV_PATTERN = re.compile(r"^[0-9A-Za-z\-_]*antibodies\.tsv$")

# "cycle<N>_ch<M>" as written in the channel_id column of antibodies.tsv.
_CYCLE_CHANNEL_PATTERN = re.compile(
    r"cycle(?P<cycle>\d+)_ch(?P<channel>\d+)", re.IGNORECASE
)

# "cyc<N>_ch<M>_orig<name>" as produced for the extract channel names.
_ORIG_CHANNEL_NAME_REGEX = r"cyc(\d+)_ch(\d+)_orig(.*)"

# channel_id given to names that do not match _ORIG_CHANNEL_NAME_REGEX. This is
# the string the previous implementation produced for them (NaN formatted
# through astype(str)), kept so such rows still never match a real id.
_UNPARSED_CHANNEL_ID = "cyclenan_chnan"


def find_antibodies_meta(input_dir: Path) -> Optional[Path]:
    """
    Return the first antibodies.tsv-style file found under ``input_dir``.

    The directory tree is walked top-down with directory and file names
    visited in sorted order, so the result does not depend on the order in
    which the filesystem happens to list entries. No check is made that the
    file name carries the dataset ID, nor that only one such file exists.

    Args:
        input_dir: Root directory to search recursively.

    Returns:
        Path of the first matching file, or ``None`` (after logging a
        warning) when nothing matches.
    """
    for dirpath, dirnames, filenames in walk(input_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirnames.sort()
        for filename in sorted(filenames):
            if _ANTIBODIES_TSV_PATTERN.match(filename):
                return Path(dirpath) / filename

    logger.warning("No antibody.tsv file found")
    return None


def sort_by_cycle(antb_path: Path) -> pd.DataFrame:
    """
    Load antibodies.tsv and sort its rows by cycle, then channel number.

    Cycle and channel are parsed (case-insensitively) from the ``channel_id``
    column, which is expected to contain ``cycle<N>_ch<M>``. The returned
    frame is indexed by the ``(cycle, channel)`` integer pairs.

    Args:
        antb_path: Path (or readable buffer) of the tab-separated file.

    Returns:
        The table sorted by ``(cycle, channel)``.

    Raises:
        ValueError: If ``antb_path`` is ``None`` or any ``channel_id`` value
            cannot be parsed.
    """
    if antb_path is None:
        # find_antibodies_meta returns None when no file exists; fail with a
        # message that says so instead of an opaque pandas error.
        raise ValueError("Cannot sort antibodies.tsv: no file path was provided")

    df = pd.read_table(antb_path)

    cycles = []
    channels = []
    unparsed = []
    for channel_id in df["channel_id"]:
        found = _CYCLE_CHANNEL_PATTERN.search(str(channel_id))
        if found is None:
            unparsed.append(channel_id)
            continue
        cycles.append(int(found.group("cycle")))
        channels.append(int(found.group("channel")))

    if unparsed:
        raise ValueError(
            f"channel_id values do not look like 'cycle<N>_ch<M>': {unparsed}"
        )

    # A list of two equally long lists becomes a two-level index.
    df.index = [cycles, channels]
    return df.sort_index()


def get_ch_info_from_antibodies_meta(df: pd.DataFrame) -> Optional[pd.DataFrame]:
    """
    Add a "target" column holding the cleaned-up antibody name of each row.

    The frame is modified in place and also returned.

    Args:
        df: Table with an ``antibody_name`` column.

    Returns:
        The same frame, with the ``target`` column added.
    """
    df["target"] = [get_analyte_name(name) for name in df["antibody_name"]]
    return df


def get_analyte_name(antibody_name: str) -> str:
    """
    Strip the "Anti-" and " antibody" decorations from an antibody name.

    Every occurrence of ``Anti-`` and of whitespace followed by ``antibody``
    is removed, wherever it appears in the string.

    Args:
        antibody_name: Name as written in antibodies.tsv.

    Returns:
        The name without those decorations.
    """
    without_prefix = re.sub(r"Anti-", "", antibody_name)
    return re.sub(r"\s+antibody", "", without_prefix)


def create_original_channel_names_df(channelList: List[str]) -> pd.DataFrame:
    """
    Split ``cyc<N>_ch<M>_orig<name>`` channel names into their parts.

    Args:
        channelList: Channel names to parse.

    Returns:
        A frame with one row per input name and the columns
        ``Original_Channel_Name``, ``Cycle``, ``Channel``, ``channel_name``
        (the text after ``orig``) and ``channel_id`` (``cycle<N>_ch<M>``).
        Names that do not follow the pattern get missing values in the parsed
        columns and the placeholder id ``cyclenan_chnan``.
    """
    og_ch_names_df = pd.DataFrame(channelList, columns=["Original_Channel_Name"])
    og_ch_names_df[["Cycle", "Channel", "channel_name"]] = og_ch_names_df[
        "Original_Channel_Name"
    ].str.extract(_ORIG_CHANNEL_NAME_REGEX)
    og_ch_names_df["Cycle"] = pd.to_numeric(og_ch_names_df["Cycle"])
    og_ch_names_df["Channel"] = pd.to_numeric(og_ch_names_df["Channel"])

    # Format through int(): a single unparsable name turns the numeric columns
    # into floats, and formatting those directly would yield "cycle1.0_ch1.0"
    # for every row, which never matches an antibodies.tsv channel id.
    og_ch_names_df["channel_id"] = [
        f"cycle{int(cycle)}_ch{int(channel)}"
        if pd.notna(cycle) and pd.notna(channel)
        else _UNPARSED_CHANNEL_ID
        for cycle, channel in zip(og_ch_names_df["Cycle"], og_ch_names_df["Channel"])
    ]

    return og_ch_names_df


def _map_channel_ids_to_targets(antibodies_df: pd.DataFrame) -> dict:
    """
    Build a lookup from lower-cased ``channel_id`` to ``target``.

    NOTE(review): this stands in for ``map_cycles_and_channels``, which the
    original code called but which is not defined in the visible source.
    Confirm it matches the behaviour of the former package helper.
    """
    return {
        str(channel_id).lower(): target
        for channel_id, target in zip(
            antibodies_df["channel_id"], antibodies_df["target"]
        )
    }


def replace_provider_ch_names_with_antb(
    og_ch_names_df: pd.DataFrame, antibodies_df: pd.DataFrame
) -> List[str]:
    """
    Replace provider channel names with the targets from antibodies.tsv.

    Rows are matched on ``channel_id``, compared in lower case. A channel with
    no entry in ``antibodies_df`` keeps its parsed ``channel_name``; if that
    could not be parsed either, the untouched original name is used so the
    result never contains a missing value.

    Args:
        og_ch_names_df: Output of ``create_original_channel_names_df``.
        antibodies_df: Antibody table with ``channel_id`` and ``target``
            columns.

    Returns:
        One channel name per row of ``og_ch_names_df``, in row order.
    """
    mapping = _map_channel_ids_to_targets(antibodies_df)

    # Tolerate frames built elsewhere that lack the raw-name column.
    if "Original_Channel_Name" in og_ch_names_df.columns:
        raw_names = og_ch_names_df["Original_Channel_Name"]
    else:
        raw_names = og_ch_names_df["channel_name"]

    updated_channel_names = []
    for channel_id, parsed_name, raw_name in zip(
        og_ch_names_df["channel_id"], og_ch_names_df["channel_name"], raw_names
    ):
        target = mapping.get(str(channel_id).lower())
        if target is not None:
            updated_channel_names.append(target)
        elif pd.isna(parsed_name):
            updated_channel_names.append(raw_name)
        else:
            updated_channel_names.append(parsed_name)
    return updated_channel_names
def add_cycle_channel_numbers(
    channel_names: List[str], channels_per_cycle: int = 4
) -> List[str]:
    """
    Prefix each channel name with its cycle and channel position.

    Used during the collect-dataset-info step in place of the earlier approach
    of appending a number to duplicate channel names. Names are assumed to be
    listed cycle by cycle, ``channels_per_cycle`` entries per cycle, and each
    becomes ``cyc<cycle>_ch<channel>_orig<name>`` with both counters starting
    at 1.

    Args:
        channel_names: Channel names in acquisition order.
        channels_per_cycle: Number of channels in every cycle. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        The prefixed names, in the same order as the input.

    Raises:
        ValueError: If ``channels_per_cycle`` is smaller than 1.
    """
    if channels_per_cycle < 1:
        raise ValueError("channels_per_cycle must be at least 1")

    return [
        f"cyc{position // channels_per_cycle + 1}"
        f"_ch{position % channels_per_cycle + 1}"
        f"_orig{original_name}"
        for position, original_name in enumerate(channel_names)
    ]
- channelNames = antb_tools.add_cycle_channel_numbers(channelNames) + channelNames = add_cycle_channel_numbers(channelNames) print(channelNames) datasetInfo["channel_names"] = channelNames @@ -459,7 +477,7 @@ def standardize_metadata(directory: Path, num_concurrent_tasks: int): for row in csvreader: ch_names_qc.append(row[0]) qc_vals.append(row[1].strip()) - unique_qc_ch_names = antb_tools.add_cycle_channel_numbers(ch_names_qc) + unique_qc_ch_names = add_cycle_channel_numbers(ch_names_qc) for i, ch in enumerate(unique_qc_ch_names): channel_names_qc_pass[ch] = [qc_vals[i]] else: diff --git a/environment.yml b/environment.yml index ba28c03..b424062 100644 --- a/environment.yml +++ b/environment.yml @@ -23,4 +23,3 @@ dependencies: - opencv-contrib-python-headless>4.0,<5.0 - pint==0.22 - jsonschema==4.19.0 - - git+https://github.com/hubmapconsortium/antibodies-tsv-util.git diff --git a/steps/illumination_first_stitching/best_focus.cwl b/steps/illumination_first_stitching/best_focus.cwl index 1185cac..075a841 100644 --- a/steps/illumination_first_stitching/best_focus.cwl +++ b/steps/illumination_first_stitching/best_focus.cwl @@ -3,7 +3,7 @@ class: CommandLineTool requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest dockerOutputDirectory: "/output" baseCommand: ["python", "/opt/best_focus/run_best_focus_selection.py"] diff --git a/steps/illumination_first_stitching/collect_dataset_info.cwl b/steps/illumination_first_stitching/collect_dataset_info.cwl index b84aba0..12f1291 100644 --- a/steps/illumination_first_stitching/collect_dataset_info.cwl +++ b/steps/illumination_first_stitching/collect_dataset_info.cwl @@ -4,7 +4,7 @@ label: Collect dataset info for Cytokit requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest baseCommand: ["python", "/opt/dataset_info/run_collection.py"] diff --git a/steps/illumination_first_stitching/create_yaml_config.cwl 
b/steps/illumination_first_stitching/create_yaml_config.cwl index 3acec38..806860c 100644 --- a/steps/illumination_first_stitching/create_yaml_config.cwl +++ b/steps/illumination_first_stitching/create_yaml_config.cwl @@ -4,7 +4,7 @@ label: Create Cytokit experiment config requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest baseCommand: ["python", "/opt/create_cytokit_config.py"] diff --git a/steps/illumination_first_stitching/first_stitching.cwl b/steps/illumination_first_stitching/first_stitching.cwl index 3e52320..11513e6 100644 --- a/steps/illumination_first_stitching/first_stitching.cwl +++ b/steps/illumination_first_stitching/first_stitching.cwl @@ -3,7 +3,7 @@ class: CommandLineTool requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest dockerOutputDirectory: "/output" baseCommand: ["python", "/opt/codex_stitching/run_stitching.py"] diff --git a/steps/illumination_first_stitching/illumination_correction.cwl b/steps/illumination_first_stitching/illumination_correction.cwl index d1dea6c..89c01ce 100644 --- a/steps/illumination_first_stitching/illumination_correction.cwl +++ b/steps/illumination_first_stitching/illumination_correction.cwl @@ -3,7 +3,7 @@ class: CommandLineTool requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest dockerOutputDirectory: "/output" baseCommand: ["python", "/opt/illumination_correction/run_illumination_correction.py"] diff --git a/steps/illumination_first_stitching/slicing.cwl b/steps/illumination_first_stitching/slicing.cwl index 8e1126f..068a7b5 100644 --- a/steps/illumination_first_stitching/slicing.cwl +++ b/steps/illumination_first_stitching/slicing.cwl @@ -3,7 +3,7 @@ class: CommandLineTool requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest dockerOutputDirectory: "/output" baseCommand: ["python", 
"/opt/slicing/run_slicing.py"] diff --git a/steps/ometiff_second_stitching/background_subtraction.cwl b/steps/ometiff_second_stitching/background_subtraction.cwl index aeab1e9..a4d0deb 100644 --- a/steps/ometiff_second_stitching/background_subtraction.cwl +++ b/steps/ometiff_second_stitching/background_subtraction.cwl @@ -3,7 +3,7 @@ class: CommandLineTool requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest dockerOutputDirectory: "/output" baseCommand: ["python", "/opt/background_subtraction/run_background_subtraction.py"] diff --git a/steps/ometiff_second_stitching/ome_tiff_creation.cwl b/steps/ometiff_second_stitching/ome_tiff_creation.cwl index c11f104..6c8cf3f 100644 --- a/steps/ometiff_second_stitching/ome_tiff_creation.cwl +++ b/steps/ometiff_second_stitching/ome_tiff_creation.cwl @@ -4,7 +4,7 @@ label: Create OME-TIFF versions of Cytokit segmentation and extract results requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest baseCommand: ["python", "/opt/convert_to_ometiff.py"] diff --git a/steps/ometiff_second_stitching/second_stitching.cwl b/steps/ometiff_second_stitching/second_stitching.cwl index c431194..660359c 100644 --- a/steps/ometiff_second_stitching/second_stitching.cwl +++ b/steps/ometiff_second_stitching/second_stitching.cwl @@ -3,7 +3,7 @@ class: CommandLineTool requirements: DockerRequirement: - dockerPull: hubmap/codex-scripts + dockerPull: hubmap/codex-scripts:latest dockerOutputDirectory: /output baseCommand: ["python", "/opt/codex_stitching/secondary_stitcher/secondary_stitcher_runner.py"]