
Commit

Merge pull request hubmapconsortium#57 from hubmapconsortium/pennycuda/fix-antibodies-util

Remove references to defunct antibodies util
pennycuda authored Aug 7, 2024
2 parents 2e574a8 + d855591 commit 0fadb51
Showing 12 changed files with 134 additions and 22 deletions.
111 changes: 103 additions & 8 deletions bin/convert_to_ometiff.py
@@ -10,7 +10,6 @@
import yaml
from aicsimageio import AICSImage
from aicsimageio.writers.ome_tiff_writer import OmeTiffWriter
-from antibodies_tsv_util import antibodies_tsv_util as antb_tools
from ome_types.model import AnnotationRef, Map, MapAnnotation, StructuredAnnotationList
from tifffile import TiffFile

@@ -25,6 +24,104 @@
"nucleus_boundaries",
]
TIFF_FILE_NAMING_PATTERN = re.compile(r"^R\d{3}_X(\d{3})_Y(\d{3})\.tif")
metadata_filename_pattern = re.compile(r"^[0-9A-Fa-f]{32}antibodies\.tsv$")
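As a quick illustration (not part of the commit), the two compiled patterns above accept filenames like the invented ones below:

# Invented filenames, for illustration only.
m = TIFF_FILE_NAMING_PATTERN.match("R001_X003_Y004.tif")
assert m is not None and m.groups() == ("003", "004")  # captures the X/Y tile indices
# 32 hex characters (a dataset ID) directly followed by "antibodies.tsv":
assert metadata_filename_pattern.match("0123456789abcdef0123456789abcdefantibodies.tsv")
assert metadata_filename_pattern.match("antibodies.tsv") is None  # no ID prefix, no match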


def find_antibodies_meta(input_dir: Path) -> Optional[Path]:
    """
    Finds and returns the first antibodies.tsv metadata file for a HuBMAP dataset.
    Does not check whether the dataset ID (32 hex characters) matches
    the directory name, nor whether there might be multiple metadata files.
    """
    # possible_dirs = [input_dir, input_dir / "extras"]
    metadata_filename_pattern = re.compile(r"^[0-9A-Za-z\-_]*antibodies\.tsv$")
    found_files = []
    for dirpath, dirnames, filenames in walk(input_dir):
        for filename in filenames:
            if metadata_filename_pattern.match(filename):
                found_files.append(Path(dirpath) / filename)

    if len(found_files) == 0:
        logger.warning("No antibodies.tsv file found")
        antb_path = None
    else:
        antb_path = found_files[0]
    return antb_path
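A minimal usage sketch for find_antibodies_meta; the directory and filename below are hypothetical and not taken from the repository:

# Assume /data/some_dataset/extras/HBM123-demo-antibodies.tsv exists (hypothetical).
antb_path = find_antibodies_meta(Path("/data/some_dataset"))
if antb_path is None:
    print("proceeding without antibody metadata")
else:
    print(f"using antibody metadata from {antb_path}")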


def sort_by_cycle(antb_path: Path):
    """
    Sorts antibodies.tsv by cycle and channel number, since the original TSV
    is not guaranteed to be in that order.
    """
    df = pd.read_table(antb_path)
    cycle_channel_pattern = re.compile(r"cycle(?P<cycle>\d+)_ch(?P<channel>\d+)", re.IGNORECASE)
    searches = [cycle_channel_pattern.search(v) for v in df["channel_id"]]
    cycles = [int(s.group("cycle")) for s in searches]
    channels = [int(s.group("channel")) for s in searches]
    df.index = [cycles, channels]
    df = df.sort_index()
    return df
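A self-contained sketch of sort_by_cycle on an invented two-row antibodies.tsv (column names follow the code above; the temp file exists only so the function can read a Path, and pathlib/pandas imports are assumed as elsewhere in this file):

import tempfile

tsv = (
    "channel_id\tantibody_name\n"
    "cycle2_ch1\tAnti-CD8 antibody\n"
    "cycle1_ch3\tAnti-CD45 antibody\n"
)
with tempfile.NamedTemporaryFile("w", suffix="antibodies.tsv", delete=False) as f:
    f.write(tsv)
sorted_df = sort_by_cycle(Path(f.name))
# Rows come back ordered by (cycle, channel): cycle1_ch3 first, then cycle2_ch1.
assert sorted_df["channel_id"].tolist() == ["cycle1_ch3", "cycle2_ch1"]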


def get_ch_info_from_antibodies_meta(df: pd.DataFrame) -> Optional[pd.DataFrame]:
    """
    Adds a "target" column containing the cleaned analyte name that will
    replace the provider channel name.
    """
    # df = df.set_index("channel_id", inplace=False)
    antb_names = df["antibody_name"].to_list()
    antb_targets = [get_analyte_name(antb) for antb in antb_names]
    df["target"] = antb_targets
    return df


def get_analyte_name(antibody_name: str) -> str:
    """
    Strips unnecessary prefixes and suffixes (e.g. "Anti-", " antibody")
    from an antibody name taken from antibodies.tsv.
    """
    antb = re.sub(r"Anti-", "", antibody_name)
    antb = re.sub(r"\s+antibody", "", antb)
    return antb
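For example (invented names, not from any dataset), get_analyte_name behaves as follows:

assert get_analyte_name("Anti-CD45 antibody") == "CD45"
assert get_analyte_name("DAPI") == "DAPI"  # names without the prefix/suffix pass through unchanged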


def create_original_channel_names_df(channelList: List[str]) -> pd.DataFrame:
    """
    Creates a dataframe with the original channel names, cycle numbers, and channel numbers.
    """
    # Separate channel and cycle info from channel names and remove "orig"
    cyc_ch_pattern = re.compile(r"cyc(\d+)_ch(\d+)_orig(.*)")
    og_ch_names_df = pd.DataFrame(channelList, columns=["Original_Channel_Name"])
    og_ch_names_df[["Cycle", "Channel", "channel_name"]] = og_ch_names_df[
        "Original_Channel_Name"
    ].str.extract(cyc_ch_pattern)
    og_ch_names_df["Cycle"] = pd.to_numeric(og_ch_names_df["Cycle"])
    og_ch_names_df["Channel"] = pd.to_numeric(og_ch_names_df["Channel"])
    og_ch_names_df["channel_id"] = (
        "cycle"
        + og_ch_names_df["Cycle"].astype(str)
        + "_ch"
        + og_ch_names_df["Channel"].astype(str)
    )

    return og_ch_names_df
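A small check (illustrative only) with channel names in the cyc{N}_ch{M}_orig{name} convention produced by add_cycle_channel_numbers:

og_df = create_original_channel_names_df(["cyc1_ch1_origDAPI", "cyc1_ch2_origCD45"])
assert og_df["channel_id"].tolist() == ["cycle1_ch1", "cycle1_ch2"]
assert og_df["channel_name"].tolist() == ["DAPI", "CD45"]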


def replace_provider_ch_names_with_antb(
    og_ch_names_df: pd.DataFrame, antibodies_df: pd.DataFrame
) -> List[str]:
    """
    Uses the cycle and channel mapping to replace each provider channel name with
    the corresponding name from antibodies.tsv, keeping the original name where
    no mapping exists.
    """
    updated_channel_names = []
    mapping = map_cycles_and_channels(antibodies_df)
    for i in og_ch_names_df.index:
        channel_id = og_ch_names_df.at[i, "channel_id"].lower()
        original_name = og_ch_names_df.at[i, "channel_name"]
        target = mapping.get(channel_id, None)
        if target is not None:
            updated_channel_names.append(target)
        else:
            updated_channel_names.append(original_name)
    return updated_channel_names
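map_cycles_and_channels is not shown in this diff; as an assumption only, a mapping of the shape it would need to return can be built from the "channel_id" and "target" columns created above (hypothetical sketch, not the committed implementation):

def map_cycles_and_channels_sketch(antibodies_df: pd.DataFrame) -> dict:
    # e.g. {"cycle1_ch1": "DAPI", "cycle1_ch2": "CD45"}; keys are lower-cased to
    # match the .lower() lookup in replace_provider_ch_names_with_antb.
    return {
        str(ch_id).lower(): target
        for ch_id, target in zip(antibodies_df["channel_id"], antibodies_df["target"])
    }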


def generate_sa_ch_info(
@@ -280,16 +377,14 @@ def check_dir_is_empty(dir_path: Path):

segmentationFileList = collect_tiff_file_list(cytometryTileDir, TIFF_FILE_NAMING_PATTERN)
extractFileList = collect_tiff_file_list(extractDir, TIFF_FILE_NAMING_PATTERN)
-antb_path = antb_tools.find_antibodies_meta(args.input_data_dir)
+antb_path = find_antibodies_meta(args.input_data_dir)

lateral_resolution = get_lateral_resolution(args.cytokit_config)
-df = antb_tools.sort_by_cycle(antb_path)
-antb_info = antb_tools.get_ch_info_from_antibodies_meta(df)
+df = sort_by_cycle(antb_path)
+antb_info = get_ch_info_from_antibodies_meta(df)
extractChannelNames = collect_expressions_extract_channels(extractFileList[0])
-original_ch_names_df = antb_tools.create_original_channel_names_df(extractChannelNames)
-updated_channel_names = antb_tools.replace_provider_ch_names_with_antb(
-    original_ch_names_df, antb_info
-)
+original_ch_names_df = create_original_channel_names_df(extractChannelNames)
+updated_channel_names = replace_provider_ch_names_with_antb(original_ch_names_df, antb_info)

# Create segmentation mask OME-TIFFs
if segmentationFileList:
26 changes: 22 additions & 4 deletions bin/dataset_info/collect_dataset_info_old.py
@@ -13,8 +13,6 @@
from pprint import pprint
from typing import Dict, List, Optional, Tuple

-from antibodies_tsv_util import antibodies_tsv_util as antb_tools

sys.path.append("/opt")
from pipeline_utils.dataset_listing import get_tile_dtype, get_tile_shape

@@ -29,6 +27,26 @@
logger.addHandler(handler)


def add_cycle_channel_numbers(channel_names: List[str]) -> List[str]:
    """
    Prefixes each channel name with cycle and channel info (cyc{N}_ch{M}_orig{name})
    during the collect-dataset-info step. Replaces a similar function that appended
    a number to the end of duplicate channel names.
    """
    new_names = []
    cycle_count = 1
    channel_count = 1

    for original_name in channel_names:
        new_name = f"cyc{cycle_count}_ch{channel_count}_orig{original_name}"
        new_names.append(new_name)

        channel_count += 1
        if channel_count > 4:  # Assuming 4 channels per cycle, modify accordingly
            channel_count = 1
            cycle_count += 1

    return new_names
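An illustrative call with invented channel names, assuming the default of four channels per cycle:

names = ["DAPI", "CD4", "CD8", "Blank", "DAPI"]
assert add_cycle_channel_numbers(names) == [
    "cyc1_ch1_origDAPI", "cyc1_ch2_origCD4", "cyc1_ch3_origCD8",
    "cyc1_ch4_origBlank", "cyc2_ch1_origDAPI",
]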


def find_files(
base_directory: Path,
filename: str,
@@ -444,7 +462,7 @@ def standardize_metadata(directory: Path, num_concurrent_tasks: int):

# If there are identical channel names, make them unique by adding
# incremental numbers to the end.
-channelNames = antb_tools.add_cycle_channel_numbers(channelNames)
+channelNames = add_cycle_channel_numbers(channelNames)
print(channelNames)

datasetInfo["channel_names"] = channelNames
@@ -459,7 +477,7 @@
for row in csvreader:
ch_names_qc.append(row[0])
qc_vals.append(row[1].strip())
-unique_qc_ch_names = antb_tools.add_cycle_channel_numbers(ch_names_qc)
+unique_qc_ch_names = add_cycle_channel_numbers(ch_names_qc)
for i, ch in enumerate(unique_qc_ch_names):
channel_names_qc_pass[ch] = [qc_vals[i]]
else:
1 change: 0 additions & 1 deletion environment.yml
@@ -23,4 +23,3 @@ dependencies:
- opencv-contrib-python-headless>4.0,<5.0
- pint==0.22
- jsonschema==4.19.0
-  - git+https://github.com/hubmapconsortium/antibodies-tsv-util.git
2 changes: 1 addition & 1 deletion steps/illumination_first_stitching/best_focus.cwl
@@ -3,7 +3,7 @@ class: CommandLineTool

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest
dockerOutputDirectory: "/output"

baseCommand: ["python", "/opt/best_focus/run_best_focus_selection.py"]
@@ -4,7 +4,7 @@ label: Collect dataset info for Cytokit

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest

baseCommand: ["python", "/opt/dataset_info/run_collection.py"]

2 changes: 1 addition & 1 deletion steps/illumination_first_stitching/create_yaml_config.cwl
@@ -4,7 +4,7 @@ label: Create Cytokit experiment config

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest

baseCommand: ["python", "/opt/create_cytokit_config.py"]

2 changes: 1 addition & 1 deletion steps/illumination_first_stitching/first_stitching.cwl
@@ -3,7 +3,7 @@ class: CommandLineTool

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest
dockerOutputDirectory: "/output"

baseCommand: ["python", "/opt/codex_stitching/run_stitching.py"]
@@ -3,7 +3,7 @@ class: CommandLineTool

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest
dockerOutputDirectory: "/output"

baseCommand: ["python", "/opt/illumination_correction/run_illumination_correction.py"]
2 changes: 1 addition & 1 deletion steps/illumination_first_stitching/slicing.cwl
@@ -3,7 +3,7 @@ class: CommandLineTool

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest
dockerOutputDirectory: "/output"

baseCommand: ["python", "/opt/slicing/run_slicing.py"]
2 changes: 1 addition & 1 deletion steps/ometiff_second_stitching/background_subtraction.cwl
@@ -3,7 +3,7 @@ class: CommandLineTool

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest
dockerOutputDirectory: "/output"

baseCommand: ["python", "/opt/background_subtraction/run_background_subtraction.py"]
2 changes: 1 addition & 1 deletion steps/ometiff_second_stitching/ome_tiff_creation.cwl
@@ -4,7 +4,7 @@ label: Create OME-TIFF versions of Cytokit segmentation and extract results

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest

baseCommand: ["python", "/opt/convert_to_ometiff.py"]

2 changes: 1 addition & 1 deletion steps/ometiff_second_stitching/second_stitching.cwl
@@ -3,7 +3,7 @@ class: CommandLineTool

requirements:
DockerRequirement:
-dockerPull: hubmap/codex-scripts
+dockerPull: hubmap/codex-scripts:latest
dockerOutputDirectory: /output

baseCommand: ["python", "/opt/codex_stitching/secondary_stitcher/secondary_stitcher_runner.py"]