Refactor Azimuth (#4)
* updated to support run_azimuth

* azimuth implemented

* gene expression done

* gene expression done

* Finished up azimuth data download cwl/scripts

* Add reference data input

* Remove gene expression (not part of pr)

---------

Co-authored-by: Vicky Daiya <[email protected]>
axdanbol and vickydaiya authored Oct 23, 2023
1 parent 4a10afd commit 5876942
Showing 9 changed files with 156 additions and 31 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -168,3 +168,6 @@ containers/*/context/src
# Temp directories
tmp/
temp/

# DS Store
.DS_Store
4 changes: 3 additions & 1 deletion containers/azimuth/Dockerfile
@@ -1,4 +1,6 @@
FROM hubmap/azimuth-annotate:1.3
FROM satijalab/azimuth:0.4.6

RUN R --no-echo -e "install.packages('rjson', repo='https://cloud.r-project.org')"

COPY context/requirements-freeze.txt .
RUN pip install -r requirements-freeze.txt
8 changes: 8 additions & 0 deletions containers/azimuth/context/download-data.sh
@@ -0,0 +1,8 @@
#!/bin/bash
set -e

OUTPUT_DIR=${1:-"./azimuth"}
MAPPING_FILE=${2:-"/organ-mapping.json"}

mkdir -p "$OUTPUT_DIR"
Rscript /download_reference_data.R "$MAPPING_FILE" "$OUTPUT_DIR"
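
The wrapper script takes the output directory and mapping file as optional positional arguments, then hands them to the R downloader. A rough Python sketch of the same flow; the container paths are the ones from the diff, but the sketch itself is illustrative and not part of the commit:

# Illustrative Python equivalent of download-data.sh (not part of the commit).
# The /download_reference_data.R path only exists inside the container image.
import subprocess
from pathlib import Path

def download_reference_data(output_dir: str = "./azimuth",
                            mapping_file: str = "/organ-mapping.json") -> None:
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["Rscript", "/download_reference_data.R", mapping_file, output_dir],
        check=True,
    )

download_reference_data()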
15 changes: 15 additions & 0 deletions containers/azimuth/context/download_reference_data.R
@@ -0,0 +1,15 @@
library(rjson)
library(Seurat)
library(SeuratData)

args <- commandArgs(trailingOnly = TRUE)
organ_mapping_file <- args[1]
output_dir <- args[2]

# Load unique reference organs
mapping <- fromJSON(file = organ_mapping_file)
references <- unlist(unique(mapping))

# Download and install data
options(timeout=60 * 60) # Probably overkill but the default of 60s is too low for some of the datasets
InstallData(references, lib=output_dir)
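
The R script installs one Azimuth reference per unique value in the organ mapping. A rough Python equivalent of that reduction step, run against the organ-mapping.json updated later in this commit (illustrative only):

# Illustrative only: the same "unique reference organs" reduction the R script
# performs, written in Python against organ-mapping.json from this commit.
import json

with open("organ-mapping.json") as f:
    mapping = json.load(f)           # e.g. {"UBERON:0002113": "kidneyref", ...}

references = sorted(set(mapping.values()))
print(references)                    # ['heartref', 'kidneyref', 'lungref']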
87 changes: 65 additions & 22 deletions containers/azimuth/context/main.py
@@ -1,28 +1,47 @@
import logging
import subprocess
from pathlib import Path
import typing as t

import anndata
import pandas
import write_metadata # type: ignore From azimuth-annotate docker image

from src.algorithm import Algorithm, OrganLookup, add_common_arguments


class AzimuthOptions(t.TypedDict):
reference_data_dir: Path


class AzimuthOrganLookup(OrganLookup[str]):
def __init__(self, mapping_file: Path):
super().__init__(mapping_file)

def get_builtin_options(self):
references = ["RK", "LK", "RL", "LL", "HT"]
# TODO read from mapping file?
references = [
"adiposeref",
"bonemarrowref",
"fetusref",
"heartref",
"humancortexref",
"kidneyref",
"lungref",
"mousecortexref",
"pancreasref",
"pbmcref",
"tonsilref",
]
return zip(references, references)


class AzimuthAlgorithm(Algorithm[str, dict]):
class AzimuthAlgorithm(Algorithm[str, AzimuthOptions]):
def __init__(self):
super().__init__(AzimuthOrganLookup)

def do_run(self, matrix: Path, organ: str, options: dict):
def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):
data = anndata.read_h5ad(matrix)
reference_data = self.find_reference_data(organ, options["reference_data_dir"])

# Azimuth chokes when trying to load matrices that have
# obs columns of dtype 'object'. As a workaround we create a
@@ -32,7 +51,9 @@ def do_run(self, matrix: Path, organ: str, options: dict):
clean_matrix = self.create_clean_matrix(data)
clean_matrix.write_h5ad(clean_matrix_path)

annotated_matrix_path = self.run_azimuth_scripts(clean_matrix_path, organ)
annotated_matrix_path = self.run_azimuth_scripts(
clean_matrix_path, reference_data
)
annotated_matrix = anndata.read_h5ad(annotated_matrix_path)
self.copy_annotations(data, annotated_matrix)

@@ -49,27 +70,49 @@ def copy_annotations(
):
matrix.obs = matrix.obs.join(annotated_matrix.obs)

def run_azimuth_scripts(self, matrix_path: Path, organ: str):
script_outputs = [
"./secondary_analysis.h5ad",
"./version_metadata.json",
"./annotations.csv",
]
script_command = [
"Rscript",
"/azimuth_analysis.R",
organ,
matrix_path,
matrix_path,
]

def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path):
script_command = ["Rscript", "/run_azimuth.R", matrix_path, reference_data]
subprocess.run(script_command, capture_output=True, check=True)
write_metadata.main(*script_outputs)
return script_outputs[0]
return "./result.h5ad"

def find_reference_data(self, organ: str, dir: Path):
def is_reference_data_candidate(path: Path):
return path.is_dir() and organ.lower() in path.name.lower()

if __name__ == "__main__":
return self._find_in_dir(
dir,
is_reference_data_candidate,
f"Cannot find reference data for organ '{organ}'",
f"Multiple reference data candidates for organ '{organ}'",
)

def _find_in_dir(
self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str
):
candidates = list(filter(cond, dir.iterdir()))
candidates.sort(key=lambda path: len(path.name))

if not candidates:
raise ValueError(error_msg)
elif len(candidates) > 1:
logging.warning(warn_msg)
return candidates[0]


def _get_arg_parser():
parser = add_common_arguments()
parser.add_argument(
"--reference-data-dir",
type=Path,
required=True,
help="Path to directory with reference data",
)

return parser


if __name__ == "__main__":
parser = _get_arg_parser()
args = parser.parse_args()
algorithm = AzimuthAlgorithm()
result = algorithm.run(**args.__dict__)
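
find_reference_data and _find_in_dir resolve the reference directory by name: keep subdirectories whose name contains the reference, fail if none match, and warn but take the shortest name if several do. A standalone sketch of that selection rule; the directory names below are hypothetical examples, not taken from the commit:

# Standalone sketch of the selection rule in find_reference_data/_find_in_dir:
# keep directories whose name contains the reference, prefer the shortest name.
# Directory names are hypothetical, for illustration only.
from pathlib import Path
import tempfile

def pick_reference_dir(reference: str, data_dir: Path) -> Path:
    candidates = [p for p in data_dir.iterdir()
                  if p.is_dir() and reference.lower() in p.name.lower()]
    candidates.sort(key=lambda p: len(p.name))
    if not candidates:
        raise ValueError(f"Cannot find reference data for '{reference}'")
    return candidates[0]

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "kidneyref").mkdir()
    (root / "kidneyref.SeuratData").mkdir()
    print(pick_reference_dir("kidneyref", root).name)  # kidneyref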
14 changes: 7 additions & 7 deletions containers/azimuth/context/organ-mapping.json
@@ -1,8 +1,8 @@
{
"UBERON:0002113": "LK",
"UBERON:0004538": "LK",
"UBERON:0004539": "RK",
"UBERON:0002048": "LL",
"UBERON:0001004": "LL",
"UBERON:0000948": "HT"
}
"UBERON:0002113": "kidneyref",
"UBERON:0004538": "kidneyref",
"UBERON:0004539": "kidneyref",
"UBERON:0002048": "lungref",
"UBERON:0001004": "lungref",
"UBERON:0000948": "heartref"
}
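
The mapping now points UBERON organ IDs directly at Azimuth reference names, so both kidney IDs collapse onto a single kidneyref entry where the old codes distinguished LK and RK. A small check against the new mapping (illustrative only):

# Illustrative check: both kidney UBERON IDs now resolve to the same reference.
import json

mapping = json.loads("""{
  "UBERON:0002113": "kidneyref",
  "UBERON:0004538": "kidneyref",
  "UBERON:0004539": "kidneyref",
  "UBERON:0002048": "lungref",
  "UBERON:0001004": "lungref",
  "UBERON:0000948": "heartref"
}""")

assert mapping["UBERON:0004538"] == mapping["UBERON:0004539"] == "kidneyref"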
16 changes: 16 additions & 0 deletions containers/azimuth/context/run_azimuth.R
@@ -0,0 +1,16 @@
library(Azimuth)
library(Seurat)
library(SeuratData)
library(SeuratDisk)

args <- commandArgs(trailingOnly = TRUE)

matrix_path <- args[1]
reference <- args[2]

# Annotate
output_data <- RunAzimuth(matrix_path, reference=reference)

# Save and convert to h5ad
SaveH5Seurat(output_data, 'result.h5seurat')
Convert('result.h5seurat', dest='h5ad')
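
run_azimuth.R writes result.h5seurat and converts it to result.h5ad, which main.py reads back and joins onto the original matrix. A hedged sketch of inspecting that output in Python; the exact obs columns depend on the Azimuth reference used and are not fixed by this commit:

# Hedged sketch: reading the converted output back in Python, as main.py does.
# Which annotation columns appear depends on the Azimuth reference.
import anndata

annotated = anndata.read_h5ad("./result.h5ad")
print(annotated.obs.columns.tolist())   # inspect the transferred annotations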
33 changes: 33 additions & 0 deletions containers/azimuth/download-data.cwl
@@ -0,0 +1,33 @@
#!/usr/bin/env cwl-runner
class: CommandLineTool
cwlVersion: v1.2

requirements:
DockerRequirement:
dockerPull: ghcr.io/hubmapconsortium/hra-workflows/azimuth:main
NetworkAccess:
networkAccess: true

baseCommand: /bin/bash
arguments:
- /download-data.sh

inputs:
outputDirectory:
type: string
label: Output directory for reference data
default: ./azimuth
inputBinding:
position: 0
organMappingFile:
type: File?
label: Organ mapping json file
inputBinding:
position: 1

outputs:
data:
type: Directory
outputBinding:
glob: $(inputs.outputDirectory)
loadListing: deep_listing
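
The CWL tool runs /download-data.sh inside the published container with network access enabled. One way to drive it locally, assuming cwltool and its per-input command-line flags; the runner choice and file paths are assumptions, not part of the commit:

# Assumed local invocation via cwltool; paths are illustrative.
import subprocess

subprocess.run(
    [
        "cwltool",
        "containers/azimuth/download-data.cwl",
        "--outputDirectory", "./azimuth",
        "--organMappingFile", "containers/azimuth/context/organ-mapping.json",
    ],
    check=True,
)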
7 changes: 6 additions & 1 deletion containers/azimuth/options.yml
@@ -1,4 +1,9 @@
type: record
name: options
label: Azimuth specific options
fields: {}
fields:
referenceDataDir:
type: Directory
label: Directory with reference data directories
inputBinding:
prefix: --reference-data-dir
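
The referenceDataDir option surfaces on the container's command line as --reference-data-dir, which _get_arg_parser in main.py consumes. A minimal sketch of that wiring; the argument value is hypothetical:

# Minimal sketch of the flag wiring; the directory value is hypothetical.
from argparse import ArgumentParser
from pathlib import Path

parser = ArgumentParser()
parser.add_argument("--reference-data-dir", type=Path, required=True)
args = parser.parse_args(["--reference-data-dir", "/data/azimuth"])
print(args.reference_data_dir)   # /data/azimuth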
