diff --git a/.gitignore b/.gitignore index 07b0dee..9f3fc76 100644 --- a/.gitignore +++ b/.gitignore @@ -168,3 +168,6 @@ containers/*/context/src # Temp directories tmp/ temp/ + +# DS Store +.DS_Store \ No newline at end of file diff --git a/containers/azimuth/Dockerfile b/containers/azimuth/Dockerfile index c9bd0b2..e3618a7 100644 --- a/containers/azimuth/Dockerfile +++ b/containers/azimuth/Dockerfile @@ -1,4 +1,6 @@ -FROM hubmap/azimuth-annotate:1.3 +FROM satijalab/azimuth:0.4.6 + +RUN R --no-echo -e "install.packages('rjson', repo='https://cloud.r-project.org')" COPY context/requirements-freeze.txt . RUN pip install -r requirements-freeze.txt diff --git a/containers/azimuth/context/download-data.sh b/containers/azimuth/context/download-data.sh new file mode 100644 index 0000000..a22154c --- /dev/null +++ b/containers/azimuth/context/download-data.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +OUTPUT_DIR=${1:-"./azimuth"} +MAPPING_FILE=${2:-"/organ-mapping.json"} + +mkdir -p "$OUTPUT_DIR" +Rscript /download_reference_data.R "$MAPPING_FILE" "$OUTPUT_DIR" diff --git a/containers/azimuth/context/download_reference_data.R b/containers/azimuth/context/download_reference_data.R new file mode 100644 index 0000000..cbc6504 --- /dev/null +++ b/containers/azimuth/context/download_reference_data.R @@ -0,0 +1,15 @@ +library(rjson) +library(Seurat) +library(SeuratData) + +args <- commandArgs(trailingOnly = TRUE) +organ_mapping_file <- args[1] +output_dir <- args[2] + +# Load unique reference organs +mapping <- fromJSON(file = organ_mapping_file) +references <- unlist(unique(mapping)) + +# Download and install data +options(timeout=60 * 60) # Probably overkill but the default of 60s is too low for some of the datasets +InstallData(references, lib=output_dir) diff --git a/containers/azimuth/context/main.py b/containers/azimuth/context/main.py index c443ca3..9474b94 100644 --- a/containers/azimuth/context/main.py +++ b/containers/azimuth/context/main.py @@ -1,28 +1,47 @@ +import 
logging import subprocess from pathlib import Path +import typing as t import anndata import pandas -import write_metadata # type: ignore From azimuth-annotate docker image from src.algorithm import Algorithm, OrganLookup, add_common_arguments +class AzimuthOptions(t.TypedDict): + reference_data_dir: Path + + class AzimuthOrganLookup(OrganLookup[str]): def __init__(self, mapping_file: Path): super().__init__(mapping_file) def get_builtin_options(self): - references = ["RK", "LK", "RL", "LL", "HT"] + # TODO read from mapping file? + references = [ + "adiposeref", + "bonemarrowref", + "fetusref", + "heartref", + "humancortexref", + "kidneyref", + "lungref", + "mousecortexref", + "pancreasref", + "pbmcref", + "tonsilref", + ] return zip(references, references) -class AzimuthAlgorithm(Algorithm[str, dict]): +class AzimuthAlgorithm(Algorithm[str, AzimuthOptions]): def __init__(self): super().__init__(AzimuthOrganLookup) - def do_run(self, matrix: Path, organ: str, options: dict): + def do_run(self, matrix: Path, organ: str, options: AzimuthOptions): data = anndata.read_h5ad(matrix) + reference_data = self.find_reference_data(organ, options["reference_data_dir"]) # Azimuth chokes when trying to load matrices that has # obs columns of dtype 'object'. 
As a workaround we create a @@ -32,7 +51,9 @@ def do_run(self, matrix: Path, organ: str, options: dict): clean_matrix = self.create_clean_matrix(data) clean_matrix.write_h5ad(clean_matrix_path) - annotated_matrix_path = self.run_azimuth_scripts(clean_matrix_path, organ) + annotated_matrix_path = self.run_azimuth_scripts( + clean_matrix_path, reference_data + ) annotated_matrix = anndata.read_h5ad(annotated_matrix_path) self.copy_annotations(data, annotated_matrix) @@ -49,27 +70,49 @@ def copy_annotations( ): matrix.obs = matrix.obs.join(annotated_matrix.obs) - def run_azimuth_scripts(self, matrix_path: Path, organ: str): - script_outputs = [ - "./secondary_analysis.h5ad", - "./version_metadata.json", - "./annotations.csv", - ] - script_command = [ - "Rscript", - "/azimuth_analysis.R", - organ, - matrix_path, - matrix_path, - ] - + def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path): + script_command = ["Rscript", "/run_azimuth.R", matrix_path, reference_data] subprocess.run(script_command, capture_output=True, check=True) - write_metadata.main(*script_outputs) - return script_outputs[0] + return "./result.h5ad" + def find_reference_data(self, organ: str, dir: Path): + def is_reference_data_candidate(path: Path): + return path.is_dir() and organ.lower() in path.name.lower() -if __name__ == "__main__": + return self._find_in_dir( + dir, + is_reference_data_candidate, + f"Cannot find reference data for organ '{organ}'", + f"Multiple reference data candidates for organ '{organ}'", + ) + + def _find_in_dir( + self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str + ): + candidates = list(filter(cond, dir.iterdir())) + candidates.sort(key=lambda path: len(path.name)) + + if not candidates: + raise ValueError(error_msg) + elif len(candidates) > 1: + logging.warning(warn_msg) + return candidates[0] + + +def _get_arg_parser(): parser = add_common_arguments() + parser.add_argument( + "--reference-data-dir", + type=Path, + 
required=True, + help="Path to directory with reference data", + ) + + return parser + + +if __name__ == "__main__": + parser = _get_arg_parser() args = parser.parse_args() algorithm = AzimuthAlgorithm() result = algorithm.run(**args.__dict__) diff --git a/containers/azimuth/context/organ-mapping.json b/containers/azimuth/context/organ-mapping.json index 64c5588..98c3f9c 100644 --- a/containers/azimuth/context/organ-mapping.json +++ b/containers/azimuth/context/organ-mapping.json @@ -1,8 +1,8 @@ { - "UBERON:0002113": "LK", - "UBERON:0004538": "LK", - "UBERON:0004539": "RK", - "UBERON:0002048": "LL", - "UBERON:0001004": "LL", - "UBERON:0000948": "HT" -} \ No newline at end of file + "UBERON:0002113": "kidneyref", + "UBERON:0004538": "kidneyref", + "UBERON:0004539": "kidneyref", + "UBERON:0002048": "lungref", + "UBERON:0001004": "lungref", + "UBERON:0000948": "heartref" +} diff --git a/containers/azimuth/context/run_azimuth.R b/containers/azimuth/context/run_azimuth.R new file mode 100644 index 0000000..231b1ee --- /dev/null +++ b/containers/azimuth/context/run_azimuth.R @@ -0,0 +1,16 @@ +library(Azimuth) +library(Seurat) +library(SeuratData) +library(SeuratDisk) + +args <- commandArgs(trailingOnly = TRUE) + +matrix_path <- args[1] +reference <- args[2] + +# Annotate +output_data <- RunAzimuth(matrix_path, reference=reference) + +# Save and convert to h5ad +SaveH5Seurat(output_data, 'result.h5seurat') +Convert('result.h5seurat', dest='h5ad') diff --git a/containers/azimuth/download-data.cwl b/containers/azimuth/download-data.cwl new file mode 100644 index 0000000..eadc250 --- /dev/null +++ b/containers/azimuth/download-data.cwl @@ -0,0 +1,33 @@ +#!/usr/bin/env cwl-runner +class: CommandLineTool +cwlVersion: v1.2 + +requirements: + DockerRequirement: + dockerPull: ghcr.io/hubmapconsortium/hra-workflows/azimuth:main + NetworkAccess: + networkAccess: true + +baseCommand: /bin/bash +arguments: + - /download-data.sh + +inputs: + outputDirectory: + type: string + label: 
Output directory for reference data + default: ./azimuth + inputBinding: + position: 0 + organMappingFile: + type: File? + label: Organ mapping json file + inputBinding: + position: 1 + +outputs: + data: + type: Directory + outputBinding: + glob: $(inputs.outputDirectory) + loadListing: deep_listing diff --git a/containers/azimuth/options.yml b/containers/azimuth/options.yml index b10f044..d78b252 100644 --- a/containers/azimuth/options.yml +++ b/containers/azimuth/options.yml @@ -1,4 +1,9 @@ type: record name: options label: Azimuth specific options -fields: {} +fields: + referenceDataDir: + type: Directory + label: Directory with reference data directories + inputBinding: + prefix: --reference-data-dir