Refactor Azimuth (#4)
* updated to support run_azimuth

* azimuth implemented

* gene expression done

* gene expression done

* Finished up azimuth data download cwl/scripts

* Add reference data input

* Remove gene expression (not part of pr)

---------

Co-authored-by: Vicky Daiya <[email protected]>
axdanbol and vickydaiya authored Oct 23, 2023
1 parent 4a10afd commit 5876942
Showing 9 changed files with 156 additions and 31 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -168,3 +168,6 @@ containers/*/context/src
# Temp directories
tmp/
temp/

# DS Store
.DS_Store
4 changes: 3 additions & 1 deletion containers/azimuth/Dockerfile
@@ -1,4 +1,6 @@
FROM hubmap/azimuth-annotate:1.3
FROM satijalab/azimuth:0.4.6

RUN R --no-echo -e "install.packages('rjson', repo='https://cloud.r-project.org')"

COPY context/requirements-freeze.txt .
RUN pip install -r requirements-freeze.txt
8 changes: 8 additions & 0 deletions containers/azimuth/context/download-data.sh
@@ -0,0 +1,8 @@
#!/bin/bash
set -e

OUTPUT_DIR=${1:-"./azimuth"}
MAPPING_FILE=${2:-"/organ-mapping.json"}

mkdir -p "$OUTPUT_DIR"
Rscript /download_reference_data.R "$MAPPING_FILE" "$OUTPUT_DIR"
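
The wrapper script takes the output directory and mapping file as optional positional arguments, then hands them to the R downloader. A rough Python sketch of the same flow; the container paths are the ones from the diff, but the sketch itself is illustrative and not part of the commit:

# Illustrative Python equivalent of download-data.sh (not part of the commit).
# The /download_reference_data.R path only exists inside the container image.
import subprocess
from pathlib import Path

def download_reference_data(output_dir: str = "./azimuth",
                            mapping_file: str = "/organ-mapping.json") -> None:
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["Rscript", "/download_reference_data.R", mapping_file, output_dir],
        check=True,
    )

download_reference_data()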
15 changes: 15 additions & 0 deletions containers/azimuth/context/download_reference_data.R
@@ -0,0 +1,15 @@
library(rjson)
library(Seurat)
library(SeuratData)

args <- commandArgs(trailingOnly = TRUE)
organ_mapping_file <- args[1]
output_dir <- args[2]

# Load unique reference organs
mapping <- fromJSON(file = organ_mapping_file)
references <- unlist(unique(mapping))

# Download and install data
options(timeout=60 * 60) # Probably overkill but the default of 60s is too low for some of the datasets
InstallData(references, lib=output_dir)
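
The R script installs one Azimuth reference per unique value in the organ mapping. A rough Python equivalent of that reduction step, run against the organ-mapping.json updated later in this commit (illustrative only):

# Illustrative only: the same "unique reference organs" reduction the R script
# performs, written in Python against organ-mapping.json from this commit.
import json

with open("organ-mapping.json") as f:
    mapping = json.load(f)           # e.g. {"UBERON:0002113": "kidneyref", ...}

references = sorted(set(mapping.values()))
print(references)                    # ['heartref', 'kidneyref', 'lungref']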
87 changes: 65 additions & 22 deletions containers/azimuth/context/main.py
@@ -1,28 +1,47 @@
import logging
import subprocess
from pathlib import Path
import typing as t

import anndata
import pandas
import write_metadata # type: ignore From azimuth-annotate docker image

from src.algorithm import Algorithm, OrganLookup, add_common_arguments


class AzimuthOptions(t.TypedDict):
reference_data_dir: Path


class AzimuthOrganLookup(OrganLookup[str]):
def __init__(self, mapping_file: Path):
super().__init__(mapping_file)

def get_builtin_options(self):
references = ["RK", "LK", "RL", "LL", "HT"]
# TODO read from mapping file?
references = [
"adiposeref",
"bonemarrowref",
"fetusref",
"heartref",
"humancortexref",
"kidneyref",
"lungref",
"mousecortexref",
"pancreasref",
"pbmcref",
"tonsilref",
]
return zip(references, references)


class AzimuthAlgorithm(Algorithm[str, dict]):
class AzimuthAlgorithm(Algorithm[str, AzimuthOptions]):
def __init__(self):
super().__init__(AzimuthOrganLookup)

def do_run(self, matrix: Path, organ: str, options: dict):
def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):
data = anndata.read_h5ad(matrix)
reference_data = self.find_reference_data(organ, options["reference_data_dir"])

# Azimuth chokes when trying to load matrices that have
# obs columns of dtype 'object'. As a workaround we create a
@@ -32,7 +51,9 @@ def do_run(self, matrix: Path, organ: str, options: dict):
clean_matrix = self.create_clean_matrix(data)
clean_matrix.write_h5ad(clean_matrix_path)

annotated_matrix_path = self.run_azimuth_scripts(clean_matrix_path, organ)
annotated_matrix_path = self.run_azimuth_scripts(
clean_matrix_path, reference_data
)
annotated_matrix = anndata.read_h5ad(annotated_matrix_path)
self.copy_annotations(data, annotated_matrix)

@@ -49,27 +70,49 @@ def copy_annotations(
):
matrix.obs = matrix.obs.join(annotated_matrix.obs)

def run_azimuth_scripts(self, matrix_path: Path, organ: str):
script_outputs = [
"./secondary_analysis.h5ad",
"./version_metadata.json",
"./annotations.csv",
]
script_command = [
"Rscript",
"/azimuth_analysis.R",
organ,
matrix_path,
matrix_path,
]

def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path):
script_command = ["Rscript", "/run_azimuth.R", matrix_path, reference_data]
subprocess.run(script_command, capture_output=True, check=True)
write_metadata.main(*script_outputs)
return script_outputs[0]
return "./result.h5ad"

def find_reference_data(self, organ: str, dir: Path):
def is_reference_data_candidate(path: Path):
return path.is_dir() and organ.lower() in path.name.lower()

if __name__ == "__main__":
return self._find_in_dir(
dir,
is_reference_data_candidate,
f"Cannot find reference data for organ '{organ}'",
f"Multiple reference data candidates for organ '{organ}'",
)

def _find_in_dir(
self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str
):
candidates = list(filter(cond, dir.iterdir()))
candidates.sort(key=lambda path: len(path.name))

if not candidates:
raise ValueError(error_msg)
elif len(candidates) > 1:
logging.warning(warn_msg)
return candidates[0]


def _get_arg_parser():
parser = add_common_arguments()
parser.add_argument(
"--reference-data-dir",
type=Path,
required=True,
help="Path to directory with reference data",
)

return parser


if __name__ == "__main__":
parser = _get_arg_parser()
args = parser.parse_args()
algorithm = AzimuthAlgorithm()
result = algorithm.run(**args.__dict__)
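
find_reference_data and _find_in_dir resolve the reference directory by name: keep subdirectories whose name contains the reference, fail if none match, and warn but take the shortest name if several do. A standalone sketch of that selection rule; the directory names below are hypothetical examples, not taken from the commit:

# Standalone sketch of the selection rule in find_reference_data/_find_in_dir:
# keep directories whose name contains the reference, prefer the shortest name.
# Directory names are hypothetical, for illustration only.
from pathlib import Path
import tempfile

def pick_reference_dir(reference: str, data_dir: Path) -> Path:
    candidates = [p for p in data_dir.iterdir()
                  if p.is_dir() and reference.lower() in p.name.lower()]
    candidates.sort(key=lambda p: len(p.name))
    if not candidates:
        raise ValueError(f"Cannot find reference data for '{reference}'")
    return candidates[0]

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "kidneyref").mkdir()
    (root / "kidneyref.SeuratData").mkdir()
    print(pick_reference_dir("kidneyref", root).name)  # kidneyref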
14 changes: 7 additions & 7 deletions containers/azimuth/context/organ-mapping.json
@@ -1,8 +1,8 @@
{
"UBERON:0002113": "LK",
"UBERON:0004538": "LK",
"UBERON:0004539": "RK",
"UBERON:0002048": "LL",
"UBERON:0001004": "LL",
"UBERON:0000948": "HT"
}
"UBERON:0002113": "kidneyref",
"UBERON:0004538": "kidneyref",
"UBERON:0004539": "kidneyref",
"UBERON:0002048": "lungref",
"UBERON:0001004": "lungref",
"UBERON:0000948": "heartref"
}
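
The mapping now points UBERON organ IDs directly at Azimuth reference names, so both kidney IDs collapse onto a single kidneyref entry where the old codes distinguished LK and RK. A small check against the new mapping (illustrative only):

# Illustrative check: both kidney UBERON IDs now resolve to the same reference.
import json

mapping = json.loads("""{
  "UBERON:0002113": "kidneyref",
  "UBERON:0004538": "kidneyref",
  "UBERON:0004539": "kidneyref",
  "UBERON:0002048": "lungref",
  "UBERON:0001004": "lungref",
  "UBERON:0000948": "heartref"
}""")

assert mapping["UBERON:0004538"] == mapping["UBERON:0004539"] == "kidneyref"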
16 changes: 16 additions & 0 deletions containers/azimuth/context/run_azimuth.R
@@ -0,0 +1,16 @@
library(Azimuth)
library(Seurat)
library(SeuratData)
library(SeuratDisk)

args <- commandArgs(trailingOnly = TRUE)

matrix_path <- args[1]
reference <- args[2]

# Annotate
output_data <- RunAzimuth(matrix_path, reference=reference)

# Save and convert to h5ad
SaveH5Seurat(output_data, 'result.h5seurat')
Convert('result.h5seurat', dest='h5ad')
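
run_azimuth.R writes result.h5seurat and converts it to result.h5ad, which main.py reads back and joins onto the original matrix. A hedged sketch of inspecting that output in Python; the exact obs columns depend on the Azimuth reference used and are not fixed by this commit:

# Hedged sketch: reading the converted output back in Python, as main.py does.
# Which annotation columns appear depends on the Azimuth reference.
import anndata

annotated = anndata.read_h5ad("./result.h5ad")
print(annotated.obs.columns.tolist())   # inspect the transferred annotations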
33 changes: 33 additions & 0 deletions containers/azimuth/download-data.cwl
@@ -0,0 +1,33 @@
#!/usr/bin/env cwl-runner
class: CommandLineTool
cwlVersion: v1.2

requirements:
DockerRequirement:
dockerPull: ghcr.io/hubmapconsortium/hra-workflows/azimuth:main
NetworkAccess:
networkAccess: true

baseCommand: /bin/bash
arguments:
- /download-data.sh

inputs:
outputDirectory:
type: string
label: Output directory for reference data
default: ./azimuth
inputBinding:
position: 0
organMappingFile:
type: File?
label: Organ mapping json file
inputBinding:
position: 1

outputs:
data:
type: Directory
outputBinding:
glob: $(inputs.outputDirectory)
loadListing: deep_listing
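
The CWL tool runs /download-data.sh inside the published container with network access enabled. One way to drive it locally, assuming cwltool and its per-input command-line flags; the runner choice and file paths are assumptions, not part of the commit:

# Assumed local invocation via cwltool; paths are illustrative.
import subprocess

subprocess.run(
    [
        "cwltool",
        "containers/azimuth/download-data.cwl",
        "--outputDirectory", "./azimuth",
        "--organMappingFile", "containers/azimuth/context/organ-mapping.json",
    ],
    check=True,
)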
7 changes: 6 additions & 1 deletion containers/azimuth/options.yml
@@ -1,4 +1,9 @@
type: record
name: options
label: Azimuth specific options
fields: {}
fields:
referenceDataDir:
type: Directory
label: Directory with reference data directories
inputBinding:
prefix: --reference-data-dir
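
The referenceDataDir option surfaces on the container's command line as --reference-data-dir, which _get_arg_parser in main.py consumes. A minimal sketch of that wiring; the argument value is hypothetical:

# Minimal sketch of the flag wiring; the directory value is hypothetical.
from argparse import ArgumentParser
from pathlib import Path

parser = ArgumentParser()
parser.add_argument("--reference-data-dir", type=Path, required=True)
args = parser.parse_args(["--reference-data-dir", "/data/azimuth"])
print(args.reference_data_dir)   # /data/azimuth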
