diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml index 8f413a4..b42df37 100644 --- a/.github/workflows/build-containers.yml +++ b/.github/workflows/build-containers.yml @@ -2,7 +2,7 @@ name: Build Containers on: push: - branches: ['main'] + branches: ["main"] env: REGISTRY: ghcr.io @@ -15,8 +15,16 @@ jobs: packages: write strategy: matrix: - container: [azimuth, celltypist, extract-summary, popv, preprocess] - + container: + [ + azimuth, + celltypist, + extract-summary, + gene-expression, + popv, + preprocess, + ] + steps: - name: Checkout repository uses: actions/checkout@v2 diff --git a/containers/gene-expression/Dockerfile b/containers/gene-expression/Dockerfile new file mode 100644 index 0000000..19feaee --- /dev/null +++ b/containers/gene-expression/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10 + +COPY context/requirements-freeze.txt . +RUN pip install -r requirements-freeze.txt + +COPY context/ . + +CMD ["python", "/main.py"] \ No newline at end of file diff --git a/containers/gene-expression/context/main.py b/containers/gene-expression/context/main.py new file mode 100644 index 0000000..ec539f4 --- /dev/null +++ b/containers/gene-expression/context/main.py @@ -0,0 +1,107 @@ +import argparse +import anndata +import numpy as np +import scanpy as sc +import pandas as pd +from pathlib import Path + +MIN_CELLS_PER_CT = 2 # need a filter to reomve CTs with one cell as sc.tl.rank_genes_groups gives an error + + +def filter_matrix(matrix: anndata.AnnData, annotation_column: str): + """filters an anndata matrix to only include cells which are annotated to a cell type with more than MIN_CELLS_PER_CT""" + ct_counts = matrix.obs[annotation_column].value_counts() + valid_cts = ct_counts.index[ct_counts >= MIN_CELLS_PER_CT] + mask = np.isin(matrix.obs[annotation_column], valid_cts) + return matrix[mask, :] + + +def format_marker_genes_df(df: pd.DataFrame, annotation_column: str): + """formats the output from sc.tl.rank_genes_groups to a dataframe with celltype as one column and list of marker genes as other""" + df = df.transpose() + df["marker_genes"] = df.apply(lambda row: row.tolist(), axis=1) + df = df["marker_genes"].rename_axis(annotation_column).reset_index() + return df + + +def get_mean_expr_value(matrix, annotation_column, cell_type, gene): + """gets the mean expression value for a cell type and a gene""" + cell_indices = [ + matrix.obs.index.get_loc(cell_index) + for cell_index in matrix.obs[matrix.obs[annotation_column] == cell_type].index + ] + mean_expr = matrix.X[cell_indices, matrix.var.index.get_loc(gene)].mean() + return mean_expr + + +def get_marker_genes_with_expr(matrix, annotation_column, cell_type, marker_genes): + """gets the mean expression values for all marker genes""" + output = [] + for gene in marker_genes: + output.append( + { + "gene_label": gene, + "mean_gene_expr_value": get_mean_expr_value( + matrix, annotation_column, cell_type, gene + ), + } + ) + return output + + +def get_gene_expr( + matrix: anndata.AnnData, annotation_column: str, gene_expr_column: str +): + """gets the marker genes and mean expression values for all cells in the anndata matrix""" + + matrix.raw = matrix # for getting gene names as output for sc.tl.rank_genes_groups + filtered_matrix = filter_matrix(matrix, annotation_column) + sc.tl.rank_genes_groups(filtered_matrix, groupby=annotation_column, n_genes=10) + ct_marker_genes_df = format_marker_genes_df( + pd.DataFrame(filtered_matrix.uns["rank_genes_groups"]["names"]), + annotation_column, + ) + + ct_marker_genes_df[gene_expr_column] = ct_marker_genes_df.apply( + lambda row: get_marker_genes_with_expr( + matrix, annotation_column, row[annotation_column], row["marker_genes"] + ), + axis=1, + ) + merged_obs = matrix.obs.merge( + ct_marker_genes_df[[annotation_column, gene_expr_column]], how="left" + ) + merged_obs[gene_expr_column] = merged_obs[gene_expr_column].astype(str) + merged_obs.index = matrix.obs.index + matrix.obs = merged_obs + return matrix + + +def main(args: argparse.Namespace): + matrix = get_gene_expr(args.matrix, args.annotation_column, args.gene_expr_column) + matrix.write_h5ad(args.output_matrix) + + +def _get_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Add gene expressions to h5ad data") + parser.add_argument("matrix", type=anndata.read_h5ad, help="h5ad data file") + parser.add_argument( + "--annotation-column", required=True, help="Column with annotations" + ) + parser.add_argument( + "--gene-expr-column", default="gene_expr", help="Column for gene_expr" + ) + parser.add_argument( + "--output-matrix", + type=Path, + default="matrix_with_gene_expr.h5ad", + help="matrix with gene expressions output path", + ) + + return parser + + +if __name__ == "__main__": + parser = _get_arg_parser() + args = parser.parse_args() + main(args) diff --git a/containers/gene-expression/context/requirements-freeze.txt b/containers/gene-expression/context/requirements-freeze.txt new file mode 100644 index 0000000..d4c4bf7 --- /dev/null +++ b/containers/gene-expression/context/requirements-freeze.txt @@ -0,0 +1,35 @@ +anndata==0.10.2 +array-api-compat==1.4 +contourpy==1.1.1 +cycler==0.12.1 +fonttools==4.43.1 +h5py==3.10.0 +joblib==1.3.2 +kiwisolver==1.4.5 +llvmlite==0.41.1 +matplotlib==3.8.0 +natsort==8.4.0 +networkx==3.2 +numba==0.58.1 +numpy==1.26.1 +packaging==23.2 +pandas==2.1.1 +pathlib==1.0.1 +patsy==0.5.3 +Pillow==10.1.0 +pynndescent==0.5.10 +pyparsing==3.1.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +scanpy==1.9.5 +scikit-learn==1.3.2 +scipy==1.11.3 +seaborn==0.13.0 +session-info==1.0.0 +six==1.16.0 +statsmodels==0.14.0 +stdlib-list==0.9.0 +threadpoolctl==3.2.0 +tqdm==4.66.1 +tzdata==2023.3 +umap-learn==0.5.4 diff --git a/containers/gene-expression/context/requirements.txt b/containers/gene-expression/context/requirements.txt new file mode 100644 index 0000000..039f867 --- /dev/null +++ b/containers/gene-expression/context/requirements.txt @@ -0,0 +1,4 @@ +anndata +numpy +scanpy +pandas \ No newline at end of file diff --git a/containers/gene-expression/options.yml b/containers/gene-expression/options.yml new file mode 100644 index 0000000..248686e --- /dev/null +++ b/containers/gene-expression/options.yml @@ -0,0 +1,14 @@ +type: record +name: options +label: Gene expression options +fields: + annotationColumn: + type: string + label: Annotation column + inputBinding: + prefix: --annotation-column + geneExprColumn: + type: string + label: Gene expression column + inputBinding: + prefix: --gene-expr-column diff --git a/containers/gene-expression/pipeline.cwl b/containers/gene-expression/pipeline.cwl new file mode 100644 index 0000000..807793c --- /dev/null +++ b/containers/gene-expression/pipeline.cwl @@ -0,0 +1,28 @@ +#!/usr/bin/env cwl-runner +class: CommandLineTool +cwlVersion: v1.0 + +requirements: + DockerRequirement: + dockerPull: ghcr.io/hubmapconsortium/hra-workflows/gene-expression:main + SchemaDefRequirement: + types: + - $import: ./options.yml + +baseCommand: python +arguments: + - /main.py + +inputs: + matrix: + type: File + label: Data to get gene expression for in h5ad format + inputBinding: + position: 0 + options: ./options.yml#options + +outputs: + matrix_with_gene_expr: + type: File + outputBinding: + glob: matrix_with_gene_expr.h5ad