From e907bbd5b7f324d867a3a6cfdacfe33c54973371 Mon Sep 17 00:00:00 2001 From: Qirong Mao Date: Wed, 13 Dec 2023 10:27:53 +0000 Subject: [PATCH 1/4] Feature selection by Moran'I score --- .../spatially_variable_genes_scanpy.py | 125 ++++++++++++++++++ .../spatially_variable_genes_scanpy.yml | 8 ++ 2 files changed, 133 insertions(+) create mode 100755 preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py create mode 100644 preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.yml diff --git a/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py new file mode 100755 index 00000000..f47808db --- /dev/null +++ b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python + +# Author_and_contribution: Qirong Mao; created script + +import argparse + +# TODO adjust description +parser = argparse.ArgumentParser(description="SVG selection using Squidpy (Moran's I)") + +parser.add_argument( + "-c", "--coordinates", help="Path to coordinates (as tsv).", required=True +) +parser.add_argument( + "-m", "--matrix", help="Path to (transformed) counts (as mtx).", required=True +) +parser.add_argument( + "-f", "--features", help="Path to features (as tsv).", required=True +) +parser.add_argument( + "-o", "--observations", help="Path to observations (as tsv).", required=True +) +parser.add_argument( + "-n", "--n_top_genes", help="Number of genes to keep (Default:3000).", required=False, type=int + ) + +parser.add_argument("-d", "--out_dir", help="Output directory.", required=True) + +parser.add_argument( + "--config", + help="Optional config file (json) used to pass additional parameters.", + required=False, +) + +args = parser.parse_args() + +from pathlib import Path +import pandas as pd + +out_dir = Path(args.out_dir) + +# Output files +feature_selection_file = out_dir / "features_MoranI.tsv" +# if additional output files are required write it also to out_dir + +# Use these filepaths and inputs ... +coord_file = args.coordinates +matrix_file = args.matrix +feature_file = args.features +observation_file = args.observations + + +if args.n_top_genes is not None: + n_top_genes = int(args.n_top_genes) +else: + n_top_genes= 3000 ### Default: 3000 genes + + +if args.config is not None: + config_file = args.config + + +# ... or AnnData if you want +def get_anndata(args): + # Untested template + import anndata as ad + import pandas as pd + import scipy as sp + X = sp.io.mmread(args.matrix) + if sp.sparse.issparse(X): + X = X.tocsr() + observations = pd.read_table(args.observations, index_col=0) + features = pd.read_table(args.features, index_col=0) + coordinates = ( + pd.read_table(args.coordinates, index_col=0) + .loc[observations.index, :] + .to_numpy() + ) + + adata = ad.AnnData( + X=X, obs=observations, var=features, obsm={"spatial": coordinates} + ) + + return adata + + +adata = get_anndata(args) + +## Your code goes here +import scanpy as sc +import squidpy as sq + +## Warning if the input dataset has less features than the number of genes want to keep + +n_input_features = len(adata.var) + +if n_input_features < n_top_genes: + raise ValueError('Input data has less features than the number of genes you want to keep, please clarify the numbers of genes you need to keep (--n_top_genes)') + +features_df = adata.var.copy() + +## Log Normalization + +sc.pp.normalize_total(adata) +sc.pp.log1p(adata) + +## Compute the graph based from Delaunay triangulation +sq.gr.spatial_neighbors(adata,coord_type="generic",delaunay= True) +## Calculate Moran's I score for each gene +sq.gr.spatial_autocorr(adata, mode="moran", genes=adata.var_names,show_progress_bar=True) + + +## Selecting top spatially variable genes based on Moran's I score +SVG = ( + adata.uns["moranI"]["I"].sort_values(ascending=False).head(n_top_genes).index.tolist() +) + +### Create boolean indication to specify spatially variable genes +adata.var['spatially_variable'] = adata.var.index.isin(SVG) + +features_df["spatially_variable"] = adata.var["spatially_variable"] + + +## Write output +out_dir.mkdir(parents=True, exist_ok=True) +features_df.to_csv(feature_selection_file, sep="\t", index_label="") diff --git a/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.yml b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.yml new file mode 100644 index 00000000..06372f3d --- /dev/null +++ b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge +dependencies: + - python=3.9.18 + - scanpy=1.9.6 + - pip + - pip: + - squidpy==1.3.1 \ No newline at end of file From fa9f6f116733679e54882d6ba460d5631f649714 Mon Sep 17 00:00:00 2001 From: Qirong Mao <57286623+Qirongmao97@users.noreply.github.com> Date: Wed, 13 Dec 2023 11:59:15 +0100 Subject: [PATCH 2/4] Update spatially_variable_genes_scanpy.py - Moved default gene numbers to Argparser - Removed normalization steps --- .../spatially_variable_genes_scanpy.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py index f47808db..b9de5084 100755 --- a/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py +++ b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py @@ -20,8 +20,7 @@ "-o", "--observations", help="Path to observations (as tsv).", required=True ) parser.add_argument( - "-n", "--n_top_genes", help="Number of genes to keep (Default:3000).", required=False, type=int - ) + "-n", "--n_top_genes", help="Number of genes to keep (Default:3000).", required=False, type=int, default=3000) parser.add_argument("-d", "--out_dir", help="Output directory.", required=True) @@ -51,8 +50,6 @@ if args.n_top_genes is not None: n_top_genes = int(args.n_top_genes) -else: - n_top_genes= 3000 ### Default: 3000 genes if args.config is not None: @@ -98,11 +95,6 @@ def get_anndata(args): features_df = adata.var.copy() -## Log Normalization - -sc.pp.normalize_total(adata) -sc.pp.log1p(adata) - ## Compute the graph based from Delaunay triangulation sq.gr.spatial_neighbors(adata,coord_type="generic",delaunay= True) ## Calculate Moran's I score for each gene From 6fef445206d954526da1a72364c4d660be7ce2ca Mon Sep 17 00:00:00 2001 From: Qirong Mao <57286623+Qirongmao97@users.noreply.github.com> Date: Wed, 13 Dec 2023 13:53:37 +0100 Subject: [PATCH 3/4] Removed if args.n_top is None --- .../spatially_variable_genes_scanpy.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py index b9de5084..7fd200f6 100755 --- a/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py +++ b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py @@ -46,12 +46,7 @@ matrix_file = args.matrix feature_file = args.features observation_file = args.observations - - -if args.n_top_genes is not None: - n_top_genes = int(args.n_top_genes) - - + if args.config is not None: config_file = args.config From 7fafb2015493722b8f4a8b48b354b47cd24882a5 Mon Sep 17 00:00:00 2001 From: Qirong Mao <57286623+Qirongmao97@users.noreply.github.com> Date: Wed, 13 Dec 2023 14:36:54 +0100 Subject: [PATCH 4/4] Aligning with the output of /preprocessing/neibours Replacing the neibourhood calculation by using the output of squidpy.gr.spatial_neighbors from upstream preprocessing script --- .../spatially_variable_genes_scanpy.py | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py index 7fd200f6..e8334228 100755 --- a/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py +++ b/preprocessing/feature_selection_MoranI/spatially_variable_genes_scanpy.py @@ -16,6 +16,12 @@ parser.add_argument( "-f", "--features", help="Path to features (as tsv).", required=True ) +parser.add_argument( + "-sc", "--spatial_connectivities", help="Path to spatial connectivities (as mtx).", required=True +) +parser.add_argument( + "-sd", "--spatial_distance", help="Path to spatial distance (as mtx).", required=True +) parser.add_argument( "-o", "--observations", help="Path to observations (as tsv).", required=True ) @@ -37,7 +43,7 @@ out_dir = Path(args.out_dir) -# Output files +# Output files feature_selection_file = out_dir / "features_MoranI.tsv" # if additional output files are required write it also to out_dir @@ -45,6 +51,8 @@ coord_file = args.coordinates matrix_file = args.matrix feature_file = args.features +spatial_connectivities_file = args.spatial_connectivities +spatial_distance_file = args.spatial_distance observation_file = args.observations if args.config is not None: @@ -57,9 +65,20 @@ def get_anndata(args): import anndata as ad import pandas as pd import scipy as sp + X = sp.io.mmread(args.matrix) if sp.sparse.issparse(X): X = X.tocsr() + + ## Output files of sq.gr.spatial_neighbors + SC = sp.io.mmread(args.spatial_connectivities) + if sp.sparse.issparse(SC): + SC = SC.tocsr() + + SD = sp.io.mmread(args.spatial_distance) + if sp.sparse.issparse(SD): + SD = SD.tocsr() + observations = pd.read_table(args.observations, index_col=0) features = pd.read_table(args.features, index_col=0) coordinates = ( @@ -69,7 +88,7 @@ def get_anndata(args): ) adata = ad.AnnData( - X=X, obs=observations, var=features, obsm={"spatial": coordinates} + X=X, obs=observations, var=features, obsm={"spatial": coordinates},obsp={"spatial_connectivities":SC,"spatial_distances":SD} ) return adata @@ -90,8 +109,6 @@ def get_anndata(args): features_df = adata.var.copy() -## Compute the graph based from Delaunay triangulation -sq.gr.spatial_neighbors(adata,coord_type="generic",delaunay= True) ## Calculate Moran's I score for each gene sq.gr.spatial_autocorr(adata, mode="moran", genes=adata.var_names,show_progress_bar=True)