From f1c40faefbda3d62c6e7aa9f1baafade82cc24ab Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Tue, 17 Sep 2024 18:38:28 +0200 Subject: [PATCH 1/5] convert cell_indices str->array dict to a csr matrix before storing the anndata result --- src/scirpy/tl/_clonotypes.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/scirpy/tl/_clonotypes.py b/src/scirpy/tl/_clonotypes.py index 59e5cc94f..1116102c6 100644 --- a/src/scirpy/tl/_clonotypes.py +++ b/src/scirpy/tl/_clonotypes.py @@ -332,10 +332,28 @@ def define_clonotype_clusters( clonotype_cluster_series = pd.Series(values, index=idx).reindex(params.adata.obs_names) clonotype_cluster_size_series = clonotype_cluster_series.groupby(clonotype_cluster_series).transform("count") + def convert_str_array_dict_to_csr(str_array_dict: dict[str, np.ndarray[str]]) -> sp.csr_matrix: + num_rows = len(str_array_dict) + + data_arrays = [np.array([])] * num_rows + indices_arrays = [np.array([])] * num_rows + nnz_array = np.zeros(num_rows) + + for key_str, value in str_array_dict.items(): + key = int(key_str) + data_arrays[key] = value.astype(int) + indices_arrays[key] = np.array(range(0,len(value))) + nnz_array[key] = len(value) + + data = np.concatenate(data_arrays) + indices = np.concatenate(indices_arrays) + indptr = np.concatenate([np.array([0]), np.cumsum(nnz_array)]) + return sp.csr_matrix((data, indices, indptr)) + # Return or store results clonotype_distance_res = { "distances": clonotype_dist, - "cell_indices": ctn.cell_indices, + "cell_indices": convert_str_array_dict_to_csr(ctn.cell_indices), } if inplace: params.set_obs(key_added, clonotype_cluster_series) From 97e13b75b27398fdbace516403c37091424f02ab Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Wed, 18 Sep 2024 19:09:38 +0200 Subject: [PATCH 2/5] save cell_indices as json format --- src/scirpy/ir_dist/_clonotype_neighbors.py | 2 +- src/scirpy/pl/_clonotypes.py | 4 +++- src/scirpy/tests/test_ir_query.py | 10 ++++++++-- 
src/scirpy/tl/_clonotypes.py | 15 +++++++++------ src/scirpy/tl/_ir_query.py | 18 +++++++++++------- src/scirpy/util/__init__.py | 18 ++++++++++++++++++ 6 files changed, 50 insertions(+), 17 deletions(-) diff --git a/src/scirpy/ir_dist/_clonotype_neighbors.py b/src/scirpy/ir_dist/_clonotype_neighbors.py index c948c617e..e615734da 100644 --- a/src/scirpy/ir_dist/_clonotype_neighbors.py +++ b/src/scirpy/ir_dist/_clonotype_neighbors.py @@ -118,7 +118,7 @@ def _make_clonotype_table(self, params: DataHandler) -> tuple[Mapping, pd.DataFr ct_tuple[0] if len(ct_tuple) == 1 else ct_tuple, [], ) - ].values + ].values.tolist() for i, ct_tuple in enumerate(clonotypes.itertuples(index=False, name=None)) } diff --git a/src/scirpy/pl/_clonotypes.py b/src/scirpy/pl/_clonotypes.py index da493921c..e6cb28ab0 100644 --- a/src/scirpy/pl/_clonotypes.py +++ b/src/scirpy/pl/_clonotypes.py @@ -22,7 +22,7 @@ from scipy.sparse import issparse from scirpy.tl._clonotypes import _doc_clonotype_network, _graph_from_coordinates -from scirpy.util import DataHandler +from scirpy.util import DataHandler, read_cell_indices from scirpy.util.graph import _distance_to_connectivity from .styling import _get_colors, _init_ax @@ -413,6 +413,8 @@ def _plot_clonotype_network_panel( scale_by_n_cells, color_by_n_cells, ): + cell_indices = read_cell_indices(cell_indices) + colorbar_title = "mean per dot" pie_colors = None cat_colors = None diff --git a/src/scirpy/tests/test_ir_query.py b/src/scirpy/tests/test_ir_query.py index 2713f2718..dfc10150e 100644 --- a/src/scirpy/tests/test_ir_query.py +++ b/src/scirpy/tests/test_ir_query.py @@ -5,6 +5,8 @@ import pytest from mudata import MuData +from scirpy.util import read_cell_indices + from scirpy.pp import ir_dist from scirpy.tl._ir_query import ( _reduce_json, @@ -32,9 +34,13 @@ def test_ir_query(adata_cdr3, adata_cdr3_2, metric, key1, key2): tmp_key2 = f"ir_query_TESTDB_aa_{metric}" if key2 is None else key2 tmp_ad = adata_cdr3.mod["airr"] if 
isinstance(adata_cdr3, MuData) else adata_cdr3 + + cell_indices = read_cell_indices(tmp_ad.uns[tmp_key2]["cell_indices"]) + cell_indices_reference = read_cell_indices(tmp_ad.uns[tmp_key2]["cell_indices_reference"]) + assert tmp_ad.uns[tmp_key2]["distances"].shape == (4, 3) - assert len(tmp_ad.uns[tmp_key2]["cell_indices"]) == 4 - assert len(tmp_ad.uns[tmp_key2]["cell_indices_reference"]) == 3 + assert len(cell_indices) == 4 + assert len(cell_indices_reference) == 3 @pytest.mark.parametrize( diff --git a/src/scirpy/tl/_clonotypes.py b/src/scirpy/tl/_clonotypes.py index 1116102c6..0a0aafa19 100644 --- a/src/scirpy/tl/_clonotypes.py +++ b/src/scirpy/tl/_clonotypes.py @@ -9,11 +9,12 @@ import scipy.sparse as sp from anndata import AnnData from scanpy import logging +import json from scirpy.ir_dist import MetricType, _get_metric_key from scirpy.ir_dist._clonotype_neighbors import ClonotypeNeighbors from scirpy.pp import ir_dist -from scirpy.util import DataHandler +from scirpy.util import DataHandler, read_cell_indices from scirpy.util.graph import igraph_from_sparse_matrix, layout_components _common_doc = """\ @@ -89,7 +90,7 @@ A dictionary containing * `distances`: A sparse, pairwise distance matrix between unique receptor configurations - * `cell_indices`: A dict of arrays, containing the `adata.obs_names` + * `cell_indices`: A dict of lists, containing the `adata.obs_names` (cell indices) for each row in the distance matrix. If `inplace` is `True`, this is added to `adata.uns[key_added]`. 
@@ -353,7 +354,7 @@ def convert_str_array_dict_to_csr(str_array_dict: dict[str, np.ndarray[str]]) -> # Return or store results clonotype_distance_res = { "distances": clonotype_dist, - "cell_indices": convert_str_array_dict_to_csr(ctn.cell_indices), + "cell_indices": json.dumps(ctn.cell_indices), } if inplace: params.set_obs(key_added, clonotype_cluster_series) @@ -547,7 +548,8 @@ def clonotype_network( graph.vs["node_id"] = np.arange(0, len(graph.vs)) # store size in graph to be accessed by layout algorithms - clonotype_size = np.array([idx.size for idx in clonotype_res["cell_indices"].values()]) + cell_indices = read_cell_indices(clonotype_res["cell_indices"]) + clonotype_size = np.array([len(idx) for idx in cell_indices.values()]) graph.vs["size"] = clonotype_size components = np.array(graph.decompose("weak")) component_node_count = np.array([len(component.vs) for component in components]) @@ -590,7 +592,7 @@ def clonotype_network( # Expand to cell coordinates to store in adata.obsm idx, coords = zip( *itertools.chain.from_iterable( - zip(clonotype_res["cell_indices"][str(node_id)], itertools.repeat(coord)) + zip(cell_indices[str(node_id)], itertools.repeat(coord)) for node_id, coord in zip(graph.vs["node_id"], coords, strict=False) # type: ignore ), strict=False, @@ -618,9 +620,10 @@ def _graph_from_coordinates(adata: AnnData, clonotype_key: str, basis: str) -> t """ clonotype_res = adata.uns[clonotype_key] # map the cell-id to the corresponding row/col in the clonotype distance matrix + cell_indices = read_cell_indices(clonotype_res["cell_indices"]) dist_idx, obs_names = zip( *itertools.chain.from_iterable( - zip(itertools.repeat(i), obs_names) for i, obs_names in clonotype_res["cell_indices"].items() + zip(itertools.repeat(i), obs_names) for i, obs_names in cell_indices.items() ), strict=False, ) diff --git a/src/scirpy/tl/_ir_query.py b/src/scirpy/tl/_ir_query.py index 72a45679f..fbfa7cec8 100644 --- a/src/scirpy/tl/_ir_query.py +++ 
b/src/scirpy/tl/_ir_query.py @@ -7,10 +7,11 @@ import numpy as np import pandas as pd from scanpy import logging +import json from scirpy.ir_dist import MetricType, _get_metric_key from scirpy.ir_dist._clonotype_neighbors import ClonotypeNeighbors -from scirpy.util import DataHandler, _is_na, tqdm +from scirpy.util import DataHandler, _is_na, tqdm, read_cell_indices from ._clonotypes import _common_doc, _common_doc_parallelism, _doc_clonotype_definition, _validate_parameters @@ -166,9 +167,9 @@ def ir_query( A dictionary containing * `distances`: A sparse distance matrix between unique receptor configurations in `adata` aund unique receptor configurations in `reference`. - * `cell_indices`: A dict of arrays, containing the the `adata.obs_names` + * `cell_indices`: A dict of lists, containing the `adata.obs_names` (cell indices) for each row in the distance matrix. - * `cell_indices_reference`: A dict of arrays, containing the `reference.obs_names` + * `cell_indices_reference`: A dict of lists, containing the `reference.obs_names` for each column in the distance matrix. If `inplace` is `True`, this is added to `adata.uns[key_added]`. 
@@ -206,8 +207,8 @@ def ir_query( # Return or store results clonotype_distance_res = { "distances": clonotype_dist, - "cell_indices": ctn.cell_indices, - "cell_indices_reference": ctn.cell_indices2, + "cell_indices": json.dumps(ctn.cell_indices), + "cell_indices_reference": json.dumps(ctn.cell_indices2), } if inplace: params.adata.uns[key_added] = clonotype_distance_res @@ -284,10 +285,13 @@ def ir_query_annotate_df( res = params.adata.uns[query_key] dist = res["distances"] + cell_indices = read_cell_indices(res["cell_indices"]) + cell_indices_reference = read_cell_indices(res["cell_indices_reference"]) + def get_pairs(): - for i, query_cells in res["cell_indices"].items(): + for i, query_cells in cell_indices.items(): reference_cells = itertools.chain.from_iterable( - res["cell_indices_reference"][str(k)] for k in dist[int(i), :].indices + cell_indices_reference[str(k)] for k in dist[int(i), :].indices ) yield from itertools.product(query_cells, reference_cells) diff --git a/src/scirpy/util/__init__.py b/src/scirpy/util/__init__.py index b46e0d336..789614579 100644 --- a/src/scirpy/util/__init__.py +++ b/src/scirpy/util/__init__.py @@ -16,6 +16,9 @@ from scipy.sparse import issparse from tqdm.auto import tqdm +from typing import Literal +import json + # reexport tqdm (here was previously a workaround for https://github.com/tqdm/tqdm/issues/1082) __all__ = ["tqdm"] @@ -605,3 +608,18 @@ def _get_usable_cpus(n_jobs: int = 0, use_numba: bool = False): usable_cpus = min(usable_cpus, config.NUMBA_NUM_THREADS) return usable_cpus + +def read_cell_indices(cell_indices: Union[dict[str, np.ndarray[str]], str]) -> dict[str,list[str]]: + """ + The datatype of the cell_indices Mapping (clonotype_id -> cell_ids) that is stored to the anndata.uns + attribute after the ´define_clonotype_clusters´ function has changed from dict[str, np.ndarray[str] to + str (json) due to performance considerations regarding the writing speed of the anndata object. 
But we still + want that older anndata objects with the dict[str, np.ndarray[str] datatype can be used. So we use this function + to read the cell_indices from the anndata object to support both formats. + """ + if(isinstance(cell_indices, str)): # new format + return json.loads(cell_indices) + elif(isinstance(cell_indices, dict)): # old format + return {k: v.tolist() for k, v in cell_indices.items()} + else: # unsupported format + raise TypeError(f"Unsupported type for cell_indices: {type(cell_indices)}. Expected str (json) or dict[str, np.ndarray[str]].") From 78124f1a3a9e4146b28c800816a6dd94e956568e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 17:28:03 +0000 Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scirpy/pl/_clonotypes.py | 2 +- src/scirpy/tests/test_ir_query.py | 7 +++---- src/scirpy/tl/_clonotypes.py | 8 +++----- src/scirpy/tl/_ir_query.py | 3 +-- src/scirpy/util/__init__.py | 19 ++++++++++--------- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/scirpy/pl/_clonotypes.py b/src/scirpy/pl/_clonotypes.py index e6cb28ab0..5dc2a9b48 100644 --- a/src/scirpy/pl/_clonotypes.py +++ b/src/scirpy/pl/_clonotypes.py @@ -414,7 +414,7 @@ def _plot_clonotype_network_panel( color_by_n_cells, ): cell_indices = read_cell_indices(cell_indices) - + colorbar_title = "mean per dot" pie_colors = None cat_colors = None diff --git a/src/scirpy/tests/test_ir_query.py b/src/scirpy/tests/test_ir_query.py index dfc10150e..7c33e69cf 100644 --- a/src/scirpy/tests/test_ir_query.py +++ b/src/scirpy/tests/test_ir_query.py @@ -5,8 +5,6 @@ import pytest from mudata import MuData -from scirpy.util import read_cell_indices - from scirpy.pp import ir_dist from scirpy.tl._ir_query import ( _reduce_json, @@ -16,6 +14,7 @@ ir_query_annotate, ir_query_annotate_df, ) +from scirpy.util import read_cell_indices 
@pytest.mark.parametrize("metric", ["identity", "levenshtein"]) @@ -34,10 +33,10 @@ def test_ir_query(adata_cdr3, adata_cdr3_2, metric, key1, key2): tmp_key2 = f"ir_query_TESTDB_aa_{metric}" if key2 is None else key2 tmp_ad = adata_cdr3.mod["airr"] if isinstance(adata_cdr3, MuData) else adata_cdr3 - + cell_indices = read_cell_indices(tmp_ad.uns[tmp_key2]["cell_indices"]) cell_indices_reference = read_cell_indices(tmp_ad.uns[tmp_key2]["cell_indices_reference"]) - + assert tmp_ad.uns[tmp_key2]["distances"].shape == (4, 3) assert len(cell_indices) == 4 assert len(cell_indices_reference) == 3 diff --git a/src/scirpy/tl/_clonotypes.py b/src/scirpy/tl/_clonotypes.py index 0a0aafa19..2b3a60c4d 100644 --- a/src/scirpy/tl/_clonotypes.py +++ b/src/scirpy/tl/_clonotypes.py @@ -1,4 +1,5 @@ import itertools +import json import random from collections.abc import Sequence from typing import Literal, cast @@ -9,7 +10,6 @@ import scipy.sparse as sp from anndata import AnnData from scanpy import logging -import json from scirpy.ir_dist import MetricType, _get_metric_key from scirpy.ir_dist._clonotype_neighbors import ClonotypeNeighbors @@ -343,7 +343,7 @@ def convert_str_array_dict_to_csr(str_array_dict: dict[str, np.ndarray[str]]) -> for key_str, value in str_array_dict.items(): key = int(key_str) data_arrays[key] = value.astype(int) - indices_arrays[key] = np.array(range(0,len(value))) + indices_arrays[key] = np.array(range(0, len(value))) nnz_array[key] = len(value) data = np.concatenate(data_arrays) @@ -622,9 +622,7 @@ def _graph_from_coordinates(adata: AnnData, clonotype_key: str, basis: str) -> t # map the cell-id to the corresponding row/col in the clonotype distance matrix cell_indices = read_cell_indices(clonotype_res["cell_indices"]) dist_idx, obs_names = zip( - *itertools.chain.from_iterable( - zip(itertools.repeat(i), obs_names) for i, obs_names in cell_indices.items() - ), + *itertools.chain.from_iterable(zip(itertools.repeat(i), obs_names) for i, obs_names in 
cell_indices.items()), strict=False, ) dist_idx_lookup = pd.DataFrame(index=obs_names, data=dist_idx, columns=["dist_idx"]) diff --git a/src/scirpy/tl/_ir_query.py b/src/scirpy/tl/_ir_query.py index fbfa7cec8..89bf5c975 100644 --- a/src/scirpy/tl/_ir_query.py +++ b/src/scirpy/tl/_ir_query.py @@ -7,11 +7,10 @@ import numpy as np import pandas as pd from scanpy import logging -import json from scirpy.ir_dist import MetricType, _get_metric_key from scirpy.ir_dist._clonotype_neighbors import ClonotypeNeighbors -from scirpy.util import DataHandler, _is_na, tqdm, read_cell_indices +from scirpy.util import DataHandler, _is_na, read_cell_indices, tqdm from ._clonotypes import _common_doc, _common_doc_parallelism, _doc_clonotype_definition, _validate_parameters diff --git a/src/scirpy/util/__init__.py b/src/scirpy/util/__init__.py index 789614579..d9924c60f 100644 --- a/src/scirpy/util/__init__.py +++ b/src/scirpy/util/__init__.py @@ -1,9 +1,10 @@ import contextlib +import json import os import warnings from collections.abc import Callable, Mapping, Sequence from textwrap import dedent -from typing import Any, Optional, Union, cast, overload +from typing import Any, Literal, Optional, Union, cast, overload import awkward as ak import numpy as np @@ -16,9 +17,6 @@ from scipy.sparse import issparse from tqdm.auto import tqdm -from typing import Literal -import json - # reexport tqdm (here was previously a workaround for https://github.com/tqdm/tqdm/issues/1082) __all__ = ["tqdm"] @@ -609,7 +607,8 @@ def _get_usable_cpus(n_jobs: int = 0, use_numba: bool = False): return usable_cpus -def read_cell_indices(cell_indices: Union[dict[str, np.ndarray[str]], str]) -> dict[str,list[str]]: + +def read_cell_indices(cell_indices: dict[str, np.ndarray[str]] | str) -> dict[str, list[str]]: """ The datatype of the cell_indices Mapping (clonotype_id -> cell_ids) that is stored to the anndata.uns attribute after the ´define_clonotype_clusters´ function has changed from dict[str, 
np.ndarray[str] to @@ -617,9 +616,11 @@ def read_cell_indices(cell_indices: Union[dict[str, np.ndarray[str]], str]) -> d want that older anndata objects with the dict[str, np.ndarray[str] datatype can be used. So we use this function to read the cell_indices from the anndata object to support both formats. """ - if(isinstance(cell_indices, str)): # new format + if isinstance(cell_indices, str): # new format return json.loads(cell_indices) - elif(isinstance(cell_indices, dict)): # old format + elif isinstance(cell_indices, dict): # old format return {k: v.tolist() for k, v in cell_indices.items()} - else: # unsupported format - raise TypeError(f"Unsupported type for cell_indices: {type(cell_indices)}. Expected str (json) or dict[str, np.ndarray[str]].") + else: # unsupported format + raise TypeError( + f"Unsupported type for cell_indices: {type(cell_indices)}. Expected str (json) or dict[str, np.ndarray[str]]." + ) From bca630e0d9ac170f935d377b1aabf592ec8f0148 Mon Sep 17 00:00:00 2001 From: felixpetschko Date: Wed, 18 Sep 2024 20:15:43 +0200 Subject: [PATCH 4/5] removed unused conversion function --- src/scirpy/tl/_clonotypes.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/scirpy/tl/_clonotypes.py b/src/scirpy/tl/_clonotypes.py index 0a0aafa19..fdb1491a5 100644 --- a/src/scirpy/tl/_clonotypes.py +++ b/src/scirpy/tl/_clonotypes.py @@ -333,24 +333,6 @@ def define_clonotype_clusters( clonotype_cluster_series = pd.Series(values, index=idx).reindex(params.adata.obs_names) clonotype_cluster_size_series = clonotype_cluster_series.groupby(clonotype_cluster_series).transform("count") - def convert_str_array_dict_to_csr(str_array_dict: dict[str, np.ndarray[str]]) -> sp.csr_matrix: - num_rows = len(str_array_dict) - - data_arrays = [np.array([])] * num_rows - indices_arrays = [np.array([])] * num_rows - nnz_array = np.zeros(num_rows) - - for key_str, value in str_array_dict.items(): - key = int(key_str) - data_arrays[key] = value.astype(int) - 
indices_arrays[key] = np.array(range(0,len(value))) - nnz_array[key] = len(value) - - data = np.concatenate(data_arrays) - indices = np.concatenate(indices_arrays) - indptr = np.concatenate([np.array([0]), np.cumsum(nnz_array)]) - return sp.csr_matrix((data, indices, indptr)) - # Return or store results clonotype_distance_res = { "distances": clonotype_dist, From 6be5b2aeb89ded0cc943635657714fdafc9d3394 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 6 Nov 2024 20:22:24 +0100 Subject: [PATCH 5/5] Update CHANGELOG --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 436c7dc0c..0436fcf8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,17 @@ and this project adheres to [Semantic Versioning][]. ## [Unreleased] +### Backwards-incompatible changes + +- The format of storing the results of `tl.define_clonotypes`/`tl.define_clonotype_clusters` in `adata.uns` has changed. + Older versions of Scirpy won't be able to run downstream functions (e.g. `tl.clonotype_network`) on AnnData objects + created with Scirpy v0.20 or later. This change was necessary to speed up writing results to `h5ad` when working + with large datasets ([#556](https://github.com/scverse/scirpy/pull/556)). + ### Documentation - Add a tutorial for BCR analysis with Scirpy ([#542](https://github.com/scverse/scirpy/pull/542)). +- Fix typo in `pp.index_chains` methods description ([#570](https://github.com/scverse/scirpy/pull/570)). ## v0.19.0