diff --git a/README.md b/README.md index cee48d7..acea211 100644 --- a/README.md +++ b/README.md @@ -28,22 +28,28 @@ Firstly, let's load in the famous PBMC 4k dataset from 10X Genomics: ```python import singlecellexperiment as sce -data = sce.read_tenx_h5("pbmc4k-tenx.h5") +data = sce.read_tenx_h5("pbmc4k-tenx.h5", realize_assays=True) mat = data.assay("counts") features = [str(x) for x in data.row_data["name"]] ``` -Now we use the Blueprint/ENCODE reference to annotate each cell in `mat`: +Now, we fetch the Blueprint/ENCODE reference: + +```python +import celldex + +ref_data = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True) +``` + +We can annotate each cell in `mat` with the reference: ```python import singler results = singler.annotate_single( - mat, - features, - ref_data = "BlueprintEncode", - ref_features = "symbol", - ref_labels = "main", - cache_dir = "_cache" + test_data = mat, + test_features = features, + ref_data = ref_data, + ref_labels = "label.main", ) ``` @@ -74,34 +80,12 @@ The `annotate_single()` function is a convenient wrapper around a number of lowe Advanced users may prefer to build the reference and run the classification separately. This allows us to re-use the same reference for multiple datasets without repeating the build step. -We start by fetching the reference of interest from [GitHub](https://github.com/kanaverse/singlepp-references). -Note the use of `cache_dir` to avoid repeated downloads from GitHub. - -```python -ref = singler.fetch_github_reference("BlueprintEncode", cache_dir="_cache") -``` - -We'll be using the gene symbols here with the markers for the main labels. -We need to set `restrict_to` to the features in our test data, so as to avoid picking marker genes in the reference that won't be present in the test. 
- -```python -ref_features = ref.row_data.column("symbol") - -markers = singler.realize_github_markers( - ref.metadata["main"], - ref_features, - restrict_to=set(features), -) -``` - -Now we build the reference from the ranked expression values and the associated labels in the reference: - ```python built = singler.build_single_reference( - ref_data=ref.assay("ranks"), - ref_labels=ref.col_data.column("main"), - ref_features=ref_features, - markers=markers, + ref_data=ref_data.assay("logcounts"), + ref_labels=ref_data.col_data.column("label.main"), + ref_features=ref_data.get_row_names(), + restrict_to=features, ) ``` @@ -134,14 +118,17 @@ We can use annotations from multiple references through the `annotate_integrated ```python import singler +import celldex + +blueprint_ref = celldex.fetch_reference("blueprint_encode", "2024-02-26", realize_assays=True) + +immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True) + single_results, integrated = singler.annotate_integrated( mat, features, - ref_data_list = ("BlueprintEncode", "DatabaseImmuneCellExpression"), - ref_features_list= "symbol", - ref_labels_list = "main", - build_integrated_args = { "ref_names": ("Blueprint", "DICE") }, - cache_dir = "_cache", + ref_data_list = (blueprint_ref, immune_cell_ref), + ref_labels_list = "label.main", num_threads = 6 ) ``` diff --git a/setup.cfg b/setup.cfg index 59d2073..796ce89 100644 --- a/setup.cfg +++ b/setup.cfg @@ -54,6 +54,7 @@ install_requires = delayedarray biocframe>=0.5.0 summarizedexperiment>=0.4.0 + singlecellexperiment>=0.4.6 biocutils [options.packages.find] @@ -71,6 +72,9 @@ testing = setuptools pytest pytest-cov + celldex + scrnaseq + scipy [options.entry_points] # Add here console scripts like: diff --git a/src/singler/__init__.py b/src/singler/__init__.py index dc17568..8c512e7 100644 --- a/src/singler/__init__.py +++ b/src/singler/__init__.py @@ -16,11 +16,10 @@ del version, PackageNotFoundError -from .get_classic_markers import 
get_classic_markers, number_of_classic_markers +from .annotate_integrated import annotate_integrated +from .annotate_single import annotate_single +from .build_integrated_references import IntegratedReferences, build_integrated_references from .build_single_reference import build_single_reference -from .build_integrated_references import build_integrated_references, IntegratedReferences -from .classify_single_reference import classify_single_reference from .classify_integrated_references import classify_integrated_references -from .fetch_reference import fetch_github_reference, realize_github_markers -from .annotate_single import annotate_single -from .annotate_integrated import annotate_integrated +from .classify_single_reference import classify_single_reference +from .get_classic_markers import get_classic_markers, number_of_classic_markers diff --git a/src/singler/_utils.py b/src/singler/_utils.py index e6a4ed6..4e31b51 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -74,11 +74,18 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads): return x, features if isinstance(x, SummarizedExperiment): + if features is None: + features = x.get_row_names() + elif isinstance(features, str): + features = x.get_row_data().column(features) + features = list(features) + x = x.assay(assay_type) curshape = x.shape if len(curshape) != 2: raise ValueError("each entry of 'ref' should be a 2-dimensional array") + if curshape[0] != len(features): raise ValueError( "number of rows of 'x' should be equal to the length of 'features'" diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py index fad6f42..afb38f4 100644 --- a/src/singler/annotate_integrated.py +++ b/src/singler/annotate_integrated.py @@ -3,7 +3,7 @@ from biocframe import BiocFrame from ._utils import _clean_matrix -from .annotate_single import _attach_markers, _resolve_reference +from .annotate_single import _resolve_reference from 
.build_integrated_references import build_integrated_references from .build_single_reference import build_single_reference from .classify_integrated_references import classify_integrated_references @@ -12,15 +12,14 @@ def annotate_integrated( test_data: Any, - test_features: Sequence, ref_data_list: Sequence[Union[Any, str]], - ref_labels_list: Union[str, Sequence[Union[Sequence, str]]], - ref_features_list: Union[str, Sequence[Union[Sequence, str]]], + test_features: Optional[Union[Sequence, str]] = None, + ref_labels_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None, + ref_features_list: Optional[Union[Optional[str], Sequence[Union[Sequence, str]]]] = None, test_assay_type: Union[str, int] = 0, test_check_missing: bool = True, ref_assay_type: Union[str, int] = "logcounts", ref_check_missing: bool = True, - cache_dir: Optional[str] = None, build_single_args: dict = {}, classify_single_args: dict = {}, build_integrated_args: dict = {}, @@ -45,6 +44,11 @@ def annotate_integrated( Sequence of length equal to the number of rows in ``test_data``, containing the feature identifier for each row. + Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features`` + may be a string specifying the column name in `row_data` that contains the + features. It can also be set to `None`, to use the `row_names` of the + experiment as features. + ref_data_list: Sequence consisting of one or more of the following: @@ -69,6 +73,10 @@ def annotate_integrated( - If ``ref_data_list[i]`` is a string, ``ref_labels_list[i]`` should be a string specifying the label type to use, e.g., "main", "fine", "ont". If a single string is supplied, it is recycled for all ``ref_data``. + - If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_labels_list[i]`` + may be a string specifying the column name in `column_data` that contains the + labels. It can also be set to `None`, to use the `column_names` of the + experiment as labels.
ref_features_list: Sequence of the same length as ``ref_data_list``, where the contents @@ -80,6 +88,10 @@ def annotate_integrated( - If ``ref_data_list[i]`` is a string, ``ref_features_list[i]`` should be a string specifying the feature type to use, e.g., "ensembl", "symbol". If a single string is supplied, it is recycled for all ``ref_data``. + - If ``ref_data_list[i]`` is a ``SummarizedExperiment``, ``ref_features_list[i]`` + may be a string specifying the column name in `row_data` that contains the + features. It can also be set to `None`, to use the `row_names` of the + experiment as features. test_assay_type: Assay of ``test_data`` containing the expression matrix, if ``test_data`` is a @@ -95,11 +107,6 @@ def annotate_integrated( ref_check_missing: Whether to check for and remove missing (i.e., NaN) values from the reference datasets. - cache_dir: - Path to a cache directory for downloading reference files, see - :py:meth:`~singler.fetch_reference.fetch_github_reference` for details. - Only used if ``ref_data`` is a string. - build_single_args: Further arguments to pass to :py:meth:`~singler.build_single_reference.build_single_reference`. @@ -128,18 +135,22 @@ def annotate_integrated( :py:meth:`~singler.classify_integrated_references.classify_integrated_references`).
""" nrefs = len(ref_data_list) + if isinstance(ref_labels_list, str): ref_labels_list = [ref_labels_list] * nrefs - elif nrefs != len(ref_labels_list): - raise ValueError( - "'ref_data_list' and 'ref_labels_list' must be the same length" - ) + elif ref_labels_list is None: + ref_labels_list = [None] * nrefs + + if nrefs != len(ref_labels_list): + raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length") + if isinstance(ref_features_list, str): ref_features_list = [ref_features_list] * nrefs - elif nrefs != len(ref_features_list): - raise ValueError( - "'ref_data_list' and 'ref_features_list' must be the same length" - ) + elif ref_features_list is None: + ref_features_list = [None] * nrefs + + if nrefs != len(ref_features_list): + raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length") test_ptr, test_features = _clean_matrix( test_data, @@ -157,13 +168,11 @@ def annotate_integrated( test_features_set = set(test_features) for r in range(nrefs): - curref_mat, curref_labels, curref_features, curref_markers = _resolve_reference( + curref_mat, curref_labels, curref_features = _resolve_reference( ref_data=ref_data_list[r], ref_labels=ref_labels_list[r], ref_features=ref_features_list[r], - cache_dir=cache_dir, build_args=build_single_args, - test_features_set=test_features_set, ) curref_ptr, curref_features = _clean_matrix( @@ -174,13 +183,12 @@ def annotate_integrated( num_threads=num_threads, ) - bargs = _attach_markers(curref_markers, build_single_args) curbuilt = build_single_reference( ref_data=curref_ptr, ref_labels=curref_labels, ref_features=curref_features, restrict_to=test_features_set, - **bargs, + **build_single_args, num_threads=num_threads, ) diff --git a/src/singler/annotate_single.py b/src/singler/annotate_single.py index 03be964..6b1ad36 100644 --- a/src/singler/annotate_single.py +++ b/src/singler/annotate_single.py @@ -1,57 +1,53 @@ -from copy import copy +import warnings from typing import Any, 
Optional, Sequence, Union from biocframe import BiocFrame +from summarizedexperiment import SummarizedExperiment from .build_single_reference import build_single_reference from .classify_single_reference import classify_single_reference -from .fetch_reference import fetch_github_reference, realize_github_markers -def _resolve_reference( - ref_data, ref_labels, ref_features, cache_dir, build_args, test_features_set -): - if isinstance(ref_data, str): - ref = fetch_github_reference(ref_data, cache_dir=cache_dir) - ref_features = ref.row_data.column(ref_features) +def _resolve_reference(ref_data, ref_labels, ref_features, build_args): + if isinstance(ref_data, SummarizedExperiment) or issubclass(type(ref_data), SummarizedExperiment): + if ref_features is None: + ref_features = ref_data.get_row_names() + elif isinstance(ref_features, str): + ref_features = ref_data.get_row_data().column(ref_features) - num_de = None - if "marker_args" in build_args: - marker_args = build_args["marker_args"] - if "num_de" in marker_args: - num_de = marker_args["num_de"] + ref_features = list(ref_features) - ref_markers = realize_github_markers( - ref.metadata[ref_labels], - ref_features, - num_markers=num_de, - restrict_to=test_features_set, - ) + if ref_labels is None: + ref_labels = ref_data.get_column_names() + elif isinstance(ref_labels, str): + ref_labels = ref_data.get_column_data().column(ref_labels) - ref_data = ref.assay("ranks") - ref_labels = ref.col_data.column(ref_labels) - else: - ref_markers = None + ref_labels = list(ref_labels) - return ref_data, ref_labels, ref_features, ref_markers + try: + _default_asy = "logcounts" + if "assay_type" in build_args: + _default_asy = build_args["assay_type"] + ref_data = ref_data.assay(_default_asy) + except Exception as _: + raise ValueError(f"Reference dataset must contain log-normalized count ('{_default_asy}') assay.") -def _attach_markers(markers, build_args): - if markers is not None and "markers" not in build_args: - tmp = 
copy(build_args) - tmp["markers"] = markers - print(tmp) - return tmp - return build_args + if ref_labels is None: + raise ValueError("'ref_labels' cannot be `None`.") + + if ref_features is None: + raise ValueError("'ref_features' cannot be `None`.") + + return ref_data, ref_labels, ref_features def annotate_single( test_data: Any, - test_features: Sequence, ref_data: Any, - ref_labels: Union[Sequence, str], - ref_features: Union[Sequence, str], - cache_dir: Optional[str] = None, + ref_labels: Optional[Union[Sequence, str]], + test_features: Optional[Union[Sequence, str]] = None, + ref_features: Optional[Union[Sequence, str]] = None, build_args: dict = {}, classify_args: dict = {}, num_threads: int = 1, @@ -74,36 +70,41 @@ def annotate_single( Sequence of length equal to the number of rows in ``test_data``, containing the feature identifier for each row. + Alternatively, if ``test_data`` is a ``SummarizedExperiment``, ``test_features`` + may be a string specifying the column name in `row_data` that contains the + features. It can also be set to `None`, to use the `row_names` of + the experiment as features. + ref_data: A matrix-like object representing the reference dataset, where rows are features and columns are samples. Entries should be expression values, usually log-transformed (see comments for the ``ref`` argument in :py:meth:`~singler.build_single_reference.build_single_reference`). - Alternatively, a string that can be passed as ``name`` to - :py:meth:`~singler.fetch_reference.fetch_github_reference`. - This will use the specified dataset as the reference. + Alternatively, a + :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` + containing such a matrix in one of its assays. Non-default assay + types can be specified via ``assay_type`` in ``build_args``. ref_labels: If ``ref_data`` is a matrix-like object, ``ref_labels`` should be a sequence of length equal to the number of columns of ``ref_data``, containing the label associated with each column.
- If ``ref_data`` is a string, ``ref_labels`` should be a string - specifying the label type to use, e.g., "main", "fine", "ont". + Alternatively, if ``ref_data`` is a ``SummarizedExperiment``, + ``ref_labels`` may be a string specifying the column name in `column_data` + that contains the labels, e.g., "label.main". It can also be set to `None`, to use + the `column_names` of the experiment as labels. ref_features: If ``ref_data`` is a matrix-like object, ``ref_features`` should be a sequence of length equal to the number of rows of ``ref_data``, containing the feature identifier associated with each row. - If ``ref_data`` is a string, ``ref_features`` should be a string - specifying the label type to use, e.g., "ensembl", "symbol". - - cache_dir: - Path to a cache directory for downloading reference files, see - :py:meth:`~singler.fetch_reference.fetch_github_reference` for details. - Only used if ``ref_data`` is a string. + Alternatively, if ``ref_data`` is a ``SummarizedExperiment``, + ``ref_features`` may be a string specifying the column name in `row_data` + that contains the features. It can also be set to `None`, to use the + `row_names` of the experiment as features. build_args: Further arguments to pass to @@ -123,24 +124,36 @@ def annotate_single( specifying the markers that were used for each pairwise comparison between labels; and a list of ``unique_markers`` across all labels.
""" + + if isinstance(test_data, SummarizedExperiment): + if test_features is None: + test_features = test_data.get_row_names() + elif isinstance(test_features, str): + test_features = test_data.get_row_data().column(test_features) + + if test_features is None: + raise ValueError("'test_features' cannot be `None`.") + test_features_set = set(test_features) + if len(test_features_set) != len(test_features): + warnings.warn("'test_features' is not unique, subsetting test matrix...", UserWarning) + _idxs = [test_features.index(x) for x in test_features_set] + print("modifying test data") + test_data = test_data[_idxs,] - ref_data, ref_labels, ref_features, markers = _resolve_reference( + ref_data, ref_labels, ref_features = _resolve_reference( ref_data=ref_data, ref_labels=ref_labels, ref_features=ref_features, - cache_dir=cache_dir, build_args=build_args, - test_features_set=test_features_set, ) - bargs = _attach_markers(markers, build_args) built = build_single_reference( ref_data=ref_data, ref_labels=ref_labels, ref_features=ref_features, restrict_to=test_features_set, - **bargs, + **build_args, num_threads=num_threads, ) diff --git a/src/singler/classify_single_reference.py b/src/singler/classify_single_reference.py index 4643e44..fa233f0 100644 --- a/src/singler/classify_single_reference.py +++ b/src/singler/classify_single_reference.py @@ -37,11 +37,16 @@ def classify_single_reference( Sequence of identifiers for each feature in the test dataset, i.e., row in ``test_data``. + If ``test_data`` is a ``SummarizedExperiment``, ``test_features`` + may be a string specifying the column name in `row_data` that contains the + features. Alternatively can be set to `None`, to use the `row_names` of + the experiment as features. + ref_prebuilt: A pre-built reference created with :py:meth:`~singler.build_single_reference.build_single_reference`.
- assay_type: + assay_type: Assay containing the expression matrix, if `test_data` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. diff --git a/src/singler/fetch_reference.py b/src/singler/fetch_reference.py deleted file mode 100644 index 7449364..0000000 --- a/src/singler/fetch_reference.py +++ /dev/null @@ -1,243 +0,0 @@ -import gzip -import os -import tempfile -import urllib.parse -import urllib.request as req -from typing import Any, Literal, Optional, Sequence, Union - -import biocframe -import numpy -import summarizedexperiment - -SESSION_DIR = None - -KNOWN_REFERENCE = Literal[ - "BlueprintEncode", - "DatabaseImmuneCellExpression", - "HumanPrimaryCellAtlas", - "MonacoImmune", - "NovershternHematopoietic", - "ImmGen", - "MouseRNAseq", -] - - -def fetch_github_reference( - name: KNOWN_REFERENCE, cache_dir: Optional[str] = None, multiple_ids: bool = False -) -> summarizedexperiment.SummarizedExperiment: - """Fetch a reference dataset from the - `pre-compiled GitHub registry `_, - for use in annotation with other **singler** functions. - - Args: - name: Name of the reference dataset. - - cache_dir: Path to a cache directory in which to store - the files downloaded from the remote. If the files are already - present, the download is skipped. - - multiple_ids: Whether to report multiple feature IDs. - If True, each feature is represented by a list with zero, - one or more feature identifiers (e.g., for ambiguous mappings). - If False, each feature is represented by a string or None. - - Returns: - The reference dataset as a SummarizedExperiment, - parts of which can be passed to :py:meth:`~singler.build_single_reference.build_single_reference`. - - Specifically, the ``ranks`` assay of the output can be used as ``ref`` in - :py:meth:`~singler.build_single_reference.build_single_reference`; - one of the labels in the column data can be used as ``labels``; - and one of the gene types in the row data can be used as ``features``. 
- - As the ranks are not log-normalized values, users should also use - the relevant pre-computed marker list in the metadata. The selected - marker list should match up with the chosen set of ``labels``. In - addition, the markers are stored as row indices and need to be converted - to feature identifiers; this is achieved by passing the marker list to - :py:meth:`~singler.fetch_reference.realize_github_markers` with the same - gene types that were used in ``features``. The output can then be passed - as ``markers`` in the `build_reference()` call. - - If ``multiple_ids = True``, each ``row_data`` column will be a list of - lists of possible identifiers for each feature. Callers are responsible for - resolving this list of lists into a list of single identifiers for each - feature, before passing it onto other functions like - :py:meth:`~singler.build_single_reference.build_single_reference`. - """ - - all_files = {"matrix": name + "_matrix.csv.gz"} - gene_types = ["ensembl", "entrez", "symbol"] - for g in gene_types: - suff = "genes_" + g - all_files[suff] = name + "_" + suff + ".csv.gz" - - lab_types = ["fine", "main", "ont"] - for lab in lab_types: - suff = "labels_" + lab - all_files[suff] = name + "_" + suff + ".csv.gz" - suff = "label_names_" + lab - all_files[suff] = name + "_" + suff + ".csv.gz" - suff = "markers_" + lab - all_files[suff] = name + "_" + suff + ".gmt.gz" - - base_url = ( - "https://github.com/kanaverse/singlepp-references/releases/download/2023-04-28" - ) - - if cache_dir is None: - global SESSION_DIR - # This should already lie inside the OS's temporary directory, based on - # documentation for tempfile.gettempdir(); no need to clean it up afterwards. 
- if SESSION_DIR is None: - SESSION_DIR = tempfile.mkdtemp() - cache_dir = SESSION_DIR - elif not os.path.exists(cache_dir): - os.makedirs(cache_dir) - - all_paths = {} - for k, v in all_files.items(): - url = base_url + "/" + v - path = os.path.join(cache_dir, urllib.parse.quote(url, safe="")) - if not os.path.exists(path): - req.urlretrieve(url=url, filename=path) - all_paths[k] = path - - # Reading in labels. - labels = {} - markers = {} - for lab in lab_types: - all_labels = [] - with gzip.open(all_paths["labels_" + lab], "rt") as handle: - for line in handle: - line = line.strip() - if line == "NA": # I dunno man, I dunno. - all_labels.append(None) - else: - all_labels.append(int(line)) - - all_label_names = [] - with gzip.open(all_paths["label_names_" + lab], "rt") as handle: - for line in handle: - all_label_names.append(line.strip()) - - for i, x in enumerate(all_labels): - if x is not None: - all_labels[i] = all_label_names[x] - labels[lab] = all_labels - - current_markers = {} - for x in all_label_names: - current_inner = {} - for x2 in all_label_names: - current_inner[x2] = [] - current_markers[x] = current_inner - - with gzip.open(all_paths["markers_" + lab], "rt") as handle: - for line in handle: - fields = line.strip().split("\t") - first = all_label_names[int(fields[0])] - second = all_label_names[int(fields[1])] - current_markers[first][second] = [int(j) for j in fields[2:]] - - markers[lab] = current_markers - - # Reading in genes. - gene_ids = {} - for g in gene_types: - with gzip.open(all_paths["genes_" + g], "rt") as handle: - current_genes = [] - for line in handle: - y = line.strip() - if multiple_ids: - if y == "": - y = [] - else: - y = y.split("\t") - else: - if y == "": - y = None - else: - tx = y.find("\t") - if tx != -1: - y = y[:tx] - current_genes.append(y) - gene_ids[g] = current_genes - - row_data = biocframe.BiocFrame(gene_ids) - col_data = biocframe.BiocFrame(labels) - - # Reading in the matrix first. 
- mat = numpy.ndarray( - (row_data.shape[0], col_data.shape[0]), dtype=numpy.int32, order="F" - ) - with gzip.open(all_paths["matrix"], "rt") as handle: - sample = 0 - for line in handle: - contents = line.strip().split(",") - for i, x in enumerate(contents): - contents[i] = int(x) - mat[:, sample] = contents - sample += 1 - - return summarizedexperiment.SummarizedExperiment( - {"ranks": mat}, row_data=row_data, column_data=col_data, metadata=markers - ) - - -def realize_github_markers( - markers: dict[Any, dict[Any, Sequence]], - features: Sequence, - num_markers: Optional[int] = None, - restrict_to: Optional[Union[set, dict]] = None, -) -> dict[Any, dict[Any, Sequence]]: - """Convert marker indices from a GitHub reference dataset into feature identifiers. This allows the markers to be - used in :py:meth:`~singler.build_single_reference.build_single_reference`. - - Args: - markers: - Upregulated markers for each pairwise comparison between labels. - Specifically, ``markers[a][b]`` should be a sequence of features - that are upregulated in ``a`` compared to ``b``. Features are - represented as indices into ``features``. - - features: - Sequence of identifiers for each feature. Features with no valid - identifier for a particular gene type (e.g., no known symbol) - should be represented by None. - - num_markers: - Number of markers to retain. If None, all markers are retained. - - restrict_to: - Subset of available features to restrict the marker selection. - Only features in ``restrict_to`` will be reported in the output. - If None, no restriction is performed. - - Returns: - A dictionary with the same structure - as ``markers``, where each inner sequence contains the corresponding - feature identifiers in ``features``. Feature identifiers are guaranteed - to be non-None and to be in ``restrict_to`` (if specified). Each - inner sequence should have length ``num_markers`` (or less, if not - enough non-None/restricted identifiers are available). 
- """ - output = {} - for k, v in markers.items(): - current = {} - - for k2, v2 in v.items(): - renamed = [] - - for i in v2: - if num_markers is not None and len(renamed) == num_markers: - break - feat = features[i] - if feat is not None: - if restrict_to is None or feat in restrict_to: - renamed.append(feat) - - current[k2] = renamed - output[k] = current - - return output diff --git a/src/singler/get_classic_markers.py b/src/singler/get_classic_markers.py index 2ed9fa2..14fb675 100644 --- a/src/singler/get_classic_markers.py +++ b/src/singler/get_classic_markers.py @@ -111,8 +111,10 @@ def get_classic_markers( ref_data: A matrix-like object containing the log-normalized expression values of a reference dataset. Each column is a sample and each row is a feature. + Alternatively, this can be a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing a matrix-like object in one of its assays. + Alternatively, a list of such matrices or ``SummarizedExperiment`` objects, typically for multiple batches of the same reference; it is assumed that different batches exhibit at least some overlap in their ``features`` and ``labels``. 
diff --git a/tests/test_annotate_single.py b/tests/test_annotate_single.py index 540a43f..ff123a3 100644 --- a/tests/test_annotate_single.py +++ b/tests/test_annotate_single.py @@ -59,42 +59,3 @@ def test_annotate_single_intersect(): output.column("scores").column("B") == expected.column("scores").column("B") ).all() - -def test_annotate_single_github(): - se = singler.fetch_github_reference("ImmGen", cache_dir="_cache") - - keep = range(5, se.shape[0], 2) - test = numpy.random.rand(len(keep), 50) - ref_features = se.row_data.column("symbol") - test_features = [ref_features[i] for i in keep] - - output = singler.annotate_single( - test, - test_features=test_features, - ref_data="ImmGen", - ref_features="symbol", - ref_labels="main", - cache_dir="_cache", - ) - assert output.shape[0] == 50 - - expected_markers = singler.realize_github_markers( - se.metadata["main"], - se.row_data.column("symbol"), - restrict_to=set(test_features), - ) - assert output.metadata["markers"] == expected_markers - - # Checking that we handle the number of markers correctly. - more_output = singler.annotate_single( - test, - test_features=test_features, - ref_data="ImmGen", - ref_features="symbol", - ref_labels="main", - build_args={"marker_args": {"num_de": 10}}, - cache_dir="_cache", - ) - - ref_labels = list(set(se.column_data.column("main"))) - assert len(more_output.metadata["markers"][ref_labels[0]][ref_labels[1]]) == 10 diff --git a/tests/test_fetch_reference.py b/tests/test_fetch_reference.py deleted file mode 100644 index 7f241fb..0000000 --- a/tests/test_fetch_reference.py +++ /dev/null @@ -1,89 +0,0 @@ -import singler -import summarizedexperiment -import re -import numpy - - -def test_fetch_github_reference(): - out = singler.fetch_github_reference("ImmGen", cache_dir="_cache") - assert isinstance(out, summarizedexperiment.SummarizedExperiment) - - # Checking the genes. 
- assert out.row_data.column("ensembl")[0].startswith("ENS") - assert re.match("^[0-9]+", out.row_data.column("entrez")[0]) is not None - assert re.match("^[A-Z][a-z]+[0-9]*", out.row_data.column("symbol")[0]) is not None - - ens = out.row_data.column("ensembl") - has_none = False - for x in ens: - if x is None: - has_none = True - break - assert has_none - - has_tab = False - for x in ens: - if x is not None and x.find("\t") != -1: - has_tab = True - break - assert not has_tab - - # Checking the labels. - assert isinstance(out.col_data.column("fine")[0], str) - assert isinstance(out.col_data.column("main")[0], str) - assert isinstance(out.col_data.column("ont")[0], str) - - # Checking the assay. - ass = out.assays["ranks"] - assert ass.shape[0] > ass.shape[1] - assert (ass.min(0) == numpy.ones(ass.shape[1])).all() - - # Checking markers. - markers = out.metadata["fine"] - flabs = out.col_data.column("fine") - all_labels = sorted(list(set(flabs))) - assert sorted(markers.keys()) == all_labels - assert sorted(markers[all_labels[0]].keys()) == all_labels - assert len(markers[all_labels[0]][all_labels[0]]) == 0 - assert len(markers[all_labels[0]][all_labels[1]]) > 0 - - -def test_fetch_github_reference_multiple(): - out = singler.fetch_github_reference( - "ImmGen", cache_dir="_cache", multiple_ids=True - ) - - ens = out.row_data.column("ensembl") - all_lengths = set() - for x in ens: - all_lengths.add(len(x)) - assert 0 in all_lengths - assert 1 in all_lengths - assert 2 in all_lengths - - -def test_realize_github_markers(): - markers = {"A": {"B": [1, 3, 5, 7]}} - out = singler.realize_github_markers( - markers, ["A", "B", "C", "D", "E", "F", "G", "H"] - ) - assert out["A"]["B"] == ["B", "D", "F", "H"] - - # Behaves with the number of markers set. 
- out = singler.realize_github_markers( - markers, ["A", "B", "C", "D", "E", "F", "G", "H"], num_markers=2 - ) - assert out["A"]["B"] == ["B", "D"] - - out = singler.realize_github_markers( - markers, ["A", "B", "C", None, "E", "F", "G", "H"], num_markers=2 - ) - assert out["A"]["B"] == ["B", "F"] - - # Behaves with the restrict_to set. - out = singler.realize_github_markers( - markers, - ["A", "B", "C", "D", "E", "F", "G", "H"], - restrict_to=set(["E", "F", "G", "H"]), - ) - assert out["A"]["B"] == ["F", "H"] diff --git a/tests/test_integrated_with_celldex.py b/tests/test_integrated_with_celldex.py new file mode 100644 index 0000000..8181d66 --- /dev/null +++ b/tests/test_integrated_with_celldex.py @@ -0,0 +1,75 @@ +import singler +import numpy +import celldex +import scrnaseq +import pandas as pd +import scipy +import pytest +from biocframe import BiocFrame + + +def test_with_minimal_args(): + sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) + + blueprint_ref = celldex.fetch_reference( + "blueprint_encode", "2024-02-26", realize_assays=True + ) + immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True) + + with pytest.raises(Exception): + singler.annotate_integrated( + test_data=sce.assays["counts"], + ref_data_list=(blueprint_ref, immune_cell_ref), + ref_labels_list="label.main", + num_threads=6, + ) + + single, integrated = singler.annotate_integrated( + test_data=sce, + ref_data_list=(blueprint_ref, immune_cell_ref), + ref_labels_list="label.main", + num_threads=6, + ) + assert len(single) == 2 + assert isinstance(integrated, BiocFrame) + + +def test_with_all_supplied(): + sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) + + blueprint_ref = celldex.fetch_reference( + "blueprint_encode", "2024-02-26", realize_assays=True + ) + immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True) + + single, integrated = singler.annotate_integrated( 
+ test_data=sce, + test_features=sce.get_row_names(), + ref_data_list=(blueprint_ref, immune_cell_ref), + ref_labels_list=[ + x.get_column_data().column("label.main") + for x in (blueprint_ref, immune_cell_ref) + ], + ref_features_list=[x.get_row_names() for x in (blueprint_ref, immune_cell_ref)], + ) + + assert len(single) == 2 + assert isinstance(integrated, BiocFrame) + + +def test_with_colname(): + sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) + + blueprint_ref = celldex.fetch_reference( + "blueprint_encode", "2024-02-26", realize_assays=True + ) + immune_cell_ref = celldex.fetch_reference("dice", "2024-02-26", realize_assays=True) + + single, integrated = singler.annotate_integrated( + test_data=sce, + ref_data_list=(blueprint_ref, immune_cell_ref), + ref_labels_list="label.main", + ) + + assert len(single) == 2 + assert isinstance(integrated, BiocFrame) diff --git a/tests/test_single_with_celldex.py b/tests/test_single_with_celldex.py new file mode 100644 index 0000000..b5cd6db --- /dev/null +++ b/tests/test_single_with_celldex.py @@ -0,0 +1,64 @@ +import singler +import numpy +import celldex +import scrnaseq +import pandas as pd +import scipy +import pytest +from biocframe import BiocFrame + +def test_with_minimal_args(): + sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) + + immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) + + with pytest.raises(Exception): + matches = singler.annotate_single( + test_data=sce.assays["counts"], + ref_data=immgen_ref, + ref_labels=immgen_ref.get_column_data().column("label.main"), + ) + + matches = singler.annotate_single( + test_data=sce, + ref_data=immgen_ref, + ref_labels=immgen_ref.get_column_data().column("label.main"), + ) + assert isinstance(matches, BiocFrame) + + counts = pd.Series(matches["best"]).value_counts() + assert counts is not None + + +def test_with_all_supplied(): + sce = 
scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) + + immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) + + matches = singler.annotate_single( + test_data=sce, + test_features=sce.get_row_names(), + ref_data=immgen_ref, + ref_labels=immgen_ref.get_column_data().column("label.main"), + ref_features=immgen_ref.get_row_names(), + ) + assert isinstance(matches, BiocFrame) + + counts = pd.Series(matches["best"]).value_counts() + assert counts is not None + + +def test_with_colname(): + sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True) + + immgen_ref = celldex.fetch_reference("immgen", "2024-02-26", realize_assays=True) + + matches = singler.annotate_single( + test_data=sce, + ref_data=immgen_ref, + ref_labels="label.main", + ) + assert isinstance(matches, BiocFrame) + + counts = pd.Series(matches["best"]).value_counts() + assert counts is not None