diff --git a/README.md b/README.md index 21a6a93..cee48d7 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,18 @@ output = singler.classify_single_reference( ) ``` + ## output + BiocFrame with 4340 rows and 3 columns + best scores delta + + [0] Monocytes 0.33265560369962943:0.407117403330602... 0.40706830113982534 + [1] Monocytes 0.4078771641637374:0.4783396310685646... 0.07000418564184802 + [2] Monocytes 0.3517036021728629:0.4076971245524348... 0.30997293412307647 + ... ... ... + [4337] NK cells 0.3472631136865701:0.3937898240670208... 0.09640242155786138 + [4338] B-cells 0.26974632191999887:0.334862058137758... 0.061215905058676856 + [4339] Monocytes 0.39390119034537324:0.468867490667427... 0.06678168346812047 + ## Integrating labels across references We can use annotations from multiple references through the `annotate_integrated()` function: @@ -125,9 +137,9 @@ import singler single_results, integrated = singler.annotate_integrated( mat, features, - ref_data = ("BlueprintEncode", "DatabaseImmuneCellExpression"), - ref_features = "symbol", - ref_labels = "main", + ref_data_list = ("BlueprintEncode", "DatabaseImmuneCellExpression"), + ref_features_list = "symbol", + ref_labels_list = "main", build_integrated_args = { "ref_names": ("Blueprint", "DICE") }, cache_dir = "_cache", num_threads = 6 diff --git a/setup.cfg b/setup.cfg index ca8489a..59d2073 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,17 +5,17 @@ [metadata] name = singler -description = Add a short description here! +description = Python bindings to the SingleR algorithm to annotate cell types from known references. 
author = Aaron Lun author_email = lun.aaron@gene.com license = MIT license_files = LICENSE.txt long_description = file: README.md long_description_content_type = text/markdown; charset=UTF-8; variant=GFM -url = https://github.com/pyscaffold/pyscaffold/ +url = https://github.com/BiocPy/singler # Add here related links, for example: project_urls = - Documentation = https://pyscaffold.org/ + Documentation = https://github.com/BiocPy/singler # Source = https://github.com/pyscaffold/pyscaffold/ # Changelog = https://pyscaffold.org/en/latest/changelog.html # Tracker = https://github.com/pyscaffold/pyscaffold/issues @@ -41,7 +41,7 @@ package_dir = =src # Require a min/specific Python version (comma-separated conditions) -# python_requires = >=3.8 +python_requires = >=3.8 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in @@ -50,10 +50,11 @@ package_dir = install_requires = importlib-metadata; python_version<"3.8" mattress>=0.1.4 - assorthead + assorthead>=0.0.11 delayedarray - biocframe - summarizedexperiment + biocframe>=0.5.0 + summarizedexperiment>=0.4.0 + biocutils [options.packages.find] where = src diff --git a/src/singler/_Markers.py b/src/singler/_Markers.py index 1c7bde5..9259f7d 100644 --- a/src/singler/_Markers.py +++ b/src/singler/_Markers.py @@ -1,6 +1,8 @@ +from typing import Any, Sequence + +from numpy import array, int32, ndarray + from . 
import _cpphelpers as lib -from numpy import ndarray, int32, array -from typing import Sequence, Any class _Markers: diff --git a/src/singler/_utils.py b/src/singler/_utils.py index 2306a35..e6a4ed6 100644 --- a/src/singler/_utils.py +++ b/src/singler/_utils.py @@ -1,37 +1,15 @@ -from numpy import ndarray from typing import Sequence, Tuple -from summarizedexperiment import SummarizedExperiment -from mattress import tatamize, TatamiNumericPointer -from delayedarray import DelayedArray - - -def _factorize(x: Sequence) -> Tuple[Sequence, ndarray]: - levels = [] - mapping = {} - indices = [] - - for i, lev in enumerate(x): - if lev is None: - indices.append(None) - else: - if lev not in mapping: - mapping[lev] = len(levels) - levels.append(lev) - indices.append(mapping[lev]) - - return levels, indices +import biocutils as ut +import numpy as np +from delayedarray import DelayedArray +from mattress import TatamiNumericPointer, tatamize +from summarizedexperiment import SummarizedExperiment -def _match(x: Sequence, levels: Sequence) -> ndarray: - mapping = _create_map(levels) - indices = [] - for i, y in enumerate(x): - if y is None or y not in mapping: - indices.append(None) - else: - indices.append(mapping[y]) - return indices +def _factorize(x: Sequence) -> Tuple[list, np.ndarray]: + _factor = ut.Factor.from_sequence(x, sort_levels=False) + return _factor.levels, np.array(_factor.codes, np.int32) def _create_map(x: Sequence) -> dict: @@ -92,7 +70,7 @@ def _clean_matrix(x, features, assay_type, check_missing, num_threads): if isinstance(x, TatamiNumericPointer): # Assume the pointer was previously generated from _clean_matrix, # so it's 2-dimensional, matches up with features and it's already - # clean of NaNs... so we no-op and just return it directly. + # clean of NaNs... so we no-op and just return it directly. 
return x, features if isinstance(x, SummarizedExperiment): diff --git a/src/singler/annotate_integrated.py b/src/singler/annotate_integrated.py index 411f9e6..fad6f42 100644 --- a/src/singler/annotate_integrated.py +++ b/src/singler/annotate_integrated.py @@ -1,13 +1,13 @@ -from typing import Union, Sequence, Optional, Any, Tuple +from typing import Any, Optional, Sequence, Tuple, Union + from biocframe import BiocFrame -from .fetch_reference import fetch_github_reference, realize_github_markers -from .build_single_reference import build_single_reference -from .classify_single_reference import classify_single_reference +from ._utils import _clean_matrix +from .annotate_single import _attach_markers, _resolve_reference from .build_integrated_references import build_integrated_references +from .build_single_reference import build_single_reference from .classify_integrated_references import classify_integrated_references -from .annotate_single import _resolve_reference, _attach_markers -from ._utils import _clean_matrix +from .classify_single_reference import classify_single_reference def annotate_integrated( @@ -27,20 +27,22 @@ def annotate_integrated( classify_integrated_args: dict = {}, num_threads: int = 1, ) -> Tuple[list[BiocFrame], BiocFrame]: - """Annotate a single-cell expression dataset based on the correlation + """Annotate a single-cell expression dataset based on the correlation of each cell to profiles in multiple labelled references, where the annotation from each reference is then integrated across references. Args: - test_data: A matrix-like object representing the test dataset, where rows are + test_data: + A matrix-like object representing the test dataset, where rows are features and columns are samples (usually cells). Entries should be expression values; only the ranking within each column will be used. Alternatively, a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` - containing such a matrix in one of its assays. 
+ containing such a matrix in one of its assays. - test_features: Sequence of length equal to the number of rows in + test_features: + Sequence of length equal to the number of rows in ``test_data``, containing the feature identifier for each row. ref_data_list: @@ -50,7 +52,7 @@ def annotate_integrated( are features and columns are samples. Entries should be expression values, usually log-transformed (see comments for the ``ref`` argument in :py:meth:`~singler.build_single_reference.build_single_reference`). - - A + - A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` object containing such a matrix in its assays. - A string that can be passed as ``name`` to @@ -129,18 +131,22 @@ def annotate_integrated( if isinstance(ref_labels_list, str): ref_labels_list = [ref_labels_list] * nrefs elif nrefs != len(ref_labels_list): - raise ValueError("'ref_data_list' and 'ref_labels_list' must be the same length") + raise ValueError( + "'ref_data_list' and 'ref_labels_list' must be the same length" + ) if isinstance(ref_features_list, str): ref_features_list = [ref_features_list] * nrefs elif nrefs != len(ref_features_list): - raise ValueError("'ref_data_list' and 'ref_features_list' must be the same length") + raise ValueError( + "'ref_data_list' and 'ref_features_list' must be the same length" + ) test_ptr, test_features = _clean_matrix( test_data, test_features, - assay_type = test_assay_type, - check_missing = test_check_missing, - num_threads = num_threads, + assay_type=test_assay_type, + check_missing=test_check_missing, + num_threads=num_threads, ) all_ref_data = [] @@ -163,9 +169,9 @@ def annotate_integrated( curref_ptr, curref_features = _clean_matrix( curref_mat, curref_features, - assay_type = ref_assay_type, - check_missing = ref_check_missing, - num_threads = num_threads, + assay_type=ref_assay_type, + check_missing=ref_check_missing, + num_threads=num_threads, ) bargs = _attach_markers(curref_markers, build_single_args) diff --git 
a/src/singler/annotate_single.py b/src/singler/annotate_single.py index 01b9cc1..03be964 100644 --- a/src/singler/annotate_single.py +++ b/src/singler/annotate_single.py @@ -1,14 +1,16 @@ -from typing import Union, Sequence, Optional, Any -from biocframe import BiocFrame from copy import copy +from typing import Any, Optional, Sequence, Union + +from biocframe import BiocFrame -from .fetch_reference import fetch_github_reference, realize_github_markers from .build_single_reference import build_single_reference from .classify_single_reference import classify_single_reference -from ._utils import _clean_matrix +from .fetch_reference import fetch_github_reference, realize_github_markers -def _resolve_reference(ref_data, ref_labels, ref_features, cache_dir, build_args, test_features_set): +def _resolve_reference( + ref_data, ref_labels, ref_features, cache_dir, build_args, test_features_set +): if isinstance(ref_data, str): ref = fetch_github_reference(ref_data, cache_dir=cache_dir) ref_features = ref.row_data.column(ref_features) @@ -27,7 +29,7 @@ def _resolve_reference(ref_data, ref_labels, ref_features, cache_dir, build_args ) ref_data = ref.assay("ranks") - ref_labels=ref.col_data.column(ref_labels) + ref_labels = ref.col_data.column(ref_labels) else: ref_markers = None @@ -54,11 +56,12 @@ def annotate_single( classify_args: dict = {}, num_threads: int = 1, ) -> BiocFrame: - """Annotate a single-cell expression dataset based on the correlation + """Annotate a single-cell expression dataset based on the correlation of each cell to profiles in a labelled reference. Args: - test_data: A matrix-like object representing the test dataset, where rows are + test_data: + A matrix-like object representing the test dataset, where rows are features and columns are samples (usually cells). Entries should be expression values; only the ranking within each column will be used. @@ -67,10 +70,12 @@ def annotate_single( containing such a matrix in one of its assays. 
Non-default assay types can be specified in ``classify_args``. - test_features: Sequence of length equal to the number of rows in + test_features: + Sequence of length equal to the number of rows in ``test_data``, containing the feature identifier for each row. - ref_data: A matrix-like object representing the reference dataset, where rows + ref_data: + A matrix-like object representing the reference dataset, where rows are features and columns are samples. Entries should be expression values, usually log-transformed (see comments for the ``ref`` argument in :py:meth:`~singler.build_single_reference.build_single_reference`). diff --git a/src/singler/build_integrated_references.py b/src/singler/build_integrated_references.py index e9575c2..19cb2d7 100644 --- a/src/singler/build_integrated_references.py +++ b/src/singler/build_integrated_references.py @@ -1,10 +1,11 @@ from typing import Sequence, Optional, Union from numpy import array, ndarray, int32, uintp -from mattress import tatamize + +import biocutils as ut from .build_single_reference import SinglePrebuiltReference from . import _cpphelpers as lib -from ._utils import _stable_union, _factorize, _match, _clean_matrix +from ._utils import _stable_union, _factorize, _clean_matrix class IntegratedReferences: @@ -30,7 +31,7 @@ def reference_names(self) -> Union[Sequence[str], None]: def reference_labels(self) -> list: """List of lists containing the names of the labels for each reference. - Each entry corresponds to a reference in :py:attr:`~reference_names`, + Each entry corresponds to a reference in :py:attr:`~reference_names`, if ``reference_names`` is not None. """ return self._labels @@ -55,25 +56,31 @@ def build_integrated_references( """Build a set of integrated references for classification of a test dataset. Arguments: - test_features: Sequence of features for the test dataset. + test_features: + Sequence of features for the test dataset. 
- ref_data_list: List of reference datasets, where each entry is equivalent to ``ref_data`` in + ref_data_list: + List of reference datasets, where each entry is equivalent to ``ref_data`` in :py:meth:`~singler.build_single_reference.build_single_reference`. - ref_labels_list: List of reference labels, where each entry is equivalent to ``ref_labels`` in + ref_labels_list: + List of reference labels, where each entry is equivalent to ``ref_labels`` in :py:meth:`~singler.build_single_reference.build_single_reference`. - ref_features_list: List of reference features, where each entry is equivalent to ``ref_features`` in + ref_features_list: + List of reference features, where each entry is equivalent to ``ref_features`` in :py:meth:`~singler.build_single_reference.build_single_reference`. - ref_prebuilt_list: List of prebuilt references, typically created by + ref_prebuilt_list: + List of prebuilt references, typically created by calling :py:meth:`~singler.build_single_reference.build_single_reference` on the corresponding elements of ``ref_data_list``, ``ref_labels_list`` and ``ref_features_list``. - ref_names: Sequence of names for the references. + ref_names: + Sequence of names for the references. If None, these are automatically generated. - assay_type: + assay_type: Assay containing the expression matrix for any entry of ``ref_data_list`` that is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. 
@@ -96,7 +103,7 @@ def build_integrated_references( universe = _stable_union(test_features, *ref_features_list) original_test_features = test_features - test_features = array(_match(test_features, universe), dtype=int32) + test_features = array(ut.match(test_features, universe), dtype=int32) converted_ref_data = [] ref_data_ptrs = ndarray(nrefs, dtype=uintp) @@ -107,14 +114,14 @@ def build_integrated_references( curptr, curfeatures = _clean_matrix( x, ref_features_list[i], - assay_type = assay_type, - check_missing = check_missing, - num_threads = num_threads, + assay_type=assay_type, + check_missing=check_missing, + num_threads=num_threads, ) converted_ref_data.append(curptr) ref_data_ptrs[i] = curptr.ptr - ind = array(_match(curfeatures, universe), dtype=int32) + ind = array(ut.match(curfeatures, universe), dtype=int32) converted_feature_data.append(ind) ref_features_ptrs[i] = ind.ctypes.data @@ -142,7 +149,9 @@ def build_integrated_references( if ref_names is not None: if nrefs != len(ref_names): - raise ValueError("'ref_names' and 'ref_data_list' should have the same length") + raise ValueError( + "'ref_names' and 'ref_data_list' should have the same length" + ) elif nrefs != len(set(ref_names)): raise ValueError("'ref_names' should contain unique names") diff --git a/src/singler/build_single_reference.py b/src/singler/build_single_reference.py index dabe4cd..5761a65 100644 --- a/src/singler/build_single_reference.py +++ b/src/singler/build_single_reference.py @@ -1,9 +1,11 @@ -from numpy import int32, array, ndarray -from typing import Sequence, Union, Any, Optional, Literal +from typing import Any, Literal, Optional, Sequence, Union + +import biocutils as ut +from numpy import array, int32, ndarray -from ._Markers import _Markers from . 
import _cpphelpers as lib -from ._utils import _factorize, _match, _clean_matrix, _restrict_features +from ._Markers import _Markers +from ._utils import _clean_matrix, _factorize, _restrict_features from .get_classic_markers import _get_classic_markers_raw @@ -31,7 +33,7 @@ def __del__(self): def num_markers(self) -> int: """ Returns: - int: Number of markers to be used for classification. This is the + Number of markers to be used for classification. This is the same as the size of the array from :py:meth:`~marker_subset`. """ return lib.get_nsubset_from_single_reference(self._ptr) @@ -39,7 +41,7 @@ def num_markers(self) -> int: def num_labels(self) -> int: """ Returns: - int: Number of unique labels in this reference. + Number of unique labels in this reference. """ return lib.get_nlabels_from_single_reference(self._ptr) @@ -70,13 +72,14 @@ def markers(self) -> dict[Any, dict[Any, Sequence]]: def marker_subset(self, indices_only: bool = False) -> Union[ndarray, list]: """ Args: - indices_only: Whether to return the markers as indices + indices_only: + Whether to return the markers as indices into :py:attr:`~features`, or as a list of feature identifiers. Returns: If ``indices_only = False``, a list of feature identifiers for the markers. - If ``indices_only = True``, a NumPy array containing the integer indices of + If ``indices_only = True``, a NumPy array containing the integer indices of features in ``features`` that were chosen as markers. """ nmarkers = self.num_markers() @@ -104,7 +107,8 @@ def build_single_reference( """Build a single reference dataset in preparation for classification. Args: - ref_data: A matrix-like object where rows are features, columns are + ref_data: + A matrix-like object where rows are features, columns are reference profiles, and each entry is the expression value. 
If `markers` is not provided, expression should be normalized and log-transformed in preparation for marker prioritization via @@ -115,13 +119,16 @@ def build_single_reference( :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing such a matrix in one of its assays. - labels: Sequence of labels for each reference profile, + labels: + Sequence of labels for each reference profile, i.e., column in ``ref``. - features: Sequence of identifiers for each feature, + features: + Sequence of identifiers for each feature, i.e., row in ``ref``. - assay_type: Assay containing the expression matrix, + assay_type: + Assay containing the expression matrix, if `ref_data` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. @@ -159,7 +166,7 @@ def build_single_reference( Number of threads to use for reference building. Returns: - The pre-built reference, ready for use in downstream methods like + The pre-built reference, ready for use in downstream methods like :py:meth:`~singler.classify_single_reference.classify_single_reference`. 
""" @@ -183,7 +190,7 @@ def build_single_reference( **marker_args, ) markers = mrk.to_dict(lablev, ref_features) - labind = array(_match(ref_labels, lablev), dtype=int32) + labind = array(ut.match(ref_labels, lablev), dtype=int32) else: raise NotImplementedError("other marker methods are not implemented, sorry") else: diff --git a/src/singler/classify_integrated_references.py b/src/singler/classify_integrated_references.py index 756c97c..79f1845 100644 --- a/src/singler/classify_integrated_references.py +++ b/src/singler/classify_integrated_references.py @@ -1,12 +1,13 @@ -from typing import Sequence, Union, Any -from numpy import array, ndarray, int32, float64, uintp -from mattress import tatamize, TatamiNumericPointer +from typing import Any, Sequence, Union + +import biocutils as ut from biocframe import BiocFrame +from mattress import TatamiNumericPointer, tatamize +from numpy import array, float64, int32, ndarray, uintp from summarizedexperiment import SummarizedExperiment -from .build_integrated_references import IntegratedReferences from . import _cpphelpers as lib -from ._utils import _match +from .build_integrated_references import IntegratedReferences def classify_integrated_references( @@ -20,7 +21,8 @@ def classify_integrated_references( """Integrate classification results across multiple references for a single test dataset. Args: - test_data: A matrix-like object where each row is a feature and each column + test_data: + A matrix-like object where each row is a feature and each column is a test sample (usually a single cell), containing expression values. Normalized and/or transformed expression values are also acceptable as only the ranking is used within this function. 
@@ -103,7 +105,7 @@ def classify_integrated_references( "each entry of 'results' should have results for all cells in 'test_data'" ) - ind = array(_match(curlabs, all_labels[i]), dtype=int32) + ind = array(ut.match(curlabs, all_labels[i]), dtype=int32) coerced_labels.append(ind) assign_ptrs[i] = ind.ctypes.data diff --git a/src/singler/classify_single_reference.py b/src/singler/classify_single_reference.py index d33eac8..4643e44 100644 --- a/src/singler/classify_single_reference.py +++ b/src/singler/classify_single_reference.py @@ -1,11 +1,11 @@ -from mattress import tatamize -from numpy import ndarray, int32, float64, uintp +from typing import Any, Sequence, Union + from biocframe import BiocFrame -from typing import Sequence, Any, Union +from numpy import float64, int32, ndarray, uintp -from .build_single_reference import SinglePrebuiltReference from . import _cpphelpers as lib -from ._utils import _create_map, _clean_matrix +from ._utils import _clean_matrix, _create_map +from .build_single_reference import SinglePrebuiltReference def classify_single_reference( @@ -23,7 +23,8 @@ def classify_single_reference( using the SingleR algorithm. Args: - test_data: A matrix-like object where each row is a feature and each column + test_data: + A matrix-like object where each row is a feature and each column is a test sample (usually a single cell), containing expression values. Normalized and transformed expression values are also acceptable as only the ranking is used within this function. @@ -32,14 +33,16 @@ def classify_single_reference( :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing such a matrix in one of its assays. - test_features: Sequence of identifiers for each feature in the test + test_features: + Sequence of identifiers for each feature in the test dataset, i.e., row in ``test_data``. ref_prebuilt: A pre-built reference created with :py:meth:`~singler.build_single_reference.build_single_reference`. 
- assay_type: Assay containing the expression matrix, + assay_type: + Assay containing the expression matrix, if `test_data` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`. diff --git a/src/singler/fetch_reference.py b/src/singler/fetch_reference.py index 26c2967..7449364 100644 --- a/src/singler/fetch_reference.py +++ b/src/singler/fetch_reference.py @@ -1,13 +1,13 @@ -import urllib.request as req -import urllib.parse -import summarizedexperiment -import tempfile -import os import gzip +import os +import tempfile +import urllib.parse +import urllib.request as req +from typing import Any, Literal, Optional, Sequence, Union + import biocframe import numpy -from typing import Literal, Any, Sequence, Optional, Union - +import summarizedexperiment SESSION_DIR = None @@ -23,7 +23,7 @@ def fetch_github_reference( - name: KNOWN_REFERENCE, cache_dir: str = None, multiple_ids: bool = False + name: KNOWN_REFERENCE, cache_dir: Optional[str] = None, multiple_ids: bool = False ) -> summarizedexperiment.SummarizedExperiment: """Fetch a reference dataset from the `pre-compiled GitHub registry `_, @@ -181,7 +181,7 @@ def fetch_github_reference( sample += 1 return summarizedexperiment.SummarizedExperiment( - {"ranks": mat}, row_data=row_data, col_data=col_data, metadata=markers + {"ranks": mat}, row_data=row_data, column_data=col_data, metadata=markers ) @@ -190,7 +190,7 @@ def realize_github_markers( features: Sequence, num_markers: Optional[int] = None, restrict_to: Optional[Union[set, dict]] = None, -) -> dict[Any, dict[Any, Sequence]]: +) -> dict[Any, dict[Any, Sequence]]: """Convert marker indices from a GitHub reference dataset into feature identifiers. This allows the markers to be used in :py:meth:`~singler.build_single_reference.build_single_reference`. 
diff --git a/src/singler/get_classic_markers.py b/src/singler/get_classic_markers.py index a7279fe..2ed9fa2 100644 --- a/src/singler/get_classic_markers.py +++ b/src/singler/get_classic_markers.py @@ -1,17 +1,18 @@ -from numpy import ndarray, int32, uintp -from mattress import tatamize -from typing import Union, Sequence, Optional, Any +from typing import Any, Optional, Sequence, Union + import delayedarray +from mattress import tatamize +from numpy import int32, ndarray, uintp from . import _cpphelpers as lib +from ._Markers import _Markers from ._utils import ( _clean_matrix, - _stable_intersect, - _stable_union, _create_map, _restrict_features, + _stable_intersect, + _stable_union, ) -from ._Markers import _Markers def _get_classic_markers_raw( diff --git a/tests/test_annotate_single.py b/tests/test_annotate_single.py index be5cc54..540a43f 100644 --- a/tests/test_annotate_single.py +++ b/tests/test_annotate_single.py @@ -96,5 +96,5 @@ def test_annotate_single_github(): cache_dir="_cache", ) - ref_labels = list(set(se.col_data.column("main"))) + ref_labels = list(set(se.column_data.column("main"))) assert len(more_output.metadata["markers"][ref_labels[0]][ref_labels[1]]) == 10 diff --git a/tests/test_build_integrated_references.py b/tests/test_build_integrated_references.py index 496d2c1..0ea2f93 100644 --- a/tests/test_build_integrated_references.py +++ b/tests/test_build_integrated_references.py @@ -25,7 +25,8 @@ def test_build_integrated_references(): ) assert integrated.reference_names == None - assert integrated.reference_labels == [["A", "B", "C", "D", "E"], ["z", "y", "x"]] + assert list(integrated.reference_labels[0]) == ["A", "B", "C", "D", "E"] + assert list(integrated.reference_labels[1]) == ["z", "y", "x"] assert integrated.test_features == test_features # Works in parallel. 
diff --git a/tests/test_classify_integrated_references.py b/tests/test_classify_integrated_references.py index ac15146..0663a8b 100644 --- a/tests/test_classify_integrated_references.py +++ b/tests/test_classify_integrated_references.py @@ -70,4 +70,4 @@ def test_classify_integrated_references(): assert results.shape[0] == 50 assert set(results.column("best_reference")) == set([0, 1]) - assert results.column("scores").column_names == ['0', '1'] + assert list(results.column("scores").column_names) == ['0', '1'] diff --git a/tests/test_utils.py b/tests/test_utils.py index 2c62de0..2cbdaf8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,5 @@ from singler._utils import ( _factorize, - _match, _stable_intersect, _stable_union, _clean_matrix, @@ -12,36 +11,18 @@ def test_factorize(): lev, ind = _factorize([1, 3, 5, 5, 3, 1]) - assert lev == [1, 3, 5] - assert ind == [0, 1, 2, 2, 1, 0] + assert list(lev) == ["1", "3", "5"] + assert (ind == [0, 1, 2, 2, 1, 0]).all() # Preserves the order. lev, ind = _factorize(["C", "D", "A", "B", "C", "A"]) - assert lev == ["C", "D", "A", "B"] - assert ind == [0, 1, 2, 3, 0, 2] + assert list(lev) == ["C", "D", "A", "B"] + assert (ind == [0, 1, 2, 3, 0, 2]).all() # Handles None-ness. lev, ind = _factorize([1, None, 5, None, 3, None]) - assert lev == [1, 5, 3] - assert ind == [0, None, 1, None, 2, None] - - -def test_match(): - mm = _match(["A", "C", "B", "D", "A", "A", "C", "D", "B"], ["D", "C", "B", "A"]) - assert list(mm) == [3, 1, 2, 0, 3, 3, 1, 0, 2] - - # Handles duplicate targets. - x = [5, 1, 2, 3, 5, 6, 7, 7, 2, 1] - mm = _match(x, [1, 2, 3, 3, 5, 6, 1, 7, 6]) - assert mm == [4, 0, 1, 2, 4, 5, 7, 7, 1, 0] - - # Handles None-ness. 
- mm = _match(["A", None, "B", "D", None, "A", "C", None, "B"], ["D", "C", "B", "A"]) - assert list(mm) == [3, None, 2, 0, None, 3, 1, None, 2] - - mm = _match(["A", "B", "D", "A", "C", "B"], ["D", None, "C", "B", None, "A"]) - assert list(mm) == [5, 3, 0, 5, 2, 3] - + assert list(lev) == ["1", "5", "3"] + assert (ind == [0, -1, 1, -1, 2, -1]).all() def test_intersect(): # Preserves the order in the first argument.