diff --git a/.travis.yml b/.travis.yml
index 23559c2..347ac96 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,7 +13,7 @@ before_install:
   - sudo ln -s /run/shm /dev/shm
 install:
-  - conda install --yes python="3.6" numpy scipy cython h5py typing pandas
+  - conda install --yes python="3.6" numpy scipy cython h5py typing numba
   - pip install sphinx_bootstrap_theme
   # Install the dev version (1.7) of sphinx that has solved a problem with f-strings
   - git clone https://github.com/sphinx-doc/sphinx.git
diff --git a/README.md b/README.md
index cca5385..76f9f08 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,10 @@
-# loompy 2
+# loompy v3.0
 
-⭐ Loompy v2.0 was released Dec. 24, 2017! ([what's new](https://github.com/linnarsson-lab/loompy/releases/tag/v2.0)?)
+⭐ Loompy v3.0 was released Sep. 24, 2019!
 
-`.loom` is an efficient file format for very large omics datasets,
-consisting of a main matrix, optional additional layers, a variable number of row and column
-annotations. Loom also supports sparse graphs. We use loom files to store single-cell gene expression
-data: the main matrix contains the actual expression values (one
-column per cell, one row per gene); row and column annotations
-contain metadata for genes and cells, such as `Name`, `Chromosome`,
-`Position` (for genes), and `Strain`, `Sex`, `Age` (for cells).
-
-![Illustration of Loom format structure](/doc/Loom-images.png)
-
-Loom files (`.loom`) are created in the [HDF5](https://en.wikipedia.org/wiki/Hierarchical_Data_Format) file format, which
-supports an internal collection of numerical multidimensional datasets.
-HDF5 is supported by many computer languages, including Java, MATLAB,
-Mathematica, Python, R, and Julia. `.loom` files are accessible from
-any language that supports HDF5.
-
-To get started, head over to [the documentation](http://linnarssonlab.org/loompy/)!
+To get started, head over to [the documentation](http://loompy.org)!
 
 Loom, loompy, and the [loom-viewer](https://github.com/linnarsson-lab/loom-viewer) are being developed by members of the [Linnarsson Lab](http://linnarssonlab.org).
diff --git a/doc/format/index.rst b/doc/format/index.rst
index 53f5ae2..b8aa3d1 100644
--- a/doc/format/index.rst
+++ b/doc/format/index.rst
@@ -6,7 +6,7 @@ Loom file format specs
 Versions
 --------
 
-This specification defines the Loom file format version ``2.0.1``.
+This specification defines the Loom file format version ``3.0.0``.
 
 .. _formatinfo:
 
@@ -91,15 +91,27 @@ Main matrix and layers
 Global attributes
 ^^^^^^^^^^^^^^^^^
 
-- There can OPTIONALLY be at least one `HDF5
-  attribute `__ on the
-  root ``/`` group, which can be any valid scalar or multidimensional datatype and should be
-  interpreted as attributes of the whole Loom file.
-- There can OPTIONALLY be an `HDF5
-  attribute `__ on the
-  root ``/`` group named ``LOOM_SPEC_VERSION``, a string value giving the
-  loom file spec version that was followed in creating the file. See top of this
-  document for the current version of the spec.
+- There MUST be an HDF5 group ``/attrs`` containing global attributes.
+- There MUST be an HDF5 dataset ``/attrs/LOOM_SPEC_VERSION`` with the value ``3.0.0``.
+
+Global attributes apply semantically to the whole file, not any specific part of it.
+Such attributes are stored in the HDF5 group ``/attrs`` and can be any valid scalar
+or multidimensional datatype.
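+
+For example (an illustrative sketch only, not part of the spec), a reader
+following the lookup order described below might resolve a global attribute
+like this::
+
+    import h5py
+
+    def read_global_attr(f: h5py.File, name: str):
+        # Loom v3.0.0 and later: global attributes live as datasets in /attrs
+        if "attrs" in f and name in f["attrs"]:
+            return f["attrs"][name][()]
+        # Legacy fallback: HDF5 attributes on the root group
+        return f.attrs[name]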
+
+As of Loom file format v3.0.0, only one global attribute is mandatory: the ``LOOM_SPEC_VERSION``
+attribute, which is a string value giving the loom file spec version that was followed in creating
+the file. See top of this document for the current version of the spec.
+
+Note: previous versions of the loom file format stored global attributes as `HDF5 attributes `__
+on the root ``/`` group. However, such attributes are size-limited, which caused problems for some
+applications. For backwards compatibility, readers compatible with Loom v3.0.0 and above MUST first look
+for global attributes under the HDF5 group ``/attrs`` (if it exists). If a requested attribute does not exist
+as a dataset under that group, the reader MUST then examine the HDF5 attributes on the root ``/`` group.
+
+When writing a global attribute, the writer MUST write only to the ``/attrs`` group if ``LOOM_SPEC_VERSION`` is
+``3.0.0`` or higher. The writer MUST write to both the ``/attrs`` group and the HDF5 attributes on the root ``/``
+group if ``LOOM_SPEC_VERSION`` is lower than ``3.0.0`` or if it does not exist. This is to preserve a consistent
+format for legacy files.
 
 
 Row and column attributes
diff --git a/loompy/__init__.py b/loompy/__init__.py
index 5d5d27b..c09c818 100644
--- a/loompy/__init__.py
+++ b/loompy/__init__.py
@@ -1,13 +1,15 @@
-from .utils import *
+from .utils import get_loom_spec_version, compare_loom_spec_version, timestamp, deprecated
 from .normalize import normalize_attr_values, materialize_attr_values
 from .attribute_manager import AttributeManager
-from .file_attribute_manager import FileAttributeManager
+from .global_attribute_manager import GlobalAttributeManager
 from .graph_manager import GraphManager
 from .layer_manager import LayerManager
 from .loom_view import LoomView
 from .loom_layer import MemoryLoomLayer, LoomLayer
 from .to_html import to_html
 from .view_manager import ViewManager
-from .loompy import connect, create, create_append, combine, create_from_cellranger, LoomConnection, new, combine_faster
+from .loompy import connect, create, create_append, combine, create_from_cellranger, LoomConnection, new, combine_faster, create_from_matrix_market
 from .loom_validator import LoomValidator
 from ._version import __version__, loom_spec_version
+from .bus_file import create_from_fastq
+from .cell_calling import call_cells
diff --git a/loompy/_version.py b/loompy/_version.py
index 26c8c01..f742e1b 100644
--- a/loompy/_version.py
+++ b/loompy/_version.py
@@ -1,2 +1,2 @@
-__version__ = '2.0.18'
-loom_spec_version = '2.0.1'
+__version__ = '3.0.0'
+loom_spec_version = '3.0.0'
diff --git a/loompy/attribute_manager.py b/loompy/attribute_manager.py
index 5968ef3..f48d898 100644
--- a/loompy/attribute_manager.py
+++ b/loompy/attribute_manager.py
@@ -1,7 +1,9 @@
 from typing import *
 import numpy as np
+import h5py
 import loompy
 from loompy import timestamp
+from .utils import compare_loom_spec_version
 
 
 class AttributeManager:
@@ -144,13 +146,17 @@ def __setattr__(self, name: str, val: np.ndarray) -> None:
 			raise KeyError("Attribute name cannot contain slash (/)")
 		else:
 			if self.ds is not None:
-				values = loompy.normalize_attr_values(val)
+				values = loompy.normalize_attr_values(val, compare_loom_spec_version(self.ds._file, "3.0.0") >= 0)
 				a = ["/row_attrs/", "/col_attrs/"][self.axis]
 				if self.ds.shape[self.axis] != 0 and values.shape[0] != self.ds.shape[self.axis]:
 					raise ValueError(f"Attribute '{name}' must have exactly {self.ds.shape[self.axis]} values but {len(values)} were given")
 				if 
self.ds._file[a].__contains__(name): del self.ds._file[a + name] - self.ds._file[a + name] = values # TODO: for 2D arrays, use block compression along columns/rows + + if values.dtype == np.object_: + self.ds._file.create_dataset(a + name, data=values, dtype=h5py.string_dtype(encoding="utf-8")) + else: + self.ds._file[a + name] = values self.ds._file[a + name].attrs["last_modified"] = timestamp() self.ds._file[a].attrs["last_modified"] = timestamp() self.ds._file.attrs["last_modified"] = timestamp() diff --git a/loompy/bus_file.py b/loompy/bus_file.py new file mode 100644 index 0000000..ca4a324 --- /dev/null +++ b/loompy/bus_file.py @@ -0,0 +1,463 @@ +import array +import gzip +import json +import logging +import os +import sqlite3 as sqlite +import subprocess +from math import lgamma +from tempfile import TemporaryDirectory +from typing import Dict, Generator, List, Optional, Tuple + +import numpy as np +import scipy.sparse as sparse +from numba import jit + +from .cell_calling import call_cells +from .loompy import connect, create + + +# Copied from cytograph +@jit("float32(float64[:], float64[:])", nopython=True, parallel=True, nogil=True) +def multinomial_distance(p: np.ndarray, q: np.ndarray) -> float: + N = p.shape[0] + p_sum = p.sum() + q_sum = q.sum() + x = lgamma(N) + lgamma(p_sum + q_sum + N) - lgamma(p_sum + N) - lgamma(q_sum + N) + for k in range(N): + x += lgamma(p[k] + 1) + lgamma(q[k] + 1) - lgamma(1) - lgamma(p[k] + q[k] + 1) + x = np.exp(x) + return 1 - 1 / (1 + x) + + +# https://maciejkula.github.io/2015/02/22/incremental-construction-of-sparse-matrices/ +class IncrementalSparseMatrixUInt16: + def __init__(self, shape: Tuple[int, int]): + self.dtype = np.uint16 + self.shape = shape + + self.rows = array.array('i') + self.cols = array.array('i') + self.data = array.array('H') + + def append(self, i: int, j: int, v: int) -> None: + m, n = self.shape + + if (i >= m or j >= n): + raise Exception('Index out of bounds') + + self.rows.append(i) + self.cols.append(j) + self.data.append(v) + + def tocoo(self) -> sparse.coo_matrix: + rows = np.frombuffer(self.rows, dtype=np.int32) + cols = np.frombuffer(self.cols, dtype=np.int32) + data = np.frombuffer(self.data, dtype=np.uint16) + return sparse.coo_matrix((data, (rows, cols)), shape=self.shape) + + def __len__(self) -> int: + return len(self.data) + + +twobit_to_dna_table = {0: "A", 1: "C", 2: "G", 3: "T"} +dna_to_twobit_table = {"A": 0, "C": 1, "G": 2, "T": 3} + + +@jit +def twobit_to_dna(twobit: int, size: int) -> str: + result = [] + for i in range(size): + x = (twobit & (3 << 2 * i)) >> 2 * i + if x == 0: + result.append("A") + elif x == 1: + result.append("C") + elif x == 2: + result.append("G") + elif x == 3: + result.append("T") + result.reverse() + return "".join(result) + + +@jit +def dna_to_twobit(dna: str) -> int: + x = 0 + for nt in dna: + if nt == "A": + x += 0 + elif nt == "C": + x += 1 + elif nt == "G": + x += 2 + elif nt == "T": + x += 3 + x <<= 2 + x >>= 2 + return x + + +@jit +def twobit_1hamming(twobit: int, size: int) -> List[int]: + result = [] + for i in range(size): + x = (twobit >> 2 * (size - i - 1)) & 3 + for j in range(4): + if x == j: + continue + result.append(twobit & ~(3 << 2 * (size - i - 1)) | (j << 2 * (size - i - 1))) + return result + + +def ixs_thatsort_a2b(a: np.ndarray, b: np.ndarray, check_content: bool = True) -> np.ndarray: + "This is super duper magic sauce to make the order of one list to be like another" + if check_content: + assert len(np.intersect1d(a, b)) == len(a), f"The two arrays 
are not matching" + return np.argsort(a)[np.argsort(np.argsort(b))] + + +def load_sample_metadata(path: str, sample_id: str) -> Dict[str, str]: + if not os.path.exists(path): + raise ValueError(f"Samples metadata file '{path}' not found.") + if path.endswith(".db"): + # sqlite3 + with sqlite.connect(path) as db: + cursor = db.cursor() + cursor.execute("SELECT * FROM sample WHERE name = ?", (sample_id,)) + keys = [x[0] for x in cursor.description] + vals = cursor.fetchone() + if vals is not None: + return dict(zip(keys, vals)) + raise ValueError(f"SampleID '{sample_id}' was not found in the samples database.") + else: + result = {} + with open(path) as f: + headers = [x.lower() for x in f.readline()[:-1].split("\t")] + if "sampleid" not in headers and 'name' not in headers: + raise ValueError("Required column 'SampleID' or 'Name' not found in sample metadata file") + if "sampleid" in headers: + sample_metadata_key_idx = headers.index("sampleid") + else: + sample_metadata_key_idx = headers.index("name") + sample_found = False + for line in f: + items = line[:-1].split("\t") + if len(items) > sample_metadata_key_idx and items[sample_metadata_key_idx] == sample_id: + for i, item in enumerate(items): + result[headers[i]] = item + sample_found = True + if not sample_found: + raise ValueError(f"SampleID '{sample_id}' not found in sample metadata file") + return result + + +class BusFile: + def __init__(self, path: str, genes_metadata_file: str, genes_metadata_key: str, fragments2genes_file: str, equivalence_classes_file: str, fragments_file: str) -> None: + self.matrix: sparse.coo_matrix = None + logging.info("Loading gene metadata") + self.genes: Dict[str, List[str]] = {} # Keys are Accessions, values are lists of attribute values + self.gene_metadata_attributes: List[str] = [] # Attribute names + with open(genes_metadata_file) as f: + line = f.readline() + self.gene_metadata_attributes = line[:-1].split("\t") + if genes_metadata_key not in self.gene_metadata_attributes: + raise ValueError(f"Metadata key '{genes_metadata_key}' not found in gene metadata file") + key_col = self.gene_metadata_attributes.index(genes_metadata_key) + for line in f: + items = line[:-1].split("\t") + self.genes[items[key_col]] = items + self.accessions = np.array([x for x in self.genes.keys()]) + accession_idx = {acc: i for (i, acc) in enumerate(self.accessions)} + self.n_genes = len(self.accessions) + + logging.info("Loading fragments-to-gene mappings") + self.gene_for_fragment: List[str] = [] + with open(fragments2genes_file) as f: + for line in f: + transcript_id, accession = line[:-1].split("\t") + self.gene_for_fragment.append(accession) + self.gene_for_fragment = np.array(self.gene_for_fragment) + + logging.info("Indexing genes") + # Array of indices into self.accessions for each gene in gene_for_transcript + self.gene_for_fragment_idx = np.zeros(len(self.gene_for_fragment), dtype="int32") + for i in range(len(self.gene_for_fragment)): + self.gene_for_fragment_idx[i] = accession_idx[self.gene_for_fragment[i]] + + logging.info("Loading equivalence classes") + self.equivalence_classes: Dict[int, List[int]] = {} + with open(equivalence_classes_file) as f: + for line in f: + ec, trs = line[:-1].split("\t") + # Each equivalence class is a set of fragments (transcripts) + self.equivalence_classes[int(ec)] = np.array([int(x) for x in trs.split(",")]) + # But we want each equivalence class mapped to a gene (or -1 if multimapping) + logging.info("Mapping equivalence classes to genes") + self.gene_for_ec: Dict[int, int] = 
{} + for eqc in self.equivalence_classes.keys(): + gene = -1 + for tid in self.equivalence_classes[eqc]: + if gene == -1: + gene = self.gene_for_fragment_idx[tid] + continue + elif self.gene_for_fragment_idx[tid] != gene: + gene = -1 # Multimapping UMI + break + self.gene_for_ec[eqc] = gene + + logging.info("Loading fragment IDs") + self.fragments: List[str] = [] + with open(fragments_file) as f: + for line in f: + self.fragments.append(line[:-1]) + self.fragments = np.array(self.fragments) + + logging.info("Loading BUS records") + fsize = os.path.getsize(path) + with open(path, "rb") as fb: + # Read the header + magic = fb.read(4) + if magic != b"BUS\0": + raise IOError("Not a valid BUS file (four leading magic bytes are missing)") + self.version = int.from_bytes(fb.read(4), byteorder="little", signed=False) + self.barcode_length = int.from_bytes(fb.read(4), byteorder="little", signed=False) + self.umi_length = int.from_bytes(fb.read(4), byteorder="little", signed=False) + tlen = int.from_bytes(fb.read(4), byteorder="little", signed=False) + self.header = fb.read(tlen).decode("utf-8") # BUS does not specify an encoding, but let's assume UTF8 + + # Read the records + self.n_records = (fsize - tlen - 20) // 32 + self.bus = np.fromfile(fb, dtype=[ + ("barcode", np.uint64), + ("UMI", np.uint64), + ("equivalence_class", np.int32), + ("count", np.uint32), + ("flags", np.uint32), + ("padding", np.int32) + ], count=self.n_records) + self.bus_gene = np.array([self.gene_for_ec[x] for x in self.bus["equivalence_class"]], dtype=np.int32) + self.bus_valid = self.bus_gene != -1 + + logging.info("Sorting cell IDs") + self.cell_ids = np.unique(self.bus["barcode"]) + self.cell_ids.sort() + self.n_cells = len(self.cell_ids) # This will change after error correction + self.layers: Dict[str, sparse.coo_matrix] = {} # Dict of layer name -> sparse matrix + self.ambient_umis = 0 + + def correct(self, whitelist_file: str = None) -> np.ndarray: + if whitelist_file is not None: + size = 0 + whitelist = set() + with open(whitelist_file) as f: + for bc in f: + size = len(bc) - 1 # Don't count the newline + whitelist.add(dna_to_twobit(bc[:-1])) + for i, bc in enumerate(self.bus["barcode"]): + if bc in whitelist: + continue + corrected = False + for mut in twobit_1hamming(int(bc), size=size): + if mut in whitelist: + self.bus["barcode"][i] = mut + corrected = True + break + if not corrected: + self.bus_valid[i] = False + self.cell_ids = np.unique(self.bus["barcode"][self.bus_valid]) + self.cell_ids.sort() + self.cell_for_barcode_idx = {bc: i for (i, bc) in enumerate(self.cell_ids)} + self.n_cells = len(self.cell_ids) + + def deduplicate(self) -> None: + # Sort by barcode, then by UMI, then by gene + ordering = np.lexsort((self.bus_gene, self.bus["UMI"], self.bus["barcode"])) + self.bus = self.bus[ordering] + self.bus_gene = self.bus_gene[ordering] + self.bus_valid = self.bus_valid[ordering] + dupes = (self.bus["barcode"][1:] == self.bus["barcode"][:-1]) & (self.bus["UMI"][1:] == self.bus["UMI"][:-1]) & (self.bus_gene[1:] == self.bus_gene[:-1]) + self.bus_valid[1:][dupes] = False + + def count(self) -> sparse.coo_matrix: + logging.info("Counting pseudoalignments for main matrix") + genes = self.bus_gene[self.bus_valid] + cells = [self.cell_for_barcode_idx[x] for x in self.bus["barcode"][self.bus_valid]] + self.matrix = sparse.coo_matrix((np.ones_like(genes), (genes, cells)), shape=(self.n_genes, self.n_cells), dtype=np.uint16) + self.total_umis = np.array(self.matrix.sum(axis=0))[0] + return self.matrix + + def 
remove_empty_beads(self, expected_n_cells: int) -> None: + logging.info("Calling cells") + self.ambient_umis, self.ambient_pvalue = call_cells(self.matrix.tocsc(), expected_n_cells) + self.valid_cells = (self.ambient_pvalue < 0.01) | (self.total_umis > 1500) + self.matrix = self.matrix.tocsr()[:, self.valid_cells] + self.cell_ids = self.cell_ids[self.valid_cells] + self.n_cells = self.valid_cells.sum() + for name in self.layers.keys(): + self.layers[name] = self.layers[name].tocsc()[:, self.valid_cells] + logging.info(f"Found {self.n_cells} valid cells and ~{int(self.ambient_umis)} ambient UMIs.") + + def count_layer(self, layer_name: str, layer_fragments_file: str) -> sparse.coo_matrix: + fragments_idx = {f: i for i, f in enumerate(self.fragments)} + with open(layer_fragments_file) as f: + include_fragments = set([fragments_idx[x[:-1]] for x in f.readlines()]) + logging.info(f"Counting pseudoalignments for layer '{layer_name}'") + # Figure out which of the equivalence classes are relevant + include_ec = {} + for ec, tids in self.equivalence_classes.items(): + if any([tid in include_fragments for tid in tids]): + include_ec[ec] = True + else: + include_ec[ec] = False + m = IncrementalSparseMatrixUInt16((self.n_genes, self.n_cells)) + for ix in range(self.n_records): + if not self.bus_valid[ix]: + continue + gene = self.bus_gene[ix] + if include_ec[self.bus["equivalence_class"][ix]]: + m.append(gene, self.cell_for_barcode_idx[self.bus["barcode"][ix]], 1) + self.layers[layer_name] = m.tocoo() + return self.layers[layer_name] + + def save(self, out_file: str, sample_id: str, samples_metadata_file: str) -> None: + logging.info("Saving") + row_attrs = {} + # Transpose the gene metadata + for i, attr in enumerate(self.gene_metadata_attributes): + row_attrs[attr] = [v[i] for v in self.genes.values()] + + # Create cell attributes + col_attrs = { + "CellID": [sample_id + "_" + twobit_to_dna(int(cid), 16) for cid in self.cell_ids], + "TotalUMIs": self.total_umis[self.valid_cells] + } + + # Load sample metadata + metadata = load_sample_metadata(samples_metadata_file, sample_id) + global_attrs = { + **{ + "SampleID": sample_id, + "AmbientUMIs": self.ambient_umis, + "RedundantReadFraction": 1 - self.bus_valid.sum() / self.n_records, + "AmbientPValue": self.ambient_pvalue, + "BarcodeTotalUMIs": self.total_umis, + "CellBarcodes": self.valid_cells + }, **metadata} + + layers = self.layers.copy() + layers[""] = self.matrix + create(out_file, layers, row_attrs, col_attrs, file_attrs=global_attrs) + + +def execute(cmd: List[str], synchronous: bool = False) -> Generator: + if synchronous: + yield os.popen(" ".join(cmd)).read() + else: + popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # type: ignore + for stdout_line in iter(popen.stdout.readline, ""): + yield stdout_line + popen.stdout.close() + return_code = popen.wait() + if return_code: + raise subprocess.CalledProcessError(return_code, cmd) + + +def create_from_fastq(out_file: str, sample_id: str, fastqs: List[str], index_path: str, samples_metadata_file: str, n_threads: int = 1, temp_folder: str = None, synchronous: bool = False) -> None: + """ + Args: + technology String like "10xv2" or None to read the technology from the sample metadata file + expected_n_cells Expected number of cells captured in the sample, or None to read the number from the sample metadata file + samples_metadata_file Path to tab-delimited file with one header row OR path to sqlite database with one table called "sample" + + Remarks: + 
Samples metadata table should contain these columns:
+			name			Sample name (i.e. sample id)
+			chemistry		10x chemistry version (v1, v2 or v3)
+			targetnumcells	Number of cells expected in the sample
+	"""
+	manifest_file = os.path.join(index_path, "manifest.json")
+	if not os.path.exists(manifest_file):
+		raise ValueError(f"Manifest file 'manifest.json' was missing from index at '{index_path}'")
+	for fastq in fastqs:
+		if not os.path.exists(fastq):
+			raise ValueError(f"Fastq file '{fastq}' was not found")
+	if not os.path.exists(samples_metadata_file):
+		raise ValueError("Samples metadata file not found")
+	with open(manifest_file) as f:
+		manifest = json.load(f)
+
+	metadata = load_sample_metadata(samples_metadata_file, sample_id)
+
+	if (("technology" not in metadata) and ("chemistry" not in metadata)) or ("targetnumcells" not in metadata):
+		logging.debug(f"Sample metadata contained keys: {list(metadata.keys())}")
+		raise ValueError("Samples metadata must contain columns 'targetnumcells' and either 'chemistry' or 'technology'")
+	if "technology" in metadata:
+		technology = metadata["technology"]
+	else:
+		technology = "10x" + metadata["chemistry"]
+	try:
+		expected_n_cells = int(metadata["targetnumcells"])
+	except ValueError:
+		expected_n_cells = 5000
+
+	whitelist_file: Optional[str] = os.path.join(index_path, f"{technology}_whitelist.txt")
+	if not os.path.exists(whitelist_file):  # type: ignore
+		logging.warning(f"Barcode whitelist file {whitelist_file} not found in index folder at '{index_path}'; barcode correction will be skipped.")
+		whitelist_file = None
+
+	with TemporaryDirectory() as d:
+		if temp_folder is not None:
+			d = temp_folder
+			if not os.path.exists(d):
+				os.mkdir(d)
+		cmd = ["kallisto", "bus", "-i", os.path.join(index_path, manifest["index_file"]), "-o", d, "-x", technology, "-t", str(n_threads)] + fastqs
+		logging.info(" ".join(cmd))
+		for line in execute(cmd, synchronous):
+			if line != "\n":
+				logging.info(line[:-1])
+
+		run_info: Optional[Dict[str, str]] = None
+		try:
+			with open(os.path.join(d, "run_info.json")) as f:
+				run_info = json.load(f)
+		except json.JSONDecodeError as e:
+			with open(os.path.join(d, "run_info.json")) as f:
+				for line in f:
+					logging.error(line)
+			logging.error(f"Error decoding run_info.json: {e}")
+		bus = BusFile(
+			os.path.join(d, "output.bus"),
+			os.path.join(index_path, manifest["gene_metadata_file"]),
+			manifest["gene_metadata_key"],
+			os.path.join(index_path, manifest["fragments_to_genes_file"]),
+			os.path.join(d, "matrix.ec"),
+			os.path.join(d, "transcripts.txt")
+		)
+		logging.info(f"Found {bus.n_records:,} records for {bus.n_genes:,} genes and {bus.n_cells:,} uncorrected cell barcodes.")
+		if whitelist_file is None:
+			logging.warning("Not correcting barcodes, because whitelist file was not provided.")
+		else:
+			logging.info("Correcting cell barcodes")
+			bus.correct(whitelist_file)
+			logging.info(f"Found {bus.n_cells:,} corrected cell barcodes.")
+		logging.info("Removing redundant reads using UMIs")
+		bus.deduplicate()
+		seq_sat = 1 - bus.bus_valid.sum() / bus.n_records
+		logging.info(f"{int(seq_sat * 100)}% sequencing saturation.")
+		bus.count()
+		logging.info(f"Found {bus.matrix.count_nonzero():,} UMIs.")
+		for layer, layer_def in manifest["layers"].items():
+			bus.count_layer(layer, os.path.join(index_path, layer_def))
+			logging.info(f"Found {bus.layers[layer].count_nonzero():,} UMIs.")
+		bus.remove_empty_beads(expected_n_cells)
+		logging.info(f"Creating loom file '{out_file}'")
+		bus.save(out_file, sample_id, samples_metadata_file)
+		with connect(out_file) as ds:
+			ds.attrs.Species = 
manifest["species"] + ds.attrs.Saturation = seq_sat + if run_info is not None: + ds.attrs.NumReadsProcessed = int(run_info["n_processed"]) + ds.attrs.NumPseudoaligned = int(run_info["n_pseudoaligned"]) + ds.attrs.KallistoCommand = run_info["call"] + ds.attrs.KallistoVersion = run_info["kallisto_version"] diff --git a/loompy/cell_calling.py b/loompy/cell_calling.py new file mode 100644 index 0000000..9a165d0 --- /dev/null +++ b/loompy/cell_calling.py @@ -0,0 +1,352 @@ +# Original algorithm was published by John Marioni and colleagues as EmptyDrops (Lun, A. et al. Distinguishing cells from empty droplets in droplet-based single-cell RNA sequencing data.) + +# This implementation is based on the code in cellranger v3.0 by 10x Genomics + +# Copyright 2018 10X Genomics, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import logging + +import numpy as np +import scipy.sparse as sparse +import scipy.stats as sp_stats + +# Simple Good-Turing estimator. +# Based on S implementation in +# William A. Gale & Geoffrey Sampson (1995) Good-turing frequency estimation without tears, +# Journal of Quantitative Linguistics, 2:3, 217-237, DOI: 10.1080/09296179508590051 + + +class SimpleGoodTuringError(Exception): + pass + + +def _averaging_transform(r, nr): + d = np.concatenate((np.ones(1, dtype=int), np.diff(r))) + dr = np.concatenate(( + 0.5 * (d[1:] + d[0:-1]), + np.array((d[-1],), dtype=float), + )) + return nr.astype(float) / dr + + +def _rstest(r, coef): + return r * np.power(1 + 1 / r, 1 + coef) + + +def simple_good_turing(xr, xnr): + """Make a Simple Good-Turing estimate of the frequencies. + + Args: + xr (np.array(int)): Non-zero item frequencies + xnr (np.array(int)): Non-zero frequencies of frequencies + Returns: + (rstar (np.array(float)), p0 (float)): + rstar: The adjusted non-zero frequencies + p0: The total probability of unobserved items + """ + + xr = xr.astype(float) + xnr = xnr.astype(float) + + xN = np.sum(xr * xnr) + + # Get Linear Good-Turing estimate + xnrz = _averaging_transform(xr, xnr) + slope, intercept, _, _, _ = sp_stats.linregress(np.log(xr), np.log(xnrz)) + + if slope > -1: + raise SimpleGoodTuringError("The log-log slope is > -1 (%d); the SGT estimator is not applicable to these data." 
% slope) + + xrst = _rstest(xr, slope) + xrstrel = xrst / xr + + # Get traditional Good-Turing estimate + xrtry = xr == np.concatenate((xr[1:] - 1, np.zeros(1))) + xrstarel = np.zeros(len(xr)) + xrstarel[xrtry] = (xr[xrtry] + 1) / xr[xrtry] * np.concatenate((xnr[1:], np.zeros(1)))[xrtry] / xnr[xrtry] + + # Determine when to switch from GT to LGT estimates + tursd = np.ones(len(xr)) + for i in range(len(xr)): + if xrtry[i]: + tursd[i] = float(i + 2) / xnr[i] * np.sqrt(xnr[i + 1] * (1 + xnr[i + 1] / xnr[i])) + + xrstcmbrel = np.zeros(len(xr)) + useturing = True + for r in range(len(xr)): + if not useturing: + xrstcmbrel[r] = xrstrel[r] + else: + if np.abs(xrstrel[r] - xrstarel[r]) * (1 + r) / tursd[r] > 1.65: + xrstcmbrel[r] = xrstarel[r] + else: + useturing = False + xrstcmbrel[r] = xrstrel[r] + + # Renormalize the probabilities for observed objects + sumpraw = np.sum(xrstcmbrel * xr * xnr / xN) + + xrstcmbrel = xrstcmbrel * (1 - xnr[0] / xN) / sumpraw + p0 = xnr[0] / xN + + return (xr * xrstcmbrel, p0) + + +def sgt_proportions(frequencies): + """Use Simple Good-Turing estimate to adjust for unobserved items + + Args: + frequencies (np.array(int)): Nonzero frequencies of items + Returns: + (pstar (np.array(float)), p0 (float)): + pstar: The adjusted non-zero proportions + p0: The total probability of unobserved items + """ + if len(frequencies) == 0: + raise ValueError("Input frequency vector is empty") + if np.count_nonzero(frequencies) != len(frequencies): + raise ValueError("Frequencies must be greater than zero") + + freqfreqs = np.bincount(frequencies.astype(np.int64)) + assert freqfreqs[0] == 0 + use_freqs = np.flatnonzero(freqfreqs) + + if len(use_freqs) < 10: + raise SimpleGoodTuringError("Too few non-zero frequency items (%d). Aborting SGT." % len(use_freqs)) + + rstar, p0 = simple_good_turing(use_freqs, freqfreqs[use_freqs]) + + # rstar contains the smoothed frequencies. + # Map each original frequency r to its smoothed rstar. + rstar_dict = dict(zip(use_freqs, rstar)) + + rstar_sum = np.sum(freqfreqs[use_freqs] * rstar) + rstar_i = np.fromiter((rstar_dict[f] for f in frequencies), dtype=float, count=len(frequencies)) + pstar = (1 - p0) * (rstar_i / rstar_sum) + + assert np.isclose(p0 + np.sum(pstar), 1) + return (pstar, p0) + + +def adjust_pvalue_bh(p): + """ Multiple testing correction of p-values using the Benjamini-Hochberg procedure """ + descending = np.argsort(p)[::-1] + # q = p * N / k where p = p-value, N = # tests, k = p-value rank + scale = float(len(p)) / np.arange(len(p), 0, -1) + q = np.minimum(1, np.minimum.accumulate(scale * p[descending])) + + # Return to original order + return q[np.argsort(descending)] + + +def eval_multinomial_loglikelihoods(matrix, profile_p, max_mem_gb=0.1): + """Compute the multinomial log PMF for many barcodes + Args: + matrix (scipy.sparse.csc_matrix): Matrix of UMI counts (feature x barcode) + profile_p (np.ndarray(float)): Multinomial probability vector + max_mem_gb (float): Try to bound memory usage. 
+ Returns: + log_likelihoods (np.ndarray(float)): Log-likelihood for each barcode + """ + gb_per_bc = float(matrix.shape[0] * matrix.dtype.itemsize) / (1024**3) + bcs_per_chunk = max(1, int(round(max_mem_gb / gb_per_bc))) + num_bcs = matrix.shape[1] + + loglk = np.zeros(num_bcs) + + for chunk_start in range(0, num_bcs, bcs_per_chunk): + chunk = slice(chunk_start, chunk_start + bcs_per_chunk) + matrix_chunk = matrix[:, chunk].transpose().toarray() + n = matrix_chunk.sum(1) + loglk[chunk] = sp_stats.multinomial.logpmf(matrix_chunk, n, p=profile_p) + return loglk + + +def simulate_multinomial_loglikelihoods(profile_p, umis_per_bc, num_sims=1000, jump=1000, n_sample_feature_block=1000000, verbose=False): + """Simulate draws from a multinomial distribution for various values of N. + Uses the approximation from Lun et al. ( https://www.biorxiv.org/content/biorxiv/early/2018/04/04/234872.full.pdf ) + Args: + profile_p (np.ndarray(float)): Probability of observing each feature. + umis_per_bc (np.ndarray(int)): UMI counts per barcode (multinomial N). + num_sims (int): Number of simulations per distinct N value. + jump (int): Vectorize the sampling if the gap between two distinct Ns exceeds this. + n_sample_feature_block (int): Vectorize this many feature samplings at a time. + Returns: + (distinct_ns (np.ndarray(int)), log_likelihoods (np.ndarray(float)): + distinct_ns is an array containing the distinct N values that were simulated. + log_likelihoods is a len(distinct_ns) x num_sims matrix containing the + simulated log likelihoods. + """ + distinct_n = np.flatnonzero(np.bincount(umis_per_bc.astype(np.int64))) + + loglk = np.zeros((len(distinct_n), num_sims), dtype=float) + + sampled_features = np.random.choice(len(profile_p), size=n_sample_feature_block, p=profile_p, replace=True) + k = 0 + + log_profile_p = np.log(profile_p) + + for sim_idx in range(num_sims): + curr_counts = np.ravel(sp_stats.multinomial.rvs(distinct_n[0], profile_p, size=1)) + + curr_loglk = sp_stats.multinomial.logpmf(curr_counts, distinct_n[0], p=profile_p) + + loglk[0, sim_idx] = curr_loglk + + for i in range(1, len(distinct_n)): + step = distinct_n[i] - distinct_n[i - 1] + if step >= jump: + # Instead of iterating for each n, sample the intermediate ns all at once + curr_counts += np.ravel(sp_stats.multinomial.rvs(step, profile_p, size=1)) + curr_loglk = sp_stats.multinomial.logpmf(curr_counts, distinct_n[i], p=profile_p) + assert not np.isnan(curr_loglk) + else: + # Iteratively sample between the two distinct values of n + for n in range(distinct_n[i - 1] + 1, distinct_n[i] + 1): + j = sampled_features[k] + k += 1 + if k >= n_sample_feature_block: + # Amortize this operation + sampled_features = np.random.choice(len(profile_p), size=n_sample_feature_block, p=profile_p, replace=True) + k = 0 + curr_counts[j] += 1 + curr_loglk += log_profile_p[j] + np.log(float(n) / curr_counts[j]) + + loglk[i, sim_idx] = curr_loglk + + return distinct_n, loglk + + +def compute_ambient_pvalues(umis_per_bc, obs_loglk, sim_n, sim_loglk): + """Compute p-values for observed multinomial log-likelihoods + Args: + umis_per_bc (nd.array(int)): UMI counts per barcode + obs_loglk (nd.array(float)): Observed log-likelihoods of each barcode deriving from an ambient profile + sim_n (nd.array(int)): Multinomial N for simulated log-likelihoods + sim_loglk (nd.array(float)): Simulated log-likelihoods of shape (len(sim_n), num_simulations) + Returns: + pvalues (nd.array(float)): p-values + """ + assert len(umis_per_bc) == len(obs_loglk) + assert 
sim_loglk.shape[0] == len(sim_n) + + # Find the index of the simulated N for each barcode + sim_n_idx = np.searchsorted(sim_n, umis_per_bc) + num_sims = sim_loglk.shape[1] + + num_barcodes = len(umis_per_bc) + + pvalues = np.zeros(num_barcodes) + + for i in range(num_barcodes): + num_lower_loglk = np.sum(sim_loglk[sim_n_idx[i], :] < obs_loglk[i]) + pvalues[i] = float(1 + num_lower_loglk) / (1 + num_sims) + return pvalues + + +def estimate_profile_sgt(matrix, barcode_indices, nz_feat): + """ Estimate a gene expression profile by Simple Good Turing. + Args: + raw_mat (sparse matrix): Sparse matrix of all counts + barcode_indices (np.array(int)): Barcode indices to use + nz_feat (np.array(int)): Indices of features that are non-zero at least once + Returns: + profile (np.array(float)): Estimated probabilities of length len(nz_feat). + """ + # Initial profile estimate + prof_mat = matrix[:, barcode_indices] + + profile = np.ravel(prof_mat[nz_feat, :].sum(axis=1)) + zero_feat = np.flatnonzero(profile == 0) + + # Simple Good Turing estimate + p_smoothed, p0 = sgt_proportions(profile[np.flatnonzero(profile)]) + + # Distribute p0 equally among the zero elements. + p0_i = p0 / len(zero_feat) + + profile_p = np.repeat(p0_i, len(nz_feat)) + profile_p[np.flatnonzero(profile)] = p_smoothed + + assert np.isclose(profile_p.sum(), 1.0) + return profile_p + + +# Construct a background expression profile from barcodes with <= T UMIs +def est_background_profile_sgt(matrix, use_bcs): + """ Estimate a gene expression profile on a given subset of barcodes. + Use Good-Turing to smooth the estimated profile. + Args: + matrix (scipy.sparse.csc_matrix): Sparse matrix of all counts + use_bcs (np.array(int)): Indices of barcodes to use (col indices into matrix) + Returns: + profile (use_features, np.array(float)): Estimated probabilities of length use_features. 
+ """ + # Use features that are nonzero anywhere in the data + use_feats = np.flatnonzero(np.asarray(matrix.sum(1))) + + # Estimate background profile + bg_profile_p = estimate_profile_sgt(matrix, use_bcs, use_feats) + + return (use_feats, bg_profile_p) + + +# Sten Linnarsson's version (Aug 2019) +def call_cells(matrix: sparse.csr_matrix, expected_n_cells: int = 5000) -> np.ndarray: + """ + Determine likely true cells among the barcodes by contrasting with the ambient RNA profile + + Args: + matrix: expression matrix + expected_n_cells: expected number of true cells in the sample + + Returns: + calls: vector of bools indicating true cell barcodes + """ + n_barcodes = matrix.shape[1] + expected_n_cells = min(expected_n_cells, n_barcodes // 5) + total_umis = np.array(matrix.sum(axis=0))[0] # total UMIs per barcode + # upper limit of UMIs for barcodes considered ambient, calculated as greatest UMI count after removing twice the expected number of cells + max_ambient_umis = np.percentile(total_umis, 100 * (n_barcodes - expected_n_cells * 2) / n_barcodes) + # median number of UMIs among the top expected_n_cells barcodes + median_initial_umis = np.median(total_umis[total_umis > np.percentile(total_umis, 100 * (n_barcodes - expected_n_cells) / n_barcodes)]) + min_cell_umis = int(max(500, median_initial_umis * 0.1)) # 10% of median, but at least 500 UMIs + + # Ambient RNA beads, covering the range 20 to max_amient_umis + ambient_bcs = (total_umis < max_ambient_umis) & (total_umis > 20) + if ambient_bcs.sum() == 0: + # No beads were ambient, because cells had very low UMIs + logging.warning("No ambient RNA beads were found; maybe sample had too few cells?") + return max_ambient_umis, np.ones_like(total_umis) + try: + eval_features, ambient_profile_p = est_background_profile_sgt(matrix, ambient_bcs) + except SimpleGoodTuringError as e: + logging.error(e) + return max_ambient_umis, np.ones_like(total_umis) + + # Evaluate candidate barcodes + eval_bcs = total_umis > min_cell_umis + eval_mat = matrix[eval_features, :][:, eval_bcs] + + # Compute observed log-likelihood of barcodes being generated from ambient RNA + obs_loglk = eval_multinomial_loglikelihoods(eval_mat, ambient_profile_p) + + # Simulate log likelihoods + distinct_ns, sim_loglk = simulate_multinomial_loglikelihoods(ambient_profile_p, total_umis[eval_bcs], num_sims=1000, verbose=True) + + # Compute p-values + pvalues = compute_ambient_pvalues(total_umis[eval_bcs], obs_loglk, distinct_ns, sim_loglk) + pvalues_adj = adjust_pvalue_bh(pvalues) + pvalues_adj_all = np.ones_like(total_umis) + pvalues_adj_all[eval_bcs] = pvalues_adj + return max_ambient_umis, pvalues_adj_all diff --git a/loompy/color.py b/loompy/color.py deleted file mode 100644 index 61c8482..0000000 --- a/loompy/color.py +++ /dev/null @@ -1,367 +0,0 @@ -import colorsys -import json -import os -import random -import sys -from typing import * - -import matplotlib.colors as colors -import matplotlib.pyplot as plt -import numpy as np -from sklearn.preprocessing import LabelEncoder - -# See https://graphicdesign.stackexchange.com/questions/3682/where-can-i-find-a-large-palette-set-of-contrasting-colors-for-coloring-many-d - -# Adapted from https://github.com/kevinwuhoo/randomcolor-py - -# Copyright (c) 2016 Kevin Wu - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -_hues = { - "blue": { - "hue_range": [179, 257], - "lower_bounds": [ - [20, 100], - [30, 86], - [40, 80], - [50, 74], - [60, 60], - [70, 52], - [80, 44], - [90, 39], - [100, 35] - ] - }, - "green": { - "hue_range": [63, 178], - "lower_bounds": [ - [30, 100], - [40, 90], - [50, 85], - [60, 81], - [70, 74], - [80, 64], - [90, 50], - [100, 40] - ] - }, - "monochrome": { - "hue_range": [0, 0], - "lower_bounds": [ - [0, 0], - [100, 0] - ] - }, - "orange": { - "hue_range": [19, 46], - "lower_bounds": [ - [20, 100], - [30, 93], - [40, 88], - [50, 86], - [60, 85], - [70, 70], - [100, 70] - ] - }, - "pink": { - "hue_range": [283, 334], - "lower_bounds": [ - [20, 100], - [30, 90], - [40, 86], - [60, 84], - [80, 80], - [90, 75], - [100, 73] - ] - }, - "purple": { - "hue_range": [258, 282], - "lower_bounds": [ - [20, 100], - [30, 87], - [40, 79], - [50, 70], - [60, 65], - [70, 59], - [80, 52], - [90, 45], - [100, 42] - ] - }, - "red": { - "hue_range": [-26, 18], - "lower_bounds": [ - [20, 100], - [30, 92], - [40, 89], - [50, 85], - [60, 78], - [70, 70], - [80, 60], - [90, 55], - [100, 50] - ] - }, - "yellow": { - "hue_range": [47, 62], - "lower_bounds": [ - [25, 100], - [40, 94], - [50, 89], - [60, 86], - [70, 84], - [80, 82], - [90, 80], - [100, 75] - ] - } -} - - -class RandomColor: - def __init__(self, seed=None): - # Load color dictionary and populate the color dictionary - self.colormap = _hues - self.seed = seed if seed else random.randint(0, sys.maxsize) - self.random = random.Random(self.seed) - - for color_name, color_attrs in self.colormap.items(): - lower_bounds = color_attrs['lower_bounds'] - s_min = lower_bounds[0][0] - s_max = lower_bounds[len(lower_bounds) - 1][0] - - b_min = lower_bounds[len(lower_bounds) - 1][1] - b_max = lower_bounds[0][1] - - self.colormap[color_name]['saturation_range'] = [s_min, s_max] - self.colormap[color_name]['brightness_range'] = [b_min, b_max] - - def generate(self, hue=None, luminosity=None, count=1, format_='hex'): - colors = [] - for _ in range(count): - # First we pick a hue (H) - H = self.pick_hue(hue) - - # Then use H to determine saturation (S) - S = self.pick_saturation(H, hue, luminosity) - - # Then use S and H to determine brightness (B). 
- B = self.pick_brightness(H, S, luminosity) - - # Then we return the HSB color in the desired format - colors.append(self.set_format([H, S, B], format_)) - - return colors - - def pick_hue(self, hue): - hue_range = self.get_hue_range(hue) - hue = self.random_within(hue_range) - - # Instead of storing red as two seperate ranges, - # we group them, using negative numbers - if (hue < 0): - hue += 360 - - return hue - - def pick_saturation(self, hue, hue_name, luminosity): - - if luminosity == 'random': - return self.random_within([0, 100]) - - if hue_name == 'monochrome': - return 0 - - saturation_range = self.get_saturation_range(hue) - - s_min = saturation_range[0] - s_max = saturation_range[1] - - if luminosity == 'bright': - s_min = 55 - elif luminosity == 'dark': - s_min = s_max - 10 - elif luminosity == 'light': - s_max = 55 - - return self.random_within([s_min, s_max]) - - def pick_brightness(self, H, S, luminosity): - b_min = self.get_minimum_brightness(H, S) - b_max = 100 - - if luminosity == 'dark': - b_max = b_min + 20 - elif luminosity == 'light': - b_min = (b_max + b_min) / 2 - elif luminosity == 'random': - b_min = 0 - b_max = 100 - - return self.random_within([b_min, b_max]) - - def set_format(self, hsv, format_): - if 'hsv' in format_: - color = hsv - elif 'rgb' in format_: - color = self.hsv_to_rgb(hsv) - elif 'hex' in format_: - r, g, b = self.hsv_to_rgb(hsv) - return '#%02x%02x%02x' % (r, g, b) - else: - return "unrecognized format" - - if "Array" in format_ or format_ == 'hex': - return color - else: - prefix = format_[:3] - color_values = [str(x) for x in color] - return "%s(%s)" % (prefix, ', '.join(color_values)) - - def get_minimum_brightness(self, H, S): - lower_bounds = self.get_color_info(H)['lower_bounds'] - - for i in range(len(lower_bounds) - 1): - s1 = lower_bounds[i][0] - v1 = lower_bounds[i][1] - - s2 = lower_bounds[i + 1][0] - v2 = lower_bounds[i + 1][1] - - if s1 <= S <= s2: - m = (v2 - v1) / (s2 - s1) - b = v1 - m * s1 - - return m * S + b - - return 0 - - def get_hue_range(self, color_input): - if color_input and color_input.isdigit(): - number = int(color_input) - - if 0 < number < 360: - return [number, number] - - elif color_input and color_input in self.colormap: - color = self.colormap[color_input] - if 'hue_range' in color: - return color['hue_range'] - - else: - return [0, 360] - - def get_saturation_range(self, hue): - return self.get_color_info(hue)['saturation_range'] - - def get_color_info(self, hue): - # Maps red colors to make picking hue easier - if 334 <= hue <= 360: - hue -= 360 - - for color_name, color in self.colormap.items(): - if color['hue_range'] and color['hue_range'][0] <= hue <= color['hue_range'][1]: - return self.colormap[color_name] - - # this should probably raise an exception - return 'Color not found' - - def random_within(self, r): - return self.random.randint(int(r[0]), int(r[1])) - - @classmethod - def hsv_to_rgb(cls, hsv): - h, s, v = hsv - h = 1 if h == 0 else h - h = 359 if h == 360 else h - - h = float(h) / 360 - s = float(s) / 100 - v = float(v) / 100 - - rgb = colorsys.hsv_to_rgb(h, s, v) - return [int(c * 255) for c in rgb] - - -zviridis = colors.LinearSegmentedColormap.from_list("zviridis", [(0.9, 0.9, 0.9, 1)] + list(plt.cm.viridis(np.arange(1000) / 1000)), N=1001) - -_color_alphabet = [ - '#efa2fe', - '#0075db', - '#983f00', - '#4c005c', - '#005c31', - '#2bcd48', - '#fecb98', - '#808080', - '#93feb4', - '#8e7c00', - '#9ccb00', - '#c10087', - '#003380', - '#fea305', - '#fea7ba', - '#426600', - '#fe0010', - 
'#5ef0f1', - '#00988e', - '#dffe66', - '#740afe', - '#980000', - '#fefe80', - '#fefe00', - '#fe5005' -] - - -def cat_colors(N: int = 1, *, hue: str = None, luminosity: str = None, bgvalue: int = None, loop: bool = False, seed: str = "cat") -> Union[List[Any], colors.LinearSegmentedColormap]: - """ - Return a colormap suitable for N categorical values, optimized to be both aesthetically pleasing and perceptually distinct. - - Args: - N The number of colors requested. - hue Controls the hue of the generated color. You can pass a string representing a color name: "red", "orange", "yellow", "green", "blue", "purple", "pink" and "monochrome" are currently supported. If you pass a hexidecimal color string such as "#00FFFF", its hue value will be used to generate colors. - luminosity Controls the luminosity of the generated color: "bright", "light" or "dark". - bgvalue If not None, then the corresponding index color will be set to light gray - loop If True, loop the color alphabet instead of generating random colors - seed If not None, use as the random seed (default: "cat") - - Returns: - A set of colors in the requested format, either a list of values or a matplotlib LinearSegmentedColormap (when format="cmap") - If N <= 25 and hue and luminosity are both None, a subset of the optimally perceptually distinct "color alphabet" is returned. - Else, a pleasing set of random colors is returned. - - Colors are designed to be displayed on a white background. - """ - c: List[str] = [] - if N <= 25 and hue is None and luminosity is None: - c = _color_alphabet[:N] - elif not loop: - c = RandomColor(seed=seed).generate(count=N, hue=hue, luminosity=luminosity, format_="hex") - else: - n = N - while n > 0: - c += _color_alphabet[:n] - n -= 25 - if bgvalue is not None: - c[bgvalue] = "#aaaaaa" - return colors.LinearSegmentedColormap.from_list("", c, N) diff --git a/loompy/commands.py b/loompy/commands.py new file mode 100644 index 0000000..05a674f --- /dev/null +++ b/loompy/commands.py @@ -0,0 +1,34 @@ +import logging +import os +import sys +from typing import List + +import click + +from ._version import __version__ +from .bus_file import create_from_fastq + + +@click.group() +@click.option('--show-message/--hide-message', default=True) +@click.option('--verbosity', default="info", type=click.Choice(['error', 'warning', 'info', 'debug'])) +def cli(show_message: bool = True, verbosity: str = "info") -> None: + level = {"error": 40, "warning": 30, "info": 20, "debug": 10}[verbosity] + logging.basicConfig(stream=sys.stdout, format='%(asctime)s - %(levelname)s - %(message)s', level=level) + logging.captureWarnings(True) + + if show_message: + print(f"Loompy v{__version__} by Linnarsson Lab 🌸 (http://linnarssonlab.org & http://loompy.org)") + print() + + +@cli.command() +@click.argument('loomfile', required=True) +@click.argument('sampleid', required=True) +@click.argument('indexdir', required=True) +@click.argument('metadatafile', required=True) +@click.argument('fastqs', required=True, nargs=-1) +@click.option('--threads', default=os.cpu_count(), help="Number of threads to use") +def fromfq(loomfile: str, sampleid: str, indexdir: str, metadatafile: str, threads: int, fastqs: List[str]) -> None: + logging.info(f"Using {threads} threads.") + create_from_fastq(loomfile, sampleid, list(fastqs), indexdir, metadatafile, threads) diff --git a/loompy/file_attribute_manager.py b/loompy/file_attribute_manager.py deleted file mode 100644 index 323e905..0000000 --- a/loompy/file_attribute_manager.py +++ /dev/null @@ -1,86 
+0,0 @@ -from typing import * -import scipy.sparse as sparse -import numpy as np -import loompy -import warnings -with warnings.catch_warnings(): - warnings.simplefilter("ignore") - import h5py - - -class FileAttributeManager(object): - def __init__(self, f: h5py.File) -> None: - setattr(self, "!f", f) - storage: Dict[str, str] = {} - setattr(self, "!storage", storage) - for key, val in f.attrs.items(): - materialized = loompy.materialize_attr_values(val) - self.__dict__["storage"][key] = materialized - - def keys(self) -> List[str]: - return list(self.__dict__["storage"].keys()) - - def items(self) -> Iterable[Tuple[str, sparse.coo_matrix]]: - for key in self.keys(): - yield (key, self[key]) - - def __len__(self) -> int: - return len(self.keys()) - - def __contains__(self, name: str) -> bool: - return name in self.keys() - - def __iter__(self) -> Iterator[str]: - for key in self.keys(): - yield key - - def __getitem__(self, thing: Any) -> np.ndarray: - return self.__getattr__(thing) - - def __getattr__(self, name: str) -> np.ndarray: - try: - return self.__dict__["storage"][name] - except KeyError: - if self.f is not None: - if name in self.f.attrs: - val = self.f.attrs[name] - materialized = loompy.materialize_attr_values(val) - self.__dict__["storage"][name] = materialized - return materialized - raise AttributeError(f"'{type(self)}' object has no attribute '{name}'") - - def __setitem__(self, name: str, val: Any) -> None: - return self.__setattr__(name, val) - - def __setattr__(self, name: str, val: Any) -> None: - if name.startswith("!"): - super(FileAttributeManager, self).__setattr__(name[1:], val) - elif "/" in name: - raise KeyError("Attribute name cannot contain slash (/)") - else: - if self.f is not None: - normalized = loompy.normalize_attr_values(val) - self.f.attrs[name] = normalized - self.f.flush() - val = self.f.attrs[name] - # Read it back in to ensure it's synced and normalized - normalized = loompy.materialize_attr_values(val) - self.__dict__["storage"][name] = normalized - - def __delattr__(self, name: str) -> None: - if name.startswith("!"): - super(FileAttributeManager, self).__delattr__(name[1:]) - else: - if self.f is not None: - del self.f.attrs[name] - del self.__dict__["storage"][name] - - def get(self, name: str, default: Any = None) -> np.ndarray: - """ - Return the value for a named attribute if it exists, else default. - If default is not given, it defaults to None, so that this method never raises a KeyError. 
- """ - if name in self: - return self[name] - else: - return default diff --git a/loompy/global_attribute_manager.py b/loompy/global_attribute_manager.py new file mode 100644 index 0000000..1d703ee --- /dev/null +++ b/loompy/global_attribute_manager.py @@ -0,0 +1,118 @@ +from typing import Dict, List, Iterable, Tuple, Any, Iterator +import scipy.sparse as sparse +import numpy as np +import loompy +import warnings +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + import h5py +from .utils import compare_loom_spec_version + + +class GlobalAttributeManager(object): + def __init__(self, f: h5py.File) -> None: + setattr(self, "!f", f) + storage: Dict[str, str] = {} + setattr(self, "!storage", storage) + if "attrs" not in self.f: + for key, val in f.attrs.items(): + materialized = loompy.materialize_attr_values(val) + self.__dict__["storage"][key] = materialized + else: + for key, val in f["attrs"].items(): + materialized = loompy.materialize_attr_values(val[()]) + self.__dict__["storage"][key] = materialized + + def keys(self) -> List[str]: + return list(self.__dict__["storage"].keys()) + + def items(self) -> Iterable[Tuple[str, sparse.coo_matrix]]: + for key in self.keys(): + yield (key, self[key]) + + def __len__(self) -> int: + return len(self.keys()) + + def __contains__(self, name: str) -> bool: + return name in self.keys() + + def __iter__(self) -> Iterator[str]: + for key in self.keys(): + yield key + + def __getitem__(self, thing: Any) -> np.ndarray: + return self.__getattr__(thing) + + def __getattr__(self, name: str) -> np.ndarray: + try: + return self.__dict__["storage"][name] + except KeyError: + if self.f is not None: + if loompy.compare_loom_spec_version(self.f, "3.0.0") < 0: + if name in self.f.attrs: + val = self.f.attrs[name] + else: + raise AttributeError(f"File has no global attribute '{name}'") + else: + if name in self.f["attrs"]: + val = self.f["attrs"][name] + else: + raise AttributeError(f"File has no global attribute '{name}'") + materialized = loompy.materialize_attr_values(val) + self.__dict__["storage"][name] = materialized + return materialized + + def __setitem__(self, name: str, val: Any) -> None: + return self.__setattr__(name, val) + + def __setattr__(self, name: str, val: Any) -> None: + if name.startswith("!"): + super(GlobalAttributeManager, self).__setattr__(name[1:], val) + elif "/" in name: + raise KeyError("Attribute name cannot contain slash (/)") + else: + if self.f is not None: + if loompy.compare_loom_spec_version(self.f, "3.0.0") < 0 and "attrs" not in self.f["/"]: + normalized = loompy.normalize_attr_values(val, False) + self.f.attrs[name] = normalized + self.f.flush() + val = self.f.attrs[name] + # Read it back in to ensure it's synced and normalized + normalized = loompy.materialize_attr_values(val) + self.__dict__["storage"][name] = normalized + else: + normalized = loompy.normalize_attr_values(val, True) + if name in self.f["attrs"]: + del self.f["attrs"][name] + if not np.isscalar(normalized) and normalized.dtype == np.object_: + self.ds._file.create_dataset("attrs/" + name, data=normalized, dtype=h5py.string_dtype(encoding="utf-8")) + else: + self.f["attrs"][name] = normalized + self.f.flush() + val = self.f["attrs"][name][()] + # Read it back in to ensure it's synced and normalized + normalized = loompy.materialize_attr_values(val) + self.__dict__["storage"][name] = normalized + + def __delattr__(self, name: str) -> None: + if name.startswith("!"): + super(GlobalAttributeManager, self).__delattr__(name[1:]) + else: + if 
self.f is not None: + if loompy.compare_loom_spec_version(self.f, "3.0.0") < 0: + if name in self.f.attrs: + del self.f.attrs[name] + else: + if name in self.f["attrs"]: + del self.f["attrs"][name] + del self.__dict__["storage"][name] + + def get(self, name: str, default: Any = None) -> np.ndarray: + """ + Return the value for a named attribute if it exists, else default. + If default is not given, it defaults to None, so that this method never raises a KeyError. + """ + if name in self: + return self[name] + else: + return default diff --git a/loompy/layer_manager.py b/loompy/layer_manager.py index 5c45354..8327efc 100644 --- a/loompy/layer_manager.py +++ b/loompy/layer_manager.py @@ -146,7 +146,7 @@ def __setattr__(self, name: str, val: np.ndarray) -> None: # Fill the matrix with sparse data if sparse.issparse(val): - m = val.tocsr() + m = val.tocsc() window = 6400 ix = 0 while ix < val.shape[1]: diff --git a/loompy/loom_validator.py b/loompy/loom_validator.py index ea6ce29..9d9cfa4 100644 --- a/loompy/loom_validator.py +++ b/loompy/loom_validator.py @@ -6,10 +6,10 @@ class LoomValidator: - def __init__(self, version: str = "2.0.1") -> None: + def __init__(self, version: str = "3.0.0") -> None: """ Args: - version: The Loom file format version to validate against ("2.0.1" or "old") + version: The Loom file format version to validate against ("3.0.0", "2.0.1", "old") Remarks: "old" version will accept files that lack the "row_graphs" and "col_graphs" groups diff --git a/loompy/loom_view.py b/loompy/loom_view.py index e1ec081..905e1ae 100644 --- a/loompy/loom_view.py +++ b/loompy/loom_view.py @@ -7,7 +7,7 @@ class LoomView: """ An in-memory loom dataset """ - def __init__(self, layers: loompy.LayerManager, row_attrs: loompy.AttributeManager, col_attrs: loompy.AttributeManager, row_graphs: loompy.GraphManager, col_graphs: loompy.GraphManager, *, filename: str = "", file_attrs: loompy.FileAttributeManager = None) -> None: + def __init__(self, layers: loompy.LayerManager, row_attrs: loompy.AttributeManager, col_attrs: loompy.AttributeManager, row_graphs: loompy.GraphManager, col_graphs: loompy.GraphManager, *, filename: str = "", file_attrs: loompy.GlobalAttributeManager = None) -> None: self.filename = filename self.view = loompy.ViewManager(self) self.layers = layers diff --git a/loompy/loompy.py b/loompy/loompy.py index 81ad760..a7a21f4 100755 --- a/loompy/loompy.py +++ b/loompy/loompy.py @@ -25,9 +25,8 @@ import gzip import logging import os.path -import time from shutil import copyfile -from typing import * +from typing import Tuple, Union, Any, Dict, List, Iterable, Callable, Optional import h5py import numpy as np @@ -92,7 +91,7 @@ def __init__(self, filename: str, mode: str = 'r+', *, validate: bool = True, sp self.view = loompy.ViewManager(self) #: Create a view of the file by slicing this attribute, like ``ds.view[:100, :100]`` self.ra = loompy.AttributeManager(self, axis=0) #: Row attributes, dict-like with np.ndarray values self.ca = loompy.AttributeManager(self, axis=1) #: Column attributes, dict-like with np.ndarray values - self.attrs = loompy.FileAttributeManager(self._file) #: Global attributes + self.attrs = loompy.GlobalAttributeManager(self._file) #: Global attributes self.row_graphs = loompy.GraphManager(self, axis=0) #: Row graphs, dict-like with values that are :class:`scipy.sparse.coo_matrix` objects self.col_graphs = loompy.GraphManager(self, axis=1) #: Column graphs, dict-like with values that are :class:`scipy.sparse.coo_matrix` objects @@ -467,7 +466,7 @@ def 
 			ca = {key: v[selection] for key, v in other.col_attrs.items()}
 			if ordering is not None:
 				vals = {key: val[ordering, :] for key, val in vals.items()}
-			self.add_columns(vals, ca, fill_values=fill_values)
+			self.add_columns(vals, ca, fill_values=fill_values)  # type: ignore
 
 		if include_graphs:
 			for gname in self.col_graphs.keys():
@@ -614,7 +613,7 @@ def scan(self, *, items: np.ndarray = None, axis: int = None, layers: Iterable =
 					temp = temp[ordering, :]
 					if selection is not None:
 						temp = temp[:, selection]
-					vals[layer] = loompy.MemoryLoomLayer(layer, temp) 
+					vals[layer] = loompy.MemoryLoomLayer(layer, temp)
 				lm = loompy.LayerManager(None)
 				for key, layer in vals.items():
 					lm[key] = loompy.MemoryLoomLayer(key, layer)
@@ -659,7 +658,7 @@ def scan(self, *, items: np.ndarray = None, axis: int = None, layers: Iterable =
 					ix += rows_per_chunk
 					continue
 
-				if selection.shape[0] ==rows_per_chunk:
+				if selection.shape[0] == rows_per_chunk:
 					selection = None  # Meaning, select all rows
 
 				# Load the whole chunk from the file, then extract genes and cells using fancy indexing
@@ -722,7 +721,6 @@ def batch_scan(self, cells: np.ndarray = None, genes: np.ndarray = None, axis: i
 					ix += cols_per_chunk
 					continue
 
-
 				# Load the whole chunk from the file, then extract genes and cells using fancy indexing
 				vals = self.layers[layer][:, ix:ix + cols_per_chunk]
 				vals = vals[genes, :]
@@ -851,7 +849,6 @@ def permute(self, ordering: np.ndarray, axis: int) -> None:
 			self.col_attrs._permute(ordering)
 			self.col_graphs._permute(ordering)
 
-
 	def aggregate(self, out_file: str = None, select: np.ndarray = None, group_by: Union[str, np.ndarray] = "Clusters", aggr_by: str = "mean", aggr_ca_by: Dict[str, str] = None) -> np.ndarray:
 		"""
 		Aggregate the Loom file by applying aggregation functions to the main matrix as well as to the column attributes
@@ -984,6 +981,7 @@ def new(filename: str, *, file_attrs: Optional[Dict[str, str]] = None) -> LoomCo
 	# Create the file (empty).
 	# Yes, this might cause an exception, which we prefer to send to the caller
 	f = h5py.File(name=filename, mode='w')
+	f.create_group('/attrs')  # v3.0.0
 	f.create_group('/layers')
 	f.create_group('/row_attrs')
 	f.create_group('/col_attrs')
@@ -994,9 +992,11 @@ def new(filename: str, *, file_attrs: Optional[Dict[str, str]] = None) -> LoomCo
 	ds = connect(filename, validate=False)
 	for vals in file_attrs:
-		ds.attrs[vals] = file_attrs[vals]
+		if file_attrs[vals] is None:
+			ds.attrs[vals] = "None"
+		else:
+			ds.attrs[vals] = file_attrs[vals]
 
 	# store creation date
-	currentTime = time.localtime(time.time())
 	ds.attrs['CreationDate'] = timestamp()
 	ds.attrs["LOOM_SPEC_VERSION"] = loompy.loom_spec_version
 	return ds
@@ -1047,15 +1047,18 @@ def create(filename: str, layers: Union[np.ndarray, Dict[str, np.ndarray], loomp
 		if layer.shape != shape:  # type: ignore
 			raise ValueError(f"Layer '{name}' is not the same shape as the main matrix")
 	for name, ra in row_attrs.items():
-		if ra.shape[0] != shape[0]:
-			raise ValueError(f"Row attribute '{name}' is not the same length ({ra.shape[0]}) as number of rows in main matrix ({shape[0]})")
+		if len(ra) != shape[0]:
+			raise ValueError(f"Row attribute '{name}' is not the same length ({len(ra)}) as number of rows in main matrix ({shape[0]})")
 	for name, ca in col_attrs.items():
-		if ca.shape[0] != shape[1]:
-			raise ValueError(f"Column attribute '{name}' is not the same length ({ca.shape[0]}) as number of columns in main matrix ({shape[1]})")
+		if len(ca) != shape[1]:
+			raise ValueError(f"Column attribute '{name}' is not the same length ({len(ca)}) as number of columns in main matrix ({shape[1]})")
 
 	try:
 		with new(filename, file_attrs=file_attrs) as ds:
+			ds.layer[""] = layers[""]
 			for key, vals in layers.items():
+				if key == "":
+					continue
 				ds.layer[key] = vals
 
 			for key, vals in row_attrs.items():
@@ -1065,7 +1068,6 @@ def create(filename: str, layers: Union[np.ndarray, Dict[str, np.ndarray], loomp
 			ds.ca[key] = vals
 
 	except ValueError as ve:
-		#ds.close(suppress_warning=True)  # ds does not exist here
 		if os.path.exists(filename):
 			os.remove(filename)
 		raise ve
@@ -1094,16 +1096,16 @@ def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -
 		if genome is None:
 			genome = [f for f in os.listdir(matrix_folder) if not f.startswith(".")][0]
 		matrix_folder = os.path.join(matrix_folder, genome)
-		matrix = mmread(os.path.join(matrix_folder, "matrix.mtx")).astype("float32").todense()
+		matrix = mmread(os.path.join(matrix_folder, "matrix.mtx")).todense()
 		genelines = open(os.path.join(matrix_folder, "genes.tsv"), "r").readlines()
 		bclines = open(os.path.join(matrix_folder, "barcodes.tsv"), "r").readlines()
-	else: # cellranger V3 file locations
+	else:  # cellranger V3 file locations
 		if genome is None:
-			genome = "" # Genome is not visible from V3 folder
+			genome = ""  # Genome is not visible from V3 folder
 		matrix_folder = os.path.join(indir, 'outs', 'filtered_feature_bc_matrix')
-		matrix = mmread(os.path.join(matrix_folder, "matrix.mtx.gz")).astype("float32").todense()
-		genelines = [ l.decode() for l in gzip.open(os.path.join(matrix_folder, "features.tsv.gz"), "r").readlines() ]
-		bclines = [ l.decode() for l in gzip.open(os.path.join(matrix_folder, "barcodes.tsv.gz"), "r").readlines() ]
+		matrix = mmread(os.path.join(matrix_folder, "matrix.mtx.gz")).todense()
+		genelines = [l.decode() for l in gzip.open(os.path.join(matrix_folder, "features.tsv.gz"), "r").readlines()]
+		bclines = [l.decode() for l in gzip.open(os.path.join(matrix_folder, "barcodes.tsv.gz"), "r").readlines()]
 
 	accession = np.array([x.split("\t")[0] for x in genelines]).astype("str")
 	gene = np.array([x.split("\t")[1].strip() for x in genelines]).astype("str")
@@ -1131,6 +1133,90 @@ def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -
 	return path
+
+
+def create_from_matrix_market(out_file: str, sample_id: str, layer_paths: Dict[str, str], row_metadata_path: str, column_metadata_path: str, delim: str = "\t", skip_row_headers: bool = False, skip_column_headers: bool = False, file_attrs: Dict[str, str] = None, matrix_transposed: bool = False) -> None:
+	"""
+	Create a .loom file from .mtx matrix market format
+
+	Args:
+		out_file: path to the newly created .loom file (will be overwritten if it exists)
+		sample_id: string to use as prefix for cell IDs
+		layer_paths: dict mapping layer names to paths to the corresponding matrix file (usually with .mtx extension)
+		row_metadata_path: path to the row (usually genes) metadata file
+		column_metadata_path: path to the column (usually cells) metadata file
+		delim: delimiter used for metadata (default: "\t")
+		skip_row_headers: if true, skip first line in rows metadata file
+		skip_column_headers: if true, skip first line in columns metadata file
+		file_attrs: dict of global file attributes, or None
+		matrix_transposed: if true, the main matrix is transposed
+
+	Remarks:
+		layer_paths should typically map the empty string to a matrix market file: {"": "path/to/filename.mtx"}.
+		To create a multilayer loom file, map multiple named layers: {"": "path/to/layer1.mtx", "layer2": "path/to/layer2.mtx"}.
+		Note: the created file MUST have a main layer named "". If no such layer is given, BUT all given layers are the same
+		datatype, then a main layer will be created as the sum of the other layers. For example, {"spliced": "spliced.mtx", "unspliced": "unspliced.mtx"}
+		will create three layers, "", "spliced", and "unspliced", where "" is the sum of the other two.
+	"""
+	layers: Dict[str, Union[np.ndarray, scipy.sparse.coo_matrix]] = {}
+
+	for name, path in layer_paths.items():
+		matrix = mmread(path)
+		if matrix_transposed:
+			matrix = matrix.T
+		layers[name] = matrix
+	if "" not in layers:
+		main_matrix = None
+		for name, matrix in layers.items():
+			if main_matrix is None:
+				main_matrix = matrix.copy()
+			else:
+				main_matrix = main_matrix + matrix
+		layers[""] = main_matrix
+
+	genelines = open(row_metadata_path, "r").readlines()
+	bclines = open(column_metadata_path, "r").readlines()
+	if skip_row_headers:
+		genelines = genelines[1:]
+	if skip_column_headers:
+		bclines = bclines[1:]
+
+	accession = np.array([x.split(delim)[0] for x in genelines]).astype("str")
+	if len(genelines[0].split(delim)) > 1:
+		gene = np.array([x.split(delim)[1].strip() for x in genelines]).astype("str")
+		row_attrs = {"Accession": accession, "Gene": gene}
+	else:
+		row_attrs = {"Accession": accession}
+
+	cellids = np.array([sample_id + ":" + x.strip() for x in bclines]).astype("str")
+	col_attrs = {"CellID": cellids}
+
+	create(out_file, layers[""], row_attrs, col_attrs, file_attrs=file_attrs)
+
+	if len(layers) > 1:
+		with loompy.connect(out_file) as ds:
+			for name, layer in layers.items():
+				if name == "":
+					continue
+				ds[name] = layer
+
+
+def create_from_kallistobus(out_file: str, in_dir: str, tr2g_file: str, whitelist_file: str, file_attrs: Dict[str, str] = None, layers: Dict[str, str] = None) -> None:
+	"""
+	Create a loom file from a kallisto-bus output folder.
+
+	Args:
+		out_file			Full path to the loom file to be created
+		in_dir				Full path to the kallisto-bus directory (containing output.bus, matrix.ec and transcripts.txt)
+		tr2g_file			Full path to the transcripts-to-genes mapping file
+		whitelist_file		Full path to the barcode whitelist file (e.g. 10xv2_whitelist.txt)
+		file_attrs			Optional dictionary of global attributes
+		layers				Dict of {layer_name: capture_file_path} to define extra layers
+	"""
+
+	raise NotImplementedError("create_from_kallistobus() is not yet implemented")
+
+	# def import(file: str, key: str)
+	# def demote()
+
 
 def combine(files: List[str], output_file: str, key: str = None, file_attrs: Dict[str, str] = None, batch_size: int = 1000, convert_attrs: bool = False) -> None:
 	"""
 	Combine two or more loom files and save as a new loom file
diff --git a/loompy/normalize.py b/loompy/normalize.py
index ed4a938..4b9c0cc 100644
--- a/loompy/normalize.py
+++ b/loompy/normalize.py
@@ -47,7 +47,7 @@ def normalize_attr_array(a: Any) -> np.ndarray:
 		raise ValueError("Argument must be a list, tuple, numpy matrix, numpy ndarray or sparse matrix.")
 
 
-def normalize_attr_values(a: Any) -> np.ndarray:
+def normalize_attr_values(a: Any, use_object_strings: bool = False) -> np.ndarray:
 	"""
 	Take all kinds of input values and validate/normalize them.
 
@@ -70,7 +70,10 @@ def normalize_attr_values(a: Any) -> np.ndarray:
 	if np.issubdtype(arr.dtype, np.integer) or np.issubdtype(arr.dtype, np.floating):
 		pass  # We allow all these types
 	elif np.issubdtype(arr.dtype, np.character) or np.issubdtype(arr.dtype, np.object_):
-		arr = normalize_attr_strings(arr)
+		if use_object_strings:
+			arr = np.array([str(elm) for elm in a], dtype=object)
+		else:
+			arr = normalize_attr_strings(arr)
 	elif np.issubdtype(arr.dtype, np.bool_):
 		arr = arr.astype('ubyte')
 	if scalar:
diff --git a/loompy/utils.py b/loompy/utils.py
index 2f31ed3..9d3df9f 100644
--- a/loompy/utils.py
+++ b/loompy/utils.py
@@ -1,7 +1,8 @@
 import logging
-from typing import *
 from inspect import currentframe, getouterframes
 import datetime
+from h5py import File as HDF5File
+from .normalize import materialize_attr_values
 
 
 def deprecated(message: str) -> None:
@@ -12,3 +13,21 @@ def deprecated(message: str) -> None:
 
 def timestamp() -> str:
	return datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
+
+
+def get_loom_spec_version(f: HDF5File) -> str:
+	if "attrs" in f and "LOOM_SPEC_VERSION" in f["/attrs"]:
+		return materialize_attr_values(f["/attrs"]["LOOM_SPEC_VERSION"][()])
+	if "LOOM_SPEC_VERSION" in f.attrs:
+		return materialize_attr_values(f.attrs["LOOM_SPEC_VERSION"])
+	return "0.0.0"
+
+
+def compare_loom_spec_version(f: HDF5File, v: str) -> int:
+	vf = int("".join(get_loom_spec_version(f).split(".")))  # assumes single-digit version components, e.g. "3.0.0" -> 300
+	vc = int("".join(v.split(".")))
+	if vf == vc:
+		return 0
+	if vf > vc:
+		return 1
+	return -1
diff --git a/setup.cfg b/setup.cfg
index 09a2b36..1be5f4c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,13 +1,9 @@
 [pycodestyle]
-ignore = W191, W293, E501, E128
+ignore = W191, W293, W0312, W0401, E501, E701, E117
 max-line-length = 160
-
 [pep8]
-ignore = W191, W293, E501, E128
+ignore = W191, W293, W0312, W0401, E501, E701, E117
 max-line-length = 160
-
-[egg_info]
-tag_build = 
-tag_date = 0
-tag_svn_revision = 0
-
+[flake8]
+ignore = W191, W293, W0312, W0401, E501, E701, E117, E251
+max-line-length = 160
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 6150ad5..076d87e 100644
--- a/setup.py
+++ b/setup.py
@@ -2,11 +2,7 @@
 
 # First update the version in loompy/_version.py, then:
 
-# cd ~/code (the directory where loompy resides)
-# cp -R loompy loompy-3.5
-# cd loompy-3.5/loompy
-# for f in *.py; do py-backwards -i $f -o $f -t 3.5; done
-# cd ..
+# cd ~/code/loompy (the directory where loompy resides)
 # rm -r dist (otherwise twine will upload the oldest build!)
 # python setup.py sdist
 # twine upload dist/*
@@ -22,8 +18,11 @@
 	version=__version__,
 	packages=find_packages(),
 	python_requires='>=3.6',
-	install_requires=['h5py', 'numpy', 'scipy', 'setuptools', 'pandas'],
-	extras_require=dict(colors=['matplotlib']),
+	install_requires=['h5py', 'numpy', 'scipy', 'setuptools', 'numba', 'click'],
+	entry_points='''
+		[console_scripts]
+		loompy=loompy.commands:cli
+	''',
 	# metadata for upload to PyPI
 	author="Linnarsson Lab",
 	author_email="sten.linnarsson@ki.se",
diff --git a/test.loom b/test.loom
new file mode 100644
index 0000000..20d4eb6
Binary files /dev/null and b/test.loom differ
diff --git a/tests/debug.py b/tests/debug.py
new file mode 100644
index 0000000..3b2572c
--- /dev/null
+++ b/tests/debug.py
@@ -0,0 +1,29 @@
+import sys
+import logging
+import loompy
+
+# Create logger
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+# Create STDERR handler
+handler = logging.StreamHandler(sys.stderr)
+# handler.setLevel(logging.DEBUG)
+
+# Create formatter and add it to the handler
+formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
+handler.setFormatter(formatter)
+
+# Set STDERR handler as the only handler
+logger.handlers = [handler]
+
+d = "/Users/stelin/kallisto_GRCh38/"
+
+fastqs = [
+	d + "HLFGJBCXY/10X_17_029_S2_L002_R1_001.fastq.gz",
+	d + "HLFGJBCXY/10X_17_029_S2_L002_R2_001.fastq.gz"
+	# d + "HL73JBCXY/10X_17_029_S2_L002_R1_001.fastq.gz",
+	# d + "HL73JBCXY/10X_17_029_S2_L002_R2_001.fastq.gz"
+]
+# loompy.create_from_fastq(d + "10X_17_029.loom", "10X_17_029", fastqs, d + "human_GRCh38_gencode.v31", d + "samples.tab", n_threads=6)
+loompy.create_from_fastq(d + "10X_17_029.loom", "10X_17_029", fastqs, d + "human_GRCh38_gencode.v31", "/Users/stelin/sqlite3_chromium.db", n_threads=6)
diff --git a/tests/test_file_attribute_manager.py b/tests/test_file_attribute_manager.py
index fd36206..b5e5fd8 100644
--- a/tests/test_file_attribute_manager.py
+++ b/tests/test_file_attribute_manager.py
@@ -5,10 +5,10 @@
 import h5py
 import numpy as np
 
-from loompy import FileAttributeManager
+from loompy import GlobalAttributeManager
 
 
-class FileAttributeManagerTests(TestCase):
+class GlobalAttributeManagerTests(TestCase):
 	VALUE_IN_FILE = np.arange(3)
 
 	def setUp(self):
@@ -23,7 +23,7 @@ def tearDown(self):
 		os.remove(self.filename)
 
 	def test_get(self):
-		m = FileAttributeManager(self.file)
+		m = GlobalAttributeManager(self.file)
 		default = np.arange(2, 5)
 		val = m.get("arr")
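
A minimal usage sketch of the reworked global attributes API introduced in this patch; the file path and attribute names are hypothetical:

import loompy

with loompy.connect("mydata.loom") as ds:  # hypothetical file
	ds.attrs["Title"] = "My dataset"  # stored as a dataset under /attrs in a v3.0.0 file
	print(ds.attrs.LOOM_SPEC_VERSION)  # attribute-style access
	print(ds.attrs["Title"])  # dict-style access
	print(ds.attrs.get("Missing", "n/a"))  # get() with a default never raises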
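
The version helpers in utils.py order versions by joining the dotted components into an integer ("3.0.0" becomes 300; a file with no recorded version reads as "0.0.0"), which assumes single-digit components. A sketch of the reader-side branching this enables, with a hypothetical file path:

import h5py
from loompy import compare_loom_spec_version

with h5py.File("mydata.loom", "r") as f:  # hypothetical file
	if compare_loom_spec_version(f, "3.0.0") >= 0:
		print("v3 file: global attributes live under /attrs")
	else:
		print("legacy file: global attributes are HDF5 attrs on the root group")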
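
And a sketch of calling the new create_from_matrix_market entry point; all paths are hypothetical, and the two-layer form exercises the layer-summing behaviour described in its docstring:

import loompy

loompy.create_from_matrix_market(
	"sample.loom",  # out_file (hypothetical)
	"sample1",  # sample_id, used as a prefix for each CellID
	{"spliced": "spliced.mtx", "unspliced": "unspliced.mtx"},  # main layer "" is created as their sum
	"genes.tsv",  # row metadata: accession, optionally followed by gene symbol
	"barcodes.tsv",  # column metadata: one barcode per line
)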