Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Builder paradigm for constructing CellArrDataset #60

Draft
wants to merge 34 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
4ea4677
EOD
jkanche Nov 22, 2024
8be9e3e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 22, 2024
298f4d1
Merge branch 'master' into refactor-layers
jkanche Nov 22, 2024
7f72009
Merge branch 'refactor-layers' of https://github.com/BiocPy/cellarr i…
jkanche Nov 22, 2024
acbc55f
EOD
jkanche Nov 23, 2024
88e1146
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 23, 2024
8e22118
there's many changes to support building the cellarr collection and q…
jkanche Nov 25, 2024
0ba6f4f
Merge branch 'refactor-layers' of https://github.com/BiocPy/cellarr i…
jkanche Nov 25, 2024
09b41ca
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 25, 2024
d66a593
reset sample index
jkanche Nov 25, 2024
89b4358
does the pool need to return?
jkanche Nov 26, 2024
fb67352
is fork the problem?
jkanche Nov 26, 2024
b9b46b5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
6548d59
add checks with threads
jkanche Nov 26, 2024
6f84688
run autoencoder tests only on github action
jkanche Nov 26, 2024
03eb747
fix docstring typos
jkanche Nov 26, 2024
7014544
update assets
jkanche Nov 26, 2024
6835ad1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
08d94a3
update README
jkanche Nov 26, 2024
fb0f2f2
update docstrings throughout
jkanche Nov 26, 2024
fa6c3d1
filter dataframes with tiledb query expressions
jkanche Nov 26, 2024
bf71cd5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
62365ff
fix dataloader when filtering query conditions
jkanche Nov 26, 2024
60c93b8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
0db584c
back to using remap
jkanche Nov 26, 2024
b28f037
get all cells for a sample
jkanche Nov 26, 2024
eef1a41
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2024
0577a8d
minor changes to README
jkanche Nov 27, 2024
f7cd450
separate assay group
jkanche Nov 27, 2024
d21f5a3
add caching and with clause support
jkanche Nov 27, 2024
3087a58
Thinking of a builder pattern for constructing CellArrDataset
jkanche Nov 27, 2024
eab1207
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 27, 2024
3ce5b71
Merge branch 'master' into builder-paradigm
jkanche Nov 28, 2024
e72b889
Merge branch 'builder-paradigm' of https://github.com/BiocPy/cellarr …
jkanche Nov 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions src/cellarr/builder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
For simple builds,

```python
# Simple usage
builder = CellArrDatasetBuilder("path/to/output")
builder.add_data(adata1, "sample1")
builder.add_data(adata2, "sample2")
dataset = builder.build()
```

For more control over the build, pass a `BuilderConfig`:

```python
# With more options
config = BuilderConfig(
    output_path="path/to/output",
    matrix_name="normalized_counts",
    matrix_dtype=np.float32,
    num_threads=4,
)

builder = CellArrDatasetBuilder(config)

# Add data with metadata
builder.add_data(
    adata1,
    "sample1",
    sample_metadata={
        "condition": "treatment",
        "batch": "1",
    },
    cell_metadata=cell_meta_df1,
)

# Set gene metadata
builder.set_gene_metadata(gene_annotations_df)

# Build the dataset
dataset = builder.build()
```
Empty file added src/cellarr/builder/__init__.py
Empty file.
186 changes: 186 additions & 0 deletions src/cellarr/builder/cellarr_build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union

import anndata
import numpy as np
import pandas as pd

from ..build_options import (
CellMetadataOptions,
GeneAnnotationOptions,
MatrixOptions,
SampleMetadataOptions,
)
from ..CellArrDataset import CellArrDataset
from . import build_cellarrdataset


@dataclass
class BuilderConfig:
    """Flat bundle of settings consumed by ``CellArrDatasetBuilder``.

    Only ``output_path`` is required; every other field has a default.

    Attributes:
        output_path:
            Directory the dataset is written to.

        optimize_tiledb:
            Forwarded to the build step as ``optimize_tiledb``.

        num_threads:
            Forwarded to the build step as ``num_threads``.

        matrix_name:
            Name given to the matrix options (``matrix_name``).

        matrix_dtype:
            NumPy type given to the matrix options (``dtype``).

        feature_column:
            Column name given to the gene annotation options
            (``feature_column``).

        gene_id_type:
            NumPy type given to the gene annotation options (``dtype``).

        cell_id_type:
            NumPy type given to the cell metadata options (``dtype``).

        sample_id_type:
            NumPy type given to the sample metadata options (``dtype``).
    """

    # --- general ---------------------------------------------------------
    output_path: str
    optimize_tiledb: bool = True
    num_threads: int = 1

    # --- matrix ----------------------------------------------------------
    matrix_name: str = "counts"
    matrix_dtype: np.dtype = np.float32

    # --- gene annotation -------------------------------------------------
    feature_column: str = "index"
    gene_id_type: np.dtype = np.uint32

    # --- cell / sample metadata -------------------------------------------
    cell_id_type: np.dtype = np.uint32
    sample_id_type: np.dtype = np.uint32


class CellArrDatasetBuilder:
    """A builder class to simplify creating CellArrDatasets.

    Samples are registered one at a time with :py:meth:`add_data`, gene
    metadata is optionally attached with :py:meth:`set_gene_metadata`, and
    :py:meth:`build` assembles everything into a single call to
    ``build_cellarrdataset``.

    Example:
        >>> builder = CellArrDatasetBuilder(
        ...     "path/to/output"
        ... )
        >>> builder.add_data(
        ...     adata1,
        ...     "sample1",
        ... )
        >>> builder.add_data(
        ...     adata2,
        ...     "sample2",
        ... )
        >>> dataset = (
        ...     builder.build()
        ... )
    """

    def __init__(self, config: Union[BuilderConfig, dict, str, Path]):
        """Initialize the builder with configuration.

        The output directory is created (including parents) if it does not
        exist yet.

        Args:
            config:
                Either a BuilderConfig object, a dict with config parameters,
                or a path (``str`` or ``pathlib.Path``) to the output
                directory (simplest case).
        """
        if isinstance(config, (str, Path)):
            # Normalise Path to str so the stored config matches the
            # ``output_path: str`` field declared on BuilderConfig.
            self.config = BuilderConfig(output_path=str(config))
        elif isinstance(config, dict):
            self.config = BuilderConfig(**config)
        else:
            self.config = config

        # (data, sample_name) tuples, in the order they were added.
        self.data_objects = []
        # sample_name -> dict of sample-level metadata.
        self.sample_metadata = {}
        # sample_name -> DataFrame of cell-level metadata.
        self.cell_metadata = {}
        self.gene_metadata = None

        Path(self.config.output_path).mkdir(parents=True, exist_ok=True)

    def add_data(
        self,
        data: Union[str, anndata.AnnData],
        sample_name: str,
        sample_metadata: Optional[Dict] = None,
        cell_metadata: Optional[pd.DataFrame] = None,
    ) -> "CellArrDatasetBuilder":
        """Add a data object to the dataset.

        Args:
            data:
                AnnData object or path to h5ad file.

            sample_name:
                Name for this sample. Must be unique within the builder.

            sample_metadata:
                Optional dictionary of metadata for this sample.

            cell_metadata:
                Optional DataFrame with cell metadata.

        Returns:
            self for method chaining.

        Raises:
            ValueError:
                If ``sample_name`` has already been added. Metadata is keyed
                by sample name, so a duplicate would silently overwrite the
                earlier entry while still adding a second file.
        """
        if any(existing == sample_name for _, existing in self.data_objects):
            raise ValueError(f"Sample '{sample_name}' has already been added to the builder")

        self.data_objects.append((data, sample_name))

        if sample_metadata:
            # Copy so later mutation of the caller's dict cannot change what
            # gets built.
            self.sample_metadata[sample_name] = dict(sample_metadata)

        if cell_metadata is not None:
            self.cell_metadata[sample_name] = cell_metadata

        return self

    def set_gene_metadata(self, gene_metadata: Union[pd.DataFrame, List[str]]) -> "CellArrDatasetBuilder":
        """Set gene/feature metadata or list.

        Args:
            gene_metadata:
                DataFrame with gene annotations or list of gene IDs.

        Returns:
            self for method chaining.
        """
        self.gene_metadata = gene_metadata
        return self

    def build(self) -> CellArrDataset:
        """Build and return the CellArrDataset.

        Returns:
            The object returned by ``build_cellarrdataset``.

        Raises:
            ValueError:
                If no data has been added.
        """
        if not self.data_objects:
            raise ValueError("No data objects have been added to build")

        # Sample names in the same order as the ``files`` list passed below.
        sample_names = [name for _, name in self.data_objects]

        # Prepare options
        matrix_options = MatrixOptions(matrix_name=self.config.matrix_name, dtype=self.config.matrix_dtype)

        gene_options = GeneAnnotationOptions(feature_column=self.config.feature_column, dtype=self.config.gene_id_type)

        cell_options = CellMetadataOptions(dtype=self.config.cell_id_type)

        sample_options = SampleMetadataOptions(dtype=self.config.sample_id_type)

        # Prepare sample metadata. Reindex over *all* registered samples so
        # the frame has exactly one row per file, in file order; samples added
        # without metadata get a row of missing values instead of being
        # dropped.
        if self.sample_metadata:
            sample_df = (
                pd.DataFrame.from_dict(self.sample_metadata, orient="index")
                .reindex(sample_names)
                .reset_index(names=["cellarr_sample"])
            )
        else:
            # NOTE(review): the user-supplied sample names are not forwarded
            # in this case; confirm whether build_cellarrdataset should
            # receive them.
            sample_df = None

        # Prepare cell metadata, concatenated in file order.
        # NOTE(review): samples added without cell metadata contribute no
        # rows here; confirm whether the build step requires one row per cell
        # across all files.
        if self.cell_metadata:
            cell_df = pd.concat(
                [
                    self.cell_metadata[name].assign(cellarr_sample=name)
                    for name in sample_names
                    if name in self.cell_metadata
                ],
                ignore_index=True,
            )
        else:
            cell_df = None

        # Build dataset
        # NOTE(review): this name comes from ``from . import
        # build_cellarrdataset`` at the top of the file; confirm it resolves
        # to the callable and not to a module.
        dataset = build_cellarrdataset(
            files=[obj for obj, _ in self.data_objects],
            output_path=self.config.output_path,
            gene_annotation=self.gene_metadata,
            sample_metadata=sample_df,
            cell_metadata=cell_df,
            matrix_options=matrix_options,
            gene_annotation_options=gene_options,
            cell_metadata_options=cell_options,
            sample_metadata_options=sample_options,
            optimize_tiledb=self.config.optimize_tiledb,
            num_threads=self.config.num_threads,
        )

        return dataset