Skip to content

Commit

Permalink
Support 1-d outputs for agg functions (#15)
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche authored Jan 29, 2025
1 parent 0e16c4c commit ebbb9ab
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ on:
jobs:
build:
runs-on: ubuntu-latest
permissions:
id-token: write
repository-projects: write
contents: write
pages: write

steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -45,8 +50,6 @@ jobs:
run: |
python -m tox -e clean,build
- name: Publish package
uses: pypa/[email protected]
with:
user: __token__
password: ${{ secrets.PYPI_PASSWORD }}
# This uses the trusted publisher workflow so no token is required.
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ jobs:
- name: Test with tox
run: |
tox
tox
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
[![Twitter](https://img.shields.io/twitter/url/http/shields.io.svg?style=social&label=Twitter)](https://twitter.com/GenomicArrays)
-->

[![Project generated with PyScaffold](https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold)](https://pyscaffold.org/)
[![PyPI-Server](https://img.shields.io/pypi/v/GenomicArrays.svg)](https://pypi.org/project/GenomicArrays/)
![Unit tests](https://github.com/CellArr/GenomicArrays/actions/workflows/run-tests.yml/badge.svg)

# Genomic Arrays based on TileDB

Expand Down Expand Up @@ -80,8 +81,16 @@ dataset = garr.build_genomicarray(
)
```

The build process stores missing intervals from a bigwig file as `np.nan`. The
default is to choose an aggregate functions that works with `np.nan`.
> [!NOTE]
> - The aggregate function is expected to return either a scalar value or a 1-dimensional NumPy ndarray. If the later, users need to specify the expected dimension of the return array. e.g.
> ```python
> feature_annotation_options=garr.FeatureAnnotationOptions(
> aggregate_function = my_custom_func,
> expected_agg_function_length = 10,
> ),
> - The build process stores missing intervals from a bigwig file as `np.nan`. The default is to choose an aggregate functions that works with `np.nan`.
### Query a `GenomicArrayDataset`
Expand Down
Binary file modified assets/genarr.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/genarr.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion assets/genarr.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 9 additions & 5 deletions src/genomicarrays/build_genomicarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,14 +179,16 @@ def build_genomicarray(
# append start index for each interval
input_intervals["widths"] = input_intervals["ends"] - input_intervals["starts"]
total_length = None
outsize_per_feature = 1
if feature_annotation_options.aggregate_function is None:
total_length = int(input_intervals["widths"].sum())
counter = input_intervals["widths"].shift(1)
counter[0] = 0
input_intervals["genarr_feature_start_index"] = counter.cumsum().astype(int)
else:
counter = [1] * len(input_intervals)
total_length = len(input_intervals)
outsize_per_feature = feature_annotation_options.expected_agg_function_length
counter = [outsize_per_feature] * len(input_intervals)
total_length = len(input_intervals) * outsize_per_feature
counter[0] = 0
input_intervals["genarr_feature_start_index"] = pd.Series(counter).cumsum().astype(int)

Expand Down Expand Up @@ -272,6 +274,7 @@ def build_genomicarray(
idx,
feature_annotation_options.aggregate_function,
total_length,
outsize_per_feature,
)
for idx, bwpath in enumerate(files)
]
Expand All @@ -294,14 +297,15 @@ def build_genomicarray(
)


def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func, total_length):
def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func, total_length, outsize_per_feature):
"""Wrapper to extract the data for the given intervals from the bigwig file and write the output to the tiledb
file."""
data = ubw.wrapper_extract_bw_values(
bw_path=bwpath,
intervals=intervals,
agg_func=agg_func,
total_length=total_length,
outsize_per_feature=outsize_per_feature,
)

if data is not None and len(data) > 0:
Expand All @@ -310,5 +314,5 @@ def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func, tota

def _wrapper_extract_bwinfo(args):
"""Wrapper for multiprocessing multiple files and intervals."""
counts_uri, input_intervals, bwpath, idx, agg_func, total_length = args
return _write_intervals_to_tiledb(counts_uri, input_intervals, bwpath, idx, agg_func, total_length)
counts_uri, input_intervals, bwpath, idx, agg_func, total_length, outsize_per_feature = args
return _write_intervals_to_tiledb(counts_uri, input_intervals, bwpath, idx, agg_func, total_length, outsize_per_feature)
14 changes: 13 additions & 1 deletion src/genomicarrays/build_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,14 +122,26 @@ class FeatureAnnotationOptions:
aggregate_function:
A callable to summarize the values in a given
interval. Defaults to None.
interval. The aggregate function is expected to
return either a scalar value or a 1-dimensional
NumPy `ndarray`.
Defaults to None.
expected_agg_function_length:
Length of the output when a agg function is applied
to an interval. Defaults to 1, expecting a scalar.
Note: `ndarrays` will be flattenned before writing to
TileDB.
"""

skip: bool = False
dtype: np.dtype = np.uint32
tiledb_store_name: str = "feature_annotation"
column_types: Dict[str, np.dtype] = None
aggregate_function: Optional[Callable] = None
expected_agg_function_length: int = 1

def __post_init__(self):
if self.column_types is None:
Expand Down
35 changes: 25 additions & 10 deletions src/genomicarrays/utils_bw.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,22 @@ def wrapper_extract_bw_values(
agg_func: Optional[callable],
val_dtype: np.dtype = np.float32,
total_length: int = None,
outsize_per_feature: int = 1,
) -> np.ndarray:
print("outsize_per_feature", outsize_per_feature)

if total_length is None:
total_length = len(intervals)

if agg_func is not None:
return extract_bw_values_as_vec(bw_path=bw_path, intervals=intervals, agg_func=agg_func, val_dtype=val_dtype)
return extract_bw_values_as_vec(
bw_path=bw_path,
intervals=intervals,
agg_func=agg_func,
val_dtype=val_dtype,
total_length=total_length,
outsize_per_feature=outsize_per_feature,
)
else:
return extract_bw_intervals_as_vec(
bw_path=bw_path, intervals=intervals, val_dtype=val_dtype, total_length=total_length
Expand All @@ -58,8 +68,10 @@ def wrapper_extract_bw_values(
def extract_bw_values_as_vec(
bw_path: str,
intervals: pd.DataFrame,
total_length: int,
agg_func: Optional[callable] = None,
val_dtype: np.dtype = np.float32,
outsize_per_feature: int = 1,
) -> np.ndarray:
"""Extract data from BigWig for a given region and apply the aggregate function.
Expand All @@ -77,26 +89,29 @@ def extract_bw_values_as_vec(
val_dtype:
Dtype of the resulting array.
total_length:
Size of all the regions.
outsize_per_feature:
Expected length of output after applying the ``agg_func``.
Returns:
A vector with length as the number of intervals,
A vector with length as number of intervals X outsize_per_feature,
a value if the file contains the data for the corresponding
region or ``np.nan`` if the region is not measured.
"""
bwfile = bw.open(bw_path)

results = []
for row in intervals.itertuples():
results = _get_empty_array(total_length, val_dtype=val_dtype)
for i, row in enumerate(intervals.itertuples()):
start_idx = i * outsize_per_feature
if row.seqnames in bwfile.chroms():
try:
data = bwfile.values(row.seqnames, row.starts, row.ends, numpy=True)
if data is not None and len(data) != 0:
results.append(agg_func(data))
else:
results.append(np.nan)
results[start_idx : start_idx + outsize_per_feature] = agg_func(data).flatten()
except Exception as _:
results.append(np.nan)
else:
results.append(np.nan)
pass

return np.array(results, dtype=val_dtype)

Expand Down
9 changes: 9 additions & 0 deletions tests/agg_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import numpy as np

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"

def aggfunc(x):
print(x)
return np.array([np.nanmin(x), np.nansum(x)])
39 changes: 39 additions & 0 deletions tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import tiledb

from genomicarrays import build_genomicarray, FeatureAnnotationOptions
from agg_test import aggfunc

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
Expand Down Expand Up @@ -62,6 +63,7 @@ def test_ingest_bigwigs_agg():
features = ffp.df[:]

assert len(features) == 15
assert len(cfp) == 15

assert np.allclose(
cfp.multi_index[0:5, 0]["data"],
Expand All @@ -75,3 +77,40 @@ def test_ingest_bigwigs_agg():
sfp = tiledb.open(f"{tempdir}/sample_metadata", "r")
samples = sfp.df[:]
assert len(samples) == 2

def test_ingest_bigwigs_agg_not_scalar():
tempdir = tempfile.mkdtemp()

strts = np.arange(300, 600, 20)
features = pd.DataFrame(
{"seqnames": ["chr1"] * 15, "starts": strts, "ends": strts + 15}
)

build_genomicarray(
output_path=tempdir,
files=["tests/data/test1.bw", "tests/data/test2.bw"],
features=features,
genome_fasta="tests/data/test.fa",
feature_annotation_options=FeatureAnnotationOptions(aggregate_function=aggfunc, expected_agg_function_length=2),
)

cfp = tiledb.open(f"{tempdir}/coverage", "r")
ffp = tiledb.open(f"{tempdir}/feature_annotation", "r")

features = ffp.df[:]

assert len(features) == 15
assert len(cfp) == 30

assert np.allclose(
cfp.multi_index[0:5, 0]["data"].flatten(),
np.array([1,10,1,10,1,10]),
)
assert np.allclose(
cfp.multi_index[0:5, 1]["data"].flatten(),
np.array([0.5, 5,0.5, 5,0.5, 5]),
)

sfp = tiledb.open(f"{tempdir}/sample_metadata", "r")
samples = sfp.df[:]
assert len(samples) == 2

0 comments on commit ebbb9ab

Please sign in to comment.