Support 1-d outputs for agg functions (#15)

CellArr · Jan 29, 2025 · ebbb9ab · ebbb9ab
1 parent 0e16c4c
commit ebbb9ab
Show file tree

Hide file tree

Showing 11 changed files with 117 additions and 26 deletions.
diff --git a/.github/workflows/pypi-publish.yml → .github/workflows/publish-pypi.yml b/.github/workflows/pypi-publish.yml → .github/workflows/publish-pypi.yml
@@ -10,6 +10,11 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+      repository-projects: write
+      contents: write
+      pages: write
 
     steps:
       - uses: actions/checkout@v4
@@ -45,8 +50,6 @@ jobs:
         run: |
           python -m tox -e clean,build
 
-      - name: Publish package
-        uses: pypa/[email protected]
-        with:
-          user: __token__
-          password: ${{ secrets.PYPI_PASSWORD }}
+      # This uses the trusted publisher workflow so no token is required.
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/pypi-test.yml → .github/workflows/run-tests.yml b/.github/workflows/pypi-test.yml → .github/workflows/run-tests.yml
@@ -30,4 +30,4 @@ jobs:
 
       - name: Test with tox
         run: |
-          tox
+          tox
diff --git a/README.md b/README.md
@@ -10,7 +10,8 @@
 [![Twitter](https://img.shields.io/twitter/url/http/shields.io.svg?style=social&label=Twitter)](https://twitter.com/GenomicArrays)
 -->
 
-[![Project generated with PyScaffold](https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold)](https://pyscaffold.org/)
+[![PyPI-Server](https://img.shields.io/pypi/v/GenomicArrays.svg)](https://pypi.org/project/GenomicArrays/)
+![Unit tests](https://github.com/CellArr/GenomicArrays/actions/workflows/run-tests.yml/badge.svg)
 
 # Genomic Arrays based on TileDB
 
@@ -80,8 +81,16 @@ dataset = garr.build_genomicarray(
 )
 ```
 
-The build process stores missing intervals from a bigwig file as `np.nan`. The
-default is to choose an aggregate functions that works with `np.nan`.
+> [!NOTE]
+> - The aggregate function is expected to return either a scalar value or a 1-dimensional NumPy ndarray. If the later, users need to specify the expected dimension of the return array. e.g. 
+>   ```python
+>         feature_annotation_options=garr.FeatureAnnotationOptions(
+>               aggregate_function = my_custom_func,
+>               expected_agg_function_length = 10,
+>          ),
+> - The build process stores missing intervals from a bigwig file as `np.nan`. The default is to choose an aggregate functions that works with `np.nan`.
+
+
 
 ### Query a `GenomicArrayDataset`
 

diff --git a/assets/genarr.jpg b/assets/genarr.jpg
diff --git a/assets/genarr.png b/assets/genarr.png
diff --git a/assets/genarr.svg b/assets/genarr.svg
diff --git a/src/genomicarrays/build_genomicarray.py b/src/genomicarrays/build_genomicarray.py
@@ -179,14 +179,16 @@ def build_genomicarray(
     # append start index for each interval
     input_intervals["widths"] = input_intervals["ends"] - input_intervals["starts"]
     total_length = None
+    outsize_per_feature = 1
     if feature_annotation_options.aggregate_function is None:
         total_length = int(input_intervals["widths"].sum())
         counter = input_intervals["widths"].shift(1)
         counter[0] = 0
         input_intervals["genarr_feature_start_index"] = counter.cumsum().astype(int)
     else:
-        counter = [1] * len(input_intervals)
-        total_length = len(input_intervals)
+        outsize_per_feature = feature_annotation_options.expected_agg_function_length
+        counter = [outsize_per_feature] * len(input_intervals)
+        total_length = len(input_intervals) * outsize_per_feature
         counter[0] = 0
         input_intervals["genarr_feature_start_index"] = pd.Series(counter).cumsum().astype(int)
 
@@ -272,6 +274,7 @@ def build_genomicarray(
                 idx,
                 feature_annotation_options.aggregate_function,
                 total_length,
+                outsize_per_feature,
             )
             for idx, bwpath in enumerate(files)
         ]
@@ -294,14 +297,15 @@ def build_genomicarray(
     )
 
 
-def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func, total_length):
+def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func, total_length, outsize_per_feature):
     """Wrapper to extract the data for the given intervals from the bigwig file and write the output to the tiledb
     file."""
     data = ubw.wrapper_extract_bw_values(
         bw_path=bwpath,
         intervals=intervals,
         agg_func=agg_func,
         total_length=total_length,
+        outsize_per_feature=outsize_per_feature,
     )
 
     if data is not None and len(data) > 0:
@@ -310,5 +314,5 @@ def _write_intervals_to_tiledb(outpath, intervals, bwpath, bwidx, agg_func, tota
 
 def _wrapper_extract_bwinfo(args):
     """Wrapper for multiprocessing multiple files and intervals."""
-    counts_uri, input_intervals, bwpath, idx, agg_func, total_length = args
-    return _write_intervals_to_tiledb(counts_uri, input_intervals, bwpath, idx, agg_func, total_length)
+    counts_uri, input_intervals, bwpath, idx, agg_func, total_length, outsize_per_feature = args
+    return _write_intervals_to_tiledb(counts_uri, input_intervals, bwpath, idx, agg_func, total_length, outsize_per_feature)
diff --git a/src/genomicarrays/build_options.py b/src/genomicarrays/build_options.py
@@ -122,14 +122,26 @@ class FeatureAnnotationOptions:
 
         aggregate_function:
             A callable to summarize the values in a given
-            interval. Defaults to None.
+            interval. The aggregate function is expected to
+            return either a scalar value or a 1-dimensional
+            NumPy `ndarray`.
+            
+            Defaults to None.
+
+        expected_agg_function_length:
+            Length of the output when a agg function is applied
+            to an interval. Defaults to 1, expecting a scalar.
+
+            Note: `ndarrays` will be flattenned before writing to
+            TileDB.
     """
 
     skip: bool = False
     dtype: np.dtype = np.uint32
     tiledb_store_name: str = "feature_annotation"
     column_types: Dict[str, np.dtype] = None
     aggregate_function: Optional[Callable] = None
+    expected_agg_function_length: int = 1
 
     def __post_init__(self):
         if self.column_types is None:

diff --git a/src/genomicarrays/utils_bw.py b/src/genomicarrays/utils_bw.py
@@ -43,12 +43,22 @@ def wrapper_extract_bw_values(
     agg_func: Optional[callable],
     val_dtype: np.dtype = np.float32,
     total_length: int = None,
+    outsize_per_feature: int = 1,
 ) -> np.ndarray:
+    print("outsize_per_feature", outsize_per_feature)
+
     if total_length is None:
         total_length = len(intervals)
 
     if agg_func is not None:
-        return extract_bw_values_as_vec(bw_path=bw_path, intervals=intervals, agg_func=agg_func, val_dtype=val_dtype)
+        return extract_bw_values_as_vec(
+            bw_path=bw_path,
+            intervals=intervals,
+            agg_func=agg_func,
+            val_dtype=val_dtype,
+            total_length=total_length,
+            outsize_per_feature=outsize_per_feature,
+        )
     else:
         return extract_bw_intervals_as_vec(
             bw_path=bw_path, intervals=intervals, val_dtype=val_dtype, total_length=total_length
@@ -58,8 +68,10 @@ def wrapper_extract_bw_values(
 def extract_bw_values_as_vec(
     bw_path: str,
     intervals: pd.DataFrame,
+    total_length: int,
     agg_func: Optional[callable] = None,
     val_dtype: np.dtype = np.float32,
+    outsize_per_feature: int = 1,
 ) -> np.ndarray:
     """Extract data from BigWig for a given region and apply the aggregate function.
 
@@ -77,26 +89,29 @@ def extract_bw_values_as_vec(
         val_dtype:
             Dtype of the resulting array.
 
+        total_length:
+            Size of all the regions.
+
+        outsize_per_feature:
+            Expected length of output after applying the ``agg_func``.
+
     Returns:
-        A vector with length as the number of intervals,
+        A vector with length as number of intervals X outsize_per_feature,
         a value if the file contains the data for the corresponding
         region or ``np.nan`` if the region is not measured.
     """
     bwfile = bw.open(bw_path)
 
-    results = []
-    for row in intervals.itertuples():
+    results = _get_empty_array(total_length, val_dtype=val_dtype)
+    for i, row in enumerate(intervals.itertuples()):
+        start_idx = i * outsize_per_feature
         if row.seqnames in bwfile.chroms():
             try:
                 data = bwfile.values(row.seqnames, row.starts, row.ends, numpy=True)
                 if data is not None and len(data) != 0:
-                    results.append(agg_func(data))
-                else:
-                    results.append(np.nan)
+                    results[start_idx : start_idx + outsize_per_feature] = agg_func(data).flatten()
             except Exception as _:
-                results.append(np.nan)
-        else:
-            results.append(np.nan)
+                pass
 
     return np.array(results, dtype=val_dtype)
 

diff --git a/tests/agg_test.py b/tests/agg_test.py
@@ -0,0 +1,9 @@
+import numpy as np
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+def aggfunc(x):
+    print(x)
+    return np.array([np.nanmin(x), np.nansum(x)])
diff --git a/tests/test_ingest.py b/tests/test_ingest.py
@@ -5,6 +5,7 @@
 import tiledb
 
 from genomicarrays import build_genomicarray, FeatureAnnotationOptions
+from agg_test import aggfunc
 
 __author__ = "Jayaram Kancherla"
 __copyright__ = "Jayaram Kancherla"
@@ -62,6 +63,7 @@ def test_ingest_bigwigs_agg():
     features = ffp.df[:]
 
     assert len(features) == 15
+    assert len(cfp) == 15
 
     assert np.allclose(
         cfp.multi_index[0:5, 0]["data"],
@@ -75,3 +77,40 @@ def test_ingest_bigwigs_agg():
     sfp = tiledb.open(f"{tempdir}/sample_metadata", "r")
     samples = sfp.df[:]
     assert len(samples) == 2
+
+def test_ingest_bigwigs_agg_not_scalar():
+    tempdir = tempfile.mkdtemp()
+
+    strts = np.arange(300, 600, 20)
+    features = pd.DataFrame(
+        {"seqnames": ["chr1"] * 15, "starts": strts, "ends": strts + 15}
+    )
+
+    build_genomicarray(
+        output_path=tempdir,
+        files=["tests/data/test1.bw", "tests/data/test2.bw"],
+        features=features,
+        genome_fasta="tests/data/test.fa",
+        feature_annotation_options=FeatureAnnotationOptions(aggregate_function=aggfunc, expected_agg_function_length=2),
+    )
+
+    cfp = tiledb.open(f"{tempdir}/coverage", "r")
+    ffp = tiledb.open(f"{tempdir}/feature_annotation", "r")
+
+    features = ffp.df[:]
+
+    assert len(features) == 15
+    assert len(cfp) == 30
+
+    assert np.allclose(
+        cfp.multi_index[0:5, 0]["data"].flatten(),
+        np.array([1,10,1,10,1,10]),
+    )
+    assert np.allclose(
+        cfp.multi_index[0:5, 1]["data"].flatten(),
+        np.array([0.5, 5,0.5, 5,0.5, 5]),
+    )
+
+    sfp = tiledb.open(f"{tempdir}/sample_metadata", "r")
+    samples = sfp.df[:]
+    assert len(samples) == 2
-Original file line number
+Diff line change
@@ Expand Up / @@ -30,4 +30,4 @@ jobs: @@
           - name: Test with tox
             run: |
-              tox
+              tox