Replace pandas with BiocFrame (#13)

jkanche · web-flow · commit d868720ba392 · 2025-01-08T12:41:22.000-08:00
The search and list operations (`search_references` and `list_references`) now return their results as a `BiocFrame` instead of a pandas `DataFrame`.
diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml
@@ -61,7 +61,7 @@ jobs:
       - run: touch ./docs/_build/html/.nojekyll
 
       - name: GH Pages Deployment
-        uses: JamesIves/github-pages-deploy-action@4.1.3
+        uses: JamesIves/github-pages-deploy-action@v4
         with:
           branch: gh-pages # The branch the action should deploy to.
           folder: ./docs/_build/html
@@ -74,7 +74,7 @@ jobs:
           LD_LIBRARY_PATH: /usr/local/lib
 
       - name: Publish package
-        uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+        uses: pypa/gh-action-pypi-publish@v1.12.2
         with:
           user: __token__
           password: ${{ secrets.PYPI_PASSWORD }}
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -4,14 +4,13 @@ on:
   push:
     branches: [master]
   pull_request:
-    branches: [master]
 
 jobs:
   build:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
 
     name: Python ${{ matrix.python-version }}
     steps:
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## Version 0.3.0
+
+- Replace pandas with BiocFrame.
+- Rename GitHub Actions for consistency with the rest of the packages.
+
 ## Version 0.2.0
 
 - chore: Remove Python 3.8 (EOL)
diff --git a/setup.cfg b/setup.cfg
@@ -56,7 +56,7 @@ install_requires =
     delayedarray>=0.5.1
     summarizedexperiment
     singlecellexperiment
-    pandas
+    biocframe
 
 [options.packages.find]
 where = src
diff --git a/src/celldex/list_references.py b/src/celldex/list_references.py
@@ -2,7 +2,7 @@
 import sqlite3
 from functools import lru_cache
 
-import pandas as pd
+from biocframe import BiocFrame
 from gypsum_client import (
     cache_directory,
     fetch_metadata_database,
@@ -14,7 +14,7 @@
 
 
 @lru_cache
-def list_references(cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True) -> pd.DataFrame:
+def list_references(cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True) -> BiocFrame:
     """List all available reference datasets.
 
     Example:
@@ -36,7 +36,7 @@ def list_references(cache_dir: str = cache_directory(), overwrite: bool = False,
             Defaults to True.
 
     Returns:
-        A :py:class:`~pandas.DataFrame` where each row corresponds to a reference
+        A :py:class:`~biocframe.BiocFrame` where each row corresponds to a reference
         dataset. Each row contains title and description for each reference,
         the number of rows and columns, the organisms and genome builds involved,
         whether the dataset has any pre-computed reduced dimensions, and so on.
@@ -83,7 +83,7 @@ def _format_query_results(results: list, key_names: list):
 def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "meta"):
     _all_paths = [None if "/" not in p else p.rsplit("/", 1)[0] for p in results["path"]]
 
-    df = pd.DataFrame(
+    df = BiocFrame(
         {
             "name": results["asset"],
             "version": results["version"],
@@ -148,10 +148,10 @@ def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "met
     for meta in _all_metas:
         cursources = meta.get("sources")
         if cursources is None:
-            sources.append(pd.DataFrame(columns=["provider", "id", "version"]))
+            sources.append(BiocFrame(columns=["provider", "id", "version"]))
         else:
             sources.append(
-                pd.DataFrame(
+                BiocFrame(
                     {
                         "provider": [s.get("provider") for s in cursources],
                         "id": [s.get("id") for s in cursources],
diff --git a/src/celldex/search_references.py b/src/celldex/search_references.py
@@ -2,7 +2,7 @@
 from functools import lru_cache
 from typing import Union
 
-import pandas as pd
+from biocframe import BiocFrame
 from gypsum_client import cache_directory, fetch_metadata_database
 from gypsum_client.search_metadata import (
     GypsumSearchClause,
@@ -22,7 +22,7 @@ def search_references(
     cache_dir: str = cache_directory(),
     overwrite: bool = False,
     latest: bool = True,
-) -> pd.DataFrame:
+) -> BiocFrame:
     """Search for reference datasets of interest based on matching text in the associated metadata.
 
     This is a wrapper around
@@ -70,7 +70,7 @@ def search_references(
             Defaults to True.
 
     Returns:
-        A :py:class:`~pandas.DataFrame` where each row corresponds to
+        A :py:class:`~biocframe.BiocFrame` where each row corresponds to
         a dataset, containing various columns of metadata.
         Some columns may be lists to capture 1:many mappings.
     """
diff --git a/tests/test_list_refs.py b/tests/test_list_refs.py
@@ -1,6 +1,6 @@
 import tempfile
 
-import pandas as pd
+from biocframe import BiocFrame
 from celldex import list_references, list_versions, fetch_latest_version
 
 __author__ = "Jayaram Kancherla"
@@ -11,7 +11,7 @@
 def test_list_references():
     refs = list_references(cache_dir=tempfile.mkdtemp())
 
-    assert isinstance(refs, pd.DataFrame)
+    assert isinstance(refs, BiocFrame)
     assert len(refs) >= 7
 
 
diff --git a/tests/test_search_refs.py b/tests/test_search_refs.py
@@ -1,4 +1,4 @@
-import pandas as pd
+from biocframe import BiocFrame
 from gypsum_client import define_text_query
 from celldex import search_references
 
@@ -10,12 +10,12 @@
 def test_search_references():
     res = search_references("human")
     assert len(res) > 3
-    assert isinstance(res, pd.DataFrame)
+    assert isinstance(res, BiocFrame)
 
     res = search_references(define_text_query("Immun%", partial=True))
-    assert isinstance(res, pd.DataFrame)
+    assert isinstance(res, BiocFrame)
     assert len(res) > 0
 
     res = search_references(define_text_query("10090", field="taxonomy_id"))
-    assert isinstance(res, pd.DataFrame)
+    assert isinstance(res, BiocFrame)
     assert len(res) > 0