Merge branch 'main' into update_pkg

RasmussenLab · Jun 6, 2024 · e0528fb
2 parents 0d3b503 + 4c8e16d
commit e0528fb
Show file tree

Hide file tree

Showing 7 changed files with 81 additions and 65 deletions.
diff --git a/.github/workflows/colab.yml b/.github/workflows/colab.yml
@@ -3,7 +3,7 @@ name: Test that tutorial runs on latest colab image
 on:
   push:
   schedule:
-    - cron: '0 2 * * 3'
+    - cron: '0 2 3 * *'
 
 jobs:
   test:

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,3 +1,9 @@
 {
-    "python.formatting.provider": "yapf"
+    "editor.defaultFormatter": "eeyore.yapf",
+    "python.testing.pytestArgs": [
+        "test"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true,
+
 }
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,4 +1,48 @@
+[project]
+name = "njab"
+authors = [{ name = "Henry Webel", email = "[email protected]" }]
+description = "not Just Another Biomarker"
+readme = "README.md"
+requires-python = ">=3.8"
+keywords = ["bioinformatics", "biomarker"]
+license = { file = "LICENSE" }
+classifiers = [
+  "License :: OSI Approved :: MIT License",
+  "Intended Audience :: Healthcare Industry",
+  "Intended Audience :: Science/Research",
+  "Programming Language :: Python :: 3",
+  "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dependencies = [
+  "omegaconf",
+  "lifelines",
+  "numpy",
+  "pandas",
+  "scikit-learn",
+  "statsmodels",
+  "umap-learn",
+  "matplotlib",
+  "mrmr_selection",
+  "pingouin",
+  "seaborn",
+]
+dynamic = ["version"]
+
+[project.optional-dependencies]
+docs = [
+  "sphinx",
+  "sphinx-book-theme",
+  "myst-nb",
+  "ipywidgets",
+  "sphinx-new-tab-link!=0.2.2",
+]
+
+[project.urls]
+"Bug Tracker" = "https://github.com/RasmussenLab/njab/issues"
+"Homepage" = "https://github.com/RasmussenLab/njab"
+
 [build-system]
-requires = ["setuptools>=42", "wheel"]
 build-backend = "setuptools.build_meta"
+requires = ["setuptools>=64", "setuptools_scm>=8"]
 
+[tool.setuptools_scm]
diff --git a/setup.cfg b/setup.cfg
@@ -1,52 +1,3 @@
-[metadata]
-name = njab
-version = 0.0.4
-description = not Just Another Biomarker
-long_description = file: README.md
-long_description_content_type = text/markdown
-author = Henry Webel
-author_email = "Henry Webel" <[email protected]>
-license = MIT
-license_files = LICENSE
-classifiers =
-    License :: OSI Approved :: MIT License
-    Intended Audience :: Healthcare Industry
-    Intended Audience :: Science/Research
-    Programming Language :: Python :: 3
-    Topic :: Scientific/Engineering :: Bio-Informatics
-
-[options]
-requires_python = ">=3.8"
-package_dir =
-    =src
-packages = find:
-include_package_data = False
-install_requires =
-    omegaconf
-    lifelines
-    numpy
-    pandas
-    scikit-learn
-    statsmodels
-    umap-learn
-    matplotlib
-    mrmr_selection
-    pingouin
-    seaborn
-
-[options.extras_require]
-docs =
-    sphinx
-    sphinx-book-theme
-    myst-nb
-    ipywidgets
-    sphinx-new-tab-link!=0.2.2
-
-[options.packages.find]
-where = src
-exclude =
-    test*
-
 [yapf]
 based_on_style = pep8
 

diff --git a/src/njab/pandas/__init__.py b/src/njab/pandas/__init__.py
@@ -1,20 +1,26 @@
 import logging
 import typing
 
+import omegaconf
 import pandas as pd
 import pandas.io.formats.format as pf
 
-import omegaconf
-
 logger = logging.getLogger(__name__)
 
 
-def set_pandas_options() -> None:
+def set_pandas_options(max_columns: int = 100,
+                       max_row: int = 30,
+                       min_row: int = 20,
+                       float_format='{:,.3f}') -> None:
     """Update default pandas options for better display."""
-    pd.options.display.max_columns = 100
-    pd.options.display.max_rows = 30
-    pd.options.display.min_rows = 20
-    pd.options.display.float_format = '{:,.3f}'.format
+    pd.options.display.max_columns = max_columns
+    pd.options.display.max_rows = max_row
+    pd.options.display.min_rows = min_row
+    set_pandas_number_formatting(float_format=float_format)
+
+
+def set_pandas_number_formatting(float_format='{:,.3f}') -> None:
+    pd.options.display.float_format = float_format.format
     # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.describe_option.html#pandas.describe_option
     pd.options.styler.format.thousands = ','
     # # https://github.com/pandas-dev/pandas/blob/main/pandas/io/formats/format.py#L1475
@@ -114,10 +120,8 @@ def combine_value_counts(X: pd.DataFrame, dropna=True) -> pd.DataFrame:
     pandas.DataFrame
         DataFrame of combined value counts.
     """
-    """
-    """
-    _df = pd.DataFrame()
+    freq_targets = list()
     for col in X.columns:
-        _df = _df.join(X[col].value_counts(dropna=dropna), how='outer')
-    freq_targets = _df.sort_index()
+        freq_targets.append(X[col].value_counts(dropna=dropna).rename(col))
+    freq_targets = pd.concat(freq_targets, axis=1, sort=True)
     return freq_targets
diff --git a/src/njab/stats/ancova.py b/src/njab/stats/ancova.py
@@ -1,8 +1,8 @@
 """Analysis of covariance using pingouin and statsmodels."""
 from __future__ import annotations
+
 import numpy as np
 import pandas as pd
-
 import pingouin as pg
 import statsmodels
 
@@ -52,6 +52,7 @@ def ancova_pg(df_long: pd.DataFrame,
     # num_covar = len(covar)
 
     for feat_name, data_feat in df_long.groupby(feat_col):
+        # ? should duplicated columns in the long data format be dropped?
         ancova = pg.ancova(data=data_feat, dv=dv, between=between, covar=covar)
         ancova[feat_col] = feat_name
         scores.append(ancova)
@@ -137,6 +138,8 @@ def ancova(self, random_seed=123):
 
         scores = self.get_scores()
         scores = filter_residuals_from_scores(scores)
+        # drop NaN values (caused by multicollinearity of features, i.e. duplicated features)
+        scores = scores.dropna()
         scores = add_fdr_scores(scores, random_seed=random_seed)
         self.scores = scores
         return scores.set_index('Source', append=True)

diff --git a/test/test_pandas.py b/test/test_pandas.py
@@ -1,8 +1,16 @@
 import pandas as pd
+
 import njab
 
 
 def test_thousands_display():
     njab.pandas.set_pandas_options()
     s = pd.Series([1_000_000])
     assert str(s)[4:13] == '1,000,000'
+
+
+def test_combine_value_counts():
+    df = pd.DataFrame({'a': [1, 2, 2, 2, 3, 3, 3], 'b': [1, 1, 1, 2, 2, 3, 3]})
+    exp = {'a': {1: 1, 2: 3, 3: 3}, 'b': {1: 3, 2: 2, 3: 2}}
+    act = njab.pandas.combine_value_counts(df).to_dict()
+    assert act == exp