From 96001782f747004fb9c9e193a27f375821a6dfde Mon Sep 17 00:00:00 2001 From: Henry Date: Thu, 2 May 2024 14:09:38 +0200 Subject: [PATCH 1/7] :art: separate pandas formatting --- src/njab/pandas/__init__.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/njab/pandas/__init__.py b/src/njab/pandas/__init__.py index 9af3096..b0c1505 100644 --- a/src/njab/pandas/__init__.py +++ b/src/njab/pandas/__init__.py @@ -1,20 +1,26 @@ import logging import typing +import omegaconf import pandas as pd import pandas.io.formats.format as pf -import omegaconf - logger = logging.getLogger(__name__) -def set_pandas_options() -> None: +def set_pandas_options(max_columns: int = 100, + max_row: int = 30, + min_row: int = 20, + float_format='{:,.3f}') -> None: """Update default pandas options for better display.""" - pd.options.display.max_columns = 100 - pd.options.display.max_rows = 30 - pd.options.display.min_rows = 20 - pd.options.display.float_format = '{:,.3f}'.format + pd.options.display.max_columns = max_columns + pd.options.display.max_rows = max_row + pd.options.display.min_rows = min_row + set_pandas_number_formatting(float_format=float_format) + + +def set_pandas_number_formatting(float_format='{:,.3f}') -> None: + pd.options.display.float_format = float_format.format # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.describe_option.html#pandas.describe_option pd.options.styler.format.thousands = ',' # # https://github.com/pandas-dev/pandas/blob/main/pandas/io/formats/format.py#L1475 From 17e3fa063766545f405fa5aec31e1a392b1b7f12 Mon Sep 17 00:00:00 2001 From: Henry Date: Thu, 2 May 2024 16:19:07 +0200 Subject: [PATCH 2/7] :bookmark: v0.0.5 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index bdd1cac..23d61d5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = njab -version = 0.0.4 +version = 0.0.5 description = not Just Another Biomarker 
long_description = file: README.md long_description_content_type = text/markdown From 9d4fd08da544b91ae00ef48416381f93b4a66c9b Mon Sep 17 00:00:00 2001 From: Henry Date: Tue, 14 May 2024 15:07:40 +0200 Subject: [PATCH 3/7] :art: run colab test only once a month --- .github/workflows/colab.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/colab.yml b/.github/workflows/colab.yml index 8293dcf..2f04594 100644 --- a/.github/workflows/colab.yml +++ b/.github/workflows/colab.yml @@ -3,7 +3,7 @@ name: Test that tutorial runs on latest colab image on: push: schedule: - - cron: '0 2 * * 3' + - cron: '0 2 3 * *' jobs: test: From 12f19c5c18623d4859e94600038919e36dd74a3e Mon Sep 17 00:00:00 2001 From: Henry Date: Tue, 14 May 2024 16:46:05 +0200 Subject: [PATCH 4/7] :bug: update to make it pandas 2 compatible --- setup.cfg | 2 +- src/njab/pandas/__init__.py | 8 +++----- test/test_pandas.py | 8 ++++++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 23d61d5..1e8bb9a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = njab -version = 0.0.5 +version = 0.0.6 description = not Just Another Biomarker long_description = file: README.md long_description_content_type = text/markdown diff --git a/src/njab/pandas/__init__.py b/src/njab/pandas/__init__.py index b0c1505..ef5db2c 100644 --- a/src/njab/pandas/__init__.py +++ b/src/njab/pandas/__init__.py @@ -120,10 +120,8 @@ def combine_value_counts(X: pd.DataFrame, dropna=True) -> pd.DataFrame: pandas.DataFrame DataFrame of combined value counts. 
""" - """ - """ - _df = pd.DataFrame() + freq_targets = list() for col in X.columns: - _df = _df.join(X[col].value_counts(dropna=dropna), how='outer') - freq_targets = _df.sort_index() + freq_targets.append(X[col].value_counts(dropna=dropna).rename(col)) + freq_targets = pd.concat(freq_targets, axis=1, sort=True) return freq_targets diff --git a/test/test_pandas.py b/test/test_pandas.py index b117f72..0db68dc 100644 --- a/test/test_pandas.py +++ b/test/test_pandas.py @@ -1,4 +1,5 @@ import pandas as pd + import njab @@ -6,3 +7,10 @@ def test_thousands_display(): njab.pandas.set_pandas_options() s = pd.Series([1_000_000]) assert str(s)[4:13] == '1,000,000' + + +def test_combine_value_counts(): + df = pd.DataFrame({'a': [1, 2, 2, 2, 3, 3, 3], 'b': [1, 1, 1, 2, 2, 3, 3]}) + exp = {'a': {1: 1, 2: 3, 3: 3}, 'b': {1: 3, 2: 2, 3: 2}} + act = njab.pandas.combine_value_counts(df).to_dict() + assert act == exp From 55036f573b3a261f66c1e8bfa45457c4b917cfac Mon Sep 17 00:00:00 2001 From: Henry Date: Mon, 27 May 2024 16:46:15 +0200 Subject: [PATCH 5/7] :bug: remove missing values from list of p-values before FDR calc The calculation with any missing values leads to NAN q-values for all entries. This happens mostly due to multicollinearity, which could be check for in the ANCOVA. --- src/njab/stats/ancova.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/njab/stats/ancova.py b/src/njab/stats/ancova.py index 7f4c024..da43a18 100644 --- a/src/njab/stats/ancova.py +++ b/src/njab/stats/ancova.py @@ -1,8 +1,8 @@ """Analysis of covariance using pingouin and statsmodels.""" from __future__ import annotations + import numpy as np import pandas as pd - import pingouin as pg import statsmodels @@ -52,6 +52,7 @@ def ancova_pg(df_long: pd.DataFrame, # num_covar = len(covar) for feat_name, data_feat in df_long.groupby(feat_col): + # ? drop duplicated colummns in long data format? 
ancova = pg.ancova(data=data_feat, dv=dv, between=between, covar=covar) ancova[feat_col] = feat_name scores.append(ancova) @@ -137,6 +138,8 @@ def ancova(self, random_seed=123): scores = self.get_scores() scores = filter_residuals_from_scores(scores) + # drop nan values (due to multicollinearity of features - i.e. duplicated features) + scores = scores.dropna() scores = add_fdr_scores(scores, random_seed=random_seed) self.scores = scores return scores.set_index('Source', append=True) From b863180c1b8fa21b556c3d653f6daee536003bd8 Mon Sep 17 00:00:00 2001 From: Henry Date: Mon, 27 May 2024 17:28:47 +0200 Subject: [PATCH 6/7] :sparkles: switch to dynamic versioning based on latest tag - keep track of intermediate versions --- pyproject.toml | 46 +++++++++++++++++++++++++++++++++++++++++++++- setup.cfg | 49 ------------------------------------------------- 2 files changed, 45 insertions(+), 50 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 008c8e4..1b2e9f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,48 @@ +[project] +name = "njab" +authors = [{ name = "Henry Webel", email = "henry.webel@cpr.ku.dk" }] +description = "not Just Another Biomarker" +readme = "README.md" +requires-python = ">=3.8" +keywords = ["bioinformatics", "biomarker"] +license = { file = "LICENSE" } +classifiers = [ + "License :: OSI Approved :: MIT License", + "Intended Audience :: Healthcare Industry", + "Intended Audience :: Science/Research", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "omegaconf", + "lifelines", + "numpy", + "pandas", + "scikit-learn", + "statsmodels", + "umap-learn", + "matplotlib", + "mrmr_selection", + "pingouin", + "seaborn", +] +dynamic = ["version"] + +[project.optional-dependencies] +docs = [ + "sphinx", + "sphinx-book-theme", + "myst-nb", + "ipywidgets", + "sphinx-new-tab-link!=0.2.2", +] + +[project.urls] +"Bug Tracker" = 
"https://github.com/RasmussenLab/njab/issues" +"Homepage" = "https://github.com/RasmussenLab/njab" + [build-system] -requires = ["setuptools>=42", "wheel"] build-backend = "setuptools.build_meta" +requires = ["setuptools>=64", "setuptools_scm>=8"] +[tool.setuptools_scm] diff --git a/setup.cfg b/setup.cfg index 1e8bb9a..412b0d9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,52 +1,3 @@ -[metadata] -name = njab -version = 0.0.6 -description = not Just Another Biomarker -long_description = file: README.md -long_description_content_type = text/markdown -author = Henry Webel -author_email = "Henry Webel" -license = MIT -license_files = LICENSE -classifiers = - License :: OSI Approved :: MIT License - Intended Audience :: Healthcare Industry - Intended Audience :: Science/Research - Programming Language :: Python :: 3 - Topic :: Scientific/Engineering :: Bio-Informatics - -[options] -requires_python = ">=3.8" -package_dir = - =src -packages = find: -include_package_data = False -install_requires = - omegaconf - lifelines - numpy - pandas - scikit-learn - statsmodels - umap-learn - matplotlib - mrmr_selection - pingouin - seaborn - -[options.extras_require] -docs = - sphinx - sphinx-book-theme - myst-nb - ipywidgets - sphinx-new-tab-link!=0.2.2 - -[options.packages.find] -where = src -exclude = - test* - [yapf] based_on_style = pep8 From 4c8e16d7e6453d73c76824f6753bf2359f22a4bb Mon Sep 17 00:00:00 2001 From: Henry Date: Fri, 31 May 2024 17:03:28 +0200 Subject: [PATCH 7/7] :wrench: update VSCode config --- .vscode/settings.json | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 2b7e46d..aeee31c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,9 @@ { - "python.formatting.provider": "yapf" + "editor.defaultFormatter": "eeyore.yapf", + "python.testing.pytestArgs": [ + "test" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + } \ No newline at 
end of file