From ea902a6b9e521bde0bda536df164c2cc41b7256d Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Mon, 10 Jun 2024 10:58:40 +0200 Subject: [PATCH] :bug: make all imports explicit w.r.t. pkg (#68) - make all imports explicit - let workflow run as a last resort with one job at a time (although it wasn't necessary for the merge run) * :bug: make all imports explicit w.r.t. pkg * :memo: update M1 installation instructions - still does not seem to work * :bug: deactivating njab makes it run! - committed from marc's M1 laptop * :rewind: use njab after fix in downstream pkg underlying case is polars import in mrmr-selection, see: https://github.com/RasmussenLab/njab/pull/13 * :bug: try to allow parallel installations for R - ubuntu runs are regularly failing as lock can cause issues - alternatively one job at a time could be run in the retry * :bug: try to manually add missing dependency: gmm * :zap::memo: make ci more robust to installation issues, update README for Mac M1 chips --------- Co-authored-by: mpielies --- .github/workflows/ci.yaml | 12 ++++++-- .github/workflows/workflow_website.yaml | 6 ++++ README.md | 17 ++++++++---- project/01_1_train_NAGuideR_methods.R | 6 ++++ project/01_1_train_NAGuideR_methods.ipynb | 6 ++++ vaep/__init__.py | 3 -- vaep/analyzers/__init__.py | 2 +- vaep/data_handling.py | 2 -- vaep/filter.py | 2 -- vaep/models/__init__.py | 3 +- vaep/models/ae.py | 2 +- vaep/models/collab.py | 3 +- vaep/pandas/__init__.py | 34 +++++++++++++++++++---- vaep/plotting/__init__.py | 7 ++--- 14 files changed, 74 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f46fec693..60149e4b1 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -69,17 +69,25 @@ jobs: mkdir runs papermill 04_1_train_pimms_models.ipynb runs/04_1_train_pimms_models.ipynb papermill 04_1_train_pimms_models.ipynb runs/04_1_train_pimms_models_no_val.ipynb -p sample_splits False - - name: Run demo workflow 
(integration test) + - name: Dry-Run demo workflow (integration test) continue-on-error: true run: | cd project snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n + - name: Run demo workflow (integration test) + continue-on-error: true + run: | + cd project snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml - name: Run demo workflow again (in case of installation issues) + continue-on-error: true run: | cd project - snakemake -p -c1 -n --configfile config/single_dev_dataset/example/config.yaml snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml + - name: Run demo workflow again (in case of installation issues) - one thread + run: | + cd project + snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml - name: Archive results # https://github.com/actions/upload-artifact uses: actions/upload-artifact@v4 diff --git a/.github/workflows/workflow_website.yaml b/.github/workflows/workflow_website.yaml index be97836db..d4f4e845a 100644 --- a/.github/workflows/workflow_website.yaml +++ b/.github/workflows/workflow_website.yaml @@ -39,9 +39,15 @@ jobs: cd project snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k - name: Run demo workflow again (in case of installation issues) + continue-on-error: true run: | cd project snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k + - name: Run demo workflow again (in case of installation issues) with one thread + continue-on-error: true + run: | + cd project + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c1 -k - name: Run differential analysis workflow run: | cd project diff --git a/README.md b/README.md index aedf54e82..b11a02b64 100644 --- a/README.md +++ b/README.md @@ -127,17 +127,24 @@ mamba env create -n pimms -f environment.yml # faster, less then 5mins If on Mac M1, M2 or having otherwise 
issue using your accelerator (e.g. GPUs): Install the pytorch dependencies first, then the rest of the environment: -### Install pytorch first (M-chips) +### Install pytorch first + +> :warning: We currently see issues with some installations on M1 chips. A dependency +> for one workflow is polars, which causes the issue. This should be [fixed now](https://github.com/RasmussenLab/njab/pull/13) +> for general use by delayed import +> of `mrmr-selection` in `njab`. If you encounter issues, please open an issue. Check how to install pytorch for your system [here](https://pytorch.org/get-started). - select the version compatible with your cuda version if you have an nvidia gpu or a Mac M-chip. ```bash -conda create -n vaep python=3.9 pip -conda activate vaep -# Follow instructions on https://pytorch.org/get-started -# conda env update -f environment.yml -n vaep # should not install the rest. +conda create -n pimms python=3.9 pip +conda activate pimms +# Follow instructions on https://pytorch.org/get-started: +# CUDA is not available on MacOS, please use default package +# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +conda install pytorch::pytorch torchvision torchaudio fastai -c pytorch -c fastai -y pip install pimms-learn pip install jupyterlab papermill # use run notebook interactively or as a script diff --git a/project/01_1_train_NAGuideR_methods.R b/project/01_1_train_NAGuideR_methods.R index 13997aa17..3918a663c 100644 --- a/project/01_1_train_NAGuideR_methods.R +++ b/project/01_1_train_NAGuideR_methods.R @@ -20,6 +20,8 @@ # - BiocManager could be moved to methods who are installed from BioConductor # + tags=["hide-input"] vscode={"languageId": "r"} +# options("install.lock"=FALSE) + packages_base_R <- c("BiocManager", "reshape2", "data.table", "readr", "tibble") @@ -130,6 +132,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "qrilc") { install_bioconductor("impute") 
install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- t(df1) data_zero1 <- @@ -139,6 +142,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "mindet") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- as.matrix(df1) df <- imputeLCMD::impute.MinDet(xxm, q = 0.01) @@ -146,6 +150,7 @@ nafunctions <- function(x, method = "zero") { else if (method == "minprob") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') xxm <- as.matrix(df1) df <- @@ -278,6 +283,7 @@ nafunctions <- function(x, method = "zero") { install_bioconductor("impute") install_bioconductor("pcaMethods") + install_rpackage('gmm') install_rpackage('imputeLCMD') install_rpackage("magrittr") install_rpackage("glmnet") diff --git a/project/01_1_train_NAGuideR_methods.ipynb b/project/01_1_train_NAGuideR_methods.ipynb index 23fae4bd3..f6a7a6a1b 100644 --- a/project/01_1_train_NAGuideR_methods.ipynb +++ b/project/01_1_train_NAGuideR_methods.ipynb @@ -26,6 +26,8 @@ }, "outputs": [], "source": [ + "# options(\"install.lock\"=FALSE)\n", + "\n", "packages_base_R <-\n", " c(\"BiocManager\", \"reshape2\", \"data.table\", \"readr\", \"tibble\")\n", "\n", @@ -160,6 +162,7 @@ " else if (method == \"qrilc\") {\n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- t(df1)\n", " data_zero1 <-\n", @@ -169,6 +172,7 @@ " else if (method == \"mindet\") {\n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- as.matrix(df1)\n", " df <- imputeLCMD::impute.MinDet(xxm, q = 0.01)\n", @@ -176,6 +180,7 @@ " else if (method == \"minprob\") {\n", " install_bioconductor(\"impute\")\n", " 
install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " xxm <- as.matrix(df1)\n", " df <-\n", @@ -308,6 +313,7 @@ " \n", " install_bioconductor(\"impute\")\n", " install_bioconductor(\"pcaMethods\")\n", + " install_rpackage('gmm')\n", " install_rpackage('imputeLCMD')\n", " install_rpackage(\"magrittr\")\n", " install_rpackage(\"glmnet\")\n", diff --git a/vaep/__init__.py b/vaep/__init__.py index 98460455a..059ccf970 100644 --- a/vaep/__init__.py +++ b/vaep/__init__.py @@ -9,10 +9,7 @@ from importlib import metadata import njab -import pandas as pd -import pandas.io.formats.format as pf -# from . import logging, nb, pandas, plotting import vaep.logging import vaep.nb import vaep.pandas diff --git a/vaep/analyzers/__init__.py b/vaep/analyzers/__init__.py index 9856dafb9..6d16805a6 100644 --- a/vaep/analyzers/__init__.py +++ b/vaep/analyzers/__init__.py @@ -2,7 +2,7 @@ """ from types import SimpleNamespace -from . import compare_predictions, diff_analysis +from vaep.analyzers import compare_predictions, diff_analysis __all__ = ['diff_analysis', 'compare_predictions', 'Analysis'] diff --git a/vaep/data_handling.py b/vaep/data_handling.py index 1f0bc5404..41be078ac 100644 --- a/vaep/data_handling.py +++ b/vaep/data_handling.py @@ -4,8 +4,6 @@ import numpy as np import pandas as pd -# coverage - def coverage(X: pd.DataFrame, coverage_col: float, coverage_row: float): """Select proteins by column depending on their coverage. diff --git a/vaep/filter.py b/vaep/filter.py index 2c85999a9..2d26c4806 100644 --- a/vaep/filter.py +++ b/vaep/filter.py @@ -4,8 +4,6 @@ logger = logging.getLogger(__name__) -# ! 
use in data selection and tutorial - def select_features(df: pd.DataFrame, feat_prevalence: float = .2, diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 3be35408b..2ae111d44 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -16,8 +16,7 @@ from fastcore.foundation import L import vaep - -from . import ae, analysis, collab, vae +from vaep.models import ae, analysis, collab, vae logger = logging.getLogger(__name__) diff --git a/vaep/models/ae.py b/vaep/models/ae.py index fd5d081a1..8295c8560 100644 --- a/vaep/models/ae.py +++ b/vaep/models/ae.py @@ -21,7 +21,7 @@ import vaep.models import vaep.transform -from . import analysis +from vaep.models import analysis logger = logging.getLogger(__name__) diff --git a/vaep/models/collab.py b/vaep/models/collab.py index 6e1403213..f54ab6df2 100644 --- a/vaep/models/collab.py +++ b/vaep/models/collab.py @@ -11,8 +11,7 @@ import vaep.io.dataloaders import vaep.io.datasplits - -from . import analysis +from vaep.models import analysis logger = logging.getLogger(__name__) diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py index 97520bb02..5f82204b1 100644 --- a/vaep/pandas/__init__.py +++ b/vaep/pandas/__init__.py @@ -7,7 +7,30 @@ import omegaconf import pandas as pd -from .calc_errors import calc_errors_per_feat, get_absolute_error +from vaep.pandas.calc_errors import calc_errors_per_feat, get_absolute_error + +__all__ = [ + 'calc_errors_per_feat', + 'get_absolute_error', + 'unique_cols', + 'get_unique_non_unique_columns', + 'prop_unique_index', + 'replace_with', + 'index_to_dict', + 'get_columns_accessor', + 'get_columns_accessor_from_iterable', + 'select_max_by', + 'get_columns_namedtuple', + 'highlight_min', + '_add_indices', + 'interpolate', + 'flatten_dict_of_dicts', + 'key_map', + 'parse_query_expression', + 'length', + 'get_last_index_matching_proportion', + 'get_lower_whiskers', + 'get_counts_per_bin'] def unique_cols(s: pd.Series) -> bool: @@ -285,16 +308,15 @@ def 
get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series: return ret -def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame: +def get_counts_per_bin(df: pd.DataFrame, + bins: range, + columns: Optional[List[str]] = None) -> pd.DataFrame: """Return counts per bin for selected columns in DataFrame.""" counts_per_bin = dict() if columns is None: columns = df.columns.to_list() for col in columns: - _series = (pd.cut(df[col], bins=bins) - .to_frame() - .groupby(col) - .size()) + _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size()) _series.index.name = 'bin' counts_per_bin[col] = _series counts_per_bin = pd.DataFrame(counts_per_bin) diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py index 17cc86ced..105c183f8 100644 --- a/vaep/plotting/__init__.py +++ b/vaep/plotting/__init__.py @@ -11,11 +11,8 @@ import seaborn import vaep.pandas - -from . import data, defaults, errors, plotly -from .errors import plot_rolling_error - -# from . defaults import order_categories, labels_dict, IDX_ORDER +from vaep.plotting import data, defaults, errors, plotly +from vaep.plotting.errors import plot_rolling_error seaborn.set_style("whitegrid") # seaborn.set_theme()