From 737744174117f027f9c083784167ccf8bebe0f90 Mon Sep 17 00:00:00 2001 From: Henry Webel Date: Fri, 31 May 2024 11:37:20 +0200 Subject: [PATCH 1/3] :sparkles: test Snakemake workflow with more recent Python versions (#66) * :sparkles: test Snakemake workflow with more recent Python versions * :sparkles: remove snakemake upper limit * :bug: bump plac version, related to: https://github.com/snakemake/snakemake/issues/2276 * :pushpin: update actions, make artefacts unique - check if local windows error with pandas can be reproduced in action (corr) * :art: dump counts for histograms - add for simulated missing values - remove duplication - :bug: do not omit last bin * :bug: ensure that some values are set to NA - if all values are higher than the default threshold, the assertion on L17 is not met. Make sure some NAs (missing values) are set. * :art: write out corr and prepare for pandas 2.0 - see if this also works with pandas 1.5.3 * :wrench: test relaxing pandas restriction * :bug: drop batches with one sample for training DAE and VAE - for creating the latent representation, a new DataLoader is now needed. * :sparkles: split up large global environment - separate environment for PIMMS models and R-based models - global environment should still work * :bug: test if adding jupyter is sufficient to install further packages in R session - only execute one job at a time in retry to see errors better * :bug: fix sampling to make it compatible with python >=3.11 * :arrow_up: remove pytorch upper dependency * :white_check_mark: Test workflow v2 on Alzheimer dataset - once this passes, add ALD analysis to website (for a reasonable subset of models) - maybe only showcase PIMMS models with a handful of other models * :bug: update path to execute run, speed-up - also remove two slowest models * :art: hide code in rendered notebooks of workflow, sort imports - hide code cells for generated report - isort imports * :sparkles: Functionality to plot source data (ALD study) - add some functionality required to collect source data for reporting on saved figures. * :sparkles: Run differential analysis workflow in CI on Alzheimer data - several adaptations to the slightly different design between ALD and Alzheimer data * :bug: specify folder_experiment from global space - ... and not as wildcard * :art: rename Snakefile_v2 to Snakefile_v2.smk - uncommon names should have a file ending marking them as Snakefiles. * :sparkles: script to build website (execution) - execution should work, but subfolders need their own index.rst - need to adapt the script for updating the main index.rst. See if everything runs for now. * :bug: do not exclude diff analysis folder in conf.py * :art::bug: make a strict hierarchy of headings per document - mapping titles in sphinx (cross-referencing) otherwise does not work * :art: collapse code in published notebooks - for better inspection of the generated report, for example * :art: annotate notebooks - add some comments and streamline cells. 
* :sparkles: Test tutorial on colab * :bug::art: format and briefly check the colab workflow on the dev branch * :art: hide more inputs, downscale tutorial runner * :memo: Update README - :bug: use larger image to test tutorial on colab * :memo: update READMEs and add some hints to config files * :memo::art: save some ad hoc scripts used during revisions, add and clean up nb list * :bug: go back to old config indentation (and model configuration) - rerun in codespace for inspection * :bug: fix issue of having the same model with 2 configurations - had to set model id ("model_key") as index * :memo::sparkles: Allow users to download large HeLa protein groups dataset easily --- .github/workflows/ci.yaml | 9 +- .github/workflows/ci_workflow.yaml | 55 ++ .github/workflows/test_pkg_on_colab.yaml | 26 + .github/workflows/workflow_website.yaml | 27 +- README.md | 121 +++-- environment.yml | 9 +- project/01_0_split_data.ipynb | 478 +++++++++++++---- project/01_0_split_data.py | 173 +++--- .../01_0_transform_data_to_wide_format.ipynb | 56 +- project/01_0_transform_data_to_wide_format.py | 21 +- project/01_1_train_CF.ipynb | 152 ++++-- project/01_1_train_CF.py | 63 ++- project/01_1_train_DAE.ipynb | 239 +++++++-- project/01_1_train_DAE.py | 94 ++-- project/01_1_train_KNN.ipynb | 153 ++++-- project/01_1_train_KNN.py | 50 +- project/01_1_train_KNN_unique_samples.py | 301 +++++++++++ project/01_1_train_Median.ipynb | 195 +++++-- project/01_1_train_Median.py | 73 ++- project/01_1_train_NAGuideR_methods.R | 24 +- project/01_1_train_NAGuideR_methods.ipynb | 55 +- project/01_1_train_RSN.ipynb | 157 +++++- project/01_1_train_RSN.py | 57 +- project/01_1_train_VAE.ipynb | 317 +++++++++--- project/01_1_train_VAE.py | 135 ++--- project/01_1_transfer_NAGuideR_pred.ipynb | 121 ++++- project/01_1_transfer_NAGuideR_pred.py | 46 +- project/01_2_performance_plots.ipynb | 290 ++++++++--- project/01_2_performance_plots.py | 146 +++--- project/01_3_revision3.py | 170 ++++++ project/04_1_train_pimms_models.ipynb | 20 +- project/04_1_train_pimms_models.py | 8 +- project/10_1_ald_diff_analysis.ipynb | 323 ++++++++---- project/10_1_ald_diff_analysis.py | 143 ++--- project/10_2_ald_compare_methods.ipynb | 488 ++++++++++++------ project/10_2_ald_compare_methods.py | 209 ++++---- project/10_3_ald_ml_new_feat.ipynb | 452 ++++++++++------ project/10_3_ald_ml_new_feat.py | 204 +++++--- project/10_4_ald_compare_single_pg.ipynb | 343 +++++++++--- project/10_4_ald_compare_single_pg.py | 137 +++-- project/README.md | 142 ++--- project/bin/create_qsub_commands.py | 78 +-- project/bin/run_snakemake_cluster.sh | 2 +- project/config/alzheimer_study/README.md | 10 + .../config/alzheimer_study/comparison.yaml | 22 + project/config/alzheimer_study/config.yaml | 79 +++ .../knn_comparison/ald_pgs_all/README.md | 2 +- .../knn_comparison/ald_pgs_all/config.yaml | 2 +- .../knn_comparison/hela_pgs_large/README.md | 2 +- .../knn_comparison/hela_pgs_large/config.yaml | 2 +- .../single_dev_dataset/mnar_mcar/evi_l.yaml | 2 +- .../single_dev_dataset/mnar_mcar/evi_m.yaml | 2 +- .../single_dev_dataset/mnar_mcar/pep_l.yaml | 132 ++--- .../single_dev_dataset/mnar_mcar/pep_m.yaml | 136 ++--- .../single_dev_dataset/mnar_mcar/pg_l.yaml | 124 ++--- .../single_dev_dataset/mnar_mcar/pg_m.yaml | 2 +- .../proteinGroups_N50/README.md | 23 + .../proteinGroups_N50/config_v2.yaml | 2 +- project/data/README.md | 70 ++- project/data/download_dev_dataset.py | 42 ++ project/workflow/Snakefile | 12 +- project/workflow/Snakefile_ald_comparison.smk | 39 +- .../{Snakefile_v2 => Snakefile_v2.smk} 
| 7 +- project/workflow/envs/pimms.yaml | 52 ++ project/workflow/envs/trainRmodels.yaml | 36 ++ pyproject.toml | 1 + snakemake_env.yml | 8 + tests/io/test_dataset.py | 3 +- .../setup_diff_analysis_website.py | 110 ++++ vaep/cmd_interface/setup_imp_cp_website.py | 2 +- vaep/io/dataloaders.py | 74 +-- 71 files changed, 5178 insertions(+), 2182 deletions(-) create mode 100644 .github/workflows/ci_workflow.yaml create mode 100644 .github/workflows/test_pkg_on_colab.yaml create mode 100644 project/01_1_train_KNN_unique_samples.py create mode 100644 project/01_3_revision3.py create mode 100644 project/config/alzheimer_study/README.md create mode 100644 project/config/alzheimer_study/comparison.yaml create mode 100644 project/config/alzheimer_study/config.yaml create mode 100644 project/config/single_dev_dataset/proteinGroups_N50/README.md create mode 100644 project/data/download_dev_dataset.py rename project/workflow/{Snakefile_v2 => Snakefile_v2.smk} (98%) create mode 100644 project/workflow/envs/pimms.yaml create mode 100644 project/workflow/envs/trainRmodels.yaml create mode 100644 snakemake_env.yml create mode 100644 vaep/cmd_interface/setup_diff_analysis_website.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 269d6f090..349e8173c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,13 +20,13 @@ jobs: "macos-13", # "windows-latest" # rrcovNA cannot be build from source on windows-server ] - python-version: ["3.8"] + python-version: ["3.8", "3.9", "3.10"] steps: - name: Checkout uses: actions/checkout@v4 - name: Set up Miniconda # ! change action https://github.com/mamba-org/setup-micromamba - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Mambaforge # miniforge-version: latest @@ -82,9 +82,9 @@ jobs: snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml - name: Archive results # https://github.com/actions/upload-artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: example-workflow-results-${{ matrix.os }} + name: ${{ matrix.os }}-${{ matrix.python-version }}-example-workflow-results path: | project/runs/example/ environment.yml @@ -114,7 +114,6 @@ jobs: - name: Run pytest run: pytest . - publish: name: Publish package if: startsWith(github.ref, 'refs/tags') diff --git a/.github/workflows/ci_workflow.yaml b/.github/workflows/ci_workflow.yaml new file mode 100644 index 000000000..87df12633 --- /dev/null +++ b/.github/workflows/ci_workflow.yaml @@ -0,0 +1,55 @@ +name: run workflow with conda envs +on: + push: + branches: [main, dev] + pull_request: + branches: [main, dev] + release: + # schedule: + # - cron: '0 2 * * 3,6' +jobs: + run-integration-tests-with-conda-install: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -el {0} + strategy: + fail-fast: false + matrix: + os: [ + "ubuntu-latest", + "macos-13", + # "windows-latest" # rrcovNA cannot be build from source on windows-server + ] + python-version: ["3.10"] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Miniconda + # ! 
change action https://github.com/mamba-org/setup-micromamba + uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-variant: Mambaforge + use-mamba: true + channel-priority: disabled + python-version: ${{ matrix.python-version }} + environment-file: snakemake_env.yml + activate-environment: snakemake + auto-activate-base: true + - name: inspect-conda-environment + run: | + conda info + conda list + - name: Dry-run workflow + run: | + cd project + snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n --use-conda + - name: Run demo workflow (integration test) + continue-on-error: true + run: | + cd project + snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml --use-conda + - name: Run demo workflow again (in case of installation issues) + run: | + cd project + snakemake -p -c1 -k --configfile config/single_dev_dataset/example/config.yaml --use-conda diff --git a/.github/workflows/test_pkg_on_colab.yaml b/.github/workflows/test_pkg_on_colab.yaml new file mode 100644 index 000000000..546434bb3 --- /dev/null +++ b/.github/workflows/test_pkg_on_colab.yaml @@ -0,0 +1,26 @@ +name: Test that tutorial runs on latest colab image + +on: + push: + branches: [dev] + pull_request: + branches: [main, dev] + schedule: + - cron: '0 2 3 * *' + +jobs: + test-tutorial-on-colab: + name: Test tutorial on latest colab image + runs-on: ubuntu-latest-4core # increase disk space + # https://console.cloud.google.com/artifacts/docker/colab-images/europe/public/runtime + container: + image: europe-docker.pkg.dev/colab-images/public/runtime:latest + steps: + - uses: actions/checkout@v4 + - name: Install pimms-learn and papermill + run: | + python3 -m pip install pimms-learn papermill + - name: Run tutorial + run: | + cd project + papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_output.ipynb diff --git a/.github/workflows/workflow_website.yaml b/.github/workflows/workflow_website.yaml index ab775f193..be97836db 100644 --- a/.github/workflows/workflow_website.yaml +++ b/.github/workflows/workflow_website.yaml @@ -1,4 +1,4 @@ -name: Build workflow website on smaller development dataset (for protein groups) +name: Build workflow website on public Alzheimer dataset (for protein groups) on: pull_request: branches: [main, dev] @@ -29,32 +29,39 @@ jobs: activate-environment: vaep auto-activate-base: true # auto-update-conda: true + - name: Dry-run workflow + run: | + cd project + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c1 -n - name: Run demo workflow (integration test) continue-on-error: true run: | cd project - snakemake -p -c1 -n - snakemake -p -c4 -k + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k - name: Run demo workflow again (in case of installation issues) run: | cd project - snakemake -p -c1 -n - snakemake -p -c4 -k + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k + - name: Run differential analysis workflow + run: | + cd project + snakemake -s workflow/Snakefile_ald_comparison.smk --configfile config/alzheimer_study/comparison.yaml -p -c4 - name: Install website dependencies run: | pip install .[docs] - name: Build imputation comparison website run: | - pimms-setup-imputation-comparison -f project/runs/dev_dataset_small/proteinGroups_N50/ - cd project/runs/dev_dataset_small/proteinGroups_N50/ + pimms-setup-imputation-comparison -f project/runs/alzheimer_study/ + pimms-add-diff-comp -f 
project/runs/alzheimer_study/ -sf_cp project/runs/alzheimer_study/diff_analysis/AD + cd project/runs/alzheimer_study/ sphinx-build -n --keep-going -b html ./ ./_build/ - name: Archive results uses: actions/upload-artifact@v3 with: - name: example-workflow-results-${{ matrix.os }} - path: project/runs/dev_dataset_small/proteinGroups_N50/_build/ + name: alzheimer-study + path: project/runs/alzheimer_study/ - name: Publish workflow as website uses: peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: project/runs/dev_dataset_small/proteinGroups_N50/_build/ \ No newline at end of file + publish_dir: project/runs/alzheimer_study/_build/ \ No newline at end of file diff --git a/README.md b/README.md index 94c8128c7..4af30064e 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,33 @@ # PIMMS -[![Read the Docs](https://img.shields.io/readthedocs/pimms)](https://readthedocs.org/projects/pimms/) [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/RasmussenLab/pimms/ci.yaml)](https://github.com/RasmussenLab/pimms/actions) +[![Read the Docs](https://img.shields.io/readthedocs/pimms)](https://readthedocs.org/projects/pimms/) [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/RasmussenLab/pimms/ci.yaml)](https://github.com/RasmussenLab/pimms/actions) [![Documentation Status](https://readthedocs.org/projects/pimms/badge/?version=latest)](https://pimms.readthedocs.io/en/latest/?badge=latest) PIMMS stands for Proteomics Imputation Modeling Mass Spectrometry and is a homage to our dear British friends who are missing as part of the EU for far too long already -(Pimms is also a British summer drink). +(Pimms is a British summer drink). -The pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792). +The publication has been accepted in Nature Communications +and the pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792). > `PIMMS` was called `vaep` during development. > Before entire refactoring has to been completed the imported package will be -`vaep`. +> Until the entire refactoring has been completed, the imported package will be `vaep`. -We provide functionality as a python package, an excutable workflow and notebooks. +We provide functionality as a python package, an executable workflow or simply in notebooks. -The models can be used with the scikit-learn interface in the spirit of other scikit-learn imputers. You can try this in colab. [![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/HEAD/project/04_1_train_pimms_models.ipynb) +For any questions, please [open an issue](https://github.com/RasmussenLab/pimms/issues) or contact me directly. ## Getting started -## Python package +The models can be used with the scikit-learn interface in the spirit of other scikit-learn imputers. You can try this using our tutorial in Colab: +[![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/HEAD/project/04_1_train_pimms_models.ipynb) +It uses the scikit-learn interface. The PIMMS models in the scikit-learn interface +can be executed on the entire data or by specifying a validation split for checking the training process. +In our experiments overfitting wasn't a big issue, but it's easy to check. 
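As a rough sketch of that fit/transform pattern (shown with scikit-learn's own `KNNImputer` as a stand-in on simulated data; the exact PIMMS class names are covered in the Colab tutorial, so treat the model line as a placeholder):

```python
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# simulate a small table of log-intensities with ~20% missing values
rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(25.0, 2.0, size=(100, 10)))
X = X.mask(rng.random(X.shape) < 0.2)

imputer = KNNImputer(n_neighbors=3)  # a PIMMS model would be dropped in here
X_imputed = pd.DataFrame(imputer.fit_transform(X),
                         index=X.index, columns=X.columns)
assert not X_imputed.isna().any().any()  # all missing entries are now filled
```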
## Install Python package For interactive use of the models provided in PIMMS, you can use our [python package `pimms-learn`](https://pypi.org/project/pimms-learn/). The interface is similar to scikit-learn. ``` pip install pimms-learn ``` -Then you can use the models on a pandas DataFrame with missing values. Try this in the tutorial on Colab: +Then you can use the models on a pandas DataFrame with missing values. You can try this in the tutorial on Colab by uploading your data: [![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/HEAD/project/04_1_train_pimms_models.ipynb) ## Notebooks as scripts using papermill If you want to run a model on your prepared data, you can run notebooks prefixed `01_`, i.e. [`project/01_*.ipynb`](https://github.com/RasmussenLab/pimms/tree/HEAD/project) after cloning the repository. Using jupytext, python percentage script versions are also saved. -``` +```bash +# navigate to your desired folder +git clone https://github.com/RasmussenLab/pimms.git # get all notebooks cd project # project folder as pwd +# pip install pimms-learn papermill # if not already installed papermill 01_0_split_data.ipynb --help-notebook papermill 01_1_train_vae.ipynb --help-notebook ``` +> :warning: Mistyped argument names won't throw an error when using papermill, but a warning is printed on the console thanks to my contributions :) -> Mistyped argument names won't throw an error when using papermill -## PIMMS comparison workflow +## PIMMS comparison workflow and differential analysis workflow The PIMMS comparison workflow is a snakemake workflow that runs all selected PIMMS models and R-models on -a user-provided dataset and compares the results. An example for the smaller HeLa development dataset on the +a user-provided dataset and compares the results. An example for a publicly available Alzheimer dataset on the protein groups level is re-built regularly and available at: [rasmussenlab.org/pimms](https://www.rasmussenlab.org/pimms/) +It is built on top of + - the [Snakefile_v2.smk](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_v2.smk) (v2 of the imputation workflow), specified in one configuration + - the [Snakefile_ald_comparison](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_ald_comparison.smk) workflow for differential analysis + +The associated notebooks are indexed with `01_*` for the comparison workflow and `10_*` for the differential analysis workflow. The `project` folder can be copied separately to any location if the package is installed. It is a standalone folder. 
Its main folders are: + +```bash +# project folder: +project +│   README.md # see description of notebooks and hints on execution in project folder +|---config # configuration files for experiments ("workflows") +|---data # data for experiments +|---runs # results of experiments +|---src # source code or binaries for some R packages +|---tutorials # some tutorials for libraries used in the project +|---workflow # snakemake workflows +``` + +To re-execute the entire workflow locally, have a look at the [configuration files](https://github.com/RasmussenLab/pimms/tree/HEAD/project/config/alzheimer_study) for the published Alzheimer workflow: + +- [`config/alzheimer_study/config.yaml`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/config/alzheimer_study/config.yaml) +- [`config/alzheimer_study/comparison.yaml`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/config/alzheimer_study/comparison.yaml) + +To execute that workflow, follow the Setup instructions below and run the following commands in the project folder: + +```bash +# being in the project folder +snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c1 -n # one core/process, dry-run +snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c2 # two cores/processes, execute +# after the imputation workflow, execute the comparison workflow +snakemake -s workflow/Snakefile_ald_comparison.smk --configfile config/alzheimer_study/comparison.yaml -p -c1 +# If you want to build the website locally: https://www.rasmussenlab.org/pimms/ +pip install .[docs] +pimms-setup-imputation-comparison -f project/runs/alzheimer_study/ +pimms-add-diff-comp -f project/runs/alzheimer_study/ -sf_cp project/runs/alzheimer_study/diff_analysis/AD +cd project/runs/alzheimer_study/ +sphinx-build -n --keep-going -b html ./ ./_build/ +# open ./_build/index.html +``` + +## Setup workflow and development environment + ### Setup comparison workflow The core functionality is available as standalone software on PyPI under the name `pimms-learn`. However, running the entire snakemake workflow is enabled using conda (or mamba) and pip to set up an analysis environment. For a detailed description of setting up conda (or mamba), see [instructions on setting up a virtual environment](https://github.com/RasmussenLab/pimms/blob/HEAD/docs/venv_setup.md). -Download the repository +Download the repository: ``` git clone https://github.com/RasmussenLab/pimms.git @@ -74,14 +127,14 @@ mamba env create -n pimms -f environment.yml # faster, less than 5 mins If on Mac M1, M2 or otherwise having issues using your accelerator (e.g. GPUs): install the pytorch dependencies first, then the rest of the environment: -### Install development dependencies +### Install pytorch first (M-chips) Check how to install pytorch for your system [here](https://pytorch.org/get-started). - select the version compatible with your cuda version if you have an nvidia gpu or a Mac M-chip. ```bash -conda create -n vaep python=3.8 pip +conda create -n vaep python=3.9 pip conda activate vaep # Follow instructions on https://pytorch.org/get-started # conda env update -f environment.yml -n vaep # should not install the rest. 
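# The exact pytorch install command depends on your OS and accelerator, so take
# it from the selector on https://pytorch.org/get-started; for illustration only:
# pip3 install torch                 # e.g. CPU-only or Apple-silicon wheels
# conda install pytorch -c pytorch   # e.g. conda-based install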
@@ -95,29 +148,17 @@ papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_test.ipynb # sec python 04_1_train_pimms_models.py # just execute the code -### Entire development installation - - -```bash -conda create -n pimms_dev -c pytorch -c nvidia -c fastai -c bioconda -c plotly -c conda-forge --file requirements.txt --file requirements_R.txt --file requirements_dev.txt -pip install -e . # other pip dependencies missing -snakemake --configfile config/single_dev_dataset/example/config.yaml -F -n -``` - -or if you want to update an existing environment +### Let Snakemake handle installation +If you only want to execute the workflow, you can use snakemake to build the environments for you: -``` -conda update -c defaults -c conda-forge -c fastai -c bioconda -c plotly --file requirements.txt --file requirements_R.txt --file requirements_dev.txt -``` +> The Snakefile workflow for imputation v1 only supports that at the moment. -or using the environment.yml file (can fail on certain systems) - -``` -conda env create -f environment.yml +```bash +snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml --use-conda -n # dry-run +snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml --use-conda # execute with one core ``` - ### Troubleshooting Troubleshoot your R installation by opening jupyter lab @@ -127,16 +168,16 @@ jupyter lab # open 01_1_train_NAGuideR.ipynb ``` -## Run an analysis +## Run example Change to the [`project` folder](./project) and see its [README](project/README.md) -You can subselect models by editing the config file: [`config.yaml`](project/config/single_dev_dataset/proteinGroups_N50/config.yaml) file. +You can subselect models by editing the config file: [`config.yaml`](https://github.com/RasmussenLab/pimms/tree/HEAD/project/config/single_dev_dataset/proteinGroups_N50). ``` conda activate pimms # activate virtual environment cd project # go to project folder pwd # so be in ./pimms/project -snakemake -c1 -p -n # dryrun demo workflow +snakemake -c1 -p -n # dry-run demo workflow, potentially add --use-conda snakemake -c1 -p ``` @@ -228,7 +269,3 @@ From the brief description in the table the exact procedure is not always clear. 
| MSIMPUTE_MNAR | msImpute | BIOCONDUCTOR | | Missing not at random algorithm using low rank approximation | ~~grr~~ | DreamAI | - | Fails to install | Rigde regression | ~~GMS~~ | GMSimpute | tar file | Fails on Windows | Lasso model - - -## Build status -[![Documentation Status](https://readthedocs.org/projects/pimms/badge/?version=latest)](https://pimms.readthedocs.io/en/latest/?badge=latest) \ No newline at end of file diff --git a/environment.yml b/environment.yml index a335db7ba..8415e1b58 100644 --- a/environment.yml +++ b/environment.yml @@ -9,9 +9,9 @@ channels: - plotly # - defaults dependencies: - - python=3.8 + - python>=3.8,<=3.12 - numpy - - pandas=1 + - pandas>=1 - scipy>=1.6 # plotting - matplotlib @@ -20,7 +20,7 @@ dependencies: - seaborn<0.13 - pip # ML - - pytorch=1 #=1.13.1=py3.8_cuda11.7_cudnn8_0 + - pytorch #=1.13.1=py3.8_cuda11.7_cudnn8_0 # - pytorch-cuda - scikit-learn - fastai @@ -36,8 +36,9 @@ dependencies: - xmltodict # configs - openpyxl # xml - omegaconf + - plac>=1.0 # snakemake - - snakemake-minimal<7.26 + - snakemake-minimal #<7.26 # jupyter - ipykernel - ipython diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index 78daf9994..85be6eaed 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -12,7 +12,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", @@ -69,7 +73,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -115,7 +123,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -125,7 +137,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.args_from_dict(args)\n", @@ -135,7 +151,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if not 0.0 <= args.frac_mnar <= 1.0:\n", @@ -164,7 +184,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "logger.info(f\"{args.FN_INTENSITIES = }\")\n", @@ -183,7 +207,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! 
factor out file reading to a separate module, not class\n", @@ -214,7 +242,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -231,7 +262,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.out_folder / '01_0_data_stats.xlsx'\n", @@ -262,7 +297,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "def join_as_str(seq):\n", @@ -291,7 +330,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -315,7 +358,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -331,7 +377,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_meta.describe(percentiles=np.linspace(0.05, 0.95, 10))" @@ -340,7 +390,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_meta = df_meta.sort_values(args.meta_date_col)" @@ -349,7 +403,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "meta_stats = df_meta.describe(include='all')\n", @@ -367,7 +425,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -384,7 +445,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_meta = align_meta_data(df, df_meta=df_meta)" @@ -400,7 +465,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "assert df.index.is_unique, \"Duplicates in index.\"" @@ -420,7 +489,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.select_N is not None:\n", @@ -449,7 +522,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "! 
add function\n", @@ -476,7 +553,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "notna = df.notna()\n", @@ -507,7 +588,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if isinstance(args.sample_completeness, float):\n", @@ -525,7 +610,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = sample_counts > args.sample_completeness\n", @@ -539,7 +628,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.N, args.M = df.shape # save data dimensions\n", @@ -556,7 +649,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "group = 1\n", @@ -572,7 +669,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -597,7 +697,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "min_max = vaep.plotting.data.min_max(df.stack())\n", @@ -613,7 +717,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", @@ -629,7 +737,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax, _data_feat_median_over_prop_missing = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", @@ -655,7 +767,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "_feature_display_name = f'identified {args.feat_name_display}'\n", @@ -665,7 +781,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "K = 2\n", @@ -682,7 +802,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "pcs.describe(include='all').T" @@ -691,7 +815,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.meta_cat_col:\n", @@ -707,7 +835,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.meta_date_col != 'PlaceholderTime':\n", @@ -729,7 +861,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots()\n", @@ -750,7 +886,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! 
write principal components to excel (if needed)\n", @@ -760,7 +900,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig = px.scatter(\n", @@ -791,7 +935,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df.head()" @@ -800,7 +948,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_w_date = df.join(df_meta[args.meta_date_col])\n", @@ -814,7 +966,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = df_w_date.plot.box(rot=80,\n", @@ -844,7 +1000,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2))" @@ -862,7 +1022,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if not args.meta_date_col == 'PlaceholderTime':\n", @@ -904,7 +1068,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -922,7 +1089,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_per_feature = feature_frequency(df)\n", @@ -933,7 +1104,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -968,7 +1142,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -988,7 +1165,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_long = vaep.io.datasplits.long_format(df)\n", @@ -998,7 +1179,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "group = 2\n", @@ -1016,7 +1201,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1059,6 +1246,30 @@ "vaep.savefig(fig, fname)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "34ee6256", + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "counts_per_bin = vaep.pandas.get_counts_per_bin(\n", + " df=pd.concat(\n", + " [df_long.squeeze().to_frame('observed'),\n", + " thresholds.to_frame('threshold'),\n", + " fake_na_mnar.squeeze().to_frame(f'MNAR ({N_MNAR:,d})'),\n", + " fake_na_mcar.squeeze().to_frame(f'MCAR ({N_MCAR:,d})')],\n", + " axis=1),\n", + " bins=range(min_max[0], min_max[1] + 1, 1))\n", + "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", + "counts_per_bin" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1074,7 +1285,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, 
"outputs": [], "source": [ "if 0.0 < args.prop_sample_w_sim < 1.0:\n", @@ -1104,7 +1319,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits.test_y.groupby(level=-1).count().describe()" @@ -1113,7 +1332,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits.val_y" @@ -1122,7 +1345,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits.train_X.groupby(level=-1).count().describe()" @@ -1131,7 +1358,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# Check that feature indices and sample indicies overlap between splits\n", @@ -1160,7 +1391,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "diff = (splits\n", @@ -1195,7 +1430,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4\n", @@ -1228,7 +1467,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# dumps data in long-format\n", @@ -1247,7 +1490,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits = DataSplits.from_folder(args.data, file_format=args.file_format)" @@ -1263,7 +1510,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits_df = pd.DataFrame(index=df_long.index)\n", @@ -1278,7 +1529,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# whitespaces in legends are not displayed correctly...\n", @@ -1296,7 +1551,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "group = 3\n", @@ -1329,11 +1588,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "min_bin, max_bin = vaep.plotting.data.min_max(splits.val_y)\n", - "bins = range(int(min_bin), int(max_bin), 1)\n", + "bins = range(int(min_bin), int(max_bin) + 1, 1)\n", "ax = splits_df.plot.hist(bins=bins,\n", " xticks=list(bins),\n", " legend=False,\n", @@ -1353,7 +1616,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins)\n", @@ -1364,7 +1631,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = splits_df.drop('train', axis=1).plot.hist(bins=bins,\n", @@ -1383,34 +1655,6 @@ "vaep.savefig(ax.get_figure(), fname)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 
Save binned counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "counts_per_bin = dict()\n", - "for col in splits_df.columns:\n", - " _series = (pd.cut(splits_df[col], bins=bins)\n", - " .to_frame()\n", - " .groupby(col)\n", - " .size())\n", - " _series.index.name = 'bin'\n", - " counts_per_bin[col] = _series\n", - "counts_per_bin = pd.DataFrame(counts_per_bin)\n", - "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", - "counts_per_bin" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1421,7 +1665,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits.to_wide_format()" @@ -1430,7 +1678,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", @@ -1443,7 +1695,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", @@ -1456,7 +1712,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "medians = (splits\n", @@ -1500,7 +1760,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.folder_experiment / 'data_config.yaml'\n", @@ -1518,7 +1782,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# saved figures\n", @@ -1535,7 +1803,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "writer.close()\n", @@ -1545,7 +1817,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] } diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 9b4061096..9be123d10 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -18,7 +18,7 @@ # # Create data splits -# %% +# %% tags=["hide-input"] import logging from functools import partial from pathlib import Path @@ -65,7 +65,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): # %% [markdown] # ## Arguments -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -95,15 +95,15 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): feat_name_display: str = None # display name for feature name (e.g. 'protein group') -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) args -# %% +# %% tags=["hide-input"] if not 0.0 <= args.frac_mnar <= 1.0: raise ValueError("Invalid MNAR float value (should be betw. 
0 and 1):" f" {args.frac_mnar}") @@ -118,7 +118,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): # %% [markdown] # process arguments -# %% +# %% tags=["hide-input"] logger.info(f"{args.FN_INTENSITIES = }") @@ -131,7 +131,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): logger.info( f"File format (extension): {FILE_EXT} (!specifies data loading function!)") -# %% +# %% tags=["hide-input"] # # ! factor out file reading to a separate module, not class # AnalyzePeptides.from_csv constructor = getattr(vaep.io.load, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT]) @@ -155,7 +155,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): df = log_fct(df) # ! potentially add check to increase value by 1 if 0 is present (should be part of preprocessing) df -# %% +# %% tags=["hide-input"] ax = (df .notna() .sum(axis=0) @@ -166,7 +166,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): ax.set_ylabel('Frequency') -# %% +# %% tags=["hide-input"] fname = args.out_folder / '01_0_data_stats.xlsx' dumps[fname.name] = fname.as_posix() writer = pd.ExcelWriter(fname) @@ -188,7 +188,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): # # > The Collaborative Modeling approach will need a single feature column. -# %% +# %% tags=["hide-input"] def join_as_str(seq): ret = "_".join(str(x) for x in seq) return ret @@ -207,7 +207,7 @@ def join_as_str(seq): # # - read from file using [ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser) -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) else: @@ -224,7 +224,7 @@ def join_as_str(seq): df_meta.index.name = args.index_col[0] df_meta -# %% +# %% tags=["hide-input"] if args.meta_date_col: df_meta[args.meta_date_col] = pd.to_datetime( df_meta[args.meta_date_col]) @@ -234,20 +234,20 @@ def join_as_str(seq): df_meta -# %% +# %% tags=["hide-input"] df_meta.describe(percentiles=np.linspace(0.05, 0.95, 10)) -# %% +# %% tags=["hide-input"] df_meta = df_meta.sort_values(args.meta_date_col) -# %% +# %% tags=["hide-input"] meta_stats = df_meta.describe(include='all') meta_stats # %% [markdown] # subset with variation -# %% +# %% tags=["hide-input"] try: display(meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)]) @@ -258,13 +258,13 @@ def join_as_str(seq): display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)]) -# %% +# %% tags=["hide-input"] df_meta = align_meta_data(df, df_meta=df_meta) # %% [markdown] # Ensure unique indices -# %% +# %% tags=["hide-input"] assert df.index.is_unique, "Duplicates in index." # %% [markdown] @@ -274,7 +274,7 @@ def join_as_str(seq): # - for interpolation to make sense, it is best to select a consecutive number of samples: # - take N most recent samples (-> check that this makes sense for your case) -# %% +# %% tags=["hide-input"] if args.select_N is not None: args.select_N = min(args.select_N, len(df_meta)) if args.sample_N: @@ -292,7 +292,7 @@ def join_as_str(seq): # - `feat_prevalence` across samples -# %% +# %% tags=["hide-input"] # ! 
add function freq_per_feature = df.notna().sum() # on wide format if isinstance(args.feat_prevalence, float): @@ -313,7 +313,7 @@ def join_as_str(seq): df = df.loc[:, mask] df -# %% +# %% tags=["hide-input"] notna = df.notna() data_stats_filtered = pd.concat( [ @@ -330,7 +330,7 @@ def join_as_str(seq): # %% [markdown] # Select samples based on completeness -# %% +# %% tags=["hide-input"] if isinstance(args.sample_completeness, float): msg = f'Fraction of minimum sample completeness over all features specified with: {args.sample_completeness}\n' # assumes df in wide format @@ -342,7 +342,7 @@ def join_as_str(seq): sample_counts = df.notna().sum(axis=1) # if DataFrame sample_counts.describe() -# %% +# %% tags=["hide-input"] mask = sample_counts > args.sample_completeness msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.' logger.info(msg) @@ -350,14 +350,14 @@ def join_as_str(seq): df = df.dropna( axis=1, how='all') # drop now missing features -# %% +# %% tags=["hide-input"] args.N, args.M = df.shape # save data dimensions args.used_samples = df.index.to_list() # %% [markdown] # ### Histogram of features per sample -# %% +# %% tags=["hide-input"] group = 1 ax = df.notna().sum(axis=1).hist() ax.set_xlabel(f'{args.feat_name_display.capitalize()} per eligable sample') @@ -366,7 +366,7 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] ax = df.notna().sum(axis=0).sort_values().plot() _new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()] _ = ax.set_xticklabels(_new_labels, rotation=45, @@ -381,7 +381,7 @@ def join_as_str(seq): # %% [markdown] # ### Number off observations accross feature value -# %% +# %% tags=["hide-input"] min_max = vaep.plotting.data.min_max(df.stack()) ax, bins = vaep.plotting.data.plot_histogram_intensities( df.stack(), min_max=min_max) @@ -391,7 +391,7 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] ax = vaep.plotting.data.plot_feat_median_over_prop_missing( data=df, type='scatter') fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter' @@ -401,7 +401,7 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] ax, _data_feat_median_over_prop_missing = vaep.plotting.data.plot_feat_median_over_prop_missing( data=df, type='boxplot', return_plot_data=True) fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot' @@ -417,11 +417,11 @@ def join_as_str(seq): # %% [markdown] # ### Interactive and Single plots -# %% +# %% tags=["hide-input"] _feature_display_name = f'identified {args.feat_name_display}' sample_counts.name = _feature_display_name -# %% +# %% tags=["hide-input"] K = 2 df = df.astype(float) pcs = get_PCA(df, n_components=K) # should be renamed to get_PCs @@ -432,10 +432,10 @@ def join_as_str(seq): pcs = pcs.reset_index() pcs -# %% +# %% tags=["hide-input"] pcs.describe(include='all').T -# %% +# %% tags=["hide-input"] if args.meta_cat_col: fig, ax = plt.subplots(figsize=(3, 3)) analyzers.seaborn_scatter( @@ -445,7 +445,7 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(fig, fname) -# %% +# %% tags=["hide-input"] if args.meta_date_col != 'PlaceholderTime': fig, ax = plt.subplots() analyzers.plot_date_map( @@ -457,7 +457,7 @@ def join_as_str(seq): # %% [markdown] # - size: number of features in a single sample -# %% +# %% tags=["hide-input"] fig, ax = 
plt.subplots() col_identified_feat = _feature_display_name analyzers.plot_scatter( @@ -472,11 +472,11 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(fig, fname) -# %% +# %% tags=["hide-input"] # # ! write principal components to excel (if needed) # pcs.set_index([df.index.name])[[*pcs_name, col_identified_feat]].to_excel(fname.with_suffix('.xlsx')) -# %% +# %% tags=["hide-input"] fig = px.scatter( pcs, x=pcs_name[0], y=pcs_name[1], hover_name=pcs_index_name, @@ -497,10 +497,10 @@ def join_as_str(seq): # %% [markdown] # ## Sample Medians and percentiles -# %% +# %% tags=["hide-input"] df.head() -# %% +# %% tags=["hide-input"] df_w_date = df.join(df_meta[args.meta_date_col]) df_w_date = df_w_date.set_index(args.meta_date_col).sort_index() if not args.meta_date_col == 'PlaceholderTime': @@ -508,7 +508,7 @@ def join_as_str(seq): df_w_date = df_w_date.T df_w_date -# %% +# %% tags=["hide-input"] ax = df_w_date.plot.box(rot=80, figsize=(7, 3), fontsize=7, @@ -528,7 +528,7 @@ def join_as_str(seq): # %% [markdown] # Percentiles of intensities in dataset -# %% +# %% tags=["hide-input"] df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2)) # %% [markdown] @@ -536,7 +536,7 @@ def join_as_str(seq): # - check if points are equally spaced (probably QC samples are run in close proximity) # - the machine will be not use for intermediate periods -# %% +# %% tags=["hide-input"] if not args.meta_date_col == 'PlaceholderTime': dates = df_meta[args.meta_date_col].sort_values() median_sample_intensity = (df @@ -563,7 +563,7 @@ def join_as_str(seq): # %% [markdown] # ## Feature frequency in data -# %% +# %% tags=["hide-input"] msg = "Total number of samples in data: {}" logger.info(msg.format(len(df))) @@ -571,11 +571,11 @@ def join_as_str(seq): # %% [markdown] # Recalculate feature frequency after selecting samples -# %% +# %% tags=["hide-input"] freq_per_feature = feature_frequency(df) freq_per_feature -# %% +# %% tags=["hide-input"] # freq_per_feature.name = 'Gene names freq' # name it differently? # index.name is lost when data is stored fname = args.data / 'freq_features.json' @@ -599,7 +599,7 @@ def join_as_str(seq): # for validation and test data split, e.g. 0.1 = quantile(0.1) # - select frac_mnar from intensities selected using threshold matrix -# %% +# %% tags=["hide-input"] splits = DataSplits(is_wide_format=False) logger.info(f"{splits = }") splits.__annotations__ @@ -609,11 +609,11 @@ def join_as_str(seq): # Create some target values by sampling X% of the validation and test data. # Simulated missing values are not used for validation and testing. 
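# (Assumed semantics of `vaep.sampling.sample_mnar_mcar` used below, noted for
#  clarity: `frac_mnar` blends two mechanisms for simulating missing values.
#  MNAR, missing not at random, preferentially removes intensities that fall
#  below a per-feature threshold; MCAR, missing completely at random, removes
#  values uniformly at random. The removed values become the validation and
#  test targets.)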
-# %% +# %% tags=["hide-input"] df_long = vaep.io.datasplits.long_format(df) df_long.head() -# %% +# %% tags=["hide-input"] group = 2 splits, thresholds, fake_na_mcar, fake_na_mnar = vaep.sampling.sample_mnar_mcar( @@ -624,7 +624,7 @@ def join_as_str(seq): ) logger.info(f"{splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }") -# %% +# %% tags=["hide-input"] N = len(df_long) N_MCAR = len(fake_na_mcar) N_MNAR = len(fake_na_mnar) @@ -663,6 +663,18 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(fig, fname) +# %% tags=["hide-input"] +counts_per_bin = vaep.pandas.get_counts_per_bin( + df=pd.concat( + [df_long.squeeze().to_frame('observed'), + thresholds.to_frame('threshold'), + fake_na_mnar.squeeze().to_frame(f'MNAR ({N_MNAR:,d})'), + fake_na_mcar.squeeze().to_frame(f'MCAR ({N_MCAR:,d})')], + axis=1), + bins=range(min_max[0], min_max[1] + 1, 1)) +counts_per_bin.to_excel(fname.with_suffix('.xlsx')) +counts_per_bin + # %% [markdown] # ### Keep simulated samples only in a subset of the samples @@ -672,7 +684,7 @@ def join_as_str(seq): # # The procedure is experimental and turned off by default. -# %% +# %% tags=["hide-input"] if 0.0 < args.prop_sample_w_sim < 1.0: to_stratify = None if args.meta_cat_col and df_meta is not None: @@ -696,16 +708,16 @@ def join_as_str(seq): splits.test_y = splits.test_y.loc[test_idx] logger.info(f"New shapes: {splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }") -# %% +# %% tags=["hide-input"] splits.test_y.groupby(level=-1).count().describe() -# %% +# %% tags=["hide-input"] splits.val_y -# %% +# %% tags=["hide-input"] splits.train_X.groupby(level=-1).count().describe() -# %% +# %% tags=["hide-input"] # Check that feature indices and sample indicies overlap between splits # -> a single feature cannot be only in the validation or test split # -> single features should be put into the training data @@ -728,7 +740,7 @@ def join_as_str(seq): splits.val_y = splits.val_y.drop(to_remove.index) diff -# %% +# %% tags=["hide-input"] diff = (splits .test_y .index @@ -753,7 +765,7 @@ def join_as_str(seq): # In that case: Move the validation measurments back to the training data. # If after this procedure the condition is still not met, a value error is raised. -# %% +# %% tags=["hide-input"] mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4 if mask_min_4_measurments.any(): idx = mask_min_4_measurments.loc[mask_min_4_measurments].index @@ -776,7 +788,7 @@ def join_as_str(seq): # - Data in long format: (peptide, sample_id, intensity) # - no missing values kept -# %% +# %% tags=["hide-input"] # dumps data in long-format splits_dumped = splits.dump(folder=args.data, file_format=args.file_format) dumps.update(splits_dumped) @@ -785,13 +797,13 @@ def join_as_str(seq): # %% [markdown] # ### Reload from disk -# %% +# %% tags=["hide-input"] splits = DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # ## plot distribution of splits -# %% +# %% tags=["hide-input"] splits_df = pd.DataFrame(index=df_long.index) splits_df['train'] = splits.train_X splits_df['val'] = splits.val_y @@ -800,7 +812,7 @@ def join_as_str(seq): stats_splits.to_excel(writer, 'stats_splits', float_format='%.3f') stats_splits -# %% +# %% tags=["hide-input"] # whitespaces in legends are not displayed correctly... 
# max_int_len = len(str(int(stats_splits.loc['count'].max()))) +1 # _legend = [ @@ -812,7 +824,7 @@ def join_as_str(seq): for s in ('train', 'val', 'test')] print(_legend) -# %% +# %% tags=["hide-input"] group = 3 ax = (splits .train_X @@ -839,9 +851,9 @@ def join_as_str(seq): figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] min_bin, max_bin = vaep.plotting.data.min_max(splits.val_y) -bins = range(int(min_bin), int(max_bin), 1) +bins = range(int(min_bin), int(max_bin) + 1, 1) ax = splits_df.plot.hist(bins=bins, xticks=list(bins), legend=False, @@ -857,12 +869,12 @@ def join_as_str(seq): figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins) counts_per_bin.to_excel(fname.with_suffix('.xlsx')) counts_per_bin -# %% +# %% tags=["hide-input"] ax = splits_df.drop('train', axis=1).plot.hist(bins=bins, xticks=list(bins), color=['C1', 'C2'], @@ -878,43 +890,28 @@ def join_as_str(seq): figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) -# %% -# Save binned counts - -# %% -counts_per_bin = dict() -for col in splits_df.columns: - _series = (pd.cut(splits_df[col], bins=bins) - .to_frame() - .groupby(col) - .size()) - _series.index.name = 'bin' - counts_per_bin[col] = _series -counts_per_bin = pd.DataFrame(counts_per_bin) -counts_per_bin.to_excel(fname.with_suffix('.xlsx')) -counts_per_bin # %% [markdown] # plot training data missing plots -# %% +# %% tags=["hide-input"] splits.to_wide_format() -# %% +# %% tags=["hide-input"] ax = vaep.plotting.data.plot_feat_median_over_prop_missing( data=splits.train_X, type='scatter') fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter_train' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] ax = vaep.plotting.data.plot_feat_median_over_prop_missing( data=splits.train_X, type='boxplot') fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_train' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] medians = (splits .train_X .median() @@ -948,7 +945,7 @@ def join_as_str(seq): # %% [markdown] # ## Save parameters -# %% +# %% tags=["hide-input"] fname = args.folder_experiment / 'data_config.yaml' args.dump(fname) args @@ -956,15 +953,15 @@ def join_as_str(seq): # %% [markdown] # ## Saved Figures -# %% +# %% tags=["hide-input"] # saved figures figures # %% [markdown] # Saved dumps -# %% +# %% tags=["hide-input"] writer.close() dumps -# %% +# %% tags=["hide-input"] diff --git a/project/01_0_transform_data_to_wide_format.ipynb b/project/01_0_transform_data_to_wide_format.ipynb index df4d3fd13..bcd8a0bf1 100644 --- a/project/01_0_transform_data_to_wide_format.ipynb +++ b/project/01_0_transform_data_to_wide_format.ipynb @@ -13,7 +13,9 @@ "execution_count": null, "id": "9aacaba7", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -28,7 +30,11 @@ "cell_type": "code", "execution_count": null, "id": "d01a155d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -67,7 +73,11 @@ "cell_type": "code", "execution_count": null, "id": "43ff9ae3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -78,7 +88,11 @@ "cell_type": "code", 
"execution_count": null, "id": "11e46901", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "params = vaep.nb.args_from_dict(args)\n", @@ -90,7 +104,11 @@ "cell_type": "code", "execution_count": null, "id": "1194de4e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits = datasplits.DataSplits.from_folder(params.data, file_format=params.file_format_in)" @@ -101,7 +119,10 @@ "execution_count": null, "id": "197708a1", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -123,7 +144,11 @@ "cell_type": "code", "execution_count": null, "id": "feeae52b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "annotation = pd.Series('test', train_data.index).to_frame('group')\n", @@ -135,7 +160,11 @@ "cell_type": "code", "execution_count": null, "id": "57546236", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = params.data / 'sample_annotation_placeholder.csv'\n", @@ -160,7 +189,10 @@ "execution_count": null, "id": "ce749fdb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -174,7 +206,11 @@ "cell_type": "code", "execution_count": null, "id": "8ce12421", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# 'data_wide_sample_cols.csv'" diff --git a/project/01_0_transform_data_to_wide_format.py b/project/01_0_transform_data_to_wide_format.py index e123defa5..5e1cee1cd 100644 --- a/project/01_0_transform_data_to_wide_format.py +++ b/project/01_0_transform_data_to_wide_format.py @@ -17,15 +17,14 @@ # # Transfer data for NAGuideR format # -# %% +# %% tags=["hide-input"] import pandas as pd import vaep import vaep.models from vaep.io import datasplits - -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -41,19 +40,19 @@ file_format_in: str = 'csv' # file format of original splits, default pickle (pkl) file_format_out: str = 'csv' # file format of transformed splits, default csv -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] params = vaep.nb.args_from_dict(args) # params = OmegaConf.create(args) params -# %% +# %% tags=["hide-input"] splits = datasplits.DataSplits.from_folder(params.data, file_format=params.file_format_in) -# %% +# %% tags=["hide-input"] train_data = splits.train_X.unstack() train_data @@ -62,12 +61,12 @@ # Save placeholder sample annotation for use in NAGuideR app which requires such a file -# %% +# %% tags=["hide-input"] annotation = pd.Series('test', train_data.index).to_frame('group') annotation.index.name = 'Samples' annotation -# %% +# %% tags=["hide-input"] fname = params.data / 'sample_annotation_placeholder.csv' annotation.to_csv(fname) fname @@ -75,12 +74,12 @@ # %% [markdo] # Save with samples in columns -# %% +# %% tags=["hide-input"] fname = params.data / 'data_wide_sample_cols.csv' # fillna('Filtered') train_data.T.to_csv(fname) fname -# %% +# %% tags=["hide-input"] # 'data_wide_sample_cols.csv' diff --git a/project/01_1_train_CF.ipynb b/project/01_1_train_CF.ipynb index 6ce02e159..b508cd19d 100644 --- a/project/01_1_train_CF.ipynb +++ b/project/01_1_train_CF.ipynb @@ -12,38 +12,32 @@ "cell_type": "code", "execution_count": null, "id": 
"18b5d571-2956-4112-b22c-43d6c2146b06", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", - "\n", "from pprint import pprint\n", "\n", "import matplotlib.pyplot as plt\n", - "\n", - "# from fastai.basics import *\n", - "# from fastai.callback.all import *\n", - "# from fastai.torch_basics import *\n", - "# from fastai.data.all import *\n", - "\n", - "from fastai.tabular.all import *\n", + "# overwriting Recorder callback with custom plot_loss\n", + "from fastai import learner\n", "from fastai.collab import *\n", - "\n", - "from fastai.collab import (EmbeddingDotBias, Learner, MSELossFlat, EarlyStoppingCallback, default_device)\n", + "from fastai.collab import (EarlyStoppingCallback, EmbeddingDotBias, Learner,\n", + " MSELossFlat, default_device)\n", + "from fastai.tabular.all import *\n", "\n", "import vaep\n", "import vaep.model\n", "import vaep.models as models\n", - "from vaep.models import plot_loss, RecorderDump\n", - "\n", "import vaep.nb\n", - "from vaep import sampling\n", "from vaep.io import datasplits\n", - "\n", "from vaep.logging import setup_logger\n", + "from vaep.models import RecorderDump, plot_loss\n", "\n", - "# overwriting Recorder callback with custom plot_loss\n", - "from fastai import learner\n", "learner.Recorder.plot_loss = plot_loss\n", "# import fastai.callback.hook # Learner.summary\n", "\n", @@ -67,7 +61,11 @@ "cell_type": "code", "execution_count": null, "id": "85c7d6f9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -119,7 +117,11 @@ "cell_type": "code", "execution_count": null, "id": "0746e70f-0259-48d5-90ef-25fe4b59f9ac", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -131,7 +133,10 @@ "execution_count": null, "id": "100bbf80", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -158,7 +163,10 @@ "execution_count": null, "id": "a19fe098-a029-4f71-b7fb-e652a9c16ac7", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -180,7 +188,11 @@ "cell_type": "code", "execution_count": null, "id": "6d9cc7bd-6b6f-40b9-8db7-c8228e4b03e3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -199,7 +211,11 @@ "cell_type": "code", "execution_count": null, "id": "02bb6bf5-0eb1-4c73-9723-414b14eaf7c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X" @@ -209,7 +225,11 @@ "cell_type": "code", "execution_count": null, "id": "f3311709", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! 
add check that specified data is available\n", @@ -229,7 +249,10 @@ "execution_count": null, "id": "44958473", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -269,7 +292,11 @@ "cell_type": "code", "execution_count": null, "id": "b5b945aa-9b4e-4487-8b09-dca289e64d9d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_simulated_na = data.val_y.to_frame(name='observed')\n", @@ -281,7 +308,10 @@ "execution_count": null, "id": "98558b10", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -304,7 +334,11 @@ "cell_type": "code", "execution_count": null, "id": "3ee54305-266a-479a-b677-f151ddde250a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# larger mini-batches speed up training\n", @@ -325,7 +359,10 @@ "execution_count": null, "id": "12ffa243-151e-4220-a1d5-247f8aba3429", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -337,7 +374,11 @@ "cell_type": "code", "execution_count": null, "id": "4a02e061-6789-4f3d-8031-a40879c496c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ana_collab.model = EmbeddingDotBias.from_classes(\n", @@ -371,7 +412,11 @@ "cell_type": "code", "execution_count": null, "id": "8317c9e1-d128-4ab4-8d60-775cb85ef535", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=train_collab\n", @@ -419,7 +464,10 @@ "execution_count": null, "id": "bb76e6c5-e135-41c4-95e8-a56c3764c731", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -443,7 +491,11 @@ "cell_type": "code", "execution_count": null, "id": "c7f0c597-d3c7-42d0-a6ef-3bc4c13121b8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ana_collab.test_dl = ana_collab.dls.test_dl(data.test_y.reset_index())\n", @@ -456,7 +508,10 @@ "execution_count": null, "id": "1cd76df6", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -484,7 +539,10 @@ "execution_count": null, "id": "cff8caf4-ccc9-4a36-a992-2cc596abe51a", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -510,7 +568,11 @@ "cell_type": "code", "execution_count": null, "id": "d825e38e-f3d6-4bca-b621-150267e7b7bc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -613,7 +675,11 @@ "cell_type": "code", "execution_count": null, "id": "782636ac-c979-4f8b-9fc0-66fd0c7a3a8b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# save simulated missing values for both splits\n", @@ -635,7 +701,11 @@ "cell_type": "code", "execution_count": null, "id": "0f13cb38-abf0-4b56-9399-3d11d32f7fbc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", @@ -646,7 +716,11 @@ "cell_type": "code", "execution_count": null, "id": "408b261a", - "metadata": {}, + "metadata": { + "tags": [ + 
"hide-input" + ] + }, "outputs": [], "source": [] } diff --git a/project/01_1_train_CF.py b/project/01_1_train_CF.py index 9fca75638..68acf2afa 100644 --- a/project/01_1_train_CF.py +++ b/project/01_1_train_CF.py @@ -16,31 +16,26 @@ # %% [markdown] # # Collaborative Filtering -# %% +# %% tags=["hide-input"] import logging - from pprint import pprint import matplotlib.pyplot as plt - -from fastai.tabular.all import * +# overwriting Recorder callback with custom plot_loss +from fastai import learner from fastai.collab import * - -from fastai.collab import (EmbeddingDotBias, Learner, MSELossFlat, EarlyStoppingCallback, default_device) +from fastai.collab import (EarlyStoppingCallback, EmbeddingDotBias, Learner, + MSELossFlat, default_device) +from fastai.tabular.all import * import vaep import vaep.model import vaep.models as models -from vaep.models import plot_loss, RecorderDump - import vaep.nb -from vaep import sampling from vaep.io import datasplits - from vaep.logging import setup_logger +from vaep.models import RecorderDump, plot_loss -# overwriting Recorder callback with custom plot_loss -from fastai import learner learner.Recorder.plot_loss = plot_loss # import fastai.callback.hook # Learner.summary @@ -54,7 +49,7 @@ # %% [markdown] # Papermill script parameters: -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -83,11 +78,11 @@ # %% [markdown] # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) # # Currently not needed -> DotProduct used, not a FNN @@ -101,7 +96,7 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' if not args.cuda: @@ -111,24 +106,24 @@ # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X -# %% +# %% tags=["hide-input"] # # ! add check that specified data is available # silent error in fastai if e.g. target column is not available # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -151,11 +146,11 @@ # %% [markdown] # The validation simulated NA is used to by all models to evaluate training performance. -# %% +# %% tags=["hide-input"] val_pred_simulated_na = data.val_y.to_frame(name='observed') val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na = data.test_y.to_frame(name='observed') test_pred_simulated_na.describe() @@ -166,7 +161,7 @@ # - save custom collab batch size (increase AE batch size by a factor), could be setup separately. 
# - the test data is used to evaluate the performance after training -# %% +# %% tags=["hide-input"] # larger mini-batches speed up training ana_collab = models.collab.CollabAnalysis( datasplits=data, @@ -179,12 +174,12 @@ ), batch_size=args.batch_size) -# %% +# %% tags=["hide-input"] print("Args:") pprint(ana_collab.model_kwargs) -# %% +# %% tags=["hide-input"] ana_collab.model = EmbeddingDotBias.from_classes( classes=ana_collab.dls.classes, **ana_collab.model_kwargs) @@ -206,7 +201,7 @@ # %% [markdown] # ### Training -# %% +# %% tags=["hide-input"] # papermill_description=train_collab suggested_lr = ana_collab.learn.lr_find() print(f"{suggested_lr.valley = :.5f}") @@ -234,7 +229,7 @@ # %% [markdown] # Compare simulated_na data predictions to original values -# %% +# %% tags=["hide-input"] # this could be done using the validation data laoder now ana_collab.test_dl = ana_collab.dls.test_dl( data.val_y.reset_index()) # test_dl is here validation data @@ -246,12 +241,12 @@ # %% [markdown] # select test data predictions -# %% +# %% tags=["hide-input"] ana_collab.test_dl = ana_collab.dls.test_dl(data.test_y.reset_index()) test_pred_simulated_na['CF'], _ = ana_collab.learn.get_preds(dl=ana_collab.test_dl) test_pred_simulated_na -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: pred_real_na = models.collab.get_missing_values( df_train_long=data.train_X, @@ -266,7 +261,7 @@ # # - Autoencoder need data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -280,7 +275,7 @@ # > Does not make to much sense to compare collab and AEs, # > as the setup differs of training and validation data differs -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() @@ -318,7 +313,7 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # save simulated missing values for both splits val_pred_simulated_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") test_pred_simulated_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") @@ -326,8 +321,8 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args -# %% +# %% tags=["hide-input"] diff --git a/project/01_1_train_DAE.ipynb b/project/01_1_train_DAE.ipynb index d33607a87..ac07f5a79 100644 --- a/project/01_1_train_DAE.ipynb +++ b/project/01_1_train_DAE.ipynb @@ -13,32 +13,31 @@ "execution_count": null, "id": "18b5d571-2956-4112-b22c-43d6c2146b06", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "import logging\n", "\n", + "import sklearn\n", + "from fastai import learner\n", "from fastai.basics import *\n", "from fastai.callback.all import *\n", "from fastai.torch_basics import *\n", - "\n", "from IPython.display import display\n", - "\n", - "import sklearn\n", - "from sklearn.preprocessing import StandardScaler\n", "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import StandardScaler\n", "\n", "import vaep\n", - "from vaep.io import datasplits\n", - "from vaep.models import ae\n", - "import vaep.models as models\n", "import vaep.model\n", + "import vaep.models as models\n", "from vaep.analyzers import analyzers\n", - "\n", + "from vaep.io import datasplits\n", "# overwriting Recorder callback with custom plot_loss\n", - "from vaep.models import plot_loss\n", - "from fastai import learner\n", + "from vaep.models import ae, 
plot_loss\n", "\n", "learner.Recorder.plot_loss = plot_loss\n", "\n", @@ -54,7 +53,11 @@ "cell_type": "code", "execution_count": null, "id": "297f14bc-3c37-43fa-8217-f790f0593d78", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -124,7 +127,11 @@ "cell_type": "code", "execution_count": null, "id": "0746e70f-0259-48d5-90ef-25fe4b59f9ac", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -136,7 +143,10 @@ "execution_count": null, "id": "e20093e1", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -163,7 +173,11 @@ "cell_type": "code", "execution_count": null, "id": "a19fe098-a029-4f71-b7fb-e652a9c16ac7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -181,7 +195,11 @@ "cell_type": "code", "execution_count": null, "id": "6d9cc7bd-6b6f-40b9-8db7-c8228e4b03e3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -200,7 +218,11 @@ "cell_type": "code", "execution_count": null, "id": "02bb6bf5-0eb1-4c73-9723-414b14eaf7c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -218,7 +240,11 @@ "cell_type": "code", "execution_count": null, "id": "44958473", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "index_columns = list(data.train_X.index.names)\n", @@ -250,7 +276,10 @@ "execution_count": null, "id": "b5b945aa-9b4e-4487-8b09-dca289e64d9d", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -281,7 +310,11 @@ "cell_type": "code", "execution_count": null, "id": "98f675b6-e619-45b6-8f04-b75237d212a7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_simulated_na = data.val_y.to_frame(name='observed')\n", @@ -293,7 +326,10 @@ "execution_count": null, "id": "9686a2eb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -316,7 +352,10 @@ "execution_count": null, "id": "cff8caf4-ccc9-4a36-a992-2cc596abe51a", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -337,7 +376,11 @@ "cell_type": "code", "execution_count": null, "id": "7952fe13", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X" @@ -347,7 +390,11 @@ "cell_type": "code", "execution_count": null, "id": "a0a0bcd9-22af-4dd9-af56-b041931ee918", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.val_y # potentially has less features" @@ -357,7 +404,11 @@ "cell_type": "code", "execution_count": null, "id": "9f0826f9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.val_y = pd.DataFrame(pd.NA, index=data.train_X.index,\n", @@ -385,7 +436,11 @@ "cell_type": "code", "execution_count": null, "id": "7bbed0af-64bd-45d8-9be2-5b856cb25cce", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": 
[], "source": [ "default_pipeline = sklearn.pipeline.Pipeline(\n", @@ -424,7 +479,11 @@ "cell_type": "code", "execution_count": null, "id": "4c568fe5-adfb-401c-afed-fabce46be0fe", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "analysis.learn = Learner(dls=analysis.dls,\n", @@ -451,7 +510,11 @@ "cell_type": "code", "execution_count": null, "id": "d625fb88", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# learn.summary()" @@ -461,7 +524,11 @@ "cell_type": "code", "execution_count": null, "id": "2fad0a84-3d3a-4e77-9f80-58b7f45f5352", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "suggested_lr = analysis.learn.lr_find()\n", @@ -482,7 +549,10 @@ "execution_count": null, "id": "99a5f505-7785-4152-8bed-73bd965f3ea8", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -494,7 +564,11 @@ "cell_type": "code", "execution_count": null, "id": "a83ba8fb", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=train\n", @@ -513,7 +587,11 @@ "cell_type": "code", "execution_count": null, "id": "d7ae4840", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.epoch_trained = analysis.learn.epoch + 1\n", @@ -533,7 +611,10 @@ "execution_count": null, "id": "c0d278d3-6b12-420e-92f9-f8c2dc06ec02", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -574,7 +655,10 @@ "execution_count": null, "id": "ff3aad0d", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -589,7 +673,10 @@ "execution_count": null, "id": "c965ca0f-5936-460d-b696-015d7db01d75", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -601,7 +688,11 @@ "cell_type": "code", "execution_count": null, "id": "dc1ff5c3-f01b-4997-845a-ea72f041c96d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_simulated_na['DAE'] = pred # model_key?\n", @@ -621,7 +712,10 @@ "execution_count": null, "id": "0f907181", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -648,7 +742,11 @@ "cell_type": "code", "execution_count": null, "id": "cc065c5f-7bba-48d5-bc87-e4cf90462a6f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "analysis.model.cpu()\n", @@ -662,7 +760,11 @@ "cell_type": "code", "execution_count": null, "id": "7b915728-5e84-45b7-bbc0-da32bc657091", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! calculate embeddings only if meta data is available? 
Optional argument to save embeddings?\n", @@ -679,7 +781,11 @@ "cell_type": "code", "execution_count": null, "id": "33404d1b-f553-4e05-be7e-821511883507", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.meta_cat_col and df_meta is not None:\n", @@ -713,7 +819,11 @@ "cell_type": "code", "execution_count": null, "id": "d825e38e-f3d6-4bca-b621-150267e7b7bc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -732,7 +842,11 @@ "cell_type": "code", "execution_count": null, "id": "855a7a6f-93fd-4612-9d8d-96541a2441be", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_simulated_na, 'valid_simulated_na')\n", @@ -751,7 +865,11 @@ "cell_type": "code", "execution_count": null, "id": "571ac8d4-bb5d-45db-bba8-59817e476304", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_simulated_na, 'test_simulated_na')\n", @@ -770,7 +888,11 @@ "cell_type": "code", "execution_count": null, "id": "87910434-7d07-4e8e-8380-c92fc515bd16", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "vaep.io.dump_json(d_metrics.metrics, args.out_metrics /\n", @@ -782,7 +904,11 @@ "cell_type": "code", "execution_count": null, "id": "7d99deb9-9aad-4ba9-b79d-e4b3c6c7f023", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(d_metrics.metrics,\n", @@ -803,7 +929,10 @@ "execution_count": null, "id": "782636ac-c979-4f8b-9fc0-66fd0c7a3a8b", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -824,7 +953,11 @@ "cell_type": "code", "execution_count": null, "id": "883de917", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" 
@@ -834,7 +967,11 @@ "cell_type": "code", "execution_count": null, "id": "0f13cb38-abf0-4b56-9399-3d11d32f7fbc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", @@ -845,7 +982,11 @@ "cell_type": "code", "execution_count": null, "id": "43e4a4ad", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] } diff --git a/project/01_1_train_DAE.py b/project/01_1_train_DAE.py index e06baeb15..069b02a0e 100644 --- a/project/01_1_train_DAE.py +++ b/project/01_1_train_DAE.py @@ -16,29 +16,25 @@ # %% [markdown] # # Denoising Autoencoder -# %% +# %% tags=["hide-input"] import logging +import sklearn +from fastai import learner from fastai.basics import * from fastai.callback.all import * from fastai.torch_basics import * - from IPython.display import display - -import sklearn -from sklearn.preprocessing import StandardScaler from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler import vaep -from vaep.io import datasplits -from vaep.models import ae -import vaep.models as models import vaep.model +import vaep.models as models from vaep.analyzers import analyzers - +from vaep.io import datasplits # overwriting Recorder callback with custom plot_loss -from vaep.models import plot_loss -from fastai import learner +from vaep.models import ae, plot_loss learner.Recorder.plot_loss = plot_loss @@ -50,7 +46,7 @@ figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -91,11 +87,11 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) if isinstance(args.hidden_layers, str): @@ -110,26 +106,26 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -148,7 +144,7 @@ # %% [markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -162,11 +158,11 @@ # %% [markdown] # The validation simulated NA is used to by all models to evaluate training performance. 
-# %% +# %% tags=["hide-input"] val_pred_simulated_na = data.val_y.to_frame(name='observed') val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na = data.test_y.to_frame(name='observed') test_pred_simulated_na.describe() @@ -176,7 +172,7 @@ # # - Autoencoder need data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -185,13 +181,13 @@ # %% [markdown] # ### Fill Validation data with potentially missing features -# %% +# %% tags=["hide-input"] data.train_X -# %% +# %% tags=["hide-input"] data.val_y # potentially has less features -# %% +# %% tags=["hide-input"] data.val_y = pd.DataFrame(pd.NA, index=data.train_X.index, columns=data.train_X.columns).fillna(data.val_y) data.val_y @@ -202,7 +198,7 @@ # %% [markdown] # ### Analysis: DataLoaders, Model, transform -# %% +# %% tags=["hide-input"] default_pipeline = sklearn.pipeline.Pipeline( [ ('normalize', StandardScaler()), @@ -229,7 +225,7 @@ # %% [markdown] # ### Training -# %% +# %% tags=["hide-input"] analysis.learn = Learner(dls=analysis.dls, model=analysis.model, loss_func=MSELossFlat(reduction='sum'), @@ -244,10 +240,10 @@ # [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in # current version. Try again later -# %% +# %% tags=["hide-input"] # learn.summary() -# %% +# %% tags=["hide-input"] suggested_lr = analysis.learn.lr_find() analysis.params['suggested_inital_lr'] = suggested_lr.valley suggested_lr @@ -255,26 +251,26 @@ # %% [markdown] # dump model config -# %% +# %% tags=["hide-input"] vaep.io.dump_json(analysis.params, args.out_models / TEMPLATE_MODEL_PARAMS.format(args.model_key)) -# %% +# %% tags=["hide-input"] # papermill_description=train analysis.learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley) # %% [markdown] # Save number of actually trained epochs -# %% +# %% tags=["hide-input"] args.epoch_trained = analysis.learn.epoch + 1 args.epoch_trained # %% [markdown] # #### Loss normalized by total number of measurements -# %% +# %% tags=["hide-input"] N_train_notna = data.train_X.notna().sum().sum() N_val_notna = data.val_y.notna().sum().sum() fig = models.plot_training_losses(analysis.learn, args.model_key, @@ -297,24 +293,24 @@ # # create predictiona and select for validation data -# %% +# %% tags=["hide-input"] analysis.model.eval() pred, target = analysis.get_preds_from_df(df_wide=data.train_X) # train_X pred = pred.stack() pred -# %% +# %% tags=["hide-input"] val_pred_simulated_na['DAE'] = pred # model_key ? val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na['DAE'] = pred # model_key? test_pred_simulated_na # %% [markdown] # save missing values predictions -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: pred_real_na = ae.get_missing_values(df_train_wide=data.train_X, val_idx=val_pred_simulated_na.index, @@ -329,14 +325,14 @@ # # - validation data -# %% +# %% tags=["hide-input"] analysis.model.cpu() df_latent = vaep.model.get_latent_space(analysis.model.encoder, dl=analysis.dls.valid, dl_index=analysis.dls.valid.data.index) df_latent -# %% +# %% tags=["hide-input"] # # ! calculate embeddings only if meta data is available? Optional argument to save embeddings? 
ana_latent = analyzers.LatentAnalysis(df_latent, df_meta, @@ -346,7 +342,7 @@ figures[f'latent_{args.model_key}_by_date'], ax = ana_latent.plot_by_date( args.meta_date_col) -# %% +# %% tags=["hide-input"] if args.meta_cat_col and df_meta is not None: figures[f'latent_{args.model_key}_by_{"_".join(args.meta_cat_col.split())}'], ax = ana_latent.plot_by_category( args.meta_cat_col) @@ -363,33 +359,33 @@ # # - all measured (identified, observed) peptides in validation data -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() # %% [markdown] # The simulated NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_simulated_na, 'valid_simulated_na') added_metrics # %% [markdown] # ### Test Datasplit -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_simulated_na, 'test_simulated_na') added_metrics # %% [markdown] # Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df @@ -397,18 +393,18 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # save simulated missing values for both splits val_pred_simulated_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") test_pred_simulated_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? -# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args -# %% +# %% tags=["hide-input"] diff --git a/project/01_1_train_KNN.ipynb b/project/01_1_train_KNN.ipynb index 1edfeccf0..ebd24a6d0 100644 --- a/project/01_1_train_KNN.ipynb +++ b/project/01_1_train_KNN.ipynb @@ -13,7 +13,10 @@ "execution_count": null, "id": "76e01f3e", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -22,6 +25,7 @@ "import pandas as pd\n", "import sklearn\n", "import sklearn.impute\n", + "from IPython.display import display\n", "\n", "import vaep\n", "import vaep.model\n", @@ -41,7 +45,11 @@ "cell_type": "code", "execution_count": null, "id": "3fb29f3e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -104,7 +112,10 @@ "execution_count": null, "id": "10dfbb95", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -125,7 +136,11 @@ "cell_type": "code", "execution_count": null, "id": "b0f78a72", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -143,7 +158,11 @@ "cell_type": "code", "execution_count": null, "id": "a7a2208e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)" @@ -161,7 +180,11 @@ "cell_type": "code", "execution_count": null, "id": "34b79387", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -179,7 +202,11 @@ "cell_type": "code", "execution_count": 
null, "id": "a61f7f70", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -201,7 +228,11 @@ "cell_type": "code", "execution_count": null, "id": "5e1a277f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_feat = sampling.frequency_by_index(data.train_X, 0)\n", @@ -228,7 +259,11 @@ "cell_type": "code", "execution_count": null, "id": "d056fb82", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na = data.val_y.to_frame(name='observed')\n", @@ -240,7 +275,10 @@ "execution_count": null, "id": "ade74bab", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -260,7 +298,11 @@ "cell_type": "code", "execution_count": null, "id": "4a75a078", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.to_wide_format()\n", @@ -282,7 +324,11 @@ "cell_type": "code", "execution_count": null, "id": "58d3383c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "knn_imputer = sklearn.impute.KNNImputer(n_neighbors=args.neighbors).fit(data.train_X)" @@ -305,7 +351,11 @@ "cell_type": "code", "execution_count": null, "id": "98a1cba0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "pred = knn_imputer.transform(data.train_X)\n", @@ -317,7 +367,11 @@ "cell_type": "code", "execution_count": null, "id": "5b24132c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na[args.model_key] = pred\n", @@ -328,7 +382,11 @@ "cell_type": "code", "execution_count": null, "id": "f8937bcc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_fake_na[args.model_key] = pred\n", @@ -348,7 +406,10 @@ "execution_count": null, "id": "8a853343", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -375,7 +436,11 @@ "cell_type": "code", "execution_count": null, "id": "e7c0d93c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] }, @@ -384,11 +449,7 @@ "id": "8881f26e", "metadata": {}, "source": [ - "## Comparisons\n", - "\n", - "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE)\n", - "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n", - "> Could be changed." 
+ "## Comparisons\n" ] }, { @@ -408,7 +469,11 @@ "cell_type": "code", "execution_count": null, "id": "1d3a789e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -427,7 +492,11 @@ "cell_type": "code", "execution_count": null, "id": "d56eb144", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')\n", @@ -450,7 +519,11 @@ "cell_type": "code", "execution_count": null, "id": "3d0be628", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')\n", @@ -469,7 +542,11 @@ "cell_type": "code", "execution_count": null, "id": "9f03ba1f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json')\n", @@ -480,7 +557,11 @@ "cell_type": "code", "execution_count": null, "id": "b962d322", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(d_metrics.metrics,\n", @@ -500,7 +581,11 @@ "cell_type": "code", "execution_count": null, "id": "ce0fb347", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# save simulated missing values for both splits\n", @@ -520,7 +605,11 @@ "cell_type": "code", "execution_count": null, "id": "fc43a4e8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" @@ -530,7 +619,11 @@ "cell_type": "code", "execution_count": null, "id": "d3bbd037", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.n_params = 1 # the number of neighbors to consider\n", diff --git a/project/01_1_train_KNN.py b/project/01_1_train_KNN.py index 9989d0f25..ddd21c2aa 100644 --- a/project/01_1_train_KNN.py +++ b/project/01_1_train_KNN.py @@ -16,7 +16,7 @@ # %% [markdown] # # K- Nearest Neighbors (KNN) -# %% +# %% tags=["hide-input"] import logging import pandas as pd @@ -38,7 +38,7 @@ figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -72,7 +72,7 @@ # %% [markdown] # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args = vaep.nb.args_from_dict(args) args @@ -81,25 +81,25 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -109,7 +109,7 @@ # %% [markdown] # ## Initialize Comparison -# %% +# %% tags=["hide-input"] freq_feat = sampling.frequency_by_index(data.train_X, 0) freq_feat.head() # training data @@ -119,11 +119,11 @@ # %% [markdown] # The validation fake NA is used to by all models to evaluate training 
performance. -# %% +# %% tags=["hide-input"] val_pred_fake_na = data.val_y.to_frame(name='observed') val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na = data.test_y.to_frame(name='observed') test_pred_fake_na.describe() @@ -131,7 +131,7 @@ # %% [markdown] # ## Data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -140,7 +140,7 @@ # ## Train # model = 'sklearn_knn' -# %% +# %% tags=["hide-input"] knn_imputer = sklearn.impute.KNNImputer(n_neighbors=args.neighbors).fit(data.train_X) # %% [markdown] @@ -151,23 +151,23 @@ # # create predictions and select for split entries -# %% +# %% tags=["hide-input"] pred = knn_imputer.transform(data.train_X) pred = pd.DataFrame(pred, index=data.train_X.index, columns=data.train_X.columns).stack() pred -# %% +# %% tags=["hide-input"] val_pred_fake_na[args.model_key] = pred val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na[args.model_key] = pred test_pred_fake_na # %% [markdown] # save missing values predictions -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: pred_real_na = ae.get_missing_values(df_train_wide=data.train_X, val_idx=val_pred_fake_na.index, @@ -182,7 +182,7 @@ # # - validation data -# %% +# %% tags=["hide-input"] # %% [markdown] # ## Comparisons @@ -196,14 +196,14 @@ # > Does not make to much sense to compare collab and AEs, # > as the setup differs of training and validation data differs -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() # %% [markdown] # The fake NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na') added_metrics @@ -214,18 +214,18 @@ # explicitly to misssing before it was fed to the model for # reconstruction. -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') added_metrics # %% [markdown] # Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df @@ -233,7 +233,7 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # save simulated missing values for both splits val_pred_fake_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") test_pred_fake_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") @@ -241,10 +241,10 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? 
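+# %% [markdown]
+# Aside: a minimal, self-contained sketch of the KNN imputation step used
+# above. Illustrative only -- the toy matrix `X` below is made up and is not
+# part of this workflow. Each NaN is filled with the mean of the two nearest
+# samples, with distances computed on the mutually observed features.
+
+# %%
+import numpy as np
+
+X = np.array([[1., 2., np.nan],
+              [3., 4., 3.],
+              [np.nan, 6., 5.],
+              [8., 8., 7.]])
+# fills both NaNs: [[1., 2., 4.], [3., 4., 3.], [5.5, 6., 5.], [8., 8., 7.]]
+sklearn.impute.KNNImputer(n_neighbors=2).fit_transform(X)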
-# %%
+# %% tags=["hide-input"]
 args.n_params = 1  # the number of neighbors to consider
 args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml")
 args
diff --git a/project/01_1_train_KNN_unique_samples.py b/project/01_1_train_KNN_unique_samples.py
new file mode 100644
index 000000000..1cd24fe26
--- /dev/null
+++ b/project/01_1_train_KNN_unique_samples.py
@@ -0,0 +1,301 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.15.0
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # K-Nearest Neighbors (KNN)
+
+# %%
+import logging
+
+import pandas as pd
+import sklearn
+from IPython.display import display
+from sklearn.model_selection import train_test_split
+
+import vaep
+import vaep.model
+import vaep.models as models
+import vaep.nb
+from vaep import sampling
+from vaep.io import datasplits
+from vaep.models import ae
+
+logger = vaep.logging.setup_logger(logging.getLogger('vaep'))
+logger.info("Experiment 03 - Analysis of latent spaces and performance comparisons")
+
+figures = {}  # collection of ax or figures
+
+
+# %%
+# catch passed parameters
+args = None
+args = dict(globals()).keys()
+
+# %% [markdown]
+# Papermill script parameters:
+
+# %% tags=["parameters"]
+# files and folders
+folder_experiment: str = 'runs/example'  # Datasplit folder with data for experiment
+folder_data: str = ''  # specify data directory if needed
+file_format: str = 'csv'  # file format of created splits, default pickle (pkl)
+# Machine parsed metadata from rawfile workflow
+fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
+# training
+epochs_max: int = 50  # Maximum number of epochs
+# early_stopping:bool = True # Whether to use early stopping or not
+batch_size: int = 64  # Batch size for training (and evaluation)
+cuda: bool = True  # Whether to use a GPU for training
+# model
+neighbors: int = 3  # number of nearest neighbors to use
+force_train: bool = True  # Force training when saved model could be used. Per default re-train model
+sample_idx_position: int = 0  # position of index which is sample ID
+model: str = 'KNN'  # model name
+model_key: str = 'KNN_UNIQUE'  # potentially alternative key for model (grid search)
+save_pred_real_na: bool = True  # Save all predictions for missing values
+# metadata -> defaults for metadata extracted from machine data
+meta_date_col: str = None  # date column in meta data
+meta_cat_col: str = None  # category column in meta data
+
+
+# Parameters
+neighbors = 3
+folder_experiment = "runs/rev3"
+folder_data = "runs/appl_ald_data_2023_11/plasma/proteinGroups/data"
+fn_rawfile_metadata = "data/ALD_study/processed/ald_metadata_cli.csv"
+meta_cat_col = 'kleiner'
+
+# %% [markdown]
+# Some argument transformations
+
+# %%
+args = vaep.nb.get_params(args, globals=globals())
+args = vaep.nb.args_from_dict(args)
+args
+
+
+# %% [markdown]
+# Some naming conventions
+
+# %%
+TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'
+
+
+# %% [markdown]
+# load meta data for splits
+
+
+# %% [markdown]
+# ## Load data in long format
+
+# %%
+data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)
+
+# %% [markdown]
+# data is loaded in long format
+
+# %%
+data.train_X.sample(5)
+
+# %%
+if args.fn_rawfile_metadata:
+    df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0)
+    df_meta = df_meta.loc[data.train_X.index.levels[0]]
+else:
+    df_meta = None
+df_meta
+
+
+# %%
+df_meta['to_stratify'] = df_meta[args.meta_cat_col].fillna(-1)
+data.to_wide_format()
+train_idx, val_test_idx = train_test_split(data.train_X.index,
+                                           test_size=.2,
+                                           stratify=df_meta['to_stratify'],
+                                           random_state=42)
+val_idx, test_idx = train_test_split(val_test_idx,
+                                     test_size=.5,
+                                     stratify=df_meta.loc[val_test_idx, 'to_stratify'],
+                                     random_state=42)
+print("Train:", train_idx.shape, "Val:", val_idx.shape, "Test:", test_idx.shape)
+
+# %%
+data.train_X.update(data.val_y.loc[train_idx])
+data.train_X.update(data.test_y.loc[train_idx])
+data.val_X = data.train_X.loc[val_idx]
+data.test_X = data.train_X.loc[test_idx]
+data.train_X = data.train_X.loc[train_idx]
+
+data.val_y = data.val_y.loc[val_idx]
+data.test_y = data.test_y.loc[test_idx]
+
+# %%
+data.to_long_format()
+
+# %% [markdown]
+# ## Initialize Comparison
+
+# %%
+freq_feat = sampling.frequency_by_index(data.train_X, 0)
+freq_feat.head()  # training data
+
+# %% [markdown]
+# ### Simulated missing values
+
+# %% [markdown]
+# The validation fake NA is used by all models to evaluate training performance.
+
+# %%
+val_pred_fake_na = data.val_y.to_frame(name='observed')
+val_pred_fake_na
+
+# %%
+test_pred_fake_na = data.test_y.to_frame(name='observed')
+test_pred_fake_na.describe()
+
+
+# %% [markdown]
+# ## Data in wide format
+
+# %%
+data.to_wide_format()
+args.M = data.train_X.shape[-1]
+data.train_X
+
+# %% [markdown]
+# ## Train
+# model = 'sklearn_knn'
+
+# %%
+knn_imputer = sklearn.impute.KNNImputer(n_neighbors=args.neighbors).fit(data.train_X)
+
+# %% [markdown]
+# ### Predictions
+#
+# - data of training data set and validation dataset to create predictions is the same as training data.
+# - predictions include missing values (which are not further compared)
+#
+# create predictions and select for split entries
+
+# %%
+pred = knn_imputer.transform(data.val_X)
+pred = pd.DataFrame(pred, index=data.val_X.index, columns=data.val_X.columns).stack()
+pred
+
+# %%
+val_pred_fake_na[args.model_key] = pred
+val_pred_fake_na
+
+# %%
+pred = knn_imputer.transform(data.test_X)
+pred = pd.DataFrame(pred, index=data.test_X.index, columns=data.test_X.columns).stack()
+
+test_pred_fake_na[args.model_key] = pred
+test_pred_fake_na
+
+# %% [markdown]
+# save missing values predictions
+
+# %%
+df_complete = pd.concat([data.train_X, data.val_X, data.test_X])
+pred = knn_imputer.transform(df_complete)
+pred = pd.DataFrame(pred, index=df_complete.index, columns=df_complete.columns).stack()
+pred
+
+# %%
+if args.save_pred_real_na:
+    pred_real_na = ae.get_missing_values(df_train_wide=df_complete,
+                                         val_idx=val_pred_fake_na.index,
+                                         test_idx=test_pred_fake_na.index,
+                                         pred=pred)
+    display(pred_real_na)
+    pred_real_na.to_csv(args.out_preds / f"pred_real_na_{args.model_key}.csv")
+
+
+# %% [markdown]
+# ### Plots
+#
+# - validation data
+
+# %%
+
+# %% [markdown]
+# ## Comparisons
+#
+# > Note: The interpolated values have fewer predictions for comparisons than the ones based on models (CF, DAE, VAE).
+# > The comparison is therefore not 100% fair, as the interpolated samples will have more common ones (especially the sparser the data).
+# > Could be changed.
+
+# %% [markdown]
+# ### Validation data
+#
+# - all measured (identified, observed) peptides in validation data
+#
+# > Does not make too much sense to compare collab and AEs,
+# > as the setup of training and validation data differs
+
+# %%
+# papermill_description=metrics
+d_metrics = models.Metrics()
+
+# %% [markdown]
+# The fake NA for the validation step are real test data (not used for training nor early stopping)
+
+# %%
+added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')
+added_metrics
+
+# %% [markdown]
+# ### Test Datasplit
+#
+# Fake NAs: Artificially created NAs. Some data was sampled and set
+# explicitly to missing before it was fed to the model for
+# reconstruction.
+
+# %%
+added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')
+added_metrics
+
+# %% [markdown]
+# Save all metrics as json
+
+# %%
+vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json')
+d_metrics
+
+# %%
+metrics_df = models.get_df_from_nested_dict(d_metrics.metrics,
+                                            column_levels=['model', 'metric_name']).T
+metrics_df
+
+# %% [markdown]
+# ## Save predictions
+
+# %%
+# save simulated missing values for both splits
+val_pred_fake_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv")
+test_pred_fake_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv")
+
+# %% [markdown]
+# ## Config
+
+# %%
+figures  # switch to fnames?
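+# %% [markdown]
+# Aside: a minimal, self-contained sketch of the two-stage stratified split
+# used above to obtain disjoint 80/10/10 train/validation/test samples.
+# All names and numbers here are illustrative, not part of this script.
+
+# %%
+import numpy as np
+
+_idx = np.arange(100)  # stand-in for 100 sample IDs
+_strata = np.repeat([0, 1, 2, 3], 25)  # e.g. a clinical score per sample
+_train, _rest = train_test_split(_idx, test_size=.2, stratify=_strata, random_state=42)
+_val, _test = train_test_split(_rest, test_size=.5, stratify=_strata[_rest], random_state=42)
+len(_train), len(_val), len(_test)  # (80, 10, 10)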
+ +# %% +args.n_params = 1 # the number of neighbors to consider +args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") +args + +# %% diff --git a/project/01_1_train_Median.ipynb b/project/01_1_train_Median.ipynb index 3c2933e5d..406691aab 100644 --- a/project/01_1_train_Median.ipynb +++ b/project/01_1_train_Median.ipynb @@ -13,20 +13,24 @@ "execution_count": null, "id": "bdefddcb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "import logging\n", "\n", "import pandas as pd\n", + "from IPython.display import display\n", "\n", "import vaep\n", "import vaep.model\n", "import vaep.models as models\n", + "import vaep.nb\n", "from vaep.io import datasplits\n", "\n", - "import vaep.nb\n", "logger = vaep.logging.setup_logger(logging.getLogger('vaep'))\n", "logger.info(\"Median Imputation\")\n", "\n", @@ -37,7 +41,11 @@ "cell_type": "code", "execution_count": null, "id": "82a53c81", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -93,7 +101,11 @@ "cell_type": "code", "execution_count": null, "id": "17c49967", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -105,7 +117,10 @@ "execution_count": null, "id": "071eb3aa", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -125,7 +140,11 @@ "cell_type": "code", "execution_count": null, "id": "79bd3e3d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -143,7 +162,11 @@ "cell_type": "code", "execution_count": null, "id": "fb8a27fa", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)" @@ -161,7 +184,11 @@ "cell_type": "code", "execution_count": null, "id": "0df21944", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -179,7 +206,11 @@ "cell_type": "code", "execution_count": null, "id": "86b35447", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "index_columns = list(data.train_X.index.names)\n", @@ -209,7 +240,11 @@ "cell_type": "code", "execution_count": null, "id": "efe75402", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -237,7 +272,11 @@ "cell_type": "code", "execution_count": null, "id": "75aa1ac5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_feat = vaep.io.datasplits.load_freq(args.data)\n", @@ -264,7 +303,11 @@ "cell_type": "code", "execution_count": null, "id": "16f53ce5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na = data.val_y.to_frame(name='observed')\n", @@ -276,7 +319,10 @@ "execution_count": null, "id": "68ea1649", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -299,7 +345,10 @@ "execution_count": null, "id": "c679a1f9", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], 
"source": [ @@ -320,7 +369,11 @@ "cell_type": "code", "execution_count": null, "id": "71d667e3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# interpolated = vaep.pandas.interpolate(wide_df = data.train_X)\n", @@ -335,7 +388,10 @@ "execution_count": null, "id": "9fb6cea0", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -355,7 +411,10 @@ "execution_count": null, "id": "c05ecd3a", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -389,7 +448,11 @@ "cell_type": "code", "execution_count": null, "id": "73648586", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_val = val_pred_fake_na['observed'].groupby(level=-1).count()\n", @@ -401,7 +464,11 @@ "cell_type": "code", "execution_count": null, "id": "c3662d07", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# # scatter plot between overall feature freq and split freq\n", @@ -412,7 +479,11 @@ "cell_type": "code", "execution_count": null, "id": "3c72fe9c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_val.value_counts().sort_index().head() # require more than one feat?" @@ -422,7 +493,11 @@ "cell_type": "code", "execution_count": null, "id": "786f8804", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0)\n", @@ -442,7 +517,11 @@ "cell_type": "code", "execution_count": null, "id": "6da29e33", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0)\n", @@ -453,7 +532,11 @@ "cell_type": "code", "execution_count": null, "id": "3565522f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val" @@ -464,11 +547,7 @@ "id": "ae345647", "metadata": {}, "source": [ - "## Comparisons\n", - "\n", - "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE)\n", - "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n", - "> Could be changed." 
+ "## Comparisons" ] }, { @@ -476,19 +555,18 @@ "id": "b43adc40", "metadata": {}, "source": [ - "### Validation data\n", - "\n", - "- all measured (identified, observed) peptides in validation data\n", - "\n", - "> Does not make too much sense to compare collab and AEs,\n", - "> as the setup differs of training and validation data differs" + "### Validation data\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b017353a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -507,7 +585,11 @@ "cell_type": "code", "execution_count": null, "id": "47caaf3b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')\n", @@ -530,7 +612,11 @@ "cell_type": "code", "execution_count": null, "id": "4b45f076", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')\n", @@ -549,7 +635,11 @@ "cell_type": "code", "execution_count": null, "id": "d6a5da36", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] }, @@ -566,7 +656,10 @@ "execution_count": null, "id": "26be5fa4", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -578,7 +671,11 @@ "cell_type": "code", "execution_count": null, "id": "7fe80e9a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T\n", @@ -597,7 +694,11 @@ "cell_type": "code", "execution_count": null, "id": "225dc1f0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# val\n", @@ -622,7 +723,11 @@ "cell_type": "code", "execution_count": null, "id": "64a39dc2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" 
@@ -632,7 +737,11 @@ "cell_type": "code", "execution_count": null, "id": "14983bf9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", diff --git a/project/01_1_train_Median.py b/project/01_1_train_Median.py index 72a7cf562..cf43e2e1b 100644 --- a/project/01_1_train_Median.py +++ b/project/01_1_train_Median.py @@ -16,24 +16,25 @@ # %% [markdown] # # Variational Autoencoder -# %% +# %% tags=["hide-input"] import logging import pandas as pd +from IPython.display import display import vaep import vaep.model import vaep.models as models +import vaep.nb from vaep.io import datasplits -import vaep.nb logger = vaep.logging.setup_logger(logging.getLogger('vaep')) logger.info("Median Imputation") figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -60,11 +61,11 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) args @@ -72,25 +73,25 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -108,7 +109,7 @@ # %% [markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -124,7 +125,7 @@ # - Not used for predictions or early stopping. # - [x] add some additional NAs based on distribution of data -# %% +# %% tags=["hide-input"] freq_feat = vaep.io.datasplits.load_freq(args.data) freq_feat.head() # training data @@ -134,11 +135,11 @@ # %% [markdown] # The validation fake NA is used to by all models to evaluate training performance. 
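The `# catch passed parameters` cells above all use the same snapshot-and-diff trick: the cell tagged `parameters` records which global names already exist, and `vaep.nb.get_params` later collects everything papermill injected after that snapshot. A minimal sketch of the idea, assuming papermill injects parameters as plain global assignments; `get_params_sketch`, `folder_experiment` and `model_key` are hypothetical stand-ins, not the actual `vaep.nb.get_params` implementation:

def get_params_sketch(known_names, globals_cur):
    # anything not present at snapshot time was injected by papermill
    return {name: value for name, value in globals_cur.items()
            if name not in known_names and not name.startswith('_')}

args = None
args = set(dict(globals()).keys())  # snapshot of the current global names

# --- papermill would inject the parameter assignments here ---
folder_experiment = 'runs/example'  # hypothetical injected parameter
model_key = 'Median'                # hypothetical injected parameter

args = get_params_sketch(args, globals_cur=dict(globals()))
print(args)  # {'folder_experiment': 'runs/example', 'model_key': 'Median'}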
-# %% +# %% tags=["hide-input"] val_pred_fake_na = data.val_y.to_frame(name='observed') val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na = data.test_y.to_frame(name='observed') test_pred_fake_na.describe() @@ -148,7 +149,7 @@ # # - Autoencoder need data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -157,14 +158,14 @@ # %% [markdown] # ### Add interpolation performance -# %% +# %% tags=["hide-input"] # interpolated = vaep.pandas.interpolate(wide_df = data.train_X) # val_pred_fake_na['interpolated'] = interpolated # test_pred_fake_na['interpolated'] = interpolated # del interpolated # test_pred_fake_na -# %% +# %% tags=["hide-input"] # Add median pred performance args.n_params = data.train_X.shape[-1] medians_train = data.train_X.median() @@ -176,7 +177,7 @@ val_pred_fake_na -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: mask = data.train_X.isna().stack() idx_real_na = mask.index[mask] @@ -196,19 +197,19 @@ # %% [markdown] # ### Plots # -# %% +# %% tags=["hide-input"] feat_freq_val = val_pred_fake_na['observed'].groupby(level=-1).count() feat_freq_val.name = 'freq_val' ax = feat_freq_val.plot.box() -# %% +# %% tags=["hide-input"] # # scatter plot between overall feature freq and split freq # freq_feat.to_frame('overall').join(feat_freq_val).plot.scatter(x='overall', y='freq_val') -# %% +# %% tags=["hide-input"] feat_freq_val.value_counts().sort_index().head() # require more than one feat? -# %% +# %% tags=["hide-input"] errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0) errors_val = errors_val.abs().groupby(level=-1).mean() errors_val = errors_val.join(freq_feat).sort_values(by='freq', ascending=True) @@ -221,36 +222,28 @@ ax = errors_val_smoothed.plot(x='freq', figsize=(15, 10)) # errors_val_smoothed -# %% +# %% tags=["hide-input"] errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0) errors_val.abs().groupby(level=-1).agg(['mean', 'count']) -# %% +# %% tags=["hide-input"] errors_val # %% [markdown] # ## Comparisons -# -# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) -# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) -# > Could be changed. # %% [markdown] # ### Validation data # -# - all measured (identified, observed) peptides in validation data -# -# > Does not make too much sense to compare collab and AEs, -# > as the setup differs of training and validation data differs -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() # %% [markdown] # The fake NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na') added_metrics @@ -261,31 +254,31 @@ # explicitly to misssing before it was fed to the model for # reconstruction. 
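The median model trained above has exactly one parameter per feature: the per-feature median of the wide training data, broadcast onto the long-format validation index. A self-contained sketch of that baseline on toy data (the column and sample names are made up, not the project's):

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
train_X = pd.DataFrame(rng.normal(loc=25, scale=2, size=(8, 3)),
                       columns=['pep_A', 'pep_B', 'pep_C'])
medians_train = train_X.median()  # one median per feature

# long-format targets (sample, feature) -> observed intensity, like data.val_y
val_pred_fake_na = train_X.stack().sample(5, random_state=0).to_frame('observed')
# align the medians on the feature level of the MultiIndex
val_pred_fake_na['Median'] = medians_train.reindex(
    val_pred_fake_na.index.get_level_values(-1)).to_numpy()
mae = (val_pred_fake_na['Median'] - val_pred_fake_na['observed']).abs().mean()
print(val_pred_fake_na, f'MAE: {mae:.3f}', sep='\n')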
-# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') added_metrics # %% [markdown] # The fake NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] # %% [markdown] # ### Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # val fname = args.out_preds / f"pred_val_{args.model_key}.csv" setattr(args, fname.stem, fname.as_posix()) # add [] assignment? @@ -298,9 +291,9 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? -# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_train_NAGuideR_methods.R b/project/01_1_train_NAGuideR_methods.R index ce2f91af9..13997aa17 100644 --- a/project/01_1_train_NAGuideR_methods.R +++ b/project/01_1_train_NAGuideR_methods.R @@ -19,7 +19,7 @@ # # - BiocManager could be moved to methods who are installed from BioConductor -# + vscode={"languageId": "r"} +# + tags=["hide-input"] vscode={"languageId": "r"} packages_base_R <- c("BiocManager", "reshape2", "data.table", "readr", "tibble") @@ -58,7 +58,7 @@ for (package in packages_base_R) { # - seems quite hacky # - code is only slightly adapted from repo to run here, mainly to install packages on the fly -# + vscode={"languageId": "r"} +# + tags=["hide-input"] vscode={"languageId": "r"} nafunctions <- function(x, method = "zero") { df <- df1 <- as.data.frame(x) method <- tolower(method) @@ -407,7 +407,7 @@ original_header[1:5] # Uncomment to test certain methods (only for debugging, as at least one method per package is tested using Github Actions) -# + vscode={"languageId": "r"} +# + tags=["hide-input"] vscode={"languageId": "r"} # to_test <- c( # 'ZERO', # 'MINIMUM', @@ -450,20 +450,28 @@ pred <- nafunctions(df, method) pred <- tibble::as_tibble(cbind(rownames(pred), pred)) names(pred) <- original_header pred +# - + +# Transform predictions to long format + # + vscode={"languageId": "r"} pred <- reshape2::melt(pred, id.vars = feat_name) names(pred) <- c(feat_name, 'Sample ID', method) pred <- pred[reshape2::melt(is.na(df))['value'] == TRUE, ] pred +# - -# + vscode={"languageId": "r"} +# Check dimension of long format dataframe + +# + tags=["hide-input"] vscode={"languageId": "r"} dim(pred) +# - -# + vscode={"languageId": "r"} +# Save predictions to disk + +# + tags=["hide-input"] vscode={"languageId": "r"} fname = file.path(folder_experiment, 'preds', paste0('pred_all_', toupper(method), '.csv')) -fname - -# + vscode={"languageId": "r"} write_csv(pred, path = fname) +fname diff --git a/project/01_1_train_NAGuideR_methods.ipynb b/project/01_1_train_NAGuideR_methods.ipynb index 072e207c2..23fae4bd3 100644 --- a/project/01_1_train_NAGuideR_methods.ipynb +++ b/project/01_1_train_NAGuideR_methods.ipynb @@ -17,6 +17,9 @@ "execution_count": null, "id": "2e50ecba-a6ca-4a3a-bd45-e58752c168eb", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -77,6 +80,9 @@ "execution_count": null, "id": "f9c48bf7-d31c-4073-895b-e9cf920ff1d3", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -497,6 +503,9 @@ "execution_count": 
null, "id": "162c5f7f-08f0-44ef-abf5-f0805ab58bb4", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -551,7 +560,6 @@ "execution_count": null, "id": "690d47c2-5666-41f2-b13f-9215334f197c", "metadata": { - "lines_to_next_cell": 0, "tags": [], "vscode": { "languageId": "r" @@ -565,6 +573,14 @@ "pred" ] }, + { + "cell_type": "markdown", + "id": "9738530f", + "metadata": {}, + "source": [ + "Transform predictions to long format" + ] + }, { "cell_type": "code", "execution_count": null, @@ -583,11 +599,22 @@ "pred" ] }, + { + "cell_type": "markdown", + "id": "b745b3ea", + "metadata": {}, + "source": [ + "Check dimension of long format dataframe" + ] + }, { "cell_type": "code", "execution_count": null, "id": "ff5196d2-0ecf-49da-b7eb-4075b8a73707", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -597,11 +624,22 @@ "dim(pred)" ] }, + { + "cell_type": "markdown", + "id": "faeb1eb0", + "metadata": {}, + "source": [ + "Save predictions to disk" + ] + }, { "cell_type": "code", "execution_count": null, "id": "ffebedb9-02db-4a7f-a5f6-a54a2aa057fc", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -611,22 +649,9 @@ "fname = file.path(folder_experiment,\n", " 'preds',\n", " paste0('pred_all_', toupper(method), '.csv'))\n", + "write_csv(pred, path = fname)\n", "fname" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df1a114a-166d-4bcc-8c10-3ac69570f96c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "write_csv(pred, path = fname)" - ] } ], "metadata": { diff --git a/project/01_1_train_RSN.ipynb b/project/01_1_train_RSN.ipynb index 95b208ebf..fb5fc7b67 100644 --- a/project/01_1_train_RSN.ipynb +++ b/project/01_1_train_RSN.ipynb @@ -13,21 +13,25 @@ "execution_count": null, "id": "e38874f3", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "import logging\n", "\n", "import pandas as pd\n", + "from IPython.display import display\n", "\n", "import vaep\n", + "import vaep.imputation\n", "import vaep.model\n", "import vaep.models as models\n", - "import vaep.imputation\n", + "import vaep.nb\n", "from vaep.io import datasplits\n", "\n", - "import vaep.nb\n", "logger = vaep.logging.setup_logger(logging.getLogger('vaep'))\n", "logger.info(\"Median Imputation\")\n", "\n", @@ -38,7 +42,11 @@ "cell_type": "code", "execution_count": null, "id": "ca2c3fb3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -99,7 +107,11 @@ "cell_type": "code", "execution_count": null, "id": "36f708a2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -111,7 +123,10 @@ "execution_count": null, "id": "a4bb6bf2", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -131,7 +146,11 @@ "cell_type": "code", "execution_count": null, "id": "d3ded735", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -149,7 +168,11 @@ "cell_type": "code", "execution_count": null, "id": "92d787e1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -168,7 +191,11 @@ 
"cell_type": "code", "execution_count": null, "id": "dbd2df5b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -186,7 +213,11 @@ "cell_type": "code", "execution_count": null, "id": "b8e2b780", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "index_columns = list(data.train_X.index.names)\n", @@ -217,7 +248,11 @@ "cell_type": "code", "execution_count": null, "id": "3cd8cc67", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -239,7 +274,11 @@ "cell_type": "code", "execution_count": null, "id": "63a9a8c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_feat = vaep.io.datasplits.load_freq(args.data)\n", @@ -266,7 +305,11 @@ "cell_type": "code", "execution_count": null, "id": "5855a725", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na = data.val_y.to_frame(name='observed')\n", @@ -277,7 +320,11 @@ "cell_type": "code", "execution_count": null, "id": "9e0ae839", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_fake_na = data.test_y.to_frame(name='observed')\n", @@ -297,7 +344,10 @@ "execution_count": null, "id": "e8b41aae", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -318,7 +368,11 @@ "cell_type": "code", "execution_count": null, "id": "8f5349d6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "imputed_shifted_normal = vaep.imputation.impute_shifted_normal(\n", @@ -335,7 +389,11 @@ "cell_type": "code", "execution_count": null, "id": "d32d445e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na[args.model] = imputed_shifted_normal\n", @@ -356,7 +414,10 @@ "execution_count": null, "id": "3198a37c", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -385,7 +446,11 @@ "cell_type": "code", "execution_count": null, "id": "df99da67", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax, _ = vaep.plotting.errors.plot_errors_binned(val_pred_fake_na)" @@ -395,7 +460,11 @@ "cell_type": "code", "execution_count": null, "id": "16637d79", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax, _ = vaep.plotting.errors.plot_errors_binned(test_pred_fake_na)" @@ -425,7 +494,11 @@ "cell_type": "code", "execution_count": null, "id": "43d42650", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -444,7 +517,11 @@ "cell_type": "code", "execution_count": null, "id": "ed0498d0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')\n", @@ -467,7 +544,11 @@ "cell_type": "code", "execution_count": null, "id": "0ee61d53", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')\n", @@ -494,7 +575,11 @@ "cell_type": "code", "execution_count": null, "id": 
"9973b3ee", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "vaep.io.dump_json(d_metrics.metrics, args.out_metrics /\n", @@ -506,7 +591,11 @@ "cell_type": "code", "execution_count": null, "id": "6b2421c3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(\n", @@ -526,7 +615,11 @@ "cell_type": "code", "execution_count": null, "id": "39c41bcd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# val\n", @@ -551,7 +644,11 @@ "cell_type": "code", "execution_count": null, "id": "7f2f7404", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" @@ -561,7 +658,11 @@ "cell_type": "code", "execution_count": null, "id": "8ad37d15", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", diff --git a/project/01_1_train_RSN.py b/project/01_1_train_RSN.py index 73643f02a..c21769ac2 100644 --- a/project/01_1_train_RSN.py +++ b/project/01_1_train_RSN.py @@ -16,25 +16,26 @@ # %% [markdown] # # Imputation using random draws from shifted normal distribution -# %% +# %% tags=["hide-input"] import logging import pandas as pd +from IPython.display import display import vaep +import vaep.imputation import vaep.model import vaep.models as models -import vaep.imputation +import vaep.nb from vaep.io import datasplits -import vaep.nb logger = vaep.logging.setup_logger(logging.getLogger('vaep')) logger.info("Median Imputation") figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -66,11 +67,11 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) args @@ -78,26 +79,26 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -116,7 +117,7 @@ # %% [markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -127,7 +128,7 @@ # ## Initialize Comparison # -# %% +# %% tags=["hide-input"] freq_feat = vaep.io.datasplits.load_freq(args.data) freq_feat.head() # training data @@ -137,18 +138,18 @@ # %% [markdown] # The validation simulated NA is used to by all models to evaluate training performance. 
-# %% +# %% tags=["hide-input"] val_pred_fake_na = data.val_y.to_frame(name='observed') val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na = data.test_y.to_frame(name='observed') test_pred_fake_na.describe() # %% [markdown] # ## Data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -157,7 +158,7 @@ # %% [markdown] # ### Impute using shifted normal distribution -# %% +# %% tags=["hide-input"] imputed_shifted_normal = vaep.imputation.impute_shifted_normal( data.train_X, mean_shift=1.8, @@ -167,7 +168,7 @@ imputed_shifted_normal = imputed_shifted_normal.to_frame('intensity') imputed_shifted_normal -# %% +# %% tags=["hide-input"] val_pred_fake_na[args.model] = imputed_shifted_normal test_pred_fake_na[args.model] = imputed_shifted_normal val_pred_fake_na @@ -175,7 +176,7 @@ # %% [markdown] # Save predictions for NA -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: mask = data.train_X.isna().stack() idx_real_na = mask.index[mask] @@ -195,10 +196,10 @@ # # %% [markdown] # ### Plots # -# %% +# %% tags=["hide-input"] ax, _ = vaep.plotting.errors.plot_errors_binned(val_pred_fake_na) -# %% +# %% tags=["hide-input"] ax, _ = vaep.plotting.errors.plot_errors_binned(test_pred_fake_na) # %% [markdown] @@ -210,14 +211,14 @@ # # - all measured (identified, observed) peptides in validation data -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() # %% [markdown] # The fake NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na') added_metrics @@ -228,7 +229,7 @@ # explicitly to misssing before it was fed to the model for # reconstruction. -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') added_metrics @@ -238,12 +239,12 @@ # %% [markdown] # ### Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict( d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df @@ -251,7 +252,7 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # val fname = args.out_preds / f"pred_val_{args.model_key}.csv" setattr(args, fname.stem, fname.as_posix()) # add [] assignment? @@ -264,9 +265,9 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? 
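The `save_pred_real_na` blocks in the Median and RSN scripts rely on one idiom to locate the truly missing values: stack a boolean NA mask of the wide training data and keep the index entries where it is True. In isolation, with toy names:

import numpy as np
import pandas as pd

train_X = pd.DataFrame([[1.0, np.nan], [np.nan, 2.0]],
                       index=['S1', 'S2'], columns=['pep_A', 'pep_B'])
mask = train_X.isna().stack()   # long-format boolean mask
idx_real_na = mask.index[mask]  # (sample, feature) coordinates of real NAs
print(idx_real_na.tolist())     # [('S1', 'pep_B'), ('S2', 'pep_A')]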
-# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_train_VAE.ipynb b/project/01_1_train_VAE.ipynb index 8a8ae0c7a..be38aa642 100644 --- a/project/01_1_train_VAE.ipynb +++ b/project/01_1_train_VAE.ipynb @@ -13,7 +13,10 @@ "execution_count": null, "id": "18b5d571-2956-4112-b22c-43d6c2146b06", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -21,36 +24,29 @@ "import logging\n", "from functools import partial\n", "\n", + "import pandas as pd\n", + "import sklearn\n", + "import torch\n", + "from fastai import learner\n", "from fastai.basics import *\n", - "from fastai.learner import Learner\n", "from fastai.callback.all import *\n", "from fastai.callback.all import EarlyStoppingCallback\n", + "from fastai.learner import Learner\n", "from fastai.torch_basics import *\n", - "\n", - "import torch\n", - "\n", "from IPython.display import display\n", - "\n", - "from torch.nn import Sigmoid\n", - "\n", - "import pandas as pd\n", - "\n", - "import sklearn\n", - "from sklearn.preprocessing import StandardScaler\n", "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import StandardScaler\n", + "from torch.nn import Sigmoid\n", "\n", "import vaep\n", - "import vaep.nb\n", - "from vaep.io import datasplits\n", - "from vaep.models import ae\n", - "import vaep.models as models\n", "import vaep.model\n", + "import vaep.models as models\n", + "import vaep.nb\n", "from vaep.analyzers import analyzers\n", - "\n", - "\n", + "from vaep.io import datasplits\n", "# overwriting Recorder callback with custom plot_loss\n", - "from vaep.models import plot_loss\n", - "from fastai import learner\n", + "from vaep.models import ae, plot_loss\n", + "\n", "learner.Recorder.plot_loss = plot_loss\n", "\n", "\n", @@ -65,7 +61,11 @@ "cell_type": "code", "execution_count": null, "id": "5dbc8d89", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -133,7 +133,11 @@ "cell_type": "code", "execution_count": null, "id": "0746e70f-0259-48d5-90ef-25fe4b59f9ac", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -145,7 +149,10 @@ "execution_count": null, "id": "8083658b", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -172,7 +179,11 @@ "cell_type": "code", "execution_count": null, "id": "a19fe098-a029-4f71-b7fb-e652a9c16ac7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -190,7 +201,11 @@ "cell_type": "code", "execution_count": null, "id": "6d9cc7bd-6b6f-40b9-8db7-c8228e4b03e3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -209,7 +224,11 @@ "cell_type": "code", "execution_count": null, "id": "02bb6bf5-0eb1-4c73-9723-414b14eaf7c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -227,7 +246,11 @@ "cell_type": "code", "execution_count": null, "id": "44958473", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "index_columns = list(data.train_X.index.names)\n", @@ 
-258,7 +281,11 @@ "cell_type": "code", "execution_count": null, "id": "b5b945aa-9b4e-4487-8b09-dca289e64d9d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -286,7 +313,11 @@ "cell_type": "code", "execution_count": null, "id": "98f675b6-e619-45b6-8f04-b75237d212a7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_feat = vaep.io.datasplits.load_freq(args.data)\n", @@ -313,7 +344,11 @@ "cell_type": "code", "execution_count": null, "id": "19eebaff-0e1e-4e44-ae40-12d2f0e75c74", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_simulated_na = data.val_y.to_frame(name='observed')\n", @@ -325,7 +360,10 @@ "execution_count": null, "id": "3797a539-84d9-430a-8d16-7cc0eebfe9f5", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -347,7 +385,11 @@ "cell_type": "code", "execution_count": null, "id": "cff8caf4-ccc9-4a36-a992-2cc596abe51a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.to_wide_format()\n", @@ -375,7 +417,11 @@ "cell_type": "code", "execution_count": null, "id": "7952fe13", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X" @@ -385,7 +431,11 @@ "cell_type": "code", "execution_count": null, "id": "a0a0bcd9-22af-4dd9-af56-b041931ee918", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.val_y # potentially has less features" @@ -395,7 +445,11 @@ "cell_type": "code", "execution_count": null, "id": "9f0826f9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.val_y = pd.DataFrame(pd.NA, index=data.train_X.index,\n", @@ -423,7 +477,11 @@ "cell_type": "code", "execution_count": null, "id": "e0d0d02f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "default_pipeline = sklearn.pipeline.Pipeline(\n", @@ -445,7 +503,11 @@ "cell_type": "code", "execution_count": null, "id": "43d49b4a-00ec-4874-8839-28a3cbc0e3b3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "\n", @@ -480,7 +542,11 @@ "cell_type": "code", "execution_count": null, "id": "cca0e4a4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "results = []\n", @@ -492,7 +558,10 @@ "execution_count": null, "id": "9366b06a", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -520,7 +589,11 @@ "cell_type": "code", "execution_count": null, "id": "f4b0aec2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# learn.summary()" @@ -530,7 +603,11 @@ "cell_type": "code", "execution_count": null, "id": "563a1e0a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "suggested_lr = analysis.learn.lr_find()\n", @@ -542,7 +619,11 @@ "cell_type": "code", "execution_count": null, "id": "468565f5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "results.clear() # reset results" @@ -560,7 +641,11 @@ "cell_type": "code", "execution_count": null, "id": "ec77d9e5-f619-4355-ab37-2bd44029236d", - "metadata": 
{}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "# needs class as argument, not instance, but serialization needs instance\n", @@ -581,7 +666,11 @@ "cell_type": "code", "execution_count": null, "id": "43d18ab6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "# papermill_description=train\n", @@ -600,7 +689,11 @@ "cell_type": "code", "execution_count": null, "id": "079f9743-213c-422c-ba61-919c276fd710", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "args.epoch_trained = analysis.learn.epoch + 1\n", @@ -619,7 +712,11 @@ "cell_type": "code", "execution_count": null, "id": "3a4f91f9-c1a2-40c5-99d4-c289fb89cff8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "N_train_notna = data.train_X.notna().sum().sum()\n", @@ -642,7 +739,11 @@ "cell_type": "code", "execution_count": null, "id": "e73fb4dd-e73a-48df-82b7-b378fd3ee266", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "analysis.model.eval()\n", @@ -658,7 +759,10 @@ "execution_count": null, "id": "a9fc0e36", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -670,7 +774,11 @@ "cell_type": "code", "execution_count": null, "id": "b4d1fd73-9eb7-4f25-ad81-c42c6a840e77", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "test_pred_simulated_na['VAE'] = pred # model_key?\n", @@ -690,7 +798,10 @@ "execution_count": null, "id": "c43e401f", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -717,12 +828,24 @@ "cell_type": "code", "execution_count": null, "id": "7999c89e-65fe-4c00-8e20-cb8ab88d1603", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "analysis.model = analysis.model.cpu()\n", + "# underlying data is train_X for both\n", + "# assert analysis.dls.valid.data.equals(analysis.dls.train.data)\n", + "# Reconstruct the DataLoader in case singleton batches were dropped during training\n", + "_dl = torch.utils.data.DataLoader(\n", + " vaep.io.datasets.DatasetWithTarget(\n", + " analysis.dls.valid.data),\n", + " batch_size=args.batch_size,\n", + " shuffle=False)\n", "df_latent = vaep.model.get_latent_space(analysis.model.get_mu_and_logvar,\n", - " dl=analysis.dls.valid,\n", + " dl=_dl,\n", " dl_index=analysis.dls.valid.data.index)\n", "df_latent" ] @@ -731,7 +854,11 @@ "cell_type": "code", "execution_count": null, "id": "1fdd8f86-639e-4e0e-bb89-466f3ba0ef7b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "ana_latent = analyzers.LatentAnalysis(df_latent,\n", @@ -747,7 +874,11 @@ "cell_type": "code", "execution_count": null, "id": "763a5633-a9dd-4785-a0c0-91f588346c22", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "if args.meta_cat_col and df_meta is not None:\n", @@ -759,7 +890,11 @@ "cell_type": "code", "execution_count": null, "id": "d3fdd5cb-4038-489f-b4d8-54ec6ea913b5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "feat_freq_val = val_pred_simulated_na['observed'].groupby(level=-1).count()\n", @@ -771,7 +906,11 @@ "cell_type": "code", "execution_count": null, "id": "446adbaf-81db-4ac5-b041-064744143602", -
"metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_val.value_counts().sort_index().head() # require more than one feat?" @@ -781,7 +920,11 @@ "cell_type": "code", "execution_count": null, "id": "d408dbfa", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val = val_pred_simulated_na.drop('observed', axis=1).sub(\n", @@ -801,7 +944,11 @@ "cell_type": "code", "execution_count": null, "id": "7e505353-f19e-4961-9279-f1f0f1e4be09", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val = val_pred_simulated_na.drop('observed', axis=1).sub(\n", @@ -813,7 +960,11 @@ "cell_type": "code", "execution_count": null, "id": "95022f04-0e0d-47bf-8267-6135a936328f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val" @@ -845,7 +996,11 @@ "cell_type": "code", "execution_count": null, "id": "d825e38e-f3d6-4bca-b621-150267e7b7bc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -865,7 +1020,11 @@ "cell_type": "code", "execution_count": null, "id": "855a7a6f-93fd-4612-9d8d-96541a2441be", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_simulated_na, 'valid_simulated_na')\n", @@ -886,7 +1045,11 @@ "cell_type": "code", "execution_count": null, "id": "571ac8d4-bb5d-45db-bba8-59817e476304", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_simulated_na, 'test_simulated_na')\n", @@ -905,7 +1068,11 @@ "cell_type": "code", "execution_count": null, "id": "87910434-7d07-4e8e-8380-c92fc515bd16", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "vaep.io.dump_json(d_metrics.metrics, args.out_metrics /\n", @@ -917,7 +1084,11 @@ "cell_type": "code", "execution_count": null, "id": "7d99deb9-9aad-4ba9-b79d-e4b3c6c7f023", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(\n", @@ -937,7 +1108,11 @@ "cell_type": "code", "execution_count": null, "id": "782636ac-c979-4f8b-9fc0-66fd0c7a3a8b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# save simulated missing values for both splits\n", @@ -957,7 +1132,11 @@ "cell_type": "code", "execution_count": null, "id": "06e433ec", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" 
@@ -967,7 +1146,11 @@ "cell_type": "code", "execution_count": null, "id": "0f13cb38-abf0-4b56-9399-3d11d32f7fbc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", diff --git a/project/01_1_train_VAE.py b/project/01_1_train_VAE.py index ac154dcbd..d68428410 100644 --- a/project/01_1_train_VAE.py +++ b/project/01_1_train_VAE.py @@ -16,41 +16,34 @@ # %% [markdown] # # Variational Autoencoder -# %% +# %% tags=["hide-input"] import logging from functools import partial +import pandas as pd +import sklearn +import torch +from fastai import learner from fastai.basics import * -from fastai.learner import Learner from fastai.callback.all import * from fastai.callback.all import EarlyStoppingCallback +from fastai.learner import Learner from fastai.torch_basics import * - -import torch - from IPython.display import display - -from torch.nn import Sigmoid - -import pandas as pd - -import sklearn -from sklearn.preprocessing import StandardScaler from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler +from torch.nn import Sigmoid import vaep -import vaep.nb -from vaep.io import datasplits -from vaep.models import ae -import vaep.models as models import vaep.model +import vaep.models as models +import vaep.nb from vaep.analyzers import analyzers - - +from vaep.io import datasplits # overwriting Recorder callback with custom plot_loss -from vaep.models import plot_loss -from fastai import learner +from vaep.models import ae, plot_loss + learner.Recorder.plot_loss = plot_loss @@ -61,7 +54,7 @@ figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -100,11 +93,11 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) if isinstance(args.hidden_layers, str): @@ -119,26 +112,26 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -157,7 +150,7 @@ # %% [markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -173,7 +166,7 @@ # - Not used for predictions or early stopping. # - [x] add some additional NAs based on distribution of data -# %% +# %% tags=["hide-input"] freq_feat = vaep.io.datasplits.load_freq(args.data) freq_feat.head() # training data @@ -183,11 +176,11 @@ # %% [markdown] # The validation simulated NA is used to by all models to evaluate training performance. 
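The "Fill Validation data with potentially missing features" step (in the notebook above and the .py script below) pads the validation targets to the full training shape, since a feature without any simulated NA would otherwise be missing as a column in wide format. It leans on `DataFrame.fillna` aligning on both index and columns; a minimal sketch with toy names:

import pandas as pd

train_X = pd.DataFrame(0.0, index=['S1', 'S2'],
                       columns=['pep_A', 'pep_B', 'pep_C'])
val_y = pd.DataFrame({'pep_A': [23.5]}, index=['S1'])  # sparse targets

# all-NA frame in the training shape, filled where validation values exist
val_y_full = pd.DataFrame(pd.NA, index=train_X.index,
                          columns=train_X.columns).fillna(val_y)
print(val_y_full)  # 23.5 at (S1, pep_A), <NA> everywhere else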
-# %% +# %% tags=["hide-input"] val_pred_simulated_na = data.val_y.to_frame(name='observed') val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na = data.test_y.to_frame(name='observed') test_pred_simulated_na.describe() @@ -197,7 +190,7 @@ # # - Autoencoder need data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -208,13 +201,13 @@ # %% [markdown] # ### Fill Validation data with potentially missing features -# %% +# %% tags=["hide-input"] data.train_X -# %% +# %% tags=["hide-input"] data.val_y # potentially has less features -# %% +# %% tags=["hide-input"] data.val_y = pd.DataFrame(pd.NA, index=data.train_X.index, columns=data.train_X.columns).fillna(data.val_y) data.val_y @@ -225,7 +218,7 @@ # %% [markdown] # ### Analysis: DataLoaders, Model, transform -# %% +# %% tags=["hide-input"] default_pipeline = sklearn.pipeline.Pipeline( [ ('normalize', StandardScaler()), @@ -235,7 +228,7 @@ # %% [markdown] # ### Analysis: DataLoaders, Model -# %% +# %% tags=["hide-input"] analysis = ae.AutoEncoderAnalysis( # datasplits=data, train_df=data.train_X, @@ -259,11 +252,11 @@ # # -# %% +# %% tags=["hide-input"] results = [] loss_fct = partial(models.vae.loss_fct, results=results) -# %% +# %% tags=["hide-input"] analysis.learn = Learner(dls=analysis.dls, model=analysis.model, loss_func=loss_fct, @@ -277,21 +270,21 @@ # [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in # current version. Try again later -# %% +# %% tags=["hide-input"] # learn.summary() -# %% +# %% tags=["hide-input"] suggested_lr = analysis.learn.lr_find() analysis.params['suggested_inital_lr'] = suggested_lr.valley suggested_lr -# %% +# %% tags=["hide-input"] results.clear() # reset results # %% [markdown] # dump model config -# %% +# %% tags=["hide-input"] # needs class as argument, not instance, but serialization needs instance analysis.params['last_decoder_activation'] = Sigmoid() @@ -305,21 +298,21 @@ # restore original value analysis.params['last_decoder_activation'] = Sigmoid -# %% +# %% tags=["hide-input"] # papermill_description=train analysis.learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley) # %% [markdown] # Save number of actually trained epochs -# %% +# %% tags=["hide-input"] args.epoch_trained = analysis.learn.epoch + 1 args.epoch_trained # %% [markdown] # #### Loss normalized by total number of measurements -# %% +# %% tags=["hide-input"] N_train_notna = data.train_X.notna().sum().sum() N_val_notna = data.val_y.notna().sum().sum() fig = models.plot_training_losses(analysis.learn, args.model_key, @@ -330,7 +323,7 @@ # ### Predictions # create predictions and select validation data predictions -# %% +# %% tags=["hide-input"] analysis.model.eval() pred, target = res = ae.get_preds_from_df(df=data.train_X, learn=analysis.learn, position_pred_tuple=0, @@ -338,19 +331,19 @@ pred = pred.stack() pred -# %% +# %% tags=["hide-input"] val_pred_simulated_na['VAE'] = pred # 'model_key' ? val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na['VAE'] = pred # model_key? 
test_pred_simulated_na # %% [markdown] # save missing values predictions -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: pred_real_na = ae.get_missing_values(df_train_wide=data.train_X, val_idx=val_pred_simulated_na.index, @@ -365,14 +358,22 @@ # # - validation data -# %% +# %% tags=["hide-input"] analysis.model = analysis.model.cpu() +# underlying data is train_X for both +# assert analysis.dls.valid.data.equals(analysis.dls.train.data) +# Reconstruct the DataLoader in case singleton batches were dropped during training +_dl = torch.utils.data.DataLoader( + vaep.io.datasets.DatasetWithTarget( + analysis.dls.valid.data), + batch_size=args.batch_size, + shuffle=False) df_latent = vaep.model.get_latent_space(analysis.model.get_mu_and_logvar, - dl=analysis.dls.valid, + dl=_dl, dl_index=analysis.dls.valid.data.index) df_latent -# %% +# %% tags=["hide-input"] ana_latent = analyzers.LatentAnalysis(df_latent, df_meta, args.model_key, @@ -381,20 +382,20 @@ figures[f'latent_{args.model_key}_by_date'], ax = ana_latent.plot_by_date( args.meta_date_col) -# %% +# %% tags=["hide-input"] if args.meta_cat_col and df_meta is not None: figures[f'latent_{args.model_key}_by_{"_".join(args.meta_cat_col.split())}'], ax = ana_latent.plot_by_category( args.meta_cat_col) -# %% +# %% tags=["hide-input"] feat_freq_val = val_pred_simulated_na['observed'].groupby(level=-1).count() feat_freq_val.name = 'freq_val' ax = feat_freq_val.plot.box() -# %% +# %% tags=["hide-input"] feat_freq_val.value_counts().sort_index().head() # require more than one feat? -# %% +# %% tags=["hide-input"] errors_val = val_pred_simulated_na.drop('observed', axis=1).sub( val_pred_simulated_na['observed'], axis=0) errors_val = errors_val.abs().groupby(level=-1).mean() @@ -407,12 +408,12 @@ ax = errors_val_smoothed.plot(x='freq', figsize=(15, 10)) # errors_val_smoothed -# %% +# %% tags=["hide-input"] errors_val = val_pred_simulated_na.drop('observed', axis=1).sub( val_pred_simulated_na['observed'], axis=0) errors_val.abs().groupby(level=-1).agg(['mean', 'count']) -# %% +# %% tags=["hide-input"] errors_val # %% [markdown] # ## Comparisons @@ -427,7 +428,7 @@ # # - all measured (identified, observed) peptides in validation data -# %% +# %% tags=["hide-input"] # papermill_description=metrics # d_metrics = models.Metrics(no_na_key='NA interpolated', with_na_key='NA not interpolated') d_metrics = models.Metrics() @@ -435,7 +436,7 @@ # %% [markdown] # The simulated NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_simulated_na, 'valid_simulated_na') added_metrics @@ -444,19 +445,19 @@ # -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_simulated_na, 'test_simulated_na') added_metrics # %% [markdown] # Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict( d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df @@ -464,7 +465,7 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # save simulated missing values for both splits val_pred_simulated_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") test_pred_simulated_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") @@ -472,9 +473,9 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames?
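The freshly built `_dl` above exists because singleton batches are now dropped during training (a batch of one sample typically breaks normalization layers in training mode); a loader with that behaviour would silently skip samples at inference time. For extracting the latent space, a plain evaluation DataLoader keeps every sample in its original order. The effect in plain torch, with toy tensors standing in for the project's `DatasetWithTarget`:

import torch
from torch.utils.data import DataLoader, TensorDataset

data = TensorDataset(torch.randn(101, 50))  # 101 samples -> last batch of 1
train_dl = DataLoader(data, batch_size=10, shuffle=True, drop_last=True)
eval_dl = DataLoader(data, batch_size=10, shuffle=False, drop_last=False)

assert sum(batch[0].shape[0] for batch in train_dl) == 100  # one sample lost
assert sum(batch[0].shape[0] for batch in eval_dl) == 101   # all kept, ordered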
-# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_transfer_NAGuideR_pred.ipynb b/project/01_1_transfer_NAGuideR_pred.ipynb index a985a61c1..a4fbd5e14 100644 --- a/project/01_1_transfer_NAGuideR_pred.ipynb +++ b/project/01_1_transfer_NAGuideR_pred.ipynb @@ -5,25 +5,30 @@ "id": "a75efcbe-2ae0-4609-872a-759fb5c80af1", "metadata": {}, "source": [ - "# Transfer predictions from NAGuideR" + "# Transfer predictions from NAGuideR\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f29b93d1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "from pathlib import Path\n", "import logging\n", - "import pandas as pd\n", + "from pathlib import Path\n", + "\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", "\n", "import vaep\n", "import vaep.models\n", - "from vaep.io import datasplits\n", "import vaep.pandas\n", + "from vaep.io import datasplits\n", "\n", "vaep.plotting.make_large_descriptors(5)\n", "\n", @@ -34,7 +39,11 @@ "cell_type": "code", "execution_count": null, "id": "cbf23f02", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -84,7 +93,11 @@ "cell_type": "code", "execution_count": null, "id": "3d5c476b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -96,7 +109,11 @@ "cell_type": "code", "execution_count": null, "id": "ba3513a7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out = {}" @@ -115,7 +132,10 @@ "execution_count": null, "id": "75341d2b", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -135,7 +155,11 @@ "cell_type": "code", "execution_count": null, "id": "723eacd2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na = data.val_y.to_frame(name='observed')\n", @@ -146,7 +170,11 @@ "cell_type": "code", "execution_count": null, "id": "514d193f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_fake_na = data.test_y.to_frame(name='observed')\n", @@ -157,7 +185,11 @@ "cell_type": "code", "execution_count": null, "id": "204838b7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# Find and load prediction files, filter for validation and test data" @@ -167,7 +199,11 @@ "cell_type": "code", "execution_count": null, "id": "c06dcd14", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.dumps is not None:\n", @@ -183,7 +219,10 @@ "execution_count": null, "id": "4a9e66bc", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -226,7 +265,11 @@ "cell_type": "code", "execution_count": null, "id": "72adc8ec", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na" @@ -244,7 +287,11 @@ "cell_type": "code", "execution_count": null, "id": "d7c5dab0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -255,7 +302,11 @@ "cell_type": "code", "execution_count": null, "id": 
"773dcbbe", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_fake_na.dropna(how='all', axis=1), 'valid_fake_na')\n", @@ -274,7 +325,11 @@ "cell_type": "code", "execution_count": null, "id": "065e1e62", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_fake_na.dropna(how='all', axis=1), 'test_fake_na')\n", @@ -285,7 +340,11 @@ "cell_type": "code", "execution_count": null, "id": "37e8e515", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = vaep.models.get_df_from_nested_dict(\n", @@ -297,7 +356,11 @@ "cell_type": "code", "execution_count": null, "id": "f1f0d12c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "order_methods = metrics_df.loc[pd.IndexSlice[:,\n", @@ -309,7 +372,11 @@ "cell_type": "code", "execution_count": null, "id": "a0f8ce7c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "top_5 = ['observed', *order_methods.droplevel(-1).index[:6]]\n", @@ -320,7 +387,11 @@ "cell_type": "code", "execution_count": null, "id": "7e041594", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(8, 2))\n", @@ -337,7 +408,11 @@ "cell_type": "code", "execution_count": null, "id": "77b1b792", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out" diff --git a/project/01_1_transfer_NAGuideR_pred.py b/project/01_1_transfer_NAGuideR_pred.py index ed152c987..bddf8f604 100644 --- a/project/01_1_transfer_NAGuideR_pred.py +++ b/project/01_1_transfer_NAGuideR_pred.py @@ -15,23 +15,25 @@ # %% [markdown] # # Transfer predictions from NAGuideR +# -# %% -from pathlib import Path +# %% tags=["hide-input"] import logging -import pandas as pd +from pathlib import Path + import matplotlib.pyplot as plt +import pandas as pd import vaep import vaep.models -from vaep.io import datasplits import vaep.pandas +from vaep.io import datasplits vaep.plotting.make_large_descriptors(5) logger = vaep.logging.setup_logger(logging.getLogger('vaep')) -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -52,18 +54,18 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args = vaep.nb.args_from_dict(args) args -# %% +# %% tags=["hide-input"] files_out = {} # %% [markdown] # load data splits -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) @@ -71,18 +73,18 @@ # %% [markdown] # Validation and test data split of simulated missing values -# %% +# %% tags=["hide-input"] val_pred_fake_na = data.val_y.to_frame(name='observed') val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na = data.test_y.to_frame(name='observed') test_pred_fake_na.describe() -# %% +# %% tags=["hide-input"] # Find and load prediction files, filter for validation and test data -# %% +# %% tags=["hide-input"] if args.dumps is not None: entire_pred = [Path(s) for s in args.dumps.split(',')] else: @@ -90,7 +92,7 @@ if '_all_' in str(file)) entire_pred -# %% +# %% tags=["hide-input"] mask = data.train_X.unstack().isna().stack() idx_real_na = mask.index[mask] idx_real_na = (idx_real_na @@ -124,42 +126,42 @@ 
logger.info(f"Save {fname = }") # del pred -# %% +# %% tags=["hide-input"] val_pred_fake_na # %% [markdown] # Metrics for simulated missing values (NA) -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = vaep.models.Metrics() -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_fake_na.dropna(how='all', axis=1), 'valid_fake_na') pd.DataFrame(added_metrics) # %% [markdown] # ## Test Datasplit -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_fake_na.dropna(how='all', axis=1), 'test_fake_na') pd.DataFrame(added_metrics) -# %% +# %% tags=["hide-input"] metrics_df = vaep.models.get_df_from_nested_dict( d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df -# %% +# %% tags=["hide-input"] order_methods = metrics_df.loc[pd.IndexSlice[:, 'MAE'], 'valid_fake_na'].sort_values() order_methods -# %% +# %% tags=["hide-input"] top_5 = ['observed', *order_methods.droplevel(-1).index[:6]] top_5 -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=(8, 2)) ax, errors_bind = vaep.plotting.errors.plot_errors_binned( val_pred_fake_na[top_5], @@ -169,5 +171,5 @@ files_out[fname.name] = fname.as_posix() vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] files_out diff --git a/project/01_2_performance_plots.ipynb b/project/01_2_performance_plots.ipynb index 263026456..3d48f53a9 100644 --- a/project/01_2_performance_plots.ipynb +++ b/project/01_2_performance_plots.ipynb @@ -23,7 +23,9 @@ "execution_count": null, "id": "a1e5f978-a0cb-4bb6-98d1-467eda257165", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -82,7 +84,9 @@ "execution_count": null, "id": "67f5161a", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -139,7 +143,9 @@ "execution_count": null, "id": "ec1509e8-6908-43c3-8909-efbb0229c324", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -152,7 +158,9 @@ "execution_count": null, "id": "19b33594", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -165,7 +173,9 @@ "execution_count": null, "id": "59081f60", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -179,7 +189,9 @@ "id": "c3e124fb", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -199,7 +211,9 @@ "execution_count": null, "id": "747d5e4a", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -214,7 +228,9 @@ "execution_count": null, "id": "a4ba2a48-dedc-47a9-b2ea-79936dfc48ef", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -227,7 +243,9 @@ "execution_count": null, "id": "611a8edf", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -261,7 +279,9 @@ "execution_count": null, "id": "2d043b40-5c74-40cc-a5cf-8d22ac5538a8", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -276,7 +296,9 @@ "execution_count": null, "id": "d8f8c3f4-9896-4f0e-8f93-780f90b22573", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -299,7 +321,9 @@ "execution_count": null, "id": "9a94ad00-78fd-4541-be5d-68391af99bd5", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -320,7 +344,9 @@ "execution_count": null, "id": 
"526626c0-98c7-4741-abae-b6fc8c218f23", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -341,7 +367,9 @@ "execution_count": null, "id": "f3e738bd-79e9-4714-af4d-f3d0d2893353", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -365,20 +393,22 @@ "execution_count": null, "id": "91bc1e12-8477-4eda-a4c2-1f132e468616", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "# model_key could be used as key from config file\n", - "? load only specified configs?\n", - "? case: no config file available?\n", + "# ? load only specified configs?\n", + "# ? case: no config file available?\n", "all_configs = collect(\n", " paths=(fname for fname in args.out_models.iterdir()\n", " if fname.suffix == '.yaml'\n", " and 'model_config' in fname.name),\n", " load_fn=load_config_file\n", ")\n", - "model_configs = pd.DataFrame(all_configs).set_index('model')\n", + "model_configs = pd.DataFrame(all_configs).set_index('id')\n", "model_configs.T.to_excel(writer, sheet_name='model_params')\n", "model_configs.T" ] @@ -396,7 +426,9 @@ "execution_count": null, "id": "af8c112f-fb4f-4dcd-b729-9c9558715d88", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -428,7 +460,9 @@ "execution_count": null, "id": "4efc3fe6", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -456,7 +490,9 @@ "execution_count": null, "id": "d5196bcc", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -482,7 +518,9 @@ "execution_count": null, "id": "e94d9dd6-d97d-4e1c-b877-48dc1ae9c7c7", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -499,7 +537,11 @@ "cell_type": "code", "execution_count": null, "id": "f5865679", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "pred_val = pred_val[[TARGET_COL] + ORDER_MODELS]\n", @@ -516,7 +558,9 @@ "execution_count": null, "id": "4d6417fc", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -543,7 +587,9 @@ "execution_count": null, "id": "36e078fb-2268-41dd-a069-4ca3dc5ca6cf", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -557,7 +603,9 @@ "id": "a2440887-b5f2-45a1-90cd-d15ef9bfa0a7", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -580,13 +628,15 @@ "execution_count": null, "id": "cea24eb1", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "corr_per_sample_val = (pred_val\n", " .groupby(sample_index_name)\n", - " .aggregate(\n", + " .apply(\n", " lambda df: df.corr().loc[TARGET_COL]\n", " )[ORDER_MODELS])\n", "\n", @@ -607,7 +657,8 @@ "dumps[fname.stem] = fname\n", "with pd.ExcelWriter(fname) as w:\n", " corr_per_sample_val.describe().to_excel(w, sheet_name='summary')\n", - " corr_per_sample_val.to_excel(w, sheet_name='correlations')" + " corr_per_sample_val.to_excel(w, sheet_name='correlations')\n", + " corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')" ] }, { @@ -623,7 +674,9 @@ "execution_count": null, "id": "4068d91f-856e-4aa6-9c62-5f1f77a77c4c", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -647,7 +700,9 @@ "execution_count": null, "id": "52298acd-73c5-4574-b7fe-8fb6544708cf", "metadata": { - "tags": [] + "tags": [ + 
"hide-input" + ] }, "outputs": [], "source": [ @@ -662,7 +717,9 @@ "id": "570fc505-ab27-4710-b4c2-adbe72b33898", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -678,7 +735,9 @@ "execution_count": null, "id": "ddc98a9f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -691,7 +750,9 @@ "id": "af4f0e81-e9af-4763-908d-f7bdf4a4fed7", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -714,7 +775,9 @@ "execution_count": null, "id": "df6923c5-e6f7-4a14-aa8e-d55bf66cf817", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -740,7 +803,10 @@ "execution_count": null, "id": "6f6ffdd5", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -759,7 +825,9 @@ "execution_count": null, "id": "6122a309-5435-44d2-a6f8-8e9d46b5afae", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -782,7 +850,9 @@ "execution_count": null, "id": "1dc848c6-d39e-4092-9b72-3f6a0e1949e2", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -814,7 +884,9 @@ "execution_count": null, "id": "8bce941c", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -830,7 +902,9 @@ "execution_count": null, "id": "ff722dae", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -842,7 +916,9 @@ "execution_count": null, "id": "629eddae", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -862,7 +938,9 @@ "execution_count": null, "id": "f639cd92", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -885,7 +963,9 @@ "execution_count": null, "id": "99f7951f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -926,7 +1006,11 @@ "cell_type": "code", "execution_count": null, "id": "843a917f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "counts_per_bin = vaep.pandas.get_counts_per_bin(df=pred_test,\n", @@ -950,13 +1034,15 @@ "execution_count": null, "id": "ee088a12-ee60-45d1-bf5a-e07b76413c56", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "corr_per_sample_test = (pred_test\n", " .groupby(sample_index_name)\n", - " .aggregate(lambda df: df.corr().loc[TARGET_COL])\n", + " .apply(lambda df: df.corr().loc[TARGET_COL])\n", " [ORDER_MODELS])\n", "corr_per_sample_test = corr_per_sample_test.join(\n", " pred_test\n", @@ -973,7 +1059,9 @@ "execution_count": null, "id": "825efac2", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -995,7 +1083,8 @@ "dumps[fname.stem] = fname.with_suffix('.xlsx')\n", "with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:\n", " corr_per_sample_test.describe().to_excel(w, sheet_name='summary')\n", - " corr_per_sample_test.to_excel(w, sheet_name='correlations')" + " corr_per_sample_test.to_excel(w, sheet_name='correlations')\n", + " corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')" ] }, { @@ -1011,7 +1100,9 @@ "execution_count": null, "id": "77b846e1-00b8-4f61-b5cd-cdc1692787de", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1027,7 +1118,9 @@ 
"execution_count": null, "id": "7bff3764-5063-4399-a182-3ba795fbe99d", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1042,11 +1135,13 @@ "execution_count": null, "id": "c6145bd0-9b59-490e-9a0e-89475c18663b", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "options = random.sample(set(feature_names), 1)\n", + "options = random.sample(sorted(set(feature_names)), 1)\n", "pred_test.loc[pd.IndexSlice[:, options[0]], :]" ] }, @@ -1063,11 +1158,13 @@ "execution_count": null, "id": "6ee92128-4f78-45e9-a607-8e6c4163181a", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "corr_per_feat_test = pred_test.groupby(FEAT_NAME).aggregate(\n", + "corr_per_feat_test = pred_test.groupby(FEAT_NAME).apply(\n", " lambda df: df.corr().loc[TARGET_COL])[ORDER_MODELS]\n", "corr_per_feat_test = corr_per_feat_test.join(pred_test.groupby(FEAT_NAME)[\n", " TARGET_COL].count().rename('n_obs'))\n", @@ -1081,7 +1178,9 @@ "execution_count": null, "id": "8e45b324-eaa0-43e4-b28b-b0f839f91955", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1093,7 +1192,9 @@ "execution_count": null, "id": "4c9a9ecc-526a-41ac-8a4d-d3a389ea6c07", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1114,7 +1215,8 @@ "with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:\n", " corr_per_feat_test.loc[~too_few_obs].describe().to_excel(\n", " w, sheet_name='summary')\n", - " corr_per_feat_test.to_excel(w, sheet_name='correlations')" + " corr_per_feat_test.to_excel(w, sheet_name='correlations')\n", + " corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')" ] }, { @@ -1122,7 +1224,9 @@ "execution_count": null, "id": "b38ffdfc-b1b0-4ae0-a47d-5881c534881f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1136,7 +1240,9 @@ "execution_count": null, "id": "9993d145-8b78-4769-838a-01721900a3c7", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1177,7 +1283,9 @@ "execution_count": null, "id": "829ebc82-587d-47c6-8422-03c610855211", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1193,7 +1301,9 @@ "execution_count": null, "id": "f8269d00-9048-4e70-9f39-dab95e103c32", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1206,7 +1316,9 @@ "execution_count": null, "id": "096083d1-bcd2-44a2-94fe-a89b7d204b66", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1221,7 +1333,9 @@ "id": "05a259ef-48bd-4dd0-8dfe-9e2750579383", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1243,7 +1357,9 @@ "execution_count": null, "id": "d3dd53c0-4068-4eac-a5c3-7aaa608e5f8f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1269,7 +1385,9 @@ "id": "ef92551d", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1295,7 +1413,9 @@ "execution_count": null, "id": "588f7bf3", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1328,7 +1448,10 @@ "execution_count": null, "id": "e1455bcc", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ 
-1347,7 +1470,9 @@ "execution_count": null, "id": "b13ecd37", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1372,7 +1497,10 @@ "execution_count": null, "id": "712faf9a", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1434,7 +1562,10 @@ "execution_count": null, "id": "2a578570", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1493,7 +1624,9 @@ "execution_count": null, "id": "3339df97-230f-4cbd-b61d-7aef9a7495e8", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1517,7 +1650,9 @@ "execution_count": null, "id": "095f64eb-1c4f-47ae-9a01-d5b05a795779", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1541,7 +1676,9 @@ "execution_count": null, "id": "c8f67ae1-40e9-4c2a-af0a-41e627703518", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1553,11 +1690,14 @@ "execution_count": null, "id": "b08b442f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "dumps" + "dumps\n", + "print(\"done\")" ] } ], diff --git a/project/01_2_performance_plots.py b/project/01_2_performance_plots.py index 80f3f7c02..fd175a7f1 100644 --- a/project/01_2_performance_plots.py +++ b/project/01_2_performance_plots.py @@ -26,7 +26,7 @@ # - as for validation data # - top N based on validation data -# %% +# %% tags=["hide-input"] import logging import random from pathlib import Path @@ -77,7 +77,7 @@ def build_text(s): return ret -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -104,19 +104,19 @@ def build_text(s): # %% [markdown] # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) args -# %% +# %% tags=["hide-input"] figures = {} dumps = {} -# %% +# %% tags=["hide-input"] TARGET_COL = 'observed' METRIC = 'MAE' MIN_FREQ = None @@ -128,17 +128,17 @@ def build_text(s): SEL_MODELS = args.sel_models.split(',') -# %% +# %% tags=["hide-input"] # list(sns.color_palette().as_hex()) # string representation of colors if args.plot_to_n > 10: logger.warning("Set maximum of models to 10 (maximum)") args.overwrite_entry('plot_to_n', 10) -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) -# %% +# %% tags=["hide-input"] fig, axes = plt.subplots(1, 2, sharey=True, sharex=True) vaep.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0], @@ -158,13 +158,13 @@ def build_text(s): # %% [markdown] # ## data completeness across entire data -# %% +# %% tags=["hide-input"] # load frequency of training features... 
# needs to be pickle -> index.name needed freq_feat = vaep.io.datasplits.load_freq(args.data, file='freq_features.json') freq_feat.head() # training data -# %% +# %% tags=["hide-input"] prop = freq_feat / len(data.train_X.index.levels[0]) prop.sort_values().to_frame().plot( xlabel=f'{data.val_y.index.names[-1]}', @@ -173,21 +173,21 @@ def build_text(s): # %% [markdown] # View training data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() data.train_X # %% [markdown] # Number of samples and features: -# %% +# %% tags=["hide-input"] N_SAMPLES, M_FEAT = data.train_X.shape print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}") # %% [markdown] # Collect outputs in excel file: -# %% +# %% tags=["hide-input"] fname = args.folder_experiment / '01_2_performance_summary.xlsx' dumps[fname.stem] = fname writer = pd.ExcelWriter(fname) @@ -197,24 +197,24 @@ def build_text(s): # ## Model specifications # - used for bar plot annotations -# %% +# %% tags=["hide-input"] # model_key could be used as key from config file -# ? load only specified configs? -# ? case: no config file available? +# # ? load only specified configs? +# # ? case: no config file available? all_configs = collect( paths=(fname for fname in args.out_models.iterdir() if fname.suffix == '.yaml' and 'model_config' in fname.name), load_fn=load_config_file ) -model_configs = pd.DataFrame(all_configs).set_index('model') +model_configs = pd.DataFrame(all_configs).set_index('id') model_configs.T.to_excel(writer, sheet_name='model_params') model_configs.T # %% [markdown] # Set Feature name (columns are features, rows are samples) -# %% +# %% tags=["hide-input"] # index name freq_feat.index.name = data.train_X.columns.name # sample index name @@ -228,7 +228,7 @@ def build_text(s): # ## Validation data # - set top N models to plot based on validation data split -# %% +# %% tags=["hide-input"] pred_val = compare_predictions.load_split_prediction_by_modelkey( experiment_folder=args.folder_experiment, split='val', @@ -242,7 +242,7 @@ def build_text(s): # %% [markdown] # Describe absolute error -# %% +# %% tags=["hide-input"] errors_val = (pred_val .drop(TARGET_COL, axis=1) .sub(pred_val[TARGET_COL], axis=0) @@ -251,7 +251,7 @@ def build_text(s): # %% [markdown] # ### Select top N for plotting and set colors -# %% +# %% tags=["hide-input"] ORDER_MODELS = (errors_val .abs() .mean() @@ -260,7 +260,7 @@ def build_text(s): .to_list()) ORDER_MODELS -# %% +# %% tags=["hide-input"] pred_val = pred_val[[TARGET_COL] + ORDER_MODELS] if args.save_agg_pred: fname = args.folder_experiment / '01_2_agg_pred_val.csv' @@ -269,7 +269,7 @@ def build_text(s): logger.info(f"Saved aggregated predictions to: {fname}") pred_val -# %% +# %% tags=["hide-input"] mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS] mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f') mae_stats_ordered_val.T @@ -282,11 +282,11 @@ def build_text(s): # > 1. The order of "new" models is important for the color assignment. # > 2. User defined model keys for the same model with two configuration will yield different colors. 
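A note on the recurring `-# %%` / `+# %% tags=["hide-input"]` hunks throughout this patch: in jupytext's percent format, the tag list on a cell marker round-trips into the paired notebook's cell metadata, and renderers such as Jupyter Book treat the `hide-input` tag as "collapse this cell's code in the report". A minimal sketch of the convention, with made-up output data:

# %% tags=["hide-input"]
# After conversion with jupytext (e.g. `jupytext --to ipynb script.py`), the tag
# above lands in the cell's metadata as {"tags": ["hide-input"]}, so rendered
# reports show only the output of this cell, not the code.
import pandas as pd

pd.DataFrame({"model": ["Median", "KNN"], "MAE": [0.9, 0.5]})  # dummy values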
-# %% +# %% tags=["hide-input"] COLORS_TO_USE = vaep.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS)) vaep.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE) -# %% +# %% tags=["hide-input"] TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n] TOP_N_COLOR_PALETTE = {model: color for model, color in zip(TOP_N_ORDER, COLORS_TO_USE)} @@ -296,10 +296,10 @@ def build_text(s): # %% [markdown] # ### Correlation per sample -# %% +# %% tags=["hide-input"] corr_per_sample_val = (pred_val .groupby(sample_index_name) - .aggregate( + .apply( lambda df: df.corr().loc[TARGET_COL] )[ORDER_MODELS]) @@ -321,11 +321,12 @@ def build_text(s): with pd.ExcelWriter(fname) as w: corr_per_sample_val.describe().to_excel(w, sheet_name='summary') corr_per_sample_val.to_excel(w, sheet_name='correlations') + corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted') # %% [markdown] # identify samples which are below lower whisker for models -# %% +# %% tags=["hide-input"] treshold = vaep.pandas.get_lower_whiskers( corr_per_sample_val[TOP_N_ORDER]).min() mask = (corr_per_sample_val[TOP_N_ORDER] < treshold).any(axis=1) @@ -335,12 +336,12 @@ def build_text(s): # %% [markdown] # ### Error plot -# %% +# %% tags=["hide-input"] c_error_min = 4.5 mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1) errors_val.loc[mask].sort_index(level=1).head() -# %% +# %% tags=["hide-input"] errors_val = errors_val.abs().groupby( freq_feat.index.name).mean() # absolute error errors_val = errors_val.join(freq_feat) @@ -348,10 +349,10 @@ def build_text(s): errors_val.head() -# %% +# %% tags=["hide-input"] errors_val.describe()[ORDER_MODELS].T # mean of means -# %% +# %% tags=["hide-input"] c_avg_error = 2 mask = (errors_val[TOP_N_ORDER] >= c_avg_error).any(axis=1) errors_val.loc[mask] @@ -361,7 +362,7 @@ def build_text(s): # ### Error by non-decimal number of intensity # - number of observations in parentheses. -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=(8, 3)) ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( pred_val[ @@ -378,7 +379,7 @@ def build_text(s): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) -# %% +# %% tags=["hide-input"] # # ! only used for reporting plotted = vaep.plotting.errors.get_data_for_errors_by_median( errors=errors_binned, @@ -389,7 +390,7 @@ def build_text(s): plotted -# %% +# %% tags=["hide-input"] errors_binned.head() dumps[fname.stem] = fname.with_suffix('.csv') errors_binned.to_csv(fname.with_suffix('.csv')) @@ -398,7 +399,7 @@ def build_text(s): # %% [markdown] # ## test data -# %% +# %% tags=["hide-input"] pred_test = compare_predictions.load_split_prediction_by_modelkey( experiment_folder=args.folder_experiment, split='test', @@ -416,17 +417,17 @@ def build_text(s): # %% [markdown] # Write averages for all models to excel (from before?) 
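The `.aggregate(...)` to `.apply(...)` swap in the per-sample correlation cells above prepares for pandas 2.0: `GroupBy.aggregate` is meant for column-wise reductions, and a lambda operating on the whole group frame only worked in older pandas through a fallback that pandas 2.0 removed, while `GroupBy.apply` hands each group's sub-DataFrame to the lambda and accepts a returned Series. A self-contained sketch with made-up data (column names follow the scripts above):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.MultiIndex.from_product(
    [["sample_1", "sample_2"], ["feat_a", "feat_b", "feat_c"]],
    names=["Sample ID", "feature"])
pred = pd.DataFrame({"observed": rng.normal(size=6),
                     "KNN": rng.normal(size=6)}, index=idx)

# each group is a DataFrame, so df.corr().loc["observed"] can return a Series
corr_per_sample = pred.groupby("Sample ID").apply(lambda df: df.corr().loc["observed"])
print(corr_per_sample)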
-# %% +# %% tags=["hide-input"] errors_test_mae = vaep.pandas.calc_errors.get_absolute_error( pred_test ) mae_stats_ordered_test = errors_test_mae.describe()[ORDER_MODELS] mae_stats_ordered_test -# %% +# %% tags=["hide-input"] mae_stats_ordered_test.to_excel(writer, sheet_name='mae_stats_ordered_test', float_format='%.5f') -# %% +# %% tags=["hide-input"] cp_mean_perf = pd.concat([ mae_stats_ordered_val.loc['mean'], mae_stats_ordered_test.loc['mean'], @@ -437,13 +438,13 @@ def build_text(s): cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f') cp_mean_perf -# %% +# %% tags=["hide-input"] writer.close() # %% [markdown] # ### Intensity distribution as histogram # Plot top 4 models predictions for intensities in test data -# %% +# %% tags=["hide-input"] min_max = vaep.plotting.data.min_max(pred_test[TARGET_COL]) top_n = 4 fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True) @@ -476,7 +477,7 @@ def build_text(s): figures[fname.stem] = fname vaep.savefig(fig, name=fname) -# %% +# %% tags=["hide-input"] counts_per_bin = vaep.pandas.get_counts_per_bin(df=pred_test, bins=bins, columns=[TARGET_COL, *ORDER_MODELS[:top_n]]) @@ -487,10 +488,10 @@ def build_text(s): # %% [markdown] # ### Correlation per sample -# %% +# %% tags=["hide-input"] corr_per_sample_test = (pred_test .groupby(sample_index_name) - .aggregate(lambda df: df.corr().loc[TARGET_COL]) + .apply(lambda df: df.corr().loc[TARGET_COL]) [ORDER_MODELS]) corr_per_sample_test = corr_per_sample_test.join( pred_test @@ -501,7 +502,7 @@ def build_text(s): too_few_obs = corr_per_sample_test['n_obs'] < 3 corr_per_sample_test.loc[~too_few_obs].describe() -# %% +# %% tags=["hide-input"] # # ! add minimum kwargs = dict(ylim=(0.7, 1), rot=90, flierprops=dict(markersize=3), @@ -521,32 +522,33 @@ def build_text(s): with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w: corr_per_sample_test.describe().to_excel(w, sheet_name='summary') corr_per_sample_test.to_excel(w, sheet_name='correlations') + corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted') # %% [markdown] # identify samples which are below lower whisker for models -# %% +# %% tags=["hide-input"] treshold = vaep.pandas.get_lower_whiskers( corr_per_sample_test[TOP_N_ORDER]).min() mask = (corr_per_sample_test[TOP_N_ORDER] < treshold).any(axis=1) corr_per_sample_test.loc[mask].style.highlight_min( axis=1) if mask.sum() else 'Nothing to display' -# %% +# %% tags=["hide-input"] feature_names = pred_test.index.levels[-1] N_SAMPLES = pred_test.index M = len(feature_names) pred_test.loc[pd.IndexSlice[:, feature_names[random.randint(0, M - 1)]], :] -# %% -options = random.sample(set(feature_names), 1) +# %% tags=["hide-input"] +options = random.sample(sorted(set(feature_names)), 1) pred_test.loc[pd.IndexSlice[:, options[0]], :] # %% [markdown] # ### Correlation per feature -# %% -corr_per_feat_test = pred_test.groupby(FEAT_NAME).aggregate( +# %% tags=["hide-input"] +corr_per_feat_test = pred_test.groupby(FEAT_NAME).apply( lambda df: df.corr().loc[TARGET_COL])[ORDER_MODELS] corr_per_feat_test = corr_per_feat_test.join(pred_test.groupby(FEAT_NAME)[ TARGET_COL].count().rename('n_obs')) @@ -554,10 +556,10 @@ def build_text(s): too_few_obs = corr_per_feat_test['n_obs'] < 3 corr_per_feat_test.loc[~too_few_obs].describe() -# %% +# %% tags=["hide-input"] corr_per_feat_test.loc[too_few_obs].dropna(thresh=3, axis=0) -# %% +# %% tags=["hide-input"] kwargs = dict(rot=90, flierprops=dict(markersize=1), ylabel=f'correlation per 
{FEAT_NAME_DISPLAY}') @@ -576,13 +578,14 @@ def build_text(s): corr_per_feat_test.loc[~too_few_obs].describe().to_excel( w, sheet_name='summary') corr_per_feat_test.to_excel(w, sheet_name='correlations') + corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted') -# %% +# %% tags=["hide-input"] feat_count_test = data.test_y.stack().groupby(FEAT_NAME).count() feat_count_test.name = 'count' feat_count_test.head() -# %% +# %% tags=["hide-input"] treshold = vaep.pandas.get_lower_whiskers( corr_per_feat_test[TOP_N_ORDER]).min() mask = (corr_per_feat_test[TOP_N_ORDER] < treshold).any(axis=1) @@ -609,23 +612,23 @@ def highlight_min(s, color, tolerence=0.00001): # %% [markdown] # ### Error plot -# %% +# %% tags=["hide-input"] metrics = vaep.models.Metrics() test_metrics = metrics.add_metrics( pred_test[['observed', *TOP_N_ORDER]], key='test data') test_metrics = pd.DataFrame(test_metrics)[TOP_N_ORDER] test_metrics -# %% +# %% tags=["hide-input"] n_in_comparison = int(test_metrics.loc['N'].unique()[0]) n_in_comparison -# %% +# %% tags=["hide-input"] _to_plot = test_metrics.loc[METRIC].to_frame().T _to_plot.index = [feature_names.name] _to_plot -# %% +# %% tags=["hide-input"] try: text = model_configs[["latent_dim", "hidden_layers"]].apply( build_text, @@ -639,7 +642,7 @@ def highlight_min(s, color, tolerence=0.00001): _to_plot -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=(4, 2)) # size of the plot can be adjusted ax = _to_plot.loc[[feature_names.name]].plot.bar( rot=0, @@ -655,7 +658,7 @@ def highlight_min(s, color, tolerence=0.00001): figures[fname.stem] = fname vaep.savefig(fig, name=fname) -# %% +# %% tags=["hide-input"] dumps[fname.stem] = fname.with_suffix('.csv') _to_plot_long = _to_plot.T _to_plot_long = _to_plot_long.rename( @@ -668,7 +671,7 @@ def highlight_min(s, color, tolerence=0.00001): # %% [markdown] # ### Plot error by median feature intensity -# %% +# %% tags=["hide-input"] vaep.plotting.make_large_descriptors(7) fig, ax = plt.subplots(figsize=(8, 2)) @@ -692,7 +695,7 @@ def highlight_min(s, color, tolerence=0.00001): errors_binned.to_csv(fname.with_suffix('.csv')) errors_binned -# %% +# %% tags=["hide-input"] # # ! only used for reporting plotted = vaep.plotting.errors.get_data_for_errors_by_median( errors=errors_binned, @@ -703,7 +706,7 @@ def highlight_min(s, color, tolerence=0.00001): plotted -# %% +# %% tags=["hide-input"] (errors_binned .set_index( ['model', errors_binned.columns[-1]] @@ -714,7 +717,7 @@ def highlight_min(s, color, tolerence=0.00001): # %% [markdown] # ### Custom model selection -# %% +# %% tags=["hide-input"] if SEL_MODELS: metrics = vaep.models.Metrics() test_metrics = metrics.add_metrics( @@ -768,7 +771,7 @@ def highlight_min(s, color, tolerence=0.00001): _to_plot_long.to_csv(fname.with_suffix('.csv')) -# %% +# %% tags=["hide-input"] # custom selection if SEL_MODELS: vaep.plotting.make_large_descriptors(7) @@ -814,7 +817,7 @@ def highlight_min(s, color, tolerence=0.00001): # # - number of observations in parentheses. 
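The `random.sample(set(...), 1)` to `random.sample(sorted(set(...)), 1)` change above keeps the sampling working on Python 3.11 and later, where `random.sample` requires a sequence (set support was deprecated in 3.9 and subsequently removed); sorting also makes the draw reproducible for a fixed seed, since set iteration order varies between runs. A minimal sketch:

import random

random.seed(42)
feature_names = {"P02768", "P01024", "P00738"}  # hypothetical protein groups
# random.sample(feature_names, 1) raises TypeError on Python 3.11+
options = random.sample(sorted(feature_names), 1)
print(options)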
-# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=(8, 2)) ax, errors_binned = vaep.plotting.errors.plot_errors_binned( pred_test[ @@ -829,15 +832,16 @@ def highlight_min(s, color, tolerence=0.00001): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) -# %% +# %% tags=["hide-input"] dumps[fname.stem] = fname.with_suffix('.csv') errors_binned.to_csv(fname.with_suffix('.csv')) errors_binned.head() # %% [markdown] # ## Figures dumped to disk -# %% +# %% tags=["hide-input"] figures -# %% +# %% tags=["hide-input"] dumps +print("done") diff --git a/project/01_3_revision3.py b/project/01_3_revision3.py new file mode 100644 index 000000000..9362de276 --- /dev/null +++ b/project/01_3_revision3.py @@ -0,0 +1,170 @@ +# %% [markdown] +# # Compare models +# +# 1. Load available configurations +# 2. Load validation predictions +# - calculate absolute error on common subset of data +# - select top N for plotting by MAE from smallest (best) to largest (worst) (top N as specified, default 5) +# - correlation per sample, correlation per feat, correlation overall +# - MAE plots +# 3. Load test data predictions +# - as for validation data +# - top N based on validation data +# +# Models with the `UNIQUE` key refer to samples uniquely split into training, validation and test data. +# These models could not use all samples for training. The predictions on simulated values +# are therefore restricted to the validation and test data from the set of unique samples. +# The models trained on all samples additionally have missing values in their training data, +# which were not missing in the unique samples. The comparison is therefore between models +# which had different data available for training. + +# %% +import logging +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import yaml + +import vaep +import vaep.imputation +import vaep.models +import vaep.nb +from vaep.analyzers import compare_predictions +from vaep.models.collect_dumps import select_content + +pd.options.display.max_rows = 30 +pd.options.display.min_rows = 10 +pd.options.display.max_colwidth = 100 + +plt.rcParams.update({'figure.figsize': (3, 2)}) +vaep.plotting.make_large_descriptors(7) + +logger = vaep.logging.setup_nb_logger() +logging.getLogger('fontTools').setLevel(logging.WARNING) + + +def load_config_file(fname: Path, first_split='config_') -> dict: + with open(fname) as f: + loaded = yaml.safe_load(f) + key = f"{select_content(fname.stem, first_split=first_split)}" + return key, loaded + + +def build_text(s): + ret = '' + if not np.isnan(s["latent_dim"]): + ret += f'LD: {int(s["latent_dim"])} ' + try: + if len(s["hidden_layers"]): + t = ",".join(str(x) for x in s["hidden_layers"]) + ret += f"HL: {t}" + except TypeError: + # nan + pass + return ret + + +# %% +# catch passed parameters +args = None +args = dict(globals()).keys() + +# %% [markdown] +# Papermill script parameters: + +# %% tags=["parameters"] +# files and folders +# Datasplit folder with data for experiment +folder_experiment: str = 'runs/example' +folder_data: str = '' # specify data directory if needed +file_format: str = 'csv' # change default to pickled files +# Machine parsed metadata from rawfile workflow +fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' +models: str = 'Median,CF,DAE,VAE,KNN' # picked models to compare (comma separated) +sel_models: str = '' # user defined comparison (comma separated) +# Restrict plotting to top N methods for imputation based
on error of validation data, maximum 10 +plot_to_n: int = 5 +feat_name_display: str = None # display name for feature name (e.g. 'protein group') + + +# %% +models = 'KNN,KNN_UNIQUE' +folder_experiment = 'runs/rev3' + +# %% [markdown] +# Some argument transformations + +# %% +args = vaep.nb.get_params(args, globals=globals()) +args + +# %% +args = vaep.nb.args_from_dict(args) +args + +# %% +figures = {} +dumps = {} + +# %% +TARGET_COL = 'observed' +METRIC = 'MAE' +MIN_FREQ = None +MODELS_PASSED = args.models.split(',') +MODELS = MODELS_PASSED.copy() +FEAT_NAME_DISPLAY = args.feat_name_display +SEL_MODELS = None +if args.sel_models: + SEL_MODELS = args.sel_models.split(',') + +# %% + + +# %% [markdown] +# # Load predictions on validation and test data split +# + +# %% [markdown] +# ## Validation data +# - set top N models to plot based on validation data split + +# %% +pred_val = compare_predictions.load_split_prediction_by_modelkey( + experiment_folder=args.folder_experiment, + split='val', + model_keys=MODELS_PASSED, + shared_columns=[TARGET_COL]) +SAMPLE_ID, FEAT_NAME = pred_val.index.names +if not FEAT_NAME_DISPLAY: + FEAT_NAME_DISPLAY = FEAT_NAME +pred_val[MODELS] + +# %% +pred_test = compare_predictions.load_split_prediction_by_modelkey( + experiment_folder=args.folder_experiment, + split='test', + model_keys=MODELS_PASSED, + shared_columns=[TARGET_COL]) +pred_test + +# %% +pred_val = pred_val.dropna() +pred_test = pred_test.dropna() + +# %% +metrics = vaep.models.Metrics() +test_metrics = metrics.add_metrics( + pred_test, key='test data') +test_metrics = pd.DataFrame(test_metrics) +test_metrics + +# %% +metrics = vaep.models.Metrics() +val_metrics = metrics.add_metrics( + pred_val, key='validation data') +val_metrics = pd.DataFrame(val_metrics) +val_metrics + +# %% diff --git a/project/04_1_train_pimms_models.ipynb b/project/04_1_train_pimms_models.ipynb index 14cf6618c..5169e2d07 100644 --- a/project/04_1_train_pimms_models.ipynb +++ b/project/04_1_train_pimms_models.ipynb @@ -5,10 +5,10 @@ "id": "eae0a078", "metadata": {}, "source": [ - "# Scikit-learn styple transformers of the data\n", + "# PIMMS Tutorial: Scikit-learn style transformers\n", "\n", "1. Load data into pandas dataframe\n", - "2. Fit transformer on training data\n", + "2. Fit model on training data, potentially specify validation data\n", "3. Impute only missing values with predictions from model\n", "\n", "Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas\n", @@ -34,7 +34,7 @@ " except metadata.PackageNotFoundError:\n", " print(\"Install PIMMS...\")\n", " # !pip install git+https://github.com/RasmussenLab/pimms.git@dev\n", - " !pip install pimms-learn " + " !pip install pimms-learn" ] }, { @@ -74,7 +74,7 @@ "\n", "\n", "from vaep.plotting.defaults import color_model_mapping\n", - "import vaep.plotting.data \n", + "import vaep.plotting.data\n", "import vaep.sampling\n", "\n", "from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer\n", @@ -208,7 +208,7 @@ "\n", "\n", "def select_features(df, feat_prevalence=.2, axis=0):\n", - " # ! vaep.filter.select_features\n", + " # # ! vaep.filter.select_features\n", " N = df.shape[axis]\n", " minimum_freq = N * feat_prevalence\n", " freq = df.notna().sum(axis=axis)\n", @@ -273,7 +273,7 @@ "metadata": {}, "outputs": [], "source": [ - "# # # CollaborativeFilteringTransformer?" + "# # # # CollaborativeFilteringTransformer?" 
] }, { @@ -680,10 +680,6 @@ } ], "metadata": { - "mystnb": { - "execution_raise_on_error": true, - "execution_timeout": 120 - }, "jupytext": { "cell_metadata_filter": "-all", "main_language": "python", @@ -705,6 +701,10 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.17" + }, + "mystnb": { + "execution_raise_on_error": true, + "execution_timeout": 120 } }, "nbformat": 4, diff --git a/project/04_1_train_pimms_models.py b/project/04_1_train_pimms_models.py index f4891d239..0a11b509c 100644 --- a/project/04_1_train_pimms_models.py +++ b/project/04_1_train_pimms_models.py @@ -1,8 +1,8 @@ # %% [markdown] -# # Scikit-learn styple transformers of the data +# # PIMMS Tutorial: Scikit-learn style transformers # # 1. Load data into pandas dataframe -# 2. Fit transformer on training data +# 2. Fit model on training data, potentially specify validation data # 3. Impute only missing values with predictions from model # # Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas @@ -21,7 +21,7 @@ except metadata.PackageNotFoundError: print("Install PIMMS...") # # !pip install git+https://github.com/RasmussenLab/pimms.git@dev - # !pip install pimms-learn + # !pip install pimms-learn # %% [markdown] # If on colab, please restart the environment and run everything from here on. @@ -42,7 +42,7 @@ from vaep.plotting.defaults import color_model_mapping -import vaep.plotting.data +import vaep.plotting.data import vaep.sampling from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer diff --git a/project/10_1_ald_diff_analysis.ipynb b/project/10_1_ald_diff_analysis.ipynb index d00af1fff..24dd7ce90 100644 --- a/project/10_1_ald_diff_analysis.ipynb +++ b/project/10_1_ald_diff_analysis.ipynb @@ -6,7 +6,7 @@ "source": [ "# Differential Analysis - Compare model imputation with standard imputation\n", "\n", - "- load missing values predictions\n", + "- load missing values predictions (if specified)\n", "- leave all other values as they were\n", "- compare missing values predicition by model with baseline method\n", " (default: draw from shifted normal distribution. short RSN)" @@ -15,7 +15,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", @@ -39,7 +43,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -51,14 +59,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Parameters" + "## Parameters\n", + "Default and set parameters for the notebook." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2, "tags": [ "parameters" ] }, "outputs": [], "source": [ @@ -83,10 +91,22 @@ "template_pred = 'pred_real_na_{}.csv' # fixed, do not change" ] }, + { + "cell_type": "markdown", + "id": "01617e36", + "metadata": {}, + "source": [ + "Add set parameters to configuration" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if not model:\n", @@ -98,7 +118,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.Config()\n", @@ -118,13 +142,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Outputs of this notebook will be stored here" + "Outputs of this notebook will be stored here:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out = {}\n", @@ -135,20 +163,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Data" + "## Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## MS proteomics" + "### MS proteomics or specified omics data\n", + "Aggregated from data splits of the imputation workflow run before." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = vaep.io.datasplits.DataSplits.from_folder(\n", @@ -158,7 +191,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "observed = pd.concat([data.train_X, data.val_y, data.test_y])\n", @@ -169,14 +206,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Clinical data" + "### Clinical data\n", + "Describe numerical data specified for use:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -189,8 +230,11 @@ { "cell_type": "code", "execution_count": null, - "id": "8dc0e77c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ## Additional annotations\n", @@ -221,7 +265,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_clinic[[args.target, *args.covar]].isna().any(axis=1).sum()" ] }, @@ -237,7 +285,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask_sample_with_complete_clinical_data = df_clinic[[args.target, *args.covar]].notna().all(axis=1)\n", @@ -254,8 +306,11 @@ { "cell_type": "code", "execution_count": null, - "id": "7d96716e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_clinic.loc[idx_complete_data, args.target].value_counts()" ] }, { "cell_type": "markdown", - "id": "5c0917f5", "metadata": {}, "source": [ - "check which patients with kleiner score have misssing covariates" + "Check which patients with kleiner score have missing covariates:" ] }, { "cell_type": "code", "execution_count": null, - "id": "4ec28834", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [
"df_clinic.loc[(~mask_sample_with_complete_clinical_data\n", @@ -283,7 +340,6 @@ }, { "cell_type": "markdown", - "id": "0be92801", "metadata": {}, "source": [ "Save feature frequency of observed data based on complete clinical data" @@ -292,8 +348,11 @@ { "cell_type": "code", "execution_count": null, - "id": "220ee009", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_observed = observed.unstack().loc[idx_complete_data].notna().sum()\n", @@ -311,13 +370,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## ALD study approach using all measurments" + "## ALD study approach using all measurements\n", + "Use parameters as specified in [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study)." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "DATA_COMPLETENESS = 0.6\n", @@ -334,7 +398,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_qc_samples:\n", @@ -357,7 +425,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -371,63 +442,51 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## load model predictions for (real) missing data" + "## Load model predictions for (real) missing data\n", + "Load from:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list(args.out_preds.iterdir())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0904ba3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ + "# available_files = list(args.out_preds.iterdir())\n", "template_pred = str(args.out_preds / args.template_pred)\n", - "template_pred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ "fname = args.out_preds / args.template_pred.format(args.model)\n", "fname" ] }, { "cell_type": "markdown", - "id": "6e514a17", + "id": "e7a55383", "metadata": { "lines_to_next_cell": 0 }, "source": [ - "Baseline comparison\n", - "In case of RSN -> use filtering as done in original paper (Niu et al. 2022)\n", - "otherwise -> use all data\n", + "Baseline comparison:\n", + "- in case of RSN -> use filtering as done in original [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study) (Niu et al. 
2022)\n", + "- otherwise -> use all data\n", "\n", - "- use columns which are provided by model" + "Use columns which are provided by model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "# ALD study approach -> has access to simulated missing data!\n", - "# (VAE model did not see this data)\n", "pred_real_na = None\n", "if args.model_key and str(args.model_key) != 'None':\n", " pred_real_na = (vaep\n", @@ -452,13 +511,17 @@ "lines_to_next_cell": 2 }, "source": [ - "plot subsets to highlight differences" + "Plot unchanged observed intensities against the imputed intensity distribution (if available):" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "def plot_distributions(observed: pd.Series,\n", @@ -496,13 +559,13 @@ " ax.set_ylabel('observations')\n", " ax.locator_params(axis='y', integer=True)\n", " ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - " return fig\n", + " return fig, bins\n", "\n", "\n", "vaep.plotting.make_large_descriptors(6)\n", - "fig = plot_distributions(observed,\n", - " imputation=pred_real_na,\n", - " model_key=args.model_key, figsize=(2.5, 2))\n", + "fig, bins = plot_distributions(observed,\n", + " imputation=pred_real_na,\n", + " model_key=args.model_key, figsize=(2.5, 2))\n", "fname = args.out_folder / 'dist_plots' / f'real_na_obs_vs_{args.model_key}.pdf'\n", "files_out[fname.name] = fname.as_posix()\n", "vaep.savefig(fig, name=fname)" ] }, { "cell_type": "markdown", + "id": "a8394517", "metadata": {}, "source": [ - "## Mean shift by model" + "Dump histogram bin counts to file for reporting (if imputed values are used)" ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "if pred_real_na is not None:\n", + " counts_per_bin = pd.concat([\n", + " vaep.pandas.get_counts_per_bin(observed.to_frame('observed'), bins=bins),\n", + " vaep.pandas.get_counts_per_bin(pred_real_na.to_frame(args.model_key), bins=bins)\n", + " ], axis=1)\n", + " counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", + " logger.info(\"Counts per bin saved to %s\", fname.with_suffix('.xlsx'))\n", + " display(counts_per_bin)" + ] + }, + { + "cell_type": "markdown", "metadata": {}, + "source": [ + "## Mean shift by model\n", + "Compare how imputed values are shifted in comparison to the overall distribution.\n", + "\n", + "First by using all intensities without any grouping:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if pred_real_na is not None:\n", @@ -531,13 +629,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Or by averaging over the calculation by sample" + "Then by averaging over the calculation by sample:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if pred_real_na is not None:\n", @@ -557,14 +659,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Differential analysis\n", - "Impute missing values (or not)" + "## Differential analysis\n", + "Combine observed and imputed data (if available) for differential analysis:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ]
+ }, "outputs": [], "source": [ "df = pd.concat([observed, pred_real_na]).unstack()\n", @@ -574,11 +680,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# * if some features were not imputed -> drop them\n", - "? could be changed: let a model decide if a feature should be imputed, otherwise don't.\n", + "# ? could be changed: let a model decide if a feature should be imputed, otherwise don't.\n", "if pred_real_na is not None:\n", " if df.isna().sum().sum():\n", " logger.warning(\"DataFrame has missing entries after imputation.\")\n", @@ -590,14 +700,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Targets - Clinical variables" + "Results for target and clinical variables:" ] }, { "cell_type": "code", "execution_count": null, - "id": "fe32bcf6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores = njab.stats.ancova.AncovaAll(df_proteomics=df,\n", @@ -606,16 +719,6 @@ " covar=args.covar,\n", " value_name=args.value_name\n", " ).ancova()\n", - "scores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9632495", - "metadata": {}, - "outputs": [], - "source": [ "# features are in first index position\n", "feat_idx = scores.index.get_level_values(0)\n", "if gene_to_PG is not None:\n", @@ -626,11 +729,21 @@ "scores" ] }, + { + "cell_type": "markdown", + "id": "bd02d010", + "metadata": {}, + "source": [ + "Only for target:" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -639,11 +752,21 @@ "scores.loc[pd.IndexSlice[:, args.target], :]" ] }, + { + "cell_type": "markdown", + "id": "bc8f0344", + "metadata": {}, + "source": [ + "Save all results to file:" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -654,24 +777,26 @@ "fname" ] }, + { + "cell_type": "markdown", + "id": "3c3db9ea", + "metadata": {}, + "source": [ + "Saved files:" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 0 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "files_out" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3734882", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/project/10_1_ald_diff_analysis.py b/project/10_1_ald_diff_analysis.py index 712334711..9bb4de7f1 100644 --- a/project/10_1_ald_diff_analysis.py +++ b/project/10_1_ald_diff_analysis.py @@ -15,12 +15,12 @@ # %% [markdown] # # Differential Analysis - Compare model imputation with standard imputation # -# - load missing values predictions +# - load missing values predictions (if specified) # - leave all other values as they were # - compare missing values predicition by model with baseline method # (default: draw from shifted normal distribution. short RSN) -# %% +# %% tags=["hide-input"] import logging from pathlib import Path @@ -38,13 +38,14 @@ logger = vaep.logging.setup_nb_logger() logging.getLogger('fontTools').setLevel(logging.WARNING) -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() # %% [markdown] # ## Parameters +# Default and set parameters for the notebook. 
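The `args = dict(globals()).keys()` idiom above ("catch passed parameters") snapshots which names exist before the papermill parameters cell runs, so that `vaep.nb.get_params` can later treat every name defined afterwards as a parameter. A rough, hypothetical equivalent of that mechanism (the real implementation lives in the vaep package, is not shown in this patch, and may differ):

# snapshot of names defined so far (a copy, so later definitions are not visible)
before = set(dict(globals()).keys())

# papermill would overwrite values like this one in the tagged parameters cell
folder_experiment = "runs/appl_ald_data/plasma/proteinGroups"

# everything new (and not private) since the snapshot is collected as a parameter
params = {k: v for k, v in dict(globals()).items()
          if k not in before and not k.startswith("_") and k != "before"}
print(params)  # {'folder_experiment': 'runs/appl_ald_data/plasma/proteinGroups'}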
# %% tags=["parameters"] folder_experiment = "runs/appl_ald_data/plasma/proteinGroups" @@ -64,14 +65,16 @@ out_folder = 'diff_analysis' template_pred = 'pred_real_na_{}.csv' # fixed, do not change +# %% [markdown] +# Add set parameters to configuration -# %% +# %% tags=["hide-input"] if not model: model = model_key params = vaep.nb.get_params(args, globals=globals(), remove=True) params -# %% +# %% tags=["hide-input"] args = vaep.nb.Config() args.fn_clinical_data = Path(params["fn_clinical_data"]) args.folder_experiment = Path(params["folder_experiment"]) @@ -85,37 +88,39 @@ args # %% [markdown] -# Outputs of this notebook will be stored here +# Outputs of this notebook will be stored here: -# %% +# %% tags=["hide-input"] files_out = {} args.out_folder # %% [markdown] -# # Data +# ## Data # %% [markdown] -# ## MS proteomics +# ### MS proteomics or specified omics data +# Aggregated from data splits of the imputation workflow run before. -# %% +# %% tags=["hide-input"] data = vaep.io.datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) -# %% +# %% tags=["hide-input"] observed = pd.concat([data.train_X, data.val_y, data.test_y]) observed # %% [markdown] -# ## Clinical data +# ### Clinical data +# Describe numerical data specified for use: -# %% +# %% tags=["hide-input"] df_clinic = pd.read_csv(args.fn_clinical_data, index_col=0) df_clinic = df_clinic.loc[observed.index.levels[0]] cols_clinic = vaep.pandas.get_columns_accessor(df_clinic) df_clinic[[args.target, *args.covar]].describe() -# %% +# %% tags=["hide-input"] # ## Additional annotations # - additional annotations of features (e.g. gene names for protein groups) @@ -136,13 +141,13 @@ # - only complete data is used for Differential Analysis # - covariates are not imputed -# %% +# %% tags=["hide-input"] df_clinic[[args.target, *args.covar]].isna().any(axis=1).sum() # %% [markdown] # Data description of data used: -# %% +# %% tags=["hide-input"] mask_sample_with_complete_clinical_data = df_clinic[[args.target, *args.covar]].notna().all(axis=1) fname = args.out_folder / 'mask_sample_with_complete_clinical_data.csv' files_out[fname.name] = fname.as_posix() @@ -153,13 +158,13 @@ .index) df_clinic.loc[idx_complete_data, [args.target, *args.covar]].describe() -# %% +# %% tags=["hide-input"] df_clinic.loc[idx_complete_data, args.target].value_counts() # %% [markdown] -# check which patients with kleiner score have misssing covariates +# Check which patients with kleiner score have missing covariates: -# %% +# %% tags=["hide-input"] df_clinic.loc[(~mask_sample_with_complete_clinical_data & df_clinic[args.target].notna()), [args.target, *args.covar]] # %% [markdown] # Save feature frequency of observed data based on complete clinical data -# %% +# %% tags=["hide-input"] feat_freq_observed = observed.unstack().loc[idx_complete_data].notna().sum() feat_freq_observed.name = 'frequency' @@ -179,9 +184,10 @@ _ = ax.set_xticklabels([l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]) # %% [markdown] -# ## ALD study approach using all measurments +# ## ALD study approach using all measurements +# Use parameters as specified in [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study).
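A small worked example of the complete-case filter used above, where only samples with the target and every covariate present enter the differential analysis; the clinical values below are invented for illustration:

import pandas as pd

df_clinic = pd.DataFrame({"kleiner": [0, 2, None, 4], "age": [54, None, 61, 49]},
                         index=["s1", "s2", "s3", "s4"])
target, covar = "kleiner", ["age"]

# keep only samples where the target and all covariates are present
mask = df_clinic[[target, *covar]].notna().all(axis=1)
idx_complete_data = df_clinic.index[mask]
print(idx_complete_data.tolist())  # ['s1', 's4']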
-# %% +# %% tags=["hide-input"] DATA_COMPLETENESS = 0.6 # MIN_N_PROTEIN_GROUPS: int = 200 FRAC_PROTEIN_GROUPS: int = 0.622 @@ -192,7 +198,7 @@ ald_study -# %% +# %% tags=["hide-input"] if args.fn_qc_samples: # Move this to data-preprocessing qc_samples = pd.read_pickle(args.fn_qc_samples) @@ -208,7 +214,7 @@ ald_study -# %% +# %% tags=["hide-input"] fig, axes = vaep.plotting.plot_cutoffs(observed.unstack(), feat_completness_over_samples=cutoffs.feat_completness_over_samples, min_feat_in_sample=cutoffs.min_feat_in_sample) @@ -216,28 +222,22 @@ # %% [markdown] -# ## load model predictions for (real) missing data - -# %% -list(args.out_preds.iterdir()) +# ## Load model predictions for (real) missing data +# Load from: -# %% +# %% tags=["hide-input"] +# available_files = list(args.out_preds.iterdir()) template_pred = str(args.out_preds / args.template_pred) -template_pred - -# %% fname = args.out_preds / args.template_pred.format(args.model) fname # %% [markdown] -# Baseline comparison -# In case of RSN -> use filtering as done in original paper (Niu et al. 2022) -# otherwise -> use all data +# Baseline comparison: +# - in case of RSN -> use filtering as done in original [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study) (Niu et al. 2022) +# - otherwise -> use all data # -# - use columns which are provided by model -# %% -# ALD study approach -> has access to simulated missing data! -# (VAE model did not see this data) +# Use columns which are provided by model +# %% tags=["hide-input"] pred_real_na = None if args.model_key and str(args.model_key) != 'None': pred_real_na = (vaep @@ -257,10 +257,10 @@ # %% [markdown] -# plot subsets to highlight differences +# Plot unchanged observed intensities against the imputed intensity distribution (if available): -# %% +# %% tags=["hide-input"] def plot_distributions(observed: pd.Series, imputation: pd.Series = None, model_key: str = 'MODEL', @@ -296,30 +296,46 @@ def plot_distributions(observed: pd.Series, ax.set_ylabel('observations') ax.locator_params(axis='y', integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - return fig + return fig, bins vaep.plotting.make_large_descriptors(6) -fig = plot_distributions(observed, - imputation=pred_real_na, - model_key=args.model_key, figsize=(2.5, 2)) +fig, bins = plot_distributions(observed, - imputation=pred_real_na, + imputation=pred_real_na, + model_key=args.model_key, figsize=(2.5, 2)) fname = args.out_folder / 'dist_plots' / f'real_na_obs_vs_{args.model_key}.pdf' files_out[fname.name] = fname.as_posix() vaep.savefig(fig, name=fname) +# %% [markdown] +# Dump histogram bin counts to file for reporting (if imputed values are used) + +# %% tags=["hide-input"] +if pred_real_na is not None: + counts_per_bin = pd.concat([ + vaep.pandas.get_counts_per_bin(observed.to_frame('observed'), bins=bins), + vaep.pandas.get_counts_per_bin(pred_real_na.to_frame(args.model_key), bins=bins) + ], axis=1) + counts_per_bin.to_excel(fname.with_suffix('.xlsx')) + logger.info("Counts per bin saved to %s", fname.with_suffix('.xlsx')) + display(counts_per_bin) + # %% [markdown] # ## Mean shift by model +# Compare how imputed values are shifted in comparison to the overall distribution.
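The mean-shift comparison introduced here (and continued just below) relies on `vaep.imputation.compute_moments_shift`, whose exact output is not shown in this patch; the idea is to report how far imputed intensities sit below the observed ones, in units of the observed standard deviation (the quantity that RSN-style imputation fixes by construction). A hypothetical stand-in with made-up data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
observed = pd.Series(rng.normal(loc=25.0, scale=2.0, size=500))
imputed = pd.Series(rng.normal(loc=23.0, scale=1.0, size=100))

def moments_shift(obs: pd.Series, imp: pd.Series) -> pd.Series:
    # hypothetical stand-in for vaep.imputation.compute_moments_shift
    return pd.Series({"mean observed": obs.mean(), "mean imputed": imp.mean(),
                      "std observed": obs.std(), "std imputed": imp.std(),
                      "mean shift (in observed SDs)": (obs.mean() - imp.mean()) / obs.std()})

print(moments_shift(observed, imputed).round(2))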
+# +# First by using all intensities without any grouping: -# %% +# %% tags=["hide-input"] if pred_real_na is not None: shifts = (vaep.imputation.compute_moments_shift(observed, pred_real_na, names=('observed', args.model_key))) display(pd.DataFrame(shifts).T) # %% [markdown] -# Or by averaging over the calculation by sample +# Then by averaging over the calculation by sample: -# %% +# %% tags=["hide-input"] if pred_real_na is not None: index_level = 0 # per sample mean_by_sample = pd.DataFrame( @@ -333,16 +349,16 @@ def plot_distributions(observed: pd.Series, display(mean_by_sample) # %% [markdown] -# # Differential analysis -# Impute missing values (or not) +# ## Differential analysis +# Combine observed and imputed data (if available) for differential analysis: -# %% +# %% tags=["hide-input"] df = pd.concat([observed, pred_real_na]).unstack() df.loc[idx_complete_data] -# %% +# %% tags=["hide-input"] # * if some features were not imputed -> drop them -# ? could be changed: let a model decide if a feature should be imputed, otherwise don't. +# # ? could be changed: let a model decide if a feature should be imputed, otherwise don't. if pred_real_na is not None: if df.isna().sum().sum(): logger.warning("DataFrame has missing entries after imputation.") @@ -350,18 +366,15 @@ def plot_distributions(observed: pd.Series, df = df.dropna(axis=1) # %% [markdown] -# Targets - Clinical variables +# Results for target and clinical variables: -# %% +# %% tags=["hide-input"] scores = njab.stats.ancova.AncovaAll(df_proteomics=df, df_clinic=df_clinic, target=args.target, covar=args.covar, value_name=args.value_name ).ancova() -scores - -# %% # features are in first index position feat_idx = scores.index.get_level_values(0) if gene_to_PG is not None: @@ -371,20 +384,26 @@ def plot_distributions(observed: pd.Series, ) scores -# %% +# %% [markdown] +# Only for target: + +# %% tags=["hide-input"] scores.columns = pd.MultiIndex.from_product([[str(args.model_key)], scores.columns], names=('model', 'var')) scores.loc[pd.IndexSlice[:, args.target], :] +# %% [markdown] +# Save all results to file: -# %% +# %% tags=["hide-input"] fname = args.out_folder / 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl' files_out[fname.name] = fname.as_posix() fname.parent.mkdir(exist_ok=True, parents=True) scores.to_pickle(fname) fname +# %% [markdown] +# Saved files: -# %% +# %% tags=["hide-input"] files_out -# %% diff --git a/project/10_2_ald_compare_methods.ipynb b/project/10_2_ald_compare_methods.ipynb index 68ad9f949..02ff40448 100644 --- a/project/10_2_ald_compare_methods.ipynb +++ b/project/10_2_ald_compare_methods.ipynb @@ -7,38 +7,38 @@ "source": [ "# Compare outcomes from differential analysis based on different imputation methods\n", "\n", - "- load scores based on `16_ald_diff_analysis`" + "- load scores based on `10_1_ald_diff_analysis`" ] }, { "cell_type": "code", "execution_count": null, "id": "eec6f931-c04d-428c-b2b1-0424c50e6cd2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ + "import logging\n", "from pathlib import Path\n", "\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import seaborn as sns\n", + "from IPython.display import display\n", "\n", "import vaep\n", "import vaep.databases.diseases\n", + "\n", "logger = vaep.logging.setup_nb_logger()\n", "\n", "plt.rcParams['figure.figsize'] = (2, 2)\n", "fontsize = 5\n", - "vaep.plotting.make_large_descriptors(fontsize)" - ] - }, - { - "cell_type": "code", - "execution_count": null, 
- "id": "f4bf65da-0569-4a21-ba20-9cae7d3679e7", - "metadata": {}, - "outputs": [], - "source": [ + "vaep.plotting.make_large_descriptors(fontsize)\n", + "logging.getLogger('fontTools').setLevel(logging.ERROR)\n", + "\n", "# catch passed parameters\n", "args = None\n", "args = dict(globals()).keys()" @@ -46,18 +46,18 @@ }, { "cell_type": "markdown", - "id": "22c645fc-c7eb-4c69-a7aa-084fc733258f", + "id": "85f5f5d5", "metadata": {}, "source": [ - "## Parameters" + "## Parameters\n", + "Default and set parameters for the notebook." ] }, { "cell_type": "code", "execution_count": null, - "id": "978876d0-b3cc-4847-8eab-dc0b89ddbbcd", + "id": "f4bf65da-0569-4a21-ba20-9cae7d3679e7", "metadata": { - "lines_to_next_cell": 2, "tags": [ "parameters" ] @@ -70,6 +70,7 @@ "model_key = 'VAE'\n", "baseline = 'RSN'\n", "out_folder = 'diff_analysis'\n", + "selected_statistics = ['p-unc', '-Log10 pvalue', 'qvalue', 'rejected']\n", "\n", "disease_ontology = 5082 # code from https://disease-ontology.org/\n", "# split diseases notebook? Query gene names for proteins in file from uniprot?\n", @@ -77,25 +78,25 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "a8016d79-e41a-40a2-bcbf-e11711c33b7d", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "22c645fc-c7eb-4c69-a7aa-084fc733258f", + "metadata": {}, "source": [ - "params = vaep.nb.get_params(args, globals=globals())\n", - "params" + "Add set parameters to configuration" ] }, { "cell_type": "code", "execution_count": null, - "id": "5ded6640-99aa-4759-a8ef-b67029f22766", - "metadata": {}, + "id": "978876d0-b3cc-4847-8eab-dc0b89ddbbcd", + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ + "params = vaep.nb.get_params(args, globals=globals())\n", "args = vaep.nb.Config()\n", "args.folder_experiment = Path(params[\"folder_experiment\"])\n", "args = vaep.nb.add_default_paths(args,\n", @@ -109,53 +110,34 @@ " / params[\"out_folder\"]\n", " / params[\"target\"]\n", " / 'scores')\n", + "args.freq_features_observed = args.folder_experiment / 'freq_features_observed.csv'\n", "args" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "112dd9f2-6219-452a-9c6a-b1712dabb164", - "metadata": {}, - "outputs": [], - "source": [ - "files_in = {\n", - " 'freq_features_observed.csv': args.folder_experiment / 'freq_features_observed.csv',\n", - "}\n", - "files_in" - ] - }, { "cell_type": "markdown", - "id": "c74bcc21-3fb2-4b8d-823a-72a3b6b6e847", - "metadata": {}, - "source": [ - "## Excel file for exports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12009c54-c45f-4ee0-a9b3-b0e8e5f3cff2", + "id": "fc184dea", "metadata": {}, - "outputs": [], "source": [ - "files_out = dict()" + "### Excel file for exports" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ef71b04-d4a5-4def-ad63-866d8bba4a1e", - "metadata": {}, + "id": "a8016d79-e41a-40a2-bcbf-e11711c33b7d", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ + "files_out = dict()\n", "writer_args = dict(float_format='%.3f')\n", "\n", "fname = args.out_folder / 'diff_analysis_compare_methods.xlsx'\n", "files_out[fname.name] = fname\n", "writer = pd.ExcelWriter(fname)\n", - "fname" + "logger.info(\"Writing to excel file: %s\", fname)" ] }, { @@ -163,24 +145,27 @@ "id": "770d1f76-e86f-4ae3-9d7b-ceef9b9e9a22", "metadata": {}, "source": [ - "# Load scores" + "## Load scores" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "e8bbada7-8b8e-4399-b0d3-b66c40905839", + "cell_type": 
"markdown", + "id": "6b108869", "metadata": {}, - "outputs": [], "source": [ - "[x for x in args.scores_folder.iterdir() if 'scores' in str(x)]" + "### Load baseline model scores\n", + "Show all statistics, later use selected statistics" ] }, { "cell_type": "code", "execution_count": null, "id": "97221134-5f61-4158-bfc5-ea30077140b8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl'\n", @@ -188,11 +173,23 @@ "scores_baseline" ] }, + { + "cell_type": "markdown", + "id": "e49a8da2", + "metadata": {}, + "source": [ + "### Load selected comparison model scores" + ] + }, { "cell_type": "code", "execution_count": null, "id": "f0635e4d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.scores_folder / f'diff_analysis_scores_{args.model_key}.pkl'\n", @@ -200,22 +197,49 @@ "scores_model" ] }, + { + "cell_type": "markdown", + "id": "06b7e883", + "metadata": {}, + "source": [ + "### Combined scores\n", + "show only selected statistics for comparsion" + ] + }, { "cell_type": "code", "execution_count": null, "id": "373fdf65", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores = scores_model.join(scores_baseline, how='outer')[[args.baseline, args.model_key]]\n", + "scores = scores.loc[:, pd.IndexSlice[scores.columns.levels[0].to_list(),\n", + " args.selected_statistics]]\n", "scores" ] }, + { + "cell_type": "markdown", + "id": "b84a6e5a", + "metadata": {}, + "source": [ + "Models in comparison (name mapping)" + ] + }, { "cell_type": "code", "execution_count": null, "id": "34d243d1-3ab4-40e7-9eb8-f9efc828b82d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "models = vaep.nb.Config.from_dict(\n", @@ -223,21 +247,45 @@ "vars(models)" ] }, + { + "cell_type": "markdown", + "id": "dd7a560d", + "metadata": {}, + "source": [ + "## Describe scores" + ] + }, { "cell_type": "code", "execution_count": null, "id": "0fee8f5d-fa52-4369-a1f9-fcfd518ab6bd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores.describe()" ] }, + { + "cell_type": "markdown", + "id": "52ecc596", + "metadata": {}, + "source": [ + "### One to one comparison of by feature:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "c6e5a0a6-343b-4f07-8d9d-2cd5cf95ae1f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores = scores.loc[pd.IndexSlice[:, args.target], :]\n", @@ -245,22 +293,46 @@ "scores" ] }, + { + "cell_type": "markdown", + "id": "36e14580", + "metadata": {}, + "source": [ + "And the descriptive statistics\n", + "of the numeric values:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "0e45e80a-32d8-4c6c-b0a4-5ce8b7f9e121", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores.describe()" ] }, + { + "cell_type": "markdown", + "id": "e520d6dc", + "metadata": {}, + "source": [ + "and the boolean decision values" + ] + }, { "cell_type": "code", "execution_count": null, "id": "53bd5597-221c-4d54-abf2-82956db42594", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -279,10 +351,14 @@ "cell_type": "code", "execution_count": null, "id": "2a926ba1-0f3b-4089-a349-b6d66128cf37", - "metadata": {}, + 
"metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "freq_feat = pd.read_csv(files_in['freq_features_observed.csv'], index_col=0)\n", + "freq_feat = pd.read_csv(args.freq_features_observed, index_col=0)\n", "freq_feat.columns = pd.MultiIndex.from_tuples([('data', 'frequency'),])\n", "freq_feat" ] @@ -292,14 +368,18 @@ "id": "408eacfe-770f-42ff-9057-2a98274e1ae3", "metadata": {}, "source": [ - "# Compare shared features" + "## Compare shared features" ] }, { "cell_type": "code", "execution_count": null, "id": "5b2dfb0f-195b-4044-a228-2d784ea2a458", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores_common = (scores\n", @@ -311,11 +391,23 @@ "scores_common" ] }, + { + "cell_type": "markdown", + "id": "62a9eefd", + "metadata": {}, + "source": [ + "### Annotate decisions in Confusion Table style:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "80cf4145-070d-457a-bb74-ee64299809e7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "def annotate_decision(scores, model, model_column):\n", @@ -335,37 +427,54 @@ "annotations.value_counts()" ] }, + { + "cell_type": "markdown", + "id": "0942d395", + "metadata": {}, + "source": [ + "### List different decisions between models" + ] + }, { "cell_type": "code", "execution_count": null, "id": "cda5ffa4-9a97-4a49-aaba-34e83ef7940a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask_different = (\n", " (scores_common.loc[:, pd.IndexSlice[:, 'rejected']].any(axis=1))\n", " & ~(scores_common.loc[:, pd.IndexSlice[:, 'rejected']].all(axis=1))\n", ")\n", - "\n", - "scores_common.loc[mask_different]" + "_to_write = scores_common.loc[mask_different]\n", + "_to_write.to_excel(writer, 'differences', **writer_args)\n", + "logger.info(\"Writen to Excel file under sheet 'differences'.\")\n", + "_to_write" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "b9e2739b-a09a-4113-a3f8-f29d6ed398b7", + "cell_type": "markdown", + "id": "e8832084", "metadata": {}, - "outputs": [], "source": [ - "_to_write = scores_common.loc[mask_different]\n", - "_to_write.to_excel(writer, 'differences', **writer_args)" + "## Plot qvalues of both models with annotated decisions\n", + "\n", + "Prepare data for plotting (qvalues)" ] }, { "cell_type": "code", "execution_count": null, "id": "5b2e5341-b054-40c3-b45a-44ae6ca46cfb", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "var = 'qvalue'\n", @@ -385,14 +494,18 @@ "tags": [] }, "source": [ - "## Plot of intensities for most extreme example" + "List of features with the highest difference in qvalues" ] }, { "cell_type": "code", "execution_count": null, "id": "c7af4a70-aa43-4772-af00-d425f5ed249f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# should it be possible to run not only RSN?\n", @@ -405,19 +518,21 @@ "id": "e715954e-2d62-4cd9-b4b0-063524bca495", "metadata": {}, "source": [ - "## Differences plotted\n", - "\n", - "- first only using created annotations" + "### Differences plotted with created annotations" ] }, { "cell_type": "code", "execution_count": null, "id": "a20d356d-c397-4440-b70e-9d899aa200fd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "figsize = (2, 2)\n", + "figsize = (4, 4)\n", "size = 5\n", "fig, ax = plt.subplots(figsize=figsize)\n", "x_col = 
to_plot.columns[0]\n",
@@ -450,17 +565,20 @@
    "id": "d8849c76-c5f6-4618-87c0-f2635dc9ac66",
    "metadata": {},
    "source": [
-    "- showing how many features were measured (\"observed\")"
+    "- also showing how many features were measured (\"observed\") by circle size"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "1cf258f8-9dcb-40bb-af66-b600f8d413f6",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
-    "figsize = (2.5, 2.5)\n",
     "fig, ax = plt.subplots(figsize=figsize)\n",
     "ax = sns.scatterplot(data=to_plot,\n",
     "                     x=to_plot.columns[0],\n",
@@ -490,75 +608,64 @@
    "id": "1fee3a21-d8b3-40c6-aea2-4774dfe855ca",
    "metadata": {},
    "source": [
-    "# Only features contained in model"
+    "## Only features contained in model\n",
+    "- this block exists due to a specific part in the ALD analysis of the paper"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c4e23a01-fd37-4496-a518-445a9ef38db1",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
     "scores_model_only = scores.reset_index(level=-1, drop=True)\n",
-    "scores_model_only = (scores_model_only\n",
-    "                     .loc[\n",
-    "                         scores_model_only.index.difference(\n",
-    "                             scores_common.index),\n",
-    "                         args.model_key]\n",
-    "                     .sort_values(by='qvalue', ascending=True)\n",
-    "                     .join(freq_feat)\n",
-    "                     )\n",
-    "scores_model_only"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f668bef4-e2b9-46fb-828f-e7c6a0e23627",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "scores_model_only.rejected.value_counts()"
+    "_diff = scores_model_only.index.difference(scores_common.index)\n",
+    "if not _diff.empty:\n",
+    "    scores_model_only = (scores_model_only\n",
+    "                         .loc[\n",
+    "                             _diff,\n",
+    "                             args.model_key]\n",
+    "                         .sort_values(by='qvalue', ascending=True)\n",
+    "                         .join(freq_feat.squeeze().rename(freq_feat.columns.droplevel()[0])\n",
+    "                               )\n",
+    "                         )\n",
+    "    display(scores_model_only)\n",
+    "else:\n",
+    "    scores_model_only = None\n",
+    "    logger.info(\"No features only in new comparison model.\")\n",
+    "\n",
+    "if not _diff.empty:\n",
+    "    scores_model_only.to_excel(writer, 'only_model', **writer_args)\n",
+    "    display(scores_model_only.rejected.value_counts())\n",
+    "    scores_model_only_rejected = scores_model_only.loc[scores_model_only.rejected]\n",
+    "    scores_model_only_rejected.to_excel(\n",
+    "        writer, 'only_model_rejected', **writer_args)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "419ec0b5-24c4-4366-8a03-48fec3aeb29b",
+   "cell_type": "markdown",
+   "id": "78b2c336",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "scores_model_only.to_excel(writer, 'only_model', **writer_args)"
+    "## DISEASES DB lookup\n",
+    "\n",
+    "Query diseases database for gene associations with specified disease ontology id."
] }, { "cell_type": "code", "execution_count": null, - "id": "814b8f38-81ef-4546-9182-f65b124e8858", - "metadata": {}, - "outputs": [], - "source": [ - "scores_model_only_rejected = scores_model_only.loc[scores_model_only.rejected]\n", - "scores_model_only_rejected.to_excel(\n", - " writer, 'only_model_rejected', **writer_args)" - ] - }, - { - "cell_type": "markdown", - "id": "6868984c-1ebf-4183-bebe-35a48b92e479", + "id": "d93a9242-0ef4-4fc7-bd98-226a93639f58", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, - "source": [ - "# DISEASES DB lookup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d93a9242-0ef4-4fc7-bd98-226a93639f58", - "metadata": {}, "outputs": [], "source": [ "data = vaep.databases.diseases.get_disease_association(\n", @@ -582,7 +689,11 @@ "cell_type": "code", "execution_count": null, "id": "5c26415e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] }, @@ -590,7 +701,11 @@ "cell_type": "code", "execution_count": null, "id": "b68b43df", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_name = scores.index.names[0] # first index level is feature name\n", @@ -607,7 +722,11 @@ "cell_type": "code", "execution_count": null, "id": "f8d4a74d-5a9b-4d9b-9345-4288bb23e19f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "gene_to_PG = (scores.droplevel(\n", @@ -625,7 +744,11 @@ "cell_type": "code", "execution_count": null, "id": "d9e76def-b48a-458d-a90b-765e6e70f7a4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "disease_associations_all = data.join(\n", @@ -645,7 +768,11 @@ "cell_type": "code", "execution_count": null, "id": "8e9d6944-87ba-4c41-af14-fb5ed93262f0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = disease_associations_all.index.intersection(scores_model_only.index)\n", @@ -658,7 +785,11 @@ "cell_type": "code", "execution_count": null, "id": "ceefc483-b889-4bab-b207-c8d5fd97fa4a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = disease_assocications_new.loc[idx, 'score'] >= 2.0\n", @@ -677,7 +808,11 @@ "cell_type": "code", "execution_count": null, "id": "4644759b-8cc2-4f99-a16c-16419cfb915c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = disease_associations_all.index.intersection(\n", @@ -691,7 +826,11 @@ "cell_type": "code", "execution_count": null, "id": "365d8641-97a8-464f-b69b-270af9ae6e2d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = disease_assocications_new_rejected.loc[idx, 'score'] >= 2.0\n", @@ -723,7 +862,11 @@ "cell_type": "code", "execution_count": null, "id": "6416d494-5f3e-4cf4-b766-b1f95e40ae1c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = disease_associations_all.index.intersection(mask.index[mask])\n", @@ -736,7 +879,11 @@ "cell_type": "code", "execution_count": null, "id": "10899e28-4aee-4d44-a542-e45be6699a1b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = disease_assocications_shared_rejected_by_model.loc[idx, 'score'] >= 2.0\n", @@ -768,7 +915,11 @@ "cell_type": "code", "execution_count": null, "id": "7780de55-c63b-4028-a6d0-58bce7be81da", - "metadata": {}, + 
"metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = disease_associations_all.index.intersection(mask.index[mask])\n", @@ -783,7 +934,11 @@ "cell_type": "code", "execution_count": null, "id": "03115c8f-1f20-4b51-a78c-4d7c0317dc33", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = disease_assocications_shared_rejected_by_RSN.loc[idx, 'score'] >= 2.0\n", @@ -802,7 +957,11 @@ "cell_type": "code", "execution_count": null, "id": "91e7fbb7-69fd-4b4c-9bc2-40e8dd1907b3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "disease_associations_all.to_excel(\n", @@ -818,26 +977,21 @@ "id": "52a42028-7e2d-47d5-be02-52f7ff1f3665", "metadata": {}, "source": [ - "# Outputs" + "## Outputs" ] }, { "cell_type": "code", "execution_count": null, "id": "7f3a7433-3bf1-4168-8f16-eb6d415ef17f", - "metadata": {}, - "outputs": [], - "source": [ - "writer.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e59ff592-a399-4490-bf3f-7618abf73feb", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ + "writer.close()\n", "files_out" ] } diff --git a/project/10_2_ald_compare_methods.py b/project/10_2_ald_compare_methods.py index 55037a3de..9268c089c 100644 --- a/project/10_2_ald_compare_methods.py +++ b/project/10_2_ald_compare_methods.py @@ -15,30 +15,34 @@ # %% [markdown] # # Compare outcomes from differential analysis based on different imputation methods # -# - load scores based on `16_ald_diff_analysis` +# - load scores based on `10_1_ald_diff_analysis` -# %% +# %% tags=["hide-input"] +import logging from pathlib import Path import matplotlib.pyplot as plt import pandas as pd import seaborn as sns +from IPython.display import display import vaep import vaep.databases.diseases + logger = vaep.logging.setup_nb_logger() plt.rcParams['figure.figsize'] = (2, 2) fontsize = 5 vaep.plotting.make_large_descriptors(fontsize) +logging.getLogger('fontTools').setLevel(logging.ERROR) -# %% # catch passed parameters args = None args = dict(globals()).keys() # %% [markdown] # ## Parameters +# Default and set parameters for the notebook. # %% tags=["parameters"] folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups' @@ -47,17 +51,17 @@ model_key = 'VAE' baseline = 'RSN' out_folder = 'diff_analysis' +selected_statistics = ['p-unc', '-Log10 pvalue', 'qvalue', 'rejected'] disease_ontology = 5082 # code from https://disease-ontology.org/ # split diseases notebook? Query gene names for proteins in file from uniprot? 
annotaitons_gene_col = 'PG.Genes'

+# %% [markdown]
+# Add set parameters to configuration

-# %%
+# %% tags=["hide-input"]
params = vaep.nb.get_params(args, globals=globals())
-params
-
-# %%
args = vaep.nb.Config()
args.folder_experiment = Path(params["folder_experiment"])
args = vaep.nb.add_default_paths(args,
@@ -71,80 +75,99 @@
                          / params["out_folder"]
                          / params["target"]
                          / 'scores')
+args.freq_features_observed = args.folder_experiment / 'freq_features_observed.csv'
args

-# %%
-files_in = {
-    'freq_features_observed.csv': args.folder_experiment / 'freq_features_observed.csv',
-}
-files_in
-
# %% [markdown]
-# ## Excel file for exports
+# ### Excel file for exports

# %%
files_out = dict()
-
-# %%
writer_args = dict(float_format='%.3f')

fname = args.out_folder / 'diff_analysis_compare_methods.xlsx'
files_out[fname.name] = fname
writer = pd.ExcelWriter(fname)
-fname
+logger.info("Writing to excel file: %s", fname)

# %% [markdown]
-# # Load scores
+# ## Load scores

-# %%
-[x for x in args.scores_folder.iterdir() if 'scores' in str(x)]
+# %% [markdown]
+# ### Load baseline model scores
+# Show all statistics; later only the selected statistics are used.

-# %%
+# %% tags=["hide-input"]
fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl'
scores_baseline = pd.read_pickle(fname)
scores_baseline

-# %%
+# %% [markdown]
+# ### Load selected comparison model scores
+
+# %% tags=["hide-input"]
fname = args.scores_folder / f'diff_analysis_scores_{args.model_key}.pkl'
scores_model = pd.read_pickle(fname)
scores_model

-# %%
+# %% [markdown]
+# ### Combined scores
+# Show only the selected statistics for comparison.
+
+# %% tags=["hide-input"]
scores = scores_model.join(scores_baseline, how='outer')[[args.baseline, args.model_key]]
+scores = scores.loc[:, pd.IndexSlice[scores.columns.levels[0].to_list(),
+                                     args.selected_statistics]]
scores

-# %%
+# %% [markdown]
+# Models in comparison (name mapping)
+
+# %% tags=["hide-input"]
models = vaep.nb.Config.from_dict(
    vaep.pandas.index_to_dict(scores.columns.get_level_values(0)))
vars(models)

-# %%
+# %% [markdown]
+# ## Describe scores
+
+# %% tags=["hide-input"]
scores.describe()

-# %%
+# %% [markdown]
+# ### One-to-one comparison by feature:
+
+# %% tags=["hide-input"]
scores = scores.loc[pd.IndexSlice[:, args.target], :]
scores.to_excel(writer, 'scores', **writer_args)
scores

-# %%
+# %% [markdown]
+# And the descriptive statistics
+# of the numeric values:
+
+# %% tags=["hide-input"]
scores.describe()

-# %%
+# %% [markdown]
+# And the boolean decision values:
+
+# %% tags=["hide-input"]
scores.describe(include=['bool', 'O'])

# %% [markdown]
# ## Load frequencies of observed features

-# %%
-freq_feat = pd.read_csv(files_in['freq_features_observed.csv'], index_col=0)
+# %% tags=["hide-input"]
+freq_feat = pd.read_csv(args.freq_features_observed, index_col=0)
freq_feat.columns = pd.MultiIndex.from_tuples([('data', 'frequency'),])
freq_feat

# %% [markdown]
-# # Compare shared features
+# ## Compare shared features

-# %%
+# %% tags=["hide-input"]
scores_common = (scores
                 .dropna()
                 .reset_index(-1, drop=True)
@@ -154,7 +177,10 @@
scores_common

-# %%
+# %% [markdown]
+# ### Annotate decisions in Confusion Table style:
+
+# %% tags=["hide-input"]
def annotate_decision(scores, model, model_column):
    return scores[(model_column, 'rejected')].replace({False: f'{model} (no) ', True: f'{model} (yes)'})

@@ -171,19 +197,25 @@
annotations.name = 'Differential Analysis Comparison'
annotations.value_counts()

-# %%
+# %% 
[markdown]
+# ### List different decisions between models
+
+# %% tags=["hide-input"]
mask_different = (
    (scores_common.loc[:, pd.IndexSlice[:, 'rejected']].any(axis=1))
    & ~(scores_common.loc[:, pd.IndexSlice[:, 'rejected']].all(axis=1))
)
-
-scores_common.loc[mask_different]
-
-# %%
_to_write = scores_common.loc[mask_different]
_to_write.to_excel(writer, 'differences', **writer_args)
+logger.info("Written to Excel file under sheet 'differences'.")
+_to_write

-# %%
+# %% [markdown]
+# ## Plot qvalues of both models with annotated decisions
+#
+# Prepare data for plotting (qvalues)
+
+# %% tags=["hide-input"]
var = 'qvalue'
to_plot = [scores_common[v][var] for v in models.values()]
for s, k in zip(to_plot, models.keys()):
@@ -194,20 +226,18 @@
to_plot

# %% [markdown]
-# ## Plot of intensities for most extreme example
+# List of features with the highest difference in qvalues

-# %%
+# %% tags=["hide-input"]
# should it be possible to run not only RSN?
to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs()
to_plot.loc[mask_different].sort_values('diff_qvalue', ascending=False)

# %% [markdown]
-# ## Differences plotted
-#
-# - first only using created annotations
+# ### Differences plotted with created annotations

-# %%
-figsize = (2, 2)
+# %% tags=["hide-input"]
+figsize = (4, 4)
size = 5
fig, ax = plt.subplots(figsize=figsize)
x_col = to_plot.columns[0]
@@ -235,10 +265,9 @@
vaep.savefig(fig, name=fname)

# %% [markdown]
-# - showing how many features were measured ("observed")
+# - also showing how many features were measured ("observed") by circle size

-# %%
-figsize = (2.5, 2.5)
+# %% tags=["hide-input"]
fig, ax = plt.subplots(figsize=figsize)
ax = sns.scatterplot(data=to_plot,
                     x=to_plot.columns[0],
@@ -263,35 +292,39 @@
    fig, name=files_out[f'diff_analysis_comparision_2_{args.model_key}'])

# %% [markdown]
-# # Only features contained in model
+# ## Only features contained in model
+# - this block exists due to a specific part in the ALD analysis of the paper

-# %%
+# %% tags=["hide-input"]
scores_model_only = scores.reset_index(level=-1, drop=True)
-scores_model_only = (scores_model_only
-                     .loc[
-                         scores_model_only.index.difference(
-                             scores_common.index),
-                         args.model_key]
-                     .sort_values(by='qvalue', ascending=True)
-                     .join(freq_feat)
-                     )
-scores_model_only
+_diff = scores_model_only.index.difference(scores_common.index)
+if not _diff.empty:
+    scores_model_only = (scores_model_only
+                         .loc[
+                             _diff,
+                             args.model_key]
+                         .sort_values(by='qvalue', ascending=True)
+                         .join(freq_feat.squeeze().rename(freq_feat.columns.droplevel()[0])
+                               )
+                         )
+    display(scores_model_only)
+else:
+    scores_model_only = None
+    logger.info("No features only in new comparison model.")

-# %%
-scores_model_only.rejected.value_counts()
-
-# %%
-scores_model_only.to_excel(writer, 'only_model', **writer_args)
-
-# %%
-scores_model_only_rejected = scores_model_only.loc[scores_model_only.rejected]
-scores_model_only_rejected.to_excel(
-    writer, 'only_model_rejected', **writer_args)
+if not _diff.empty:
+    scores_model_only.to_excel(writer, 'only_model', **writer_args)
+    display(scores_model_only.rejected.value_counts())
+    scores_model_only_rejected = scores_model_only.loc[scores_model_only.rejected]
+    scores_model_only_rejected.to_excel(
+        writer, 'only_model_rejected', **writer_args)

# %% [markdown]
-# # DISEASES DB lookup
+# ## 
DISEASES DB lookup +# +# Query diseases database for gene associations with specified disease ontology id. -# %% +# %% tags=["hide-input"] data = vaep.databases.diseases.get_disease_association( doid=args.disease_ontology, limit=10000) data = pd.DataFrame.from_dict(data, orient='index').rename_axis('ENSP', axis=0) @@ -303,9 +336,9 @@ def annotate_decision(scores, model, model_column): # ## Shared features # ToDo: new script -> DISEASES DB lookup -# %% +# %% tags=["hide-input"] -# %% +# %% tags=["hide-input"] feat_name = scores.index.names[0] # first index level is feature name if args.annotaitons_gene_col in scores.index.names: logger.info(f"Found gene annotation in scores index: {scores.index.names}") @@ -315,7 +348,7 @@ def annotate_decision(scores, model, model_column): import sys sys.exit(0) -# %% +# %% tags=["hide-input"] gene_to_PG = (scores.droplevel( list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col}) ) @@ -326,7 +359,7 @@ def annotate_decision(scores, model, model_column): ) gene_to_PG.head() -# %% +# %% tags=["hide-input"] disease_associations_all = data.join( gene_to_PG).dropna().reset_index().set_index(feat_name).join(annotations) disease_associations_all @@ -334,27 +367,27 @@ def annotate_decision(scores, model, model_column): # %% [markdown] # ## only by model -# %% +# %% tags=["hide-input"] idx = disease_associations_all.index.intersection(scores_model_only.index) disease_assocications_new = disease_associations_all.loc[idx].sort_values( 'score', ascending=False) disease_assocications_new.head(20) -# %% +# %% tags=["hide-input"] mask = disease_assocications_new.loc[idx, 'score'] >= 2.0 disease_assocications_new.loc[idx].loc[mask] # %% [markdown] # ## Only by model which were significant -# %% +# %% tags=["hide-input"] idx = disease_associations_all.index.intersection( scores_model_only_rejected.index) disease_assocications_new_rejected = disease_associations_all.loc[idx].sort_values( 'score', ascending=False) disease_assocications_new_rejected.head(20) -# %% +# %% tags=["hide-input"] mask = disease_assocications_new_rejected.loc[idx, 'score'] >= 2.0 disease_assocications_new_rejected.loc[idx].loc[mask] @@ -365,13 +398,13 @@ def annotate_decision(scores, model, model_column): mask = (scores_common[(str(args.model_key), 'rejected')] & mask_different) mask.sum() -# %% +# %% tags=["hide-input"] idx = disease_associations_all.index.intersection(mask.index[mask]) disease_assocications_shared_rejected_by_model = (disease_associations_all.loc[idx].sort_values( 'score', ascending=False)) disease_assocications_shared_rejected_by_model.head(20) -# %% +# %% tags=["hide-input"] mask = disease_assocications_shared_rejected_by_model.loc[idx, 'score'] >= 2.0 disease_assocications_shared_rejected_by_model.loc[idx].loc[mask] @@ -382,7 +415,7 @@ def annotate_decision(scores, model, model_column): mask = (scores_common[(str(args.baseline), 'rejected')] & mask_different) mask.sum() -# %% +# %% tags=["hide-input"] idx = disease_associations_all.index.intersection(mask.index[mask]) disease_assocications_shared_rejected_by_RSN = ( disease_associations_all @@ -390,14 +423,14 @@ def annotate_decision(scores, model, model_column): .sort_values('score', ascending=False)) disease_assocications_shared_rejected_by_RSN.head(20) -# %% +# %% tags=["hide-input"] mask = disease_assocications_shared_rejected_by_RSN.loc[idx, 'score'] >= 2.0 disease_assocications_shared_rejected_by_RSN.loc[idx].loc[mask] # %% [markdown] # ## Write to excel -# %% +# %% tags=["hide-input"] 
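+# Persist the disease-association tables to dedicated sheets of the shared Excel report: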
disease_associations_all.to_excel( writer, sheet_name='disease_assoc_all', **writer_args) disease_assocications_new.to_excel( @@ -406,10 +439,8 @@ def annotate_decision(scores, model, model_column): writer, sheet_name='disease_assoc_new_rejected', **writer_args) # %% [markdown] -# # Outputs +# ## Outputs -# %% +# %% tags=["hide-input"] writer.close() - -# %% files_out diff --git a/project/10_3_ald_ml_new_feat.ipynb b/project/10_3_ald_ml_new_feat.ipynb index 08b5a5985..8d9ebd273 100644 --- a/project/10_3_ald_ml_new_feat.ipynb +++ b/project/10_3_ald_ml_new_feat.ipynb @@ -3,29 +3,31 @@ { "cell_type": "markdown", "id": "d5f8edbd", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ - "# Compare outcomes from differential analysis based on different imputation methods\n", - "\n", - "- load scores based on `10_1_ald_diff_analysis.ipynb`\n", - "- compare performance for set of features included in original Study\n", - " to the set of features included in Niu. et. al 2022\n", - " (by lowering the threshold for feature completeness))\n", - "- RSN should be set as baseline if Niu et. al 2022 data is used\n", + "# Fit logistic regression model\n", "\n", - "This notebook could be adapted to compare\n", - "1. different set of features which were classified \"significant\" (is there signal)?" + "- based on different imputation methods\n", + "- baseline: reference\n", + "- model: any other selected imputation method" ] }, { "cell_type": "code", "execution_count": null, "id": "8d8c6764", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", "from pathlib import Path\n", + "from typing import List\n", "\n", "import matplotlib.pyplot as plt\n", "import njab.sklearn\n", @@ -47,16 +49,35 @@ "\n", "\n", "logger = vaep.logging.setup_nb_logger()\n", - "logging.getLogger('fontTools').setLevel(logging.ERROR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61a7353e", - "metadata": {}, - "outputs": [], - "source": [ + "logging.getLogger('fontTools').setLevel(logging.ERROR)\n", + "\n", + "\n", + "def parse_roc(*res: List[njab.sklearn.types.Results]) -> pd.DataFrame:\n", + " ret = list()\n", + " for _r in res:\n", + " _roc = (pd.DataFrame(_r.test.roc,\n", + " index='fpr tpr cutoffs'.split()\n", + " )).loc[['fpr', 'tpr']]\n", + " _roc = _roc.T\n", + " _roc.columns = pd.MultiIndex.from_product([[_r.name], _roc.columns])\n", + " ret.append(_roc)\n", + " ret = pd.concat(ret, axis=1)\n", + " return ret\n", + "\n", + "\n", + "def parse_prc(*res: List[njab.sklearn.types.Results]) -> pd.DataFrame:\n", + " ret = list()\n", + " for _r in res:\n", + " _prc = pd.DataFrame(_r.test.prc,\n", + " index='precision recall cutoffs'.split()\n", + " ).loc[['precision', 'recall']]\n", + " _prc = _prc.T.rename(columns={'recall': 'tpr'})\n", + " _prc.columns = pd.MultiIndex.from_product([[_r.name], _prc.columns])\n", + " ret.append(_prc)\n", + " ret = pd.concat(ret, axis=1)\n", + " return ret\n", + "\n", + "\n", "# catch passed parameters\n", "args = None\n", "args = dict(globals()).keys()" @@ -64,10 +85,11 @@ }, { "cell_type": "markdown", - "id": "139c9ae8", + "id": "e1e67f6d", "metadata": {}, "source": [ - "## Parameters" + "## Parameters\n", + "Default and set parameters for the notebook." 
] }, { @@ -101,20 +123,14 @@ "cell_type": "code", "execution_count": null, "id": "13538b85", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "params = vaep.nb.get_params(args, globals=globals())\n", - "params" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92a4a7c0", - "metadata": {}, - "outputs": [], - "source": [ "args = vaep.nb.Config()\n", "args.folder_experiment = Path(params[\"folder_experiment\"])\n", "args = vaep.nb.add_default_paths(args,\n", @@ -123,25 +139,18 @@ " / params[\"target\"]\n", " / f\"{params['baseline']}_vs_{params['model_key']}\"))\n", "args.update_from_dict(params)\n", + "files_out = dict()\n", "args" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "04ac25ed", - "metadata": {}, - "outputs": [], - "source": [ - "files_out = dict()" - ] - }, { "cell_type": "markdown", - "id": "07fb7cc9", + "id": "1ca8264e", "metadata": {}, "source": [ - "## Load target" + "## Load data\n", + "\n", + "### Load target" ] }, { @@ -165,14 +174,19 @@ "id": "02bbf2a2", "metadata": {}, "source": [ - "### Measured data" + "### MS proteomics or specified omics data\n", + "Aggregated from data splits of the imputation workflow run before." ] }, { "cell_type": "code", "execution_count": null, "id": "f4cd6005", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = vaep.io.datasplits.DataSplits.from_folder(\n", @@ -194,14 +208,19 @@ "id": "c79ad218", "metadata": {}, "source": [ - "### Load ALD data or create" + "### Select by ALD criteria\n", + "Use parameters as specified in [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study)." ] }, { "cell_type": "code", "execution_count": null, "id": "3038462c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "DATA_COMPLETENESS = 0.6\n", @@ -230,11 +249,23 @@ "ald_study" ] }, + { + "cell_type": "markdown", + "id": "e0e04598", + "metadata": {}, + "source": [ + "Number of complete cases which can be used:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "3a9e70e6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask_has_target = data.index.levels[0].intersection(target.index)\n", @@ -249,14 +280,18 @@ "id": "fcc05bf5", "metadata": {}, "source": [ - "### Load semi-supervised model imputations" + "### Load imputations from specified model" ] }, { "cell_type": "code", "execution_count": null, "id": "5f072d5f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.out_preds / args.template_pred.format(args.model_key)\n", @@ -266,11 +301,23 @@ "pred_real_na.sample(3)" ] }, + { + "cell_type": "markdown", + "id": "eadd9ea6", + "metadata": {}, + "source": [ + "### Load imputations from baseline model" + ] + }, { "cell_type": "code", "execution_count": null, "id": "0f2dd584", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.out_preds / args.template_pred.format(args.baseline)\n", @@ -283,7 +330,7 @@ "id": "8fa21c8b", "metadata": {}, "source": [ - "# Model predictions\n", + "## Modeling setup\n", "General approach:\n", " - use one train, test split of the data\n", " - select best 10 features from training data `X_train`, `y_train` before binarization of target\n", @@ -293,27 +340,45 @@ "Repeat general approach for\n", " 1. 
all original ALD data: all features used in original ALD study\n",
     " 2. all model data: all features available by using the self supervised deep learning model\n",
-    "3. newly available feat only: the subset of features available from the\n",
-    "self supervised deep learning model which were newly retained using the\n",
-    "new approach"
+    " 3. newly available feat only: the subset of features available from the\n",
+    "    self supervised deep learning model which were newly retained using the\n",
+    "    new approach\n",
+    "\n",
+    "All data:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "f457863e",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
     "X = pd.concat([data, pred_real_na]).unstack()\n",
     "X"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "0c92c7bd",
+   "metadata": {},
+   "source": [
+    "### Subset of data by ALD criteria"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "a387dd6f",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
     "# could be just observed, drop columns with missing values\n",
     "ald_study = pd.concat(\n",
     "    [ald_study.stack(),\n",
     "     pred_real_na_baseline.loc[\n",
     "        # only select columns in selected in ald_study\n",
-    "        pd.IndexSlice[:, ald_study.columns]\n",
+    "        pd.IndexSlice[:, pred_real_na.index.levels[-1].intersection(ald_study.columns)]\n",
     "    ]\n",
     "     ]\n",
     ").unstack()\n",
     "ald_study"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "f8c07f73",
+   "metadata": {},
+   "source": [
+    "Features which would not have been included using ALD criteria:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "12b9d002",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
     "new_features = X.columns.difference(ald_study.columns)\n",
@@ -344,14 +421,18 @@
    "id": "a8e67247-a2a1-4a2f-b838-0bdc9f40cfa9",
    "metadata": {},
    "source": [
-    "Binarize targets, but also keep groups for stratification\n"
+    "Binarize targets, but also keep groups for stratification"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "4aa1f404-427a-4e78-b98d-cb26bb1d1ec4",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
     "target_to_group = target.copy()\n",
@@ -364,54 +445,99 @@
    "id": "bfab754f",
    "metadata": {},
    "source": [
-    "## Best number of parameters by CV"
+    "## Determine best number of parameters by cross validation procedure\n",
+    "\n",
+    "using subset of data by ALD criteria:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "90e410d1",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
     "cv_feat_ald = njab.sklearn.find_n_best_features(X=ald_study, y=target, name=args.target,\n",
     "                                                groups=target_to_group)\n",
-    "cv_feat_ald = cv_feat_ald.groupby('n_features').agg(['mean', 'std'])\n",
+    "cv_feat_ald = (cv_feat_ald\n",
+    "               .drop('test_case', axis=1)\n",
+    "               .groupby('n_features')\n",
+    "               .agg(['mean', 'std']))\n",
     "cv_feat_ald"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b40fb391",
+   "metadata": {},
+   "source": [
+    "Using all data:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "988dea31",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
    "outputs": [],
    "source": [
     "cv_feat_all = njab.sklearn.find_n_best_features(X=X, y=target, name=args.target,\n",
     "                                                groups=target_to_group)\n",
-    "cv_feat_all = cv_feat_all.groupby('n_features').agg(['mean', 'std'])\n",
+    "cv_feat_all = cv_feat_all.drop('test_case', 
axis=1).groupby('n_features').agg(['mean', 'std'])\n", "cv_feat_all" ] }, + { + "cell_type": "markdown", + "id": "0029a621", + "metadata": {}, + "source": [ + "Using only new features:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "811f75d0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "cv_feat_new = njab.sklearn.find_n_best_features(X=X.loc[:, new_features],\n", " y=target, name=args.target,\n", " groups=target_to_group)\n", - "cv_feat_new = cv_feat_new.groupby('n_features').agg(['mean', 'std'])\n", + "cv_feat_new = cv_feat_new.drop('test_case', axis=1).groupby('n_features').agg(['mean', 'std'])\n", "cv_feat_new" ] }, + { + "cell_type": "markdown", + "id": "bd57bbac", + "metadata": {}, + "source": [ + "### Best number of features by subset of the data:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "72655713", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "n_feat_best = pd.DataFrame(\n", @@ -428,14 +554,20 @@ "id": "2efdc8bf", "metadata": {}, "source": [ - "## Train, test split" + "## Train, test split\n", + "Show number of cases in train and test data" ] }, { "cell_type": "code", "execution_count": null, "id": "dc3d3b21", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n", @@ -445,52 +577,37 @@ " stratify=target_to_group,\n", " random_state=42)\n", "idx_train = X_train.index\n", - "idx_test = X_test.index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3b4b394", - "metadata": {}, - "outputs": [], - "source": [ + "idx_test = X_test.index\n", + "\n", "njab.pandas.combine_value_counts(\n", " pd.concat([y_train, y_test],\n", " axis=1,\n", " ignore_index=True,\n", - " )\n", - " .rename(columns={0: 'train', 1: 'test'})\n", + " ).rename(columns={0: 'train', 1: 'test'})\n", ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "71879005", - "metadata": {}, - "outputs": [], - "source": [ - "y_train.value_counts()" - ] - }, { "cell_type": "markdown", - "id": "8b528b8e", + "id": "d3a33fb1", "metadata": {}, "source": [ "## Results\n", "\n", "- `run_model` returns dataclasses with the further needed results\n", - "- add mrmr selection of data (select best number of features to use instead of fixing it)" + "- add mrmr selection of data (select best number of features to use instead of fixing it)\n", + "\n", + "Save results for final model on entire data, new features and ALD study criteria selected data." 
] }, { "cell_type": "code", "execution_count": null, - "id": "baa9de8b", + "id": "d3b4b394", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -504,28 +621,8 @@ "results_model_full.name = f'{args.model_key} all'\n", "fname = args.out_folder / f'results_{results_model_full.name}.pkl'\n", "files_out[fname.name] = fname\n", - "vaep.io.to_pickle(results_model_full, fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18688a0c", - "metadata": {}, - "outputs": [], - "source": [ - "# all(results_model_full.test.roc.tpr\n", - "# ==\n", - "# vaep.sklearn.Results.from_pickle(fname).test.roc.tpr)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78e72950", - "metadata": {}, - "outputs": [], - "source": [ + "vaep.io.to_pickle(results_model_full, fname)\n", + "\n", "splits = Splits(X_train=X.loc[idx_train, new_features],\n", " X_test=X.loc[idx_test, new_features],\n", " y_train=y_train,\n", @@ -536,16 +633,8 @@ "results_model_new.name = f'{args.model_key} new'\n", "fname = args.out_folder / f'results_{results_model_new.name}.pkl'\n", "files_out[fname.name] = fname\n", - "vaep.io.to_pickle(results_model_new, fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "764ec22c", - "metadata": {}, - "outputs": [], - "source": [ + "vaep.io.to_pickle(results_model_new, fname)\n", + "\n", "splits_ald = Splits(\n", " X_train=ald_study.loc[idx_train],\n", " X_test=ald_study.loc[idx_test],\n", @@ -562,17 +651,21 @@ }, { "cell_type": "markdown", - "id": "790b1db5", + "id": "0ad96ff4", "metadata": {}, "source": [ - "### ROC-AUC" + "### ROC-AUC on test split" ] }, { "cell_type": "code", "execution_count": null, "id": "04b82583", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=figsize)\n", @@ -584,19 +677,49 @@ "vaep.savefig(fig, name=fname)" ] }, + { + "cell_type": "markdown", + "id": "9e35c686", + "metadata": {}, + "source": [ + "Data used to plot ROC:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "322281db", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "res = [results_ald_full, results_model_full, results_model_new]\n", + "\n", + "auc_roc_curve = parse_roc(*res)\n", + "auc_roc_curve.to_excel(fname.with_suffix('.xlsx'))\n", + "auc_roc_curve" + ] + }, { "cell_type": "markdown", "id": "46e9a3f2-89aa-4bd5-a083-d8e16815020a", "metadata": {}, "source": [ - "### Features selected" + "### Features selected for final models" ] }, { "cell_type": "code", "execution_count": null, "id": "9e1bb173", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "selected_features = pd.DataFrame(\n", @@ -620,14 +743,18 @@ "id": "ce227174", "metadata": {}, "source": [ - "### Precision-Recall plot" + "### Precision-Recall plot on test data" ] }, { "cell_type": "code", "execution_count": null, "id": "56ea0d50", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=figsize)\n", @@ -640,6 +767,30 @@ "vaep.savefig(fig, name=fname)" ] }, + { + "cell_type": "markdown", + "id": "bc9e560d", + "metadata": {}, + "source": [ + "Data used to plot PRC:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9f5e5ce", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "prec_recall_curve = 
parse_prc(*res)\n", + "prec_recall_curve.to_excel(fname.with_suffix('.xlsx'))\n", + "prec_recall_curve" + ] + }, { "cell_type": "markdown", "id": "0ddf0913", @@ -652,7 +803,11 @@ "cell_type": "code", "execution_count": null, "id": "6eb3ed77", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=figsize)\n", @@ -669,7 +824,11 @@ "cell_type": "code", "execution_count": null, "id": "64fee389", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=figsize)\n", @@ -686,17 +845,18 @@ "id": "545b7a34", "metadata": {}, "source": [ - "Options:\n", - "- F1 results for test data for best cutoff on training data?\n", - " (select best cutoff of training data, evaluate on test data)\n", - "- plot X_train PCA/UMAP, map X_test" + "Output files:" ] }, { "cell_type": "code", "execution_count": null, "id": "860e0d5e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out" diff --git a/project/10_3_ald_ml_new_feat.py b/project/10_3_ald_ml_new_feat.py index 886c3839c..40bef222b 100644 --- a/project/10_3_ald_ml_new_feat.py +++ b/project/10_3_ald_ml_new_feat.py @@ -13,20 +13,17 @@ # --- # %% [markdown] -# # Compare outcomes from differential analysis based on different imputation methods +# # Fit logistic regression model # -# - load scores based on `10_1_ald_diff_analysis.ipynb` -# - compare performance for set of features included in original Study -# to the set of features included in Niu. et. al 2022 -# (by lowering the threshold for feature completeness)) -# - RSN should be set as baseline if Niu et. al 2022 data is used -# -# This notebook could be adapted to compare -# 1. different set of features which were classified "significant" (is there signal)? +# - based on different imputation methods +# - baseline: reference +# - model: any other selected imputation method -# %% + +# %% tags=["hide-input"] import logging from pathlib import Path +from typing import List import matplotlib.pyplot as plt import njab.sklearn @@ -50,13 +47,40 @@ logger = vaep.logging.setup_nb_logger() logging.getLogger('fontTools').setLevel(logging.ERROR) -# %% + +def parse_roc(*res: List[njab.sklearn.types.Results]) -> pd.DataFrame: + ret = list() + for _r in res: + _roc = (pd.DataFrame(_r.test.roc, + index='fpr tpr cutoffs'.split() + )).loc[['fpr', 'tpr']] + _roc = _roc.T + _roc.columns = pd.MultiIndex.from_product([[_r.name], _roc.columns]) + ret.append(_roc) + ret = pd.concat(ret, axis=1) + return ret + + +def parse_prc(*res: List[njab.sklearn.types.Results]) -> pd.DataFrame: + ret = list() + for _r in res: + _prc = pd.DataFrame(_r.test.prc, + index='precision recall cutoffs'.split() + ).loc[['precision', 'recall']] + _prc = _prc.T.rename(columns={'recall': 'tpr'}) + _prc.columns = pd.MultiIndex.from_product([[_r.name], _prc.columns]) + ret.append(_prc) + ret = pd.concat(ret, axis=1) + return ret + + # catch passed parameters args = None args = dict(globals()).keys() # %% [markdown] # ## Parameters +# Default and set parameters for the notebook. 
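+#
+# A minimal sketch of how these defaults can be overridden when the notebook is
+# executed programmatically (assuming papermill, or a similar runner, injects
+# the parameters; the output path is illustrative):
+#
+# ```python
+# import papermill as pm
+#
+# pm.execute_notebook(
+#     '10_3_ald_ml_new_feat.ipynb',
+#     'runs/10_3_ald_ml_new_feat.ipynb',
+#     parameters=dict(baseline='RSN', model_key='VAE'),
+# )
+# ```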
# %% tags=["parameters"] folder_data: str = '' # specify data directory if needed @@ -74,11 +98,8 @@ template_pred = 'pred_real_na_{}.csv' # fixed, do not change -# %% +# %% tags=["hide-input"] params = vaep.nb.get_params(args, globals=globals()) -params - -# %% args = vaep.nb.Config() args.folder_experiment = Path(params["folder_experiment"]) args = vaep.nb.add_default_paths(args, @@ -87,13 +108,13 @@ / params["target"] / f"{params['baseline']}_vs_{params['model_key']}")) args.update_from_dict(params) -args - -# %% files_out = dict() +args # %% [markdown] -# ## Load target +# ## Load data +# +# ### Load target # %% target = pd.read_csv(args.fn_clinical_data, @@ -103,9 +124,10 @@ target # %% [markdown] -# ### Measured data +# ### MS proteomics or specified omics data +# Aggregated from data splits of the imputation workflow run before. -# %% +# %% tags=["hide-input"] data = vaep.io.datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) data = pd.concat([data.train_X, data.val_y, data.test_y]) @@ -115,9 +137,10 @@ # Get overlap between independent features and target # %% [markdown] -# ### Load ALD data or create +# ### Select by ALD criteria +# Use parameters as specified in [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study). -# %% +# %% tags=["hide-input"] DATA_COMPLETENESS = 0.6 MIN_N_PROTEIN_GROUPS: int = 200 FRAC_PROTEIN_GROUPS: int = 0.622 @@ -143,7 +166,10 @@ ald_study = ald_study.rename(columns=column_name_first_prot_to_pg) ald_study -# %% +# %% [markdown] +# Number of complete cases which can be used: + +# %% tags=["hide-input"] mask_has_target = data.index.levels[0].intersection(target.index) assert not mask_has_target.empty, f"No data for target: {data.index.levels[0]} and {target.index}" print( @@ -151,22 +177,25 @@ target, data, ald_study = target.loc[mask_has_target], data.loc[mask_has_target], ald_study.loc[mask_has_target] # %% [markdown] -# ### Load semi-supervised model imputations +# ### Load imputations from specified model -# %% +# %% tags=["hide-input"] fname = args.out_preds / args.template_pred.format(args.model_key) print(f"missing values pred. by {args.model_key}: {fname}") load_single_csv_pred_file = vaep.analyzers.compare_predictions.load_single_csv_pred_file pred_real_na = load_single_csv_pred_file(fname).loc[mask_has_target] pred_real_na.sample(3) -# %% +# %% [markdown] +# ### Load imputations from baseline model + +# %% tags=["hide-input"] fname = args.out_preds / args.template_pred.format(args.baseline) pred_real_na_baseline = load_single_csv_pred_file(fname) # .loc[mask_has_target] pred_real_na_baseline # %% [markdown] -# # Model predictions +# ## Modeling setup # General approach: # - use one train, test split of the data # - select best 10 features from training data `X_train`, `y_train` before binarization of target @@ -176,62 +205,83 @@ # Repeat general approach for # 1. all original ald data: all features justed in original ALD study # 2. all model data: all features available my using the self supervised deep learning model -# 3. newly available feat only: the subset of features available from the -# self supervised deep learning model which were newly retained using the -# new approach +# 3. 
newly available feat only: the subset of features available from the +# self supervised deep learning model which were newly retained using the +# new approach +# +# All data: -# %% +# %% tags=["hide-input"] X = pd.concat([data, pred_real_na]).unstack() X -# %% +# %% [markdown] +# ### Subset of data by ALD criteria + +# %% tags=["hide-input"] # could be just observed, drop columns with missing values ald_study = pd.concat( [ald_study.stack(), pred_real_na_baseline.loc[ # only select columns in selected in ald_study - pd.IndexSlice[:, ald_study.columns] + pd.IndexSlice[:, pred_real_na.index.levels[-1].intersection(ald_study.columns)] ] ] ).unstack() ald_study -# %% +# %% [markdown] +# Features which would not have been included using ALD criteria: + +# %% tags=["hide-input"] new_features = X.columns.difference(ald_study.columns) new_features # %% [markdown] # Binarize targets, but also keep groups for stratification -# -# %% +# %% tags=["hide-input"] target_to_group = target.copy() target = target >= args.cutoff_target pd.crosstab(target.squeeze(), target_to_group.squeeze()) # %% [markdown] -# ## Best number of parameters by CV +# ## Determine best number of parameters by cross validation procedure +# +# using subset of data by ALD criteria: -# %% +# %% tags=["hide-input"] cv_feat_ald = njab.sklearn.find_n_best_features(X=ald_study, y=target, name=args.target, groups=target_to_group) -cv_feat_ald = cv_feat_ald.groupby('n_features').agg(['mean', 'std']) +cv_feat_ald = (cv_feat_ald + .drop('test_case', axis=1) + .groupby('n_features') + .agg(['mean', 'std'])) cv_feat_ald -# %% +# %% [markdown] +# Using all data: + +# %% tags=["hide-input"] cv_feat_all = njab.sklearn.find_n_best_features(X=X, y=target, name=args.target, groups=target_to_group) -cv_feat_all = cv_feat_all.groupby('n_features').agg(['mean', 'std']) +cv_feat_all = cv_feat_all.drop('test_case', axis=1).groupby('n_features').agg(['mean', 'std']) cv_feat_all -# %% +# %% [markdown] +# Using only new features: + +# %% tags=["hide-input"] cv_feat_new = njab.sklearn.find_n_best_features(X=X.loc[:, new_features], y=target, name=args.target, groups=target_to_group) -cv_feat_new = cv_feat_new.groupby('n_features').agg(['mean', 'std']) +cv_feat_new = cv_feat_new.drop('test_case', axis=1).groupby('n_features').agg(['mean', 'std']) cv_feat_new -# %% +# %% [markdown] +# ### Best number of features by subset of the data: + +# %% tags=["hide-input"] n_feat_best = pd.DataFrame( {'ald': cv_feat_ald.loc[:, pd.IndexSlice[:, 'mean']].idxmax(), 'all': cv_feat_all.loc[:, pd.IndexSlice[:, 'mean']].idxmax(), @@ -242,8 +292,9 @@ # %% [markdown] # ## Train, test split +# Show number of cases in train and test data -# %% +# %% tags=["hide-input"] X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, target, @@ -253,25 +304,21 @@ idx_train = X_train.index idx_test = X_test.index -# %% njab.pandas.combine_value_counts( pd.concat([y_train, y_test], axis=1, ignore_index=True, - ) - .rename(columns={0: 'train', 1: 'test'}) + ).rename(columns={0: 'train', 1: 'test'}) ) - -# %% -y_train.value_counts() - # %% [markdown] # ## Results # # - `run_model` returns dataclasses with the further needed results # - add mrmr selection of data (select best number of features to use instead of fixing it) +# +# Save results for final model on entire data, new features and ALD study criteria selected data. 
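+#
+# For orientation, a sketch of the result objects handled below (attribute names
+# as used in this notebook; `Splits` and `run_model` are imported at the top of
+# the notebook and their definitions are not part of this diff, so the call
+# signature shown here is an assumption):
+#
+# ```python
+# splits = Splits(X_train=X.loc[idx_train], X_test=X.loc[idx_test],
+#                 y_train=y_train, y_test=y_test)
+# results = run_model(splits)    # assumed signature; returns a results object
+# results.name                   # label used for figures and pickle file names
+# results.selected_features      # features selected on the training split
+# results.test.roc               # (fpr, tpr, cutoffs) consumed by parse_roc
+# results.test.prc               # (precision, recall, cutoffs) consumed by parse_prc
+# ```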
-# %% +# %% tags=["hide-input"] splits = Splits(X_train=X.loc[idx_train], X_test=X.loc[idx_test], y_train=y_train, @@ -284,13 +331,6 @@ files_out[fname.name] = fname vaep.io.to_pickle(results_model_full, fname) - -# %% -# all(results_model_full.test.roc.tpr -# == -# vaep.sklearn.Results.from_pickle(fname).test.roc.tpr) - -# %% splits = Splits(X_train=X.loc[idx_train, new_features], X_test=X.loc[idx_test, new_features], y_train=y_train, @@ -303,7 +343,6 @@ files_out[fname.name] = fname vaep.io.to_pickle(results_model_new, fname) -# %% splits_ald = Splits( X_train=ald_study.loc[idx_train], X_test=ald_study.loc[idx_test], @@ -318,9 +357,9 @@ vaep.io.to_pickle(results_ald_full, fname) # %% [markdown] -# ### ROC-AUC +# ### ROC-AUC on test split -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(1, 1, figsize=figsize) plot_split_auc(results_ald_full.test, results_ald_full.name, ax) plot_split_auc(results_model_full.test, results_model_full.name, ax) @@ -330,9 +369,19 @@ vaep.savefig(fig, name=fname) # %% [markdown] -# ### Features selected +# Data used to plot ROC: -# %% +# %% tags=["hide-input"] +res = [results_ald_full, results_model_full, results_model_new] + +auc_roc_curve = parse_roc(*res) +auc_roc_curve.to_excel(fname.with_suffix('.xlsx')) +auc_roc_curve + +# %% [markdown] +# ### Features selected for final models + +# %% tags=["hide-input"] selected_features = pd.DataFrame( [results_ald_full.selected_features, results_model_full.selected_features, @@ -349,9 +398,9 @@ selected_features # %% [markdown] -# ### Precision-Recall plot +# ### Precision-Recall plot on test data -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(1, 1, figsize=figsize) ax = plot_split_prc(results_ald_full.test, results_ald_full.name, ax) @@ -361,10 +410,18 @@ files_out[fname.name] = fname vaep.savefig(fig, name=fname) +# %% [markdown] +# Data used to plot PRC: + +# %% tags=["hide-input"] +prec_recall_curve = parse_prc(*res) +prec_recall_curve.to_excel(fname.with_suffix('.xlsx')) +prec_recall_curve + # %% [markdown] # ## Train data plots -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(1, 1, figsize=figsize) ax = plot_split_prc(results_ald_full.train, results_ald_full.name, ax) @@ -374,7 +431,7 @@ files_out[fname.name] = fname vaep.savefig(fig, name=fname) -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(1, 1, figsize=figsize) plot_split_auc(results_ald_full.train, results_ald_full.name, ax) plot_split_auc(results_model_full.train, results_model_full.name, ax) @@ -384,10 +441,7 @@ vaep.savefig(fig, name=fname) # %% [markdown] -# Options: -# - F1 results for test data for best cutoff on training data? 
-# (select best cutoff of training data, evaluate on test data) -# - plot X_train PCA/UMAP, map X_test +# Output files: -# %% +# %% tags=["hide-input"] files_out diff --git a/project/10_4_ald_compare_single_pg.ipynb b/project/10_4_ald_compare_single_pg.ipynb index 10f97305a..6a1644d68 100644 --- a/project/10_4_ald_compare_single_pg.ipynb +++ b/project/10_4_ald_compare_single_pg.ipynb @@ -15,7 +15,11 @@ "cell_type": "code", "execution_count": null, "id": "4ffa9d4c-622f-46c3-847a-7f7474082ee4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", @@ -36,7 +40,11 @@ "logging.getLogger('fontTools').setLevel(logging.WARNING)\n", "\n", "plt.rcParams['figure.figsize'] = [4, 2.5] # [16.0, 7.0] , [4, 3]\n", - "vaep.plotting.make_large_descriptors(7)" + "vaep.plotting.make_large_descriptors(7)\n", + "\n", + "# catch passed parameters\n", + "args = None\n", + "args = dict(globals()).keys()" ] }, { @@ -47,18 +55,6 @@ "## Parameters" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6ab3869-cc83-47ed-8ce2-0c8a470b96a6", - "metadata": {}, - "outputs": [], - "source": [ - "# catch passed parameters\n", - "args = None\n", - "args = dict(globals()).keys()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -89,20 +85,14 @@ "cell_type": "code", "execution_count": null, "id": "b85c6c2a-146c-48bd-9d7b-1fe4eec8a6ae", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "params = vaep.nb.get_params(args, globals=globals())\n", - "params" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c833157-e36e-476b-a3bd-d604b962ef04", - "metadata": {}, - "outputs": [], - "source": [ "args = vaep.nb.Config()\n", "args.folder_experiment = Path(params[\"folder_experiment\"])\n", "args = vaep.nb.add_default_paths(args,\n", @@ -118,12 +108,22 @@ "args" ] }, + { + "cell_type": "markdown", + "id": "4036fc07", + "metadata": {}, + "source": [ + "Write outputs to excel" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "bc408ac3-85cc-4b39-be77-5202b23bbef7", + "id": "8c833157-e36e-476b-a3bd-d604b962ef04", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -131,14 +131,28 @@ "\n", "fname = args.out_folder / 'diff_analysis_compare_DA.xlsx'\n", "writer = pd.ExcelWriter(fname)\n", - "files_out[fname.name] = fname.as_posix()" + "files_out[fname.name] = fname.as_posix()\n", + "logger.info(\"Writing to excel file: %s\", fname)" + ] + }, + { + "cell_type": "markdown", + "id": "62d61673", + "metadata": {}, + "source": [ + "## Load scores\n", + "List dump of scores:" ] }, { "cell_type": "code", "execution_count": null, "id": "bcbd112b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "score_dumps = [fname for fname in Path(\n", @@ -146,25 +160,48 @@ "score_dumps" ] }, + { + "cell_type": "markdown", + "id": "18113565", + "metadata": {}, + "source": [ + "Load scores from dumps:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "d240a9b0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores = pd.concat([pd.read_pickle(fname) for fname in score_dumps], axis=1)\n", "scores" ] }, + { + "cell_type": "markdown", + "id": "1abb0f0b", + "metadata": {}, + "source": [ + "If reference dump is provided, add it to the scores" + ] + }, { "cell_type": "code", "execution_count": null, "id": "c92dae12", - "metadata": {}, + 
"metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "# Reference dump\n", "if args.ref_method_score:\n", " scores_reference = (pd\n", " .read_pickle(args.ref_method_score)\n", @@ -179,14 +216,18 @@ "id": "79746f59", "metadata": {}, "source": [ - "## Load frequencies of observed features" + "### Load frequencies of observed features" ] }, { "cell_type": "code", "execution_count": null, "id": "86ecc391", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.folder_experiment / 'freq_features_observed.csv'\n", @@ -195,11 +236,23 @@ "freq_feat" ] }, + { + "cell_type": "markdown", + "id": "641099cd", + "metadata": {}, + "source": [ + "### Assemble qvalues" + ] + }, { "cell_type": "code", "execution_count": null, "id": "54a41e86", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "qvalues = scores.loc[pd.IndexSlice[:, args.target],\n", @@ -215,11 +268,23 @@ "qvalues" ] }, + { + "cell_type": "markdown", + "id": "d024e94d", + "metadata": {}, + "source": [ + "### Assemble pvalues" + ] + }, { "cell_type": "code", "execution_count": null, "id": "0b2488e4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "pvalues = scores.loc[pd.IndexSlice[:, args.target],\n", @@ -235,11 +300,23 @@ "pvalues" ] }, + { + "cell_type": "markdown", + "id": "b3b02e5a", + "metadata": {}, + "source": [ + "### Assemble rejected features" + ] + }, { "cell_type": "code", "execution_count": null, "id": "ec12c234", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "da_target = scores.loc[pd.IndexSlice[:, args.target],\n", @@ -256,11 +333,23 @@ "count_rejected" ] }, + { + "cell_type": "markdown", + "id": "85e2a9c8", + "metadata": {}, + "source": [ + "### Tabulate rejected decisions by method:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "c9e0a7b4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! 
This uses implicitly that RSN is not available for some protein groups\n",
    "count_rejected_common.to_excel(writer, sheet_name='count_rejected_common')\n",
    "count_rejected_common"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4bf8e3e5",
   "metadata": {},
   "source": [
    "### Tabulate rejected decisions by method for newly included features (if available)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af1a13cb",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
   "outputs": [],
   "source": [
    "count_rejected_new = njab.pandas.combine_value_counts(da_target.loc[~mask_common].droplevel(-1, axis=1))\n",
    "count_rejected_new.to_excel(writer, sheet_name='count_rejected_new')\n",
    "count_rejected_new"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "db57d9f0",
+   "metadata": {},
+   "source": [
+    "### Tabulate rejected decisions by method for all features"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f76e8772",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
   "outputs": [],
   "source": [
    "da_target.to_excel(writer, sheet_name='equality_rejected_all')\n",
+    "logger.info(\"Written to sheet 'equality_rejected_all' in Excel file.\")\n",
    "da_target"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "e1ed7b09",
+   "metadata": {},
+   "source": [
+    "Tabulate the number of features with equal decisions across methods (`True`)\n",
+    "versus those whose decision varies depending on the method (`False`)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fa40ea2",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
   "outputs": [],
   "source": [
    "da_target_same = (da_target.sum(axis=1) == 0) | da_target.all(axis=1)\n",
    "da_target_same.value_counts()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "01bc6744",
+   "metadata": {},
+   "source": [
+    "List frequency of features with varying decisions"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22c37698",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
   "outputs": [],
   "source": [
    "feat_idx_w_diff = da_target_same[~da_target_same].index\n",
-    "feat_idx_w_diff"
+    "feat_idx_w_diff.to_frame()[['frequency']].reset_index(-1, drop=True)"
   ]
  },
  {
@@ -328,7 +467,11 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "ee57dfa9",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
   "outputs": [],
   "source": [
    "(qvalues\n",
@@ -344,12 +487,15 @@
    " .to_excel(writer, sheet_name='qvalues_diff_common')\n",
    " )\n",
    "\n",
-    "(qvalues\n",
-    " .loc[feat_idx_w_diff]\n",
-    " .loc[~mask_common]  # mask automatically aligned\n",
-    " .sort_values(('None', 'qvalue'))\n",
-    " .to_excel(writer, sheet_name='qvalues_diff_new')\n",
-    " )\n",
+    "try:\n",
+    "    (qvalues\n",
+    "     .loc[feat_idx_w_diff]\n",
+    "     .loc[~mask_common]\n",
+    "     .sort_values(('None', 'qvalue'))\n",
+    "     .to_excel(writer, sheet_name='qvalues_diff_new')\n",
+    "     )\n",
+    "except IndexError:\n",
+    "    print(\"No new features, or none with diverging decisions.\")\n",
    "writer.close()"
   ]
  },
@@ -366,7 +512,10 @@
   "execution_count": null,
   "id": "10092826",
   "metadata": {
-    "lines_to_next_cell": 2
+    "lines_to_next_cell": 2,
+    "tags": [
+     "hide-input"
+    ]
   },
   "outputs": [],
   "source": [
@@ -388,7 +537,11 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "624d3301",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
   "outputs": [],
   "source": [
    "target = pd.read_csv(args.fn_clinical_data,\n",
@@ -402,7 +555,11 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "a160ab0c",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
   "outputs": [],
   "source": [
    "target_to_group = target.copy()\n",
@@ -425,7 
+582,11 @@ "cell_type": "code", "execution_count": null, "id": "7083535b-9a06-479e-9909-935d49311b00", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = vaep.io.datasplits.DataSplits.from_folder(\n", @@ -447,7 +608,11 @@ "cell_type": "code", "execution_count": null, "id": "b8d183d5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_new_abundant = da_target.loc[~mask_common].any(axis=1)\n", @@ -459,7 +624,11 @@ "cell_type": "code", "execution_count": null, "id": "112a677c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_sel = feat_idx_w_diff.get_level_values(0)\n", @@ -471,7 +640,11 @@ "cell_type": "code", "execution_count": null, "id": "110c0f53", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = data.loc[:, feat_sel]\n", @@ -497,7 +670,11 @@ "cell_type": "code", "execution_count": null, "id": "cfd936e9-eb56-4fb7-8010-d68092b925ad", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# exclude 'None' as this is without imputation (-> data)\n", @@ -513,7 +690,10 @@ "execution_count": null, "id": "26ecc0ed-c550-4a40-802b-25962d7edf7e", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -540,7 +720,10 @@ "execution_count": null, "id": "e422a7a8", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -555,7 +738,11 @@ "cell_type": "code", "execution_count": null, "id": "a3294f6a-65f3-4793-ad0c-4dd8ff11be47", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = feat_sel[0]" @@ -565,7 +752,11 @@ "cell_type": "code", "execution_count": null, "id": "ee17d5eb-a132-4616-b505-4a68efa0e9e5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_observed = data[idx].dropna()\n", @@ -577,7 +768,10 @@ "execution_count": null, "id": "043395a7-fa33-490e-9d9c-f8071274f0b5", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -594,7 +788,10 @@ "execution_count": null, "id": "f813f693", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -620,7 +817,9 @@ "execution_count": null, "id": "d819b0e0", "metadata": { - "lines_to_next_cell": 0 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -728,11 +927,23 @@ " plt.close()" ] }, + { + "cell_type": "markdown", + "id": "f899bcf9", + "metadata": {}, + "source": [ + "Saved files:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "a4b042a1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out" diff --git a/project/10_4_ald_compare_single_pg.py b/project/10_4_ald_compare_single_pg.py index d07f19827..3bb21e39d 100644 --- a/project/10_4_ald_compare_single_pg.py +++ b/project/10_4_ald_compare_single_pg.py @@ -18,7 +18,7 @@ # - see differences in imputation for diverging cases # - dumps top5 -# %% +# %% tags=["hide-input"] import logging from pathlib import Path @@ -39,14 +39,13 @@ plt.rcParams['figure.figsize'] = [4, 2.5] # [16.0, 7.0] , [4, 3] vaep.plotting.make_large_descriptors(7) -# %% [markdown] -# ## Parameters - -# 
%% # catch passed parameters args = None args = dict(globals()).keys() +# %% [markdown] +# ## Parameters + # %% tags=["parameters"] folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups' fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv" @@ -62,11 +61,8 @@ ref_method_score = None # filepath to reference method score -# %% +# %% tags=["hide-input"] params = vaep.nb.get_params(args, globals=globals()) -params - -# %% args = vaep.nb.Config() args.folder_experiment = Path(params["folder_experiment"]) args = vaep.nb.add_default_paths(args, @@ -81,25 +77,37 @@ args.update_from_dict(params) args -# %% +# %% [markdown] +# Write outputs to excel + +# %% tags=["hide-input"] files_out = dict() fname = args.out_folder / 'diff_analysis_compare_DA.xlsx' writer = pd.ExcelWriter(fname) files_out[fname.name] = fname.as_posix() +logger.info("Writing to excel file: %s", fname) +# %% [markdown] +# ## Load scores +# List dump of scores: -# %% +# %% tags=["hide-input"] score_dumps = [fname for fname in Path( args.folder_scores).iterdir() if fname.suffix == '.pkl'] score_dumps -# %% +# %% [markdown] +# Load scores from dumps: + +# %% tags=["hide-input"] scores = pd.concat([pd.read_pickle(fname) for fname in score_dumps], axis=1) scores -# %% -# Reference dump +# %% [markdown] +# If reference dump is provided, add it to the scores + +# %% tags=["hide-input"] if args.ref_method_score: scores_reference = (pd .read_pickle(args.ref_method_score) @@ -109,15 +117,18 @@ logger.info(f'Added reference method scores from {args.ref_method_score}') # %% [markdown] -# ## Load frequencies of observed features +# ### Load frequencies of observed features -# %% +# %% tags=["hide-input"] fname = args.folder_experiment / 'freq_features_observed.csv' freq_feat = pd.read_csv(fname, index_col=0) freq_feat.columns = pd.MultiIndex.from_tuples([('data', 'frequency'),]) freq_feat -# %% +# %% [markdown] +# ### Assemble qvalues + +# %% tags=["hide-input"] qvalues = scores.loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'qvalue'] ].join(freq_feat @@ -130,7 +141,10 @@ qvalues.to_excel(writer, sheet_name='qvalues_all') qvalues -# %% +# %% [markdown] +# ### Assemble pvalues + +# %% tags=["hide-input"] pvalues = scores.loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'p-unc'] ].join(freq_feat @@ -143,7 +157,10 @@ pvalues.to_excel(writer, sheet_name='pvalues_all') pvalues -# %% +# %% [markdown] +# ### Assemble rejected features + +# %% tags=["hide-input"] da_target = scores.loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'rejected'] ].join(freq_feat @@ -157,7 +174,10 @@ count_rejected.to_excel(writer, sheet_name='count_rejected') count_rejected -# %% +# %% [markdown] +# ### Tabulate rejected decisions by method: + +# %% tags=["hide-input"] # # ! This uses implicitly that RSN is not available for some protein groups # # ! 
Make an explicit list of the 313 protein groups available in original data
mask_common = da_target.notna().all(axis=1)
@@ -165,27 +185,41 @@
 count_rejected_common.to_excel(writer, sheet_name='count_rejected_common')
 count_rejected_common
 
-# %%
+# %% [markdown]
+# ### Tabulate rejected decisions by method for newly included features (if available)
+
+# %% tags=["hide-input"]
 count_rejected_new = njab.pandas.combine_value_counts(da_target.loc[~mask_common].droplevel(-1, axis=1))
 count_rejected_new.to_excel(writer, sheet_name='count_rejected_new')
 count_rejected_new
 
-# %%
+# %% [markdown]
+# ### Tabulate rejected decisions by method for all features
+
+# %% tags=["hide-input"]
 da_target.to_excel(writer, sheet_name='equality_rejected_all')
+logger.info("Written to sheet 'equality_rejected_all' in Excel file.")
 da_target
 
-# %%
+# %% [markdown]
+# Tabulate the number of features with equal decisions across methods (`True`)
+# versus those whose decision varies depending on the method (`False`)
+
+# %% tags=["hide-input"]
 da_target_same = (da_target.sum(axis=1) == 0) | da_target.all(axis=1)
 da_target_same.value_counts()
 
-# %%
+# %% [markdown]
+# List frequency of features with varying decisions
+
+# %% tags=["hide-input"]
 feat_idx_w_diff = da_target_same[~da_target_same].index
-feat_idx_w_diff
+feat_idx_w_diff.to_frame()[['frequency']].reset_index(-1, drop=True)
 
 # %% [markdown]
 # Take only those features with differing decisions
 
-# %%
+# %% tags=["hide-input"]
 (qvalues
  .loc[feat_idx_w_diff]
  .sort_values(('None', 'qvalue'))
@@ -199,18 +233,21 @@
  .to_excel(writer, sheet_name='qvalues_diff_common')
  )
 
-(qvalues
- .loc[feat_idx_w_diff]
- .loc[~mask_common]  # mask automatically aligned
- .sort_values(('None', 'qvalue'))
- .to_excel(writer, sheet_name='qvalues_diff_new')
- )
+try:
+    (qvalues
+     .loc[feat_idx_w_diff]
+     .loc[~mask_common]
+     .sort_values(('None', 'qvalue'))
+     .to_excel(writer, sheet_name='qvalues_diff_new')
+     )
+except IndexError:
+    print("No new features, or none with diverging decisions.")
 writer.close()
 
 # %% [markdown]
 # ## Plots for inspecting imputations (for diverging decisions)
 
-# %%
+# %% tags=["hide-input"]
 if not args.make_plots:
     logger.warning("No plots requested.")
     import sys
@@ -220,14 +257,14 @@
 # %% [markdown]
 # ## Load target
 
-# %%
+# %% tags=["hide-input"]
 target = pd.read_csv(args.fn_clinical_data,
                      index_col=0,
                      usecols=[args.sample_id_col, args.target])
 target = target.dropna()
 target
 
-# %%
+# %% tags=["hide-input"]
 target_to_group = target.copy()
 target = target >= args.cutoff_target
 target = target.replace({False: f'{args.target} < {args.cutoff_target}',
@@ -238,7 +275,7 @@
 # %% [markdown]
 # ## Measurements
 
-# %%
+# %% tags=["hide-input"]
 data = vaep.io.datasplits.DataSplits.from_folder(
     args.data, file_format=args.file_format)
 
@@ -248,17 +285,17 @@
 # %% [markdown]
 # Plot all of the new protein groups that are significant at least once and are not already dumped.
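+#
+# (For illustration only: a tiny sketch with toy values of the selection rule
+# used next; `any(axis=1)` keeps a feature if at least one method rejected
+# the null hypothesis for it.)
+
+# %% tags=["hide-input"]
+_toy_da = pd.DataFrame({'DAE': [True, False], 'VAE': [False, False]},
+                       index=['pg1', 'pg2'])
+_toy_da.any(axis=1)  # pg1 -> True (would be plotted), pg2 -> False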
-# %%
+# %% tags=["hide-input"]
 feat_new_abundant = da_target.loc[~mask_common].any(axis=1)
 feat_new_abundant = feat_new_abundant.loc[feat_new_abundant].index.get_level_values(0)
 feat_new_abundant
 
-# %%
+# %% tags=["hide-input"]
 feat_sel = feat_idx_w_diff.get_level_values(0)
 feat_sel = feat_sel.union(feat_new_abundant)
 len(feat_sel)
 
-# %%
+# %% tags=["hide-input"]
 data = data.loc[:, feat_sel]
 data
 
@@ -272,7 +309,7 @@
 #
 # Load all prediction files and reshape
 
-# %%
+# %% tags=["hide-input"]
 # exclude 'None' as this is without imputation (-> data)
 model_keys = [k for k in qvalues.columns.get_level_values(0) if k != 'None']
 pred_paths = [
@@ -280,7 +317,7 @@
     for method in model_keys]
 pred_paths
 
-# %%
+# %% tags=["hide-input"]
 load_single_csv_pred_file = vaep.analyzers.compare_predictions.load_single_csv_pred_file
 pred_real_na = dict()
 for method in model_keys:
@@ -294,7 +331,7 @@
 # %% [markdown]
 # Once imputed, reduce to target samples only (samples with a target score)
 
-# %%
+# %% tags=["hide-input"]
 # select samples with target information
 data = data.loc[target.index]
 pred_real_na = pred_real_na.loc[target.index]
@@ -302,14 +339,14 @@
 
 # assert len(data) == len(pred_real_na)
 
-# %%
+# %% tags=["hide-input"]
 idx = feat_sel[0]
 
-# %%
+# %% tags=["hide-input"]
 feat_observed = data[idx].dropna()
 feat_observed
 
-# %%
+# %% tags=["hide-input"]
 # axes = axes.ravel()
 # args.out_folder.parent / 'intensity_plots'
 # each feature -> one plot?
@@ -318,7 +355,7 @@
 folder.mkdir(parents=True, exist_ok=True)
 
 
-# %%
+# %% tags=["hide-input"]
 min_y_int, max_y_int = vaep.plotting.data.get_min_max_iterable(
     [data.stack(), pred_real_na.stack()])
 min_max = min_y_int, max_y_int
@@ -331,7 +368,7 @@
 # %% [markdown]
 # ## Compare with target annotation
 
-# %%
+# %% tags=["hide-input"]
 # labels somehow?
 # target.replace({True: f' >={args.cutoff_target}', False: f'<{args.cutoff_target}'})
 
@@ -434,5 +471,9 @@ def get_centered_label(method, n, q):
         fig, name=fname)
     plt.close()
 
-# %%
+
+# %% [markdown]
+# Saved files:
+
+# %% tags=["hide-input"]
 files_out
diff --git a/project/README.md b/project/README.md
index 7c72014d2..eddd93109 100644
--- a/project/README.md
+++ b/project/README.md
@@ -27,7 +27,8 @@ or as long formated data.
 | sample_03 | Protein B | 0.2 |
 | sample_03 | Protein C | 0.1 |
 
-Currently `pickle`d and `csv` files are supported.
+Currently `pickle`d and `csv` files are supported. If you use csv files, make sure
+to set an index name for the columns (default: `Sample ID`); otherwise this is handled mostly automatically.
 
 Optionally, ThermoRawFileParser output can be used as metadata,
 along with further metadata, e.g. clinical metadata for each sample.
@@ -82,125 +83,58 @@ papermill 01_0_split_data.ipynb runs/experiment_03/%DATASET%/experiment_03_data
 
 tag | notebook | Description
 --- | --- | ---
+Tutorials |
+tut | 04_1_train_pimms_models.ipynb | Main tutorial showing the scikit-learn interface, partly with validation data
+tut | 04_1_train_DAE_VAE_wo_val_data.ipynb | Train DAE and VAE without validation data
+Single experiment |
 run | 01_0_split_data.ipynb | Create train, validation and test data splits
+run | 01_0_transform_data_to_wide_format.ipynb | Transform train split to wide format for R models
 run | 01_1_train_.ipynb | Train a single model e.g. 
(VAE, DAE, CF)
+run | 01_1_train_NAGuideR_methods.ipynb | Train supported R models
+run | 01_1_transfer_NAGuideR_pred.ipynb | Transfer R model predictions to correct format in Python
 run | 01_2_performance_plots.ipynb | Performance of single model run
 Grid search and best model analysis |
-grid | 02_1_aggregate_metrics.py.ipynb | Aggregate metrics
-grid | 02_2_aggregate_configs.py.ipynb | Aggregate model configurations
+grid | 02_1_{aggregate|join}_metrics.py.ipynb | Aggregate or join metrics
+grid | 02_2_{aggregate|join}_configs.py.ipynb | Aggregate or join model configurations
 grid | 02_3_grid_search_analysis.ipynb | Analyze different runs with varying hyperparameters on a dataset
 grid | 02_4_best_models_over_all_data | Show best models and best models across data types
 best | 03_1_best_models_comparison.ipynb | best model trained repeatedly or across datasets
-Applications |
-ald | 16_ald_data.ipynb | preprocess data -> could be move to data folder
-ald | 16_ald_diff_analysis.ipynb | differential analysis (DA), dump scores
-ald | 16_ald_compare_methods.ipynb | DA comparison between methods
-ald | 16_ald_ml_new_feat.ipynb | ML model comparison
-ald | 16_ald_compare_single_pg.ipynb | [DEV] Compare imputation for feat between methods (dist plots)
+Differential analysis workflow |
+ald | 10_0_ald_data.ipynb | preprocess data -> could be moved to the data folder
+ald | 10_1_ald_diff_analysis.ipynb | differential analysis (DA), dump scores
+ald | 10_2_ald_compare_methods.ipynb | DA comparison between methods
+ald | 10_3_ald_ml_new_feat.ipynb | ML model comparison
+ald | 10_4_ald_compare_single_pg.ipynb | Compare imputation for feat between methods (dist plots)
+ald | 10_5_comp_diff_analysis_repetitions.ipynb | [Not in workflow] Compare 10x repeated differential analysis workflow
+ald | 10_6_interpret_repeated_ald_da.py | [Not in workflow] Interpret 10x repeated differential analysis
+ald | 10_7_ald_reduced_dataset_plots.ipynb | [Not in workflow] Plots related to the reduced dataset (80% of the data)
+Data inspection and manipulations for experiments |
+data | 00_5_training_data_exploration.py | Inspect dataset
+data | 00_6_0_permute_data.ipynb | Permute data per column to check overfitting of models (mean unchanged per column)
+data | 00_8_add_random_missing_values.py | Script to add random missing values to ALD data
+Publication specific notebooks |
+pub | 03_2_best_models_comparison_fig2.ipynb | Best models comparison in Fig. 2
+pub | 03_3_combine_experiment_result_tables.ipynb | Combine HeLa experiment results for reporting
+pub | 03_4_join_tables.py | Combine ALD experiment results for reporting
+pub | 03_6_setup_comparison_rev3.py | Analyze setup of KNN comparison for rev 3
 Miscellaneous notebooks on different topics (partly exploration) |
 misc | misc_embeddings.ipynb | FastAI Embeddings
 misc | misc_illustrations.ipynb | Illustrations of certain concepts (e.g. 
draw from shifted random distribution)
 misc | misc_json_formats.ipynb | Investigate storing training data as json with correct encoding
-misc | misc_MaxQuantOutput.ipynb | \[documentation\] Analyze MQ output, show MaxQuantOutput class behaviour
-misc | misc_protein_support.ipynb | peptide sequences mapped to protein sequences
 misc | misc_pytorch_fastai_dataset.ipynb | Dataset functionality
 misc | misc_pytorch_fastai_dataloaders.ipynb | Dataloading functionality
 misc | misc_sampling_in_pandas.ipynb | How to sample in pandas
 
-# Notebook descriptions (To be completed)
+## KNN adhoc analysis using jupytext and papermill
 
-## Inspect dataset
-
-### `00_5_training_data_exploration.py`
-
-Can be execute manually
+Compare performance when splitting samples into train, validation and test sets.
+Use scikit-learn's `KNNImputer` as it is the easiest to tweak and understand.
 
 ```bash
-jupytext 00_5_training_data_exploration.py --to ipynb -o - | papermill - runs/example/00_5_training_data_exploration.ipynb -f config/single_dev_dataset/example/inspect_data.yaml
-```
-
-## Single experiment run
-### `01_0_split_data.ipynb`
-
-- select data according to procedure described in **Fig. S1**
-
-### `01_1_train_.ipynb`
-- notebooks for training model `X` (e.g. `VAE`, `DAE` or `CF`)
-
-### `01_2_performance_plots.ipynb`
-
-## Grid search and best model analysis
-
-### `02_1_aggregate_metrics.py.ipynb` and `02_1_join_metrics.py.ipynb`
-- helper script to collect `metrics`.
-### `02_2_aggregate_configs.py.ipynb` and `02_2_join_configs.py.ipynb`
-
-- helper script to collect `config`urations.
-
-### `02_3_grid_search_analysis.ipynb`
-
-- analyze different runs with varying hyperparameters on a single data set
-- run for each protein group, peptides and precursor data set
-
-### `02_4_best_models_over_all_data.ipynb`
-
-- show best models across data sets in grid search
-
-### `03_1_best_models_comparison.ipynb`
-
-## Misc
-
-### `misc_clustering_proteins.ipynb`
-
-- first PCA analysis of proteins from Annelaura
-
-### `misc_data_exploration_proteins.ipynb`
-
-### `misc_embeddings.ipynb`
-
-### `misc_illustrations.ipynb`
-- illustrations for presentations
-- e.g. 
shifted normal imputation - -### `misc_pytorch_fastai_dataloaders.ipynb` - -### `misc_pytorch_fastai_dataset.ipynb` -### `misc_id_mapper.ipynb` - -### `misc_json_formats.ipynb` - -### `run_ipynbs.py` - -### `misc_protein_support.ipynb` - -- map peptide sequences to protein sequences -- calculate some metrics - -### `misc_sampling_in_pandas.ipynb` - -### `misc_MaxQuantOutput.ipynb` -- misc - -### 01 Analysis Fasta - -#### `misc_FASTA_tryptic_digest.ipynb` - -- analysis FASTA file used for protein search - -#### `misc_FASTA_data_agg_by_gene.ipynb` - -- analysis of gene to protein mapping of fasta file - -### 02 Analysis dataset - -#### `erda_data_available.ipynb` -- analyze `count_all_peptides.json`: How many peptides are identified overall in all - processed files - -> erda notebook: `00_mq_count_peptides.ipynb` - -#### `misc_data_exploration_peptides.ipynb` -- finds files originationg from fractionation experiments -- plot mask indicating presence/abscence of peptide measurement in an experiment -- intensity log-transformation: +# classic: +jupytext --to ipynb -k - -o - 01_1_train_KNN.py | papermill - runs/rev3/01_1_train_KNN.ipynb +# train only on samples without simulated missing values, add simulated missing values to test and validation samples +jupytext --to ipynb -k - -o - 01_1_train_KNN_unique_samples.py | papermill - runs/rev3/01_1_train_KNN_unique_samples.ipynb +# new comparison (check if the old nb could be used for this purpose) +jupytext --to ipynb -k - -o - 01_3_revision3.py | papermill - runs/rev3/01_3_revision3.ipynb +``` \ No newline at end of file diff --git a/project/bin/create_qsub_commands.py b/project/bin/create_qsub_commands.py index d5b2445a9..0c79ce497 100755 --- a/project/bin/create_qsub_commands.py +++ b/project/bin/create_qsub_commands.py @@ -1,39 +1,39 @@ -# %% -from itertools import product - -# import subprocess -mnar_mcar = [25, 50, 75] -datasets = ["pg_m", "pg_l", "pep_m", "evi_m", "pep_l", "evi_l"] - -for dataset, perc in product(datasets, mnar_mcar): - print(f"# {dataset = } # {perc = }") - cmd = ( - "qsub bin/run_snakemake_cluster.sh" - f" -N sm_{dataset}_{perc}" - f" -v configfile=config/single_dev_dataset/mnar_mcar/{dataset}.yaml,prefix={dataset}_{perc}," - f"frac_mnar={perc/100:.2f}," - f"config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml," - f"config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml," - f"folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" - ) - print(cmd) - # subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE) - -# %% [markdown] -# Create local command to run on interactive node -print() -print("#" * 80) -print() -# %% -for dataset, perc in product(datasets, mnar_mcar): - cmd = ( - "snakemake -s workflow/Snakefile_v2" - f" --configfile config/single_dev_dataset/mnar_mcar/{dataset}.yaml" - f" --config frac_mnar={perc/100:.2f}" - f" config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml" - f" config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml" - f" folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" - " -c1" - ) - print(cmd) -# %% +# %% +from itertools import product + +# import subprocess +mnar_mcar = [25, 50, 75] +datasets = ["pg_m", "pg_l", "pep_m", "evi_m", "pep_l", "evi_l"] + +for dataset, perc in product(datasets, mnar_mcar): + print(f"# {dataset = } # {perc = }") + cmd = ( + "qsub bin/run_snakemake_cluster.sh" + f" -N sm_{dataset}_{perc}" + f" -v configfile=config/single_dev_dataset/mnar_mcar/{dataset}.yaml,prefix={dataset}_{perc}," + 
f"frac_mnar={perc/100:.2f}," + f"config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml," + f"config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml," + f"folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" + ) + print(cmd) + # subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE) + +# %% [markdown] +# Create local command to run on interactive node +print() +print("#" * 80) +print() +# %% +for dataset, perc in product(datasets, mnar_mcar): + cmd = ( + "snakemake -s workflow/Snakefile_v2.smk" + f" --configfile config/single_dev_dataset/mnar_mcar/{dataset}.yaml" + f" --config frac_mnar={perc/100:.2f}" + f" config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml" + f" config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml" + f" folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" + " -c1" + ) + print(cmd) +# %% diff --git a/project/bin/run_snakemake_cluster.sh b/project/bin/run_snakemake_cluster.sh index 20d24ff2c..bc96a46a6 100644 --- a/project/bin/run_snakemake_cluster.sh +++ b/project/bin/run_snakemake_cluster.sh @@ -48,7 +48,7 @@ echo config_train $config_train . ~/setup_conda.sh conda activate vaep -snakemake -s workflow/Snakefile_v2 --jobs 10 -k -p -c2 --latency-wait 60 --rerun-incomplete \ +snakemake -s workflow/Snakefile_v2.smk --jobs 10 -k -p -c2 --latency-wait 60 --rerun-incomplete \ --configfile $configfile \ --config frac_mnar=$frac_mnar folder_experiment=$folder_experiment config_split=$config_split config_train=$config_train \ --max-status-checks-per-second 0.1 \ diff --git a/project/config/alzheimer_study/README.md b/project/config/alzheimer_study/README.md new file mode 100644 index 000000000..b947673a0 --- /dev/null +++ b/project/config/alzheimer_study/README.md @@ -0,0 +1,10 @@ +# Alzheimer study configuration + +For [`workflow/Snakefile_v2.yaml`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_v2.smk): + +- [`config.yaml`](config.yaml) +- see comments in config for explanations. + +For [`workflow/Snakefile_ald_comparison](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_ald_comparison.smk): + +- [`comparison.yaml`](comparison.yaml) diff --git a/project/config/alzheimer_study/comparison.yaml b/project/config/alzheimer_study/comparison.yaml new file mode 100644 index 000000000..27030a7cf --- /dev/null +++ b/project/config/alzheimer_study/comparison.yaml @@ -0,0 +1,22 @@ +folder_experiment: runs/alzheimer_study +fn_clinical_data: https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/clinic_ml.csv +target: AD +covar: + AD: age,Kiel,Magdeburg,Sweden +cutoffs: + AD: 0.5 +disease_ontology: # code from https://disease-ontology.org/ + AD: 10652 # Alzheimer disease +f_annotations: null +annotaitons_gene_col: null +baseline: PI +ref_method_score: +make_plots: false +methods: + - Median + - CF + - DAE + - VAE + - QRILC + - TRKNN + - RF diff --git a/project/config/alzheimer_study/config.yaml b/project/config/alzheimer_study/config.yaml new file mode 100644 index 000000000..66e00ee0c --- /dev/null +++ b/project/config/alzheimer_study/config.yaml @@ -0,0 +1,79 @@ +# config for Snakefile_v2.smk +config_split: runs/alzheimer_study/split.yaml # ! will be build by workflow +config_train: runs/alzheimer_study/train_{model}.yaml # ! 
will be build by workflow +folder_experiment: runs/alzheimer_study # folder to save the results +fn_rawfile_metadata: https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv # metadata file +cuda: False # use GPU? +file_format: csv # intermediate file formats +split_data: # for 01_01_split_data.ipynb -> check parameters + FN_INTENSITIES: https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv + sample_completeness: 0.5 + feat_prevalence: 0.25 + column_names: + - protein groups + index_col: 0 + meta_cat_col: _collection site + meta_date_col: null # null if no date column, translated to None in Python + frac_mnar: 0.25 + frac_non_train: 0.1 +models: + - Median: # name used for model with this configuration + model: Median # model used + - CF: + model: CF # notebook: 01_1_train_{model}.ipynb will be 01_1_train_CF.ipynb + latent_dim: 50 + batch_size: 1024 + epochs_max: 100 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + latent_dim: 10 + batch_size: 64 + epochs_max: 300 + hidden_layers: "64" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + latent_dim: 10 + batch_size: 64 + epochs_max: 300 + hidden_layers: "64" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv + - KNN5: + model: KNN + neighbors: 5 + file_format: csv +NAGuideR_methods: + - BPCA + - COLMEDIAN + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + # - MICE-CART > 1h20min on GitHub small runner + # - MICE-NORM ~ 1h on GitHub small runner + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/knn_comparison/ald_pgs_all/README.md b/project/config/knn_comparison/ald_pgs_all/README.md index 95ea87933..2392be95c 100644 --- a/project/config/knn_comparison/ald_pgs_all/README.md +++ b/project/config/knn_comparison/ald_pgs_all/README.md @@ -3,5 +3,5 @@ for ALD protein groups dataset. ```bash -snakemake -s workflow/Snakefile_v2 --configfile config/knn_comparison/ald_pgs_all/config.yaml -p -c1 -n +snakemake -s workflow/Snakefile_v2.smk --configfile config/knn_comparison/ald_pgs_all/config.yaml -p -c1 -n ``` \ No newline at end of file diff --git a/project/config/knn_comparison/ald_pgs_all/config.yaml b/project/config/knn_comparison/ald_pgs_all/config.yaml index 1a0f21c94..d320c996a 100644 --- a/project/config/knn_comparison/ald_pgs_all/config.yaml +++ b/project/config/knn_comparison/ald_pgs_all/config.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_train: runs/knn_comparison/ald_pgs_all/configs_train/train_{model}.yaml config_split: runs/knn_comparison/ald_pgs_all/config_split.yaml folder_experiment: runs/knn_comparison/ald_pgs_all diff --git a/project/config/knn_comparison/hela_pgs_large/README.md b/project/config/knn_comparison/hela_pgs_large/README.md index b22dd0b33..d865857fe 100644 --- a/project/config/knn_comparison/hela_pgs_large/README.md +++ b/project/config/knn_comparison/hela_pgs_large/README.md @@ -3,5 +3,5 @@ for large protein groups HeLa dataset. 
```bash -snakemake -s workflow/Snakefile_v2 --configfile config/knn_comparison/hela_pgs_large/config.yaml -p -c1 -n +snakemake -s workflow/Snakefile_v2.smk --configfile config/knn_comparison/hela_pgs_large/config.yaml -p -c1 -n ``` \ No newline at end of file diff --git a/project/config/knn_comparison/hela_pgs_large/config.yaml b/project/config/knn_comparison/hela_pgs_large/config.yaml index 671a9c222..fbdd598b1 100644 --- a/project/config/knn_comparison/hela_pgs_large/config.yaml +++ b/project/config/knn_comparison/hela_pgs_large/config.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: config/knn_comparison/hela_pgs_large/split.yaml config_train: runs/knn_comparison/hela_pgs_large/configs_train/train_{model}.yaml folder_experiment: runs/knn_comparison/hela_pgs_large diff --git a/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml b/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml index 5712570f4..3530f1668 100644 --- a/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pg_l_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pg_l_50MNAR/train_{model}.yaml # ! will be build, should say model_key next folder_experiment: runs/mnar_mcar/pg_l_50MNAR diff --git a/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml b/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml index 657d3adaa..1e1af8e38 100755 --- a/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pg_m_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pg_m_50MNAR/train_{model}.yaml # ! will be build, should say model_key next folder_experiment: runs/mnar_mcar/pg_m_50MNAR diff --git a/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml b/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml index bf5623c7c..1f98afed4 100644 --- a/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pep_l_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pep_l_50MNAR/train_{model}.yaml # ! 
will be build, should say model_key next folder_experiment: runs/mnar_mcar/pep_l_50MNAR @@ -6,70 +6,70 @@ frac_mnar: 0.5 fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv file_format: csv split_data: - FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl - sample_completeness: 0.4 - feat_prevalence: 0.25 - index_col: 0 - meta_date_col: Content Creation Date - column_names: null + FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + index_col: 0 + meta_date_col: Content Creation Date + column_names: null models: - - Median: - model: Median # needs to set at least one parameter - - CF: - model: CF - file_format: csv - latent_dim: 50 - batch_size: 4096 - epochs_max: 30 - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - DAE: - model: DAE - file_format: csv - latent_dim: 50 - batch_size: 10 - epochs_max: 200 - hidden_layers: "1024" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - VAE: - model: VAE - file_format: csv - latent_dim: 10 - batch_size: 10 - epochs_max: 200 - hidden_layers: "512" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - KNN: - model: KNN - neighbors: 3 - file_format: csv + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 30 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 50 + batch_size: 10 + epochs_max: 200 + hidden_layers: "1024" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 10 + batch_size: 10 + epochs_max: 200 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv NAGuideR_methods: - # - BPCA # > 24h, killed - - COLMEDIAN - # - GSIMP # > 24h, killed - - IMPSEQ - - IMPSEQROB - # - IRM # > 24h, killed - - KNN_IMPUTE - # - LLS # error - # - MICE-CART # > 24h, killed - # - MICE-NORM # > 24h, killed - - MINDET - - MINIMUM - - MINPROB - - MLE - - MSIMPUTE - - MSIMPUTE_MNAR - - PI - - QRILC - # - RF # > 24h, killed - - ROWMEDIAN - # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds - - SVDMETHOD - # - TRKNN # > 24h, killed - - ZERO + # - BPCA # > 24h, killed + - COLMEDIAN + # - GSIMP # > 24h, killed + - IMPSEQ + - IMPSEQROB + # - IRM # > 24h, killed + - KNN_IMPUTE + # - LLS # error + # - MICE-CART # > 24h, killed + # - MICE-NORM # > 24h, killed + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + # - RF # > 24h, killed + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + # - TRKNN # > 24h, killed + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml b/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml index e633639b7..81a1f8c3c 100755 --- a/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pep_m_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pep_m_50MNAR/train_{model}.yaml # ! 
will be build, should say model_key next folder_experiment: runs/mnar_mcar/pep_m_50MNAR @@ -6,72 +6,72 @@ frac_mnar: 0.5 fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv file_format: csv split_data: - FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl - sample_completeness: 0.4 - feat_prevalence: 0.25 - select_N: 50 - index_col: 0 - meta_date_col: Content Creation Date - column_names: null + FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + select_N: 50 + index_col: 0 + meta_date_col: Content Creation Date + column_names: null models: - - Median: - model: Median # needs to set at least one parameter - - CF: - model: CF - file_format: csv - latent_dim: 50 - batch_size: 4096 - epochs_max: 30 - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - DAE: - model: DAE - file_format: csv - latent_dim: 75 - batch_size: 25 - patience: 50 - epochs_max: 200 - hidden_layers: "256_128" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - VAE: - model: VAE - file_format: csv - latent_dim: 50 - batch_size: 25 - epochs_max: 200 - hidden_layers: "256" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - KNN: - model: KNN - neighbors: 3 - file_format: csv + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 30 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 75 + batch_size: 25 + patience: 50 + epochs_max: 200 + hidden_layers: "256_128" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 50 + batch_size: 25 + epochs_max: 200 + hidden_layers: "256" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv NAGuideR_methods: - - BPCA - - COLMEDIAN - # - GSIMP > 24h, killed - - IMPSEQ - - IMPSEQROB - - IRM - - KNN_IMPUTE - - LLS - - MICE-CART - - MICE-NORM - - MINDET - - MINIMUM - - MINPROB - - MLE - - MSIMPUTE - - MSIMPUTE_MNAR - - PI - - QRILC - - RF - - ROWMEDIAN - - SEQKNN - - SVDMETHOD - - TRKNN - - ZERO + - BPCA + - COLMEDIAN + # - GSIMP > 24h, killed + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + - MICE-CART + - MICE-NORM + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF + - ROWMEDIAN + - SEQKNN + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml b/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml index 2241305f9..5bdf72f2c 100755 --- a/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pg_l_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pg_l_50MNAR/train_{model}.yaml # ! 
will be build, should say model_key next folder_experiment: runs/mnar_mcar/pg_l_50MNAR @@ -7,66 +7,66 @@ fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadat cuda: False file_format: csv split_data: - FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl - sample_completeness: 0.4 - feat_prevalence: 0.25 - index_col: 0 - meta_date_col: Content Creation Date + FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + index_col: 0 + meta_date_col: Content Creation Date models: - - Median: - model: Median - - CF: # 2min - model: CF - latent_dim: 50 - batch_size: 32768 - epochs_max: 100 - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - DAE: # 2min - model: DAE - latent_dim: 25 - batch_size: 64 - epochs_max: 100 - hidden_layers: "512" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - VAE: # 2min - model: VAE - latent_dim: 25 - batch_size: 64 - epochs_max: 50 - hidden_layers: "512" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - KNN: - model: KNN - neighbors: 3 - file_format: csv + - Median: + model: Median + - CF: # 2min + model: CF + latent_dim: 50 + batch_size: 32768 + epochs_max: 100 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: # 2min + model: DAE + latent_dim: 25 + batch_size: 64 + epochs_max: 100 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: # 2min + model: VAE + latent_dim: 25 + batch_size: 64 + epochs_max: 50 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv NAGuideR_methods: - - BPCA #6h41min - - COLMEDIAN - # - GSIMP # stopped after 24h - - IMPSEQ # 1min - - IMPSEQROB - - IRM # 7h52min - - KNN_IMPUTE - - LLS - # - MICE-CART # stopped after 24h - # - MICE-NORM # stopped after 24h - - MINDET - - MINIMUM - - MINPROB - - MLE - - MSIMPUTE - - MSIMPUTE_MNAR - - PI - - QRILC - - RF # 58min - - ROWMEDIAN - # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds - - SVDMETHOD # 16min - - TRKNN # 5h38min - - ZERO + - BPCA #6h41min + - COLMEDIAN + # - GSIMP # stopped after 24h + - IMPSEQ # 1min + - IMPSEQROB + - IRM # 7h52min + - KNN_IMPUTE + - LLS + # - MICE-CART # stopped after 24h + # - MICE-NORM # stopped after 24h + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF # 58min + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD # 16min + - TRKNN # 5h38min + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml b/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml index 40aa30bc3..41410ea0a 100644 --- a/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pg_m_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pg_m_50MNAR/train_{model}.yaml # ! 
will be build, should say model_key next
 folder_experiment: runs/mnar_mcar/pg_m_50MNAR
diff --git a/project/config/single_dev_dataset/proteinGroups_N50/README.md b/project/config/single_dev_dataset/proteinGroups_N50/README.md
new file mode 100644
index 000000000..4605516a0
--- /dev/null
+++ b/project/config/single_dev_dataset/proteinGroups_N50/README.md
@@ -0,0 +1,23 @@
+# Config files
+
+## Version 1 imputation workflow
+
+For [`workflow/Snakefile`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile)
+
+```bash
+config.yaml # main config
+split.yaml # split data config referenced in config.yaml
+train_CF.yaml # CF train config referenced in config.yaml
+train_DAE.yaml # DAE train config referenced in config.yaml
+train_KNN.yaml # KNN train config referenced in config.yaml
+train_Median.yaml # Median train config referenced in config.yaml
+train_VAE.yaml # VAE train config referenced in config.yaml
+```
+
+## Version 2 imputation workflow
+
+For [`workflow/Snakefile_v2.smk`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_v2.smk) only one config file is needed:
+
+```bash
+config_v2.yaml
+```
\ No newline at end of file
diff --git a/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml b/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml
index c7c084b1e..1c2c1a7e3 100644
--- a/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml
+++ b/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml
@@ -1,4 +1,4 @@
-# config for Snakefile_v2
+# config for Snakefile_v2.smk
 config_split: config/single_dev_dataset/proteinGroups_N50/split.yaml # ! will be build
 config_train: config/single_dev_dataset/proteinGroups_N50/train_{model}.yaml # ! will be build
 folder_experiment: runs/dev_dataset_small/proteinGroups_N50_Snakefile_v2
diff --git a/project/data/README.md b/project/data/README.md
index b509a16c4..b518521a4 100644
--- a/project/data/README.md
+++ b/project/data/README.md
@@ -1,3 +1,71 @@
 # Data Folder
 
-> Put you files here.
\ No newline at end of file
+> Put your files here.
+
+## Download development dataset
+
+The large development data sets can be obtained from PRIDE. An example for the protein
+groups level data is provided below and as an executable script. 
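+
+To just peek at the available metadata first, you can read the single metadata
+file directly (a sketch; the URL is the same one used by the script below):
+
+```python
+import pandas as pd
+
+url = ('https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/'
+       'PXD042233/pride_metadata.csv')
+pd.read_csv(url, index_col=0).head()
+```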
+
+### Download large development dataset
+Execute the script to download and save the large HeLa protein group data for instrument 6070:
+
+```bash
+python download_dev_dataset.py
+```
+
+This script contains the following code:
+
+```python
+import io
+import zipfile
+from pathlib import Path
+
+import pandas as pd
+import requests
+
+FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
+FILE = 'pride_metadata.csv'
+print(f'Fetch metadata: {FTP_FOLDER}/{FILE}')
+meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0)
+meta.sample(5, random_state=42).sort_index()
+idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index
+
+FILE = 'geneGroups_aggregated.zip'
+print(f"Fetch archive: {FTP_FOLDER}/{FILE}")
+r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900)
+with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive:
+    print('available files in archive:\n - ' + '\n - '.join(zip_archive.namelist()))
+    FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv'
+    print('\nread file:', FNAME)
+    with zip_archive.open(FNAME) as f:
+        df = pd.read_csv(f, index_col=0)
+
+# dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl
+FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long')
+FOLDER.mkdir(parents=True, exist_ok=True)
+fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv'
+df.loc[idx_6070].to_csv(fname)
+print(f'saved data to: {fname}')
+df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl'))
+print(f'saved data to: {fname.with_suffix(".pkl")}')
+# save metadata:
+fname = FOLDER / 'metadata.csv'
+meta.loc[idx_6070].to_csv(fname)
+print(f'saved metadata to: {fname}')
+```
+
+### Run snakemake workflow
+
+Then you will be able to run the snakemake workflow for the larger
+development dataset:
+
+```bash
+snakemake --configfile config/single_dev_dataset/proteinGroups/config.yaml -c1 -n
+```
+
+The smaller development data set on the protein groups level is also shipped with this
+repository and can be found in the [`dev_datasets/HeLa_6070`](dev_datasets/HeLa_6070/) folder.
+
+```bash
+snakemake -c1 -n
+```
diff --git a/project/data/download_dev_dataset.py b/project/data/download_dev_dataset.py
new file mode 100644
index 000000000..dec94bdde
--- /dev/null
+++ b/project/data/download_dev_dataset.py
@@ -0,0 +1,42 @@
+"""Download the development dataset of HeLa cells from PRIDE.
+
+Instrument: Q_Exactive_HF_X_Orbitrap_6070
+
+Can be adapted to save all instruments or other datasets. 
+""" +import io +import zipfile +from pathlib import Path + +import pandas as pd +import requests + +FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233' +FILE = 'pride_metadata.csv' +print(f'Fetch metadata: {FTP_FOLDER}/{FILE}') +meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0) +meta.sample(5, random_state=42).sort_index() +idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index + +FILE = 'geneGroups_aggregated.zip' +print(f"Fetch archive: {FTP_FOLDER}/{FILE}") +r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900) +with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive: + print('available files in archive' '\n - '.join(zip_archive.namelist())) + FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv' + print('\nread file:', FNAME) + with zip_archive.open(FNAME) as f: + df = pd.read_csv(f, index_col=0) + +# dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl +FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long') +FOLDER.mkdir(parents=True, exist_ok=True) +fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv' +df.loc[idx_6070].to_csv(fname) +print(f'saved data to: {fname}') +df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl')) +print(f'saved data to: {fname.with_suffix(".pkl")}') +# save metadata: +fname = FOLDER / 'metadata.csv' +meta.loc[idx_6070].to_csv(fname) +print(f'saved metadata to: {fname}') diff --git a/project/workflow/Snakefile b/project/workflow/Snakefile index 9aace9738..0667e4735 100644 --- a/project/workflow/Snakefile +++ b/project/workflow/Snakefile @@ -56,6 +56,8 @@ rule comparison: models=",".join(MODELS), err=f"{{folder_experiment}}/{nb_stem}.e", out=f"{{folder_experiment}}/{nb_stem}.o", + conda: + "envs/pimms.yaml" shell: "papermill {input.nb} {output.nb:q}" " -p fn_rawfile_metadata {params.meta_data:q}" @@ -94,6 +96,8 @@ rule transform_NAGuideR_predictions: folder_experiment="{folder_experiment}", # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#non-file-parameters-for-rules dumps_as_str=lambda wildcards, input: ",".join(input.dumps), + conda: + "envs/pimms.yaml" shell: "papermill {input.nb} {output.nb:q}" " -r folder_experiment {params.folder_experiment:q}" @@ -123,7 +127,7 @@ rule train_NAGuideR_model: # log: # err="{folder_experiment}/01_1_train_NAGuideR_{method}.log", conda: - "vaep" + "envs/trainRmodels.yaml" shell: "papermill {input.nb} {output.nb:q}" " -r train_split {input.train_split:q}" @@ -147,6 +151,8 @@ rule transform_data_to_wide_format: folder_experiment="{folder_experiment}", err=f"{{folder_experiment}}/{nb_stem}.e", out=f"{{folder_experiment}}/{nb_stem}.o", + conda: + "envs/pimms.yaml" shell: "papermill {input.nb} {output.nb:q}" " -r folder_experiment {params.folder_experiment:q}" @@ -174,7 +180,7 @@ rule train_models: # log: # err="{folder_experiment}/01_1_train_{model}.log", conda: - "vaep" + "envs/pimms.yaml" shell: "papermill {input.nb:q} {output.nb:q}" " -f {input.configfile:q}" @@ -203,6 +209,8 @@ rule create_splits: meta_data=config["fn_rawfile_metadata"], err=f"{{folder_experiment}}/{nb_stem}.e", out=f"{{folder_experiment}}/{nb_stem}.o", + conda: + "envs/pimms.yaml" shell: "papermill {input.nb} {output.nb}" " -f {input.configfile:q}" diff --git a/project/workflow/Snakefile_ald_comparison.smk b/project/workflow/Snakefile_ald_comparison.smk index 0b79e9c1b..63713da86 100644 --- a/project/workflow/Snakefile_ald_comparison.smk +++ b/project/workflow/Snakefile_ald_comparison.smk @@ -18,9 +18,7 @@ out_folder = 
folder_experiment + "/{out_folder}/{target}/"
 out_folder_two_methods_cp = out_folder + "{baseline}_vs_{model}/"
 
-target_cutoff = dict(kleiner="2")
-
-target = "kleiner"
+target = config["target"]
 
 all_methods = [config["baseline"], "None", *config["methods"]]
 
@@ -73,7 +71,8 @@ rule plot_intensities_for_diverging_results:
             out_folder=config["out_folder"],
         ),
         nb=nb,
-        fn_clinical_data="data/ALD_study/processed/ald_metadata_cli.csv",
+        # clinical data is copied into the experiment folder by copy_clinical_data
+        fn_clinical_data=f"{folder_experiment}/data/clinical_data.csv",
     output:
         diff_da=out_folder + "diff_analysis_compare_DA.xlsx",
         qvalues=out_folder + "qvalues_target.pkl",
@@ -106,7 +105,7 @@ rule ml_comparison:
         nb=nb,
         pred_base=folder_experiment + "/preds/pred_real_na_{baseline}.csv",
         pred_model=folder_experiment + "/preds/pred_real_na_{model}.csv",
-        fn_clinical_data="data/ALD_study/processed/ald_metadata_cli.csv",
+        fn_clinical_data=f"{folder_experiment}/data/clinical_data.csv",
     output:
         sel_feat=out_folder_two_methods_cp + "mrmr_feat_by_model.xlsx",
         nb=out_folder_two_methods_cp + nb,
@@ -157,24 +156,44 @@ rule compare_diff_analysis:
 
 ##########################################################################################
 # Scores for each model (method)
-nb = "10_1_ald_diff_analysis.ipynb"
+nb_stem = "10_1_ald_diff_analysis"
 
 
 rule differential_analysis:
     input:
-        nb=nb,
-        f_annotations=config["f_annotations"],
+        nb=f"{nb_stem}.ipynb",
+        fn_clinical_data=f"{folder_experiment}/data/clinical_data.csv",
     output:
         score=out_folder + "scores/diff_analysis_scores_{model}.pkl",
-        nb=out_folder + "scores/diff_analysis_{model}.ipynb",
+        nb=out_folder + f"scores/{nb_stem}_{{model}}.ipynb",
     params:
        covar=lambda wildcards: config["covar"][wildcards.target],
+        f_annotations=config["f_annotations"],
     shell:
         "papermill {input.nb} {output.nb}"
         f" -r folder_experiment {folder_experiment}"
-        " -r f_annotations {input.f_annotations}"
+        " -r fn_clinical_data {input.fn_clinical_data}"
+        " -p f_annotations {params.f_annotations}"
         " -r target {wildcards.target}"
         " -r covar {params.covar}"
         " -r model_key {wildcards.model}"
        " -r out_folder {wildcards.out_folder}"
         " && jupyter nbconvert --to html {output.nb}"
+
+
+##########################################################################################
+# Copy clinical metadata into the data folder of the experiment folder.
+# This also makes it possible to use clinical data from a remote location.
+
+rule copy_clinical_data:
+    output:
+        local_clinical_data = f"{folder_experiment}/data/clinical_data.csv",
+    params:
+        fn_clinical_data = config["fn_clinical_data"],
+    run:
+        import pandas as pd
+        # currently CSV only; could be extended for several file types (see the sketch after this rule)
+        df = pd.read_csv(params.fn_clinical_data)
+        df.to_csv(output.local_clinical_data, index=False)
+        # read_csv options such as index_col or usecols could be
+        # exposed via the config if needed
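The `run:` block above reads and writes CSV only. Extending it to several file types,
as the in-code comment suggests, could dispatch on the file suffix. A hypothetical
sketch; the helper `read_clinical_data` is not part of the codebase:

```python
from pathlib import Path

import pandas as pd


def read_clinical_data(fname: str) -> pd.DataFrame:
    """Read clinical metadata from CSV, TSV or Excel, based on the file suffix."""
    suffix = Path(fname).suffix.lower()
    if suffix == '.csv':
        return pd.read_csv(fname)
    if suffix in ('.tsv', '.txt'):
        return pd.read_csv(fname, sep='\t')
    if suffix in ('.xlsx', '.xls'):
        return pd.read_excel(fname)  # requires openpyxl for xlsx files
    raise ValueError(f'Unsupported clinical data format: {suffix}')
```
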
diff --git a/project/workflow/Snakefile_v2 b/project/workflow/Snakefile_v2.smk
similarity index 98%
rename from project/workflow/Snakefile_v2
rename to project/workflow/Snakefile_v2.smk
index bbd2a95b7..1db8774af 100644
--- a/project/workflow/Snakefile_v2
+++ b/project/workflow/Snakefile_v2.smk
@@ -215,13 +215,10 @@ rule dump_train_config:
         f.write("# Build in Snakemake workflow\n")
         yaml.dump(model_configs[wildcards.model], f, sort_keys=False)
 
-
 ##########################################################################################
-# Create Data splits
-# separate workflow by level -> provide custom configs
+# Create data splits
 nb_stem = "01_0_split_data"
-
 rule create_splits:
     input:
         nb=f"{nb_stem}.ipynb",
@@ -243,7 +240,7 @@
 ##########################################################################################
-# create config file dumps for each model
+# Create data-split configuration file dump
 
 
 rule dump_split_config:
diff --git a/project/workflow/envs/pimms.yaml b/project/workflow/envs/pimms.yaml
new file mode 100644
index 000000000..da6fe3eee
--- /dev/null
+++ b/project/workflow/envs/pimms.yaml
@@ -0,0 +1,52 @@
+# Environment for the Python-based PIMMS models and notebooks
+name: pimms
+channels:
+  - conda-forge
+  - pytorch
+  - nvidia
+  - fastai # fastchan
+  - bioconda
+  - plotly
+  # - defaults
+dependencies:
+  - python>=3.8,<=3.12
+  - numpy
+  - pandas>=1
+  - scipy>=1.6
+  # plotting
+  - matplotlib
+  - python-kaleido
+  - plotly
+  - seaborn<0.13
+  - pip
+  # ML
+  - pytorch #=1.13.1=py3.8_cuda11.7_cudnn8_0
+  # - pytorch-cuda
+  - scikit-learn
+  - fastai
+  - torchvision
+  # - cudatoolkit #=11.7
+  # - tensorboard
+  - umap-learn
+  # stats
+  - pingouin
+  - statsmodels
+  # other
+  - tqdm # progress bars
+  - xmltodict # configs
+  - openpyxl # Excel (xlsx) I/O
+  - omegaconf
+  - plac>=1.0
+  # snakemake
+  # jupyter
+  - ipykernel
+  - ipython
+  - ipywidgets
+  - jupyterlab # standalone jupyter installation
+  # - jupyter_contrib_nbextensions # delete configuration file if you see an error: https://github.com/jupyter/nbconvert/issues/526#issuecomment-277552771
+  - jupyter-dash
+  - papermill # execute ipynb's
+  - pip:
+    - git+https://github.com/RasmussenLab/pimms.git@dev
+    - mrmr-selection
+    - njab
diff --git a/project/workflow/envs/trainRmodels.yaml b/project/workflow/envs/trainRmodels.yaml
new file mode 100644
index 000000000..f4f246bbc
--- /dev/null
+++ b/project/workflow/envs/trainRmodels.yaml
@@ -0,0 +1,36 @@
+# Environment for the R-based imputation models (NAGuideR methods)
+name: trainRmodels
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - papermill # execute ipynb's
+  - jupyter
+  # R packages (listed in NAGuideR)
+  - r-base
+  - r-devtools # needed for source installs (e.g. on Windows servers)?
+ - r-irkernel + - r-reshape2 + - r-stringi # + rmarkdown hack for reshape2 + - r-stringr # reshape2 + - r-tidyverse + - r-gdata + - r-glmnet + - r-e1071 + - r-norm + - r-missforest + - r-vim + - r-mice + - r-cluster + - r-mvtnorm + - r-rrcov + - r-gmm + - r-tmvtnorm + - r-igraph + # - bioconductor-biocinstaller + # - r-imputelcmd # bioconda + # - bioconductor-impute + # - bioconductor-pcamethods + # - rrcovNA, GMSimpute + # SeqKnn, pcaMethods, DreamAI # bioconductor diff --git a/pyproject.toml b/pyproject.toml index 74ee4adf0..25cd2ac9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ [project.scripts] pimms-setup-imputation-comparison = "vaep.cmd_interface.setup_imp_cp_website:main" +pimms-add-diff-comp = "vaep.cmd_interface.setup_diff_analysis_website:main" [project.urls] "Bug Tracker" = "https://github.com/RasmussenLab/pimms/issues" diff --git a/snakemake_env.yml b/snakemake_env.yml new file mode 100644 index 000000000..7713b7b18 --- /dev/null +++ b/snakemake_env.yml @@ -0,0 +1,8 @@ +name: snakemake +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - snakemake-minimal + - mamba diff --git a/tests/io/test_dataset.py b/tests/io/test_dataset.py index 5edf5aefa..cea05e853 100644 --- a/tests/io/test_dataset.py +++ b/tests/io/test_dataset.py @@ -10,7 +10,8 @@ from vaep.io.datasets import DatasetWithMaskAndNoTarget, DatasetWithTarget data = np.random.random(size=(10, 5)) -mask = ~(data < 0.1) +threshold = max(0.15, data.min() + 0.02) +mask = ~(data < threshold) data_w_na = np.where(mask, data, np.nan) assert (data != data_w_na).any() diff --git a/vaep/cmd_interface/setup_diff_analysis_website.py b/vaep/cmd_interface/setup_diff_analysis_website.py new file mode 100644 index 000000000..c2f983847 --- /dev/null +++ b/vaep/cmd_interface/setup_diff_analysis_website.py @@ -0,0 +1,110 @@ +"""Console script to create or append index.rst for static website of differential analysis workflow.""" +import argparse +import textwrap +from collections import defaultdict +from pathlib import Path + + +def split_nb_name(nb: str) -> list: + return nb.split('.')[0].split('_') + + +INDEX_RST = textwrap.dedent("""\ + Differential Analysis Notebooks + ------------------------------- + + Inspect the notebooks associated with the differential analysis workflow. + + .. toctree:: + :maxdepth: 2 + :caption: Differential analysis (ANCOVA) + + {nb_1} + + .. toctree:: + :maxdepth: 2 + :caption: Compare ANCOVAs + + {nb_2} + + .. toctree:: + :maxdepth: 2 + :caption: Compare single differential analysis + + {nb_4} + + .. 
toctree::
+        :maxdepth: 2
+        :caption: Logistic regression models
+
+    {nb_3}
+    """)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Create or append index.rst for static website '
+        'displaying differential analysis notebooks.')
+    parser.add_argument('--folder', '-f',
+                        type=str,
+                        help='Path to the experiment folder',
+                        required=True)
+    parser.add_argument('--subfolder_comparison', '-sf_cp',
+                        type=str,
+                        help='Subfolder containing the comparison notebooks',
+                        required=True)
+    args = parser.parse_args()
+
+    folder_experiment = Path(args.folder)
+    subfolder_comparison = Path(args.subfolder_comparison)
+    nbs = [_f.relative_to(folder_experiment)
+           for _f in subfolder_comparison.glob('**/*.ipynb')
+           if _f.is_file()]
+
+    # group notebooks by the number after the first underscore in their name
+    groups = defaultdict(list)
+    for nb in nbs:
+        _group = nb.name.split('_')[1]
+        groups[_group].append(nb)
+    groups = dict(groups)
+
+    # build the toctree entries for each group of notebooks
+    nb_1 = ''
+    for nb in groups['1']:
+        nb_1 += " " * 4 + split_nb_name(nb.name)[-1] + f" <{nb.as_posix()}>\n"
+
+    nb_2 = ''
+    for nb in groups['2']:
+        nb_2 += " " * 4 + ' '.join(nb.parent.name.split('_')) + f" <{nb.as_posix()}>\n"
+
+    nb_3 = ''
+    for nb in groups['3']:
+        nb_3 += " " * 4 + ' '.join(nb.parent.name.split('_')) + f" <{nb.as_posix()}>\n"
+
+    nb_4 = groups['4'][0]
+    nb_4 = " " * 4 + "Compare single features" + f" <{nb_4.as_posix()}>\n"
+
+    index_rst = INDEX_RST.format(nb_1=nb_1,
+                                 nb_2=nb_2,
+                                 nb_3=nb_3,
+                                 nb_4=nb_4)
+    # append to index.rst
+    with open(folder_experiment / 'index.rst', 'a') as f:
+        f.write(index_rst)
+
+    msg = f"""\
+    The index.rst file has been created or extended in {folder_experiment}:
+    ```bash
+    {folder_experiment / 'index.rst'}
+    ```
+    """
+
+    msg = textwrap.dedent(msg)
+    print(msg)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/vaep/cmd_interface/setup_imp_cp_website.py b/vaep/cmd_interface/setup_imp_cp_website.py
index d8c758ddc..0f4c8ebd9 100644
--- a/vaep/cmd_interface/setup_imp_cp_website.py
+++ b/vaep/cmd_interface/setup_imp_cp_website.py
@@ -91,7 +91,7 @@ def split_nb_name(nb: str) -> list:
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-    exclude_patterns = ['_build', 'jupyter_execute', 'diff_analysis', 'figures',
+    exclude_patterns = ['_build', 'jupyter_execute', 'figures',
                         'Thumbs.db', '.DS_Store']
 
 # -- Options for HTML output -------------------------------------------------
diff --git a/vaep/io/dataloaders.py b/vaep/io/dataloaders.py
index b649c15fe..57c373dba 100644
--- a/vaep/io/dataloaders.py
+++ b/vaep/io/dataloaders.py
@@ -1,73 +1,15 @@
-import pandas
-import torch
-from typing import Tuple
-from torch.utils.data import Dataset
-from fastai.data.load import DataLoader
-from fastai.data.core import DataLoaders
+import pandas
+import pandas as pd
 from fastai.data.all import *
+from fastai.data.core import DataLoaders
+from fastai.data.load import DataLoader
+from torch.utils.data import Dataset
 
 from vaep.io import datasets
 from vaep.io.datasets import DatasetWithTarget
 from vaep.transform import VaepPipeline
 
-import pandas as pd
-
-
-class DataLoadersCreator():
-    """DataLoader creator. For training or evaluation."""
-
-    def __init__(self,
-                 df_train: pandas.DataFrame,
-                 df_valid: pandas.DataFrame,
-                 scaler,
-                 DataSetClass: torch.utils.data.Dataset,
-                 batch_size: int
-                 ):
-        """Helper function to create from pandas.DataFrame(s) in memory datasets.
- - Parameters - ---------- - df_train : pandas.DataFrame - Training data samples in DataFrames. - df_valid : pandas.DataFrame - Validation data (for training) in DataFrames. - scaler : [type] - A pipeline of transform to apply to the dataset. - DataSetClass : torch.utils.data.Dataset - Type of dataset to use for generating single samples based on - DataFrames. - batch_size : int - Batch size to use. - - Returns - ------- - Tuple[torch.utils.data.Dataloader, torch.utils.data.Dataloader] - train and validation set dataloaders. - """ - self.data_train = DataSetClass( - data=scaler.transform(df_train)) - self.data_valid = DataSetClass(data=scaler.transform(df_valid)) - self.scaler = scaler - self.batch_size = batch_size - - def get_dls(self, - shuffle_train: bool = True, - **kwargs) -> Tuple[torch.utils.data.DataLoader, - torch.utils.data.DataLoader]: - self.shuffle_train = shuffle_train - dl_train = DataLoader( - dataset=self.data_train, - batch_size=self.batch_size, shuffle=shuffle_train, **kwargs) - - dl_valid = DataLoader( - dataset=self.data_valid, - batch_size=self.batch_size, shuffle=False, **kwargs) - return dl_train, dl_valid - - def __repr__(self): - return f"{self.__class__.__name__} for creating dataloaders with {self.batch_size}." - def get_dls(train_X: pandas.DataFrame, valid_X: pandas.DataFrame, @@ -124,7 +66,11 @@ def get_dls(train_X: pandas.DataFrame, valid_ds = datasets.DatasetWithTarget(df=pd.DataFrame()) # ! Need for script exection (as plain python file) # https://pytorch.org/docs/stable/notes/windows.html#multiprocessing-error-without-if-clause-protection - return DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=False, + drop_last = False + if (len(train_X) % bs) == 1: + # Batch-Normalization does not work with batches of size one + drop_last = True + return DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=drop_last, num_workers=num_workers) From 115e681cebe4f9293401ff98cb85b8728a0e9668 Mon Sep 17 00:00:00 2001 From: Henry Date: Fri, 31 May 2024 11:41:33 +0200 Subject: [PATCH 2/3] :art: move plotting functionality to package --- project/00_5_training_data_exploration.py | 76 +++++++---------------- vaep/plotting/__init__.py | 39 ++++++++++++ vaep/plotting/data.py | 4 +- 3 files changed, 66 insertions(+), 53 deletions(-) diff --git a/project/00_5_training_data_exploration.py b/project/00_5_training_data_exploration.py index 219777465..92735c858 100644 --- a/project/00_5_training_data_exploration.py +++ b/project/00_5_training_data_exploration.py @@ -26,22 +26,22 @@ # %% from __future__ import annotations + import json import logging from pathlib import Path +import matplotlib +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt import seaborn as sns -import matplotlib - import vaep -from vaep import plotting -from vaep.pandas import missing_data import vaep.data_handling +from vaep import plotting from vaep.analyzers import analyzers +from vaep.pandas import missing_data from vaep.utils import create_random_df logger = vaep.logging.setup_nb_logger() @@ -51,48 +51,13 @@ 'figure.figsize': [4.0, 2.0]}) -def only_every_x_ticks(ax, x=2, axis=None): - """Sparse out ticks on both axis by factor x""" - if axis is None: - ax.set_xticks(ax.get_xticks()[::x]) - ax.set_yticks(ax.get_yticks()[::x]) - else: - if axis == 0: - ax.set_xticks(ax.get_xticks()[::x]) - elif axis == 1: - ax.set_yticks(ax.get_yticks()[::x]) - else: - raise ValueError(f'axis must be 0 or 1, got {axis}') - return ax - - -def 
use_first_n_chars_in_labels(ax, x=2): - """Take first N characters of labels and use them as new labels""" - # xaxis - _new_labels = [_l.get_text()[:x] - for _l in ax.get_xticklabels()] - _ = ax.set_xticklabels(_new_labels) - # yaxis - _new_labels = [_l.get_text()[:x] for _l in ax.get_yticklabels()] - _ = ax.set_yticklabels(_new_labels) - return ax - - -def split_xticklabels(ax, PG_SEPARATOR=';'): - """Split labels by PG_SEPARATOR and only use first part""" - if PG_SEPARATOR is not None: - _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] - for _l in ax.get_xticklabels()] - _ = ax.set_xticklabels(_new_labels) - return ax - - def get_clustermap(data, figsize=(8, 8), cbar_pos: tuple[float, float, float, float] = ( 0.02, 0.83, 0.03, 0.15), **kwargs): from sklearn.impute import SimpleImputer + from vaep.pandas import _add_indices X = SimpleImputer().fit_transform(data) X = _add_indices(X, data) @@ -172,6 +137,10 @@ def get_dynamic_range(min_max): data = pd.read_pickle(FN_INTENSITIES) elif FN_INTENSITIES.suffix == '.csv': data = pd.read_csv(FN_INTENSITIES, index_col=INDEX_COL, nrows=N_FIRST_ROWS) +elif FN_INTENSITIES.suffix == '.tsv': + data = pd.read_csv(FN_INTENSITIES, sep='\t', index_col=INDEX_COL, nrows=N_FIRST_ROWS) +else: + raise ValueError(f'File extension {FN_INTENSITIES.suffix} not supported') data # %% @@ -373,10 +342,10 @@ def get_dynamic_range(min_max): ax.set_yticks([]) # cg.fig.suptitle(f'Present-absent pattern of {FEATURES_CUTOFF_TEXT}') ax.set_title(f'Present-absent pattern of {FEATURES_CUTOFF_TEXT}') -cg.fig.tight_layout() +cg.figure.tight_layout() fname = FIGUREFOLDER / 'clustermap_present_absent_pattern.png' files_out[fname.name] = fname -vaep.savefig(cg.fig, +vaep.savefig(cg.figure, name=fname, pdf=False, dpi=600) @@ -390,16 +359,19 @@ def get_dynamic_range(min_max): # %% vaep.plotting.make_large_descriptors(5) -fig, ax = plt.subplots(figsize=(8, 8)) +fig, ax = plt.subplots(figsize=(7.5, 3.5)) ax = sns.heatmap( selected.iloc[cg.dendrogram_row.reordered_ind, cg.dendrogram_col.reordered_ind], + robust=True, + cbar=False, + annot=False, ax=ax, ) ax.set_title(f'Heatmap of intensities clustered by missing pattern of {FEATURES_CUTOFF_TEXT}', fontsize=8) -only_every_x_ticks(ax, x=2) -use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) +vaep.plotting.only_every_x_ticks(ax, x=2) +vaep.plotting.use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) if PG_SEPARATOR is not None: _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] for _l in ax.get_xticklabels()] @@ -428,8 +400,8 @@ def get_dynamic_range(min_max): ) ax.set_title(f'Heatmap of feature correlation of {FEATURES_CUTOFF_TEXT}', fontsize=8) -_ = only_every_x_ticks(ax, x=2) -_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) +_ = vaep.plotting.only_every_x_ticks(ax, x=2) +_ = vaep.plotting.use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) if PG_SEPARATOR is not None: _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] for _l in ax.get_xticklabels()] @@ -455,8 +427,8 @@ def get_dynamic_range(min_max): cbar_kws={'shrink': 0.75}, square=True, ) -_ = only_every_x_ticks(ax, x=2) -_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) +_ = vaep.plotting.only_every_x_ticks(ax, x=2) +_ = vaep.plotting.use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) if NO_TICK_LABELS_ON_HEATMAP: ax.set_xticks([]) ax.set_yticks([]) @@ -477,8 +449,8 @@ def get_dynamic_range(min_max): _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] for _l in ax.get_xticklabels()] _ = ax.set_xticklabels(_new_labels) -_ = 
only_every_x_ticks(ax, x=2, axis=0) -_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) +_ = vaep.plotting.only_every_x_ticks(ax, x=2, axis=0) +_ = vaep.plotting.use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) # ax.set_title(f'Clustermap of intensities based on {FEATURES_CUTOFF_TEXT}', fontsize=7) # cg.fig.tight_layout() # tight_layout makes the cbar a bit ugly cg.fig.suptitle(f'Clustermap of intensities based on {FEATURES_CUTOFF_TEXT}', fontsize=7) diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py index 6d4accf8e..17cc86ced 100644 --- a/vaep/plotting/__init__.py +++ b/vaep/plotting/__init__.py @@ -128,6 +128,9 @@ def make_large_descriptors(size='xx-large'): }) +set_font_sizes = make_large_descriptors + + def add_prop_as_second_yaxis(ax: matplotlib.axes.Axes, n_samples: int, format_str: str = '{x:,.3f}') -> matplotlib.axes.Axes: """Add proportion as second axis. Try to align cleverly @@ -327,3 +330,39 @@ def plot_cutoffs(df: pd.DataFrame, if min_feat_in_sample is not None: ax.axhline(min_feat_in_sample) return fig, axes + + +def only_every_x_ticks(ax, x=2, axis=None): + """Sparse out ticks on both axis by factor x""" + if axis is None: + ax.set_xticks(ax.get_xticks()[::x]) + ax.set_yticks(ax.get_yticks()[::x]) + else: + if axis == 0: + ax.set_xticks(ax.get_xticks()[::x]) + elif axis == 1: + ax.set_yticks(ax.get_yticks()[::x]) + else: + raise ValueError(f'axis must be 0 or 1, got {axis}') + return ax + + +def use_first_n_chars_in_labels(ax, x=2): + """Take first N characters of labels and use them as new labels""" + # xaxis + _new_labels = [_l.get_text()[:x] + for _l in ax.get_xticklabels()] + _ = ax.set_xticklabels(_new_labels) + # yaxis + _new_labels = [_l.get_text()[:x] for _l in ax.get_yticklabels()] + _ = ax.set_yticklabels(_new_labels) + return ax + + +def split_xticklabels(ax, PG_SEPARATOR=';'): + """Split labels by PG_SEPARATOR and only use first part""" + if PG_SEPARATOR is not None: + _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] + for _l in ax.get_xticklabels()] + _ = ax.set_xticklabels(_new_labels) + return ax diff --git a/vaep/plotting/data.py b/vaep/plotting/data.py index 14ff90430..1051a1a43 100644 --- a/vaep/plotting/data.py +++ b/vaep/plotting/data.py @@ -43,10 +43,12 @@ def get_min_max_iterable(series: Iterable[pd.Series]) -> Tuple[int]: def plot_histogram_intensities(s: pd.Series, interval_bins=1, - min_max=(15, 40), + min_max: Tuple[int] = None, ax=None, **kwargs) -> Tuple[Axes, range]: """Plot intensities in Series in a certain range and equally spaced intervals.""" + if min_max is None: + min_max = get_min_max_iterable([s]) min_bin, max_bin = min_max bins = range(min_bin, max_bin, interval_bins) ax = s.plot.hist(bins=bins, xticks=list(bins), From 7fc0193c794c22a858771748f673cc7b62ed9e4e Mon Sep 17 00:00:00 2001 From: Henry Date: Fri, 31 May 2024 11:44:00 +0200 Subject: [PATCH 3/3] :art: annotate some functions, remove tags --- project/01_0_split_data.ipynb | 1 - project/02_3_grid_search_analysis.ipynb | 37 ++++++++++++++++++------- project/02_3_grid_search_analysis.py | 27 ++++++++++++------ vaep/analyzers/analyzers.py | 7 +++-- vaep/models/__init__.py | 7 ++++- vaep/pandas/__init__.py | 2 +- 6 files changed, 56 insertions(+), 25 deletions(-) diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index 85be6eaed..3b757bb1c 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -1249,7 +1249,6 @@ { "cell_type": "code", "execution_count": null, - "id": "34ee6256", "metadata": { 
"lines_to_next_cell": 2, "tags": [ diff --git a/project/02_3_grid_search_analysis.ipynb b/project/02_3_grid_search_analysis.ipynb index 586417d74..4a3028aa6 100644 --- a/project/02_3_grid_search_analysis.ipynb +++ b/project/02_3_grid_search_analysis.ipynb @@ -16,22 +16,25 @@ "metadata": {}, "outputs": [], "source": [ + "import snakemake\n", "import logging\n", "import pathlib\n", - "import pandas as pd\n", - "import plotly.express as px\n", + "\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import plotly.express as px\n", "import seaborn as sns\n", "\n", + "import vaep.io\n", + "import vaep.nb\n", + "import vaep.pandas\n", "import vaep.plotting.plotly as px_vaep\n", - "from vaep.analyzers import compare_predictions\n", + "import vaep.utils\n", "from vaep import sampling\n", + "from vaep.analyzers import compare_predictions\n", "from vaep.io import datasplits\n", - "import vaep.utils\n", - "import vaep.pandas\n", - "import vaep.io\n", - "import vaep.nb\n", + "\n", "matplotlib.rcParams['figure.figsize'] = [12.0, 6.0]\n", "\n", "\n", @@ -96,15 +99,18 @@ "cell_type": "code", "execution_count": null, "id": "8f0497b1-5f91-45e9-a3e1-88de08b928a9", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "# not robust\n", "try:\n", " ORDER = {'model': snakemake.params.models}\n", + " FILE_FORMAT = snakemake.params.file_format\n", "except AttributeError:\n", " ORDER = {'model': ['CF', 'DAE', 'VAE']}\n", - "FILE_FORMAT = snakemake.params.file_format" + " FILE_FORMAT = 'csv'" ] }, { @@ -607,6 +613,16 @@ "id": "f8190d51-c4db-4aae-8b91-11641958a0f8", "metadata": {}, "outputs": [], + "source": [ + "view = metrics_long[[\"model\", \"n_params\", \"data_split\", \"metric_name\", \"metric_value\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f98b49d8", + "metadata": {}, + "outputs": [], "source": [ "plt.rcParams['figure.figsize'] = (7, 4)\n", "plt.rcParams['lines.linewidth'] = 2\n", @@ -616,7 +632,7 @@ "col_order = ('valid_fake_na', 'test_fake_na')\n", "row_order = ('MAE', 'MSE')\n", "fg = sns.relplot(\n", - " data=metrics_long,\n", + " data=view,\n", " x='n_params',\n", " y='metric_value',\n", " col=\"data_split\",\n", @@ -652,6 +668,7 @@ "fname\n", "fname = FOLDER / \"hyperpar_results_by_parameters_val+test.pdf\"\n", "files_out[fname.name] = fname.as_posix()\n", + "view.to_excel(fname.with_suffix('.xlsx'))\n", "fg.savefig(fname)\n", "fg.savefig(fname.with_suffix('.png'), dpi=300)" ] diff --git a/project/02_3_grid_search_analysis.py b/project/02_3_grid_search_analysis.py index b3984e069..540bebc05 100644 --- a/project/02_3_grid_search_analysis.py +++ b/project/02_3_grid_search_analysis.py @@ -17,22 +17,25 @@ # # Analyis of grid hyperparameter search # %% +import snakemake import logging import pathlib -import pandas as pd -import plotly.express as px + import matplotlib import matplotlib.pyplot as plt +import pandas as pd +import plotly.express as px import seaborn as sns +import vaep.io +import vaep.nb +import vaep.pandas import vaep.plotting.plotly as px_vaep -from vaep.analyzers import compare_predictions +import vaep.utils from vaep import sampling +from vaep.analyzers import compare_predictions from vaep.io import datasplits -import vaep.utils -import vaep.pandas -import vaep.io -import vaep.nb + matplotlib.rcParams['figure.figsize'] = [12.0, 6.0] @@ -66,9 +69,11 @@ # not robust try: ORDER = {'model': snakemake.params.models} + FILE_FORMAT = snakemake.params.file_format except AttributeError: 
ORDER = {'model': ['CF', 'DAE', 'VAE']} -FILE_FORMAT = snakemake.params.file_format + FILE_FORMAT = 'csv' + # %% path_metrics = pathlib.Path(metrics_csv) @@ -318,6 +323,9 @@ hover_data['data_split'] = True hover_data['metric_value'] = ':.4f' +# %% +view = metrics_long[["model", "n_params", "data_split", "metric_name", "metric_value"]] + # %% plt.rcParams['figure.figsize'] = (7, 4) plt.rcParams['lines.linewidth'] = 2 @@ -327,7 +335,7 @@ col_order = ('valid_fake_na', 'test_fake_na') row_order = ('MAE', 'MSE') fg = sns.relplot( - data=metrics_long, + data=view, x='n_params', y='metric_value', col="data_split", @@ -363,6 +371,7 @@ fname fname = FOLDER / "hyperpar_results_by_parameters_val+test.pdf" files_out[fname.name] = fname.as_posix() +view.to_excel(fname.with_suffix('.xlsx')) fg.savefig(fname) fg.savefig(fname.with_suffix('.png'), dpi=300) diff --git a/vaep/analyzers/analyzers.py b/vaep/analyzers/analyzers.py index 5b8d4398e..7bd8c1e3c 100644 --- a/vaep/analyzers/analyzers.py +++ b/vaep/analyzers/analyzers.py @@ -395,9 +395,10 @@ def get_consecutive_data_indices(df, n_samples): return df.loc[index[start_sample:start_sample + n_samples]] -def corr_lower_triangle(df): - """Compute the correlation matrix, returning only unique values.""" - corr_df = df.corr() +def corr_lower_triangle(df, **kwargs): + """Compute the correlation matrix, returning only unique values. + """ + corr_df = df.corr(**kwargs) lower_triangle = pd.DataFrame( np.tril(np.ones(corr_df.shape), -1)).astype(bool) lower_triangle.index, lower_triangle.columns = corr_df.index, corr_df.columns diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 68fc89fc6..3be35408b 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -23,6 +23,10 @@ NUMPY_ONE = np.int64(1) +__all__ = ['ae', 'analysis', 'collab', 'vae', 'plot_loss', 'plot_training_losses', + 'calc_net_weight_count', 'RecorderDump', 'split_prediction_by_mask', + 'compare_indices', 'collect_metrics', 'calculte_metrics', + 'Metrics', 'get_df_from_nested_dict'] def plot_loss(recorder: learner.Recorder, @@ -312,7 +316,8 @@ def __repr__(self): def get_df_from_nested_dict(nested_dict, - column_levels=('data_split', 'model', 'metric_name'), + column_levels=( + 'data_split', 'model', 'metric_name'), row_name='subset'): metrics = {} for k, run_metrics in nested_dict.items(): diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py index ffaa60b17..97520bb02 100644 --- a/vaep/pandas/__init__.py +++ b/vaep/pandas/__init__.py @@ -285,7 +285,7 @@ def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series: return ret -def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None): +def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame: """Return counts per bin for selected columns in DataFrame.""" counts_per_bin = dict() if columns is None: