diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 269d6f090..349e8173c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,13 +20,13 @@ jobs: "macos-13", # "windows-latest" # rrcovNA cannot be build from source on windows-server ] - python-version: ["3.8"] + python-version: ["3.8", "3.9", "3.10"] steps: - name: Checkout uses: actions/checkout@v4 - name: Set up Miniconda # ! change action https://github.com/mamba-org/setup-micromamba - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: miniforge-variant: Mambaforge # miniforge-version: latest @@ -82,9 +82,9 @@ jobs: snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml - name: Archive results # https://github.com/actions/upload-artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: example-workflow-results-${{ matrix.os }} + name: ${{ matrix.os }}-${{ matrix.python-version }}-example-workflow-results path: | project/runs/example/ environment.yml @@ -114,7 +114,6 @@ jobs: - name: Run pytest run: pytest . - publish: name: Publish package if: startsWith(github.ref, 'refs/tags') diff --git a/.github/workflows/ci_workflow.yaml b/.github/workflows/ci_workflow.yaml new file mode 100644 index 000000000..87df12633 --- /dev/null +++ b/.github/workflows/ci_workflow.yaml @@ -0,0 +1,55 @@ +name: run workflow with conda envs +on: + push: + branches: [main, dev] + pull_request: + branches: [main, dev] + release: + # schedule: + # - cron: '0 2 * * 3,6' +jobs: + run-integration-tests-with-conda-install: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -el {0} + strategy: + fail-fast: false + matrix: + os: [ + "ubuntu-latest", + "macos-13", + # "windows-latest" # rrcovNA cannot be build from source on windows-server + ] + python-version: ["3.10"] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Miniconda + # ! 
change action https://github.com/mamba-org/setup-micromamba + uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-variant: Mambaforge + use-mamba: true + channel-priority: disabled + python-version: ${{ matrix.python-version }} + environment-file: snakemake_env.yml + activate-environment: snakemake + auto-activate-base: true + - name: inspect-conda-environment + run: | + conda info + conda list + - name: Dry-run workflow + run: | + cd project + snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml -n --use-conda + - name: Run demo workflow (integration test) + continue-on-error: true + run: | + cd project + snakemake -p -c4 -k --configfile config/single_dev_dataset/example/config.yaml --use-conda + - name: Run demo workflow again (in case of installation issues) + run: | + cd project + snakemake -p -c1 -k --configfile config/single_dev_dataset/example/config.yaml --use-conda diff --git a/.github/workflows/test_pkg_on_colab.yaml b/.github/workflows/test_pkg_on_colab.yaml new file mode 100644 index 000000000..546434bb3 --- /dev/null +++ b/.github/workflows/test_pkg_on_colab.yaml @@ -0,0 +1,26 @@ +name: Test that tutorial runs on latest colab image + +on: + push: + branches: [dev] + pull_request: + branches: [main, dev] + schedule: + - cron: '0 2 3 * *' + +jobs: + test-tutorial-on-colab: + name: Test tutorial on latest colab image + runs-on: ubuntu-latest-4core # increase disk space + # https://console.cloud.google.com/artifacts/docker/colab-images/europe/public/runtime + container: + image: europe-docker.pkg.dev/colab-images/public/runtime:latest + steps: + - uses: actions/checkout@v4 + - name: Install pimms-learn and papermill + run: | + python3 -m pip install pimms-learn papermill + - name: Run tutorial + run: | + cd project + papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_output.ipynb diff --git a/.github/workflows/workflow_website.yaml b/.github/workflows/workflow_website.yaml index ab775f193..be97836db 100644 --- 
a/.github/workflows/workflow_website.yaml +++ b/.github/workflows/workflow_website.yaml @@ -1,4 +1,4 @@ -name: Build workflow website on smaller development dataset (for protein groups) +name: Build workflow website on public Alzheimer dataset (for protein groups) on: pull_request: branches: [main, dev] @@ -29,32 +29,39 @@ jobs: activate-environment: vaep auto-activate-base: true # auto-update-conda: true + - name: Dry-run workflow + run: | + cd project + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c1 -n - name: Run demo workflow (integration test) continue-on-error: true run: | cd project - snakemake -p -c1 -n - snakemake -p -c4 -k + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k - name: Run demo workflow again (in case of installation issues) run: | cd project - snakemake -p -c1 -n - snakemake -p -c4 -k + snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c4 -k + - name: Run differential analysis workflow + run: | + cd project + snakemake -s workflow/Snakefile_ald_comparison.smk --configfile config/alzheimer_study/comparison.yaml -p -c4 - name: Install website dependencies run: | pip install .[docs] - name: Build imputation comparison website run: | - pimms-setup-imputation-comparison -f project/runs/dev_dataset_small/proteinGroups_N50/ - cd project/runs/dev_dataset_small/proteinGroups_N50/ + pimms-setup-imputation-comparison -f project/runs/alzheimer_study/ + pimms-add-diff-comp -f project/runs/alzheimer_study/ -sf_cp project/runs/alzheimer_study/diff_analysis/AD + cd project/runs/alzheimer_study/ sphinx-build -n --keep-going -b html ./ ./_build/ - name: Archive results uses: actions/upload-artifact@v3 with: - name: example-workflow-results-${{ matrix.os }} - path: project/runs/dev_dataset_small/proteinGroups_N50/_build/ + name: alzheimer-study + path: project/runs/alzheimer_study/ - name: Publish workflow as website uses: 
peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: project/runs/dev_dataset_small/proteinGroups_N50/_build/ \ No newline at end of file + publish_dir: project/runs/alzheimer_study/_build/ \ No newline at end of file diff --git a/README.md b/README.md index 94c8128c7..4af30064e 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,33 @@ # PIMMS -[![Read the Docs](https://img.shields.io/readthedocs/pimms)](https://readthedocs.org/projects/pimms/) [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/RasmussenLab/pimms/ci.yaml)](https://github.com/RasmussenLab/pimms/actions) +[![Read the Docs](https://img.shields.io/readthedocs/pimms)](https://readthedocs.org/projects/pimms/) [![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/RasmussenLab/pimms/ci.yaml)](https://github.com/RasmussenLab/pimms/actions) [![Documentation Status](https://readthedocs.org/projects/pimms/badge/?version=latest)](https://pimms.readthedocs.io/en/latest/?badge=latest) PIMMS stands for Proteomics Imputation Modeling Mass Spectrometry and is a hommage to our dear British friends who are missing as part of the EU for far too long already -(Pimms is also a British summer drink). +(Pimms is a British summer drink). -The pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792). +The publication is accepted in Nature Communications +and the pre-print is available [on biorxiv](https://doi.org/10.1101/2023.01.12.523792). > `PIMMS` was called `vaep` during development. -> Before entire refactoring has to been completed the imported package will be -`vaep`. +> Until the refactoring has been completed, the imported package will be `vaep`. -We provide functionality as a python package, an excutable workflow and notebooks. +We provide functionality as a python package, an executable workflow or simply in notebooks. 
-The models can be used with the scikit-learn interface in the spirit of other scikit-learn imputers. You can try this in colab. [![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/HEAD/project/04_1_train_pimms_models.ipynb) +For any questions, please [open an issue](https://github.com/RasmussenLab/pimms/issues) or contact me directly. +## Getting started -## Python package +The models can be used with the scikit-learn interface in the spirit of other scikit-learn imputers. You can try this using our tutorial in colab: + +[![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/HEAD/project/04_1_train_pimms_models.ipynb) + +It uses the scikit-learn interface. The PIMMS models in the scikit-learn interface +can be executed on the entire data or by specifying a validation split for checking the training process. +In our experiments overfitting wasn't a big issue, but it's easy to check. + +## Install Python package For interactive use of the models provided in PIMMS, you can use our [python package `pimms-learn`](https://pypi.org/project/pimms-learn/). @@ -28,7 +37,7 @@ The interface is similar to scikit-learn. pip install pimms-learn ``` -Then you can use the models on a pandas DataFrame with missing values. Try this in the tutorial on Colab: +Then you can use the models on a pandas DataFrame with missing values. You can try this in the tutorial on Colab by uploading your data: [![open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RasmussenLab/pimms/blob/HEAD/project/04_1_train_pimms_models.ipynb) ## Notebooks as scripts using papermill @@ -37,27 +46,71 @@ If you want to run a model on your prepared data, you can run notebooks prefixed `01_`, i.e. 
[`project/01_*.ipynb`](https://github.com/RasmussenLab/pimms/tree/HEAD/project) after cloning the repository. Using jupytext also python percentage script versions are saved. -``` +```bash +# navigate to your desired folder +git clone https://github.com/RasmussenLab/pimms.git # get all notebooks cd project # project folder as pwd +# pip install pimms-learn papermill # if not already installed papermill 01_0_split_data.ipynb --help-notebook papermill 01_1_train_vae.ipynb --help-notebook ``` +> :warning: Mistyped argument names won't throw an error when using papermill, but a warning is printed on the console thanks to my contributions:) -> Mistyped argument names won't throw an error when using papermill - -## PIMMS comparison workflow +## PIMMS comparison workflow and differential analysis workflow The PIMMS comparison workflow is a snakemake workflow that runs the all selected PIMMS models and R-models on -a user-provided dataset and compares the results. An example for the smaller HeLa development dataset on the +a user-provided dataset and compares the results. An example for a publicly available Alzheimer dataset on the protein groups level is re-built regularly and available at: [rasmussenlab.org/pimms](https://www.rasmussenlab.org/pimms/) +It is built on top of + - the [Snakefile_v2.smk](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_v2.smk) (v2 of imputation workflow), specified in one configuration + - the [Snakefile_ald_comparison.smk](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_ald_comparison.smk) workflow for differential analysis + +The associated notebooks are indexed with `01_*` for the comparison workflow and `10_*` for the differential analysis workflow. The `project` folder can be copied separately to any location if the package is installed. It's a standalone folder. 
Its main folders are: + +```bash +# project folder: +project +│ README.md # see description of notebooks and hints on execution in project folder +|---config # configuration files for experiments ("workflows") +|---data # data for experiments +|---runs # results of experiments +|---src # source code or binaries for some R packages +|---tutorials # some tutorials for libraries used in the project +|---workflow # snakemake workflows +``` + +To re-execute the entire workflow locally, have a look at the [configuration files](https://github.com/RasmussenLab/pimms/tree/HEAD/project/config/alzheimer_study) for the published Alzheimer workflow: + +- [`config/alzheimer_study/config.yaml`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/config/alzheimer_study/config.yaml) +- [`config/alzheimer_study/comparison.yaml`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/config/alzheimer_study/comparison.yaml) + +To execute that workflow, follow the Setup instructions below and run the following commands in the project folder: + +```bash +# being in the project folder +snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c1 -n # one core/process, dry-run +snakemake -s workflow/Snakefile_v2.smk --configfile config/alzheimer_study/config.yaml -p -c2 # two cores/process, execute +# after imputation workflow, execute the comparison workflow +snakemake -s workflow/Snakefile_ald_comparison.smk --configfile config/alzheimer_study/comparison.yaml -p -c1 +# If you want to build the website locally (run from the repository root): https://www.rasmussenlab.org/pimms/ +pip install .[docs] +pimms-setup-imputation-comparison -f project/runs/alzheimer_study/ +pimms-add-diff-comp -f project/runs/alzheimer_study/ -sf_cp project/runs/alzheimer_study/diff_analysis/AD +cd project/runs/alzheimer_study/ +sphinx-build -n --keep-going -b html ./ ./_build/ +# open ./_build/index.html +``` + +## Setup workflow and development environment + ### Setup comparison workflow The core 
funtionality is available as a standalone software on PyPI under the name `pimms-learn`. However, running the entire snakemake workflow in enabled using conda (or mamba) and pip to setup an analysis environment. For a detailed description of setting up conda (or mamba), see [instructions on setting up a virtual environment](https://github.com/RasmussenLab/pimms/blob/HEAD/docs/venv_setup.md). -Download the repository +Download the repository: ``` git clone https://github.com/RasmussenLab/pimms.git @@ -74,14 +127,14 @@ mamba env create -n pimms -f environment.yml # faster, less then 5mins If on Mac M1, M2 or having otherwise issue using your accelerator (e.g. GPUs): Install the pytorch dependencies first, then the rest of the environment: -### Install development dependencies +### Install pytorch first (M-chips) Check how to install pytorch for your system [here](https://pytorch.org/get-started). - select the version compatible with your cuda version if you have an nvidia gpu or a Mac M-chip. ```bash -conda create -n vaep python=3.8 pip +conda create -n vaep python=3.9 pip conda activate vaep # Follow instructions on https://pytorch.org/get-started # conda env update -f environment.yml -n vaep # should not install the rest. @@ -95,29 +148,17 @@ papermill 04_1_train_pimms_models.ipynb 04_1_train_pimms_models_test.ipynb # sec python 04_1_train_pimms_models.py # just execute the code ``` -### Entire development installation - - -```bash -conda create -n pimms_dev -c pytorch -c nvidia -c fastai -c bioconda -c plotly -c conda-forge --file requirements.txt --file requirements_R.txt --file requirements_dev.txt -pip install -e . 
# other pip dependencies missing -snakemake --configfile config/single_dev_dataset/example/config.yaml -F -n -``` - -or if you want to update an existing environment +### Let Snakemake handle installation +If you only want to execute the workflow, you can use snakemake to build the environments for you: -``` -conda update -c defaults -c conda-forge -c fastai -c bioconda -c plotly --file requirements.txt --file requirements_R.txt --file requirements_dev.txt -``` +> Only the Snakefile workflow for imputation v1 supports this at the moment. -or using the environment.yml file (can fail on certain systems) - -``` -conda env create -f environment.yml +```bash +snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml --use-conda -n # dry-run +snakemake -p -c1 --configfile config/single_dev_dataset/example/config.yaml --use-conda # execute with one core ``` - ### Troubleshooting Trouble shoot your R installation by opening jupyter lab @@ -127,16 +168,16 @@ Trouble shoot your R installation by opening jupyter lab jupyter lab # open 01_1_train_NAGuideR.ipynb ``` -## Run an analysis +## Run example Change to the [`project` folder](./project) and see it's [README](project/README.md) -You can subselect models by editing the config file: [`config.yaml`](project/config/single_dev_dataset/proteinGroups_N50/config.yaml) file. +You can subselect models by editing the config file: [`config.yaml`](https://github.com/RasmussenLab/pimms/tree/HEAD/project/config/single_dev_dataset/proteinGroups_N50) file. ``` conda activate pimms # activate virtual environment cd project # go to project folder pwd # so be in ./pimms/project -snakemake -c1 -p -n # dryrun demo workflow +snakemake -c1 -p -n # dryrun demo workflow, potentially add --use-conda snakemake -c1 -p ``` @@ -228,7 +269,3 @@ From the brief description in the table the exact procedure is not always clear. 
| MSIMPUTE_MNAR | msImpute | BIOCONDUCTOR | | Missing not at random algorithm using low rank approximation | ~~grr~~ | DreamAI | - | Fails to install | Rigde regression | ~~GMS~~ | GMSimpute | tar file | Fails on Windows | Lasso model - - -## Build status -[![Documentation Status](https://readthedocs.org/projects/pimms/badge/?version=latest)](https://pimms.readthedocs.io/en/latest/?badge=latest) \ No newline at end of file diff --git a/environment.yml b/environment.yml index a335db7ba..8415e1b58 100644 --- a/environment.yml +++ b/environment.yml @@ -9,9 +9,9 @@ channels: - plotly # - defaults dependencies: - - python=3.8 + - python>=3.8,<=3.12 - numpy - - pandas=1 + - pandas>=1 - scipy>=1.6 # plotting - matplotlib @@ -20,7 +20,7 @@ dependencies: - seaborn<0.13 - pip # ML - - pytorch=1 #=1.13.1=py3.8_cuda11.7_cudnn8_0 + - pytorch #=1.13.1=py3.8_cuda11.7_cudnn8_0 # - pytorch-cuda - scikit-learn - fastai @@ -36,8 +36,9 @@ dependencies: - xmltodict # configs - openpyxl # xml - omegaconf + - plac>=1.0 # snakemake - - snakemake-minimal<7.26 + - snakemake-minimal #<7.26 # jupyter - ipykernel - ipython diff --git a/project/00_5_training_data_exploration.py b/project/00_5_training_data_exploration.py index 219777465..92735c858 100644 --- a/project/00_5_training_data_exploration.py +++ b/project/00_5_training_data_exploration.py @@ -26,22 +26,22 @@ # %% from __future__ import annotations + import json import logging from pathlib import Path +import matplotlib +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt import seaborn as sns -import matplotlib - import vaep -from vaep import plotting -from vaep.pandas import missing_data import vaep.data_handling +from vaep import plotting from vaep.analyzers import analyzers +from vaep.pandas import missing_data from vaep.utils import create_random_df logger = vaep.logging.setup_nb_logger() @@ -51,48 +51,13 @@ 'figure.figsize': [4.0, 2.0]}) -def only_every_x_ticks(ax, x=2, 
axis=None): - """Sparse out ticks on both axis by factor x""" - if axis is None: - ax.set_xticks(ax.get_xticks()[::x]) - ax.set_yticks(ax.get_yticks()[::x]) - else: - if axis == 0: - ax.set_xticks(ax.get_xticks()[::x]) - elif axis == 1: - ax.set_yticks(ax.get_yticks()[::x]) - else: - raise ValueError(f'axis must be 0 or 1, got {axis}') - return ax - - -def use_first_n_chars_in_labels(ax, x=2): - """Take first N characters of labels and use them as new labels""" - # xaxis - _new_labels = [_l.get_text()[:x] - for _l in ax.get_xticklabels()] - _ = ax.set_xticklabels(_new_labels) - # yaxis - _new_labels = [_l.get_text()[:x] for _l in ax.get_yticklabels()] - _ = ax.set_yticklabels(_new_labels) - return ax - - -def split_xticklabels(ax, PG_SEPARATOR=';'): - """Split labels by PG_SEPARATOR and only use first part""" - if PG_SEPARATOR is not None: - _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] - for _l in ax.get_xticklabels()] - _ = ax.set_xticklabels(_new_labels) - return ax - - def get_clustermap(data, figsize=(8, 8), cbar_pos: tuple[float, float, float, float] = ( 0.02, 0.83, 0.03, 0.15), **kwargs): from sklearn.impute import SimpleImputer + from vaep.pandas import _add_indices X = SimpleImputer().fit_transform(data) X = _add_indices(X, data) @@ -172,6 +137,10 @@ def get_dynamic_range(min_max): data = pd.read_pickle(FN_INTENSITIES) elif FN_INTENSITIES.suffix == '.csv': data = pd.read_csv(FN_INTENSITIES, index_col=INDEX_COL, nrows=N_FIRST_ROWS) +elif FN_INTENSITIES.suffix == '.tsv': + data = pd.read_csv(FN_INTENSITIES, sep='\t', index_col=INDEX_COL, nrows=N_FIRST_ROWS) +else: + raise ValueError(f'File extension {FN_INTENSITIES.suffix} not supported') data # %% @@ -373,10 +342,10 @@ def get_dynamic_range(min_max): ax.set_yticks([]) # cg.fig.suptitle(f'Present-absent pattern of {FEATURES_CUTOFF_TEXT}') ax.set_title(f'Present-absent pattern of {FEATURES_CUTOFF_TEXT}') -cg.fig.tight_layout() +cg.figure.tight_layout() fname = FIGUREFOLDER / 
'clustermap_present_absent_pattern.png' files_out[fname.name] = fname -vaep.savefig(cg.fig, +vaep.savefig(cg.figure, name=fname, pdf=False, dpi=600) @@ -390,16 +359,19 @@ def get_dynamic_range(min_max): # %% vaep.plotting.make_large_descriptors(5) -fig, ax = plt.subplots(figsize=(8, 8)) +fig, ax = plt.subplots(figsize=(7.5, 3.5)) ax = sns.heatmap( selected.iloc[cg.dendrogram_row.reordered_ind, cg.dendrogram_col.reordered_ind], + robust=True, + cbar=False, + annot=False, ax=ax, ) ax.set_title(f'Heatmap of intensities clustered by missing pattern of {FEATURES_CUTOFF_TEXT}', fontsize=8) -only_every_x_ticks(ax, x=2) -use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) +vaep.plotting.only_every_x_ticks(ax, x=2) +vaep.plotting.use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) if PG_SEPARATOR is not None: _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] for _l in ax.get_xticklabels()] @@ -428,8 +400,8 @@ def get_dynamic_range(min_max): ) ax.set_title(f'Heatmap of feature correlation of {FEATURES_CUTOFF_TEXT}', fontsize=8) -_ = only_every_x_ticks(ax, x=2) -_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) +_ = vaep.plotting.only_every_x_ticks(ax, x=2) +_ = vaep.plotting.use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) if PG_SEPARATOR is not None: _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] for _l in ax.get_xticklabels()] @@ -455,8 +427,8 @@ def get_dynamic_range(min_max): cbar_kws={'shrink': 0.75}, square=True, ) -_ = only_every_x_ticks(ax, x=2) -_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) +_ = vaep.plotting.only_every_x_ticks(ax, x=2) +_ = vaep.plotting.use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) if NO_TICK_LABELS_ON_HEATMAP: ax.set_xticks([]) ax.set_yticks([]) @@ -477,8 +449,8 @@ def get_dynamic_range(min_max): _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] for _l in ax.get_xticklabels()] _ = ax.set_xticklabels(_new_labels) -_ = only_every_x_ticks(ax, x=2, axis=0) -_ = use_first_n_chars_in_labels(ax, 
x=SAMPLE_FIRST_N_CHARS) +_ = vaep.plotting.only_every_x_ticks(ax, x=2, axis=0) +_ = vaep.plotting.use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS) # ax.set_title(f'Clustermap of intensities based on {FEATURES_CUTOFF_TEXT}', fontsize=7) # cg.fig.tight_layout() # tight_layout makes the cbar a bit ugly cg.fig.suptitle(f'Clustermap of intensities based on {FEATURES_CUTOFF_TEXT}', fontsize=7) diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index 78daf9994..3b757bb1c 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -12,7 +12,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", @@ -69,7 +73,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -115,7 +123,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -125,7 +137,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.args_from_dict(args)\n", @@ -135,7 +151,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if not 0.0 <= args.frac_mnar <= 1.0:\n", @@ -164,7 +184,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "logger.info(f\"{args.FN_INTENSITIES = }\")\n", @@ -183,7 +207,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! 
factor out file reading to a separate module, not class\n", @@ -214,7 +242,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -231,7 +262,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.out_folder / '01_0_data_stats.xlsx'\n", @@ -262,7 +297,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "def join_as_str(seq):\n", @@ -291,7 +330,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -315,7 +358,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -331,7 +377,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_meta.describe(percentiles=np.linspace(0.05, 0.95, 10))" @@ -340,7 +390,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_meta = df_meta.sort_values(args.meta_date_col)" @@ -349,7 +403,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "meta_stats = df_meta.describe(include='all')\n", @@ -367,7 +425,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -384,7 +445,11 @@ { "cell_type": "code", "execution_count": null, - 
"metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_meta = align_meta_data(df, df_meta=df_meta)" @@ -400,7 +465,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "assert df.index.is_unique, \"Duplicates in index.\"" @@ -420,7 +489,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.select_N is not None:\n", @@ -449,7 +522,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "! add function\n", @@ -476,7 +553,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "notna = df.notna()\n", @@ -507,7 +588,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if isinstance(args.sample_completeness, float):\n", @@ -525,7 +610,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = sample_counts > args.sample_completeness\n", @@ -539,7 +628,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.N, args.M = df.shape # save data dimensions\n", @@ -556,7 +649,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "group = 1\n", @@ -572,7 +669,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -597,7 +697,11 @@ { 
"cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "min_max = vaep.plotting.data.min_max(df.stack())\n", @@ -613,7 +717,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", @@ -629,7 +737,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax, _data_feat_median_over_prop_missing = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", @@ -655,7 +767,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "_feature_display_name = f'identified {args.feat_name_display}'\n", @@ -665,7 +781,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "K = 2\n", @@ -682,7 +802,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "pcs.describe(include='all').T" @@ -691,7 +815,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.meta_cat_col:\n", @@ -707,7 +835,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.meta_date_col != 'PlaceholderTime':\n", @@ -729,7 +861,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots()\n", @@ -750,7 +886,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + 
"metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! write principal components to excel (if needed)\n", @@ -760,7 +900,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig = px.scatter(\n", @@ -791,7 +935,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df.head()" @@ -800,7 +948,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_w_date = df.join(df_meta[args.meta_date_col])\n", @@ -814,7 +966,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = df_w_date.plot.box(rot=80,\n", @@ -844,7 +1000,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2))" @@ -862,7 +1022,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if not args.meta_date_col == 'PlaceholderTime':\n", @@ -904,7 +1068,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -922,7 +1089,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_per_feature = feature_frequency(df)\n", @@ -933,7 +1104,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -968,7 
+1142,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -988,7 +1165,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_long = vaep.io.datasplits.long_format(df)\n", @@ -998,7 +1179,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "group = 2\n", @@ -1016,7 +1201,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1059,6 +1246,29 @@ "vaep.savefig(fig, fname)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "counts_per_bin = vaep.pandas.get_counts_per_bin(\n", + " df=pd.concat(\n", + " [df_long.squeeze().to_frame('observed'),\n", + " thresholds.to_frame('threshold'),\n", + " fake_na_mnar.squeeze().to_frame(f'MNAR ({N_MNAR:,d})'),\n", + " fake_na_mcar.squeeze().to_frame(f'MCAR ({N_MCAR:,d})')],\n", + " axis=1),\n", + " bins=range(min_max[0], min_max[1] + 1, 1))\n", + "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", + "counts_per_bin" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1074,7 +1284,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if 0.0 < args.prop_sample_w_sim < 1.0:\n", @@ -1104,7 +1318,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits.test_y.groupby(level=-1).count().describe()" @@ -1113,7 +1331,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + 
"metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits.val_y" @@ -1122,7 +1344,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits.train_X.groupby(level=-1).count().describe()" @@ -1131,7 +1357,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# Check that feature indices and sample indicies overlap between splits\n", @@ -1160,7 +1390,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "diff = (splits\n", @@ -1195,7 +1429,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4\n", @@ -1228,7 +1466,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# dumps data in long-format\n", @@ -1247,7 +1489,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits = DataSplits.from_folder(args.data, file_format=args.file_format)" @@ -1263,7 +1509,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits_df = pd.DataFrame(index=df_long.index)\n", @@ -1278,7 +1528,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# whitespaces in legends are not displayed correctly...\n", @@ -1296,7 +1550,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + 
"hide-input" + ] + }, "outputs": [], "source": [ "group = 3\n", @@ -1329,11 +1587,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "min_bin, max_bin = vaep.plotting.data.min_max(splits.val_y)\n", - "bins = range(int(min_bin), int(max_bin), 1)\n", + "bins = range(int(min_bin), int(max_bin) + 1, 1)\n", "ax = splits_df.plot.hist(bins=bins,\n", " xticks=list(bins),\n", " legend=False,\n", @@ -1353,7 +1615,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins)\n", @@ -1364,7 +1630,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = splits_df.drop('train', axis=1).plot.hist(bins=bins,\n", @@ -1383,34 +1654,6 @@ "vaep.savefig(ax.get_figure(), fname)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Save binned counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "counts_per_bin = dict()\n", - "for col in splits_df.columns:\n", - " _series = (pd.cut(splits_df[col], bins=bins)\n", - " .to_frame()\n", - " .groupby(col)\n", - " .size())\n", - " _series.index.name = 'bin'\n", - " counts_per_bin[col] = _series\n", - "counts_per_bin = pd.DataFrame(counts_per_bin)\n", - "counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", - "counts_per_bin" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1421,7 +1664,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits.to_wide_format()" @@ -1430,7 +1677,11 @@ { "cell_type": "code", "execution_count": 
null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", @@ -1443,7 +1694,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", @@ -1456,7 +1711,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "medians = (splits\n", @@ -1500,7 +1759,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.folder_experiment / 'data_config.yaml'\n", @@ -1518,7 +1781,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# saved figures\n", @@ -1535,7 +1802,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "writer.close()\n", @@ -1545,7 +1816,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] } diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 9b4061096..9be123d10 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -18,7 +18,7 @@ # # Create data splits -# %% +# %% tags=["hide-input"] import logging from functools import partial from pathlib import Path @@ -65,7 +65,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): # %% [markdown] # ## Arguments -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -95,15 +95,15 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): feat_name_display: str = None 
# display name for feature name (e.g. 'protein group') -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) args -# %% +# %% tags=["hide-input"] if not 0.0 <= args.frac_mnar <= 1.0: raise ValueError("Invalid MNAR float value (should be betw. 0 and 1):" f" {args.frac_mnar}") @@ -118,7 +118,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): # %% [markdown] # process arguments -# %% +# %% tags=["hide-input"] logger.info(f"{args.FN_INTENSITIES = }") @@ -131,7 +131,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): logger.info( f"File format (extension): {FILE_EXT} (!specifies data loading function!)") -# %% +# %% tags=["hide-input"] # # ! factor out file reading to a separate module, not class # AnalyzePeptides.from_csv constructor = getattr(vaep.io.load, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT]) @@ -155,7 +155,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): df = log_fct(df) # ! potentially add check to increase value by 1 if 0 is present (should be part of preprocessing) df -# %% +# %% tags=["hide-input"] ax = (df .notna() .sum(axis=0) @@ -166,7 +166,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): ax.set_ylabel('Frequency') -# %% +# %% tags=["hide-input"] fname = args.out_folder / '01_0_data_stats.xlsx' dumps[fname.name] = fname.as_posix() writer = pd.ExcelWriter(fname) @@ -188,7 +188,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): # # > The Collaborative Modeling approach will need a single feature column. 
-# %% +# %% tags=["hide-input"] def join_as_str(seq): ret = "_".join(str(x) for x in seq) return ret @@ -207,7 +207,7 @@ def join_as_str(seq): # # - read from file using [ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser) -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) else: @@ -224,7 +224,7 @@ def join_as_str(seq): df_meta.index.name = args.index_col[0] df_meta -# %% +# %% tags=["hide-input"] if args.meta_date_col: df_meta[args.meta_date_col] = pd.to_datetime( df_meta[args.meta_date_col]) @@ -234,20 +234,20 @@ def join_as_str(seq): df_meta -# %% +# %% tags=["hide-input"] df_meta.describe(percentiles=np.linspace(0.05, 0.95, 10)) -# %% +# %% tags=["hide-input"] df_meta = df_meta.sort_values(args.meta_date_col) -# %% +# %% tags=["hide-input"] meta_stats = df_meta.describe(include='all') meta_stats # %% [markdown] # subset with variation -# %% +# %% tags=["hide-input"] try: display(meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)]) @@ -258,13 +258,13 @@ def join_as_str(seq): display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)]) -# %% +# %% tags=["hide-input"] df_meta = align_meta_data(df, df_meta=df_meta) # %% [markdown] # Ensure unique indices -# %% +# %% tags=["hide-input"] assert df.index.is_unique, "Duplicates in index." # %% [markdown] @@ -274,7 +274,7 @@ def join_as_str(seq): # - for interpolation to make sense, it is best to select a consecutive number of samples: # - take N most recent samples (-> check that this makes sense for your case) -# %% +# %% tags=["hide-input"] if args.select_N is not None: args.select_N = min(args.select_N, len(df_meta)) if args.sample_N: @@ -292,7 +292,7 @@ def join_as_str(seq): # - `feat_prevalence` across samples -# %% +# %% tags=["hide-input"] # ! 
add function freq_per_feature = df.notna().sum() # on wide format if isinstance(args.feat_prevalence, float): @@ -313,7 +313,7 @@ def join_as_str(seq): df = df.loc[:, mask] df -# %% +# %% tags=["hide-input"] notna = df.notna() data_stats_filtered = pd.concat( [ @@ -330,7 +330,7 @@ def join_as_str(seq): # %% [markdown] # Select samples based on completeness -# %% +# %% tags=["hide-input"] if isinstance(args.sample_completeness, float): msg = f'Fraction of minimum sample completeness over all features specified with: {args.sample_completeness}\n' # assumes df in wide format @@ -342,7 +342,7 @@ def join_as_str(seq): sample_counts = df.notna().sum(axis=1) # if DataFrame sample_counts.describe() -# %% +# %% tags=["hide-input"] mask = sample_counts > args.sample_completeness msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.' logger.info(msg) @@ -350,14 +350,14 @@ def join_as_str(seq): df = df.dropna( axis=1, how='all') # drop now missing features -# %% +# %% tags=["hide-input"] args.N, args.M = df.shape # save data dimensions args.used_samples = df.index.to_list() # %% [markdown] # ### Histogram of features per sample -# %% +# %% tags=["hide-input"] group = 1 ax = df.notna().sum(axis=1).hist() ax.set_xlabel(f'{args.feat_name_display.capitalize()} per eligable sample') @@ -366,7 +366,7 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] ax = df.notna().sum(axis=0).sort_values().plot() _new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()] _ = ax.set_xticklabels(_new_labels, rotation=45, @@ -381,7 +381,7 @@ def join_as_str(seq): # %% [markdown] # ### Number off observations accross feature value -# %% +# %% tags=["hide-input"] min_max = vaep.plotting.data.min_max(df.stack()) ax, bins = vaep.plotting.data.plot_histogram_intensities( df.stack(), min_max=min_max) @@ -391,7 +391,7 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) 
-# %% +# %% tags=["hide-input"] ax = vaep.plotting.data.plot_feat_median_over_prop_missing( data=df, type='scatter') fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter' @@ -401,7 +401,7 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] ax, _data_feat_median_over_prop_missing = vaep.plotting.data.plot_feat_median_over_prop_missing( data=df, type='boxplot', return_plot_data=True) fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot' @@ -417,11 +417,11 @@ def join_as_str(seq): # %% [markdown] # ### Interactive and Single plots -# %% +# %% tags=["hide-input"] _feature_display_name = f'identified {args.feat_name_display}' sample_counts.name = _feature_display_name -# %% +# %% tags=["hide-input"] K = 2 df = df.astype(float) pcs = get_PCA(df, n_components=K) # should be renamed to get_PCs @@ -432,10 +432,10 @@ def join_as_str(seq): pcs = pcs.reset_index() pcs -# %% +# %% tags=["hide-input"] pcs.describe(include='all').T -# %% +# %% tags=["hide-input"] if args.meta_cat_col: fig, ax = plt.subplots(figsize=(3, 3)) analyzers.seaborn_scatter( @@ -445,7 +445,7 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(fig, fname) -# %% +# %% tags=["hide-input"] if args.meta_date_col != 'PlaceholderTime': fig, ax = plt.subplots() analyzers.plot_date_map( @@ -457,7 +457,7 @@ def join_as_str(seq): # %% [markdown] # - size: number of features in a single sample -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots() col_identified_feat = _feature_display_name analyzers.plot_scatter( @@ -472,11 +472,11 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(fig, fname) -# %% +# %% tags=["hide-input"] # # ! 
write principal components to excel (if needed) # pcs.set_index([df.index.name])[[*pcs_name, col_identified_feat]].to_excel(fname.with_suffix('.xlsx')) -# %% +# %% tags=["hide-input"] fig = px.scatter( pcs, x=pcs_name[0], y=pcs_name[1], hover_name=pcs_index_name, @@ -497,10 +497,10 @@ def join_as_str(seq): # %% [markdown] # ## Sample Medians and percentiles -# %% +# %% tags=["hide-input"] df.head() -# %% +# %% tags=["hide-input"] df_w_date = df.join(df_meta[args.meta_date_col]) df_w_date = df_w_date.set_index(args.meta_date_col).sort_index() if not args.meta_date_col == 'PlaceholderTime': @@ -508,7 +508,7 @@ def join_as_str(seq): df_w_date = df_w_date.T df_w_date -# %% +# %% tags=["hide-input"] ax = df_w_date.plot.box(rot=80, figsize=(7, 3), fontsize=7, @@ -528,7 +528,7 @@ def join_as_str(seq): # %% [markdown] # Percentiles of intensities in dataset -# %% +# %% tags=["hide-input"] df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2)) # %% [markdown] @@ -536,7 +536,7 @@ def join_as_str(seq): # - check if points are equally spaced (probably QC samples are run in close proximity) # - the machine will be not use for intermediate periods -# %% +# %% tags=["hide-input"] if not args.meta_date_col == 'PlaceholderTime': dates = df_meta[args.meta_date_col].sort_values() median_sample_intensity = (df @@ -563,7 +563,7 @@ def join_as_str(seq): # %% [markdown] # ## Feature frequency in data -# %% +# %% tags=["hide-input"] msg = "Total number of samples in data: {}" logger.info(msg.format(len(df))) @@ -571,11 +571,11 @@ def join_as_str(seq): # %% [markdown] # Recalculate feature frequency after selecting samples -# %% +# %% tags=["hide-input"] freq_per_feature = feature_frequency(df) freq_per_feature -# %% +# %% tags=["hide-input"] # freq_per_feature.name = 'Gene names freq' # name it differently? # index.name is lost when data is stored fname = args.data / 'freq_features.json' @@ -599,7 +599,7 @@ def join_as_str(seq): # for validation and test data split, e.g. 
0.1 = quantile(0.1) # - select frac_mnar from intensities selected using threshold matrix -# %% +# %% tags=["hide-input"] splits = DataSplits(is_wide_format=False) logger.info(f"{splits = }") splits.__annotations__ @@ -609,11 +609,11 @@ def join_as_str(seq): # Create some target values by sampling X% of the validation and test data. # Simulated missing values are not used for validation and testing. -# %% +# %% tags=["hide-input"] df_long = vaep.io.datasplits.long_format(df) df_long.head() -# %% +# %% tags=["hide-input"] group = 2 splits, thresholds, fake_na_mcar, fake_na_mnar = vaep.sampling.sample_mnar_mcar( @@ -624,7 +624,7 @@ def join_as_str(seq): ) logger.info(f"{splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }") -# %% +# %% tags=["hide-input"] N = len(df_long) N_MCAR = len(fake_na_mcar) N_MNAR = len(fake_na_mnar) @@ -663,6 +663,18 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(fig, fname) +# %% tags=["hide-input"] +counts_per_bin = vaep.pandas.get_counts_per_bin( + df=pd.concat( + [df_long.squeeze().to_frame('observed'), + thresholds.to_frame('threshold'), + fake_na_mnar.squeeze().to_frame(f'MNAR ({N_MNAR:,d})'), + fake_na_mcar.squeeze().to_frame(f'MCAR ({N_MCAR:,d})')], + axis=1), + bins=range(min_max[0], min_max[1] + 1, 1)) +counts_per_bin.to_excel(fname.with_suffix('.xlsx')) +counts_per_bin + # %% [markdown] # ### Keep simulated samples only in a subset of the samples @@ -672,7 +684,7 @@ def join_as_str(seq): # # The procedure is experimental and turned off by default. 
-# %% +# %% tags=["hide-input"] if 0.0 < args.prop_sample_w_sim < 1.0: to_stratify = None if args.meta_cat_col and df_meta is not None: @@ -696,16 +708,16 @@ def join_as_str(seq): splits.test_y = splits.test_y.loc[test_idx] logger.info(f"New shapes: {splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }") -# %% +# %% tags=["hide-input"] splits.test_y.groupby(level=-1).count().describe() -# %% +# %% tags=["hide-input"] splits.val_y -# %% +# %% tags=["hide-input"] splits.train_X.groupby(level=-1).count().describe() -# %% +# %% tags=["hide-input"] # Check that feature indices and sample indicies overlap between splits # -> a single feature cannot be only in the validation or test split # -> single features should be put into the training data @@ -728,7 +740,7 @@ def join_as_str(seq): splits.val_y = splits.val_y.drop(to_remove.index) diff -# %% +# %% tags=["hide-input"] diff = (splits .test_y .index @@ -753,7 +765,7 @@ def join_as_str(seq): # In that case: Move the validation measurments back to the training data. # If after this procedure the condition is still not met, a value error is raised. 
-# %% +# %% tags=["hide-input"] mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4 if mask_min_4_measurments.any(): idx = mask_min_4_measurments.loc[mask_min_4_measurments].index @@ -776,7 +788,7 @@ def join_as_str(seq): # - Data in long format: (peptide, sample_id, intensity) # - no missing values kept -# %% +# %% tags=["hide-input"] # dumps data in long-format splits_dumped = splits.dump(folder=args.data, file_format=args.file_format) dumps.update(splits_dumped) @@ -785,13 +797,13 @@ def join_as_str(seq): # %% [markdown] # ### Reload from disk -# %% +# %% tags=["hide-input"] splits = DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # ## plot distribution of splits -# %% +# %% tags=["hide-input"] splits_df = pd.DataFrame(index=df_long.index) splits_df['train'] = splits.train_X splits_df['val'] = splits.val_y @@ -800,7 +812,7 @@ def join_as_str(seq): stats_splits.to_excel(writer, 'stats_splits', float_format='%.3f') stats_splits -# %% +# %% tags=["hide-input"] # whitespaces in legends are not displayed correctly... 
# max_int_len = len(str(int(stats_splits.loc['count'].max()))) +1 # _legend = [ @@ -812,7 +824,7 @@ def join_as_str(seq): for s in ('train', 'val', 'test')] print(_legend) -# %% +# %% tags=["hide-input"] group = 3 ax = (splits .train_X @@ -839,9 +851,9 @@ def join_as_str(seq): figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] min_bin, max_bin = vaep.plotting.data.min_max(splits.val_y) -bins = range(int(min_bin), int(max_bin), 1) +bins = range(int(min_bin), int(max_bin) + 1, 1) ax = splits_df.plot.hist(bins=bins, xticks=list(bins), legend=False, @@ -857,12 +869,12 @@ def join_as_str(seq): figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins) counts_per_bin.to_excel(fname.with_suffix('.xlsx')) counts_per_bin -# %% +# %% tags=["hide-input"] ax = splits_df.drop('train', axis=1).plot.hist(bins=bins, xticks=list(bins), color=['C1', 'C2'], @@ -878,43 +890,28 @@ def join_as_str(seq): figures[fname.name] = fname vaep.savefig(ax.get_figure(), fname) -# %% -# Save binned counts - -# %% -counts_per_bin = dict() -for col in splits_df.columns: - _series = (pd.cut(splits_df[col], bins=bins) - .to_frame() - .groupby(col) - .size()) - _series.index.name = 'bin' - counts_per_bin[col] = _series -counts_per_bin = pd.DataFrame(counts_per_bin) -counts_per_bin.to_excel(fname.with_suffix('.xlsx')) -counts_per_bin # %% [markdown] # plot training data missing plots -# %% +# %% tags=["hide-input"] splits.to_wide_format() -# %% +# %% tags=["hide-input"] ax = vaep.plotting.data.plot_feat_median_over_prop_missing( data=splits.train_X, type='scatter') fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter_train' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] ax = vaep.plotting.data.plot_feat_median_over_prop_missing( data=splits.train_X, type='boxplot') fname = 
args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_train' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] medians = (splits .train_X .median() @@ -948,7 +945,7 @@ def join_as_str(seq): # %% [markdown] # ## Save parameters -# %% +# %% tags=["hide-input"] fname = args.folder_experiment / 'data_config.yaml' args.dump(fname) args @@ -956,15 +953,15 @@ def join_as_str(seq): # %% [markdown] # ## Saved Figures -# %% +# %% tags=["hide-input"] # saved figures figures # %% [markdown] # Saved dumps -# %% +# %% tags=["hide-input"] writer.close() dumps -# %% +# %% tags=["hide-input"] diff --git a/project/01_0_transform_data_to_wide_format.ipynb b/project/01_0_transform_data_to_wide_format.ipynb index df4d3fd13..bcd8a0bf1 100644 --- a/project/01_0_transform_data_to_wide_format.ipynb +++ b/project/01_0_transform_data_to_wide_format.ipynb @@ -13,7 +13,9 @@ "execution_count": null, "id": "9aacaba7", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -28,7 +30,11 @@ "cell_type": "code", "execution_count": null, "id": "d01a155d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -67,7 +73,11 @@ "cell_type": "code", "execution_count": null, "id": "43ff9ae3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -78,7 +88,11 @@ "cell_type": "code", "execution_count": null, "id": "11e46901", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "params = vaep.nb.args_from_dict(args)\n", @@ -90,7 +104,11 @@ "cell_type": "code", "execution_count": null, "id": "1194de4e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "splits = datasplits.DataSplits.from_folder(params.data, 
file_format=params.file_format_in)" @@ -101,7 +119,10 @@ "execution_count": null, "id": "197708a1", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -123,7 +144,11 @@ "cell_type": "code", "execution_count": null, "id": "feeae52b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "annotation = pd.Series('test', train_data.index).to_frame('group')\n", @@ -135,7 +160,11 @@ "cell_type": "code", "execution_count": null, "id": "57546236", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = params.data / 'sample_annotation_placeholder.csv'\n", @@ -160,7 +189,10 @@ "execution_count": null, "id": "ce749fdb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -174,7 +206,11 @@ "cell_type": "code", "execution_count": null, "id": "8ce12421", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# 'data_wide_sample_cols.csv'" diff --git a/project/01_0_transform_data_to_wide_format.py b/project/01_0_transform_data_to_wide_format.py index e123defa5..5e1cee1cd 100644 --- a/project/01_0_transform_data_to_wide_format.py +++ b/project/01_0_transform_data_to_wide_format.py @@ -17,15 +17,14 @@ # # Transfer data for NAGuideR format # -# %% +# %% tags=["hide-input"] import pandas as pd import vaep import vaep.models from vaep.io import datasplits - -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -41,19 +40,19 @@ file_format_in: str = 'csv' # file format of original splits, default pickle (pkl) file_format_out: str = 'csv' # file format of transformed splits, default csv -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] params = vaep.nb.args_from_dict(args) # 
params = OmegaConf.create(args) params -# %% +# %% tags=["hide-input"] splits = datasplits.DataSplits.from_folder(params.data, file_format=params.file_format_in) -# %% +# %% tags=["hide-input"] train_data = splits.train_X.unstack() train_data @@ -62,12 +61,12 @@ # Save placeholder sample annotation for use in NAGuideR app which requires such a file -# %% +# %% tags=["hide-input"] annotation = pd.Series('test', train_data.index).to_frame('group') annotation.index.name = 'Samples' annotation -# %% +# %% tags=["hide-input"] fname = params.data / 'sample_annotation_placeholder.csv' annotation.to_csv(fname) fname @@ -75,12 +74,12 @@ # %% [markdo] # Save with samples in columns -# %% +# %% tags=["hide-input"] fname = params.data / 'data_wide_sample_cols.csv' # fillna('Filtered') train_data.T.to_csv(fname) fname -# %% +# %% tags=["hide-input"] # 'data_wide_sample_cols.csv' diff --git a/project/01_1_train_CF.ipynb b/project/01_1_train_CF.ipynb index 6ce02e159..b508cd19d 100644 --- a/project/01_1_train_CF.ipynb +++ b/project/01_1_train_CF.ipynb @@ -12,38 +12,32 @@ "cell_type": "code", "execution_count": null, "id": "18b5d571-2956-4112-b22c-43d6c2146b06", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", - "\n", "from pprint import pprint\n", "\n", "import matplotlib.pyplot as plt\n", - "\n", - "# from fastai.basics import *\n", - "# from fastai.callback.all import *\n", - "# from fastai.torch_basics import *\n", - "# from fastai.data.all import *\n", - "\n", - "from fastai.tabular.all import *\n", + "# overwriting Recorder callback with custom plot_loss\n", + "from fastai import learner\n", "from fastai.collab import *\n", - "\n", - "from fastai.collab import (EmbeddingDotBias, Learner, MSELossFlat, EarlyStoppingCallback, default_device)\n", + "from fastai.collab import (EarlyStoppingCallback, EmbeddingDotBias, Learner,\n", + " MSELossFlat, default_device)\n", + "from fastai.tabular.all import *\n", "\n", 
"import vaep\n", "import vaep.model\n", "import vaep.models as models\n", - "from vaep.models import plot_loss, RecorderDump\n", - "\n", "import vaep.nb\n", - "from vaep import sampling\n", "from vaep.io import datasplits\n", - "\n", "from vaep.logging import setup_logger\n", + "from vaep.models import RecorderDump, plot_loss\n", "\n", - "# overwriting Recorder callback with custom plot_loss\n", - "from fastai import learner\n", "learner.Recorder.plot_loss = plot_loss\n", "# import fastai.callback.hook # Learner.summary\n", "\n", @@ -67,7 +61,11 @@ "cell_type": "code", "execution_count": null, "id": "85c7d6f9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -119,7 +117,11 @@ "cell_type": "code", "execution_count": null, "id": "0746e70f-0259-48d5-90ef-25fe4b59f9ac", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -131,7 +133,10 @@ "execution_count": null, "id": "100bbf80", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -158,7 +163,10 @@ "execution_count": null, "id": "a19fe098-a029-4f71-b7fb-e652a9c16ac7", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -180,7 +188,11 @@ "cell_type": "code", "execution_count": null, "id": "6d9cc7bd-6b6f-40b9-8db7-c8228e4b03e3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -199,7 +211,11 @@ "cell_type": "code", "execution_count": null, "id": "02bb6bf5-0eb1-4c73-9723-414b14eaf7c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X" @@ -209,7 +225,11 @@ "cell_type": "code", "execution_count": null, "id": "f3311709", - 
"metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! add check that specified data is available\n", @@ -229,7 +249,10 @@ "execution_count": null, "id": "44958473", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -269,7 +292,11 @@ "cell_type": "code", "execution_count": null, "id": "b5b945aa-9b4e-4487-8b09-dca289e64d9d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_simulated_na = data.val_y.to_frame(name='observed')\n", @@ -281,7 +308,10 @@ "execution_count": null, "id": "98558b10", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -304,7 +334,11 @@ "cell_type": "code", "execution_count": null, "id": "3ee54305-266a-479a-b677-f151ddde250a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# larger mini-batches speed up training\n", @@ -325,7 +359,10 @@ "execution_count": null, "id": "12ffa243-151e-4220-a1d5-247f8aba3429", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -337,7 +374,11 @@ "cell_type": "code", "execution_count": null, "id": "4a02e061-6789-4f3d-8031-a40879c496c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ana_collab.model = EmbeddingDotBias.from_classes(\n", @@ -371,7 +412,11 @@ "cell_type": "code", "execution_count": null, "id": "8317c9e1-d128-4ab4-8d60-775cb85ef535", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=train_collab\n", @@ -419,7 +464,10 @@ "execution_count": null, "id": "bb76e6c5-e135-41c4-95e8-a56c3764c731", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, 
"outputs": [], "source": [ @@ -443,7 +491,11 @@ "cell_type": "code", "execution_count": null, "id": "c7f0c597-d3c7-42d0-a6ef-3bc4c13121b8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ana_collab.test_dl = ana_collab.dls.test_dl(data.test_y.reset_index())\n", @@ -456,7 +508,10 @@ "execution_count": null, "id": "1cd76df6", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -484,7 +539,10 @@ "execution_count": null, "id": "cff8caf4-ccc9-4a36-a992-2cc596abe51a", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -510,7 +568,11 @@ "cell_type": "code", "execution_count": null, "id": "d825e38e-f3d6-4bca-b621-150267e7b7bc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -613,7 +675,11 @@ "cell_type": "code", "execution_count": null, "id": "782636ac-c979-4f8b-9fc0-66fd0c7a3a8b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# save simulated missing values for both splits\n", @@ -635,7 +701,11 @@ "cell_type": "code", "execution_count": null, "id": "0f13cb38-abf0-4b56-9399-3d11d32f7fbc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", @@ -646,7 +716,11 @@ "cell_type": "code", "execution_count": null, "id": "408b261a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] } diff --git a/project/01_1_train_CF.py b/project/01_1_train_CF.py index 9fca75638..68acf2afa 100644 --- a/project/01_1_train_CF.py +++ b/project/01_1_train_CF.py @@ -16,31 +16,26 @@ # %% [markdown] # # Collaborative Filtering -# %% +# %% tags=["hide-input"] import logging - from pprint 
import pprint import matplotlib.pyplot as plt - -from fastai.tabular.all import * +# overwriting Recorder callback with custom plot_loss +from fastai import learner from fastai.collab import * - -from fastai.collab import (EmbeddingDotBias, Learner, MSELossFlat, EarlyStoppingCallback, default_device) +from fastai.collab import (EarlyStoppingCallback, EmbeddingDotBias, Learner, + MSELossFlat, default_device) +from fastai.tabular.all import * import vaep import vaep.model import vaep.models as models -from vaep.models import plot_loss, RecorderDump - import vaep.nb -from vaep import sampling from vaep.io import datasplits - from vaep.logging import setup_logger +from vaep.models import RecorderDump, plot_loss -# overwriting Recorder callback with custom plot_loss -from fastai import learner learner.Recorder.plot_loss = plot_loss # import fastai.callback.hook # Learner.summary @@ -54,7 +49,7 @@ # %% [markdown] # Papermill script parameters: -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -83,11 +78,11 @@ # %% [markdown] # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) # # Currently not needed -> DotProduct used, not a FNN @@ -101,7 +96,7 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' if not args.cuda: @@ -111,24 +106,24 @@ # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X -# %% +# %% tags=["hide-input"] # # ! add check that specified data is available # silent error in fastai if e.g. 
target column is not available # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -151,11 +146,11 @@ # %% [markdown] # The validation simulated NA is used to by all models to evaluate training performance. -# %% +# %% tags=["hide-input"] val_pred_simulated_na = data.val_y.to_frame(name='observed') val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na = data.test_y.to_frame(name='observed') test_pred_simulated_na.describe() @@ -166,7 +161,7 @@ # - save custom collab batch size (increase AE batch size by a factor), could be setup separately. # - the test data is used to evaluate the performance after training -# %% +# %% tags=["hide-input"] # larger mini-batches speed up training ana_collab = models.collab.CollabAnalysis( datasplits=data, @@ -179,12 +174,12 @@ ), batch_size=args.batch_size) -# %% +# %% tags=["hide-input"] print("Args:") pprint(ana_collab.model_kwargs) -# %% +# %% tags=["hide-input"] ana_collab.model = EmbeddingDotBias.from_classes( classes=ana_collab.dls.classes, **ana_collab.model_kwargs) @@ -206,7 +201,7 @@ # %% [markdown] # ### Training -# %% +# %% tags=["hide-input"] # papermill_description=train_collab suggested_lr = ana_collab.learn.lr_find() print(f"{suggested_lr.valley = :.5f}") @@ -234,7 +229,7 @@ # %% [markdown] # Compare simulated_na data predictions to original values -# %% +# %% tags=["hide-input"] # this could be done using the validation data laoder now ana_collab.test_dl = ana_collab.dls.test_dl( data.val_y.reset_index()) # test_dl is here validation data @@ -246,12 +241,12 @@ # %% [markdown] # select test data predictions -# %% +# %% tags=["hide-input"] ana_collab.test_dl = ana_collab.dls.test_dl(data.test_y.reset_index()) test_pred_simulated_na['CF'], _ = ana_collab.learn.get_preds(dl=ana_collab.test_dl) test_pred_simulated_na -# %% +# %% 
tags=["hide-input"] if args.save_pred_real_na: pred_real_na = models.collab.get_missing_values( df_train_long=data.train_X, @@ -266,7 +261,7 @@ # # - Autoencoder need data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -280,7 +275,7 @@ # > Does not make to much sense to compare collab and AEs, # > as the setup differs of training and validation data differs -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() @@ -318,7 +313,7 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # save simulated missing values for both splits val_pred_simulated_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") test_pred_simulated_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") @@ -326,8 +321,8 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args -# %% +# %% tags=["hide-input"] diff --git a/project/01_1_train_DAE.ipynb b/project/01_1_train_DAE.ipynb index d33607a87..ac07f5a79 100644 --- a/project/01_1_train_DAE.ipynb +++ b/project/01_1_train_DAE.ipynb @@ -13,32 +13,31 @@ "execution_count": null, "id": "18b5d571-2956-4112-b22c-43d6c2146b06", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "import logging\n", "\n", + "import sklearn\n", + "from fastai import learner\n", "from fastai.basics import *\n", "from fastai.callback.all import *\n", "from fastai.torch_basics import *\n", - "\n", "from IPython.display import display\n", - "\n", - "import sklearn\n", - "from sklearn.preprocessing import StandardScaler\n", "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import StandardScaler\n", "\n", "import vaep\n", - "from vaep.io import datasplits\n", - "from vaep.models import ae\n", - "import vaep.models as models\n", "import vaep.model\n", 
+ "import vaep.models as models\n", "from vaep.analyzers import analyzers\n", - "\n", + "from vaep.io import datasplits\n", "# overwriting Recorder callback with custom plot_loss\n", - "from vaep.models import plot_loss\n", - "from fastai import learner\n", + "from vaep.models import ae, plot_loss\n", "\n", "learner.Recorder.plot_loss = plot_loss\n", "\n", @@ -54,7 +53,11 @@ "cell_type": "code", "execution_count": null, "id": "297f14bc-3c37-43fa-8217-f790f0593d78", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -124,7 +127,11 @@ "cell_type": "code", "execution_count": null, "id": "0746e70f-0259-48d5-90ef-25fe4b59f9ac", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -136,7 +143,10 @@ "execution_count": null, "id": "e20093e1", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -163,7 +173,11 @@ "cell_type": "code", "execution_count": null, "id": "a19fe098-a029-4f71-b7fb-e652a9c16ac7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -181,7 +195,11 @@ "cell_type": "code", "execution_count": null, "id": "6d9cc7bd-6b6f-40b9-8db7-c8228e4b03e3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -200,7 +218,11 @@ "cell_type": "code", "execution_count": null, "id": "02bb6bf5-0eb1-4c73-9723-414b14eaf7c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -218,7 +240,11 @@ "cell_type": "code", "execution_count": null, "id": "44958473", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ 
"index_columns = list(data.train_X.index.names)\n", @@ -250,7 +276,10 @@ "execution_count": null, "id": "b5b945aa-9b4e-4487-8b09-dca289e64d9d", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -281,7 +310,11 @@ "cell_type": "code", "execution_count": null, "id": "98f675b6-e619-45b6-8f04-b75237d212a7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_simulated_na = data.val_y.to_frame(name='observed')\n", @@ -293,7 +326,10 @@ "execution_count": null, "id": "9686a2eb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -316,7 +352,10 @@ "execution_count": null, "id": "cff8caf4-ccc9-4a36-a992-2cc596abe51a", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -337,7 +376,11 @@ "cell_type": "code", "execution_count": null, "id": "7952fe13", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X" @@ -347,7 +390,11 @@ "cell_type": "code", "execution_count": null, "id": "a0a0bcd9-22af-4dd9-af56-b041931ee918", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.val_y # potentially has less features" @@ -357,7 +404,11 @@ "cell_type": "code", "execution_count": null, "id": "9f0826f9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.val_y = pd.DataFrame(pd.NA, index=data.train_X.index,\n", @@ -385,7 +436,11 @@ "cell_type": "code", "execution_count": null, "id": "7bbed0af-64bd-45d8-9be2-5b856cb25cce", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "default_pipeline = sklearn.pipeline.Pipeline(\n", @@ -424,7 +479,11 @@ "cell_type": "code", "execution_count": null, "id": 
"4c568fe5-adfb-401c-afed-fabce46be0fe", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "analysis.learn = Learner(dls=analysis.dls,\n", @@ -451,7 +510,11 @@ "cell_type": "code", "execution_count": null, "id": "d625fb88", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# learn.summary()" @@ -461,7 +524,11 @@ "cell_type": "code", "execution_count": null, "id": "2fad0a84-3d3a-4e77-9f80-58b7f45f5352", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "suggested_lr = analysis.learn.lr_find()\n", @@ -482,7 +549,10 @@ "execution_count": null, "id": "99a5f505-7785-4152-8bed-73bd965f3ea8", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -494,7 +564,11 @@ "cell_type": "code", "execution_count": null, "id": "a83ba8fb", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=train\n", @@ -513,7 +587,11 @@ "cell_type": "code", "execution_count": null, "id": "d7ae4840", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.epoch_trained = analysis.learn.epoch + 1\n", @@ -533,7 +611,10 @@ "execution_count": null, "id": "c0d278d3-6b12-420e-92f9-f8c2dc06ec02", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -574,7 +655,10 @@ "execution_count": null, "id": "ff3aad0d", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -589,7 +673,10 @@ "execution_count": null, "id": "c965ca0f-5936-460d-b696-015d7db01d75", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -601,7 +688,11 @@ "cell_type": "code", 
"execution_count": null, "id": "dc1ff5c3-f01b-4997-845a-ea72f041c96d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_simulated_na['DAE'] = pred # model_key?\n", @@ -621,7 +712,10 @@ "execution_count": null, "id": "0f907181", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -648,7 +742,11 @@ "cell_type": "code", "execution_count": null, "id": "cc065c5f-7bba-48d5-bc87-e4cf90462a6f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "analysis.model.cpu()\n", @@ -662,7 +760,11 @@ "cell_type": "code", "execution_count": null, "id": "7b915728-5e84-45b7-bbc0-da32bc657091", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! calculate embeddings only if meta data is available? Optional argument to save embeddings?\n", @@ -679,7 +781,11 @@ "cell_type": "code", "execution_count": null, "id": "33404d1b-f553-4e05-be7e-821511883507", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.meta_cat_col and df_meta is not None:\n", @@ -713,7 +819,11 @@ "cell_type": "code", "execution_count": null, "id": "d825e38e-f3d6-4bca-b621-150267e7b7bc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -732,7 +842,11 @@ "cell_type": "code", "execution_count": null, "id": "855a7a6f-93fd-4612-9d8d-96541a2441be", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_simulated_na, 'valid_simulated_na')\n", @@ -751,7 +865,11 @@ "cell_type": "code", "execution_count": null, "id": "571ac8d4-bb5d-45db-bba8-59817e476304", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = 
d_metrics.add_metrics(test_pred_simulated_na, 'test_simulated_na')\n", @@ -770,7 +888,11 @@ "cell_type": "code", "execution_count": null, "id": "87910434-7d07-4e8e-8380-c92fc515bd16", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "vaep.io.dump_json(d_metrics.metrics, args.out_metrics /\n", @@ -782,7 +904,11 @@ "cell_type": "code", "execution_count": null, "id": "7d99deb9-9aad-4ba9-b79d-e4b3c6c7f023", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(d_metrics.metrics,\n", @@ -803,7 +929,10 @@ "execution_count": null, "id": "782636ac-c979-4f8b-9fc0-66fd0c7a3a8b", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -824,7 +953,11 @@ "cell_type": "code", "execution_count": null, "id": "883de917", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" 
@@ -834,7 +967,11 @@ "cell_type": "code", "execution_count": null, "id": "0f13cb38-abf0-4b56-9399-3d11d32f7fbc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", @@ -845,7 +982,11 @@ "cell_type": "code", "execution_count": null, "id": "43e4a4ad", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] } diff --git a/project/01_1_train_DAE.py b/project/01_1_train_DAE.py index e06baeb15..069b02a0e 100644 --- a/project/01_1_train_DAE.py +++ b/project/01_1_train_DAE.py @@ -16,29 +16,25 @@ # %% [markdown] # # Denoising Autoencoder -# %% +# %% tags=["hide-input"] import logging +import sklearn +from fastai import learner from fastai.basics import * from fastai.callback.all import * from fastai.torch_basics import * - from IPython.display import display - -import sklearn -from sklearn.preprocessing import StandardScaler from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler import vaep -from vaep.io import datasplits -from vaep.models import ae -import vaep.models as models import vaep.model +import vaep.models as models from vaep.analyzers import analyzers - +from vaep.io import datasplits # overwriting Recorder callback with custom plot_loss -from vaep.models import plot_loss -from fastai import learner +from vaep.models import ae, plot_loss learner.Recorder.plot_loss = plot_loss @@ -50,7 +46,7 @@ figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -91,11 +87,11 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) if isinstance(args.hidden_layers, str): @@ -110,26 +106,26 @@ # %% [markdown] # Some naming conventions -# %% +# %% 
tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -148,7 +144,7 @@ # %% [markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -162,11 +158,11 @@ # %% [markdown] # The validation simulated NA is used to by all models to evaluate training performance. -# %% +# %% tags=["hide-input"] val_pred_simulated_na = data.val_y.to_frame(name='observed') val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na = data.test_y.to_frame(name='observed') test_pred_simulated_na.describe() @@ -176,7 +172,7 @@ # # - Autoencoder need data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -185,13 +181,13 @@ # %% [markdown] # ### Fill Validation data with potentially missing features -# %% +# %% tags=["hide-input"] data.train_X -# %% +# %% tags=["hide-input"] data.val_y # potentially has less features -# %% +# %% tags=["hide-input"] data.val_y = pd.DataFrame(pd.NA, index=data.train_X.index, columns=data.train_X.columns).fillna(data.val_y) data.val_y @@ -202,7 +198,7 @@ # %% [markdown] # ### Analysis: DataLoaders, Model, transform -# %% +# %% tags=["hide-input"] default_pipeline = sklearn.pipeline.Pipeline( [ ('normalize', StandardScaler()), @@ -229,7 +225,7 @@ # %% [markdown] # ### Training -# %% +# %% tags=["hide-input"] analysis.learn = 
Learner(dls=analysis.dls, model=analysis.model, loss_func=MSELossFlat(reduction='sum'), @@ -244,10 +240,10 @@ # [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in # current version. Try again later -# %% +# %% tags=["hide-input"] # learn.summary() -# %% +# %% tags=["hide-input"] suggested_lr = analysis.learn.lr_find() analysis.params['suggested_inital_lr'] = suggested_lr.valley suggested_lr @@ -255,26 +251,26 @@ # %% [markdown] # dump model config -# %% +# %% tags=["hide-input"] vaep.io.dump_json(analysis.params, args.out_models / TEMPLATE_MODEL_PARAMS.format(args.model_key)) -# %% +# %% tags=["hide-input"] # papermill_description=train analysis.learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley) # %% [markdown] # Save number of actually trained epochs -# %% +# %% tags=["hide-input"] args.epoch_trained = analysis.learn.epoch + 1 args.epoch_trained # %% [markdown] # #### Loss normalized by total number of measurements -# %% +# %% tags=["hide-input"] N_train_notna = data.train_X.notna().sum().sum() N_val_notna = data.val_y.notna().sum().sum() fig = models.plot_training_losses(analysis.learn, args.model_key, @@ -297,24 +293,24 @@ # # create predictiona and select for validation data -# %% +# %% tags=["hide-input"] analysis.model.eval() pred, target = analysis.get_preds_from_df(df_wide=data.train_X) # train_X pred = pred.stack() pred -# %% +# %% tags=["hide-input"] val_pred_simulated_na['DAE'] = pred # model_key ? val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na['DAE'] = pred # model_key? 
test_pred_simulated_na # %% [markdown] # save missing values predictions -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: pred_real_na = ae.get_missing_values(df_train_wide=data.train_X, val_idx=val_pred_simulated_na.index, @@ -329,14 +325,14 @@ # # - validation data -# %% +# %% tags=["hide-input"] analysis.model.cpu() df_latent = vaep.model.get_latent_space(analysis.model.encoder, dl=analysis.dls.valid, dl_index=analysis.dls.valid.data.index) df_latent -# %% +# %% tags=["hide-input"] # # ! calculate embeddings only if meta data is available? Optional argument to save embeddings? ana_latent = analyzers.LatentAnalysis(df_latent, df_meta, @@ -346,7 +342,7 @@ figures[f'latent_{args.model_key}_by_date'], ax = ana_latent.plot_by_date( args.meta_date_col) -# %% +# %% tags=["hide-input"] if args.meta_cat_col and df_meta is not None: figures[f'latent_{args.model_key}_by_{"_".join(args.meta_cat_col.split())}'], ax = ana_latent.plot_by_category( args.meta_cat_col) @@ -363,33 +359,33 @@ # # - all measured (identified, observed) peptides in validation data -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() # %% [markdown] # The simulated NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_simulated_na, 'valid_simulated_na') added_metrics # %% [markdown] # ### Test Datasplit -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_simulated_na, 'test_simulated_na') added_metrics # %% [markdown] # Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df @@ -397,18 +393,18 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # 
save simulated missing values for both splits val_pred_simulated_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") test_pred_simulated_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? -# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args -# %% +# %% tags=["hide-input"] diff --git a/project/01_1_train_KNN.ipynb b/project/01_1_train_KNN.ipynb index 1edfeccf0..ebd24a6d0 100644 --- a/project/01_1_train_KNN.ipynb +++ b/project/01_1_train_KNN.ipynb @@ -13,7 +13,10 @@ "execution_count": null, "id": "76e01f3e", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -22,6 +25,7 @@ "import pandas as pd\n", "import sklearn\n", "import sklearn.impute\n", + "from IPython.display import display\n", "\n", "import vaep\n", "import vaep.model\n", @@ -41,7 +45,11 @@ "cell_type": "code", "execution_count": null, "id": "3fb29f3e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -104,7 +112,10 @@ "execution_count": null, "id": "10dfbb95", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -125,7 +136,11 @@ "cell_type": "code", "execution_count": null, "id": "b0f78a72", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -143,7 +158,11 @@ "cell_type": "code", "execution_count": null, "id": "a7a2208e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)" @@ -161,7 +180,11 @@ "cell_type": "code", "execution_count": null, "id": "34b79387", - "metadata": {}, + 
"metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -179,7 +202,11 @@ "cell_type": "code", "execution_count": null, "id": "a61f7f70", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -201,7 +228,11 @@ "cell_type": "code", "execution_count": null, "id": "5e1a277f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_feat = sampling.frequency_by_index(data.train_X, 0)\n", @@ -228,7 +259,11 @@ "cell_type": "code", "execution_count": null, "id": "d056fb82", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na = data.val_y.to_frame(name='observed')\n", @@ -240,7 +275,10 @@ "execution_count": null, "id": "ade74bab", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -260,7 +298,11 @@ "cell_type": "code", "execution_count": null, "id": "4a75a078", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.to_wide_format()\n", @@ -282,7 +324,11 @@ "cell_type": "code", "execution_count": null, "id": "58d3383c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "knn_imputer = sklearn.impute.KNNImputer(n_neighbors=args.neighbors).fit(data.train_X)" @@ -305,7 +351,11 @@ "cell_type": "code", "execution_count": null, "id": "98a1cba0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "pred = knn_imputer.transform(data.train_X)\n", @@ -317,7 +367,11 @@ "cell_type": "code", "execution_count": null, "id": "5b24132c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na[args.model_key] = pred\n", @@ -328,7 +382,11 @@ "cell_type": "code", "execution_count": null, 
"id": "f8937bcc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_fake_na[args.model_key] = pred\n", @@ -348,7 +406,10 @@ "execution_count": null, "id": "8a853343", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -375,7 +436,11 @@ "cell_type": "code", "execution_count": null, "id": "e7c0d93c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] }, @@ -384,11 +449,7 @@ "id": "8881f26e", "metadata": {}, "source": [ - "## Comparisons\n", - "\n", - "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE)\n", - "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n", - "> Could be changed." + "## Comparisons\n" ] }, { @@ -408,7 +469,11 @@ "cell_type": "code", "execution_count": null, "id": "1d3a789e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -427,7 +492,11 @@ "cell_type": "code", "execution_count": null, "id": "d56eb144", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')\n", @@ -450,7 +519,11 @@ "cell_type": "code", "execution_count": null, "id": "3d0be628", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')\n", @@ -469,7 +542,11 @@ "cell_type": "code", "execution_count": null, "id": "9f03ba1f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json')\n", @@ -480,7 +557,11 @@ "cell_type": 
"code", "execution_count": null, "id": "b962d322", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(d_metrics.metrics,\n", @@ -500,7 +581,11 @@ "cell_type": "code", "execution_count": null, "id": "ce0fb347", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# save simulated missing values for both splits\n", @@ -520,7 +605,11 @@ "cell_type": "code", "execution_count": null, "id": "fc43a4e8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" @@ -530,7 +619,11 @@ "cell_type": "code", "execution_count": null, "id": "d3bbd037", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.n_params = 1 # the number of neighbors to consider\n", diff --git a/project/01_1_train_KNN.py b/project/01_1_train_KNN.py index 9989d0f25..ddd21c2aa 100644 --- a/project/01_1_train_KNN.py +++ b/project/01_1_train_KNN.py @@ -16,7 +16,7 @@ # %% [markdown] # # K- Nearest Neighbors (KNN) -# %% +# %% tags=["hide-input"] import logging import pandas as pd @@ -38,7 +38,7 @@ figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -72,7 +72,7 @@ # %% [markdown] # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args = vaep.nb.args_from_dict(args) args @@ -81,25 +81,25 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # load meta data for 
splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -109,7 +109,7 @@ # %% [markdown] # ## Initialize Comparison -# %% +# %% tags=["hide-input"] freq_feat = sampling.frequency_by_index(data.train_X, 0) freq_feat.head() # training data @@ -119,11 +119,11 @@ # %% [markdown] # The validation fake NA is used to by all models to evaluate training performance. -# %% +# %% tags=["hide-input"] val_pred_fake_na = data.val_y.to_frame(name='observed') val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na = data.test_y.to_frame(name='observed') test_pred_fake_na.describe() @@ -131,7 +131,7 @@ # %% [markdown] # ## Data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -140,7 +140,7 @@ # ## Train # model = 'sklearn_knn' -# %% +# %% tags=["hide-input"] knn_imputer = sklearn.impute.KNNImputer(n_neighbors=args.neighbors).fit(data.train_X) # %% [markdown] @@ -151,23 +151,23 @@ # # create predictions and select for split entries -# %% +# %% tags=["hide-input"] pred = knn_imputer.transform(data.train_X) pred = pd.DataFrame(pred, index=data.train_X.index, columns=data.train_X.columns).stack() pred -# %% +# %% tags=["hide-input"] val_pred_fake_na[args.model_key] = pred val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na[args.model_key] = pred test_pred_fake_na # %% [markdown] # save missing values predictions -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: pred_real_na = ae.get_missing_values(df_train_wide=data.train_X, val_idx=val_pred_fake_na.index, @@ -182,7 +182,7 @@ # # - validation data -# %% +# %% tags=["hide-input"] # %% [markdown] # ## Comparisons @@ -196,14 +196,14 @@ # > Does not make to much sense to compare collab and AEs, # > as the setup differs of training and validation data differs -# %% +# %% tags=["hide-input"] # 
papermill_description=metrics d_metrics = models.Metrics() # %% [markdown] # The fake NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na') added_metrics @@ -214,18 +214,18 @@ # explicitly to misssing before it was fed to the model for # reconstruction. -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') added_metrics # %% [markdown] # Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df @@ -233,7 +233,7 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # save simulated missing values for both splits val_pred_fake_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") test_pred_fake_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") @@ -241,10 +241,10 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? 
-# %% +# %% tags=["hide-input"] args.n_params = 1 # the number of neighbors to consider args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_train_KNN_unique_samples.py b/project/01_1_train_KNN_unique_samples.py new file mode 100644 index 000000000..1cd24fe26 --- /dev/null +++ b/project/01_1_train_KNN_unique_samples.py @@ -0,0 +1,301 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.0 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # K- Nearest Neighbors (KNN) + +# %% +import logging + +import pandas as pd +import sklearn +from sklearn.model_selection import train_test_split + +import vaep +import vaep.model +import vaep.models as models +import vaep.nb +from vaep import sampling +from vaep.io import datasplits +from vaep.models import ae + +logger = vaep.logging.setup_logger(logging.getLogger('vaep')) +logger.info("Experiment 03 - Analysis of latent spaces and performance comparisions") + +figures = {} # collection of ax or figures + + +# %% +# catch passed parameters +args = None +args = dict(globals()).keys() + +# %% [markdown] +# Papermill script parameters: + +# %% tags=["parameters"] +# files and folders +folder_experiment: str = 'runs/example' # Datasplit folder with data for experiment +folder_data: str = '' # specify data directory if needed +file_format: str = 'csv' # file format of create splits, default pickle (pkl) +# Machine parsed metadata from rawfile workflow +fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' +# training +epochs_max: int = 50 # Maximum number of epochs +# early_stopping:bool = True # Wheather to use early stopping or not +batch_size: int = 64 # Batch size for training (and evaluation) +cuda: bool = True # Whether to use a GPU for training +# model 
+neighbors: int = 3 # number of neigherst neighbors to use +force_train: bool = True # Force training when saved model could be used. Per default re-train model +sample_idx_position: int = 0 # position of index which is sample ID +model: str = 'KNN' # model name +model_key: str = 'KNN_UNIQUE' # potentially alternative key for model (grid search) +save_pred_real_na: bool = True # Save all predictions for missing values +# metadata -> defaults for metadata extracted from machine data +meta_date_col: str = None # date column in meta data +meta_cat_col: str = None # category column in meta data + + +# Parameters +neighbors = 3 +folder_experiment = "runs/rev3" +folder_data = "runs/appl_ald_data_2023_11/plasma/proteinGroups/data" +fn_rawfile_metadata = "data/ALD_study/processed/ald_metadata_cli.csv" +meta_cat_col = 'kleiner' + +# %% [markdown] +# Some argument transformations + +# %% +args = vaep.nb.get_params(args, globals=globals()) +args = vaep.nb.args_from_dict(args) +args + + +# %% [markdown] +# Some naming conventions + +# %% +TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' + + +# %% [markdown] +# load meta data for splits + + +# %% [markdown] +# ## Load data in long format + +# %% +data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) + +# %% [markdown] +# data is loaded in long format + +# %% +data.train_X.sample(5) + +# %% +if args.fn_rawfile_metadata: + df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) + df_meta = df_meta.loc[data.train_X.index.levels[0]] +else: + df_meta = None +df_meta + + +# %% +df_meta['to_stratify'] = df_meta[args.meta_cat_col].fillna(-1) +data.to_wide_format() +train_idx, val_test_idx = train_test_split(data.train_X.index, + test_size=.2, + stratify=df_meta['to_stratify'], + random_state=42) +val_idx, test_idx = train_test_split(val_test_idx, + test_size=.5, + stratify=df_meta.loc[val_test_idx, 'to_stratify'], + random_state=42) +print("Train:", train_idx.shape, "Val:", val_idx.shape, "Test:", 
test_idx.shape) + +# %% +data.train_X.update(data.val_y.loc[train_idx]) +data.train_X.update(data.test_y.loc[train_idx]) +data.val_X = data.train_X.loc[val_idx] +data.test_X = data.train_X.loc[test_idx] +data.train_X = data.train_X.loc[train_idx] + +data.val_y = data.val_y.loc[val_idx] +data.test_y = data.test_y.loc[test_idx] + +# %% +data.to_long_format() + +# %% [markdown] +# ## Initialize Comparison + +# %% +freq_feat = sampling.frequency_by_index(data.train_X, 0) +freq_feat.head() # training data + +# %% [markdown] +# ### Simulated missing values + +# %% [markdown] +# The validation fake NA is used to by all models to evaluate training performance. + +# %% +val_pred_fake_na = data.val_y.to_frame(name='observed') +val_pred_fake_na + +# %% +test_pred_fake_na = data.test_y.to_frame(name='observed') +test_pred_fake_na.describe() + + +# %% [markdown] +# ## Data in wide format + +# %% +data.to_wide_format() +args.M = data.train_X.shape[-1] +data.train_X + +# %% [markdown] +# ## Train +# model = 'sklearn_knn' + +# %% +knn_imputer = sklearn.impute.KNNImputer(n_neighbors=args.neighbors).fit(data.train_X) + +# %% [markdown] +# ### Predictions +# +# - data of training data set and validation dataset to create predictions is the same as training data. 
+# - predictions include missing values (which are not further compared) +# +# create predictions and select for split entries + +# %% +pred = knn_imputer.transform(data.val_X) +pred = pd.DataFrame(pred, index=data.val_X.index, columns=data.val_X.columns).stack() +pred + +# %% +val_pred_fake_na[args.model_key] = pred +val_pred_fake_na + +# %% +pred = knn_imputer.transform(data.test_X) +pred = pd.DataFrame(pred, index=data.test_X.index, columns=data.test_X.columns).stack() + +test_pred_fake_na[args.model_key] = pred +test_pred_fake_na + +# %% [markdown] +# save missing values predictions + +# %% +df_complete = pd.concat([data.train_X, data.val_X, data.test_X]) +pred = knn_imputer.transform(df_complete) +pred = pd.DataFrame(pred, index=df_complete.index, columns=df_complete.columns).stack() +pred + +# %% +if args.save_pred_real_na: + pred_real_na = ae.get_missing_values(df_train_wide=df_complete, + val_idx=val_pred_fake_na.index, + test_idx=test_pred_fake_na.index, + pred=pred) + display(pred_real_na) + pred_real_na.to_csv(args.out_preds / f"pred_real_na_{args.model_key}.csv") + + +# %% [markdown] +# ### Plots +# +# - validation data + +# %% + +# %% [markdown] +# ## Comparisons +# +# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) +# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) +# > Could be changed. 
+ +# %% [markdown] +# ### Validation data +# +# - all measured (identified, observed) peptides in validation data +# +# > Does not make to much sense to compare collab and AEs, +# > as the setup differs of training and validation data differs + +# %% +# papermill_description=metrics +d_metrics = models.Metrics() + +# %% [markdown] +# The fake NA for the validation step are real test data (not used for training nor early stopping) + +# %% +added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na') +added_metrics + +# %% [markdown] +# ### Test Datasplit +# +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. + +# %% +added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') +added_metrics + +# %% [markdown] +# Save all metrics as json + +# %% +vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') +d_metrics + +# %% +metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, + column_levels=['model', 'metric_name']).T +metrics_df + +# %% [markdown] +# ## Save predictions + +# %% +# save simulated missing values for both splits +val_pred_fake_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") +test_pred_fake_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") + +# %% [markdown] +# ## Config + +# %% +figures # switch to fnames? 
+ +# %% +args.n_params = 1 # the number of neighbors to consider +args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") +args + +# %% diff --git a/project/01_1_train_Median.ipynb b/project/01_1_train_Median.ipynb index 3c2933e5d..406691aab 100644 --- a/project/01_1_train_Median.ipynb +++ b/project/01_1_train_Median.ipynb @@ -13,20 +13,24 @@ "execution_count": null, "id": "bdefddcb", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "import logging\n", "\n", "import pandas as pd\n", + "from IPython.display import display\n", "\n", "import vaep\n", "import vaep.model\n", "import vaep.models as models\n", + "import vaep.nb\n", "from vaep.io import datasplits\n", "\n", - "import vaep.nb\n", "logger = vaep.logging.setup_logger(logging.getLogger('vaep'))\n", "logger.info(\"Median Imputation\")\n", "\n", @@ -37,7 +41,11 @@ "cell_type": "code", "execution_count": null, "id": "82a53c81", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -93,7 +101,11 @@ "cell_type": "code", "execution_count": null, "id": "17c49967", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -105,7 +117,10 @@ "execution_count": null, "id": "071eb3aa", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -125,7 +140,11 @@ "cell_type": "code", "execution_count": null, "id": "79bd3e3d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -143,7 +162,11 @@ "cell_type": "code", "execution_count": null, "id": "fb8a27fa", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = 
datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)" @@ -161,7 +184,11 @@ "cell_type": "code", "execution_count": null, "id": "0df21944", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -179,7 +206,11 @@ "cell_type": "code", "execution_count": null, "id": "86b35447", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "index_columns = list(data.train_X.index.names)\n", @@ -209,7 +240,11 @@ "cell_type": "code", "execution_count": null, "id": "efe75402", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -237,7 +272,11 @@ "cell_type": "code", "execution_count": null, "id": "75aa1ac5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_feat = vaep.io.datasplits.load_freq(args.data)\n", @@ -264,7 +303,11 @@ "cell_type": "code", "execution_count": null, "id": "16f53ce5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na = data.val_y.to_frame(name='observed')\n", @@ -276,7 +319,10 @@ "execution_count": null, "id": "68ea1649", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -299,7 +345,10 @@ "execution_count": null, "id": "c679a1f9", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -320,7 +369,11 @@ "cell_type": "code", "execution_count": null, "id": "71d667e3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# interpolated = vaep.pandas.interpolate(wide_df = data.train_X)\n", @@ -335,7 +388,10 @@ "execution_count": null, "id": "9fb6cea0", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + 
"hide-input" + ] }, "outputs": [], "source": [ @@ -355,7 +411,10 @@ "execution_count": null, "id": "c05ecd3a", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -389,7 +448,11 @@ "cell_type": "code", "execution_count": null, "id": "73648586", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_val = val_pred_fake_na['observed'].groupby(level=-1).count()\n", @@ -401,7 +464,11 @@ "cell_type": "code", "execution_count": null, "id": "c3662d07", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# # scatter plot between overall feature freq and split freq\n", @@ -412,7 +479,11 @@ "cell_type": "code", "execution_count": null, "id": "3c72fe9c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_val.value_counts().sort_index().head() # require more than one feat?" 
@@ -422,7 +493,11 @@ "cell_type": "code", "execution_count": null, "id": "786f8804", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0)\n", @@ -442,7 +517,11 @@ "cell_type": "code", "execution_count": null, "id": "6da29e33", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0)\n", @@ -453,7 +532,11 @@ "cell_type": "code", "execution_count": null, "id": "3565522f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val" @@ -464,11 +547,7 @@ "id": "ae345647", "metadata": {}, "source": [ - "## Comparisons\n", - "\n", - "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE)\n", - "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n", - "> Could be changed." 
+ "## Comparisons" ] }, { @@ -476,19 +555,18 @@ "id": "b43adc40", "metadata": {}, "source": [ - "### Validation data\n", - "\n", - "- all measured (identified, observed) peptides in validation data\n", - "\n", - "> Does not make too much sense to compare collab and AEs,\n", - "> as the setup differs of training and validation data differs" + "### Validation data\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b017353a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -507,7 +585,11 @@ "cell_type": "code", "execution_count": null, "id": "47caaf3b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')\n", @@ -530,7 +612,11 @@ "cell_type": "code", "execution_count": null, "id": "4b45f076", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')\n", @@ -549,7 +635,11 @@ "cell_type": "code", "execution_count": null, "id": "d6a5da36", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] }, @@ -566,7 +656,10 @@ "execution_count": null, "id": "26be5fa4", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -578,7 +671,11 @@ "cell_type": "code", "execution_count": null, "id": "7fe80e9a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T\n", @@ -597,7 +694,11 @@ "cell_type": "code", "execution_count": null, "id": "225dc1f0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# val\n", @@ -622,7 +723,11 @@ "cell_type": "code", 
"execution_count": null, "id": "64a39dc2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" @@ -632,7 +737,11 @@ "cell_type": "code", "execution_count": null, "id": "14983bf9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", diff --git a/project/01_1_train_Median.py b/project/01_1_train_Median.py index 72a7cf562..cf43e2e1b 100644 --- a/project/01_1_train_Median.py +++ b/project/01_1_train_Median.py @@ -16,24 +16,25 @@ # %% [markdown] # # Variational Autoencoder -# %% +# %% tags=["hide-input"] import logging import pandas as pd +from IPython.display import display import vaep import vaep.model import vaep.models as models +import vaep.nb from vaep.io import datasplits -import vaep.nb logger = vaep.logging.setup_logger(logging.getLogger('vaep')) logger.info("Median Imputation") figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -60,11 +61,11 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) args @@ -72,25 +73,25 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -108,7 +109,7 @@ # %% 
[markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -124,7 +125,7 @@ # - Not used for predictions or early stopping. # - [x] add some additional NAs based on distribution of data -# %% +# %% tags=["hide-input"] freq_feat = vaep.io.datasplits.load_freq(args.data) freq_feat.head() # training data @@ -134,11 +135,11 @@ # %% [markdown] # The validation fake NA is used to by all models to evaluate training performance. -# %% +# %% tags=["hide-input"] val_pred_fake_na = data.val_y.to_frame(name='observed') val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na = data.test_y.to_frame(name='observed') test_pred_fake_na.describe() @@ -148,7 +149,7 @@ # # - Autoencoder need data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -157,14 +158,14 @@ # %% [markdown] # ### Add interpolation performance -# %% +# %% tags=["hide-input"] # interpolated = vaep.pandas.interpolate(wide_df = data.train_X) # val_pred_fake_na['interpolated'] = interpolated # test_pred_fake_na['interpolated'] = interpolated # del interpolated # test_pred_fake_na -# %% +# %% tags=["hide-input"] # Add median pred performance args.n_params = data.train_X.shape[-1] medians_train = data.train_X.median() @@ -176,7 +177,7 @@ val_pred_fake_na -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: mask = data.train_X.isna().stack() idx_real_na = mask.index[mask] @@ -196,19 +197,19 @@ # %% [markdown] # ### Plots # -# %% +# %% tags=["hide-input"] feat_freq_val = val_pred_fake_na['observed'].groupby(level=-1).count() feat_freq_val.name = 'freq_val' ax = feat_freq_val.plot.box() -# %% +# %% tags=["hide-input"] # # scatter plot between overall feature freq and split freq # freq_feat.to_frame('overall').join(feat_freq_val).plot.scatter(x='overall', y='freq_val') -# %% +# %% 
tags=["hide-input"] feat_freq_val.value_counts().sort_index().head() # require more than one feat? -# %% +# %% tags=["hide-input"] errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0) errors_val = errors_val.abs().groupby(level=-1).mean() errors_val = errors_val.join(freq_feat).sort_values(by='freq', ascending=True) @@ -221,36 +222,28 @@ ax = errors_val_smoothed.plot(x='freq', figsize=(15, 10)) # errors_val_smoothed -# %% +# %% tags=["hide-input"] errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0) errors_val.abs().groupby(level=-1).agg(['mean', 'count']) -# %% +# %% tags=["hide-input"] errors_val # %% [markdown] # ## Comparisons -# -# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) -# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) -# > Could be changed. # %% [markdown] # ### Validation data # -# - all measured (identified, observed) peptides in validation data -# -# > Does not make too much sense to compare collab and AEs, -# > as the setup differs of training and validation data differs -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() # %% [markdown] # The fake NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na') added_metrics @@ -261,31 +254,31 @@ # explicitly to misssing before it was fed to the model for # reconstruction. 
-# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') added_metrics # %% [markdown] # The fake NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] # %% [markdown] # ### Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # val fname = args.out_preds / f"pred_val_{args.model_key}.csv" setattr(args, fname.stem, fname.as_posix()) # add [] assignment? @@ -298,9 +291,9 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? -# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_train_NAGuideR_methods.R b/project/01_1_train_NAGuideR_methods.R index ce2f91af9..13997aa17 100644 --- a/project/01_1_train_NAGuideR_methods.R +++ b/project/01_1_train_NAGuideR_methods.R @@ -19,7 +19,7 @@ # # - BiocManager could be moved to methods who are installed from BioConductor -# + vscode={"languageId": "r"} +# + tags=["hide-input"] vscode={"languageId": "r"} packages_base_R <- c("BiocManager", "reshape2", "data.table", "readr", "tibble") @@ -58,7 +58,7 @@ for (package in packages_base_R) { # - seems quite hacky # - code is only slightly adapted from repo to run here, mainly to install packages on the fly -# + vscode={"languageId": "r"} +# + tags=["hide-input"] vscode={"languageId": "r"} nafunctions <- function(x, method = "zero") { df <- df1 <- as.data.frame(x) method <- tolower(method) @@ -407,7 +407,7 @@ original_header[1:5] # Uncomment to test certain methods (only for debugging, as at least one method per package is tested using Github 
Actions) -# + vscode={"languageId": "r"} +# + tags=["hide-input"] vscode={"languageId": "r"} # to_test <- c( # 'ZERO', # 'MINIMUM', @@ -450,20 +450,28 @@ pred <- nafunctions(df, method) pred <- tibble::as_tibble(cbind(rownames(pred), pred)) names(pred) <- original_header pred +# - + +# Transform predictions to long format + # + vscode={"languageId": "r"} pred <- reshape2::melt(pred, id.vars = feat_name) names(pred) <- c(feat_name, 'Sample ID', method) pred <- pred[reshape2::melt(is.na(df))['value'] == TRUE, ] pred +# - -# + vscode={"languageId": "r"} +# Check dimension of long format dataframe + +# + tags=["hide-input"] vscode={"languageId": "r"} dim(pred) +# - -# + vscode={"languageId": "r"} +# Save predictions to disk + +# + tags=["hide-input"] vscode={"languageId": "r"} fname = file.path(folder_experiment, 'preds', paste0('pred_all_', toupper(method), '.csv')) -fname - -# + vscode={"languageId": "r"} write_csv(pred, path = fname) +fname diff --git a/project/01_1_train_NAGuideR_methods.ipynb b/project/01_1_train_NAGuideR_methods.ipynb index 072e207c2..23fae4bd3 100644 --- a/project/01_1_train_NAGuideR_methods.ipynb +++ b/project/01_1_train_NAGuideR_methods.ipynb @@ -17,6 +17,9 @@ "execution_count": null, "id": "2e50ecba-a6ca-4a3a-bd45-e58752c168eb", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -77,6 +80,9 @@ "execution_count": null, "id": "f9c48bf7-d31c-4073-895b-e9cf920ff1d3", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -497,6 +503,9 @@ "execution_count": null, "id": "162c5f7f-08f0-44ef-abf5-f0805ab58bb4", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -551,7 +560,6 @@ "execution_count": null, "id": "690d47c2-5666-41f2-b13f-9215334f197c", "metadata": { - "lines_to_next_cell": 0, "tags": [], "vscode": { "languageId": "r" @@ -565,6 +573,14 @@ "pred" ] }, + { + "cell_type": "markdown", + "id": "9738530f", + "metadata": {}, + "source": [ + "Transform 
predictions to long format" + ] + }, { "cell_type": "code", "execution_count": null, @@ -583,11 +599,22 @@ "pred" ] }, + { + "cell_type": "markdown", + "id": "b745b3ea", + "metadata": {}, + "source": [ + "Check dimension of long format dataframe" + ] + }, { "cell_type": "code", "execution_count": null, "id": "ff5196d2-0ecf-49da-b7eb-4075b8a73707", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -597,11 +624,22 @@ "dim(pred)" ] }, + { + "cell_type": "markdown", + "id": "faeb1eb0", + "metadata": {}, + "source": [ + "Save predictions to disk" + ] + }, { "cell_type": "code", "execution_count": null, "id": "ffebedb9-02db-4a7f-a5f6-a54a2aa057fc", "metadata": { + "tags": [ + "hide-input" + ], "vscode": { "languageId": "r" } @@ -611,22 +649,9 @@ "fname = file.path(folder_experiment,\n", " 'preds',\n", " paste0('pred_all_', toupper(method), '.csv'))\n", + "write_csv(pred, path = fname)\n", "fname" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df1a114a-166d-4bcc-8c10-3ac69570f96c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "write_csv(pred, path = fname)" - ] } ], "metadata": { diff --git a/project/01_1_train_RSN.ipynb b/project/01_1_train_RSN.ipynb index 95b208ebf..fb5fc7b67 100644 --- a/project/01_1_train_RSN.ipynb +++ b/project/01_1_train_RSN.ipynb @@ -13,21 +13,25 @@ "execution_count": null, "id": "e38874f3", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "import logging\n", "\n", "import pandas as pd\n", + "from IPython.display import display\n", "\n", "import vaep\n", + "import vaep.imputation\n", "import vaep.model\n", "import vaep.models as models\n", - "import vaep.imputation\n", + "import vaep.nb\n", "from vaep.io import datasplits\n", "\n", - "import vaep.nb\n", "logger = vaep.logging.setup_logger(logging.getLogger('vaep'))\n", "logger.info(\"Median Imputation\")\n", "\n", @@ 
-38,7 +42,11 @@ "cell_type": "code", "execution_count": null, "id": "ca2c3fb3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -99,7 +107,11 @@ "cell_type": "code", "execution_count": null, "id": "36f708a2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -111,7 +123,10 @@ "execution_count": null, "id": "a4bb6bf2", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -131,7 +146,11 @@ "cell_type": "code", "execution_count": null, "id": "d3ded735", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -149,7 +168,11 @@ "cell_type": "code", "execution_count": null, "id": "92d787e1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -168,7 +191,11 @@ "cell_type": "code", "execution_count": null, "id": "dbd2df5b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -186,7 +213,11 @@ "cell_type": "code", "execution_count": null, "id": "b8e2b780", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "index_columns = list(data.train_X.index.names)\n", @@ -217,7 +248,11 @@ "cell_type": "code", "execution_count": null, "id": "3cd8cc67", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -239,7 +274,11 @@ "cell_type": "code", "execution_count": null, "id": "63a9a8c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "freq_feat = vaep.io.datasplits.load_freq(args.data)\n", @@ 
-266,7 +305,11 @@ "cell_type": "code", "execution_count": null, "id": "5855a725", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na = data.val_y.to_frame(name='observed')\n", @@ -277,7 +320,11 @@ "cell_type": "code", "execution_count": null, "id": "9e0ae839", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_fake_na = data.test_y.to_frame(name='observed')\n", @@ -297,7 +344,10 @@ "execution_count": null, "id": "e8b41aae", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -318,7 +368,11 @@ "cell_type": "code", "execution_count": null, "id": "8f5349d6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "imputed_shifted_normal = vaep.imputation.impute_shifted_normal(\n", @@ -335,7 +389,11 @@ "cell_type": "code", "execution_count": null, "id": "d32d445e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na[args.model] = imputed_shifted_normal\n", @@ -356,7 +414,10 @@ "execution_count": null, "id": "3198a37c", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -385,7 +446,11 @@ "cell_type": "code", "execution_count": null, "id": "df99da67", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax, _ = vaep.plotting.errors.plot_errors_binned(val_pred_fake_na)" @@ -395,7 +460,11 @@ "cell_type": "code", "execution_count": null, "id": "16637d79", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ax, _ = vaep.plotting.errors.plot_errors_binned(test_pred_fake_na)" @@ -425,7 +494,11 @@ "cell_type": "code", "execution_count": null, "id": "43d42650", - "metadata": {}, + "metadata": { + "tags": [ + 
"hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -444,7 +517,11 @@ "cell_type": "code", "execution_count": null, "id": "ed0498d0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')\n", @@ -467,7 +544,11 @@ "cell_type": "code", "execution_count": null, "id": "0ee61d53", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')\n", @@ -494,7 +575,11 @@ "cell_type": "code", "execution_count": null, "id": "9973b3ee", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "vaep.io.dump_json(d_metrics.metrics, args.out_metrics /\n", @@ -506,7 +591,11 @@ "cell_type": "code", "execution_count": null, "id": "6b2421c3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(\n", @@ -526,7 +615,11 @@ "cell_type": "code", "execution_count": null, "id": "39c41bcd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# val\n", @@ -551,7 +644,11 @@ "cell_type": "code", "execution_count": null, "id": "7f2f7404", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" 
@@ -561,7 +658,11 @@ "cell_type": "code", "execution_count": null, "id": "8ad37d15", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", diff --git a/project/01_1_train_RSN.py b/project/01_1_train_RSN.py index 73643f02a..c21769ac2 100644 --- a/project/01_1_train_RSN.py +++ b/project/01_1_train_RSN.py @@ -16,25 +16,26 @@ # %% [markdown] # # Imputation using random draws from shifted normal distribution -# %% +# %% tags=["hide-input"] import logging import pandas as pd +from IPython.display import display import vaep +import vaep.imputation import vaep.model import vaep.models as models -import vaep.imputation +import vaep.nb from vaep.io import datasplits -import vaep.nb logger = vaep.logging.setup_logger(logging.getLogger('vaep')) logger.info("Median Imputation") figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -66,11 +67,11 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = vaep.nb.args_from_dict(args) args @@ -78,26 +79,26 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -116,7 +117,7 @@ # %% [markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if 
args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -127,7 +128,7 @@ # ## Initialize Comparison # -# %% +# %% tags=["hide-input"] freq_feat = vaep.io.datasplits.load_freq(args.data) freq_feat.head() # training data @@ -137,18 +138,18 @@ # %% [markdown] # The validation simulated NA is used to by all models to evaluate training performance. -# %% +# %% tags=["hide-input"] val_pred_fake_na = data.val_y.to_frame(name='observed') val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na = data.test_y.to_frame(name='observed') test_pred_fake_na.describe() # %% [markdown] # ## Data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -157,7 +158,7 @@ # %% [markdown] # ### Impute using shifted normal distribution -# %% +# %% tags=["hide-input"] imputed_shifted_normal = vaep.imputation.impute_shifted_normal( data.train_X, mean_shift=1.8, @@ -167,7 +168,7 @@ imputed_shifted_normal = imputed_shifted_normal.to_frame('intensity') imputed_shifted_normal -# %% +# %% tags=["hide-input"] val_pred_fake_na[args.model] = imputed_shifted_normal test_pred_fake_na[args.model] = imputed_shifted_normal val_pred_fake_na @@ -175,7 +176,7 @@ # %% [markdown] # Save predictions for NA -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: mask = data.train_X.isna().stack() idx_real_na = mask.index[mask] @@ -195,10 +196,10 @@ # # %% [markdown] # ### Plots # -# %% +# %% tags=["hide-input"] ax, _ = vaep.plotting.errors.plot_errors_binned(val_pred_fake_na) -# %% +# %% tags=["hide-input"] ax, _ = vaep.plotting.errors.plot_errors_binned(test_pred_fake_na) # %% [markdown] @@ -210,14 +211,14 @@ # # - all measured (identified, observed) peptides in validation data -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = models.Metrics() # %% [markdown] # The fake NA for the validation step are real test data (not 
used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na') added_metrics @@ -228,7 +229,7 @@ # explicitly to misssing before it was fed to the model for # reconstruction. -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') added_metrics @@ -238,12 +239,12 @@ # %% [markdown] # ### Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict( d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df @@ -251,7 +252,7 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # val fname = args.out_preds / f"pred_val_{args.model_key}.csv" setattr(args, fname.stem, fname.as_posix()) # add [] assignment? @@ -264,9 +265,9 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? 
-# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_train_VAE.ipynb b/project/01_1_train_VAE.ipynb index 8a8ae0c7a..be38aa642 100644 --- a/project/01_1_train_VAE.ipynb +++ b/project/01_1_train_VAE.ipynb @@ -13,7 +13,10 @@ "execution_count": null, "id": "18b5d571-2956-4112-b22c-43d6c2146b06", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -21,36 +24,29 @@ "import logging\n", "from functools import partial\n", "\n", + "import pandas as pd\n", + "import sklearn\n", + "import torch\n", + "from fastai import learner\n", "from fastai.basics import *\n", - "from fastai.learner import Learner\n", "from fastai.callback.all import *\n", "from fastai.callback.all import EarlyStoppingCallback\n", + "from fastai.learner import Learner\n", "from fastai.torch_basics import *\n", - "\n", - "import torch\n", - "\n", "from IPython.display import display\n", - "\n", - "from torch.nn import Sigmoid\n", - "\n", - "import pandas as pd\n", - "\n", - "import sklearn\n", - "from sklearn.preprocessing import StandardScaler\n", "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import StandardScaler\n", + "from torch.nn import Sigmoid\n", "\n", "import vaep\n", - "import vaep.nb\n", - "from vaep.io import datasplits\n", - "from vaep.models import ae\n", - "import vaep.models as models\n", "import vaep.model\n", + "import vaep.models as models\n", + "import vaep.nb\n", "from vaep.analyzers import analyzers\n", - "\n", - "\n", + "from vaep.io import datasplits\n", "# overwriting Recorder callback with custom plot_loss\n", - "from vaep.models import plot_loss\n", - "from fastai import learner\n", + "from vaep.models import ae, plot_loss\n", + "\n", "learner.Recorder.plot_loss = plot_loss\n", "\n", "\n", @@ -65,7 +61,11 @@ "cell_type": "code", "execution_count": null, "id": "5dbc8d89", - "metadata": {}, + 
"metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -133,7 +133,11 @@ "cell_type": "code", "execution_count": null, "id": "0746e70f-0259-48d5-90ef-25fe4b59f9ac", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -145,7 +149,10 @@ "execution_count": null, "id": "8083658b", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -172,7 +179,11 @@ "cell_type": "code", "execution_count": null, "id": "a19fe098-a029-4f71-b7fb-e652a9c16ac7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "TEMPLATE_MODEL_PARAMS = 'model_params_{}.json'" @@ -190,7 +201,11 @@ "cell_type": "code", "execution_count": null, "id": "6d9cc7bd-6b6f-40b9-8db7-c8228e4b03e3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = datasplits.DataSplits.from_folder(\n", @@ -209,7 +224,11 @@ "cell_type": "code", "execution_count": null, "id": "02bb6bf5-0eb1-4c73-9723-414b14eaf7c8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X.sample(5)" @@ -227,7 +246,11 @@ "cell_type": "code", "execution_count": null, "id": "44958473", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "index_columns = list(data.train_X.index.names)\n", @@ -258,7 +281,11 @@ "cell_type": "code", "execution_count": null, "id": "b5b945aa-9b4e-4487-8b09-dca289e64d9d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_rawfile_metadata:\n", @@ -286,7 +313,11 @@ "cell_type": "code", "execution_count": null, "id": "98f675b6-e619-45b6-8f04-b75237d212a7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], 
"source": [ "freq_feat = vaep.io.datasplits.load_freq(args.data)\n", @@ -313,7 +344,11 @@ "cell_type": "code", "execution_count": null, "id": "19eebaff-0e1e-4e44-ae40-12d2f0e75c74", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_simulated_na = data.val_y.to_frame(name='observed')\n", @@ -325,7 +360,10 @@ "execution_count": null, "id": "3797a539-84d9-430a-8d16-7cc0eebfe9f5", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -347,7 +385,11 @@ "cell_type": "code", "execution_count": null, "id": "cff8caf4-ccc9-4a36-a992-2cc596abe51a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.to_wide_format()\n", @@ -375,7 +417,11 @@ "cell_type": "code", "execution_count": null, "id": "7952fe13", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.train_X" @@ -385,7 +431,11 @@ "cell_type": "code", "execution_count": null, "id": "a0a0bcd9-22af-4dd9-af56-b041931ee918", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.val_y # potentially has less features" @@ -395,7 +445,11 @@ "cell_type": "code", "execution_count": null, "id": "9f0826f9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data.val_y = pd.DataFrame(pd.NA, index=data.train_X.index,\n", @@ -423,7 +477,11 @@ "cell_type": "code", "execution_count": null, "id": "e0d0d02f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "default_pipeline = sklearn.pipeline.Pipeline(\n", @@ -445,7 +503,11 @@ "cell_type": "code", "execution_count": null, "id": "43d49b4a-00ec-4874-8839-28a3cbc0e3b3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "\n", @@ -480,7 +542,11 @@ "cell_type": "code", 
"execution_count": null, "id": "cca0e4a4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "results = []\n", @@ -492,7 +558,10 @@ "execution_count": null, "id": "9366b06a", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -520,7 +589,11 @@ "cell_type": "code", "execution_count": null, "id": "f4b0aec2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# learn.summary()" @@ -530,7 +603,11 @@ "cell_type": "code", "execution_count": null, "id": "563a1e0a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "suggested_lr = analysis.learn.lr_find()\n", @@ -542,7 +619,11 @@ "cell_type": "code", "execution_count": null, "id": "468565f5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "results.clear() # reset results" @@ -560,7 +641,11 @@ "cell_type": "code", "execution_count": null, "id": "ec77d9e5-f619-4355-ab37-2bd44029236d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# needs class as argument, not instance, but serialization needs instance\n", @@ -581,7 +666,11 @@ "cell_type": "code", "execution_count": null, "id": "43d18ab6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=train\n", @@ -600,7 +689,11 @@ "cell_type": "code", "execution_count": null, "id": "079f9743-213c-422c-ba61-919c276fd710", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.epoch_trained = analysis.learn.epoch + 1\n", @@ -619,7 +712,11 @@ "cell_type": "code", "execution_count": null, "id": "3a4f91f9-c1a2-40c5-99d4-c289fb89cff8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "N_train_notna = 
data.train_X.notna().sum().sum()\n", @@ -642,7 +739,11 @@ "cell_type": "code", "execution_count": null, "id": "e73fb4dd-e73a-48df-82b7-b378fd3ee266", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "analysis.model.eval()\n", @@ -658,7 +759,10 @@ "execution_count": null, "id": "a9fc0e36", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -670,7 +774,11 @@ "cell_type": "code", "execution_count": null, "id": "b4d1fd73-9eb7-4f25-ad81-c42c6a840e77", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_simulated_na['VAE'] = pred # model_key?\n", @@ -690,7 +798,10 @@ "execution_count": null, "id": "c43e401f", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -717,12 +828,24 @@ "cell_type": "code", "execution_count": null, "id": "7999c89e-65fe-4c00-8e20-cb8ab88d1603", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "analysis.model = analysis.model.cpu()\n", + "# underlying data is train_X for both\n", + "# assert analysis.dls.valid.data.equals(analysis.dls.train.data)\n", + "# Reconstruct DataLoader for case that during training singleton batches were dropped\n", + "_dl = torch.utils.data.DataLoader(\n", + " vaep.io.datasets.DatasetWithTarget(\n", + " analysis.dls.valid.data),\n", + " batch_size=args.batch_size,\n", + " shuffle=False)\n", "df_latent = vaep.model.get_latent_space(analysis.model.get_mu_and_logvar,\n", - " dl=analysis.dls.valid,\n", + " dl=_dl,\n", " dl_index=analysis.dls.valid.data.index)\n", "df_latent" ] @@ -731,7 +854,11 @@ "cell_type": "code", "execution_count": null, "id": "1fdd8f86-639e-4e0e-bb89-466f3ba0ef7b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "ana_latent = 
analyzers.LatentAnalysis(df_latent,\n", @@ -747,7 +874,11 @@ "cell_type": "code", "execution_count": null, "id": "763a5633-a9dd-4785-a0c0-91f588346c22", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.meta_cat_col and df_meta is not None:\n", @@ -759,7 +890,11 @@ "cell_type": "code", "execution_count": null, "id": "d3fdd5cb-4038-489f-b4d8-54ec6ea913b5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_val = val_pred_simulated_na['observed'].groupby(level=-1).count()\n", @@ -771,7 +906,11 @@ "cell_type": "code", "execution_count": null, "id": "446adbaf-81db-4ac5-b041-064744143602", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_val.value_counts().sort_index().head() # require more than one feat?" @@ -781,7 +920,11 @@ "cell_type": "code", "execution_count": null, "id": "d408dbfa", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val = val_pred_simulated_na.drop('observed', axis=1).sub(\n", @@ -801,7 +944,11 @@ "cell_type": "code", "execution_count": null, "id": "7e505353-f19e-4961-9279-f1f0f1e4be09", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val = val_pred_simulated_na.drop('observed', axis=1).sub(\n", @@ -813,7 +960,11 @@ "cell_type": "code", "execution_count": null, "id": "95022f04-0e0d-47bf-8267-6135a936328f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "errors_val" @@ -845,7 +996,11 @@ "cell_type": "code", "execution_count": null, "id": "d825e38e-f3d6-4bca-b621-150267e7b7bc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -865,7 +1020,11 @@ "cell_type": "code", "execution_count": null, "id": "855a7a6f-93fd-4612-9d8d-96541a2441be", 
- "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_simulated_na, 'valid_simulated_na')\n", @@ -886,7 +1045,11 @@ "cell_type": "code", "execution_count": null, "id": "571ac8d4-bb5d-45db-bba8-59817e476304", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(test_pred_simulated_na, 'test_simulated_na')\n", @@ -905,7 +1068,11 @@ "cell_type": "code", "execution_count": null, "id": "87910434-7d07-4e8e-8380-c92fc515bd16", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "vaep.io.dump_json(d_metrics.metrics, args.out_metrics /\n", @@ -917,7 +1084,11 @@ "cell_type": "code", "execution_count": null, "id": "7d99deb9-9aad-4ba9-b79d-e4b3c6c7f023", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = models.get_df_from_nested_dict(\n", @@ -937,7 +1108,11 @@ "cell_type": "code", "execution_count": null, "id": "782636ac-c979-4f8b-9fc0-66fd0c7a3a8b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# save simulated missing values for both splits\n", @@ -957,7 +1132,11 @@ "cell_type": "code", "execution_count": null, "id": "06e433ec", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "figures # switch to fnames?" 
@@ -967,7 +1146,11 @@ "cell_type": "code", "execution_count": null, "id": "0f13cb38-abf0-4b56-9399-3d11d32f7fbc", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", diff --git a/project/01_1_train_VAE.py b/project/01_1_train_VAE.py index ac154dcbd..d68428410 100644 --- a/project/01_1_train_VAE.py +++ b/project/01_1_train_VAE.py @@ -16,41 +16,34 @@ # %% [markdown] # # Variational Autoencoder -# %% +# %% tags=["hide-input"] import logging from functools import partial +import pandas as pd +import sklearn +import torch +from fastai import learner from fastai.basics import * -from fastai.learner import Learner from fastai.callback.all import * from fastai.callback.all import EarlyStoppingCallback +from fastai.learner import Learner from fastai.torch_basics import * - -import torch - from IPython.display import display - -from torch.nn import Sigmoid - -import pandas as pd - -import sklearn -from sklearn.preprocessing import StandardScaler from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler +from torch.nn import Sigmoid import vaep -import vaep.nb -from vaep.io import datasplits -from vaep.models import ae -import vaep.models as models import vaep.model +import vaep.models as models +import vaep.nb from vaep.analyzers import analyzers - - +from vaep.io import datasplits # overwriting Recorder callback with custom plot_loss -from vaep.models import plot_loss -from fastai import learner +from vaep.models import ae, plot_loss + learner.Recorder.plot_loss = plot_loss @@ -61,7 +54,7 @@ figures = {} # collection of ax or figures -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -100,11 +93,11 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% tags=["hide-input"] args = 
vaep.nb.args_from_dict(args) if isinstance(args.hidden_layers, str): @@ -119,26 +112,26 @@ # %% [markdown] # Some naming conventions -# %% +# %% tags=["hide-input"] TEMPLATE_MODEL_PARAMS = 'model_params_{}.json' # %% [markdown] # ## Load data in long format -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format -# %% +# %% tags=["hide-input"] data.train_X.sample(5) # %% [markdown] # Infer index names from long format -# %% +# %% tags=["hide-input"] index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) if len(index_columns) == 1: @@ -157,7 +150,7 @@ # %% [markdown] # load meta data for splits -# %% +# %% tags=["hide-input"] if args.fn_rawfile_metadata: df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0) display(df_meta.loc[data.train_X.index.levels[0]]) @@ -173,7 +166,7 @@ # - Not used for predictions or early stopping. # - [x] add some additional NAs based on distribution of data -# %% +# %% tags=["hide-input"] freq_feat = vaep.io.datasplits.load_freq(args.data) freq_feat.head() # training data @@ -183,11 +176,11 @@ # %% [markdown] # The validation simulated NA is used to by all models to evaluate training performance. 
-# %% +# %% tags=["hide-input"] val_pred_simulated_na = data.val_y.to_frame(name='observed') val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na = data.test_y.to_frame(name='observed') test_pred_simulated_na.describe() @@ -197,7 +190,7 @@ # # - Autoencoder need data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() args.M = data.train_X.shape[-1] data.train_X.head() @@ -208,13 +201,13 @@ # %% [markdown] # ### Fill Validation data with potentially missing features -# %% +# %% tags=["hide-input"] data.train_X -# %% +# %% tags=["hide-input"] data.val_y # potentially has less features -# %% +# %% tags=["hide-input"] data.val_y = pd.DataFrame(pd.NA, index=data.train_X.index, columns=data.train_X.columns).fillna(data.val_y) data.val_y @@ -225,7 +218,7 @@ # %% [markdown] # ### Analysis: DataLoaders, Model, transform -# %% +# %% tags=["hide-input"] default_pipeline = sklearn.pipeline.Pipeline( [ ('normalize', StandardScaler()), @@ -235,7 +228,7 @@ # %% [markdown] # ### Analysis: DataLoaders, Model -# %% +# %% tags=["hide-input"] analysis = ae.AutoEncoderAnalysis( # datasplits=data, train_df=data.train_X, @@ -259,11 +252,11 @@ # # -# %% +# %% tags=["hide-input"] results = [] loss_fct = partial(models.vae.loss_fct, results=results) -# %% +# %% tags=["hide-input"] analysis.learn = Learner(dls=analysis.dls, model=analysis.model, loss_func=loss_fct, @@ -277,21 +270,21 @@ # [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in # current version. 
Try again later -# %% +# %% tags=["hide-input"] # learn.summary() -# %% +# %% tags=["hide-input"] suggested_lr = analysis.learn.lr_find() analysis.params['suggested_inital_lr'] = suggested_lr.valley suggested_lr -# %% +# %% tags=["hide-input"] results.clear() # reset results # %% [markdown] # dump model config -# %% +# %% tags=["hide-input"] # needs class as argument, not instance, but serialization needs instance analysis.params['last_decoder_activation'] = Sigmoid() @@ -305,21 +298,21 @@ # restore original value analysis.params['last_decoder_activation'] = Sigmoid -# %% +# %% tags=["hide-input"] # papermill_description=train analysis.learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley) # %% [markdown] # Save number of actually trained epochs -# %% +# %% tags=["hide-input"] args.epoch_trained = analysis.learn.epoch + 1 args.epoch_trained # %% [markdown] # #### Loss normalized by total number of measurements -# %% +# %% tags=["hide-input"] N_train_notna = data.train_X.notna().sum().sum() N_val_notna = data.val_y.notna().sum().sum() fig = models.plot_training_losses(analysis.learn, args.model_key, @@ -330,7 +323,7 @@ # ### Predictions # create predictions and select validation data predictions -# %% +# %% tags=["hide-input"] analysis.model.eval() pred, target = res = ae.get_preds_from_df(df=data.train_X, learn=analysis.learn, position_pred_tuple=0, @@ -338,19 +331,19 @@ pred = pred.stack() pred -# %% +# %% tags=["hide-input"] val_pred_simulated_na['VAE'] = pred # 'model_key' ? val_pred_simulated_na -# %% +# %% tags=["hide-input"] test_pred_simulated_na['VAE'] = pred # model_key? 
test_pred_simulated_na # %% [markdown] # save missing values predictions -# %% +# %% tags=["hide-input"] if args.save_pred_real_na: pred_real_na = ae.get_missing_values(df_train_wide=data.train_X, val_idx=val_pred_simulated_na.index, @@ -365,14 +358,22 @@ # # - validation data -# %% +# %% tags=["hide-input"] analysis.model = analysis.model.cpu() +# underlying data is train_X for both +# assert analysis.dls.valid.data.equals(analysis.dls.train.data) +# Reconstruct DataLoader for case that during training singleton batches were dropped +_dl = torch.utils.data.DataLoader( + vaep.io.datasets.DatasetWithTarget( + analysis.dls.valid.data), + batch_size=args.batch_size, + shuffle=False) df_latent = vaep.model.get_latent_space(analysis.model.get_mu_and_logvar, - dl=analysis.dls.valid, + dl=_dl, dl_index=analysis.dls.valid.data.index) df_latent -# %% +# %% tags=["hide-input"] ana_latent = analyzers.LatentAnalysis(df_latent, df_meta, args.model_key, @@ -381,20 +382,20 @@ figures[f'latent_{args.model_key}_by_date'], ax = ana_latent.plot_by_date( args.meta_date_col) -# %% +# %% tags=["hide-input"] if args.meta_cat_col and df_meta is not None: figures[f'latent_{args.model_key}_by_{"_".join(args.meta_cat_col.split())}'], ax = ana_latent.plot_by_category( args.meta_cat_col) -# %% +# %% tags=["hide-input"] feat_freq_val = val_pred_simulated_na['observed'].groupby(level=-1).count() feat_freq_val.name = 'freq_val' ax = feat_freq_val.plot.box() -# %% +# %% tags=["hide-input"] feat_freq_val.value_counts().sort_index().head() # require more than one feat? 
-# %% +# %% tags=["hide-input"] errors_val = val_pred_simulated_na.drop('observed', axis=1).sub( val_pred_simulated_na['observed'], axis=0) errors_val = errors_val.abs().groupby(level=-1).mean() @@ -407,12 +408,12 @@ ax = errors_val_smoothed.plot(x='freq', figsize=(15, 10)) # errors_val_smoothed -# %% +# %% tags=["hide-input"] errors_val = val_pred_simulated_na.drop('observed', axis=1).sub( val_pred_simulated_na['observed'], axis=0) errors_val.abs().groupby(level=-1).agg(['mean', 'count']) -# %% +# %% tags=["hide-input"] errors_val # %% [markdown] @@ -427,7 +428,7 @@ # # - all measured (identified, observed) peptides in validation data -# %% +# %% tags=["hide-input"] # papermill_description=metrics # d_metrics = models.Metrics(no_na_key='NA interpolated', with_na_key='NA not interpolated') d_metrics = models.Metrics() @@ -435,7 +436,7 @@ # %% [markdown] # The simulated NA for the validation step are real test data (not used for training nor early stopping) -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_simulated_na, 'valid_simulated_na') added_metrics @@ -444,19 +445,19 @@ # -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_simulated_na, 'test_simulated_na') added_metrics # %% [markdown] # Save all metrics as json -# %% +# %% tags=["hide-input"] vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json') d_metrics -# %% +# %% tags=["hide-input"] metrics_df = models.get_df_from_nested_dict( d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df @@ -464,7 +465,7 @@ # %% [markdown] # ## Save predictions -# %% +# %% tags=["hide-input"] # save simulated missing values for both splits val_pred_simulated_na.to_csv(args.out_preds / f"pred_val_{args.model_key}.csv") test_pred_simulated_na.to_csv(args.out_preds / f"pred_test_{args.model_key}.csv") @@ -472,9 +473,9 @@ # %% [markdown] # ## Config -# %% +# %% tags=["hide-input"] figures # switch to fnames? 
-# %% +# %% tags=["hide-input"] args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_transfer_NAGuideR_pred.ipynb b/project/01_1_transfer_NAGuideR_pred.ipynb index a985a61c1..a4fbd5e14 100644 --- a/project/01_1_transfer_NAGuideR_pred.ipynb +++ b/project/01_1_transfer_NAGuideR_pred.ipynb @@ -5,25 +5,30 @@ "id": "a75efcbe-2ae0-4609-872a-759fb5c80af1", "metadata": {}, "source": [ - "# Transfer predictions from NAGuideR" + "# Transfer predictions from NAGuideR\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f29b93d1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "from pathlib import Path\n", "import logging\n", - "import pandas as pd\n", + "from pathlib import Path\n", + "\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", "\n", "import vaep\n", "import vaep.models\n", - "from vaep.io import datasplits\n", "import vaep.pandas\n", + "from vaep.io import datasplits\n", "\n", "vaep.plotting.make_large_descriptors(5)\n", "\n", @@ -34,7 +39,11 @@ "cell_type": "code", "execution_count": null, "id": "cbf23f02", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -84,7 +93,11 @@ "cell_type": "code", "execution_count": null, "id": "3d5c476b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.get_params(args, globals=globals())\n", @@ -96,7 +109,11 @@ "cell_type": "code", "execution_count": null, "id": "ba3513a7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out = {}" @@ -115,7 +132,10 @@ "execution_count": null, "id": "75341d2b", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -135,7 +155,11 @@ "cell_type": "code", "execution_count": null, "id": "723eacd2", - 
"metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na = data.val_y.to_frame(name='observed')\n", @@ -146,7 +170,11 @@ "cell_type": "code", "execution_count": null, "id": "514d193f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "test_pred_fake_na = data.test_y.to_frame(name='observed')\n", @@ -157,7 +185,11 @@ "cell_type": "code", "execution_count": null, "id": "204838b7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# Find and load prediction files, filter for validation and test data" @@ -167,7 +199,11 @@ "cell_type": "code", "execution_count": null, "id": "c06dcd14", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.dumps is not None:\n", @@ -183,7 +219,10 @@ "execution_count": null, "id": "4a9e66bc", "metadata": { - "lines_to_next_cell": 0 + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -226,7 +265,11 @@ "cell_type": "code", "execution_count": null, "id": "72adc8ec", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "val_pred_fake_na" @@ -244,7 +287,11 @@ "cell_type": "code", "execution_count": null, "id": "d7c5dab0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# papermill_description=metrics\n", @@ -255,7 +302,11 @@ "cell_type": "code", "execution_count": null, "id": "773dcbbe", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = d_metrics.add_metrics(val_pred_fake_na.dropna(how='all', axis=1), 'valid_fake_na')\n", @@ -274,7 +325,11 @@ "cell_type": "code", "execution_count": null, "id": "065e1e62", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "added_metrics = 
d_metrics.add_metrics(test_pred_fake_na.dropna(how='all', axis=1), 'test_fake_na')\n", @@ -285,7 +340,11 @@ "cell_type": "code", "execution_count": null, "id": "37e8e515", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "metrics_df = vaep.models.get_df_from_nested_dict(\n", @@ -297,7 +356,11 @@ "cell_type": "code", "execution_count": null, "id": "f1f0d12c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "order_methods = metrics_df.loc[pd.IndexSlice[:,\n", @@ -309,7 +372,11 @@ "cell_type": "code", "execution_count": null, "id": "a0f8ce7c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "top_5 = ['observed', *order_methods.droplevel(-1).index[:6]]\n", @@ -320,7 +387,11 @@ "cell_type": "code", "execution_count": null, "id": "7e041594", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(8, 2))\n", @@ -337,7 +408,11 @@ "cell_type": "code", "execution_count": null, "id": "77b1b792", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out" diff --git a/project/01_1_transfer_NAGuideR_pred.py b/project/01_1_transfer_NAGuideR_pred.py index ed152c987..bddf8f604 100644 --- a/project/01_1_transfer_NAGuideR_pred.py +++ b/project/01_1_transfer_NAGuideR_pred.py @@ -15,23 +15,25 @@ # %% [markdown] # # Transfer predictions from NAGuideR +# -# %% -from pathlib import Path +# %% tags=["hide-input"] import logging -import pandas as pd +from pathlib import Path + import matplotlib.pyplot as plt +import pandas as pd import vaep import vaep.models -from vaep.io import datasplits import vaep.pandas +from vaep.io import datasplits vaep.plotting.make_large_descriptors(5) logger = vaep.logging.setup_logger(logging.getLogger('vaep')) -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = 
dict(globals()).keys() @@ -52,18 +54,18 @@ # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args = vaep.nb.args_from_dict(args) args -# %% +# %% tags=["hide-input"] files_out = {} # %% [markdown] # load data splits -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) @@ -71,18 +73,18 @@ # %% [markdown] # Validation and test data split of simulated missing values -# %% +# %% tags=["hide-input"] val_pred_fake_na = data.val_y.to_frame(name='observed') val_pred_fake_na -# %% +# %% tags=["hide-input"] test_pred_fake_na = data.test_y.to_frame(name='observed') test_pred_fake_na.describe() -# %% +# %% tags=["hide-input"] # Find and load prediction files, filter for validation and test data -# %% +# %% tags=["hide-input"] if args.dumps is not None: entire_pred = [Path(s) for s in args.dumps.split(',')] else: @@ -90,7 +92,7 @@ if '_all_' in str(file)) entire_pred -# %% +# %% tags=["hide-input"] mask = data.train_X.unstack().isna().stack() idx_real_na = mask.index[mask] idx_real_na = (idx_real_na @@ -124,42 +126,42 @@ logger.info(f"Save {fname = }") # del pred -# %% +# %% tags=["hide-input"] val_pred_fake_na # %% [markdown] # Metrics for simulated missing values (NA) -# %% +# %% tags=["hide-input"] # papermill_description=metrics d_metrics = vaep.models.Metrics() -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(val_pred_fake_na.dropna(how='all', axis=1), 'valid_fake_na') pd.DataFrame(added_metrics) # %% [markdown] # ## Test Datasplit -# %% +# %% tags=["hide-input"] added_metrics = d_metrics.add_metrics(test_pred_fake_na.dropna(how='all', axis=1), 'test_fake_na') pd.DataFrame(added_metrics) -# %% +# %% tags=["hide-input"] metrics_df = vaep.models.get_df_from_nested_dict( d_metrics.metrics, column_levels=['model', 'metric_name']).T metrics_df -# %% +# %% tags=["hide-input"] order_methods = metrics_df.loc[pd.IndexSlice[:, 'MAE'], 
'valid_fake_na'].sort_values() order_methods -# %% +# %% tags=["hide-input"] top_5 = ['observed', *order_methods.droplevel(-1).index[:6]] top_5 -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=(8, 2)) ax, errors_bind = vaep.plotting.errors.plot_errors_binned( val_pred_fake_na[top_5], @@ -169,5 +171,5 @@ files_out[fname.name] = fname.as_posix() vaep.savefig(ax.get_figure(), fname) -# %% +# %% tags=["hide-input"] files_out diff --git a/project/01_2_performance_plots.ipynb b/project/01_2_performance_plots.ipynb index 263026456..3d48f53a9 100644 --- a/project/01_2_performance_plots.ipynb +++ b/project/01_2_performance_plots.ipynb @@ -23,7 +23,9 @@ "execution_count": null, "id": "a1e5f978-a0cb-4bb6-98d1-467eda257165", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -82,7 +84,9 @@ "execution_count": null, "id": "67f5161a", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -139,7 +143,9 @@ "execution_count": null, "id": "ec1509e8-6908-43c3-8909-efbb0229c324", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -152,7 +158,9 @@ "execution_count": null, "id": "19b33594", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -165,7 +173,9 @@ "execution_count": null, "id": "59081f60", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -179,7 +189,9 @@ "id": "c3e124fb", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -199,7 +211,9 @@ "execution_count": null, "id": "747d5e4a", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -214,7 +228,9 @@ "execution_count": null, "id": "a4ba2a48-dedc-47a9-b2ea-79936dfc48ef", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -227,7 +243,9 @@ "execution_count": null, "id": "611a8edf", 
"metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -261,7 +279,9 @@ "execution_count": null, "id": "2d043b40-5c74-40cc-a5cf-8d22ac5538a8", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -276,7 +296,9 @@ "execution_count": null, "id": "d8f8c3f4-9896-4f0e-8f93-780f90b22573", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -299,7 +321,9 @@ "execution_count": null, "id": "9a94ad00-78fd-4541-be5d-68391af99bd5", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -320,7 +344,9 @@ "execution_count": null, "id": "526626c0-98c7-4741-abae-b6fc8c218f23", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -341,7 +367,9 @@ "execution_count": null, "id": "f3e738bd-79e9-4714-af4d-f3d0d2893353", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -365,20 +393,22 @@ "execution_count": null, "id": "91bc1e12-8477-4eda-a4c2-1f132e468616", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "# model_key could be used as key from config file\n", - "? load only specified configs?\n", - "? case: no config file available?\n", + "# ? load only specified configs?\n", + "# ? 
case: no config file available?\n", "all_configs = collect(\n", " paths=(fname for fname in args.out_models.iterdir()\n", " if fname.suffix == '.yaml'\n", " and 'model_config' in fname.name),\n", " load_fn=load_config_file\n", ")\n", - "model_configs = pd.DataFrame(all_configs).set_index('model')\n", + "model_configs = pd.DataFrame(all_configs).set_index('id')\n", "model_configs.T.to_excel(writer, sheet_name='model_params')\n", "model_configs.T" ] @@ -396,7 +426,9 @@ "execution_count": null, "id": "af8c112f-fb4f-4dcd-b729-9c9558715d88", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -428,7 +460,9 @@ "execution_count": null, "id": "4efc3fe6", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -456,7 +490,9 @@ "execution_count": null, "id": "d5196bcc", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -482,7 +518,9 @@ "execution_count": null, "id": "e94d9dd6-d97d-4e1c-b877-48dc1ae9c7c7", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -499,7 +537,11 @@ "cell_type": "code", "execution_count": null, "id": "f5865679", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "pred_val = pred_val[[TARGET_COL] + ORDER_MODELS]\n", @@ -516,7 +558,9 @@ "execution_count": null, "id": "4d6417fc", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -543,7 +587,9 @@ "execution_count": null, "id": "36e078fb-2268-41dd-a069-4ca3dc5ca6cf", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -557,7 +603,9 @@ "id": "a2440887-b5f2-45a1-90cd-d15ef9bfa0a7", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -580,13 +628,15 @@ "execution_count": null, "id": "cea24eb1", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], 
"source": [ "corr_per_sample_val = (pred_val\n", " .groupby(sample_index_name)\n", - " .aggregate(\n", + " .apply(\n", " lambda df: df.corr().loc[TARGET_COL]\n", " )[ORDER_MODELS])\n", "\n", @@ -607,7 +657,8 @@ "dumps[fname.stem] = fname\n", "with pd.ExcelWriter(fname) as w:\n", " corr_per_sample_val.describe().to_excel(w, sheet_name='summary')\n", - " corr_per_sample_val.to_excel(w, sheet_name='correlations')" + " corr_per_sample_val.to_excel(w, sheet_name='correlations')\n", + " corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')" ] }, { @@ -623,7 +674,9 @@ "execution_count": null, "id": "4068d91f-856e-4aa6-9c62-5f1f77a77c4c", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -647,7 +700,9 @@ "execution_count": null, "id": "52298acd-73c5-4574-b7fe-8fb6544708cf", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -662,7 +717,9 @@ "id": "570fc505-ab27-4710-b4c2-adbe72b33898", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -678,7 +735,9 @@ "execution_count": null, "id": "ddc98a9f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -691,7 +750,9 @@ "id": "af4f0e81-e9af-4763-908d-f7bdf4a4fed7", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -714,7 +775,9 @@ "execution_count": null, "id": "df6923c5-e6f7-4a14-aa8e-d55bf66cf817", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -740,7 +803,10 @@ "execution_count": null, "id": "6f6ffdd5", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -759,7 +825,9 @@ "execution_count": null, "id": "6122a309-5435-44d2-a6f8-8e9d46b5afae", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ 
-782,7 +850,9 @@ "execution_count": null, "id": "1dc848c6-d39e-4092-9b72-3f6a0e1949e2", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -814,7 +884,9 @@ "execution_count": null, "id": "8bce941c", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -830,7 +902,9 @@ "execution_count": null, "id": "ff722dae", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -842,7 +916,9 @@ "execution_count": null, "id": "629eddae", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -862,7 +938,9 @@ "execution_count": null, "id": "f639cd92", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -885,7 +963,9 @@ "execution_count": null, "id": "99f7951f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -926,7 +1006,11 @@ "cell_type": "code", "execution_count": null, "id": "843a917f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "counts_per_bin = vaep.pandas.get_counts_per_bin(df=pred_test,\n", @@ -950,13 +1034,15 @@ "execution_count": null, "id": "ee088a12-ee60-45d1-bf5a-e07b76413c56", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "corr_per_sample_test = (pred_test\n", " .groupby(sample_index_name)\n", - " .aggregate(lambda df: df.corr().loc[TARGET_COL])\n", + " .apply(lambda df: df.corr().loc[TARGET_COL])\n", " [ORDER_MODELS])\n", "corr_per_sample_test = corr_per_sample_test.join(\n", " pred_test\n", @@ -973,7 +1059,9 @@ "execution_count": null, "id": "825efac2", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -995,7 +1083,8 @@ "dumps[fname.stem] = fname.with_suffix('.xlsx')\n", "with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:\n", " corr_per_sample_test.describe().to_excel(w, sheet_name='summary')\n", - " 
corr_per_sample_test.to_excel(w, sheet_name='correlations')" + " corr_per_sample_test.to_excel(w, sheet_name='correlations')\n", + " corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')" ] }, { @@ -1011,7 +1100,9 @@ "execution_count": null, "id": "77b846e1-00b8-4f61-b5cd-cdc1692787de", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1027,7 +1118,9 @@ "execution_count": null, "id": "7bff3764-5063-4399-a182-3ba795fbe99d", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1042,11 +1135,13 @@ "execution_count": null, "id": "c6145bd0-9b59-490e-9a0e-89475c18663b", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "options = random.sample(set(feature_names), 1)\n", + "options = random.sample(sorted(set(feature_names)), 1)\n", "pred_test.loc[pd.IndexSlice[:, options[0]], :]" ] }, @@ -1063,11 +1158,13 @@ "execution_count": null, "id": "6ee92128-4f78-45e9-a607-8e6c4163181a", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "corr_per_feat_test = pred_test.groupby(FEAT_NAME).aggregate(\n", + "corr_per_feat_test = pred_test.groupby(FEAT_NAME).apply(\n", " lambda df: df.corr().loc[TARGET_COL])[ORDER_MODELS]\n", "corr_per_feat_test = corr_per_feat_test.join(pred_test.groupby(FEAT_NAME)[\n", " TARGET_COL].count().rename('n_obs'))\n", @@ -1081,7 +1178,9 @@ "execution_count": null, "id": "8e45b324-eaa0-43e4-b28b-b0f839f91955", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1093,7 +1192,9 @@ "execution_count": null, "id": "4c9a9ecc-526a-41ac-8a4d-d3a389ea6c07", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1114,7 +1215,8 @@ "with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:\n", " corr_per_feat_test.loc[~too_few_obs].describe().to_excel(\n", " w, sheet_name='summary')\n", - " 
corr_per_feat_test.to_excel(w, sheet_name='correlations')" + " corr_per_feat_test.to_excel(w, sheet_name='correlations')\n", + " corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')" ] }, { @@ -1122,7 +1224,9 @@ "execution_count": null, "id": "b38ffdfc-b1b0-4ae0-a47d-5881c534881f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1136,7 +1240,9 @@ "execution_count": null, "id": "9993d145-8b78-4769-838a-01721900a3c7", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1177,7 +1283,9 @@ "execution_count": null, "id": "829ebc82-587d-47c6-8422-03c610855211", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1193,7 +1301,9 @@ "execution_count": null, "id": "f8269d00-9048-4e70-9f39-dab95e103c32", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1206,7 +1316,9 @@ "execution_count": null, "id": "096083d1-bcd2-44a2-94fe-a89b7d204b66", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1221,7 +1333,9 @@ "id": "05a259ef-48bd-4dd0-8dfe-9e2750579383", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1243,7 +1357,9 @@ "execution_count": null, "id": "d3dd53c0-4068-4eac-a5c3-7aaa608e5f8f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1269,7 +1385,9 @@ "id": "ef92551d", "metadata": { "lines_to_next_cell": 2, - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1295,7 +1413,9 @@ "execution_count": null, "id": "588f7bf3", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1328,7 +1448,10 @@ "execution_count": null, "id": "e1455bcc", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": 
[ @@ -1347,7 +1470,9 @@ "execution_count": null, "id": "b13ecd37", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1372,7 +1497,10 @@ "execution_count": null, "id": "712faf9a", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1434,7 +1562,10 @@ "execution_count": null, "id": "2a578570", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1493,7 +1624,9 @@ "execution_count": null, "id": "3339df97-230f-4cbd-b61d-7aef9a7495e8", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1517,7 +1650,9 @@ "execution_count": null, "id": "095f64eb-1c4f-47ae-9a01-d5b05a795779", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1541,7 +1676,9 @@ "execution_count": null, "id": "c8f67ae1-40e9-4c2a-af0a-41e627703518", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -1553,11 +1690,14 @@ "execution_count": null, "id": "b08b442f", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "dumps" + "dumps\n", + "print(\"done\")" ] } ], diff --git a/project/01_2_performance_plots.py b/project/01_2_performance_plots.py index 80f3f7c02..fd175a7f1 100644 --- a/project/01_2_performance_plots.py +++ b/project/01_2_performance_plots.py @@ -26,7 +26,7 @@ # - as for validation data # - top N based on validation data -# %% +# %% tags=["hide-input"] import logging import random from pathlib import Path @@ -77,7 +77,7 @@ def build_text(s): return ret -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() @@ -104,19 +104,19 @@ def build_text(s): # %% [markdown] # Some argument transformations -# %% +# %% tags=["hide-input"] args = vaep.nb.get_params(args, globals=globals()) args -# %% +# %% 
tags=["hide-input"] args = vaep.nb.args_from_dict(args) args -# %% +# %% tags=["hide-input"] figures = {} dumps = {} -# %% +# %% tags=["hide-input"] TARGET_COL = 'observed' METRIC = 'MAE' MIN_FREQ = None @@ -128,17 +128,17 @@ def build_text(s): SEL_MODELS = args.sel_models.split(',') -# %% +# %% tags=["hide-input"] # list(sns.color_palette().as_hex()) # string representation of colors if args.plot_to_n > 10: logger.warning("Set maximum of models to 10 (maximum)") args.overwrite_entry('plot_to_n', 10) -# %% +# %% tags=["hide-input"] data = datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) -# %% +# %% tags=["hide-input"] fig, axes = plt.subplots(1, 2, sharey=True, sharex=True) vaep.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0], @@ -158,13 +158,13 @@ def build_text(s): # %% [markdown] # ## data completeness across entire data -# %% +# %% tags=["hide-input"] # load frequency of training features... # needs to be pickle -> index.name needed freq_feat = vaep.io.datasplits.load_freq(args.data, file='freq_features.json') freq_feat.head() # training data -# %% +# %% tags=["hide-input"] prop = freq_feat / len(data.train_X.index.levels[0]) prop.sort_values().to_frame().plot( xlabel=f'{data.val_y.index.names[-1]}', @@ -173,21 +173,21 @@ def build_text(s): # %% [markdown] # View training data in wide format -# %% +# %% tags=["hide-input"] data.to_wide_format() data.train_X # %% [markdown] # Number of samples and features: -# %% +# %% tags=["hide-input"] N_SAMPLES, M_FEAT = data.train_X.shape print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}") # %% [markdown] # Collect outputs in excel file: -# %% +# %% tags=["hide-input"] fname = args.folder_experiment / '01_2_performance_summary.xlsx' dumps[fname.stem] = fname writer = pd.ExcelWriter(fname) @@ -197,24 +197,24 @@ def build_text(s): # ## Model specifications # - used for bar plot annotations -# %% +# %% tags=["hide-input"] # model_key could be used as key from config file -# ? 
load only specified configs? -# ? case: no config file available? +# # ? load only specified configs? +# # ? case: no config file available? all_configs = collect( paths=(fname for fname in args.out_models.iterdir() if fname.suffix == '.yaml' and 'model_config' in fname.name), load_fn=load_config_file ) -model_configs = pd.DataFrame(all_configs).set_index('model') +model_configs = pd.DataFrame(all_configs).set_index('id') model_configs.T.to_excel(writer, sheet_name='model_params') model_configs.T # %% [markdown] # Set Feature name (columns are features, rows are samples) -# %% +# %% tags=["hide-input"] # index name freq_feat.index.name = data.train_X.columns.name # sample index name @@ -228,7 +228,7 @@ def build_text(s): # ## Validation data # - set top N models to plot based on validation data split -# %% +# %% tags=["hide-input"] pred_val = compare_predictions.load_split_prediction_by_modelkey( experiment_folder=args.folder_experiment, split='val', @@ -242,7 +242,7 @@ def build_text(s): # %% [markdown] # Describe absolute error -# %% +# %% tags=["hide-input"] errors_val = (pred_val .drop(TARGET_COL, axis=1) .sub(pred_val[TARGET_COL], axis=0) @@ -251,7 +251,7 @@ def build_text(s): # %% [markdown] # ### Select top N for plotting and set colors -# %% +# %% tags=["hide-input"] ORDER_MODELS = (errors_val .abs() .mean() @@ -260,7 +260,7 @@ def build_text(s): .to_list()) ORDER_MODELS -# %% +# %% tags=["hide-input"] pred_val = pred_val[[TARGET_COL] + ORDER_MODELS] if args.save_agg_pred: fname = args.folder_experiment / '01_2_agg_pred_val.csv' @@ -269,7 +269,7 @@ def build_text(s): logger.info(f"Saved aggregated predictions to: {fname}") pred_val -# %% +# %% tags=["hide-input"] mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS] mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f') mae_stats_ordered_val.T @@ -282,11 +282,11 @@ def build_text(s): # > 1. The order of "new" models is important for the color assignment. 
# > 2. User defined model keys for the same model with two configuration will yield different colors. -# %% +# %% tags=["hide-input"] COLORS_TO_USE = vaep.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS)) vaep.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE) -# %% +# %% tags=["hide-input"] TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n] TOP_N_COLOR_PALETTE = {model: color for model, color in zip(TOP_N_ORDER, COLORS_TO_USE)} @@ -296,10 +296,10 @@ def build_text(s): # %% [markdown] # ### Correlation per sample -# %% +# %% tags=["hide-input"] corr_per_sample_val = (pred_val .groupby(sample_index_name) - .aggregate( + .apply( lambda df: df.corr().loc[TARGET_COL] )[ORDER_MODELS]) @@ -321,11 +321,12 @@ def build_text(s): with pd.ExcelWriter(fname) as w: corr_per_sample_val.describe().to_excel(w, sheet_name='summary') corr_per_sample_val.to_excel(w, sheet_name='correlations') + corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted') # %% [markdown] # identify samples which are below lower whisker for models -# %% +# %% tags=["hide-input"] treshold = vaep.pandas.get_lower_whiskers( corr_per_sample_val[TOP_N_ORDER]).min() mask = (corr_per_sample_val[TOP_N_ORDER] < treshold).any(axis=1) @@ -335,12 +336,12 @@ def build_text(s): # %% [markdown] # ### Error plot -# %% +# %% tags=["hide-input"] c_error_min = 4.5 mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1) errors_val.loc[mask].sort_index(level=1).head() -# %% +# %% tags=["hide-input"] errors_val = errors_val.abs().groupby( freq_feat.index.name).mean() # absolute error errors_val = errors_val.join(freq_feat) @@ -348,10 +349,10 @@ def build_text(s): errors_val.head() -# %% +# %% tags=["hide-input"] errors_val.describe()[ORDER_MODELS].T # mean of means -# %% +# %% tags=["hide-input"] c_avg_error = 2 mask = (errors_val[TOP_N_ORDER] >= c_avg_error).any(axis=1) errors_val.loc[mask] @@ -361,7 +362,7 @@ def build_text(s): # ### Error by non-decimal number of 
intensity # - number of observations in parentheses. -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=(8, 3)) ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( pred_val[ @@ -378,7 +379,7 @@ def build_text(s): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) -# %% +# %% tags=["hide-input"] # # ! only used for reporting plotted = vaep.plotting.errors.get_data_for_errors_by_median( errors=errors_binned, @@ -389,7 +390,7 @@ def build_text(s): plotted -# %% +# %% tags=["hide-input"] errors_binned.head() dumps[fname.stem] = fname.with_suffix('.csv') errors_binned.to_csv(fname.with_suffix('.csv')) @@ -398,7 +399,7 @@ def build_text(s): # %% [markdown] # ## test data -# %% +# %% tags=["hide-input"] pred_test = compare_predictions.load_split_prediction_by_modelkey( experiment_folder=args.folder_experiment, split='test', @@ -416,17 +417,17 @@ def build_text(s): # %% [markdown] # Write averages for all models to excel (from before?) -# %% +# %% tags=["hide-input"] errors_test_mae = vaep.pandas.calc_errors.get_absolute_error( pred_test ) mae_stats_ordered_test = errors_test_mae.describe()[ORDER_MODELS] mae_stats_ordered_test -# %% +# %% tags=["hide-input"] mae_stats_ordered_test.to_excel(writer, sheet_name='mae_stats_ordered_test', float_format='%.5f') -# %% +# %% tags=["hide-input"] cp_mean_perf = pd.concat([ mae_stats_ordered_val.loc['mean'], mae_stats_ordered_test.loc['mean'], @@ -437,13 +438,13 @@ def build_text(s): cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f') cp_mean_perf -# %% +# %% tags=["hide-input"] writer.close() # %% [markdown] # ### Intensity distribution as histogram # Plot top 4 models predictions for intensities in test data -# %% +# %% tags=["hide-input"] min_max = vaep.plotting.data.min_max(pred_test[TARGET_COL]) top_n = 4 fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True) @@ -476,7 +477,7 @@ def build_text(s): figures[fname.stem] = fname vaep.savefig(fig, 
name=fname) -# %% +# %% tags=["hide-input"] counts_per_bin = vaep.pandas.get_counts_per_bin(df=pred_test, bins=bins, columns=[TARGET_COL, *ORDER_MODELS[:top_n]]) @@ -487,10 +488,10 @@ def build_text(s): # %% [markdown] # ### Correlation per sample -# %% +# %% tags=["hide-input"] corr_per_sample_test = (pred_test .groupby(sample_index_name) - .aggregate(lambda df: df.corr().loc[TARGET_COL]) + .apply(lambda df: df.corr().loc[TARGET_COL]) [ORDER_MODELS]) corr_per_sample_test = corr_per_sample_test.join( pred_test @@ -501,7 +502,7 @@ def build_text(s): too_few_obs = corr_per_sample_test['n_obs'] < 3 corr_per_sample_test.loc[~too_few_obs].describe() -# %% +# %% tags=["hide-input"] # # ! add minimum kwargs = dict(ylim=(0.7, 1), rot=90, flierprops=dict(markersize=3), @@ -521,32 +522,33 @@ def build_text(s): with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w: corr_per_sample_test.describe().to_excel(w, sheet_name='summary') corr_per_sample_test.to_excel(w, sheet_name='correlations') + corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted') # %% [markdown] # identify samples which are below lower whisker for models -# %% +# %% tags=["hide-input"] treshold = vaep.pandas.get_lower_whiskers( corr_per_sample_test[TOP_N_ORDER]).min() mask = (corr_per_sample_test[TOP_N_ORDER] < treshold).any(axis=1) corr_per_sample_test.loc[mask].style.highlight_min( axis=1) if mask.sum() else 'Nothing to display' -# %% +# %% tags=["hide-input"] feature_names = pred_test.index.levels[-1] N_SAMPLES = pred_test.index M = len(feature_names) pred_test.loc[pd.IndexSlice[:, feature_names[random.randint(0, M - 1)]], :] -# %% -options = random.sample(set(feature_names), 1) +# %% tags=["hide-input"] +options = random.sample(sorted(set(feature_names)), 1) pred_test.loc[pd.IndexSlice[:, options[0]], :] # %% [markdown] # ### Correlation per feature -# %% -corr_per_feat_test = pred_test.groupby(FEAT_NAME).aggregate( +# %% tags=["hide-input"] +corr_per_feat_test = 
pred_test.groupby(FEAT_NAME).apply( lambda df: df.corr().loc[TARGET_COL])[ORDER_MODELS] corr_per_feat_test = corr_per_feat_test.join(pred_test.groupby(FEAT_NAME)[ TARGET_COL].count().rename('n_obs')) @@ -554,10 +556,10 @@ def build_text(s): too_few_obs = corr_per_feat_test['n_obs'] < 3 corr_per_feat_test.loc[~too_few_obs].describe() -# %% +# %% tags=["hide-input"] corr_per_feat_test.loc[too_few_obs].dropna(thresh=3, axis=0) -# %% +# %% tags=["hide-input"] kwargs = dict(rot=90, flierprops=dict(markersize=1), ylabel=f'correlation per {FEAT_NAME_DISPLAY}') @@ -576,13 +578,14 @@ def build_text(s): corr_per_feat_test.loc[~too_few_obs].describe().to_excel( w, sheet_name='summary') corr_per_feat_test.to_excel(w, sheet_name='correlations') + corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted') -# %% +# %% tags=["hide-input"] feat_count_test = data.test_y.stack().groupby(FEAT_NAME).count() feat_count_test.name = 'count' feat_count_test.head() -# %% +# %% tags=["hide-input"] treshold = vaep.pandas.get_lower_whiskers( corr_per_feat_test[TOP_N_ORDER]).min() mask = (corr_per_feat_test[TOP_N_ORDER] < treshold).any(axis=1) @@ -609,23 +612,23 @@ def highlight_min(s, color, tolerence=0.00001): # %% [markdown] # ### Error plot -# %% +# %% tags=["hide-input"] metrics = vaep.models.Metrics() test_metrics = metrics.add_metrics( pred_test[['observed', *TOP_N_ORDER]], key='test data') test_metrics = pd.DataFrame(test_metrics)[TOP_N_ORDER] test_metrics -# %% +# %% tags=["hide-input"] n_in_comparison = int(test_metrics.loc['N'].unique()[0]) n_in_comparison -# %% +# %% tags=["hide-input"] _to_plot = test_metrics.loc[METRIC].to_frame().T _to_plot.index = [feature_names.name] _to_plot -# %% +# %% tags=["hide-input"] try: text = model_configs[["latent_dim", "hidden_layers"]].apply( build_text, @@ -639,7 +642,7 @@ def highlight_min(s, color, tolerence=0.00001): _to_plot -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=(4, 2)) # size of 
the plot can be adjusted ax = _to_plot.loc[[feature_names.name]].plot.bar( rot=0, @@ -655,7 +658,7 @@ def highlight_min(s, color, tolerence=0.00001): figures[fname.stem] = fname vaep.savefig(fig, name=fname) -# %% +# %% tags=["hide-input"] dumps[fname.stem] = fname.with_suffix('.csv') _to_plot_long = _to_plot.T _to_plot_long = _to_plot_long.rename( @@ -668,7 +671,7 @@ def highlight_min(s, color, tolerence=0.00001): # %% [markdown] # ### Plot error by median feature intensity -# %% +# %% tags=["hide-input"] vaep.plotting.make_large_descriptors(7) fig, ax = plt.subplots(figsize=(8, 2)) @@ -692,7 +695,7 @@ def highlight_min(s, color, tolerence=0.00001): errors_binned.to_csv(fname.with_suffix('.csv')) errors_binned -# %% +# %% tags=["hide-input"] # # ! only used for reporting plotted = vaep.plotting.errors.get_data_for_errors_by_median( errors=errors_binned, @@ -703,7 +706,7 @@ def highlight_min(s, color, tolerence=0.00001): plotted -# %% +# %% tags=["hide-input"] (errors_binned .set_index( ['model', errors_binned.columns[-1]] @@ -714,7 +717,7 @@ def highlight_min(s, color, tolerence=0.00001): # %% [markdown] # ### Custom model selection -# %% +# %% tags=["hide-input"] if SEL_MODELS: metrics = vaep.models.Metrics() test_metrics = metrics.add_metrics( @@ -768,7 +771,7 @@ def highlight_min(s, color, tolerence=0.00001): _to_plot_long.to_csv(fname.with_suffix('.csv')) -# %% +# %% tags=["hide-input"] # custom selection if SEL_MODELS: vaep.plotting.make_large_descriptors(7) @@ -814,7 +817,7 @@ def highlight_min(s, color, tolerence=0.00001): # # - number of observations in parentheses. 
-# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=(8, 2)) ax, errors_binned = vaep.plotting.errors.plot_errors_binned( pred_test[ @@ -829,15 +832,16 @@ def highlight_min(s, color, tolerence=0.00001): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) -# %% +# %% tags=["hide-input"] dumps[fname.stem] = fname.with_suffix('.csv') errors_binned.to_csv(fname.with_suffix('.csv')) errors_binned.head() # %% [markdown] # ## Figures dumped to disk -# %% +# %% tags=["hide-input"] figures -# %% +# %% tags=["hide-input"] dumps +print("done") diff --git a/project/01_3_revision3.py b/project/01_3_revision3.py new file mode 100644 index 000000000..9362de276 --- /dev/null +++ b/project/01_3_revision3.py @@ -0,0 +1,170 @@ +# %% [markdown] +# # Compare models +# +# 1. Load available configurations +# 2. Load validation predictions +# - calculate absolute error on common subset of data +# - select top N for plotting by MAE from smallest (best) to largest (worst) (top N as specified, default 5) +# - correlation per sample, correlation per feat, correlation overall +# - MAE plots +# 3. Load test data predictions +# - as for validation data +# - top N based on validation data +# +# Model with `UNIQUE` key refer to samples uniquly split into training, validation and test data. +# These models could not use all sample for training. The predictions on simulated values +# are therefore restricted to the validation and test data from the set of unique samples. +# The models trained on all sample have additionally missing values in their training data, +# which were not missing in the unique samples. The comparison is therefore between models +# which had different data available for training. 
+ +# %% +import logging +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import yaml + +import vaep +import vaep.imputation +import vaep.models +import vaep.nb +from vaep.analyzers import compare_predictions +from vaep.models.collect_dumps import select_content + +pd.options.display.max_rows = 30 +pd.options.display.min_rows = 10 +pd.options.display.max_colwidth = 100 + +plt.rcParams.update({'figure.figsize': (3, 2)}) +vaep.plotting.make_large_descriptors(7) + +logger = vaep.logging.setup_nb_logger() +logging.getLogger('fontTools').setLevel(logging.WARNING) + + +def load_config_file(fname: Path, first_split='config_') -> dict: + with open(fname) as f: + loaded = yaml.safe_load(f) + key = f"{select_content(fname.stem, first_split=first_split)}" + return key, loaded + + +def build_text(s): + ret = '' + if not np.isnan(s["latent_dim"]): + ret += f'LD: {int(s["latent_dim"])} ' + try: + if len(s["hidden_layers"]): + t = ",".join(str(x) for x in s["hidden_layers"]) + ret += f"HL: {t}" + except TypeError: + # nan + pass + return ret + + +# %% +# catch passed parameters +args = None +args = dict(globals()).keys() + +# %% [markdown] +# Papermill script parameters: + +# %% tags=["parameters"] +# files and folders +# Datasplit folder with data for experiment +folder_experiment: str = 'runs/example' +folder_data: str = '' # specify data directory if needed +file_format: str = 'csv' # change default to pickled files +# Machine parsed metadata from rawfile workflow +fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' +models: str = 'Median,CF,DAE,VAE,KNN' # picked models to compare (comma separated) +sel_models: str = '' # user defined comparison (comma separated) +# Restrict plotting to top N methods for imputation based on error of validation data, maximum 10 +plot_to_n: int = 5 +feat_name_display: str = None # display name for feature name (e.g. 
'protein group') + + +# %% +models = 'KNN,KNN_UNIQUE' +folder_experiment = 'runs/rev3' + +# %% [markdown] +# Some argument transformations + +# %% +args = vaep.nb.get_params(args, globals=globals()) +args + +# %% +args = vaep.nb.args_from_dict(args) +args + +# %% +figures = {} +dumps = {} + +# %% +TARGET_COL = 'observed' +METRIC = 'MAE' +MIN_FREQ = None +MODELS_PASSED = args.models.split(',') +MODELS = MODELS_PASSED.copy() +FEAT_NAME_DISPLAY = args.feat_name_display +SEL_MODELS = None +if args.sel_models: + SEL_MODELS = args.sel_models.split(',') + +# %% + + +# %% [markdown] +# # Load predictions on validation and test data split +# + +# %% [markdown] +# ## Validation data +# - set top N models to plot based on validation data split + +# %% +pred_val = compare_predictions.load_split_prediction_by_modelkey( + experiment_folder=args.folder_experiment, + split='val', + model_keys=MODELS_PASSED, + shared_columns=[TARGET_COL]) +SAMPLE_ID, FEAT_NAME = pred_val.index.names +if not FEAT_NAME_DISPLAY: + FEAT_NAME_DISPLAY = FEAT_NAME +pred_val[MODELS] + +# %% +pred_test = compare_predictions.load_split_prediction_by_modelkey( + experiment_folder=args.folder_experiment, + split='test', + model_keys=MODELS_PASSED, + shared_columns=[TARGET_COL]) +pred_test + +# %% +pred_val = pred_val.dropna() +pred_test = pred_test.dropna() + +# %% +metrics = vaep.models.Metrics() +test_metrics = metrics.add_metrics( + pred_test, key='test data') +test_metrics = pd.DataFrame(test_metrics) +test_metrics + +# %% +metrics = vaep.models.Metrics() +val_metrics = metrics.add_metrics( + pred_val, key='validation data') +val_metrics = pd.DataFrame(val_metrics) +val_metrics + +# %% diff --git a/project/02_3_grid_search_analysis.ipynb b/project/02_3_grid_search_analysis.ipynb index 586417d74..4a3028aa6 100644 --- a/project/02_3_grid_search_analysis.ipynb +++ b/project/02_3_grid_search_analysis.ipynb @@ -16,22 +16,25 @@ "metadata": {}, "outputs": [], "source": [ + "import snakemake\n", "import 
logging\n", "import pathlib\n", - "import pandas as pd\n", - "import plotly.express as px\n", + "\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import plotly.express as px\n", "import seaborn as sns\n", "\n", + "import vaep.io\n", + "import vaep.nb\n", + "import vaep.pandas\n", "import vaep.plotting.plotly as px_vaep\n", - "from vaep.analyzers import compare_predictions\n", + "import vaep.utils\n", "from vaep import sampling\n", + "from vaep.analyzers import compare_predictions\n", "from vaep.io import datasplits\n", - "import vaep.utils\n", - "import vaep.pandas\n", - "import vaep.io\n", - "import vaep.nb\n", + "\n", "matplotlib.rcParams['figure.figsize'] = [12.0, 6.0]\n", "\n", "\n", @@ -96,15 +99,18 @@ "cell_type": "code", "execution_count": null, "id": "8f0497b1-5f91-45e9-a3e1-88de08b928a9", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "# not robust\n", "try:\n", " ORDER = {'model': snakemake.params.models}\n", + " FILE_FORMAT = snakemake.params.file_format\n", "except AttributeError:\n", " ORDER = {'model': ['CF', 'DAE', 'VAE']}\n", - "FILE_FORMAT = snakemake.params.file_format" + " FILE_FORMAT = 'csv'" ] }, { @@ -607,6 +613,16 @@ "id": "f8190d51-c4db-4aae-8b91-11641958a0f8", "metadata": {}, "outputs": [], + "source": [ + "view = metrics_long[[\"model\", \"n_params\", \"data_split\", \"metric_name\", \"metric_value\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f98b49d8", + "metadata": {}, + "outputs": [], "source": [ "plt.rcParams['figure.figsize'] = (7, 4)\n", "plt.rcParams['lines.linewidth'] = 2\n", @@ -616,7 +632,7 @@ "col_order = ('valid_fake_na', 'test_fake_na')\n", "row_order = ('MAE', 'MSE')\n", "fg = sns.relplot(\n", - " data=metrics_long,\n", + " data=view,\n", " x='n_params',\n", " y='metric_value',\n", " col=\"data_split\",\n", @@ -652,6 +668,7 @@ "fname\n", "fname = FOLDER / 
\"hyperpar_results_by_parameters_val+test.pdf\"\n", "files_out[fname.name] = fname.as_posix()\n", + "view.to_excel(fname.with_suffix('.xlsx'))\n", "fg.savefig(fname)\n", "fg.savefig(fname.with_suffix('.png'), dpi=300)" ] diff --git a/project/02_3_grid_search_analysis.py b/project/02_3_grid_search_analysis.py index b3984e069..540bebc05 100644 --- a/project/02_3_grid_search_analysis.py +++ b/project/02_3_grid_search_analysis.py @@ -17,22 +17,25 @@ # # Analyis of grid hyperparameter search # %% +import snakemake import logging import pathlib -import pandas as pd -import plotly.express as px + import matplotlib import matplotlib.pyplot as plt +import pandas as pd +import plotly.express as px import seaborn as sns +import vaep.io +import vaep.nb +import vaep.pandas import vaep.plotting.plotly as px_vaep -from vaep.analyzers import compare_predictions +import vaep.utils from vaep import sampling +from vaep.analyzers import compare_predictions from vaep.io import datasplits -import vaep.utils -import vaep.pandas -import vaep.io -import vaep.nb + matplotlib.rcParams['figure.figsize'] = [12.0, 6.0] @@ -66,9 +69,11 @@ # not robust try: ORDER = {'model': snakemake.params.models} + FILE_FORMAT = snakemake.params.file_format except AttributeError: ORDER = {'model': ['CF', 'DAE', 'VAE']} -FILE_FORMAT = snakemake.params.file_format + FILE_FORMAT = 'csv' + # %% path_metrics = pathlib.Path(metrics_csv) @@ -318,6 +323,9 @@ hover_data['data_split'] = True hover_data['metric_value'] = ':.4f' +# %% +view = metrics_long[["model", "n_params", "data_split", "metric_name", "metric_value"]] + # %% plt.rcParams['figure.figsize'] = (7, 4) plt.rcParams['lines.linewidth'] = 2 @@ -327,7 +335,7 @@ col_order = ('valid_fake_na', 'test_fake_na') row_order = ('MAE', 'MSE') fg = sns.relplot( - data=metrics_long, + data=view, x='n_params', y='metric_value', col="data_split", @@ -363,6 +371,7 @@ fname fname = FOLDER / "hyperpar_results_by_parameters_val+test.pdf" files_out[fname.name] = fname.as_posix() 
+view.to_excel(fname.with_suffix('.xlsx')) fg.savefig(fname) fg.savefig(fname.with_suffix('.png'), dpi=300) diff --git a/project/04_1_train_pimms_models.ipynb b/project/04_1_train_pimms_models.ipynb index 14cf6618c..5169e2d07 100644 --- a/project/04_1_train_pimms_models.ipynb +++ b/project/04_1_train_pimms_models.ipynb @@ -5,10 +5,10 @@ "id": "eae0a078", "metadata": {}, "source": [ - "# Scikit-learn styple transformers of the data\n", + "# PIMMS Tutorial: Scikit-learn style transformers\n", "\n", "1. Load data into pandas dataframe\n", - "2. Fit transformer on training data\n", + "2. Fit model on training data, potentially specify validation data\n", "3. Impute only missing values with predictions from model\n", "\n", "Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas\n", @@ -34,7 +34,7 @@ " except metadata.PackageNotFoundError:\n", " print(\"Install PIMMS...\")\n", " # !pip install git+https://github.com/RasmussenLab/pimms.git@dev\n", - " !pip install pimms-learn " + " !pip install pimms-learn" ] }, { @@ -74,7 +74,7 @@ "\n", "\n", "from vaep.plotting.defaults import color_model_mapping\n", - "import vaep.plotting.data \n", + "import vaep.plotting.data\n", "import vaep.sampling\n", "\n", "from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer\n", @@ -208,7 +208,7 @@ "\n", "\n", "def select_features(df, feat_prevalence=.2, axis=0):\n", - " # ! vaep.filter.select_features\n", + " # # ! vaep.filter.select_features\n", " N = df.shape[axis]\n", " minimum_freq = N * feat_prevalence\n", " freq = df.notna().sum(axis=axis)\n", @@ -273,7 +273,7 @@ "metadata": {}, "outputs": [], "source": [ - "# # # CollaborativeFilteringTransformer?" + "# # # # CollaborativeFilteringTransformer?" 
] }, { @@ -680,10 +680,6 @@ } ], "metadata": { - "mystnb": { - "execution_raise_on_error": true, - "execution_timeout": 120 - }, "jupytext": { "cell_metadata_filter": "-all", "main_language": "python", @@ -705,6 +701,10 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.17" + }, + "mystnb": { + "execution_raise_on_error": true, + "execution_timeout": 120 } }, "nbformat": 4, diff --git a/project/04_1_train_pimms_models.py b/project/04_1_train_pimms_models.py index f4891d239..0a11b509c 100644 --- a/project/04_1_train_pimms_models.py +++ b/project/04_1_train_pimms_models.py @@ -1,8 +1,8 @@ # %% [markdown] -# # Scikit-learn styple transformers of the data +# # PIMMS Tutorial: Scikit-learn style transformers # # 1. Load data into pandas dataframe -# 2. Fit transformer on training data +# 2. Fit model on training data, potentially specify validation data # 3. Impute only missing values with predictions from model # # Autoencoders need wide training data, i.e. a sample with all its features' intensities, whereas @@ -21,7 +21,7 @@ except metadata.PackageNotFoundError: print("Install PIMMS...") # # !pip install git+https://github.com/RasmussenLab/pimms.git@dev - # !pip install pimms-learn + # !pip install pimms-learn # %% [markdown] # If on colab, please restart the environment and run everything from here on. 
@@ -42,7 +42,7 @@ from vaep.plotting.defaults import color_model_mapping -import vaep.plotting.data +import vaep.plotting.data import vaep.sampling from vaep.sklearn.cf_transformer import CollaborativeFilteringTransformer diff --git a/project/10_1_ald_diff_analysis.ipynb b/project/10_1_ald_diff_analysis.ipynb index d00af1fff..24dd7ce90 100644 --- a/project/10_1_ald_diff_analysis.ipynb +++ b/project/10_1_ald_diff_analysis.ipynb @@ -6,7 +6,7 @@ "source": [ "# Differential Analysis - Compare model imputation with standard imputation\n", "\n", - "- load missing values predictions\n", + "- load missing values predictions (if specified)\n", "- leave all other values as they were\n", "- compare missing values predicition by model with baseline method\n", " (default: draw from shifted normal distribution. short RSN)" @@ -15,7 +15,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", @@ -39,7 +43,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# catch passed parameters\n", @@ -51,14 +59,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Parameters" + "## Parameters\n", + "Default and set parameters for the notebook." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2, "tags": [ "parameters" ] @@ -83,10 +91,22 @@ "template_pred = 'pred_real_na_{}.csv' # fixed, do not change" ] }, + { + "cell_type": "markdown", + "id": "01617e36", + "metadata": {}, + "source": [ + "Add set parameters to configuration" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if not model:\n", @@ -98,7 +118,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "args = vaep.nb.Config()\n", @@ -118,13 +142,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Outputs of this notebook will be stored here" + "Outputs of this notebook will be stored here:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out = {}\n", @@ -135,20 +163,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Data" + "## Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## MS proteomics" + "### MS proteomics or specified omics data\n", + "Aggregated from data splits of the imputation workflow run before." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = vaep.io.datasplits.DataSplits.from_folder(\n", @@ -158,7 +191,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "observed = pd.concat([data.train_X, data.val_y, data.test_y])\n", @@ -169,14 +206,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Clinical data" + "### Clinical data\n", + "Describe numerical data specified for use:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -189,8 +230,11 @@ { "cell_type": "code", "execution_count": null, - "id": "8dc0e77c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ## Additional annotations\n", @@ -221,7 +265,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_clinic[[args.target, *args.covar]].isna().any(axis=1).sum()" @@ -237,7 +285,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask_sample_with_complete_clinical_data = df_clinic[[args.target, *args.covar]].notna().all(axis=1)\n", @@ -254,8 +306,11 @@ { "cell_type": "code", "execution_count": null, - "id": "7d96716e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_clinic.loc[idx_complete_data, args.target].value_counts()" @@ -263,17 +318,19 @@ }, { "cell_type": "markdown", - "id": "5c0917f5", "metadata": {}, "source": [ - "check which patients with kleiner score have misssing covariates" + "Check which patients with kleiner score have misssing covariates:" ] 
}, { "cell_type": "code", "execution_count": null, - "id": "4ec28834", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df_clinic.loc[(~mask_sample_with_complete_clinical_data\n", @@ -283,7 +340,6 @@ }, { "cell_type": "markdown", - "id": "0be92801", "metadata": {}, "source": [ "Save feature frequency of observed data based on complete clinical data" @@ -292,8 +348,11 @@ { "cell_type": "code", "execution_count": null, - "id": "220ee009", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_freq_observed = observed.unstack().loc[idx_complete_data].notna().sum()\n", @@ -311,13 +370,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## ALD study approach using all measurments" + "## ALD study approach using all measurements\n", + "Use parameters as specified in [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study)." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "DATA_COMPLETENESS = 0.6\n", @@ -334,7 +398,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if args.fn_qc_samples:\n", @@ -357,7 +425,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -371,63 +442,51 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## load model predictions for (real) missing data" + "## Load model predictions for (real) missing data\n", + "Load from:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list(args.out_preds.iterdir())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0904ba3", - "metadata": {}, + "metadata": { + "tags": [ 
+ "hide-input" + ] + }, "outputs": [], "source": [ + "# available_files = list(args.out_preds.iterdir())\n", "template_pred = str(args.out_preds / args.template_pred)\n", - "template_pred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ "fname = args.out_preds / args.template_pred.format(args.model)\n", "fname" ] }, { "cell_type": "markdown", - "id": "6e514a17", + "id": "e7a55383", "metadata": { "lines_to_next_cell": 0 }, "source": [ - "Baseline comparison\n", - "In case of RSN -> use filtering as done in original paper (Niu et al. 2022)\n", - "otherwise -> use all data\n", + "Baseline comparison:\n", + "- in case of RSN -> use filtering as done in original [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study) (Niu et al. 2022)\n", + "- otherwise -> use all data\n", "\n", - "- use columns which are provided by model" + "Use columns which are provided by model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "# ALD study approach -> has access to simulated missing data!\n", - "# (VAE model did not see this data)\n", "pred_real_na = None\n", "if args.model_key and str(args.model_key) != 'None':\n", " pred_real_na = (vaep\n", @@ -452,13 +511,17 @@ "lines_to_next_cell": 2 }, "source": [ - "plot subsets to highlight differences" + "Plot unchanged observed intensities to imputed intensity distribution (if available):" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "def plot_distributions(observed: pd.Series,\n", @@ -496,13 +559,13 @@ " ax.set_ylabel('observations')\n", " ax.locator_params(axis='y', integer=True)\n", " ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", - " return fig\n", + " return fig, bins\n", "\n", "\n", 
"vaep.plotting.make_large_descriptors(6)\n", - "fig = plot_distributions(observed,\n", - " imputation=pred_real_na,\n", - " model_key=args.model_key, figsize=(2.5, 2))\n", + "fig, bins = plot_distributions(observed,\n", + " imputation=pred_real_na,\n", + " model_key=args.model_key, figsize=(2.5, 2))\n", "fname = args.out_folder / 'dist_plots' / f'real_na_obs_vs_{args.model_key}.pdf'\n", "files_out[fname.name] = fname.as_posix()\n", "vaep.savefig(fig, name=fname)" @@ -510,15 +573,50 @@ }, { "cell_type": "markdown", + "id": "a8394517", "metadata": {}, "source": [ - "## Mean shift by model" + "Dump frequency of histograms to file for reporting (if imputed values are used)" ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "if pred_real_na is not None:\n", + " counts_per_bin = pd.concat([\n", + " vaep.pandas.get_counts_per_bin(observed.to_frame('observed'), bins=bins),\n", + " vaep.pandas.get_counts_per_bin(pred_real_na.to_frame(args.model_key), bins=bins)\n", + " ], axis=1)\n", + " counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n", + " logger.info(\"Counts per bin saved to %s\", fname.with_suffix('.xlsx'))\n", + " display(counts_per_bin)" + ] + }, + { + "cell_type": "markdown", "metadata": {}, + "source": [ + "## Mean shift by model\n", + "Compare how imputed values are shifted in comparsion to overall distribution.\n", + "\n", + "First by using all intensities without any grouping:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "if pred_real_na is not None:\n", @@ -531,13 +629,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Or by averaging over the calculation by sample" + "Then by averaging over the calculation by sample:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], 
"source": [ "if pred_real_na is not None:\n", @@ -557,14 +659,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Differential analysis\n", - "Impute missing values (or not)" + "## Differential analysis\n", + "Combine observed and imputed data (if available) for differential analysis:" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "df = pd.concat([observed, pred_real_na]).unstack()\n", @@ -574,11 +680,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# * if some features were not imputed -> drop them\n", - "? could be changed: let a model decide if a feature should be imputed, otherwise don't.\n", + "# ? could be changed: let a model decide if a feature should be imputed, otherwise don't.\n", "if pred_real_na is not None:\n", " if df.isna().sum().sum():\n", " logger.warning(\"DataFrame has missing entries after imputation.\")\n", @@ -590,14 +700,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Targets - Clinical variables" + "Results for target and clinical variables:" ] }, { "cell_type": "code", "execution_count": null, - "id": "fe32bcf6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores = njab.stats.ancova.AncovaAll(df_proteomics=df,\n", @@ -606,16 +719,6 @@ " covar=args.covar,\n", " value_name=args.value_name\n", " ).ancova()\n", - "scores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9632495", - "metadata": {}, - "outputs": [], - "source": [ "# features are in first index position\n", "feat_idx = scores.index.get_level_values(0)\n", "if gene_to_PG is not None:\n", @@ -626,11 +729,21 @@ "scores" ] }, + { + "cell_type": "markdown", + "id": "bd02d010", + "metadata": {}, + "source": [ + "Only for target:" + ] + }, { "cell_type": "code", "execution_count": 
null, "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -639,11 +752,21 @@ "scores.loc[pd.IndexSlice[:, args.target], :]" ] }, + { + "cell_type": "markdown", + "id": "bc8f0344", + "metadata": {}, + "source": [ + "Save all results to file:" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -654,24 +777,26 @@ "fname" ] }, + { + "cell_type": "markdown", + "id": "3c3db9ea", + "metadata": {}, + "source": [ + "Saved files:" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 0 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "files_out" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3734882", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/project/10_1_ald_diff_analysis.py b/project/10_1_ald_diff_analysis.py index 712334711..9bb4de7f1 100644 --- a/project/10_1_ald_diff_analysis.py +++ b/project/10_1_ald_diff_analysis.py @@ -15,12 +15,12 @@ # %% [markdown] # # Differential Analysis - Compare model imputation with standard imputation # -# - load missing values predictions +# - load missing values predictions (if specified) # - leave all other values as they were # - compare missing values predicition by model with baseline method # (default: draw from shifted normal distribution. short RSN) -# %% +# %% tags=["hide-input"] import logging from pathlib import Path @@ -38,13 +38,14 @@ logger = vaep.logging.setup_nb_logger() logging.getLogger('fontTools').setLevel(logging.WARNING) -# %% +# %% tags=["hide-input"] # catch passed parameters args = None args = dict(globals()).keys() # %% [markdown] # ## Parameters +# Default and set parameters for the notebook. 
# %% tags=["parameters"] folder_experiment = "runs/appl_ald_data/plasma/proteinGroups" @@ -64,14 +65,16 @@ out_folder = 'diff_analysis' template_pred = 'pred_real_na_{}.csv' # fixed, do not change +# %% [markdown] +# Add set parameters to configuration -# %% +# %% tags=["hide-input"] if not model: model = model_key params = vaep.nb.get_params(args, globals=globals(), remove=True) params -# %% +# %% tags=["hide-input"] args = vaep.nb.Config() args.fn_clinical_data = Path(params["fn_clinical_data"]) args.folder_experiment = Path(params["folder_experiment"]) @@ -85,37 +88,39 @@ args # %% [markdown] -# Outputs of this notebook will be stored here +# Outputs of this notebook will be stored here: -# %% +# %% tags=["hide-input"] files_out = {} args.out_folder # %% [markdown] -# # Data +# ## Data # %% [markdown] -# ## MS proteomics +# ### MS proteomics or specified omics data +# Aggregated from data splits of the imputation workflow run before. -# %% +# %% tags=["hide-input"] data = vaep.io.datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) -# %% +# %% tags=["hide-input"] observed = pd.concat([data.train_X, data.val_y, data.test_y]) observed # %% [markdown] -# ## Clinical data +# ### Clinical data +# Describe numerical data specified for use: -# %% +# %% tags=["hide-input"] df_clinic = pd.read_csv(args.fn_clinical_data, index_col=0) df_clinic = df_clinic.loc[observed.index.levels[0]] cols_clinic = vaep.pandas.get_columns_accessor(df_clinic) df_clinic[[args.target, *args.covar]].describe() -# %% +# %% tags=["hide-input"] # ## Additional annotations # - additional annotations of features (e.g. 
gene names for protein groups) @@ -136,13 +141,13 @@ # - only complete data is used for Differential Analysis # - covariates are not imputed -# %% +# %% tags=["hide-input"] df_clinic[[args.target, *args.covar]].isna().any(axis=1).sum() # %% [markdown] # Data description of data used: -# %% +# %% tags=["hide-input"] mask_sample_with_complete_clinical_data = df_clinic[[args.target, *args.covar]].notna().all(axis=1) fname = args.out_folder / 'mask_sample_with_complete_clinical_data.csv' files_out[fname.name] = fname.as_posix() @@ -153,13 +158,13 @@ .index) df_clinic.loc[idx_complete_data, [args.target, *args.covar]].describe() -# %% +# %% tags=["hide-input"] df_clinic.loc[idx_complete_data, args.target].value_counts() # %% [markdown] -# check which patients with kleiner score have misssing covariates +# Check which patients with kleiner score have misssing covariates: -# %% +# %% tags=["hide-input"] df_clinic.loc[(~mask_sample_with_complete_clinical_data & df_clinic[args.target].notna()), [args.target, *args.covar]] @@ -167,7 +172,7 @@ # %% [markdown] # Save feature frequency of observed data based on complete clinical data -# %% +# %% tags=["hide-input"] feat_freq_observed = observed.unstack().loc[idx_complete_data].notna().sum() feat_freq_observed.name = 'frequency' @@ -179,9 +184,10 @@ _ = ax.set_xticklabels([l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]) # %% [markdown] -# ## ALD study approach using all measurments +# ## ALD study approach using all measurements +# Use parameters as specified in [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study). 
-# %% +# %% tags=["hide-input"] DATA_COMPLETENESS = 0.6 # MIN_N_PROTEIN_GROUPS: int = 200 FRAC_PROTEIN_GROUPS: int = 0.622 @@ -192,7 +198,7 @@ ald_study -# %% +# %% tags=["hide-input"] if args.fn_qc_samples: # Move this to data-preprocessing qc_samples = pd.read_pickle(args.fn_qc_samples) @@ -208,7 +214,7 @@ ald_study -# %% +# %% tags=["hide-input"] fig, axes = vaep.plotting.plot_cutoffs(observed.unstack(), feat_completness_over_samples=cutoffs.feat_completness_over_samples, min_feat_in_sample=cutoffs.min_feat_in_sample) @@ -216,28 +222,22 @@ # %% [markdown] -# ## load model predictions for (real) missing data - -# %% -list(args.out_preds.iterdir()) +# ## Load model predictions for (real) missing data +# Load from: -# %% +# %% tags=["hide-input"] +# available_files = list(args.out_preds.iterdir()) template_pred = str(args.out_preds / args.template_pred) -template_pred - -# %% fname = args.out_preds / args.template_pred.format(args.model) fname # %% [markdown] -# Baseline comparison -# In case of RSN -> use filtering as done in original paper (Niu et al. 2022) -# otherwise -> use all data +# Baseline comparison: +# - in case of RSN -> use filtering as done in original [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study) (Niu et al. 2022) +# - otherwise -> use all data # -# - use columns which are provided by model -# %% -# ALD study approach -> has access to simulated missing data! 
-# (VAE model did not see this data) +# Use columns which are provided by model +# %% tags=["hide-input"] pred_real_na = None if args.model_key and str(args.model_key) != 'None': pred_real_na = (vaep @@ -257,10 +257,10 @@ # %% [markdown] -# plot subsets to highlight differences +# Plot unchanged observed intensities to imputed intensity distribution (if available): -# %% +# %% tags=["hide-input"] def plot_distributions(observed: pd.Series, imputation: pd.Series = None, model_key: str = 'MODEL', @@ -296,30 +296,46 @@ def plot_distributions(observed: pd.Series, ax.set_ylabel('observations') ax.locator_params(axis='y', integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - return fig + return fig, bins vaep.plotting.make_large_descriptors(6) -fig = plot_distributions(observed, - imputation=pred_real_na, - model_key=args.model_key, figsize=(2.5, 2)) +fig, bins = plot_distributions(observed, + imputation=pred_real_na, + model_key=args.model_key, figsize=(2.5, 2)) fname = args.out_folder / 'dist_plots' / f'real_na_obs_vs_{args.model_key}.pdf' files_out[fname.name] = fname.as_posix() vaep.savefig(fig, name=fname) +# %% [markdown] +# Dump frequency of histograms to file for reporting (if imputed values are used) + +# %% tags=["hide-input"] +if pred_real_na is not None: + counts_per_bin = pd.concat([ + vaep.pandas.get_counts_per_bin(observed.to_frame('observed'), bins=bins), + vaep.pandas.get_counts_per_bin(pred_real_na.to_frame(args.model_key), bins=bins) + ], axis=1) + counts_per_bin.to_excel(fname.with_suffix('.xlsx')) + logger.info("Counts per bin saved to %s", fname.with_suffix('.xlsx')) + display(counts_per_bin) + # %% [markdown] # ## Mean shift by model +# Compare how imputed values are shifted in comparsion to overall distribution. 
+# +# First by using all intensities without any grouping: -# %% +# %% tags=["hide-input"] if pred_real_na is not None: shifts = (vaep.imputation.compute_moments_shift(observed, pred_real_na, names=('observed', args.model_key))) display(pd.DataFrame(shifts).T) # %% [markdown] -# Or by averaging over the calculation by sample +# Then by averaging over the calculation by sample: -# %% +# %% tags=["hide-input"] if pred_real_na is not None: index_level = 0 # per sample mean_by_sample = pd.DataFrame( @@ -333,16 +349,16 @@ def plot_distributions(observed: pd.Series, display(mean_by_sample) # %% [markdown] -# # Differential analysis -# Impute missing values (or not) +# ## Differential analysis +# Combine observed and imputed data (if available) for differential analysis: -# %% +# %% tags=["hide-input"] df = pd.concat([observed, pred_real_na]).unstack() df.loc[idx_complete_data] -# %% +# %% tags=["hide-input"] # * if some features were not imputed -> drop them -# ? could be changed: let a model decide if a feature should be imputed, otherwise don't. +# # ? could be changed: let a model decide if a feature should be imputed, otherwise don't. 
if pred_real_na is not None: if df.isna().sum().sum(): logger.warning("DataFrame has missing entries after imputation.") @@ -350,18 +366,15 @@ def plot_distributions(observed: pd.Series, df = df.dropna(axis=1) # %% [markdown] -# Targets - Clinical variables +# Results for target and clinical variables: -# %% +# %% tags=["hide-input"] scores = njab.stats.ancova.AncovaAll(df_proteomics=df, df_clinic=df_clinic, target=args.target, covar=args.covar, value_name=args.value_name ).ancova() -scores - -# %% # features are in first index position feat_idx = scores.index.get_level_values(0) if gene_to_PG is not None: @@ -371,20 +384,26 @@ def plot_distributions(observed: pd.Series, ) scores -# %% +# %% [markdown] +# Only for target: + +# %% tags=["hide-input"] scores.columns = pd.MultiIndex.from_product([[str(args.model_key)], scores.columns], names=('model', 'var')) scores.loc[pd.IndexSlice[:, args.target], :] +# %% [markdown] +# Save all results to file: -# %% +# %% tags=["hide-input"] fname = args.out_folder / 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl' files_out[fname.name] = fname.as_posix() fname.parent.mkdir(exist_ok=True, parents=True) scores.to_pickle(fname) fname +# %% [markdown] +# Saved files: -# %% +# %% tags=["hide-input"] files_out -# %% diff --git a/project/10_2_ald_compare_methods.ipynb b/project/10_2_ald_compare_methods.ipynb index 68ad9f949..02ff40448 100644 --- a/project/10_2_ald_compare_methods.ipynb +++ b/project/10_2_ald_compare_methods.ipynb @@ -7,38 +7,38 @@ "source": [ "# Compare outcomes from differential analysis based on different imputation methods\n", "\n", - "- load scores based on `16_ald_diff_analysis`" + "- load scores based on `10_1_ald_diff_analysis`" ] }, { "cell_type": "code", "execution_count": null, "id": "eec6f931-c04d-428c-b2b1-0424c50e6cd2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ + "import logging\n", "from pathlib import Path\n", "\n", "import 
matplotlib.pyplot as plt\n", "import pandas as pd\n", "import seaborn as sns\n", + "from IPython.display import display\n", "\n", "import vaep\n", "import vaep.databases.diseases\n", + "\n", "logger = vaep.logging.setup_nb_logger()\n", "\n", "plt.rcParams['figure.figsize'] = (2, 2)\n", "fontsize = 5\n", - "vaep.plotting.make_large_descriptors(fontsize)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4bf65da-0569-4a21-ba20-9cae7d3679e7", - "metadata": {}, - "outputs": [], - "source": [ + "vaep.plotting.make_large_descriptors(fontsize)\n", + "logging.getLogger('fontTools').setLevel(logging.ERROR)\n", + "\n", "# catch passed parameters\n", "args = None\n", "args = dict(globals()).keys()" @@ -46,18 +46,18 @@ }, { "cell_type": "markdown", - "id": "22c645fc-c7eb-4c69-a7aa-084fc733258f", + "id": "85f5f5d5", "metadata": {}, "source": [ - "## Parameters" + "## Parameters\n", + "Default and set parameters for the notebook." ] }, { "cell_type": "code", "execution_count": null, - "id": "978876d0-b3cc-4847-8eab-dc0b89ddbbcd", + "id": "f4bf65da-0569-4a21-ba20-9cae7d3679e7", "metadata": { - "lines_to_next_cell": 2, "tags": [ "parameters" ] @@ -70,6 +70,7 @@ "model_key = 'VAE'\n", "baseline = 'RSN'\n", "out_folder = 'diff_analysis'\n", + "selected_statistics = ['p-unc', '-Log10 pvalue', 'qvalue', 'rejected']\n", "\n", "disease_ontology = 5082 # code from https://disease-ontology.org/\n", "# split diseases notebook? 
Query gene names for proteins in file from uniprot?\n", @@ -77,25 +78,25 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "a8016d79-e41a-40a2-bcbf-e11711c33b7d", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "22c645fc-c7eb-4c69-a7aa-084fc733258f", + "metadata": {}, "source": [ - "params = vaep.nb.get_params(args, globals=globals())\n", - "params" + "Add set parameters to configuration" ] }, { "cell_type": "code", "execution_count": null, - "id": "5ded6640-99aa-4759-a8ef-b67029f22766", - "metadata": {}, + "id": "978876d0-b3cc-4847-8eab-dc0b89ddbbcd", + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ + "params = vaep.nb.get_params(args, globals=globals())\n", "args = vaep.nb.Config()\n", "args.folder_experiment = Path(params[\"folder_experiment\"])\n", "args = vaep.nb.add_default_paths(args,\n", @@ -109,53 +110,34 @@ " / params[\"out_folder\"]\n", " / params[\"target\"]\n", " / 'scores')\n", + "args.freq_features_observed = args.folder_experiment / 'freq_features_observed.csv'\n", "args" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "112dd9f2-6219-452a-9c6a-b1712dabb164", - "metadata": {}, - "outputs": [], - "source": [ - "files_in = {\n", - " 'freq_features_observed.csv': args.folder_experiment / 'freq_features_observed.csv',\n", - "}\n", - "files_in" - ] - }, { "cell_type": "markdown", - "id": "c74bcc21-3fb2-4b8d-823a-72a3b6b6e847", - "metadata": {}, - "source": [ - "## Excel file for exports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12009c54-c45f-4ee0-a9b3-b0e8e5f3cff2", + "id": "fc184dea", "metadata": {}, - "outputs": [], "source": [ - "files_out = dict()" + "### Excel file for exports" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ef71b04-d4a5-4def-ad63-866d8bba4a1e", - "metadata": {}, + "id": "a8016d79-e41a-40a2-bcbf-e11711c33b7d", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ + "files_out = 
dict()\n", "writer_args = dict(float_format='%.3f')\n", "\n", "fname = args.out_folder / 'diff_analysis_compare_methods.xlsx'\n", "files_out[fname.name] = fname\n", "writer = pd.ExcelWriter(fname)\n", - "fname" + "logger.info(\"Writing to excel file: %s\", fname)" ] }, { @@ -163,24 +145,27 @@ "id": "770d1f76-e86f-4ae3-9d7b-ceef9b9e9a22", "metadata": {}, "source": [ - "# Load scores" + "## Load scores" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "e8bbada7-8b8e-4399-b0d3-b66c40905839", + "cell_type": "markdown", + "id": "6b108869", "metadata": {}, - "outputs": [], "source": [ - "[x for x in args.scores_folder.iterdir() if 'scores' in str(x)]" + "### Load baseline model scores\n", + "Show all statistics, later use selected statistics" ] }, { "cell_type": "code", "execution_count": null, "id": "97221134-5f61-4158-bfc5-ea30077140b8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl'\n", @@ -188,11 +173,23 @@ "scores_baseline" ] }, + { + "cell_type": "markdown", + "id": "e49a8da2", + "metadata": {}, + "source": [ + "### Load selected comparison model scores" + ] + }, { "cell_type": "code", "execution_count": null, "id": "f0635e4d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.scores_folder / f'diff_analysis_scores_{args.model_key}.pkl'\n", @@ -200,22 +197,49 @@ "scores_model" ] }, + { + "cell_type": "markdown", + "id": "06b7e883", + "metadata": {}, + "source": [ + "### Combined scores\n", + "show only selected statistics for comparison" + ] + }, { "cell_type": "code", "execution_count": null, "id": "373fdf65", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores = scores_model.join(scores_baseline, how='outer')[[args.baseline, args.model_key]]\n", + "scores = scores.loc[:, 
pd.IndexSlice[scores.columns.levels[0].to_list(),\n", + " args.selected_statistics]]\n", "scores" ] }, + { + "cell_type": "markdown", + "id": "b84a6e5a", + "metadata": {}, + "source": [ + "Models in comparison (name mapping)" + ] + }, { "cell_type": "code", "execution_count": null, "id": "34d243d1-3ab4-40e7-9eb8-f9efc828b82d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "models = vaep.nb.Config.from_dict(\n", @@ -223,21 +247,45 @@ "vars(models)" ] }, + { + "cell_type": "markdown", + "id": "dd7a560d", + "metadata": {}, + "source": [ + "## Describe scores" + ] + }, { "cell_type": "code", "execution_count": null, "id": "0fee8f5d-fa52-4369-a1f9-fcfd518ab6bd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores.describe()" ] }, + { + "cell_type": "markdown", + "id": "52ecc596", + "metadata": {}, + "source": [ + "### One-to-one comparison by feature:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "c6e5a0a6-343b-4f07-8d9d-2cd5cf95ae1f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores = scores.loc[pd.IndexSlice[:, args.target], :]\n", @@ -245,22 +293,46 @@ "scores" ] }, + { + "cell_type": "markdown", + "id": "36e14580", + "metadata": {}, + "source": [ + "And the descriptive statistics\n", + "of the numeric values:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "0e45e80a-32d8-4c6c-b0a4-5ce8b7f9e121", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores.describe()" ] }, + { + "cell_type": "markdown", + "id": "e520d6dc", + "metadata": {}, + "source": [ + "and the boolean decision values" + ] + }, { "cell_type": "code", "execution_count": null, "id": "53bd5597-221c-4d54-abf2-82956db42594", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -279,10 
+351,14 @@ "cell_type": "code", "execution_count": null, "id": "2a926ba1-0f3b-4089-a349-b6d66128cf37", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "freq_feat = pd.read_csv(files_in['freq_features_observed.csv'], index_col=0)\n", + "freq_feat = pd.read_csv(args.freq_features_observed, index_col=0)\n", "freq_feat.columns = pd.MultiIndex.from_tuples([('data', 'frequency'),])\n", "freq_feat" ] @@ -292,14 +368,18 @@ "id": "408eacfe-770f-42ff-9057-2a98274e1ae3", "metadata": {}, "source": [ - "# Compare shared features" + "## Compare shared features" ] }, { "cell_type": "code", "execution_count": null, "id": "5b2dfb0f-195b-4044-a228-2d784ea2a458", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores_common = (scores\n", @@ -311,11 +391,23 @@ "scores_common" ] }, + { + "cell_type": "markdown", + "id": "62a9eefd", + "metadata": {}, + "source": [ + "### Annotate decisions in Confusion Table style:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "80cf4145-070d-457a-bb74-ee64299809e7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "def annotate_decision(scores, model, model_column):\n", @@ -335,37 +427,54 @@ "annotations.value_counts()" ] }, + { + "cell_type": "markdown", + "id": "0942d395", + "metadata": {}, + "source": [ + "### List different decisions between models" + ] + }, { "cell_type": "code", "execution_count": null, "id": "cda5ffa4-9a97-4a49-aaba-34e83ef7940a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask_different = (\n", " (scores_common.loc[:, pd.IndexSlice[:, 'rejected']].any(axis=1))\n", " & ~(scores_common.loc[:, pd.IndexSlice[:, 'rejected']].all(axis=1))\n", ")\n", - "\n", - "scores_common.loc[mask_different]" + "_to_write = scores_common.loc[mask_different]\n", + "_to_write.to_excel(writer, 'differences', **writer_args)\n", + 
"logger.info(\"Writen to Excel file under sheet 'differences'.\")\n", + "_to_write" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "b9e2739b-a09a-4113-a3f8-f29d6ed398b7", + "cell_type": "markdown", + "id": "e8832084", "metadata": {}, - "outputs": [], "source": [ - "_to_write = scores_common.loc[mask_different]\n", - "_to_write.to_excel(writer, 'differences', **writer_args)" + "## Plot qvalues of both models with annotated decisions\n", + "\n", + "Prepare data for plotting (qvalues)" ] }, { "cell_type": "code", "execution_count": null, "id": "5b2e5341-b054-40c3-b45a-44ae6ca46cfb", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "var = 'qvalue'\n", @@ -385,14 +494,18 @@ "tags": [] }, "source": [ - "## Plot of intensities for most extreme example" + "List of features with the highest difference in qvalues" ] }, { "cell_type": "code", "execution_count": null, "id": "c7af4a70-aa43-4772-af00-d425f5ed249f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# should it be possible to run not only RSN?\n", @@ -405,19 +518,21 @@ "id": "e715954e-2d62-4cd9-b4b0-063524bca495", "metadata": {}, "source": [ - "## Differences plotted\n", - "\n", - "- first only using created annotations" + "### Differences plotted with created annotations" ] }, { "cell_type": "code", "execution_count": null, "id": "a20d356d-c397-4440-b70e-9d899aa200fd", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "figsize = (2, 2)\n", + "figsize = (4, 4)\n", "size = 5\n", "fig, ax = plt.subplots(figsize=figsize)\n", "x_col = to_plot.columns[0]\n", @@ -450,17 +565,20 @@ "id": "d8849c76-c5f6-4618-87c0-f2635dc9ac66", "metadata": {}, "source": [ - "- showing how many features were measured (\"observed\")" + "- also showing how many features were measured (\"observed\") by size of circle" ] }, { "cell_type": "code", "execution_count": null, "id": 
"1cf258f8-9dcb-40bb-af66-b600f8d413f6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "figsize = (2.5, 2.5)\n", "fig, ax = plt.subplots(figsize=figsize)\n", "ax = sns.scatterplot(data=to_plot,\n", " x=to_plot.columns[0],\n", @@ -490,75 +608,64 @@ "id": "1fee3a21-d8b3-40c6-aea2-4774dfe855ca", "metadata": {}, "source": [ - "# Only features contained in model" + "## Only features contained in model\n", + "- this block exist due to a specific part in the ALD analysis of the paper" ] }, { "cell_type": "code", "execution_count": null, "id": "c4e23a01-fd37-4496-a518-445a9ef38db1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores_model_only = scores.reset_index(level=-1, drop=True)\n", - "scores_model_only = (scores_model_only\n", - " .loc[\n", - " scores_model_only.index.difference(\n", - " scores_common.index),\n", - " args.model_key]\n", - " .sort_values(by='qvalue', ascending=True)\n", - " .join(freq_feat)\n", - " )\n", - "scores_model_only" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f668bef4-e2b9-46fb-828f-e7c6a0e23627", - "metadata": {}, - "outputs": [], - "source": [ - "scores_model_only.rejected.value_counts()" + "_diff = scores_model_only.index.difference(scores_common.index)\n", + "if not _diff.empty:\n", + " scores_model_only = (scores_model_only\n", + " .loc[\n", + " _diff,\n", + " args.model_key]\n", + " .sort_values(by='qvalue', ascending=True)\n", + " .join(freq_feat.squeeze().rename(freq_feat.columns.droplevel()[0])\n", + " )\n", + " )\n", + " display(scores_model_only)\n", + "else:\n", + " scores_model_only = None\n", + " logger.info(\"No features only in new comparision model.\")\n", + "\n", + "if not _diff.empty:\n", + " scores_model_only.to_excel(writer, 'only_model', **writer_args)\n", + " display(scores_model_only.rejected.value_counts())\n", + " scores_model_only_rejected = 
scores_model_only.loc[scores_model_only.rejected]\n", + " scores_model_only_rejected.to_excel(\n", + " writer, 'only_model_rejected', **writer_args)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "419ec0b5-24c4-4366-8a03-48fec3aeb29b", + "cell_type": "markdown", + "id": "78b2c336", "metadata": {}, - "outputs": [], "source": [ - "scores_model_only.to_excel(writer, 'only_model', **writer_args)" + "## DISEASES DB lookup\n", + "\n", + "Query diseases database for gene associations with specified disease ontology id." ] }, { "cell_type": "code", "execution_count": null, - "id": "814b8f38-81ef-4546-9182-f65b124e8858", - "metadata": {}, - "outputs": [], - "source": [ - "scores_model_only_rejected = scores_model_only.loc[scores_model_only.rejected]\n", - "scores_model_only_rejected.to_excel(\n", - " writer, 'only_model_rejected', **writer_args)" - ] - }, - { - "cell_type": "markdown", - "id": "6868984c-1ebf-4183-bebe-35a48b92e479", + "id": "d93a9242-0ef4-4fc7-bd98-226a93639f58", "metadata": { - "tags": [] + "tags": [ + "hide-input" + ] }, - "source": [ - "# DISEASES DB lookup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d93a9242-0ef4-4fc7-bd98-226a93639f58", - "metadata": {}, "outputs": [], "source": [ "data = vaep.databases.diseases.get_disease_association(\n", @@ -582,7 +689,11 @@ "cell_type": "code", "execution_count": null, "id": "5c26415e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [] }, @@ -590,7 +701,11 @@ "cell_type": "code", "execution_count": null, "id": "b68b43df", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_name = scores.index.names[0] # first index level is feature name\n", @@ -607,7 +722,11 @@ "cell_type": "code", "execution_count": null, "id": "f8d4a74d-5a9b-4d9b-9345-4288bb23e19f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "gene_to_PG = 
(scores.droplevel(\n", @@ -625,7 +744,11 @@ "cell_type": "code", "execution_count": null, "id": "d9e76def-b48a-458d-a90b-765e6e70f7a4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "disease_associations_all = data.join(\n", @@ -645,7 +768,11 @@ "cell_type": "code", "execution_count": null, "id": "8e9d6944-87ba-4c41-af14-fb5ed93262f0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = disease_associations_all.index.intersection(scores_model_only.index)\n", @@ -658,7 +785,11 @@ "cell_type": "code", "execution_count": null, "id": "ceefc483-b889-4bab-b207-c8d5fd97fa4a", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = disease_assocications_new.loc[idx, 'score'] >= 2.0\n", @@ -677,7 +808,11 @@ "cell_type": "code", "execution_count": null, "id": "4644759b-8cc2-4f99-a16c-16419cfb915c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = disease_associations_all.index.intersection(\n", @@ -691,7 +826,11 @@ "cell_type": "code", "execution_count": null, "id": "365d8641-97a8-464f-b69b-270af9ae6e2d", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = disease_assocications_new_rejected.loc[idx, 'score'] >= 2.0\n", @@ -723,7 +862,11 @@ "cell_type": "code", "execution_count": null, "id": "6416d494-5f3e-4cf4-b766-b1f95e40ae1c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = disease_associations_all.index.intersection(mask.index[mask])\n", @@ -736,7 +879,11 @@ "cell_type": "code", "execution_count": null, "id": "10899e28-4aee-4d44-a542-e45be6699a1b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = disease_assocications_shared_rejected_by_model.loc[idx, 'score'] >= 2.0\n", @@ -768,7 +915,11 @@ "cell_type": 
"code", "execution_count": null, "id": "7780de55-c63b-4028-a6d0-58bce7be81da", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = disease_associations_all.index.intersection(mask.index[mask])\n", @@ -783,7 +934,11 @@ "cell_type": "code", "execution_count": null, "id": "03115c8f-1f20-4b51-a78c-4d7c0317dc33", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask = disease_assocications_shared_rejected_by_RSN.loc[idx, 'score'] >= 2.0\n", @@ -802,7 +957,11 @@ "cell_type": "code", "execution_count": null, "id": "91e7fbb7-69fd-4b4c-9bc2-40e8dd1907b3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "disease_associations_all.to_excel(\n", @@ -818,26 +977,21 @@ "id": "52a42028-7e2d-47d5-be02-52f7ff1f3665", "metadata": {}, "source": [ - "# Outputs" + "## Outputs" ] }, { "cell_type": "code", "execution_count": null, "id": "7f3a7433-3bf1-4168-8f16-eb6d415ef17f", - "metadata": {}, - "outputs": [], - "source": [ - "writer.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e59ff592-a399-4490-bf3f-7618abf73feb", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ + "writer.close()\n", "files_out" ] } diff --git a/project/10_2_ald_compare_methods.py b/project/10_2_ald_compare_methods.py index 55037a3de..9268c089c 100644 --- a/project/10_2_ald_compare_methods.py +++ b/project/10_2_ald_compare_methods.py @@ -15,30 +15,34 @@ # %% [markdown] # # Compare outcomes from differential analysis based on different imputation methods # -# - load scores based on `16_ald_diff_analysis` +# - load scores based on `10_1_ald_diff_analysis` -# %% +# %% tags=["hide-input"] +import logging from pathlib import Path import matplotlib.pyplot as plt import pandas as pd import seaborn as sns +from IPython.display import display import vaep import vaep.databases.diseases + logger = 
vaep.logging.setup_nb_logger() plt.rcParams['figure.figsize'] = (2, 2) fontsize = 5 vaep.plotting.make_large_descriptors(fontsize) +logging.getLogger('fontTools').setLevel(logging.ERROR) -# %% # catch passed parameters args = None args = dict(globals()).keys() # %% [markdown] # ## Parameters +# Default and set parameters for the notebook. # %% tags=["parameters"] folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups' @@ -47,17 +51,17 @@ model_key = 'VAE' baseline = 'RSN' out_folder = 'diff_analysis' +selected_statistics = ['p-unc', '-Log10 pvalue', 'qvalue', 'rejected'] disease_ontology = 5082 # code from https://disease-ontology.org/ # split diseases notebook? Query gene names for proteins in file from uniprot? annotaitons_gene_col = 'PG.Genes' +# %% [markdown] +# Add set parameters to configuration -# %% +# %% tags=["hide-input"] params = vaep.nb.get_params(args, globals=globals()) -params - -# %% args = vaep.nb.Config() args.folder_experiment = Path(params["folder_experiment"]) args = vaep.nb.add_default_paths(args, @@ -71,80 +75,99 @@ / params["out_folder"] / params["target"] / 'scores') +args.freq_features_observed = args.folder_experiment / 'freq_features_observed.csv' args -# %% -files_in = { - 'freq_features_observed.csv': args.folder_experiment / 'freq_features_observed.csv', -} -files_in - # %% [markdown] -# ## Excel file for exports +# ### Excel file for exports # %% files_out = dict() - -# %% writer_args = dict(float_format='%.3f') fname = args.out_folder / 'diff_analysis_compare_methods.xlsx' files_out[fname.name] = fname writer = pd.ExcelWriter(fname) -fname +logger.info("Writing to excel file: %s", fname) # %% [markdown] -# # Load scores +# ## Load scores -# %% -[x for x in args.scores_folder.iterdir() if 'scores' in str(x)] +# %% [markdown] +# ### Load baseline model scores +# Show all statistics, later use selected statistics -# %% +# %% tags=["hide-input"] fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl' 
scores_baseline = pd.read_pickle(fname) scores_baseline -# %% +# %% [markdown] +# ### Load selected comparison model scores + +# %% tags=["hide-input"] fname = args.scores_folder / f'diff_analysis_scores_{args.model_key}.pkl' scores_model = pd.read_pickle(fname) scores_model -# %% +# %% [markdown] +# ### Combined scores +# show only selected statistics for comparison + +# %% tags=["hide-input"] scores = scores_model.join(scores_baseline, how='outer')[[args.baseline, args.model_key]] +scores = scores.loc[:, pd.IndexSlice[scores.columns.levels[0].to_list(), + args.selected_statistics]] scores -# %% +# %% [markdown] +# Models in comparison (name mapping) + +# %% tags=["hide-input"] models = vaep.nb.Config.from_dict( vaep.pandas.index_to_dict(scores.columns.get_level_values(0))) vars(models) -# %% +# %% [markdown] +# ## Describe scores + +# %% tags=["hide-input"] scores.describe() -# %% +# %% [markdown] +# ### One-to-one comparison by feature: + +# %% tags=["hide-input"] scores = scores.loc[pd.IndexSlice[:, args.target], :] scores.to_excel(writer, 'scores', **writer_args) scores -# %% +# %% [markdown] +# And the descriptive statistics +# of the numeric values: + +# %% tags=["hide-input"] scores.describe() -# %% +# %% [markdown] +# and the boolean decision values + +# %% tags=["hide-input"] scores.describe(include=['bool', 'O']) # %% [markdown] # ## Load frequencies of observed features -# %% -freq_feat = pd.read_csv(files_in['freq_features_observed.csv'], index_col=0) +# %% tags=["hide-input"] +freq_feat = pd.read_csv(args.freq_features_observed, index_col=0) freq_feat.columns = pd.MultiIndex.from_tuples([('data', 'frequency'),]) freq_feat # %% [markdown] -# # Compare shared features +# ## Compare shared features -# %% +# %% tags=["hide-input"] scores_common = (scores .dropna() .reset_index(-1, drop=True) @@ -154,7 +177,10 @@ scores_common -# %% +# %% [markdown] +# ### Annotate decisions in Confusion Table style: + +# %% tags=["hide-input"] def 
annotate_decision(scores, model, model_column): return scores[(model_column, 'rejected')].replace({False: f'{model} (no) ', True: f'{model} (yes)'}) @@ -171,19 +197,25 @@ def annotate_decision(scores, model, model_column): annotations.name = 'Differential Analysis Comparison' annotations.value_counts() -# %% +# %% [markdown] +# ### List different decisions between models + +# %% tags=["hide-input"] mask_different = ( (scores_common.loc[:, pd.IndexSlice[:, 'rejected']].any(axis=1)) & ~(scores_common.loc[:, pd.IndexSlice[:, 'rejected']].all(axis=1)) ) - -scores_common.loc[mask_different] - -# %% _to_write = scores_common.loc[mask_different] _to_write.to_excel(writer, 'differences', **writer_args) +logger.info("Writen to Excel file under sheet 'differences'.") +_to_write -# %% +# %% [markdown] +# ## Plot qvalues of both models with annotated decisions +# +# Prepare data for plotting (qvalues) + +# %% tags=["hide-input"] var = 'qvalue' to_plot = [scores_common[v][var] for v in models.values()] for s, k in zip(to_plot, models.keys()): @@ -194,20 +226,18 @@ def annotate_decision(scores, model, model_column): to_plot # %% [markdown] -# ## Plot of intensities for most extreme example +# List of features with the highest difference in qvalues -# %% +# %% tags=["hide-input"] # should it be possible to run not only RSN? 
to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs() to_plot.loc[mask_different].sort_values('diff_qvalue', ascending=False) # %% [markdown] -# ## Differences plotted -# -# - first only using created annotations +# ### Differences plotted with created annotations -# %% -figsize = (2, 2) +# %% tags=["hide-input"] +figsize = (4, 4) size = 5 fig, ax = plt.subplots(figsize=figsize) x_col = to_plot.columns[0] @@ -235,10 +265,9 @@ def annotate_decision(scores, model, model_column): vaep.savefig(fig, name=fname) # %% [markdown] -# - showing how many features were measured ("observed") +# - also showing how many features were measured ("observed") by size of circle -# %% -figsize = (2.5, 2.5) +# %% tags=["hide-input"] fig, ax = plt.subplots(figsize=figsize) ax = sns.scatterplot(data=to_plot, x=to_plot.columns[0], @@ -263,35 +292,39 @@ def annotate_decision(scores, model, model_column): fig, name=files_out[f'diff_analysis_comparision_2_{args.model_key}']) # %% [markdown] -# # Only features contained in model +# ## Only features contained in model +# - this block exists due to a specific part in the ALD analysis of the paper -# %% +# %% tags=["hide-input"] scores_model_only = scores.reset_index(level=-1, drop=True) -scores_model_only = (scores_model_only - .loc[ - scores_model_only.index.difference( - scores_common.index), - args.model_key] - .sort_values(by='qvalue', ascending=True) - .join(freq_feat) - ) -scores_model_only +_diff = scores_model_only.index.difference(scores_common.index) +if not _diff.empty: + scores_model_only = (scores_model_only + .loc[ + _diff, + args.model_key] + .sort_values(by='qvalue', ascending=True) + .join(freq_feat.squeeze().rename(freq_feat.columns.droplevel()[0]) + ) + ) + display(scores_model_only) +else: + scores_model_only = None + logger.info("No features only in new comparision model.") -# %% -scores_model_only.rejected.value_counts() - -# %% -scores_model_only.to_excel(writer, 'only_model', 
**writer_args) - -# %% -scores_model_only_rejected = scores_model_only.loc[scores_model_only.rejected] -scores_model_only_rejected.to_excel( - writer, 'only_model_rejected', **writer_args) +if not _diff.empty: + scores_model_only.to_excel(writer, 'only_model', **writer_args) + display(scores_model_only.rejected.value_counts()) + scores_model_only_rejected = scores_model_only.loc[scores_model_only.rejected] + scores_model_only_rejected.to_excel( + writer, 'only_model_rejected', **writer_args) # %% [markdown] -# # DISEASES DB lookup +# ## DISEASES DB lookup +# +# Query diseases database for gene associations with specified disease ontology id. -# %% +# %% tags=["hide-input"] data = vaep.databases.diseases.get_disease_association( doid=args.disease_ontology, limit=10000) data = pd.DataFrame.from_dict(data, orient='index').rename_axis('ENSP', axis=0) @@ -303,9 +336,9 @@ def annotate_decision(scores, model, model_column): # ## Shared features # ToDo: new script -> DISEASES DB lookup -# %% +# %% tags=["hide-input"] -# %% +# %% tags=["hide-input"] feat_name = scores.index.names[0] # first index level is feature name if args.annotaitons_gene_col in scores.index.names: logger.info(f"Found gene annotation in scores index: {scores.index.names}") @@ -315,7 +348,7 @@ def annotate_decision(scores, model, model_column): import sys sys.exit(0) -# %% +# %% tags=["hide-input"] gene_to_PG = (scores.droplevel( list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col}) ) @@ -326,7 +359,7 @@ def annotate_decision(scores, model, model_column): ) gene_to_PG.head() -# %% +# %% tags=["hide-input"] disease_associations_all = data.join( gene_to_PG).dropna().reset_index().set_index(feat_name).join(annotations) disease_associations_all @@ -334,27 +367,27 @@ def annotate_decision(scores, model, model_column): # %% [markdown] # ## only by model -# %% +# %% tags=["hide-input"] idx = disease_associations_all.index.intersection(scores_model_only.index) disease_assocications_new = 
disease_associations_all.loc[idx].sort_values( 'score', ascending=False) disease_assocications_new.head(20) -# %% +# %% tags=["hide-input"] mask = disease_assocications_new.loc[idx, 'score'] >= 2.0 disease_assocications_new.loc[idx].loc[mask] # %% [markdown] # ## Only by model which were significant -# %% +# %% tags=["hide-input"] idx = disease_associations_all.index.intersection( scores_model_only_rejected.index) disease_assocications_new_rejected = disease_associations_all.loc[idx].sort_values( 'score', ascending=False) disease_assocications_new_rejected.head(20) -# %% +# %% tags=["hide-input"] mask = disease_assocications_new_rejected.loc[idx, 'score'] >= 2.0 disease_assocications_new_rejected.loc[idx].loc[mask] @@ -365,13 +398,13 @@ def annotate_decision(scores, model, model_column): mask = (scores_common[(str(args.model_key), 'rejected')] & mask_different) mask.sum() -# %% +# %% tags=["hide-input"] idx = disease_associations_all.index.intersection(mask.index[mask]) disease_assocications_shared_rejected_by_model = (disease_associations_all.loc[idx].sort_values( 'score', ascending=False)) disease_assocications_shared_rejected_by_model.head(20) -# %% +# %% tags=["hide-input"] mask = disease_assocications_shared_rejected_by_model.loc[idx, 'score'] >= 2.0 disease_assocications_shared_rejected_by_model.loc[idx].loc[mask] @@ -382,7 +415,7 @@ def annotate_decision(scores, model, model_column): mask = (scores_common[(str(args.baseline), 'rejected')] & mask_different) mask.sum() -# %% +# %% tags=["hide-input"] idx = disease_associations_all.index.intersection(mask.index[mask]) disease_assocications_shared_rejected_by_RSN = ( disease_associations_all @@ -390,14 +423,14 @@ def annotate_decision(scores, model, model_column): .sort_values('score', ascending=False)) disease_assocications_shared_rejected_by_RSN.head(20) -# %% +# %% tags=["hide-input"] mask = disease_assocications_shared_rejected_by_RSN.loc[idx, 'score'] >= 2.0 
disease_assocications_shared_rejected_by_RSN.loc[idx].loc[mask] # %% [markdown] # ## Write to excel -# %% +# %% tags=["hide-input"] disease_associations_all.to_excel( writer, sheet_name='disease_assoc_all', **writer_args) disease_assocications_new.to_excel( @@ -406,10 +439,8 @@ def annotate_decision(scores, model, model_column): writer, sheet_name='disease_assoc_new_rejected', **writer_args) # %% [markdown] -# # Outputs +# ## Outputs -# %% +# %% tags=["hide-input"] writer.close() - -# %% files_out diff --git a/project/10_3_ald_ml_new_feat.ipynb b/project/10_3_ald_ml_new_feat.ipynb index 08b5a5985..8d9ebd273 100644 --- a/project/10_3_ald_ml_new_feat.ipynb +++ b/project/10_3_ald_ml_new_feat.ipynb @@ -3,29 +3,31 @@ { "cell_type": "markdown", "id": "d5f8edbd", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ - "# Compare outcomes from differential analysis based on different imputation methods\n", - "\n", - "- load scores based on `10_1_ald_diff_analysis.ipynb`\n", - "- compare performance for set of features included in original Study\n", - " to the set of features included in Niu. et. al 2022\n", - " (by lowering the threshold for feature completeness))\n", - "- RSN should be set as baseline if Niu et. al 2022 data is used\n", + "# Fit logistic regression model\n", "\n", - "This notebook could be adapted to compare\n", - "1. different set of features which were classified \"significant\" (is there signal)?" 
+ "- based on different imputation methods\n", + "- baseline: reference\n", + "- model: any other selected imputation method" ] }, { "cell_type": "code", "execution_count": null, "id": "8d8c6764", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", "from pathlib import Path\n", + "from typing import List\n", "\n", "import matplotlib.pyplot as plt\n", "import njab.sklearn\n", @@ -47,16 +49,35 @@ "\n", "\n", "logger = vaep.logging.setup_nb_logger()\n", - "logging.getLogger('fontTools').setLevel(logging.ERROR)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61a7353e", - "metadata": {}, - "outputs": [], - "source": [ + "logging.getLogger('fontTools').setLevel(logging.ERROR)\n", + "\n", + "\n", + "def parse_roc(*res: List[njab.sklearn.types.Results]) -> pd.DataFrame:\n", + " ret = list()\n", + " for _r in res:\n", + " _roc = (pd.DataFrame(_r.test.roc,\n", + " index='fpr tpr cutoffs'.split()\n", + " )).loc[['fpr', 'tpr']]\n", + " _roc = _roc.T\n", + " _roc.columns = pd.MultiIndex.from_product([[_r.name], _roc.columns])\n", + " ret.append(_roc)\n", + " ret = pd.concat(ret, axis=1)\n", + " return ret\n", + "\n", + "\n", + "def parse_prc(*res: List[njab.sklearn.types.Results]) -> pd.DataFrame:\n", + " ret = list()\n", + " for _r in res:\n", + " _prc = pd.DataFrame(_r.test.prc,\n", + " index='precision recall cutoffs'.split()\n", + " ).loc[['precision', 'recall']]\n", + " _prc = _prc.T.rename(columns={'recall': 'tpr'})\n", + " _prc.columns = pd.MultiIndex.from_product([[_r.name], _prc.columns])\n", + " ret.append(_prc)\n", + " ret = pd.concat(ret, axis=1)\n", + " return ret\n", + "\n", + "\n", "# catch passed parameters\n", "args = None\n", "args = dict(globals()).keys()" @@ -64,10 +85,11 @@ }, { "cell_type": "markdown", - "id": "139c9ae8", + "id": "e1e67f6d", "metadata": {}, "source": [ - "## Parameters" + "## Parameters\n", + "Default and set parameters for the notebook." 
] }, { @@ -101,20 +123,14 @@ "cell_type": "code", "execution_count": null, "id": "13538b85", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "params = vaep.nb.get_params(args, globals=globals())\n", - "params" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92a4a7c0", - "metadata": {}, - "outputs": [], - "source": [ "args = vaep.nb.Config()\n", "args.folder_experiment = Path(params[\"folder_experiment\"])\n", "args = vaep.nb.add_default_paths(args,\n", @@ -123,25 +139,18 @@ " / params[\"target\"]\n", " / f\"{params['baseline']}_vs_{params['model_key']}\"))\n", "args.update_from_dict(params)\n", + "files_out = dict()\n", "args" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "04ac25ed", - "metadata": {}, - "outputs": [], - "source": [ - "files_out = dict()" - ] - }, { "cell_type": "markdown", - "id": "07fb7cc9", + "id": "1ca8264e", "metadata": {}, "source": [ - "## Load target" + "## Load data\n", + "\n", + "### Load target" ] }, { @@ -165,14 +174,19 @@ "id": "02bbf2a2", "metadata": {}, "source": [ - "### Measured data" + "### MS proteomics or specified omics data\n", + "Aggregated from data splits of the imputation workflow run before." ] }, { "cell_type": "code", "execution_count": null, "id": "f4cd6005", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = vaep.io.datasplits.DataSplits.from_folder(\n", @@ -194,14 +208,19 @@ "id": "c79ad218", "metadata": {}, "source": [ - "### Load ALD data or create" + "### Select by ALD criteria\n", + "Use parameters as specified in [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study)." 
] }, { "cell_type": "code", "execution_count": null, "id": "3038462c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "DATA_COMPLETENESS = 0.6\n", @@ -230,11 +249,23 @@ "ald_study" ] }, + { + "cell_type": "markdown", + "id": "e0e04598", + "metadata": {}, + "source": [ + "Number of complete cases which can be used:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "3a9e70e6", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "mask_has_target = data.index.levels[0].intersection(target.index)\n", @@ -249,14 +280,18 @@ "id": "fcc05bf5", "metadata": {}, "source": [ - "### Load semi-supervised model imputations" + "### Load imputations from specified model" ] }, { "cell_type": "code", "execution_count": null, "id": "5f072d5f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.out_preds / args.template_pred.format(args.model_key)\n", @@ -266,11 +301,23 @@ "pred_real_na.sample(3)" ] }, + { + "cell_type": "markdown", + "id": "eadd9ea6", + "metadata": {}, + "source": [ + "### Load imputations from baseline model" + ] + }, { "cell_type": "code", "execution_count": null, "id": "0f2dd584", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.out_preds / args.template_pred.format(args.baseline)\n", @@ -283,7 +330,7 @@ "id": "8fa21c8b", "metadata": {}, "source": [ - "# Model predictions\n", + "## Modeling setup\n", "General approach:\n", " - use one train, test split of the data\n", " - select best 10 features from training data `X_train`, `y_train` before binarization of target\n", @@ -293,27 +340,45 @@ "Repeat general approach for\n", " 1. all original ald data: all features justed in original ALD study\n", " 2. all model data: all features available my using the self supervised deep learning model\n", - "3. 
newly available feat only: the subset of features available from the\n", - "self supervised deep learning model which were newly retained using the\n", - "new approach" + " 3. newly available feat only: the subset of features available from the\n", + " self supervised deep learning model which were newly retained using the\n", + " new approach\n", + "\n", + "All data:" ] }, { "cell_type": "code", "execution_count": null, "id": "f457863e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "X = pd.concat([data, pred_real_na]).unstack()\n", "X" ] }, + { + "cell_type": "markdown", + "id": "0c92c7bd", + "metadata": {}, + "source": [ + "### Subset of data by ALD criteria" + ] + }, { "cell_type": "code", "execution_count": null, "id": "a387dd6f", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# could be just observed, drop columns with missing values\n", @@ -321,18 +386,30 @@ " [ald_study.stack(),\n", " pred_real_na_baseline.loc[\n", " # only select columns in selected in ald_study\n", - " pd.IndexSlice[:, ald_study.columns]\n", + " pd.IndexSlice[:, pred_real_na.index.levels[-1].intersection(ald_study.columns)]\n", " ]\n", " ]\n", ").unstack()\n", "ald_study" ] }, + { + "cell_type": "markdown", + "id": "f8c07f73", + "metadata": {}, + "source": [ + "Features which would not have been included using ALD criteria:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "12b9d002", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "new_features = X.columns.difference(ald_study.columns)\n", @@ -344,14 +421,18 @@ "id": "a8e67247-a2a1-4a2f-b838-0bdc9f40cfa9", "metadata": {}, "source": [ - "Binarize targets, but also keep groups for stratification\n" + "Binarize targets, but also keep groups for stratification" ] }, { "cell_type": "code", "execution_count": null, "id": "4aa1f404-427a-4e78-b98d-cb26bb1d1ec4", - "metadata": {}, + 
"metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "target_to_group = target.copy()\n", @@ -364,54 +445,99 @@ "id": "bfab754f", "metadata": {}, "source": [ - "## Best number of parameters by CV" + "## Determine best number of parameters by cross validation procedure\n", + "\n", + "using subset of data by ALD criteria:" ] }, { "cell_type": "code", "execution_count": null, "id": "90e410d1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "cv_feat_ald = njab.sklearn.find_n_best_features(X=ald_study, y=target, name=args.target,\n", " groups=target_to_group)\n", - "cv_feat_ald = cv_feat_ald.groupby('n_features').agg(['mean', 'std'])\n", + "cv_feat_ald = (cv_feat_ald\n", + " .drop('test_case', axis=1)\n", + " .groupby('n_features')\n", + " .agg(['mean', 'std']))\n", "cv_feat_ald" ] }, + { + "cell_type": "markdown", + "id": "b40fb391", + "metadata": {}, + "source": [ + "Using all data:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "988dea31", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "cv_feat_all = njab.sklearn.find_n_best_features(X=X, y=target, name=args.target,\n", " groups=target_to_group)\n", - "cv_feat_all = cv_feat_all.groupby('n_features').agg(['mean', 'std'])\n", + "cv_feat_all = cv_feat_all.drop('test_case', axis=1).groupby('n_features').agg(['mean', 'std'])\n", "cv_feat_all" ] }, + { + "cell_type": "markdown", + "id": "0029a621", + "metadata": {}, + "source": [ + "Using only new features:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "811f75d0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "cv_feat_new = njab.sklearn.find_n_best_features(X=X.loc[:, new_features],\n", " y=target, name=args.target,\n", " groups=target_to_group)\n", - "cv_feat_new = cv_feat_new.groupby('n_features').agg(['mean', 'std'])\n", + "cv_feat_new = 
cv_feat_new.drop('test_case', axis=1).groupby('n_features').agg(['mean', 'std'])\n", "cv_feat_new" ] }, + { + "cell_type": "markdown", + "id": "bd57bbac", + "metadata": {}, + "source": [ + "### Best number of features by subset of the data:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "72655713", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "n_feat_best = pd.DataFrame(\n", @@ -428,14 +554,20 @@ "id": "2efdc8bf", "metadata": {}, "source": [ - "## Train, test split" + "## Train, test split\n", + "Show number of cases in train and test data" ] }, { "cell_type": "code", "execution_count": null, "id": "dc3d3b21", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 0, + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n", @@ -445,52 +577,37 @@ " stratify=target_to_group,\n", " random_state=42)\n", "idx_train = X_train.index\n", - "idx_test = X_test.index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3b4b394", - "metadata": {}, - "outputs": [], - "source": [ + "idx_test = X_test.index\n", + "\n", "njab.pandas.combine_value_counts(\n", " pd.concat([y_train, y_test],\n", " axis=1,\n", " ignore_index=True,\n", - " )\n", - " .rename(columns={0: 'train', 1: 'test'})\n", + " ).rename(columns={0: 'train', 1: 'test'})\n", ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "71879005", - "metadata": {}, - "outputs": [], - "source": [ - "y_train.value_counts()" - ] - }, { "cell_type": "markdown", - "id": "8b528b8e", + "id": "d3a33fb1", "metadata": {}, "source": [ "## Results\n", "\n", "- `run_model` returns dataclasses with the further needed results\n", - "- add mrmr selection of data (select best number of features to use instead of fixing it)" + "- add mrmr selection of data (select best number of features to use instead of fixing it)\n", + "\n", + "Save results for 
final model on entire data, new features and ALD study criteria selected data." ] }, { "cell_type": "code", "execution_count": null, - "id": "baa9de8b", + "id": "d3b4b394", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -504,28 +621,8 @@ "results_model_full.name = f'{args.model_key} all'\n", "fname = args.out_folder / f'results_{results_model_full.name}.pkl'\n", "files_out[fname.name] = fname\n", - "vaep.io.to_pickle(results_model_full, fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18688a0c", - "metadata": {}, - "outputs": [], - "source": [ - "# all(results_model_full.test.roc.tpr\n", - "# ==\n", - "# vaep.sklearn.Results.from_pickle(fname).test.roc.tpr)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78e72950", - "metadata": {}, - "outputs": [], - "source": [ + "vaep.io.to_pickle(results_model_full, fname)\n", + "\n", "splits = Splits(X_train=X.loc[idx_train, new_features],\n", " X_test=X.loc[idx_test, new_features],\n", " y_train=y_train,\n", @@ -536,16 +633,8 @@ "results_model_new.name = f'{args.model_key} new'\n", "fname = args.out_folder / f'results_{results_model_new.name}.pkl'\n", "files_out[fname.name] = fname\n", - "vaep.io.to_pickle(results_model_new, fname)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "764ec22c", - "metadata": {}, - "outputs": [], - "source": [ + "vaep.io.to_pickle(results_model_new, fname)\n", + "\n", "splits_ald = Splits(\n", " X_train=ald_study.loc[idx_train],\n", " X_test=ald_study.loc[idx_test],\n", @@ -562,17 +651,21 @@ }, { "cell_type": "markdown", - "id": "790b1db5", + "id": "0ad96ff4", "metadata": {}, "source": [ - "### ROC-AUC" + "### ROC-AUC on test split" ] }, { "cell_type": "code", "execution_count": null, "id": "04b82583", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=figsize)\n", @@ -584,19 
+677,49 @@ "vaep.savefig(fig, name=fname)" ] }, + { + "cell_type": "markdown", + "id": "9e35c686", + "metadata": {}, + "source": [ + "Data used to plot ROC:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "322281db", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "res = [results_ald_full, results_model_full, results_model_new]\n", + "\n", + "auc_roc_curve = parse_roc(*res)\n", + "auc_roc_curve.to_excel(fname.with_suffix('.xlsx'))\n", + "auc_roc_curve" + ] + }, { "cell_type": "markdown", "id": "46e9a3f2-89aa-4bd5-a083-d8e16815020a", "metadata": {}, "source": [ - "### Features selected" + "### Features selected for final models" ] }, { "cell_type": "code", "execution_count": null, "id": "9e1bb173", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "selected_features = pd.DataFrame(\n", @@ -620,14 +743,18 @@ "id": "ce227174", "metadata": {}, "source": [ - "### Precision-Recall plot" + "### Precision-Recall plot on test data" ] }, { "cell_type": "code", "execution_count": null, "id": "56ea0d50", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=figsize)\n", @@ -640,6 +767,30 @@ "vaep.savefig(fig, name=fname)" ] }, + { + "cell_type": "markdown", + "id": "bc9e560d", + "metadata": {}, + "source": [ + "Data used to plot PRC:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9f5e5ce", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "prec_recall_curve = parse_prc(*res)\n", + "prec_recall_curve.to_excel(fname.with_suffix('.xlsx'))\n", + "prec_recall_curve" + ] + }, { "cell_type": "markdown", "id": "0ddf0913", @@ -652,7 +803,11 @@ "cell_type": "code", "execution_count": null, "id": "6eb3ed77", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, 
figsize=figsize)\n", @@ -669,7 +824,11 @@ "cell_type": "code", "execution_count": null, "id": "64fee389", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=figsize)\n", @@ -686,17 +845,18 @@ "id": "545b7a34", "metadata": {}, "source": [ - "Options:\n", - "- F1 results for test data for best cutoff on training data?\n", - " (select best cutoff of training data, evaluate on test data)\n", - "- plot X_train PCA/UMAP, map X_test" + "Output files:" ] }, { "cell_type": "code", "execution_count": null, "id": "860e0d5e", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out" diff --git a/project/10_3_ald_ml_new_feat.py b/project/10_3_ald_ml_new_feat.py index 886c3839c..40bef222b 100644 --- a/project/10_3_ald_ml_new_feat.py +++ b/project/10_3_ald_ml_new_feat.py @@ -13,20 +13,17 @@ # --- # %% [markdown] -# # Compare outcomes from differential analysis based on different imputation methods +# # Fit logistic regression model # -# - load scores based on `10_1_ald_diff_analysis.ipynb` -# - compare performance for set of features included in original Study -# to the set of features included in Niu. et. al 2022 -# (by lowering the threshold for feature completeness)) -# - RSN should be set as baseline if Niu et. al 2022 data is used -# -# This notebook could be adapted to compare -# 1. different set of features which were classified "significant" (is there signal)? 
+# - based on different imputation methods +# - baseline: reference +# - model: any other selected imputation method -# %% + +# %% tags=["hide-input"] import logging from pathlib import Path +from typing import List import matplotlib.pyplot as plt import njab.sklearn @@ -50,13 +47,40 @@ logger = vaep.logging.setup_nb_logger() logging.getLogger('fontTools').setLevel(logging.ERROR) -# %% + +def parse_roc(*res: List[njab.sklearn.types.Results]) -> pd.DataFrame: + ret = list() + for _r in res: + _roc = (pd.DataFrame(_r.test.roc, + index='fpr tpr cutoffs'.split() + )).loc[['fpr', 'tpr']] + _roc = _roc.T + _roc.columns = pd.MultiIndex.from_product([[_r.name], _roc.columns]) + ret.append(_roc) + ret = pd.concat(ret, axis=1) + return ret + + +def parse_prc(*res: List[njab.sklearn.types.Results]) -> pd.DataFrame: + ret = list() + for _r in res: + _prc = pd.DataFrame(_r.test.prc, + index='precision recall cutoffs'.split() + ).loc[['precision', 'recall']] + _prc = _prc.T.rename(columns={'recall': 'tpr'}) + _prc.columns = pd.MultiIndex.from_product([[_r.name], _prc.columns]) + ret.append(_prc) + ret = pd.concat(ret, axis=1) + return ret + + # catch passed parameters args = None args = dict(globals()).keys() # %% [markdown] # ## Parameters +# Default and set parameters for the notebook. 
# %% tags=["parameters"] folder_data: str = '' # specify data directory if needed @@ -74,11 +98,8 @@ template_pred = 'pred_real_na_{}.csv' # fixed, do not change -# %% +# %% tags=["hide-input"] params = vaep.nb.get_params(args, globals=globals()) -params - -# %% args = vaep.nb.Config() args.folder_experiment = Path(params["folder_experiment"]) args = vaep.nb.add_default_paths(args, @@ -87,13 +108,13 @@ / params["target"] / f"{params['baseline']}_vs_{params['model_key']}")) args.update_from_dict(params) -args - -# %% files_out = dict() +args # %% [markdown] -# ## Load target +# ## Load data +# +# ### Load target # %% target = pd.read_csv(args.fn_clinical_data, @@ -103,9 +124,10 @@ target # %% [markdown] -# ### Measured data +# ### MS proteomics or specified omics data +# Aggregated from data splits of the imputation workflow run before. -# %% +# %% tags=["hide-input"] data = vaep.io.datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) data = pd.concat([data.train_X, data.val_y, data.test_y]) @@ -115,9 +137,10 @@ # Get overlap between independent features and target # %% [markdown] -# ### Load ALD data or create +# ### Select by ALD criteria +# Use parameters as specified in [ALD study](https://github.com/RasmussenLab/pimms/tree/main/project/data/ALD_study). 
-# %% +# %% tags=["hide-input"] DATA_COMPLETENESS = 0.6 MIN_N_PROTEIN_GROUPS: int = 200 FRAC_PROTEIN_GROUPS: int = 0.622 @@ -143,7 +166,10 @@ ald_study = ald_study.rename(columns=column_name_first_prot_to_pg) ald_study -# %% +# %% [markdown] +# Number of complete cases which can be used: + +# %% tags=["hide-input"] mask_has_target = data.index.levels[0].intersection(target.index) assert not mask_has_target.empty, f"No data for target: {data.index.levels[0]} and {target.index}" print( @@ -151,22 +177,25 @@ target, data, ald_study = target.loc[mask_has_target], data.loc[mask_has_target], ald_study.loc[mask_has_target] # %% [markdown] -# ### Load semi-supervised model imputations +# ### Load imputations from specified model -# %% +# %% tags=["hide-input"] fname = args.out_preds / args.template_pred.format(args.model_key) print(f"missing values pred. by {args.model_key}: {fname}") load_single_csv_pred_file = vaep.analyzers.compare_predictions.load_single_csv_pred_file pred_real_na = load_single_csv_pred_file(fname).loc[mask_has_target] pred_real_na.sample(3) -# %% +# %% [markdown] +# ### Load imputations from baseline model + +# %% tags=["hide-input"] fname = args.out_preds / args.template_pred.format(args.baseline) pred_real_na_baseline = load_single_csv_pred_file(fname) # .loc[mask_has_target] pred_real_na_baseline # %% [markdown] -# # Model predictions +# ## Modeling setup # General approach: # - use one train, test split of the data # - select best 10 features from training data `X_train`, `y_train` before binarization of target @@ -176,62 +205,83 @@ # Repeat general approach for # 1. all original ald data: all features justed in original ALD study # 2. all model data: all features available my using the self supervised deep learning model -# 3. newly available feat only: the subset of features available from the -# self supervised deep learning model which were newly retained using the -# new approach +# 3. 
newly available feat only: the subset of features available from the +# self supervised deep learning model which were newly retained using the +# new approach +# +# All data: -# %% +# %% tags=["hide-input"] X = pd.concat([data, pred_real_na]).unstack() X -# %% +# %% [markdown] +# ### Subset of data by ALD criteria + +# %% tags=["hide-input"] # could be just observed, drop columns with missing values ald_study = pd.concat( [ald_study.stack(), pred_real_na_baseline.loc[ # only select columns in selected in ald_study - pd.IndexSlice[:, ald_study.columns] + pd.IndexSlice[:, pred_real_na.index.levels[-1].intersection(ald_study.columns)] ] ] ).unstack() ald_study -# %% +# %% [markdown] +# Features which would not have been included using ALD criteria: + +# %% tags=["hide-input"] new_features = X.columns.difference(ald_study.columns) new_features # %% [markdown] # Binarize targets, but also keep groups for stratification -# -# %% +# %% tags=["hide-input"] target_to_group = target.copy() target = target >= args.cutoff_target pd.crosstab(target.squeeze(), target_to_group.squeeze()) # %% [markdown] -# ## Best number of parameters by CV +# ## Determine best number of parameters by cross validation procedure +# +# using subset of data by ALD criteria: -# %% +# %% tags=["hide-input"] cv_feat_ald = njab.sklearn.find_n_best_features(X=ald_study, y=target, name=args.target, groups=target_to_group) -cv_feat_ald = cv_feat_ald.groupby('n_features').agg(['mean', 'std']) +cv_feat_ald = (cv_feat_ald + .drop('test_case', axis=1) + .groupby('n_features') + .agg(['mean', 'std'])) cv_feat_ald -# %% +# %% [markdown] +# Using all data: + +# %% tags=["hide-input"] cv_feat_all = njab.sklearn.find_n_best_features(X=X, y=target, name=args.target, groups=target_to_group) -cv_feat_all = cv_feat_all.groupby('n_features').agg(['mean', 'std']) +cv_feat_all = cv_feat_all.drop('test_case', axis=1).groupby('n_features').agg(['mean', 'std']) cv_feat_all -# %% +# %% [markdown] +# Using only new features: 
+ +# %% tags=["hide-input"] cv_feat_new = njab.sklearn.find_n_best_features(X=X.loc[:, new_features], y=target, name=args.target, groups=target_to_group) -cv_feat_new = cv_feat_new.groupby('n_features').agg(['mean', 'std']) +cv_feat_new = cv_feat_new.drop('test_case', axis=1).groupby('n_features').agg(['mean', 'std']) cv_feat_new -# %% +# %% [markdown] +# ### Best number of features by subset of the data: + +# %% tags=["hide-input"] n_feat_best = pd.DataFrame( {'ald': cv_feat_ald.loc[:, pd.IndexSlice[:, 'mean']].idxmax(), 'all': cv_feat_all.loc[:, pd.IndexSlice[:, 'mean']].idxmax(), @@ -242,8 +292,9 @@ # %% [markdown] # ## Train, test split +# Show number of cases in train and test data -# %% +# %% tags=["hide-input"] X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, target, @@ -253,25 +304,21 @@ idx_train = X_train.index idx_test = X_test.index -# %% njab.pandas.combine_value_counts( pd.concat([y_train, y_test], axis=1, ignore_index=True, - ) - .rename(columns={0: 'train', 1: 'test'}) + ).rename(columns={0: 'train', 1: 'test'}) ) - -# %% -y_train.value_counts() - # %% [markdown] # ## Results # # - `run_model` returns dataclasses with the further needed results # - add mrmr selection of data (select best number of features to use instead of fixing it) +# +# Save results for final model on entire data, new features and ALD study criteria selected data. 
-# %% +# %% tags=["hide-input"] splits = Splits(X_train=X.loc[idx_train], X_test=X.loc[idx_test], y_train=y_train, @@ -284,13 +331,6 @@ files_out[fname.name] = fname vaep.io.to_pickle(results_model_full, fname) - -# %% -# all(results_model_full.test.roc.tpr -# == -# vaep.sklearn.Results.from_pickle(fname).test.roc.tpr) - -# %% splits = Splits(X_train=X.loc[idx_train, new_features], X_test=X.loc[idx_test, new_features], y_train=y_train, @@ -303,7 +343,6 @@ files_out[fname.name] = fname vaep.io.to_pickle(results_model_new, fname) -# %% splits_ald = Splits( X_train=ald_study.loc[idx_train], X_test=ald_study.loc[idx_test], @@ -318,9 +357,9 @@ vaep.io.to_pickle(results_ald_full, fname) # %% [markdown] -# ### ROC-AUC +# ### ROC-AUC on test split -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(1, 1, figsize=figsize) plot_split_auc(results_ald_full.test, results_ald_full.name, ax) plot_split_auc(results_model_full.test, results_model_full.name, ax) @@ -330,9 +369,19 @@ vaep.savefig(fig, name=fname) # %% [markdown] -# ### Features selected +# Data used to plot ROC: -# %% +# %% tags=["hide-input"] +res = [results_ald_full, results_model_full, results_model_new] + +auc_roc_curve = parse_roc(*res) +auc_roc_curve.to_excel(fname.with_suffix('.xlsx')) +auc_roc_curve + +# %% [markdown] +# ### Features selected for final models + +# %% tags=["hide-input"] selected_features = pd.DataFrame( [results_ald_full.selected_features, results_model_full.selected_features, @@ -349,9 +398,9 @@ selected_features # %% [markdown] -# ### Precision-Recall plot +# ### Precision-Recall plot on test data -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(1, 1, figsize=figsize) ax = plot_split_prc(results_ald_full.test, results_ald_full.name, ax) @@ -361,10 +410,18 @@ files_out[fname.name] = fname vaep.savefig(fig, name=fname) +# %% [markdown] +# Data used to plot PRC: + +# %% tags=["hide-input"] +prec_recall_curve = parse_prc(*res) +prec_recall_curve.to_excel(fname.with_suffix('.xlsx')) 
+prec_recall_curve + # %% [markdown] # ## Train data plots -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(1, 1, figsize=figsize) ax = plot_split_prc(results_ald_full.train, results_ald_full.name, ax) @@ -374,7 +431,7 @@ files_out[fname.name] = fname vaep.savefig(fig, name=fname) -# %% +# %% tags=["hide-input"] fig, ax = plt.subplots(1, 1, figsize=figsize) plot_split_auc(results_ald_full.train, results_ald_full.name, ax) plot_split_auc(results_model_full.train, results_model_full.name, ax) @@ -384,10 +441,7 @@ vaep.savefig(fig, name=fname) # %% [markdown] -# Options: -# - F1 results for test data for best cutoff on training data? -# (select best cutoff of training data, evaluate on test data) -# - plot X_train PCA/UMAP, map X_test +# Output files: -# %% +# %% tags=["hide-input"] files_out diff --git a/project/10_4_ald_compare_single_pg.ipynb b/project/10_4_ald_compare_single_pg.ipynb index 10f97305a..6a1644d68 100644 --- a/project/10_4_ald_compare_single_pg.ipynb +++ b/project/10_4_ald_compare_single_pg.ipynb @@ -15,7 +15,11 @@ "cell_type": "code", "execution_count": null, "id": "4ffa9d4c-622f-46c3-847a-7f7474082ee4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "import logging\n", @@ -36,7 +40,11 @@ "logging.getLogger('fontTools').setLevel(logging.WARNING)\n", "\n", "plt.rcParams['figure.figsize'] = [4, 2.5] # [16.0, 7.0] , [4, 3]\n", - "vaep.plotting.make_large_descriptors(7)" + "vaep.plotting.make_large_descriptors(7)\n", + "\n", + "# catch passed parameters\n", + "args = None\n", + "args = dict(globals()).keys()" ] }, { @@ -47,18 +55,6 @@ "## Parameters" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6ab3869-cc83-47ed-8ce2-0c8a470b96a6", - "metadata": {}, - "outputs": [], - "source": [ - "# catch passed parameters\n", - "args = None\n", - "args = dict(globals()).keys()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -89,20 +85,14 @@ "cell_type": "code", 
"execution_count": null, "id": "b85c6c2a-146c-48bd-9d7b-1fe4eec8a6ae", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "params = vaep.nb.get_params(args, globals=globals())\n", - "params" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c833157-e36e-476b-a3bd-d604b962ef04", - "metadata": {}, - "outputs": [], - "source": [ "args = vaep.nb.Config()\n", "args.folder_experiment = Path(params[\"folder_experiment\"])\n", "args = vaep.nb.add_default_paths(args,\n", @@ -118,12 +108,22 @@ "args" ] }, + { + "cell_type": "markdown", + "id": "4036fc07", + "metadata": {}, + "source": [ + "Write outputs to excel" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "bc408ac3-85cc-4b39-be77-5202b23bbef7", + "id": "8c833157-e36e-476b-a3bd-d604b962ef04", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -131,14 +131,28 @@ "\n", "fname = args.out_folder / 'diff_analysis_compare_DA.xlsx'\n", "writer = pd.ExcelWriter(fname)\n", - "files_out[fname.name] = fname.as_posix()" + "files_out[fname.name] = fname.as_posix()\n", + "logger.info(\"Writing to excel file: %s\", fname)" + ] + }, + { + "cell_type": "markdown", + "id": "62d61673", + "metadata": {}, + "source": [ + "## Load scores\n", + "List dump of scores:" ] }, { "cell_type": "code", "execution_count": null, "id": "bcbd112b", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "score_dumps = [fname for fname in Path(\n", @@ -146,25 +160,48 @@ "score_dumps" ] }, + { + "cell_type": "markdown", + "id": "18113565", + "metadata": {}, + "source": [ + "Load scores from dumps:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "d240a9b0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "scores = pd.concat([pd.read_pickle(fname) for fname in score_dumps], axis=1)\n", "scores" ] }, + { + "cell_type": 
"markdown", + "id": "1abb0f0b", + "metadata": {}, + "source": [ + "If reference dump is provided, add it to the scores" + ] + }, { "cell_type": "code", "execution_count": null, "id": "c92dae12", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ - "# Reference dump\n", "if args.ref_method_score:\n", " scores_reference = (pd\n", " .read_pickle(args.ref_method_score)\n", @@ -179,14 +216,18 @@ "id": "79746f59", "metadata": {}, "source": [ - "## Load frequencies of observed features" + "### Load frequencies of observed features" ] }, { "cell_type": "code", "execution_count": null, "id": "86ecc391", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "fname = args.folder_experiment / 'freq_features_observed.csv'\n", @@ -195,11 +236,23 @@ "freq_feat" ] }, + { + "cell_type": "markdown", + "id": "641099cd", + "metadata": {}, + "source": [ + "### Assemble qvalues" + ] + }, { "cell_type": "code", "execution_count": null, "id": "54a41e86", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "qvalues = scores.loc[pd.IndexSlice[:, args.target],\n", @@ -215,11 +268,23 @@ "qvalues" ] }, + { + "cell_type": "markdown", + "id": "d024e94d", + "metadata": {}, + "source": [ + "### Assemble pvalues" + ] + }, { "cell_type": "code", "execution_count": null, "id": "0b2488e4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "pvalues = scores.loc[pd.IndexSlice[:, args.target],\n", @@ -235,11 +300,23 @@ "pvalues" ] }, + { + "cell_type": "markdown", + "id": "b3b02e5a", + "metadata": {}, + "source": [ + "### Assemble rejected features" + ] + }, { "cell_type": "code", "execution_count": null, "id": "ec12c234", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "da_target = scores.loc[pd.IndexSlice[:, args.target],\n", @@ -256,11 +333,23 @@ "count_rejected" ] }, + { + 
"cell_type": "markdown", + "id": "85e2a9c8", + "metadata": {}, + "source": [ + "### Tabulate rejected decisions by method:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "c9e0a7b4", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# ! This uses implicitly that RSN is not available for some protein groups\n", @@ -271,11 +360,23 @@ "count_rejected_common" ] }, + { + "cell_type": "markdown", + "id": "4bf8e3e5", + "metadata": {}, + "source": [ + "### Tabulate rejected decisions by method for newly included features (if available)" + ] + }, { "cell_type": "code", "execution_count": null, "id": "af1a13cb", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "count_rejected_new = njab.pandas.combine_value_counts(da_target.loc[~mask_common].droplevel(-1, axis=1))\n", @@ -283,37 +384,75 @@ "count_rejected_new" ] }, + { + "cell_type": "markdown", + "id": "db57d9f0", + "metadata": {}, + "source": [ + "### Tabulate rejected decisions by method for all features" + ] + }, { "cell_type": "code", "execution_count": null, "id": "f76e8772", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "da_target.to_excel(writer, sheet_name='equality_rejected_all')\n", + "logger.info(\"Written to sheet 'equality_rejected_all' in excel file.\")\n", "da_target" ] }, + { + "cell_type": "markdown", + "id": "e1ed7b09", + "metadata": {}, + "source": [ + "Tabulate number of equal decison by method (`True`) to the ones with varying \n", + "decision depending on the method (`False`)" + ] + }, { "cell_type": "code", "execution_count": null, "id": "9fa40ea2", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "da_target_same = (da_target.sum(axis=1) == 0) | da_target.all(axis=1)\n", "da_target_same.value_counts()" ] }, + { + "cell_type": "markdown", + "id": "01bc6744", + "metadata": {}, + "source": [ + 
"List frequency of features with varying decisions" + ] + }, { "cell_type": "code", "execution_count": null, "id": "22c37698", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_idx_w_diff = da_target_same[~da_target_same].index\n", - "feat_idx_w_diff" + "feat_idx_w_diff.to_frame()[['frequency']].reset_index(-1, drop=True)" ] }, { @@ -328,7 +467,11 @@ "cell_type": "code", "execution_count": null, "id": "ee57dfa9", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "(qvalues\n", @@ -344,12 +487,15 @@ " .to_excel(writer, sheet_name='qvalues_diff_common')\n", " )\n", "\n", - "(qvalues\n", - " .loc[feat_idx_w_diff]\n", - " .loc[~mask_common] # mask automatically aligned\n", - " .sort_values(('None', 'qvalue'))\n", - " .to_excel(writer, sheet_name='qvalues_diff_new')\n", - " )\n", + "try:\n", + " (qvalues\n", + " .loc[feat_idx_w_diff]\n", + " .loc[~mask_common]\n", + " .sort_values(('None', 'qvalue'))\n", + " .to_excel(writer, sheet_name='qvalues_diff_new')\n", + " )\n", + "except IndexError:\n", + " print(\"No new features or no new ones (with diverging decisions.)\")\n", "writer.close()" ] }, @@ -366,7 +512,10 @@ "execution_count": null, "id": "10092826", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -388,7 +537,11 @@ "cell_type": "code", "execution_count": null, "id": "624d3301", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "target = pd.read_csv(args.fn_clinical_data,\n", @@ -402,7 +555,11 @@ "cell_type": "code", "execution_count": null, "id": "a160ab0c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "target_to_group = target.copy()\n", @@ -425,7 +582,11 @@ "cell_type": "code", "execution_count": null, "id": "7083535b-9a06-479e-9909-935d49311b00", - "metadata": {}, + "metadata": { + 
"tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = vaep.io.datasplits.DataSplits.from_folder(\n", @@ -447,7 +608,11 @@ "cell_type": "code", "execution_count": null, "id": "b8d183d5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_new_abundant = da_target.loc[~mask_common].any(axis=1)\n", @@ -459,7 +624,11 @@ "cell_type": "code", "execution_count": null, "id": "112a677c", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_sel = feat_idx_w_diff.get_level_values(0)\n", @@ -471,7 +640,11 @@ "cell_type": "code", "execution_count": null, "id": "110c0f53", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "data = data.loc[:, feat_sel]\n", @@ -497,7 +670,11 @@ "cell_type": "code", "execution_count": null, "id": "cfd936e9-eb56-4fb7-8010-d68092b925ad", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "# exclude 'None' as this is without imputation (-> data)\n", @@ -513,7 +690,10 @@ "execution_count": null, "id": "26ecc0ed-c550-4a40-802b-25962d7edf7e", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -540,7 +720,10 @@ "execution_count": null, "id": "e422a7a8", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -555,7 +738,11 @@ "cell_type": "code", "execution_count": null, "id": "a3294f6a-65f3-4793-ad0c-4dd8ff11be47", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "idx = feat_sel[0]" @@ -565,7 +752,11 @@ "cell_type": "code", "execution_count": null, "id": "ee17d5eb-a132-4616-b505-4a68efa0e9e5", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "feat_observed = data[idx].dropna()\n", @@ -577,7 +768,10 @@ 
"execution_count": null, "id": "043395a7-fa33-490e-9d9c-f8071274f0b5", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -594,7 +788,10 @@ "execution_count": null, "id": "f813f693", "metadata": { - "lines_to_next_cell": 2 + "lines_to_next_cell": 2, + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -620,7 +817,9 @@ "execution_count": null, "id": "d819b0e0", "metadata": { - "lines_to_next_cell": 0 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ @@ -728,11 +927,23 @@ " plt.close()" ] }, + { + "cell_type": "markdown", + "id": "f899bcf9", + "metadata": {}, + "source": [ + "Saved files:" + ] + }, { "cell_type": "code", "execution_count": null, "id": "a4b042a1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "files_out" diff --git a/project/10_4_ald_compare_single_pg.py b/project/10_4_ald_compare_single_pg.py index d07f19827..3bb21e39d 100644 --- a/project/10_4_ald_compare_single_pg.py +++ b/project/10_4_ald_compare_single_pg.py @@ -18,7 +18,7 @@ # - see differences in imputation for diverging cases # - dumps top5 -# %% +# %% tags=["hide-input"] import logging from pathlib import Path @@ -39,14 +39,13 @@ plt.rcParams['figure.figsize'] = [4, 2.5] # [16.0, 7.0] , [4, 3] vaep.plotting.make_large_descriptors(7) -# %% [markdown] -# ## Parameters - -# %% # catch passed parameters args = None args = dict(globals()).keys() +# %% [markdown] +# ## Parameters + # %% tags=["parameters"] folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups' fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv" @@ -62,11 +61,8 @@ ref_method_score = None # filepath to reference method score -# %% +# %% tags=["hide-input"] params = vaep.nb.get_params(args, globals=globals()) -params - -# %% args = vaep.nb.Config() args.folder_experiment = Path(params["folder_experiment"]) args = vaep.nb.add_default_paths(args, @@ -81,25 +77,37 
@@ args.update_from_dict(params) args -# %% +# %% [markdown] +# Write outputs to excel + +# %% tags=["hide-input"] files_out = dict() fname = args.out_folder / 'diff_analysis_compare_DA.xlsx' writer = pd.ExcelWriter(fname) files_out[fname.name] = fname.as_posix() +logger.info("Writing to excel file: %s", fname) +# %% [markdown] +# ## Load scores +# List dump of scores: -# %% +# %% tags=["hide-input"] score_dumps = [fname for fname in Path( args.folder_scores).iterdir() if fname.suffix == '.pkl'] score_dumps -# %% +# %% [markdown] +# Load scores from dumps: + +# %% tags=["hide-input"] scores = pd.concat([pd.read_pickle(fname) for fname in score_dumps], axis=1) scores -# %% -# Reference dump +# %% [markdown] +# If reference dump is provided, add it to the scores + +# %% tags=["hide-input"] if args.ref_method_score: scores_reference = (pd .read_pickle(args.ref_method_score) @@ -109,15 +117,18 @@ logger.info(f'Added reference method scores from {args.ref_method_score}') # %% [markdown] -# ## Load frequencies of observed features +# ### Load frequencies of observed features -# %% +# %% tags=["hide-input"] fname = args.folder_experiment / 'freq_features_observed.csv' freq_feat = pd.read_csv(fname, index_col=0) freq_feat.columns = pd.MultiIndex.from_tuples([('data', 'frequency'),]) freq_feat -# %% +# %% [markdown] +# ### Assemble qvalues + +# %% tags=["hide-input"] qvalues = scores.loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'qvalue'] ].join(freq_feat @@ -130,7 +141,10 @@ qvalues.to_excel(writer, sheet_name='qvalues_all') qvalues -# %% +# %% [markdown] +# ### Assemble pvalues + +# %% tags=["hide-input"] pvalues = scores.loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'p-unc'] ].join(freq_feat @@ -143,7 +157,10 @@ pvalues.to_excel(writer, sheet_name='pvalues_all') pvalues -# %% +# %% [markdown] +# ### Assemble rejected features + +# %% tags=["hide-input"] da_target = scores.loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'rejected'] ].join(freq_feat @@ 
-157,7 +174,10 @@ count_rejected.to_excel(writer, sheet_name='count_rejected') count_rejected -# %% +# %% [markdown] +# ### Tabulate rejected decisions by method: + +# %% tags=["hide-input"] # # ! This uses implicitly that RSN is not available for some protein groups # # ! Make an explicit list of the 313 protein groups available in original data mask_common = da_target.notna().all(axis=1) @@ -165,27 +185,41 @@ count_rejected_common.to_excel(writer, sheet_name='count_rejected_common') count_rejected_common -# %% +# %% [markdown] +# ### Tabulate rejected decisions by method for newly included features (if available) + +# %% tags=["hide-input"] count_rejected_new = njab.pandas.combine_value_counts(da_target.loc[~mask_common].droplevel(-1, axis=1)) count_rejected_new.to_excel(writer, sheet_name='count_rejected_new') count_rejected_new -# %% +# %% [markdown] +# ### Tabulate rejected decisions by method for all features + +# %% tags=["hide-input"] da_target.to_excel(writer, sheet_name='equality_rejected_all') +logger.info("Written to sheet 'equality_rejected_all' in excel file.") da_target -# %% +# %% [markdown] +# Tabulate number of equal decison by method (`True`) to the ones with varying +# decision depending on the method (`False`) + +# %% tags=["hide-input"] da_target_same = (da_target.sum(axis=1) == 0) | da_target.all(axis=1) da_target_same.value_counts() -# %% +# %% [markdown] +# List frequency of features with varying decisions + +# %% tags=["hide-input"] feat_idx_w_diff = da_target_same[~da_target_same].index -feat_idx_w_diff +feat_idx_w_diff.to_frame()[['frequency']].reset_index(-1, drop=True) # %% [markdown] # take only those with different decisions -# %% +# %% tags=["hide-input"] (qvalues .loc[feat_idx_w_diff] .sort_values(('None', 'qvalue')) @@ -199,18 +233,21 @@ .to_excel(writer, sheet_name='qvalues_diff_common') ) -(qvalues - .loc[feat_idx_w_diff] - .loc[~mask_common] # mask automatically aligned - .sort_values(('None', 'qvalue')) - .to_excel(writer, 
sheet_name='qvalues_diff_new') - ) +try: + (qvalues + .loc[feat_idx_w_diff] + .loc[~mask_common] + .sort_values(('None', 'qvalue')) + .to_excel(writer, sheet_name='qvalues_diff_new') + ) +except IndexError: + print("No new features or no new ones (with diverging decisions.)") writer.close() # %% [markdown] # ## Plots for inspecting imputations (for diverging decisions) -# %% +# %% tags=["hide-input"] if not args.make_plots: logger.warning("Not plots requested.") import sys @@ -220,14 +257,14 @@ # %% [markdown] # ## Load target -# %% +# %% tags=["hide-input"] target = pd.read_csv(args.fn_clinical_data, index_col=0, usecols=[args.sample_id_col, args.target]) target = target.dropna() target -# %% +# %% tags=["hide-input"] target_to_group = target.copy() target = target >= args.cutoff_target target = target.replace({False: f'{args.target} < {args.cutoff_target}', @@ -238,7 +275,7 @@ # %% [markdown] # ## Measurments -# %% +# %% tags=["hide-input"] data = vaep.io.datasplits.DataSplits.from_folder( args.data, file_format=args.file_format) @@ -248,17 +285,17 @@ # %% [markdown] # plot all of the new pgs which are at least once significant which are not already dumped. 
-# %% +# %% tags=["hide-input"] feat_new_abundant = da_target.loc[~mask_common].any(axis=1) feat_new_abundant = feat_new_abundant.loc[feat_new_abundant].index.get_level_values(0) feat_new_abundant -# %% +# %% tags=["hide-input"] feat_sel = feat_idx_w_diff.get_level_values(0) feat_sel = feat_sel.union(feat_new_abundant) len(feat_sel) -# %% +# %% tags=["hide-input"] data = data.loc[:, feat_sel] data @@ -272,7 +309,7 @@ # # Load all prediction files and reshape -# %% +# %% tags=["hide-input"] # exclude 'None' as this is without imputation (-> data) model_keys = [k for k in qvalues.columns.get_level_values(0) if k != 'None'] pred_paths = [ @@ -280,7 +317,7 @@ for method in model_keys] pred_paths -# %% +# %% tags=["hide-input"] load_single_csv_pred_file = vaep.analyzers.compare_predictions.load_single_csv_pred_file pred_real_na = dict() for method in model_keys: @@ -294,7 +331,7 @@ # %% [markdown] # Once imputation, reduce to target samples only (samples with target score) -# %% +# %% tags=["hide-input"] # select samples with target information data = data.loc[target.index] pred_real_na = pred_real_na.loc[target.index] @@ -302,14 +339,14 @@ # assert len(data) == len(pred_real_na) -# %% +# %% tags=["hide-input"] idx = feat_sel[0] -# %% +# %% tags=["hide-input"] feat_observed = data[idx].dropna() feat_observed -# %% +# %% tags=["hide-input"] # axes = axes.ravel() # args.out_folder.parent / 'intensity_plots' # each feature -> one plot? @@ -318,7 +355,7 @@ folder.mkdir(parents=True, exist_ok=True) -# %% +# %% tags=["hide-input"] min_y_int, max_y_int = vaep.plotting.data.get_min_max_iterable( [data.stack(), pred_real_na.stack()]) min_max = min_y_int, max_y_int @@ -331,7 +368,7 @@ # %% [markdown] # ## Compare with target annotation -# %% +# %% tags=["hide-input"] # labels somehow? 
# target.replace({True: f' >={args.cutoff_target}', False: f'<{args.cutoff_target}'}) @@ -434,5 +471,9 @@ def get_centered_label(method, n, q): fig, name=fname) plt.close() -# %% + +# %% [markdown] +# Saved files: + +# %% tags=["hide-input"] files_out diff --git a/project/README.md b/project/README.md index 7c72014d2..eddd93109 100644 --- a/project/README.md +++ b/project/README.md @@ -27,7 +27,8 @@ or as long formated data. | sample_03 | Protein B | 0.2 | | sample_03 | Protein C | 0.1 | -Currently `pickle`d and `csv` files are supported. +Currently `pickle`d and `csv` files are supported. If you use csv files, make sure +to set an index name for the columns (default: `Sample ID`). It's done mostly automatically. Optionally, ThermoRawFileParser output cab be used as metadata. along further as e.g. clinical metadata for each sample. @@ -82,125 +83,58 @@ papermill 01_0_split_data.ipynb runs/experiment_03/%DATASET%/experiment_03_data tag | notebook | Description --- | --- | --- +Tutorials | +tut | 04_1_train_pimms_models.ipynb | main tutorial showing scikit-learn interface partly with validation data +tut | 04_1_train_DAE_VAE_wo_val_data.ipynb | Single experiment | run | 01_0_split_data.ipynb | Create train, validation and test data splits +run | 01_0_transform_data_to_wide_format.ipynb | Transform train split to wide format for R models run | 01_1_train_.ipynb | Train a single model e.g.
(VAE, DAE, CF) +run | 01_1_train_NAGuideR_methods.ipynb | Train supported R models +run | 01_1_transfer_NAGuideR_pred.ipynb | Transfer R model predictions to correct format in Python run | 01_2_performance_plots.ipynb | Performance of single model run Grid search and best model analysis | -grid | 02_1_aggregate_metrics.py.ipynb | Aggregate metrics -grid | 02_2_aggregate_configs.py.ipynb | Aggregate model configurations +grid | 02_1_{aggregate|join}_metrics.py.ipynb | Aggregate or join metrics +grid | 02_2_{aggregate|join}_configs.py.ipynb | Aggregate or join model configurations grid | 02_3_grid_search_analysis.ipynb | Analyze different runs with varying hyperparameters on a dataset grid | 02_4_best_models_over_all_data | Show best models and best models across data types best | 03_1_best_models_comparison.ipynb | best model trained repeatedly or across datasets -Applications | -ald | 16_ald_data.ipynb | preprocess data -> could be move to data folder -ald | 16_ald_diff_analysis.ipynb | differential analysis (DA), dump scores -ald | 16_ald_compare_methods.ipynb | DA comparison between methods -ald | 16_ald_ml_new_feat.ipynb | ML model comparison -ald | 16_ald_compare_single_pg.ipynb | [DEV] Compare imputation for feat between methods (dist plots) +Differential analysis workflow | +ald | 10_0_ald_data.ipynb | preprocess data -> could be moved to data folder +ald | 10_1_ald_diff_analysis.ipynb | differential analysis (DA), dump scores +ald | 10_2_ald_compare_methods.ipynb | DA comparison between methods +ald | 10_3_ald_ml_new_feat.ipynb | ML model comparison +ald | 10_4_ald_compare_single_pg.ipynb | Compare imputation for feat between methods (dist plots) +ald | 10_5_comp_diff_analysis_repetitions.ipynb | [Not in workflow] Compare 10x repeated differential analysis workflow +ald | 10_6_interpret_repeated_ald_da.py | [Not in workflow] Interpret 10x repeated differential analysis +ald | 10_7_ald_reduced_dataset_plots.ipynb | [Not in workflow] Plots related to reduced
dataset (80% dataset) +Data inspection and manipulations for experiments | +data | 00_5_training_data_exploration.py | Inspect dataset +data | 00_6_0_permute_data.ipynb | Permute data per column to check overfitting of models (mean unchanged per column) +data | 00_8_add_random_missing_values.py | Script to add random missing values to ALD data +Publication specific notebooks | +pub | 03_2_best_models_comparison_fig2.ipynb | Best models comparison in Fig. 2 +pub | 03_3_combine_experiment_result_tables.ipynb | Combine HeLa experiment results for reporting +pub | 03_4_join_tables.py | Combine ALD experiment results for reporting +pub | 03_6_setup_comparison_rev3.py | Analyze setup of KNN comparison for rev 3 Miscancellous notebooks on different topics (partly exploration) | misc | misc_embeddings.ipynb | FastAI Embeddings misc | misc_illustrations.ipynb | Illustrations of certain concepts (e.g. draw from shifted random distribution) misc | misc_json_formats.ipynb | Investigate storring training data as json with correct encoding -misc | misc_MaxQuantOutput.ipynb | \[documentation\] Analyze MQ output, show MaxQuantOutput class behaviour -misc | misc_protein_support.ipynb | peptide sequences mapped to protein sequences misc | misc_pytorch_fastai_dataset.ipynb | Dataset functionality misc | misc_pytorch_fastai_dataloaders.ipynb| Dataloading functionality misc | misc_sampling_in_pandas.ipynb | How to sample in pandas -# Notebook descriptions (To be completed) +## KNN adhoc analysis using jupytext and papermill -## Inspect dataset - -### `00_5_training_data_exploration.py` - -Can be execute manually +Compare performance splitting samples into train, validation and test set. +Use scikit-learn `KNN_IMPUTER` as it's easiest to tweak and understand. 
```bash -jupytext 00_5_training_data_exploration.py --to ipynb -o - | papermill - runs/example/00_5_training_data_exploration.ipynb -f config/single_dev_dataset/example/inspect_data.yaml -``` - -## Single experiment run -### `01_0_split_data.ipynb` - -- select data according to procedure described in **Fig. S1** - -### `01_1_train_.ipynb` -- notebooks for training model `X` (e.g. `VAE`, `DAE` or `CF`) - -### `01_2_performance_plots.ipynb` - -## Grid search and best model analysis - -### `02_1_aggregate_metrics.py.ipynb` and `02_1_join_metrics.py.ipynb` -- helper script to collect `metrics`. -### `02_2_aggregate_configs.py.ipynb` and `02_2_join_configs.py.ipynb` - -- helper script to collect `config`urations. - -### `02_3_grid_search_analysis.ipynb` - -- analyze different runs with varying hyperparameters on a single data set -- run for each protein group, peptides and precursor data set - -### `02_4_best_models_over_all_data.ipynb` - -- show best models across data sets in grid search - -### `03_1_best_models_comparison.ipynb` - -## Misc - -### `misc_clustering_proteins.ipynb` - -- first PCA analysis of proteins from Annelaura - -### `misc_data_exploration_proteins.ipynb` - -### `misc_embeddings.ipynb` - -### `misc_illustrations.ipynb` -- illustrations for presentations -- e.g. 
shifted normal imputation - -### `misc_pytorch_fastai_dataloaders.ipynb` - -### `misc_pytorch_fastai_dataset.ipynb` -### `misc_id_mapper.ipynb` - -### `misc_json_formats.ipynb` - -### `run_ipynbs.py` - -### `misc_protein_support.ipynb` - -- map peptide sequences to protein sequences -- calculate some metrics - -### `misc_sampling_in_pandas.ipynb` - -### `misc_MaxQuantOutput.ipynb` -- misc - -### 01 Analysis Fasta - -#### `misc_FASTA_tryptic_digest.ipynb` - -- analysis FASTA file used for protein search - -#### `misc_FASTA_data_agg_by_gene.ipynb` - -- analysis of gene to protein mapping of fasta file - -### 02 Analysis dataset - -#### `erda_data_available.ipynb` -- analyze `count_all_peptides.json`: How many peptides are identified overall in all - processed files - -> erda notebook: `00_mq_count_peptides.ipynb` - -#### `misc_data_exploration_peptides.ipynb` -- finds files originationg from fractionation experiments -- plot mask indicating presence/abscence of peptide measurement in an experiment -- intensity log-transformation: +# classic: +jupytext --to ipynb -k - -o - 01_1_train_KNN.py | papermill - runs/rev3/01_1_train_KNN.ipynb +# train only on samples without simulated missing values, add simulated missing values to test and validation samples +jupytext --to ipynb -k - -o - 01_1_train_KNN_unique_samples.py | papermill - runs/rev3/01_1_train_KNN_unique_samples.ipynb +# new comparison (check if the old nb could be used for this purpose) +jupytext --to ipynb -k - -o - 01_3_revision3.py | papermill - runs/rev3/01_3_revision3.ipynb +``` \ No newline at end of file diff --git a/project/bin/create_qsub_commands.py b/project/bin/create_qsub_commands.py index d5b2445a9..0c79ce497 100755 --- a/project/bin/create_qsub_commands.py +++ b/project/bin/create_qsub_commands.py @@ -1,39 +1,39 @@ -# %% -from itertools import product - -# import subprocess -mnar_mcar = [25, 50, 75] -datasets = ["pg_m", "pg_l", "pep_m", "evi_m", "pep_l", "evi_l"] - -for dataset, perc in 
product(datasets, mnar_mcar): - print(f"# {dataset = } # {perc = }") - cmd = ( - "qsub bin/run_snakemake_cluster.sh" - f" -N sm_{dataset}_{perc}" - f" -v configfile=config/single_dev_dataset/mnar_mcar/{dataset}.yaml,prefix={dataset}_{perc}," - f"frac_mnar={perc/100:.2f}," - f"config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml," - f"config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml," - f"folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" - ) - print(cmd) - # subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE) - -# %% [markdown] -# Create local command to run on interactive node -print() -print("#" * 80) -print() -# %% -for dataset, perc in product(datasets, mnar_mcar): - cmd = ( - "snakemake -s workflow/Snakefile_v2" - f" --configfile config/single_dev_dataset/mnar_mcar/{dataset}.yaml" - f" --config frac_mnar={perc/100:.2f}" - f" config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml" - f" config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml" - f" folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" - " -c1" - ) - print(cmd) -# %% +# %% +from itertools import product + +# import subprocess +mnar_mcar = [25, 50, 75] +datasets = ["pg_m", "pg_l", "pep_m", "evi_m", "pep_l", "evi_l"] + +for dataset, perc in product(datasets, mnar_mcar): + print(f"# {dataset = } # {perc = }") + cmd = ( + "qsub bin/run_snakemake_cluster.sh" + f" -N sm_{dataset}_{perc}" + f" -v configfile=config/single_dev_dataset/mnar_mcar/{dataset}.yaml,prefix={dataset}_{perc}," + f"frac_mnar={perc/100:.2f}," + f"config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml," + f"config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml," + f"folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" + ) + print(cmd) + # subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE) + +# %% [markdown] +# Create local command to run on interactive node +print() +print("#" * 80) +print() +# %% 
+for dataset, perc in product(datasets, mnar_mcar): + cmd = ( + "snakemake -s workflow/Snakefile_v2.smk" + f" --configfile config/single_dev_dataset/mnar_mcar/{dataset}.yaml" + f" --config frac_mnar={perc/100:.2f}" + f" config_split=runs/mnar_mcar/{dataset}_{perc}MNAR/01_0_split_data.yaml" + f" config_train=runs/mnar_mcar/{dataset}_{perc}MNAR/train_{{model}}.yaml" + f" folder_experiment=runs/mnar_mcar/{dataset}_{perc}MNAR" + " -c1" + ) + print(cmd) +# %% diff --git a/project/bin/run_snakemake_cluster.sh b/project/bin/run_snakemake_cluster.sh index 20d24ff2c..bc96a46a6 100644 --- a/project/bin/run_snakemake_cluster.sh +++ b/project/bin/run_snakemake_cluster.sh @@ -48,7 +48,7 @@ echo config_train $config_train . ~/setup_conda.sh conda activate vaep -snakemake -s workflow/Snakefile_v2 --jobs 10 -k -p -c2 --latency-wait 60 --rerun-incomplete \ +snakemake -s workflow/Snakefile_v2.smk --jobs 10 -k -p -c2 --latency-wait 60 --rerun-incomplete \ --configfile $configfile \ --config frac_mnar=$frac_mnar folder_experiment=$folder_experiment config_split=$config_split config_train=$config_train \ --max-status-checks-per-second 0.1 \ diff --git a/project/config/alzheimer_study/README.md b/project/config/alzheimer_study/README.md new file mode 100644 index 000000000..b947673a0 --- /dev/null +++ b/project/config/alzheimer_study/README.md @@ -0,0 +1,10 @@ +# Alzheimer study configuration + +For [`workflow/Snakefile_v2.smk`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_v2.smk): + +- [`config.yaml`](config.yaml) +- see comments in config for explanations.
+ +For [`workflow/Snakefile_ald_comparison.smk`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_ald_comparison.smk): + +- [`comparison.yaml`](comparison.yaml) diff --git a/project/config/alzheimer_study/comparison.yaml b/project/config/alzheimer_study/comparison.yaml new file mode 100644 index 000000000..27030a7cf --- /dev/null +++ b/project/config/alzheimer_study/comparison.yaml @@ -0,0 +1,22 @@ +folder_experiment: runs/alzheimer_study +fn_clinical_data: https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/clinic_ml.csv +target: AD +covar: + AD: age,Kiel,Magdeburg,Sweden +cutoffs: + AD: 0.5 +disease_ontology: # code from https://disease-ontology.org/ + AD: 10652 # Alzheimer disease +f_annotations: null +annotaitons_gene_col: null +baseline: PI +ref_method_score: +make_plots: false +methods: + - Median + - CF + - DAE + - VAE + - QRILC + - TRKNN + - RF diff --git a/project/config/alzheimer_study/config.yaml b/project/config/alzheimer_study/config.yaml new file mode 100644 index 000000000..66e00ee0c --- /dev/null +++ b/project/config/alzheimer_study/config.yaml @@ -0,0 +1,79 @@ +# config for Snakefile_v2.smk +config_split: runs/alzheimer_study/split.yaml # ! will be built by workflow +config_train: runs/alzheimer_study/train_{model}.yaml # ! will be built by workflow +folder_experiment: runs/alzheimer_study # folder to save the results +fn_rawfile_metadata: https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv # metadata file +cuda: False # use GPU?
+file_format: csv # intermediate file formats +split_data: # for 01_01_split_data.ipynb -> check parameters + FN_INTENSITIES: https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv + sample_completeness: 0.5 + feat_prevalence: 0.25 + column_names: + - protein groups + index_col: 0 + meta_cat_col: _collection site + meta_date_col: null # null if no date column, translated to None in Python + frac_mnar: 0.25 + frac_non_train: 0.1 +models: + - Median: # name used for model with this configuration + model: Median # model used + - CF: + model: CF # notebook: 01_1_train_{model}.ipynb will be 01_1_train_CF.ipynb + latent_dim: 50 + batch_size: 1024 + epochs_max: 100 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + latent_dim: 10 + batch_size: 64 + epochs_max: 300 + hidden_layers: "64" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + latent_dim: 10 + batch_size: 64 + epochs_max: 300 + hidden_layers: "64" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv + - KNN5: + model: KNN + neighbors: 5 + file_format: csv +NAGuideR_methods: + - BPCA + - COLMEDIAN + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + # - MICE-CART > 1h20min on GitHub small runner + # - MICE-NORM ~ 1h on GitHub small runner + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/knn_comparison/ald_pgs_all/README.md b/project/config/knn_comparison/ald_pgs_all/README.md index 95ea87933..2392be95c 100644 --- a/project/config/knn_comparison/ald_pgs_all/README.md +++ b/project/config/knn_comparison/ald_pgs_all/README.md @@ -3,5 +3,5 @@ for ALD protein groups dataset. 
```bash -snakemake -s workflow/Snakefile_v2 --configfile config/knn_comparison/ald_pgs_all/config.yaml -p -c1 -n +snakemake -s workflow/Snakefile_v2.smk --configfile config/knn_comparison/ald_pgs_all/config.yaml -p -c1 -n ``` \ No newline at end of file diff --git a/project/config/knn_comparison/ald_pgs_all/config.yaml b/project/config/knn_comparison/ald_pgs_all/config.yaml index 1a0f21c94..d320c996a 100644 --- a/project/config/knn_comparison/ald_pgs_all/config.yaml +++ b/project/config/knn_comparison/ald_pgs_all/config.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_train: runs/knn_comparison/ald_pgs_all/configs_train/train_{model}.yaml config_split: runs/knn_comparison/ald_pgs_all/config_split.yaml folder_experiment: runs/knn_comparison/ald_pgs_all diff --git a/project/config/knn_comparison/hela_pgs_large/README.md b/project/config/knn_comparison/hela_pgs_large/README.md index b22dd0b33..d865857fe 100644 --- a/project/config/knn_comparison/hela_pgs_large/README.md +++ b/project/config/knn_comparison/hela_pgs_large/README.md @@ -3,5 +3,5 @@ for large protein groups HeLa dataset. 
```bash -snakemake -s workflow/Snakefile_v2 --configfile config/knn_comparison/hela_pgs_large/config.yaml -p -c1 -n +snakemake -s workflow/Snakefile_v2.smk --configfile config/knn_comparison/hela_pgs_large/config.yaml -p -c1 -n ``` \ No newline at end of file diff --git a/project/config/knn_comparison/hela_pgs_large/config.yaml b/project/config/knn_comparison/hela_pgs_large/config.yaml index 671a9c222..fbdd598b1 100644 --- a/project/config/knn_comparison/hela_pgs_large/config.yaml +++ b/project/config/knn_comparison/hela_pgs_large/config.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: config/knn_comparison/hela_pgs_large/split.yaml config_train: runs/knn_comparison/hela_pgs_large/configs_train/train_{model}.yaml folder_experiment: runs/knn_comparison/hela_pgs_large diff --git a/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml b/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml index 5712570f4..3530f1668 100644 --- a/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/evi_l.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pg_l_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pg_l_50MNAR/train_{model}.yaml # ! will be build, should say model_key next folder_experiment: runs/mnar_mcar/pg_l_50MNAR diff --git a/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml b/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml index 657d3adaa..1e1af8e38 100755 --- a/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/evi_m.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pg_m_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pg_m_50MNAR/train_{model}.yaml # ! 
will be build, should say model_key next folder_experiment: runs/mnar_mcar/pg_m_50MNAR diff --git a/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml b/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml index bf5623c7c..1f98afed4 100644 --- a/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/pep_l.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pep_l_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pep_l_50MNAR/train_{model}.yaml # ! will be build, should say model_key next folder_experiment: runs/mnar_mcar/pep_l_50MNAR @@ -6,70 +6,70 @@ frac_mnar: 0.5 fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv file_format: csv split_data: - FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl - sample_completeness: 0.4 - feat_prevalence: 0.25 - index_col: 0 - meta_date_col: Content Creation Date - column_names: null + FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + index_col: 0 + meta_date_col: Content Creation Date + column_names: null models: - - Median: - model: Median # needs to set at least one parameter - - CF: - model: CF - file_format: csv - latent_dim: 50 - batch_size: 4096 - epochs_max: 30 - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - DAE: - model: DAE - file_format: csv - latent_dim: 50 - batch_size: 10 - epochs_max: 200 - hidden_layers: "1024" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - VAE: - model: VAE - file_format: csv - latent_dim: 10 - batch_size: 10 - epochs_max: 200 - hidden_layers: "512" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - KNN: - model: KNN - neighbors: 3 - file_format: csv + - Median: + model: Median # needs to set at least one parameter + - CF: + 
model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 30 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 50 + batch_size: 10 + epochs_max: 200 + hidden_layers: "1024" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 10 + batch_size: 10 + epochs_max: 200 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv NAGuideR_methods: - # - BPCA # > 24h, killed - - COLMEDIAN - # - GSIMP # > 24h, killed - - IMPSEQ - - IMPSEQROB - # - IRM # > 24h, killed - - KNN_IMPUTE - # - LLS # error - # - MICE-CART # > 24h, killed - # - MICE-NORM # > 24h, killed - - MINDET - - MINIMUM - - MINPROB - - MLE - - MSIMPUTE - - MSIMPUTE_MNAR - - PI - - QRILC - # - RF # > 24h, killed - - ROWMEDIAN - # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds - - SVDMETHOD - # - TRKNN # > 24h, killed - - ZERO + # - BPCA # > 24h, killed + - COLMEDIAN + # - GSIMP # > 24h, killed + - IMPSEQ + - IMPSEQROB + # - IRM # > 24h, killed + - KNN_IMPUTE + # - LLS # error + # - MICE-CART # > 24h, killed + # - MICE-NORM # > 24h, killed + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + # - RF # > 24h, killed + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD + # - TRKNN # > 24h, killed + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml b/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml index e633639b7..81a1f8c3c 100755 --- a/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/pep_m.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pep_m_50MNAR/01_0_split_data.yaml # ! 
will be build config_train: runs/mnar_mcar/pep_m_50MNAR/train_{model}.yaml # ! will be build, should say model_key next folder_experiment: runs/mnar_mcar/pep_m_50MNAR @@ -6,72 +6,72 @@ frac_mnar: 0.5 fn_rawfile_metadata: data/dev_datasets/df_intensities_peptides_long/metadata.csv file_format: csv split_data: - FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl - sample_completeness: 0.4 - feat_prevalence: 0.25 - select_N: 50 - index_col: 0 - meta_date_col: Content Creation Date - column_names: null + FN_INTENSITIES: data/dev_datasets/df_intensities_peptides_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + select_N: 50 + index_col: 0 + meta_date_col: Content Creation Date + column_names: null models: - - Median: - model: Median # needs to set at least one parameter - - CF: - model: CF - file_format: csv - latent_dim: 50 - batch_size: 4096 - epochs_max: 30 - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - DAE: - model: DAE - file_format: csv - latent_dim: 75 - batch_size: 25 - patience: 50 - epochs_max: 200 - hidden_layers: "256_128" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - VAE: - model: VAE - file_format: csv - latent_dim: 50 - batch_size: 25 - epochs_max: 200 - hidden_layers: "256" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - KNN: - model: KNN - neighbors: 3 - file_format: csv + - Median: + model: Median # needs to set at least one parameter + - CF: + model: CF + file_format: csv + latent_dim: 50 + batch_size: 4096 + epochs_max: 30 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: + model: DAE + file_format: csv + latent_dim: 75 + batch_size: 25 + patience: 50 + epochs_max: 200 + hidden_layers: "256_128" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: + model: VAE + file_format: csv + latent_dim: 50 + batch_size: 25 + epochs_max: 200 + hidden_layers: "256" + 
sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv NAGuideR_methods: - - BPCA - - COLMEDIAN - # - GSIMP > 24h, killed - - IMPSEQ - - IMPSEQROB - - IRM - - KNN_IMPUTE - - LLS - - MICE-CART - - MICE-NORM - - MINDET - - MINIMUM - - MINPROB - - MLE - - MSIMPUTE - - MSIMPUTE_MNAR - - PI - - QRILC - - RF - - ROWMEDIAN - - SEQKNN - - SVDMETHOD - - TRKNN - - ZERO + - BPCA + - COLMEDIAN + # - GSIMP > 24h, killed + - IMPSEQ + - IMPSEQROB + - IRM + - KNN_IMPUTE + - LLS + - MICE-CART + - MICE-NORM + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF + - ROWMEDIAN + - SEQKNN + - SVDMETHOD + - TRKNN + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml b/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml index 2241305f9..5bdf72f2c 100755 --- a/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/pg_l.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pg_l_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pg_l_50MNAR/train_{model}.yaml # ! 
will be build, should say model_key next folder_experiment: runs/mnar_mcar/pg_l_50MNAR @@ -7,66 +7,66 @@ fn_rawfile_metadata: data/dev_datasets/df_intensities_proteinGroups_long/metadat cuda: False file_format: csv split_data: - FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl - sample_completeness: 0.4 - feat_prevalence: 0.25 - index_col: 0 - meta_date_col: Content Creation Date + FN_INTENSITIES: data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl + sample_completeness: 0.4 + feat_prevalence: 0.25 + index_col: 0 + meta_date_col: Content Creation Date models: - - Median: - model: Median - - CF: # 2min - model: CF - latent_dim: 50 - batch_size: 32768 - epochs_max: 100 - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - DAE: # 2min - model: DAE - latent_dim: 25 - batch_size: 64 - epochs_max: 100 - hidden_layers: "512" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - VAE: # 2min - model: VAE - latent_dim: 25 - batch_size: 64 - epochs_max: 50 - hidden_layers: "512" - sample_idx_position: 0 - cuda: False - save_pred_real_na: True - - KNN: - model: KNN - neighbors: 3 - file_format: csv + - Median: + model: Median + - CF: # 2min + model: CF + latent_dim: 50 + batch_size: 32768 + epochs_max: 100 + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - DAE: # 2min + model: DAE + latent_dim: 25 + batch_size: 64 + epochs_max: 100 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - VAE: # 2min + model: VAE + latent_dim: 25 + batch_size: 64 + epochs_max: 50 + hidden_layers: "512" + sample_idx_position: 0 + cuda: False + save_pred_real_na: True + - KNN: + model: KNN + neighbors: 3 + file_format: csv NAGuideR_methods: - - BPCA #6h41min - - COLMEDIAN - # - GSIMP # stopped after 24h - - IMPSEQ # 1min - - IMPSEQROB - - IRM # 7h52min - - KNN_IMPUTE - - LLS - # - MICE-CART # stopped after 24h - # - MICE-NORM # 
stopped after 24h - - MINDET - - MINIMUM - - MINPROB - - MLE - - MSIMPUTE - - MSIMPUTE_MNAR - - PI - - QRILC - - RF # 58min - - ROWMEDIAN - # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds - - SVDMETHOD # 16min - - TRKNN # 5h38min - - ZERO + - BPCA #6h41min + - COLMEDIAN + # - GSIMP # stopped after 24h + - IMPSEQ # 1min + - IMPSEQROB + - IRM # 7h52min + - KNN_IMPUTE + - LLS + # - MICE-CART # stopped after 24h + # - MICE-NORM # stopped after 24h + - MINDET + - MINIMUM + - MINPROB + - MLE + - MSIMPUTE + - MSIMPUTE_MNAR + - PI + - QRILC + - RF # 58min + - ROWMEDIAN + # - SEQKNN # Error in x[od, ismiss, drop = FALSE]: subscript out of bounds + - SVDMETHOD # 16min + - TRKNN # 5h38min + - ZERO diff --git a/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml b/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml index 40aa30bc3..41410ea0a 100644 --- a/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml +++ b/project/config/single_dev_dataset/mnar_mcar/pg_m.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: runs/mnar_mcar/pg_m_50MNAR/01_0_split_data.yaml # ! will be build config_train: runs/mnar_mcar/pg_m_50MNAR/train_{model}.yaml # ! 
will be build, should say model_key next folder_experiment: runs/mnar_mcar/pg_m_50MNAR diff --git a/project/config/single_dev_dataset/proteinGroups_N50/README.md b/project/config/single_dev_dataset/proteinGroups_N50/README.md new file mode 100644 index 000000000..4605516a0 --- /dev/null +++ b/project/config/single_dev_dataset/proteinGroups_N50/README.md @@ -0,0 +1,23 @@ +# Config files + +## Version 1 imputation workflow + +For [`workflow/Snakefile`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile) + +```bash +config.yaml # main config +split.yaml # split data config referenced in config.yaml +train_CF.yaml # CF train config referenced in config.yaml +train_DAE.yaml # DAE train config referenced in config.yaml +train_KNN.yaml # KNN train config referenced in config.yaml +train_Median.yaml # Median train config referenced in config.yaml +train_VAE.yaml # VAE train config referenced in config.yaml +``` + +## Version 2 imputation workflow + +For [`workflow/Snakefile_v2.smk`](https://github.com/RasmussenLab/pimms/blob/HEAD/project/workflow/Snakefile_v2.smk) only one config file is needed: + +```bash +config_v2.yaml +``` \ No newline at end of file diff --git a/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml b/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml index c7c084b1e..1c2c1a7e3 100644 --- a/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml +++ b/project/config/single_dev_dataset/proteinGroups_N50/config_v2.yaml @@ -1,4 +1,4 @@ -# config for Snakefile_v2 +# config for Snakefile_v2.smk config_split: config/single_dev_dataset/proteinGroups_N50/split.yaml # ! will be build config_train: config/single_dev_dataset/proteinGroups_N50/train_{model}.yaml # ! 
will be build folder_experiment: runs/dev_dataset_small/proteinGroups_N50_Snakefile_v2 diff --git a/project/data/README.md b/project/data/README.md index b509a16c4..b518521a4 100644 --- a/project/data/README.md +++ b/project/data/README.md @@ -1,3 +1,71 @@ # Data Folder -> Put you files here. \ No newline at end of file +> Put your files here. + +## Download development dataset + +The large development data sets can be obtained from PRIDE. An example for the protein +groups level data is provided below and as an executable script. + +### Download large development dataset +Execute the script to download and save the large HeLa protein group data for instrument 6070: + +```bash +python download_dev_dataset.py +``` + +This script contains the following code: + +```python +import io +import zipfile +from pathlib import Path + +import pandas as pd +import requests + +FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233' +FILE = 'pride_metadata.csv' +print(f'Fetch metadata: {FTP_FOLDER}/{FILE}') +meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0) +meta.sample(5, random_state=42).sort_index() +idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index + +FILE = 'geneGroups_aggregated.zip' +print(f"Fetch archive: {FTP_FOLDER}/{FILE}") +r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900) +with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive: + print('available files in archive' '\n - '.join(zip_archive.namelist())) + FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv' + print('\nread file:', FNAME) + with zip_archive.open(FNAME) as f: + df = pd.read_csv(f, index_col=0) + +# dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl +FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long') +FOLDER.mkdir(parents=True, exist_ok=True) +fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv' +df.loc[idx_6070].to_csv(fname) +print(f'saved data to: {fname}') 
+df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl')) +print(f'saved data to: {fname.with_suffix(".pkl")}') +# save metadata: +fname = FOLDER / 'metadata.csv' +meta.loc[idx_6070].to_csv(fname) +print(f'saved metadata to: {fname}') +``` +### Run snakemake workflow + +Then you will be able to run the snakemake workflow for the larger +development dataset: + +```bash +snakemake --configfile config/single_dev_dataset/proteinGroups/config.yaml -c1 -n +``` + +The smaller development data set on the protein groups level is also shipped with this +repository and can be found in the [`dev_datasets/HeLa_6070`](dev_datasets/HeLa_6070/) folder. + +```bash +snakemake -c1 -n +``` diff --git a/project/data/download_dev_dataset.py b/project/data/download_dev_dataset.py new file mode 100644 index 000000000..dec94bdde --- /dev/null +++ b/project/data/download_dev_dataset.py @@ -0,0 +1,42 @@ +"""Download the development dataset of HeLa cells from PRIDE. + +Instrument: Q_Exactive_HF_X_Orbitrap_6070 + +Can be adapted to save all instruments or other datasets. 
+""" +import io +import zipfile +from pathlib import Path + +import pandas as pd +import requests + +FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233' +FILE = 'pride_metadata.csv' +print(f'Fetch metadata: {FTP_FOLDER}/{FILE}') +meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0) +meta.sample(5, random_state=42).sort_index() +idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index + +FILE = 'geneGroups_aggregated.zip' +print(f"Fetch archive: {FTP_FOLDER}/{FILE}") +r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900) +with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive: + print('available files in archive' '\n - '.join(zip_archive.namelist())) + FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv' + print('\nread file:', FNAME) + with zip_archive.open(FNAME) as f: + df = pd.read_csv(f, index_col=0) + +# dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl +FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long') +FOLDER.mkdir(parents=True, exist_ok=True) +fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv' +df.loc[idx_6070].to_csv(fname) +print(f'saved data to: {fname}') +df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl')) +print(f'saved data to: {fname.with_suffix(".pkl")}') +# save metadata: +fname = FOLDER / 'metadata.csv' +meta.loc[idx_6070].to_csv(fname) +print(f'saved metadata to: {fname}') diff --git a/project/workflow/Snakefile b/project/workflow/Snakefile index 9aace9738..0667e4735 100644 --- a/project/workflow/Snakefile +++ b/project/workflow/Snakefile @@ -56,6 +56,8 @@ rule comparison: models=",".join(MODELS), err=f"{{folder_experiment}}/{nb_stem}.e", out=f"{{folder_experiment}}/{nb_stem}.o", + conda: + "envs/pimms.yaml" shell: "papermill {input.nb} {output.nb:q}" " -p fn_rawfile_metadata {params.meta_data:q}" @@ -94,6 +96,8 @@ rule transform_NAGuideR_predictions: folder_experiment="{folder_experiment}", # 
https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#non-file-parameters-for-rules dumps_as_str=lambda wildcards, input: ",".join(input.dumps), + conda: + "envs/pimms.yaml" shell: "papermill {input.nb} {output.nb:q}" " -r folder_experiment {params.folder_experiment:q}" @@ -123,7 +127,7 @@ rule train_NAGuideR_model: # log: # err="{folder_experiment}/01_1_train_NAGuideR_{method}.log", conda: - "vaep" + "envs/trainRmodels.yaml" shell: "papermill {input.nb} {output.nb:q}" " -r train_split {input.train_split:q}" @@ -147,6 +151,8 @@ rule transform_data_to_wide_format: folder_experiment="{folder_experiment}", err=f"{{folder_experiment}}/{nb_stem}.e", out=f"{{folder_experiment}}/{nb_stem}.o", + conda: + "envs/pimms.yaml" shell: "papermill {input.nb} {output.nb:q}" " -r folder_experiment {params.folder_experiment:q}" @@ -174,7 +180,7 @@ rule train_models: # log: # err="{folder_experiment}/01_1_train_{model}.log", conda: - "vaep" + "envs/pimms.yaml" shell: "papermill {input.nb:q} {output.nb:q}" " -f {input.configfile:q}" @@ -203,6 +209,8 @@ rule create_splits: meta_data=config["fn_rawfile_metadata"], err=f"{{folder_experiment}}/{nb_stem}.e", out=f"{{folder_experiment}}/{nb_stem}.o", + conda: + "envs/pimms.yaml" shell: "papermill {input.nb} {output.nb}" " -f {input.configfile:q}" diff --git a/project/workflow/Snakefile_ald_comparison.smk b/project/workflow/Snakefile_ald_comparison.smk index 0b79e9c1b..63713da86 100644 --- a/project/workflow/Snakefile_ald_comparison.smk +++ b/project/workflow/Snakefile_ald_comparison.smk @@ -18,9 +18,7 @@ out_folder = folder_experiment + "/{out_folder}/{target}/" out_folder_two_methods_cp = out_folder + "{baseline}_vs_{model}/" -target_cutoff = dict(kleiner="2") - -target = "kleiner" +target = config["target"] all_methods = [config["baseline"], "None", *config["methods"]] @@ -73,7 +71,8 @@ rule plot_intensities_for_diverging_results: out_folder=config["out_folder"], ), nb=nb, - 
fn_clinical_data="data/ALD_study/processed/ald_metadata_cli.csv", + # replace with config + fn_clinical_data=f"{folder_experiment}/data/clinical_data.csv", output: diff_da=out_folder + "diff_analysis_compare_DA.xlsx", qvalues=out_folder + "qvalues_target.pkl", @@ -106,7 +105,7 @@ rule ml_comparison: nb=nb, pred_base=folder_experiment + "/preds/pred_real_na_{baseline}.csv", pred_model=folder_experiment + "/preds/pred_real_na_{model}.csv", - fn_clinical_data="data/ALD_study/processed/ald_metadata_cli.csv", + fn_clinical_data=f"{folder_experiment}/data/clinical_data.csv", output: sel_feat=out_folder_two_methods_cp + "mrmr_feat_by_model.xlsx", nb=out_folder_two_methods_cp + nb, @@ -157,24 +156,44 @@ rule compare_diff_analysis: ########################################################################################## # Scores for each model (method) -nb = "10_1_ald_diff_analysis.ipynb" +nb_stem = "10_1_ald_diff_analysis" rule differential_analysis: input: - nb=nb, - f_annotations=config["f_annotations"], + nb=f"{nb_stem}.ipynb", + fn_clinical_data=f"{folder_experiment}/data/clinical_data.csv", output: score=out_folder + "scores/diff_analysis_scores_{model}.pkl", - nb=out_folder + "scores/diff_analysis_{model}.ipynb", + nb=out_folder + f"scores/{nb_stem}_{{model}}.ipynb", params: covar=lambda wildcards: config["covar"][wildcards.target], + f_annotations=config["f_annotations"], shell: "papermill {input.nb} {output.nb}" f" -r folder_experiment {folder_experiment}" - " -r f_annotations {input.f_annotations}" + " -r fn_clinical_data {input.fn_clinical_data}" + " -p f_annotations {params.f_annotations}" " -r target {wildcards.target}" " -r covar {params.covar}" " -r model_key {wildcards.model}" " -r out_folder {wildcards.out_folder}" " && jupyter nbconvert --to html {output.nb}" + + +########################################################################################## +# Save clinical metadata to data folder of experimental folder +# Makes it possible to have remote 
clincial data + +rule copy_clinical_data: + output: + local_clincial_data = f"{folder_experiment}/data/clinical_data.csv", + params: + fn_clinical_data = config["fn_clinical_data"], + run: + import pandas as pd + # could be extended for several file-types + df = pd.read_csv(params.fn_clinical_data) + df.to_csv(output.local_clincial_data, index=False) + # , index_col=0) + # usecols=[args.sample_id_col, args.target]) diff --git a/project/workflow/Snakefile_v2 b/project/workflow/Snakefile_v2.smk similarity index 98% rename from project/workflow/Snakefile_v2 rename to project/workflow/Snakefile_v2.smk index bbd2a95b7..1db8774af 100644 --- a/project/workflow/Snakefile_v2 +++ b/project/workflow/Snakefile_v2.smk @@ -215,13 +215,10 @@ rule dump_train_config: f.write("# Build in Snakemake workflow\n") yaml.dump(model_configs[wildcards.model], f, sort_keys=False) - ########################################################################################## -# Create Data splits -# separate workflow by level -> provide custom configs +# Create data splits nb_stem = "01_0_split_data" - rule create_splits: input: nb=f"{nb_stem}.ipynb", @@ -243,7 +240,7 @@ rule create_splits: ########################################################################################## -# create config file dumps for each model +# create data splitting configuration file rule dump_split_config: diff --git a/project/workflow/envs/pimms.yaml b/project/workflow/envs/pimms.yaml new file mode 100644 index 000000000..da6fe3eee --- /dev/null +++ b/project/workflow/envs/pimms.yaml @@ -0,0 +1,52 @@ +# Dev Environment +name: pimms +channels: + - conda-forge + - pytorch + - nvidia + - fastai # fastchan + - bioconda + - plotly + # - defaults +dependencies: + - python>=3.8,<=3.12 + - numpy + - pandas>=1 + - scipy>=1.6 + # plotting + - matplotlib + - python-kaleido + - plotly + - seaborn<0.13 + - pip + # ML + - pytorch #=1.13.1=py3.8_cuda11.7_cudnn8_0 + # - pytorch-cuda + - scikit-learn + - fastai + - torchvision + 
# - cudatoolkit #=11.7 + # - tensorboard + - umap-learn + # stats + - pingouin + - statsmodels + # other + - tqdm # progress bars + - xmltodict # configs + - openpyxl # xml + - omegaconf + - plac>=1.0 + # snakemake + # jupyter + - ipykernel + - ipython + - ipywidgets + - jupyterlab # standalone jupyter installation + # - jupyter_contrib_nbextensions # delete configuration file if you see an error: https://github.com/jupyter/nbconvert/issues/526#issuecomment-277552771 + - jupyter-dash + - papermill # execute ipynb's + - pip: + - git+https://github.com/RasmussenLab/pimms.git@dev + - mrmr-selection + - njab diff --git a/project/workflow/envs/trainRmodels.yaml b/project/workflow/envs/trainRmodels.yaml new file mode 100644 index 000000000..f4f246bbc --- /dev/null +++ b/project/workflow/envs/trainRmodels.yaml @@ -0,0 +1,36 @@ +# Dev Environment +name: trainRmodels +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - papermill # execute ipynb's + - jupyter + # R packages (listed in NAGuideR) + - r-base + - r-devtools # is it needed for source installs on windows server? 
+ - r-irkernel + - r-reshape2 + - r-stringi # + rmarkdown hack for reshape2 + - r-stringr # reshape2 + - r-tidyverse + - r-gdata + - r-glmnet + - r-e1071 + - r-norm + - r-missforest + - r-vim + - r-mice + - r-cluster + - r-mvtnorm + - r-rrcov + - r-gmm + - r-tmvtnorm + - r-igraph + # - bioconductor-biocinstaller + # - r-imputelcmd # bioconda + # - bioconductor-impute + # - bioconductor-pcamethods + # - rrcovNA, GMSimpute + # SeqKnn, pcaMethods, DreamAI # bioconductor diff --git a/pyproject.toml b/pyproject.toml index 74ee4adf0..25cd2ac9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ [project.scripts] pimms-setup-imputation-comparison = "vaep.cmd_interface.setup_imp_cp_website:main" +pimms-add-diff-comp = "vaep.cmd_interface.setup_diff_analysis_website:main" [project.urls] "Bug Tracker" = "https://github.com/RasmussenLab/pimms/issues" diff --git a/snakemake_env.yml b/snakemake_env.yml new file mode 100644 index 000000000..7713b7b18 --- /dev/null +++ b/snakemake_env.yml @@ -0,0 +1,8 @@ +name: snakemake +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - snakemake-minimal + - mamba diff --git a/tests/io/test_dataset.py b/tests/io/test_dataset.py index 5edf5aefa..cea05e853 100644 --- a/tests/io/test_dataset.py +++ b/tests/io/test_dataset.py @@ -10,7 +10,8 @@ from vaep.io.datasets import DatasetWithMaskAndNoTarget, DatasetWithTarget data = np.random.random(size=(10, 5)) -mask = ~(data < 0.1) +threshold = max(0.15, data.min() + 0.02) +mask = ~(data < threshold) data_w_na = np.where(mask, data, np.nan) assert (data != data_w_na).any() diff --git a/vaep/analyzers/analyzers.py b/vaep/analyzers/analyzers.py index 5b8d4398e..7bd8c1e3c 100644 --- a/vaep/analyzers/analyzers.py +++ b/vaep/analyzers/analyzers.py @@ -395,9 +395,10 @@ def get_consecutive_data_indices(df, n_samples): return df.loc[index[start_sample:start_sample + n_samples]] -def corr_lower_triangle(df): - """Compute the correlation matrix, returning only 
unique values.""" - corr_df = df.corr() +def corr_lower_triangle(df, **kwargs): + """Compute the correlation matrix, returning only unique values. + """ + corr_df = df.corr(**kwargs) lower_triangle = pd.DataFrame( np.tril(np.ones(corr_df.shape), -1)).astype(bool) lower_triangle.index, lower_triangle.columns = corr_df.index, corr_df.columns diff --git a/vaep/cmd_interface/setup_diff_analysis_website.py b/vaep/cmd_interface/setup_diff_analysis_website.py new file mode 100644 index 000000000..c2f983847 --- /dev/null +++ b/vaep/cmd_interface/setup_diff_analysis_website.py @@ -0,0 +1,110 @@ +"""Console script to create or append index.rst for static website of differential analysis workflow.""" +import argparse +import textwrap +from collections import defaultdict +from pathlib import Path + + +def split_nb_name(nb: str) -> list: + return nb.split('.')[0].split('_') + + +INDEX_RST = textwrap.dedent("""\ + Differential Analysis Notebooks + ------------------------------- + + Inspect the notebooks associated with the differential analysis workflow. + + .. toctree:: + :maxdepth: 2 + :caption: Differential analysis (ANCOVA) + + {nb_1} + + .. toctree:: + :maxdepth: 2 + :caption: Compare ANCOVAs + + {nb_2} + + .. toctree:: + :maxdepth: 2 + :caption: Compare single differential analysis + + {nb_4} + + .. 
toctree:: + :maxdepth: 2 + :caption: Logistic regression models + + {nb_3} + """) + + +def main(): + parser = argparse.ArgumentParser( + description='Create or append index.rst for static website ' + 'displaying differential analysis notebooks.') + parser.add_argument('--folder', '-f', + type=str, + help='Path to the folder', + required=True) + parser.add_argument('--subfolder_comparision', '-sf_cp', + type=str, + help='Subfolder for comparison', + required=True) + args = parser.parse_args() + + folder_experiment = args.folder + + folder_experiment = Path(folder_experiment) + subfolder_comparison = Path(args.subfolder_comparision) + nbs = [_f.relative_to(folder_experiment) for _f in subfolder_comparison.glob('**/*.ipynb') if _f.is_file()] + nbs + + groups = defaultdict(list) + for nb in nbs: + _group = nb.name.split('_')[1] + groups[_group].append(nb) + groups = dict(groups) + groups + + # Parse notebooks present in imputation workflow + + nb_1 = '' + for nb in groups['1']: + nb_1 += " " * 4 + split_nb_name(nb.name)[-1] + f" <{nb.as_posix()}>\n" + + nb_2 = '' + for nb in groups['2']: + nb_2 += " " * 4 + ' '.join(nb.parent.name.split('_')) + f" <{nb.as_posix()}>\n" + + nb_3 = '' + for nb in groups['3']: + nb_3 += " " * 4 + ' '.join(nb.parent.name.split('_')) + f" <{nb.as_posix()}>\n" + print(nb_3) + + nb_4 = groups['4'][0] + nb_4 = " " * 4 + "Compare single features" + f" <{nb_4.as_posix()}>\n" + + index_rst = INDEX_RST.format(nb_1=nb_1, + nb_2=nb_2, + nb_3=nb_3, + nb_4=nb_4) + # append to index.rst + with open(folder_experiment / 'index.rst', 'a') as f: + f.write(index_rst) + + msg = f"""\ + The index.rst file has been created or extended in {folder_experiment}: + ```bash + {folder_experiment / 'index.rst'} + ``` + """ + + msg = textwrap.dedent(msg) + print(msg) + + +if __name__ == '__main__': + main() diff --git a/vaep/cmd_interface/setup_imp_cp_website.py b/vaep/cmd_interface/setup_imp_cp_website.py index d8c758ddc..0f4c8ebd9 100644 --- 
a/vaep/cmd_interface/setup_imp_cp_website.py +++ b/vaep/cmd_interface/setup_imp_cp_website.py @@ -91,7 +91,7 @@ def split_nb_name(nb: str) -> list: # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. - exclude_patterns = ['_build', 'jupyter_execute', 'diff_analysis', 'figures', + exclude_patterns = ['_build', 'jupyter_execute', 'figures', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- diff --git a/vaep/io/dataloaders.py b/vaep/io/dataloaders.py index b649c15fe..57c373dba 100644 --- a/vaep/io/dataloaders.py +++ b/vaep/io/dataloaders.py @@ -1,73 +1,15 @@ -import pandas -import torch -from typing import Tuple -from torch.utils.data import Dataset -from fastai.data.load import DataLoader -from fastai.data.core import DataLoaders +import pandas +import pandas as pd from fastai.data.all import * +from fastai.data.core import DataLoaders +from fastai.data.load import DataLoader +from torch.utils.data import Dataset from vaep.io import datasets from vaep.io.datasets import DatasetWithTarget from vaep.transform import VaepPipeline -import pandas as pd - - -class DataLoadersCreator(): - """DataLoader creator. For training or evaluation.""" - - def __init__(self, - df_train: pandas.DataFrame, - df_valid: pandas.DataFrame, - scaler, - DataSetClass: torch.utils.data.Dataset, - batch_size: int - ): - """Helper function to create from pandas.DataFrame(s) in memory datasets. - - Parameters - ---------- - df_train : pandas.DataFrame - Training data samples in DataFrames. - df_valid : pandas.DataFrame - Validation data (for training) in DataFrames. - scaler : [type] - A pipeline of transform to apply to the dataset. - DataSetClass : torch.utils.data.Dataset - Type of dataset to use for generating single samples based on - DataFrames. - batch_size : int - Batch size to use. 
- - Returns - ------- - Tuple[torch.utils.data.Dataloader, torch.utils.data.Dataloader] - train and validation set dataloaders. - """ - self.data_train = DataSetClass( - data=scaler.transform(df_train)) - self.data_valid = DataSetClass(data=scaler.transform(df_valid)) - self.scaler = scaler - self.batch_size = batch_size - - def get_dls(self, - shuffle_train: bool = True, - **kwargs) -> Tuple[torch.utils.data.DataLoader, - torch.utils.data.DataLoader]: - self.shuffle_train = shuffle_train - dl_train = DataLoader( - dataset=self.data_train, - batch_size=self.batch_size, shuffle=shuffle_train, **kwargs) - - dl_valid = DataLoader( - dataset=self.data_valid, - batch_size=self.batch_size, shuffle=False, **kwargs) - return dl_train, dl_valid - - def __repr__(self): - return f"{self.__class__.__name__} for creating dataloaders with {self.batch_size}." - def get_dls(train_X: pandas.DataFrame, valid_X: pandas.DataFrame, @@ -124,7 +66,11 @@ def get_dls(train_X: pandas.DataFrame, valid_ds = datasets.DatasetWithTarget(df=pd.DataFrame()) # ! 
Need for script exection (as plain python file) # https://pytorch.org/docs/stable/notes/windows.html#multiprocessing-error-without-if-clause-protection - return DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=False, + drop_last = False + if (len(train_X) % bs) == 1: + # Batch-Normalization does not work with batches of size one + drop_last = True + return DataLoaders.from_dsets(train_ds, valid_ds, bs=bs, drop_last=drop_last, num_workers=num_workers) diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 68fc89fc6..3be35408b 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -23,6 +23,10 @@ NUMPY_ONE = np.int64(1) +__all__ = ['ae', 'analysis', 'collab', 'vae', 'plot_loss', 'plot_training_losses', + 'calc_net_weight_count', 'RecorderDump', 'split_prediction_by_mask', + 'compare_indices', 'collect_metrics', 'calculte_metrics', + 'Metrics', 'get_df_from_nested_dict'] def plot_loss(recorder: learner.Recorder, @@ -312,7 +316,8 @@ def __repr__(self): def get_df_from_nested_dict(nested_dict, - column_levels=('data_split', 'model', 'metric_name'), + column_levels=( + 'data_split', 'model', 'metric_name'), row_name='subset'): metrics = {} for k, run_metrics in nested_dict.items(): diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py index ffaa60b17..97520bb02 100644 --- a/vaep/pandas/__init__.py +++ b/vaep/pandas/__init__.py @@ -285,7 +285,7 @@ def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series: return ret -def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None): +def get_counts_per_bin(df: pd.DataFrame, bins: range, columns: Optional[List[str]] = None) -> pd.DataFrame: """Return counts per bin for selected columns in DataFrame.""" counts_per_bin = dict() if columns is None: diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py index 6d4accf8e..17cc86ced 100644 --- a/vaep/plotting/__init__.py +++ b/vaep/plotting/__init__.py @@ -128,6 +128,9 @@ def 
make_large_descriptors(size='xx-large'): }) + + +set_font_sizes = make_large_descriptors + + def add_prop_as_second_yaxis(ax: matplotlib.axes.Axes, n_samples: int, format_str: str = '{x:,.3f}') -> matplotlib.axes.Axes: """Add proportion as second axis. Try to align cleverly @@ -327,3 +330,39 @@ def plot_cutoffs(df: pd.DataFrame, if min_feat_in_sample is not None: ax.axhline(min_feat_in_sample) return fig, axes + + +def only_every_x_ticks(ax, x=2, axis=None): + """Keep every x-th tick on both axes (axis=None), the x-axis (0) or the y-axis (1).""" + if axis is None: + ax.set_xticks(ax.get_xticks()[::x]) + ax.set_yticks(ax.get_yticks()[::x]) + else: + if axis == 0: + ax.set_xticks(ax.get_xticks()[::x]) + elif axis == 1: + ax.set_yticks(ax.get_yticks()[::x]) + else: + raise ValueError(f'axis must be 0 or 1, got {axis}') + return ax + + +def use_first_n_chars_in_labels(ax, x=2): + """Truncate the tick labels of both axes to their first x characters.""" + # x-axis + _new_labels = [_l.get_text()[:x] + for _l in ax.get_xticklabels()] + _ = ax.set_xticklabels(_new_labels) + # y-axis + _new_labels = [_l.get_text()[:x] for _l in ax.get_yticklabels()] + _ = ax.set_yticklabels(_new_labels) + return ax + + +def split_xticklabels(ax, PG_SEPARATOR=';'): + """Keep only the part of each x-tick label before the first PG_SEPARATOR (no-op if it is None).""" + if PG_SEPARATOR is not None: + _new_labels = [_l.get_text().split(PG_SEPARATOR)[0] + for _l in ax.get_xticklabels()] + _ = ax.set_xticklabels(_new_labels) + return ax diff --git a/vaep/plotting/data.py b/vaep/plotting/data.py index 14ff90430..1051a1a43 100644 --- a/vaep/plotting/data.py +++ b/vaep/plotting/data.py @@ -43,10 +43,12 @@ def get_min_max_iterable(series: Iterable[pd.Series]) -> Tuple[int]: def plot_histogram_intensities(s: pd.Series, interval_bins=1, - min_max=(15, 40), + min_max: Tuple[int] = None, ax=None, **kwargs) -> Tuple[Axes, range]: """Plot intensities in Series in a certain range and equally spaced intervals.""" + if min_max is None: + min_max = get_min_max_iterable([s]) min_bin, max_bin = min_max
bins = range(min_bin, max_bin, interval_bins) ax = s.plot.hist(bins=bins, xticks=list(bins),