diff --git a/vaep/analyzers/__init__.py b/vaep/analyzers/__init__.py
index 952822b09..9856dafb9 100644
--- a/vaep/analyzers/__init__.py
+++ b/vaep/analyzers/__init__.py
@@ -2,8 +2,7 @@
 """
 from types import SimpleNamespace
 
-from . import diff_analysis
-from . import compare_predictions
+from . import compare_predictions, diff_analysis
 
 __all__ = ['diff_analysis', 'compare_predictions', 'Analysis']
diff --git a/vaep/analyzers/analyzers.py b/vaep/analyzers/analyzers.py
index c807c58af..5b8d4398e 100644
--- a/vaep/analyzers/analyzers.py
+++ b/vaep/analyzers/analyzers.py
@@ -207,7 +207,7 @@ def to_wide_format(
             self.is_wide_format = True
             return self.df
         self._df_wide = df_wide
-        print(f"Set attribute: df_wide")
+        print("Set attribute: df_wide")
         return df_wide
 
     def describe_peptides(self, sample_n: int = None):
@@ -264,7 +264,6 @@ def plot_pca(self,):
             raise AttributeError('No metadata available, please set "df_meta" first.')
 
         PCs = self.get_PCA()
-        cols = list(PCs.columns)
 
         fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(
             15, 20), constrained_layout=True)
@@ -332,7 +331,7 @@ def get_dectection_limit(self):
 
     def __repr__(self):
         keys = sorted(self.__dict__)
-        items = ("{}".format(k, self.__dict__[k]) for k in keys)
+        items = ("{}".format(k) for k in keys)
         return "{} with attributes: {}".format(type(self).__name__, ", ".join(items))
 
     # def __dir__(self):
@@ -438,7 +437,7 @@ def plot_date_map(df, ax,
     ax.set_ylabel(cols[1])
     path_collection = scatter_plot_w_dates(
         ax, df, dates=dates, errors='raise')
-    cbar = add_date_colorbar(path_collection, ax=ax)
+    _ = add_date_colorbar(path_collection, ax=ax)
 
 
 def plot_scatter(df, ax,
diff --git a/vaep/analyzers/compare_predictions.py b/vaep/analyzers/compare_predictions.py
index 4ba0544f0..e73a44d48 100644
--- a/vaep/analyzers/compare_predictions.py
+++ b/vaep/analyzers/compare_predictions.py
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
 from pathlib import Path
+from typing import List
 
 import pandas as pd
-from typing import List
 
 
 def load_predictions(pred_files: List, shared_columns=['observed']):
@@ -42,7 +42,7 @@ def load_split_prediction_by_modelkey(experiment_folder: Path,
     allow_missing : bool, optional
         Ignore missing pred files of requested model, default False
     shared_columns : List, optional
-        List of columns that are shared between all models, by default ['observed']
+        List of columns that are shared between all models, by default None
 
     Returns
     -------
@@ -60,8 +60,8 @@
         else:
            raise FileNotFoundError(f'{file} does not exist')
    if to_remove:
-        for file in to_remove:
-            pred_files.remove(to_remove)
+        for file in to_remove:
+            pred_files.remove(file)
 
    return load_predictions(pred_files, shared_columns=shared_columns)
diff --git a/vaep/databases/diseases.py b/vaep/databases/diseases.py
index f4800f76f..5c1307792 100644
--- a/vaep/databases/diseases.py
+++ b/vaep/databases/diseases.py
@@ -6,7 +6,6 @@ def get_disease_association(doid: int, limit: int = 1000):
     params = {'type1': -26,
-              'type2': 'value2',
               'id1': f'DOID:{doid}',
               'type2': 9606,
               'limit': limit,
diff --git a/vaep/io/__init__.py b/vaep/io/__init__.py
index e65b0efc5..33613f332 100644
--- a/vaep/io/__init__.py
+++ b/vaep/io/__init__.py
@@ -3,7 +3,7 @@
 import pickle
 from collections import namedtuple
 from pathlib import Path, PurePath, PurePosixPath
-from typing import List, Tuple, Union
+from typing import Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -54,8 +54,7 @@ def search_subfolders(path='.', depth: int = 1, exclude_root: bool = False):
     def get_subfolders(path):
         return [x for x in path.iterdir()
-                if x.is_dir() and
-                not any(x.match(excl) for excl in EXCLUDED)
+                if x.is_dir() and not any(x.match(excl) for excl in EXCLUDED)
                 ]
 
     directories_previous = directories.copy()
@@ -81,7 +80,7 @@ def resolve_path(path: Union[str, Path], to: Union[str, Path] = '.') -> Path:
     return Path('/'.join(ret))
 
 
-def get_fname_from_keys(keys, folder=Path('.'), file_ext='.pkl', remove_duplicates=True):
+def get_fname_from_keys(keys, folder='.', file_ext='.pkl', remove_duplicates=True):
     if remove_duplicates:
         # https://stackoverflow.com/a/53657523/9684872
         keys = list(dict.fromkeys(keys))
@@ -150,8 +149,8 @@ def load_json(fname: Union[str, Path]) -> dict:
 
 
 def parse_dict(input_dict: dict,
-               types: List[Tuple] = [(PurePath, lambda p: str(PurePosixPath(p))),
-                                     (np.ndarray, lambda a: a.to_list())]):
+               types: Tuple[Tuple, ...] = ((PurePath, lambda p: str(PurePosixPath(p))),
+                                           (np.ndarray, lambda a: a.tolist()))):
     """Transform a set of items (instances) to their string representation"""
     d = dict()
     for k, v in input_dict.items():
diff --git a/vaep/io/datasplits.py b/vaep/io/datasplits.py
index daae63bc2..25be59183 100644
--- a/vaep/io/datasplits.py
+++ b/vaep/io/datasplits.py
@@ -1,15 +1,15 @@
 from __future__ import annotations
-from dataclasses import dataclass, field
+
 import logging
+from dataclasses import dataclass, field
 from functools import partial
 from pathlib import Path
-from typing import Protocol
+from typing import Union
 
 import pandas as pd
-from pandas.core.algorithms import isin
 
+from vaep.io.format import class_full_module, classname
 from vaep.pandas import interpolate
-from vaep.io.format import classname, class_full_module
 
 logger = logging.getLogger(__name__)
diff --git a/vaep/model.py b/vaep/model.py
index 74e947643..07ff775d7 100644
--- a/vaep/model.py
+++ b/vaep/model.py
@@ -1,14 +1,9 @@
 import logging
+
 import numpy as np
 import pandas as pd
-
 import torch
 import torch.utils.data
-from torch.utils.data import Dataset
-from torch import nn
-from torch.nn import functional as F
-
-import fastai.collab as _fastai
 
 logger = logging.getLogger(__name__)
@@ -61,99 +56,3 @@ def get_latent_space(model_method_call: callable,
                      columns=[f'latent dimension {i+1}' for i in range(M)])
     return latent_space
-
-
-# # Defining the model manuelly
-
-# import torch.nn as nn
-# d = 3
-
-# n_features= 10
-
-# class VAE(nn.Module):
-#     def __init__(self, d_input=n_features, d=d):
-#         super().__init__()
-
-#         self.d_input = d_input
-#         self.d_hidden = d
-
-#         self.encoder = nn.Sequential(
-#             nn.Linear(d_input, d ** 2),
-#             nn.ReLU(),
-#             nn.Linear(d ** 2, d * 2)
-#         )
-
-#         self.decoder = nn.Sequential(
-#             nn.Linear(d, d ** 2),
-#             nn.ReLU(),
-#             nn.Linear(d ** 2, self.d_input),
-#             nn.Sigmoid(),
-#         )
-
-#     def reparameterise(self, mu, logvar):
-#         if self.training:
-#             std = logvar.mul(0.5).exp_()
-#             eps = std.data.new(std.size()).normal_()
-#             return eps.mul(std).add_(mu)
-#         else:
-#             return mu
-
-#     def forward(self, x):
-#         mu_logvar = self.encoder(x.view(-1, self.d_input)).view(-1, 2, d)
-#         mu = mu_logvar[:, 0, :]
-#         logvar = mu_logvar[:, 1, :]
-#         z = self.reparameterise(mu, logvar)
-#         return self.decoder(z), mu, logvar
-
-# model = VAE().double().to(device)
-# model
-
-# # Training and testing the VAE
-
-# def loss_function(recon_batch, batch, mu, logvar, beta=1):
-#     BCE = nn.functional.binary_cross_entropy(
-#         recon_batch, batch, reduction='sum'
-#     )
-#     KLD = 0.5 * torch.sum(logvar.exp() - logvar - 1 + mu.pow(2))
-
-#     return BCE + beta * KLD
-
-# epochs = 10
-# codes = 
dict(μ=list(), logσ2=list()) -# for epoch in range(0, epochs + 1): -# # Training -# if epoch > 0: # test untrained net first -# model.train() -# train_loss = 0 -# for x in dl_train: -# x = x.to(device) -# # ===================forward===================== -# x_hat, mu, logvar = model(x) -# loss = loss_function(x_hat, x, mu, logvar) -# train_loss += loss.item() -# # ===================backward==================== -# optimizer.zero_grad() -# loss.backward() -# optimizer.step() -# # ===================log======================== -# print(f'====> Epoch: {epoch} Average loss: {train_loss / len(dl_train.dataset):.4f}') - -# # Testing - -# means, logvars = list(), list() -# with torch.no_grad(): -# model.eval() -# test_loss = 0 -# for x in dl_valid: -# x = x.to(device) -# # ===================forward===================== -# x_hat, mu, logvar = model(x) -# test_loss += loss_function(x_hat, x, mu, logvar).item() -# # =====================log======================= -# means.append(mu.detach()) -# logvars.append(logvar.detach()) -# # ===================log======================== -# codes['μ'].append(torch.cat(means)) -# codes['logσ2'].append(torch.cat(logvars)) -# test_loss /= len(dl_valid.dataset) -# print(f'====> Test set loss: {test_loss:.4f}') diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 48577ac04..68fc89fc6 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -1,33 +1,33 @@ -from functools import reduce +import json import logging -from operator import mul -from pathlib import Path import pickle import pprint -from typing import Tuple, List, Callable, Union -import json +from functools import reduce +from operator import mul +from pathlib import Path +from typing import Callable, List, Tuple import matplotlib.pyplot as plt import numpy as np import pandas as pd +import sklearn.metrics as sklm import torch -from fastcore.foundation import L from fastai import learner -import sklearn.metrics as sklm - -from . import ae -from . import analysis -from . import collab -from . import vae +from fastcore.foundation import L import vaep +from . 
import ae, analysis, collab, vae + logger = logging.getLogger(__name__) +NUMPY_ONE = np.int64(1) + + def plot_loss(recorder: learner.Recorder, - norm_train: np.int64 = np.int64(1), - norm_val: np.int64 = np.int64(1), + norm_train: np.int64 = NUMPY_ONE, + norm_val: np.int64 = NUMPY_ONE, skip_start: int = 5, with_valid: bool = True, ax: plt.Axes = None) -> plt.Axes: @@ -66,11 +66,13 @@ def plot_loss(recorder: learner.Recorder, return ax +NORM_ONES = np.array([1, 1], dtype='int') + + def plot_training_losses(learner: learner.Learner, name: str, ax=None, - save_recorder: bool = True, - norm_factors=np.array([1, 1], dtype='int'), + norm_factors=NORM_ONES, folder='figures', figsize=(15, 8)): if ax is None: @@ -111,7 +113,7 @@ def __init__(self, recorder, name): self.iters = recorder.iters self.name = name - def save(self, folder=Path('.')): + def save(self, folder='.'): with open(Path(folder) / self.filename_tmp.format(self.name), 'wb') as f: pickle.dump(self, f) @@ -310,7 +312,7 @@ def __repr__(self): def get_df_from_nested_dict(nested_dict, - column_levels=['data_split', 'model', 'metric_name'], + column_levels=('data_split', 'model', 'metric_name'), row_name='subset'): metrics = {} for k, run_metrics in nested_dict.items(): diff --git a/vaep/models/collab.py b/vaep/models/collab.py index 368b6a14b..ec34a4ffb 100644 --- a/vaep/models/collab.py +++ b/vaep/models/collab.py @@ -3,16 +3,16 @@ from typing import Tuple import pandas as pd - -from fastai.tabular.all import * -from fastai.collab import * # import explicit objects for functional annotations -from fastai.collab import (CollabDataLoaders, IndexSplitter, TabularCollab, Categorify, TransformBlock) +from fastai.collab import * +from fastai.collab import (Categorify, CollabDataLoaders, IndexSplitter, + TabularCollab, TransformBlock) +from fastai.tabular.all import * -from . import analysis -import vaep.io.datasplits import vaep.io.dataloaders +import vaep.io.datasplits +from . 
import analysis logger = logging.getLogger(__name__) @@ -45,11 +45,11 @@ class CollabAnalysis(analysis.ModelAnalysis): def __init__(self, datasplits: vaep.io.datasplits.DataSplits, - sample_column='Sample ID', - item_column='peptide', - target_column='intensity', - model_kwargs=dict(), - batch_size=64): + sample_column: str = 'Sample ID', + item_column: str = 'peptide', + target_column: str = 'intensity', + model_kwargs: dict = None, + batch_size: int = 64): if datasplits.val_y is not None: self.X, self.frac = combine_data(datasplits.train_X, datasplits.val_y) @@ -81,6 +81,8 @@ def __init__(self, splits=splits) self.dls = to.dataloaders(path='.', bs=self.batch_size) self.params = {} + if model_kwargs is None: + model_kwargs = {} self.model_kwargs = model_kwargs self.params['model_kwargs'] = self.model_kwargs diff --git a/vaep/models/vae.py b/vaep/models/vae.py index 6395773e7..d56704f13 100644 --- a/vaep/models/vae.py +++ b/vaep/models/vae.py @@ -6,12 +6,12 @@ - loss is adapted to Dataset and FastAI adaptions - batchnorm1D for now (not weight norm) """ +import math from typing import List import torch -import math -from torch import nn import torch.nn.functional as F +from torch import nn leaky_relu_default = nn.LeakyReLU(.1) @@ -108,8 +108,10 @@ def loss_fct(pred, y, reduction='sum', results: List = None, freebits=0.1): batch = y l_rec = -torch.sum(gaussian_log_prob(batch, x_mu, x_logvar)) - l_reg = torch.sum(F.relu(compute_kld(z_mu, z_logvar) - - freebits * math.log(2)) + freebits * math.log(2), 1) + l_reg = torch.sum((F.relu(compute_kld(z_mu, z_logvar) + - freebits * math.log(2)) + + freebits * math.log(2)), + 1) if results is not None: results.append((l_rec.item(), torch.mean(l_reg).item())) diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py index 5053a7d66..6d4accf8e 100644 --- a/vaep/plotting/__init__.py +++ b/vaep/plotting/__init__.py @@ -1,21 +1,21 @@ from __future__ import annotations -from functools import partial -import numpy as np -import pandas as pd -import matplotlib import logging import pathlib +from functools import partial + +import matplotlib import matplotlib.pyplot as plt +import numpy as np +import pandas as pd import seaborn import vaep.pandas +from . import data, defaults, errors, plotly from .errors import plot_rolling_error -from . import errors -from . import data -from . import plotly -from . defaults import order_categories, labels_dict, IDX_ORDER + +# from . defaults import order_categories, labels_dict, IDX_ORDER seaborn.set_style("whitegrid") # seaborn.set_theme() @@ -31,6 +31,7 @@ __all__ = ['plotly', 'data', + 'defaults', 'errors', 'plot_rolling_error', # define in this file diff --git a/vaep/plotting/data.py b/vaep/plotting/data.py index edc284529..14ff90430 100644 --- a/vaep/plotting/data.py +++ b/vaep/plotting/data.py @@ -1,6 +1,6 @@ """Plot data distribution based on pandas `DataFrames` or `Series`.""" import logging -from typing import Iterable, Tuple, Union +from typing import Iterable, Optional, Tuple, Union import matplotlib import matplotlib.pyplot as plt @@ -60,8 +60,8 @@ def plot_observations(df: pd.DataFrame, title: str = '', axis: int = 1, size: int = 1, - ylabel: str = 'number of features', - xlabel: str = 'Samples ordered by number of features') -> Axes: + ylabel: str = 'Frequency', + xlabel: Optional[str] = None) -> Axes: """Plot non missing observations by row (axis=1) or column (axis=0) in order of number of available observations. No binning is applied, only counts of non-missing values are plotted. 
@@ -86,6 +86,12 @@ def plot_observations(df: pd.DataFrame, Axes Axes on which plot was plotted """ + if xlabel is None: + if df.columns.name: + xlabel = f'Samples ordered by identified {df.columns.name}' + else: + xlabel = 'Samples ordered by identified features' + ax = (df .notna() .sum(axis=axis) diff --git a/vaep/plotting/defaults.py b/vaep/plotting/defaults.py index f4a470abe..d168ae627 100644 --- a/vaep/plotting/defaults.py +++ b/vaep/plotting/defaults.py @@ -1,4 +1,5 @@ import logging + import matplotlib as mpl import seaborn as sns @@ -87,23 +88,3 @@ def _repr_html_(self): 'metric_name': 'metric', } -order_categories = {'data level': ['proteinGroups', 'aggPeptides', 'evidence'], - 'model': ['median', 'interpolated', 'CF', 'DAE', 'VAE']} - -IDX_ORDER = (['proteinGroups', 'aggPeptides', 'evidence'], - ['median', 'interpolated', 'CF', 'DAE', 'VAE']) - - -ORDER_MODELS = ['RSN', 'median', 'interpolated', - 'CF', 'DAE', 'VAE', - ] - -l_colors_to_use_hex = ['#937860', # seaborn brown - '#4c72b0', # seaborn blue - '#dd8452', # seaborn organe - '#55a868', # seaborn green - '#c44e52', # seaborn red - '#8172b3', # seaborn violete/lila - ] - -d_colors_to_use_hex = {k: v for k, v in zip(ORDER_MODELS, l_colors_to_use_hex)} diff --git a/vaep/plotting/plotly.py b/vaep/plotting/plotly.py index cd8439ad8..8cac92b78 100644 --- a/vaep/plotting/plotly.py +++ b/vaep/plotting/plotly.py @@ -1,52 +1,3 @@ -from functools import partial, update_wrapper -import plotly.express as px - -# set some defaults -from .defaults import labels_dict, order_categories - -TEMPLATE = 'none' - - -figure_size_defaults = dict(width=1600, - height=700, - template=TEMPLATE) - -scatter_defaults = dict(x='data_split', - y='metric_value', - color="model", - facet_row="metric_name", - facet_col="subset_w_N", - hover_data=['n_hidden_layers', - 'hidden_layers', - 'batch_size', - 'n_params'], - ) - - -scatter = partial(px.scatter, - **scatter_defaults, - **figure_size_defaults, - labels_dict=labels_dict, - category_orders=order_categories) -update_wrapper(scatter, px.scatter) - - -bar = partial(px.bar, - x='model', - y='metric_value', - color='data level', - barmode="group", - text='text', - category_orders=order_categories, - height=600, - template=TEMPLATE) -update_wrapper(bar, px.bar) - - -line = partial(px.line, - **figure_size_defaults, - ) -update_wrapper(line, px.line) def apply_default_layout(fig): diff --git a/vaep/sklearn/cf_transformer.py b/vaep/sklearn/cf_transformer.py index 7231a594c..ac152a92b 100644 --- a/vaep/sklearn/cf_transformer.py +++ b/vaep/sklearn/cf_transformer.py @@ -1,36 +1,29 @@ """Scikit-learn style interface for Collaborative Filtering model.""" from __future__ import annotations -from fastai.collab import CollabDataLoaders from pathlib import Path -from fastai.torch_core import default_device -from fastai.losses import MSELossFlat -from fastai.data.transforms import IndexSplitter -from fastai.data.block import TransformBlock +import matplotlib.pyplot as plt +import pandas as pd +from fastai import learner from fastai.callback.tracker import EarlyStoppingCallback -from fastai.tabular.core import Categorify -from fastai.tabular.all import * -from fastai.collab import EmbeddingDotBias -from fastai.collab import TabularCollab from fastai.collab import * - +from fastai.collab import CollabDataLoaders, EmbeddingDotBias, TabularCollab +from fastai.data.block import TransformBlock +from fastai.data.transforms import IndexSplitter from fastai.learner import Learner -from fastai import learner - -import 
matplotlib.pyplot as plt -import pandas as pd - -from sklearn.utils.validation import check_is_fitted +from fastai.losses import MSELossFlat +from fastai.tabular.all import * +from fastai.tabular.core import Categorify +from fastai.torch_core import default_device from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted import vaep -from vaep.models import collab import vaep.models as models - - # patch plotting function -from vaep.models import plot_loss +from vaep.models import collab, plot_loss + learner.Recorder.plot_loss = plot_loss @@ -187,6 +180,6 @@ def plot_loss(self, y, figsize=(8, 4)): # -> Axes: vaep.savefig(fig, name='collab_training', folder=self.out_folder) self.model_kwargs['batch_size'] = self.batch_size - vaep.io.dump_json(self.model_kwargs, self.out_folder / - 'model_params_{}.json'.format('CF')) + vaep.io.dump_json(self.model_kwargs, + self.out_folder / 'model_params_{}.json'.format('CF')) return ax diff --git a/vaep/transform.py b/vaep/transform.py index b0806297f..947e678f7 100644 --- a/vaep/transform.py +++ b/vaep/transform.py @@ -1,15 +1,11 @@ import logging from typing import List -import numpy as np import pandas as pd import sklearn +import sklearn.pipeline import torch from sklearn import preprocessing -from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler - -from vaep.io.datasets import to_tensor logger = logging.getLogger(__name__) @@ -65,11 +61,11 @@ def inverse_transform(self, X, **kwargs): if isinstance(X, pd.DataFrame): return pd.DataFrame(res, columns=X.columns, index=X.index) return res -# could become factory function, build args dictionary def make_pandas_compatible(cls): """Patch transform and inverse_transform.""" + # ? could become factory function, build args dictionary _fcts = ['transform', 'inverse_transform'] for _fct in _fcts: if not hasattr(cls, _fct):
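
Reviewer notes (not part of the patch):

1. The reformatted `loss_fct` hunk in `vaep/models/vae.py` implements a free-bits floor on the KL term: elementwise, `F.relu(kld - c) + c` equals `max(kld, c)` with `c = freebits * math.log(2)` (the floor stated in bits, converted to nats), so each latent dimension always contributes at least `c` nats and stops receiving gradient once its KL falls below the floor. A minimal check of that identity, assuming `compute_kld` returns per-dimension KL terms of shape (batch, latent_dim):

    import math

    import torch
    import torch.nn.functional as F

    c = 0.1 * math.log(2)              # free-bits floor: 0.1 bit, in nats
    kld = torch.tensor([[0.02, 0.5]])  # stand-in per-dimension KL values
    l_reg = torch.sum(F.relu(kld - c) + c, 1)
    # elementwise relu(x - c) + c is exactly a clamp at the floor c
    assert torch.allclose(l_reg, torch.clamp(kld, min=c).sum(1))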
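2. The final hunk in `vaep/transform.py` is cut off inside `make_pandas_compatible`, so for orientation here is a self-contained sketch of the patching pattern its docstring names (wrap `transform`/`inverse_transform` so DataFrame index and columns survive the round trip, as the `inverse_transform` override earlier in the file does by hand). The subclass-via-`type` construction and the name `PandasStandardScaler` are illustrative assumptions, not the repository's actual implementation:

    import pandas as pd
    from sklearn import preprocessing

    def make_pandas_compatible_sketch(cls):
        """Return a subclass of cls whose transform/inverse_transform
        re-wrap array output as a DataFrame when the input was one."""
        def wrap(fct_name):
            original = getattr(cls, fct_name)
            def wrapped(self, X, **kwargs):
                res = original(self, X, **kwargs)
                if isinstance(X, pd.DataFrame):
                    # keep the sample index and feature columns of the input
                    return pd.DataFrame(res, index=X.index, columns=X.columns)
                return res
            return wrapped
        attrs = {name: wrap(name)
                 for name in ('transform', 'inverse_transform')
                 if hasattr(cls, name)}
        return type(f'Pandas{cls.__name__}', (cls,), attrs)

    # usage: a StandardScaler that round-trips DataFrames
    PandasStandardScaler = make_pandas_compatible_sketch(preprocessing.StandardScaler)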