Skip to content

Commit

Permalink
Merge pull request #61 from RasmussenLab/main_fig_update
Browse files Browse the repository at this point in the history
Main fig update
  • Loading branch information
Henry Webel authored Apr 24, 2024
2 parents b7e2400 + f983259 commit 80a62d7
Show file tree
Hide file tree
Showing 20 changed files with 701 additions and 278 deletions.
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- matplotlib
- python-kaleido
- plotly
- seaborn
- seaborn<0.13
- pip
# ML
- pytorch=1 #=1.13.1=py3.8_cuda11.7_cudnn8_0
Expand Down
127 changes: 99 additions & 28 deletions project/01_0_split_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,24 @@
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"from functools import partial\n",
"from pathlib import Path\n",
"import logging\n",
"from typing import Union, List\n",
"from typing import List, Union\n",
"\n",
"from IPython.display import display\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"import plotly.express as px\n",
"from IPython.display import display\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import vaep\n",
"import vaep.io.load\n",
"from vaep.analyzers import analyzers\n",
"from vaep.io.datasplits import DataSplits\n",
"from vaep.sampling import feature_frequency\n",
"\n",
"from vaep.analyzers import analyzers\n",
"from vaep.sklearn import get_PCA\n",
"import vaep.io.load\n",
"\n",
"logger = vaep.logging.setup_nb_logger()\n",
"logger.info(\"Split data and make diagnostic plots\")\n",
Expand All @@ -52,7 +51,7 @@
"\n",
"\n",
"pd.options.display.max_columns = 32\n",
"plt.rcParams['figure.figsize'] = [3, 2]\n",
"plt.rcParams['figure.figsize'] = [4, 2]\n",
"\n",
"vaep.plotting.make_large_descriptors(7)\n",
"\n",
Expand Down Expand Up @@ -82,6 +81,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2,
"tags": [
"parameters"
]
Expand All @@ -108,7 +108,8 @@
"# train, validation and test data splits\n",
"frac_non_train: float = 0.1 # fraction of non training data (validation and test split)\n",
"frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random\n",
"prop_sample_w_sim: float = 1.0 # proportion of samples with simulated missing values"
"prop_sample_w_sim: float = 1.0 # proportion of samples with simulated missing values\n",
"feat_name_display: str = None # display name for feature name (e.g. 'protein group')"
]
},
{
Expand All @@ -124,9 +125,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"args = vaep.nb.args_from_dict(args)\n",
Expand Down Expand Up @@ -195,6 +194,11 @@
" )\n",
"if args.column_names:\n",
" df.columns.names = args.column_names\n",
"if args.feat_name_display is None:\n",
" args.overwrite_entry('feat_name_display', 'features')\n",
" if args.column_names:\n",
" args.overwrite_entry('feat_name_display', args.column_names[0])\n",
"\n",
"\n",
"if not df.index.name:\n",
" logger.warning(\"No sample index name found, setting to 'Sample ID'\")\n",
Expand All @@ -221,7 +225,7 @@
" .plot\n",
" .box()\n",
" )\n",
"ax.set_ylabel('number of observation across samples')"
"ax.set_ylabel('Frequency')"
]
},
{
Expand Down Expand Up @@ -557,7 +561,7 @@
"source": [
"group = 1\n",
"ax = df.notna().sum(axis=1).hist()\n",
"ax.set_xlabel('features per eligable sample')\n",
"ax.set_xlabel(f'{args.feat_name_display.capitalize()} per eligable sample')\n",
"ax.set_ylabel('observations')\n",
"fname = args.out_figures / f'0_{group}_hist_features_per_sample'\n",
"figures[fname.stem] = fname\n",
Expand All @@ -576,7 +580,7 @@
"_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]\n",
"_ = ax.set_xticklabels(_new_labels, rotation=45,\n",
" horizontalalignment='right')\n",
"ax.set_xlabel('feature prevalence')\n",
"ax.set_xlabel(f'{args.feat_name_display.capitalize()} prevalence')\n",
"ax.set_ylabel('observations')\n",
"fname = args.out_figures / f'0_{group}_feature_prevalence'\n",
"figures[fname.stem] = fname\n",
Expand All @@ -599,8 +603,9 @@
"min_max = vaep.plotting.data.min_max(df.stack())\n",
"ax, bins = vaep.plotting.data.plot_histogram_intensities(\n",
" df.stack(), min_max=min_max)\n",
"\n",
"ax.set_xlabel('Intensity binned')\n",
"fname = args.out_figures / f'0_{group}_intensity_distribution_overall'\n",
"\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
]
Expand All @@ -614,6 +619,9 @@
"ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='scatter')\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter'\n",
"ax.set_xlabel(\n",
" f'{args.feat_name_display.capitalize()} binned by their median intensity'\n",
" f' (N {args.feat_name_display})')\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
]
Expand All @@ -624,11 +632,17 @@
"metadata": {},
"outputs": [],
"source": [
"ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='boxplot')\n",
"ax, _data_feat_median_over_prop_missing = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='boxplot', return_plot_data=True)\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot'\n",
"ax.set_xlabel(\n",
" f'{args.feat_name_display.capitalize()} binned by their median intensity'\n",
" f' (N {args.feat_name_display})')\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
"vaep.savefig(ax.get_figure(), fname)\n",
"_data_feat_median_over_prop_missing.to_csv(fname.with_suffix('.csv'))\n",
"# _data_feat_median_over_prop_missing.to_excel(fname.with_suffix('.xlsx'))\n",
"del _data_feat_median_over_prop_missing"
]
},
{
Expand All @@ -644,7 +658,8 @@
"metadata": {},
"outputs": [],
"source": [
"sample_counts.name = 'identified features'"
"_feature_display_name = f'identified {args.feat_name_display}'\n",
"sample_counts.name = _feature_display_name"
]
},
{
Expand Down Expand Up @@ -718,12 +733,12 @@
"outputs": [],
"source": [
"fig, ax = plt.subplots()\n",
"col_identified_feat = 'identified features'\n",
"col_identified_feat = _feature_display_name\n",
"analyzers.plot_scatter(\n",
" pcs[pcs_name],\n",
" ax,\n",
" pcs[col_identified_feat],\n",
" title=f'by {col_identified_feat}',\n",
" feat_name_display=args.feat_name_display,\n",
" size=5,\n",
")\n",
"fname = (args.out_figures\n",
Expand All @@ -737,12 +752,23 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ! write principal components to excel (if needed)\n",
"# pcs.set_index([df.index.name])[[*pcs_name, col_identified_feat]].to_excel(fname.with_suffix('.xlsx'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c861197",
"metadata": {},
"outputs": [],
"source": [
"fig = px.scatter(\n",
" pcs, x=pcs_name[0], y=pcs_name[1],\n",
" hover_name=pcs_index_name,\n",
" # hover_data=analysis.df_meta,\n",
" title=f'First two Principal Components of {args.M} features for {pcs.shape[0]} samples',\n",
" title=f'First two Principal Components of {args.M} {args.feat_name_display} for {pcs.shape[0]} samples',\n",
" # color=pcs['Software Version'],\n",
" color=col_identified_feat,\n",
" template='none',\n",
Expand Down Expand Up @@ -1295,7 +1321,8 @@
"ax.legend(_legend[:-1])\n",
"if args.use_every_nth_xtick > 1:\n",
" ax.set_xticks(ax.get_xticks()[::2])\n",
"fname = args.out_figures / f'0_{group}_test_over_train_split.pdf'\n",
"ax.set_xlabel('Intensity bins')\n",
"fname = args.out_figures / f'0_{group}_val_over_train_split.pdf'\n",
"figures[fname.name] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
]
Expand Down Expand Up @@ -1324,6 +1351,18 @@
"vaep.savefig(ax.get_figure(), fname)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed941a81",
"metadata": {},
"outputs": [],
"source": [
"counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins)\n",
"counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
"counts_per_bin"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -1346,6 +1385,36 @@
"vaep.savefig(ax.get_figure(), fname)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eede54fd",
"metadata": {},
"outputs": [],
"source": [
"# Save binned counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e6e1f6b",
"metadata": {},
"outputs": [],
"source": [
"counts_per_bin = dict()\n",
"for col in splits_df.columns:\n",
" _series = (pd.cut(splits_df[col], bins=bins)\n",
" .to_frame()\n",
" .groupby(col)\n",
" .size())\n",
" _series.index.name = 'bin'\n",
" counts_per_bin[col] = _series\n",
"counts_per_bin = pd.DataFrame(counts_per_bin)\n",
"counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
"counts_per_bin"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -1407,8 +1476,8 @@
"fig, ax = plt.subplots(figsize=(6, 2))\n",
"s = 1\n",
"s_axes = pd.DataFrame({'medians': medians,\n",
" 'validation split': splits.val_y.notna().sum(),\n",
" 'training split': splits.train_X.notna().sum()}\n",
" 'Validation split': splits.val_y.notna().sum(),\n",
" 'Training split': splits.train_X.notna().sum()}\n",
" ).plot.box(by='medians',\n",
" boxprops=dict(linewidth=s),\n",
" flierprops=dict(markersize=s),\n",
Expand All @@ -1417,7 +1486,9 @@
" _ = ax.set_xticklabels(ax.get_xticklabels(),\n",
" rotation=45,\n",
" horizontalalignment='right')\n",
"\n",
" ax.set_xlabel(f'{args.feat_name_display.capitalize()} binned by their median intensity '\n",
" f'(N {args.feat_name_display})')\n",
" _ = ax.set_ylabel('Frequency')\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_val_train'\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
Expand Down
Loading

0 comments on commit 80a62d7

Please sign in to comment.