Main fig update #61

Merged
15 commits, merged Apr 24, 2024
2 changes: 1 addition & 1 deletion environment.yml
@@ -17,7 +17,7 @@ dependencies:
- matplotlib
- python-kaleido
- plotly
- seaborn
- seaborn<0.13
- pip
# ML
- pytorch=1 #=1.13.1=py3.8_cuda11.7_cudnn8_0
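The only change to environment.yml pins seaborn below 0.13, presumably to avoid the breaking changes to the plotting API introduced in seaborn 0.13. As a minimal sketch (not part of the PR), a runtime guard in a notebook setup cell could catch an environment that resolved a newer release; it assumes the packaging library is available:

# Sketch: fail fast if the resolved seaborn release is newer than the notebooks expect.
from packaging.version import Version  # assumption: packaging is installed alongside pip

import seaborn as sns

if Version(sns.__version__) >= Version("0.13"):
    raise RuntimeError(
        f"seaborn {sns.__version__} found; this pipeline expects seaborn<0.13 (see environment.yml)"
    )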
127 changes: 99 additions & 28 deletions project/01_0_split_data.ipynb
@@ -15,25 +15,24 @@
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"from functools import partial\n",
"from pathlib import Path\n",
"import logging\n",
"from typing import Union, List\n",
"from typing import List, Union\n",
"\n",
"from IPython.display import display\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"import plotly.express as px\n",
"from IPython.display import display\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import vaep\n",
"import vaep.io.load\n",
"from vaep.analyzers import analyzers\n",
"from vaep.io.datasplits import DataSplits\n",
"from vaep.sampling import feature_frequency\n",
"\n",
"from vaep.analyzers import analyzers\n",
"from vaep.sklearn import get_PCA\n",
"import vaep.io.load\n",
"\n",
"logger = vaep.logging.setup_nb_logger()\n",
"logger.info(\"Split data and make diagnostic plots\")\n",
@@ -52,7 +51,7 @@
"\n",
"\n",
"pd.options.display.max_columns = 32\n",
"plt.rcParams['figure.figsize'] = [3, 2]\n",
"plt.rcParams['figure.figsize'] = [4, 2]\n",
"\n",
"vaep.plotting.make_large_descriptors(7)\n",
"\n",
@@ -82,6 +81,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2,
"tags": [
"parameters"
]
@@ -108,7 +108,8 @@
"# train, validation and test data splits\n",
"frac_non_train: float = 0.1 # fraction of non training data (validation and test split)\n",
"frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random\n",
"prop_sample_w_sim: float = 1.0 # proportion of samples with simulated missing values"
"prop_sample_w_sim: float = 1.0 # proportion of samples with simulated missing values\n",
"feat_name_display: str = None # display name for feature name (e.g. 'protein group')"
]
},
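The new feat_name_display parameter (declared as str = None above, effectively Optional[str]) drives the axis labels and plot titles in the hunks below. Its fallback, applied later via args.overwrite_entry, boils down to the following standalone sketch; the helper name is hypothetical and only illustrates the rule "explicit value, else first column name, else 'features'":

# Sketch of the display-name fallback, independent of the vaep.nb args object (hypothetical helper).
from typing import List, Optional


def resolve_feat_name_display(feat_name_display: Optional[str],
                              column_names: Optional[List[str]]) -> str:
    """Return the explicit display name, else the first column name, else 'features'."""
    if feat_name_display is not None:
        return feat_name_display
    if column_names:
        return column_names[0]
    return 'features'


assert resolve_feat_name_display(None, ['protein group']) == 'protein group'
assert resolve_feat_name_display(None, None) == 'features'
assert resolve_feat_name_display('peptide', ['protein group']) == 'peptide'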
{
@@ -124,9 +125,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"args = vaep.nb.args_from_dict(args)\n",
@@ -195,6 +194,11 @@
" )\n",
"if args.column_names:\n",
" df.columns.names = args.column_names\n",
"if args.feat_name_display is None:\n",
" args.overwrite_entry('feat_name_display', 'features')\n",
" if args.column_names:\n",
" args.overwrite_entry('feat_name_display', args.column_names[0])\n",
"\n",
"\n",
"if not df.index.name:\n",
" logger.warning(\"No sample index name found, setting to 'Sample ID'\")\n",
@@ -221,7 +225,7 @@
" .plot\n",
" .box()\n",
" )\n",
"ax.set_ylabel('number of observation across samples')"
"ax.set_ylabel('Frequency')"
]
},
{
@@ -557,7 +561,7 @@
"source": [
"group = 1\n",
"ax = df.notna().sum(axis=1).hist()\n",
"ax.set_xlabel('features per eligable sample')\n",
"ax.set_xlabel(f'{args.feat_name_display.capitalize()} per eligable sample')\n",
"ax.set_ylabel('observations')\n",
"fname = args.out_figures / f'0_{group}_hist_features_per_sample'\n",
"figures[fname.stem] = fname\n",
@@ -576,7 +580,7 @@
"_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]\n",
"_ = ax.set_xticklabels(_new_labels, rotation=45,\n",
" horizontalalignment='right')\n",
"ax.set_xlabel('feature prevalence')\n",
"ax.set_xlabel(f'{args.feat_name_display.capitalize()} prevalence')\n",
"ax.set_ylabel('observations')\n",
"fname = args.out_figures / f'0_{group}_feature_prevalence'\n",
"figures[fname.stem] = fname\n",
@@ -599,8 +603,9 @@
"min_max = vaep.plotting.data.min_max(df.stack())\n",
"ax, bins = vaep.plotting.data.plot_histogram_intensities(\n",
" df.stack(), min_max=min_max)\n",
"\n",
"ax.set_xlabel('Intensity binned')\n",
"fname = args.out_figures / f'0_{group}_intensity_distribution_overall'\n",
"\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
]
@@ -614,6 +619,9 @@
"ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='scatter')\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter'\n",
"ax.set_xlabel(\n",
" f'{args.feat_name_display.capitalize()} binned by their median intensity'\n",
" f' (N {args.feat_name_display})')\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
]
@@ -624,11 +632,17 @@
"metadata": {},
"outputs": [],
"source": [
"ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='boxplot')\n",
"ax, _data_feat_median_over_prop_missing = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='boxplot', return_plot_data=True)\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot'\n",
"ax.set_xlabel(\n",
" f'{args.feat_name_display.capitalize()} binned by their median intensity'\n",
" f' (N {args.feat_name_display})')\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
"vaep.savefig(ax.get_figure(), fname)\n",
"_data_feat_median_over_prop_missing.to_csv(fname.with_suffix('.csv'))\n",
"# _data_feat_median_over_prop_missing.to_excel(fname.with_suffix('.xlsx'))\n",
"del _data_feat_median_over_prop_missing"
]
},
{
@@ -644,7 +658,8 @@
"metadata": {},
"outputs": [],
"source": [
"sample_counts.name = 'identified features'"
"_feature_display_name = f'identified {args.feat_name_display}'\n",
"sample_counts.name = _feature_display_name"
]
},
{
@@ -718,12 +733,12 @@
"outputs": [],
"source": [
"fig, ax = plt.subplots()\n",
"col_identified_feat = 'identified features'\n",
"col_identified_feat = _feature_display_name\n",
"analyzers.plot_scatter(\n",
" pcs[pcs_name],\n",
" ax,\n",
" pcs[col_identified_feat],\n",
" title=f'by {col_identified_feat}',\n",
" feat_name_display=args.feat_name_display,\n",
" size=5,\n",
")\n",
"fname = (args.out_figures\n",
@@ -737,12 +752,23 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ! write principal components to excel (if needed)\n",
"# pcs.set_index([df.index.name])[[*pcs_name, col_identified_feat]].to_excel(fname.with_suffix('.xlsx'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c861197",
"metadata": {},
"outputs": [],
"source": [
"fig = px.scatter(\n",
" pcs, x=pcs_name[0], y=pcs_name[1],\n",
" hover_name=pcs_index_name,\n",
" # hover_data=analysis.df_meta,\n",
" title=f'First two Principal Components of {args.M} features for {pcs.shape[0]} samples',\n",
" title=f'First two Principal Components of {args.M} {args.feat_name_display} for {pcs.shape[0]} samples',\n",
" # color=pcs['Software Version'],\n",
" color=col_identified_feat,\n",
" template='none',\n",
@@ -1295,7 +1321,8 @@
"ax.legend(_legend[:-1])\n",
"if args.use_every_nth_xtick > 1:\n",
" ax.set_xticks(ax.get_xticks()[::2])\n",
"fname = args.out_figures / f'0_{group}_test_over_train_split.pdf'\n",
"ax.set_xlabel('Intensity bins')\n",
"fname = args.out_figures / f'0_{group}_val_over_train_split.pdf'\n",
"figures[fname.name] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
]
@@ -1324,6 +1351,18 @@
"vaep.savefig(ax.get_figure(), fname)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed941a81",
"metadata": {},
"outputs": [],
"source": [
"counts_per_bin = vaep.pandas.get_counts_per_bin(df=splits_df, bins=bins)\n",
"counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
"counts_per_bin"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -1346,6 +1385,36 @@
"vaep.savefig(ax.get_figure(), fname)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eede54fd",
"metadata": {},
"outputs": [],
"source": [
"# Save binned counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e6e1f6b",
"metadata": {},
"outputs": [],
"source": [
"counts_per_bin = dict()\n",
"for col in splits_df.columns:\n",
" _series = (pd.cut(splits_df[col], bins=bins)\n",
" .to_frame()\n",
" .groupby(col)\n",
" .size())\n",
" _series.index.name = 'bin'\n",
" counts_per_bin[col] = _series\n",
"counts_per_bin = pd.DataFrame(counts_per_bin)\n",
"counts_per_bin.to_excel(fname.with_suffix('.xlsx'))\n",
"counts_per_bin"
]
},
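Both new export cells above produce the same table of observation counts per intensity bin: once through the vaep.pandas.get_counts_per_bin helper and once through an explicit pd.cut/groupby loop. A pandas-only sketch of the same idea, with a toy splits_df standing in for the real train/validation intensities and integer bin edges as in the histograms above:

# Sketch: count observations per intensity bin for each split with plain pandas (toy data).
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
splits_df = pd.DataFrame({  # stand-in for the real split intensities; unequal lengths are NaN-padded
    'train_X': pd.Series(rng.normal(25, 2, size=500)),
    'val_y': pd.Series(rng.normal(25, 2, size=100)),
})
bins = range(15, 36)  # integer bin edges

counts_per_bin = pd.DataFrame({
    col: pd.cut(splits_df[col], bins=bins).value_counts().sort_index()
    for col in splits_df.columns
})
counts_per_bin.index.name = 'bin'
# counts_per_bin.to_excel('counts_per_bin.xlsx')  # optional export, mirroring the notebook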
{
"cell_type": "markdown",
"metadata": {},
@@ -1407,8 +1476,8 @@
"fig, ax = plt.subplots(figsize=(6, 2))\n",
"s = 1\n",
"s_axes = pd.DataFrame({'medians': medians,\n",
" 'validation split': splits.val_y.notna().sum(),\n",
" 'training split': splits.train_X.notna().sum()}\n",
" 'Validation split': splits.val_y.notna().sum(),\n",
" 'Training split': splits.train_X.notna().sum()}\n",
" ).plot.box(by='medians',\n",
" boxprops=dict(linewidth=s),\n",
" flierprops=dict(markersize=s),\n",
Expand All @@ -1417,7 +1486,9 @@
" _ = ax.set_xticklabels(ax.get_xticklabels(),\n",
" rotation=45,\n",
" horizontalalignment='right')\n",
"\n",
" ax.set_xlabel(f'{args.feat_name_display.capitalize()} binned by their median intensity '\n",
" f'(N {args.feat_name_display})')\n",
" _ = ax.set_ylabel('Frequency')\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_val_train'\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"