Skip to content

Commit

Permalink
🎨 improved axis labels, feat name
Browse files Browse the repository at this point in the history
- extract PCs: not added yet, but as a comment
  • Loading branch information
Henry committed Apr 8, 2024
1 parent af39e0a commit ec7f286
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 58 deletions.
54 changes: 33 additions & 21 deletions project/01_0_split_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@
" )\n",
"if args.column_names:\n",
" df.columns.names = args.column_names\n",
"if args.feat_name_display is None:\n",
" args.overwrite_entry('feat_name_display', 'features')\n",
" if args.column_names:\n",
" args.overwrite_entry('feat_name_display', args.column_names[0])\n",
"\n",
"\n",
"if not df.index.name:\n",
" logger.warning(\"No sample index name found, setting to 'Sample ID'\")\n",
Expand All @@ -220,7 +225,7 @@
" .plot\n",
" .box()\n",
" )\n",
"ax.set_ylabel('number of observation across samples')"
"ax.set_ylabel('Frequency')"
]
},
{
Expand Down Expand Up @@ -556,7 +561,7 @@
"source": [
"group = 1\n",
"ax = df.notna().sum(axis=1).hist()\n",
"ax.set_xlabel('features per eligable sample')\n",
"ax.set_xlabel(f'{args.feat_name_display.capitalize()} per eligable sample')\n",
"ax.set_ylabel('observations')\n",
"fname = args.out_figures / f'0_{group}_hist_features_per_sample'\n",
"figures[fname.stem] = fname\n",
Expand All @@ -575,7 +580,7 @@
"_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]\n",
"_ = ax.set_xticklabels(_new_labels, rotation=45,\n",
" horizontalalignment='right')\n",
"ax.set_xlabel('feature prevalence')\n",
"ax.set_xlabel(f'{args.feat_name_display.capitalize()} prevalence')\n",
"ax.set_ylabel('observations')\n",
"fname = args.out_figures / f'0_{group}_feature_prevalence'\n",
"figures[fname.stem] = fname\n",
Expand All @@ -598,7 +603,7 @@
"min_max = vaep.plotting.data.min_max(df.stack())\n",
"ax, bins = vaep.plotting.data.plot_histogram_intensities(\n",
" df.stack(), min_max=min_max)\n",
"\n",
"ax.set_xlabel('Intensity binned')\n",
"fname = args.out_figures / f'0_{group}_intensity_distribution_overall'\n",
"\n",
"figures[fname.stem] = fname\n",
Expand All @@ -614,10 +619,9 @@
"ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='scatter')\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter'\n",
"if args.feat_name_display:\n",
" ax.set_xlabel(\n",
" f'{args.feat_name_display.capitalize()} binned by their median intensity'\n",
" f' (N {args.feat_name_display})')\n",
"ax.set_xlabel(\n",
" f'{args.feat_name_display.capitalize()} binned by their median intensity'\n",
" f' (N {args.feat_name_display})')\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
]
Expand All @@ -631,10 +635,9 @@
"ax, _data_feat_median_over_prop_missing = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
" data=df, type='boxplot', return_plot_data=True)\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot'\n",
"if args.feat_name_display:\n",
" ax.set_xlabel(\n",
" f'{args.feat_name_display.capitalize()} binned by their median intensity'\n",
" f' (N {args.feat_name_display})')\n",
"ax.set_xlabel(\n",
" f'{args.feat_name_display.capitalize()} binned by their median intensity'\n",
" f' (N {args.feat_name_display})')\n",
"figures[fname.stem] = fname\n",
"vaep.savefig(ax.get_figure(), fname)\n",
"_data_feat_median_over_prop_missing.to_csv(fname.with_suffix('.csv'))\n",
Expand All @@ -655,9 +658,7 @@
"metadata": {},
"outputs": [],
"source": [
"_feature_display_name = 'identified features'\n",
"if args.feat_name_display:\n",
" _feature_display_name = f'identified {args.feat_name_display}'\n",
"_feature_display_name = f'identified {args.feat_name_display}'\n",
"sample_counts.name = _feature_display_name"
]
},
Expand Down Expand Up @@ -737,7 +738,7 @@
" pcs[pcs_name],\n",
" ax,\n",
" pcs[col_identified_feat],\n",
" title=f'by {_feature_display_name}',\n",
" feat_name_display=args.feat_name_display,\n",
" size=5,\n",
")\n",
"fname = (args.out_figures\n",
Expand All @@ -752,14 +753,22 @@
"metadata": {},
"outputs": [],
"source": [
"_feature_name = 'features'\n",
"if args.feat_name_display:\n",
" _feature_name = args.feat_name_display\n",
"# ! write principal components to excel (if needed)\n",
"# pcs.set_index([df.index.name])[[*pcs_name, col_identified_feat]].to_excel(fname.with_suffix('.xlsx'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c861197",
"metadata": {},
"outputs": [],
"source": [
"fig = px.scatter(\n",
" pcs, x=pcs_name[0], y=pcs_name[1],\n",
" hover_name=pcs_index_name,\n",
" # hover_data=analysis.df_meta,\n",
" title=f'First two Principal Components of {args.M} {_feature_name} for {pcs.shape[0]} samples',\n",
" title=f'First two Principal Components of {args.M} {args.feat_name_display} for {pcs.shape[0]} samples',\n",
" # color=pcs['Software Version'],\n",
" color=col_identified_feat,\n",
" template='none',\n",
Expand Down Expand Up @@ -1312,7 +1321,8 @@
"ax.legend(_legend[:-1])\n",
"if args.use_every_nth_xtick > 1:\n",
" ax.set_xticks(ax.get_xticks()[::2])\n",
"fname = args.out_figures / f'0_{group}_test_over_train_split.pdf'\n",
"ax.set_xlabel('Intensity bins')\n",
"fname = args.out_figures / f'0_{group}_val_over_train_split.pdf'\n",
"figures[fname.name] = fname\n",
"vaep.savefig(ax.get_figure(), fname)"
]
Expand Down Expand Up @@ -1434,6 +1444,8 @@
" _ = ax.set_xticklabels(ax.get_xticklabels(),\n",
" rotation=45,\n",
" horizontalalignment='right')\n",
" ax.set_xlabel(f'{args.feat_name_display.capitalize()} binned by their median intensity '\n",
" f'(N {args.feat_name_display})')\n",
" _ = ax.set_ylabel('Frequency')\n",
"fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_val_train'\n",
"figures[fname.stem] = fname\n",
Expand Down
47 changes: 26 additions & 21 deletions project/01_0_split_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):
)
if args.column_names:
df.columns.names = args.column_names
if args.feat_name_display is None:
args.overwrite_entry('feat_name_display', 'features')
if args.column_names:
args.overwrite_entry('feat_name_display', args.column_names[0])


if not df.index.name:
logger.warning("No sample index name found, setting to 'Sample ID'")
Expand All @@ -158,7 +163,7 @@ def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):
.plot
.box()
)
ax.set_ylabel('number of observation across samples')
ax.set_ylabel('Frequency')


# %%
Expand Down Expand Up @@ -355,7 +360,7 @@ def join_as_str(seq):
# %%
group = 1
ax = df.notna().sum(axis=1).hist()
ax.set_xlabel('features per eligable sample')
ax.set_xlabel(f'{args.feat_name_display.capitalize()} per eligable sample')
ax.set_ylabel('observations')
fname = args.out_figures / f'0_{group}_hist_features_per_sample'
figures[fname.stem] = fname
Expand All @@ -366,7 +371,7 @@ def join_as_str(seq):
_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels, rotation=45,
horizontalalignment='right')
ax.set_xlabel('feature prevalence')
ax.set_xlabel(f'{args.feat_name_display.capitalize()} prevalence')
ax.set_ylabel('observations')
fname = args.out_figures / f'0_{group}_feature_prevalence'
figures[fname.stem] = fname
Expand All @@ -380,7 +385,7 @@ def join_as_str(seq):
min_max = vaep.plotting.data.min_max(df.stack())
ax, bins = vaep.plotting.data.plot_histogram_intensities(
df.stack(), min_max=min_max)

ax.set_xlabel('Intensity binned')
fname = args.out_figures / f'0_{group}_intensity_distribution_overall'

figures[fname.stem] = fname
Expand All @@ -390,21 +395,19 @@ def join_as_str(seq):
ax = vaep.plotting.data.plot_feat_median_over_prop_missing(
data=df, type='scatter')
fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter'
if args.feat_name_display:
ax.set_xlabel(
f'{args.feat_name_display.capitalize()} binned by their median intensity'
f' (N {args.feat_name_display})')
ax.set_xlabel(
f'{args.feat_name_display.capitalize()} binned by their median intensity'
f' (N {args.feat_name_display})')
figures[fname.stem] = fname
vaep.savefig(ax.get_figure(), fname)

# %%
ax, _data_feat_median_over_prop_missing = vaep.plotting.data.plot_feat_median_over_prop_missing(
data=df, type='boxplot', return_plot_data=True)
fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot'
if args.feat_name_display:
ax.set_xlabel(
f'{args.feat_name_display.capitalize()} binned by their median intensity'
f' (N {args.feat_name_display})')
ax.set_xlabel(
f'{args.feat_name_display.capitalize()} binned by their median intensity'
f' (N {args.feat_name_display})')
figures[fname.stem] = fname
vaep.savefig(ax.get_figure(), fname)
_data_feat_median_over_prop_missing.to_csv(fname.with_suffix('.csv'))
Expand All @@ -415,9 +418,7 @@ def join_as_str(seq):
# ### Interactive and Single plots

# %%
_feature_display_name = 'identified features'
if args.feat_name_display:
_feature_display_name = f'identified {args.feat_name_display}'
_feature_display_name = f'identified {args.feat_name_display}'
sample_counts.name = _feature_display_name

# %%
Expand Down Expand Up @@ -463,7 +464,7 @@ def join_as_str(seq):
pcs[pcs_name],
ax,
pcs[col_identified_feat],
title=f'by {_feature_display_name}',
feat_name_display=args.feat_name_display,
size=5,
)
fname = (args.out_figures
Expand All @@ -472,14 +473,15 @@ def join_as_str(seq):
vaep.savefig(fig, fname)

# %%
_feature_name = 'features'
if args.feat_name_display:
_feature_name = args.feat_name_display
# # ! write principal components to excel (if needed)
# pcs.set_index([df.index.name])[[*pcs_name, col_identified_feat]].to_excel(fname.with_suffix('.xlsx'))

# %%
fig = px.scatter(
pcs, x=pcs_name[0], y=pcs_name[1],
hover_name=pcs_index_name,
# hover_data=analysis.df_meta,
title=f'First two Principal Components of {args.M} {_feature_name} for {pcs.shape[0]} samples',
title=f'First two Principal Components of {args.M} {args.feat_name_display} for {pcs.shape[0]} samples',
# color=pcs['Software Version'],
color=col_identified_feat,
template='none',
Expand Down Expand Up @@ -832,7 +834,8 @@ def join_as_str(seq):
ax.legend(_legend[:-1])
if args.use_every_nth_xtick > 1:
ax.set_xticks(ax.get_xticks()[::2])
fname = args.out_figures / f'0_{group}_test_over_train_split.pdf'
ax.set_xlabel('Intensity bins')
fname = args.out_figures / f'0_{group}_val_over_train_split.pdf'
figures[fname.name] = fname
vaep.savefig(ax.get_figure(), fname)

Expand Down Expand Up @@ -914,6 +917,8 @@ def join_as_str(seq):
_ = ax.set_xticklabels(ax.get_xticklabels(),
rotation=45,
horizontalalignment='right')
ax.set_xlabel(f'{args.feat_name_display.capitalize()} binned by their median intensity '
f'(N {args.feat_name_display})')
_ = ax.set_ylabel('Frequency')
fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_val_train'
figures[fname.stem] = fname
Expand Down
33 changes: 17 additions & 16 deletions vaep/analyzers/analyzers.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,23 @@
import logging
import random
from collections import namedtuple
from pathlib import Path
from types import SimpleNamespace
from typing import Tuple, Union, List

import logging
import random

from typing import List, Optional, Tuple, Union

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

from njab.sklearn import run_pca
from sklearn.impute import SimpleImputer

import vaep
from vaep.analyzers import Analysis

from vaep.pandas import _add_indices
from vaep.io.datasplits import long_format, wide_format
from vaep.io.load import verify_df
from vaep.pandas import _add_indices

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -450,18 +443,26 @@ def plot_date_map(df, ax,

def plot_scatter(df, ax,
meta: pd.Series,
title: str = 'by some metadata',
feat_name_display: str = 'features',
title: Optional[str] = None,
alpha=ALPHA,
fontsize=8,
size=2):
cols = list(df.columns)
assert len(cols) == 2, f'Please provide two dimensons, not {df.columns}'
if not title:
title = f'by identified {feat_name_display}'
ax.set_title(title, fontsize=fontsize)
ax.set_xlabel(cols[0])
ax.set_ylabel(cols[1])
path_collection = ax.scatter(
x=cols[0], y=cols[1], s=size, c=meta, data=df, alpha=alpha)
cbar = ax.get_figure().colorbar(path_collection, ax=ax)
_ = ax.get_figure().colorbar(path_collection, ax=ax,
label=f'Identified {feat_name_display}',
# ticklocation='left', # ignored by matplotlib
location='right', # ! 'left' does not place the colorbar without overlapping the y ticks
format="{x:,.0f}",
)


def seaborn_scatter(df, ax,
Expand Down

0 comments on commit ec7f286

Please sign in to comment.