Skip to content

Commit

Permalink
🎨🔥 use combine_value_counts from njab
Browse files Browse the repository at this point in the history
  • Loading branch information
Henry committed Mar 12, 2024
1 parent 8c711a4 commit fd0b8fa
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 65 deletions.
23 changes: 10 additions & 13 deletions project/10_4_ald_compare_single_pg.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,19 @@
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"from pathlib import Path\n",
"\n",
"import logging\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import njab\n",
"import pandas as pd\n",
"\n",
"import matplotlib\n",
"import seaborn\n",
"\n",
"import vaep\n",
"import vaep.analyzers\n",
"import vaep.io.datasplits\n",
"import vaep.imputation\n",
"import vaep.io.datasplits\n",
"\n",
"logger = vaep.logging.setup_nb_logger()\n",
"logging.getLogger('fontTools').setLevel(logging.WARNING)\n",
Expand Down Expand Up @@ -251,8 +251,7 @@
"fname = args.out_folder / 'equality_rejected_target.pkl'\n",
"files_out[fname.name] = fname.as_posix()\n",
"da_target.to_pickle(fname)\n",
"\n",
"count_rejected = vaep.pandas.combine_value_counts(da_target.droplevel(-1, axis=1))\n",
"count_rejected = njab.pandas.combine_value_counts(da_target.droplevel(-1, axis=1))\n",
"count_rejected.to_excel(writer, sheet_name='count_rejected')\n",
"count_rejected"
]
Expand All @@ -264,10 +263,10 @@
"metadata": {},
"outputs": [],
"source": [
"! This uses implicitly that RSN is not available for some protein groups\n",
"! Make an explicit list of the 313 protein groups available in original data\n",
"# ! This uses implicitly that RSN is not available for some protein groups\n",
"# ! Make an explicit list of the 313 protein groups available in original data\n",
"mask_common = da_target.notna().all(axis=1)\n",
"count_rejected_common = vaep.pandas.combine_value_counts(da_target.loc[mask_common].droplevel(-1, axis=1))\n",
"count_rejected_common = njab.pandas.combine_value_counts(da_target.loc[mask_common].droplevel(-1, axis=1))\n",
"count_rejected_common.to_excel(writer, sheet_name='count_rejected_common')\n",
"count_rejected_common"
]
Expand All @@ -276,12 +275,10 @@
"cell_type": "code",
"execution_count": null,
"id": "af1a13cb",
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"count_rejected_new = vaep.pandas.combine_value_counts(da_target.loc[~mask_common].droplevel(-1, axis=1))\n",
"count_rejected_new = njab.pandas.combine_value_counts(da_target.loc[~mask_common].droplevel(-1, axis=1))\n",
"count_rejected_new.to_excel(writer, sheet_name='count_rejected_new')\n",
"count_rejected_new"
]
Expand Down
20 changes: 9 additions & 11 deletions project/10_4_ald_compare_single_pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,19 @@
# - dumps top5

# %%
import logging
from pathlib import Path

import logging
import matplotlib
import matplotlib.pyplot as plt
import njab
import pandas as pd

import matplotlib
import seaborn

import vaep
import vaep.analyzers
import vaep.io.datasplits
import vaep.imputation
import vaep.io.datasplits

logger = vaep.logging.setup_nb_logger()
logging.getLogger('fontTools').setLevel(logging.WARNING)
Expand Down Expand Up @@ -153,25 +153,23 @@
fname = args.out_folder / 'equality_rejected_target.pkl'
files_out[fname.name] = fname.as_posix()
da_target.to_pickle(fname)

count_rejected = vaep.pandas.combine_value_counts(da_target.droplevel(-1, axis=1))
count_rejected = njab.pandas.combine_value_counts(da_target.droplevel(-1, axis=1))
count_rejected.to_excel(writer, sheet_name='count_rejected')
count_rejected

# %%
# ! This uses implicitly that RSN is not available for some protein groups
# ! Make an explicit list of the 313 protein groups available in original data
# # ! This uses implicitly that RSN is not available for some protein groups
# # ! Make an explicit list of the 313 protein groups available in original data
mask_common = da_target.notna().all(axis=1)
count_rejected_common = vaep.pandas.combine_value_counts(da_target.loc[mask_common].droplevel(-1, axis=1))
count_rejected_common = njab.pandas.combine_value_counts(da_target.loc[mask_common].droplevel(-1, axis=1))
count_rejected_common.to_excel(writer, sheet_name='count_rejected_common')
count_rejected_common

# %%
count_rejected_new = vaep.pandas.combine_value_counts(da_target.loc[~mask_common].droplevel(-1, axis=1))
count_rejected_new = njab.pandas.combine_value_counts(da_target.loc[~mask_common].droplevel(-1, axis=1))
count_rejected_new.to_excel(writer, sheet_name='count_rejected_new')
count_rejected_new


# %%
da_target.to_excel(writer, sheet_name='equality_rejected_all')
da_target
Expand Down
5 changes: 4 additions & 1 deletion project/10_5_comp_diff_analysis_repetitions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import njab\n",
"import pandas as pd\n",
"\n",
"import vaep"
]
},
Expand Down Expand Up @@ -333,7 +336,7 @@
"mask_new_da_with_imp = mask_new_da_with_imputation = ((~mask_pgs_included_in_ald_study)\n",
" & (da_counts['None'] != 10))\n",
"\n",
"tab_new_da_with_imp = vaep.pandas.combine_value_counts(\n",
"tab_new_da_with_imp = njab.pandas.combine_value_counts(\n",
" da_counts\n",
" .loc[mask_new_da_with_imputation]\n",
").fillna(0).astype(int)\n",
Expand Down
5 changes: 4 additions & 1 deletion project/10_5_comp_diff_analysis_repetitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

# %%
from pathlib import Path

import njab
import pandas as pd

import vaep

# %%
Expand Down Expand Up @@ -176,7 +179,7 @@ def _load_pickle(pfath, run: int):
mask_new_da_with_imp = mask_new_da_with_imputation = ((~mask_pgs_included_in_ald_study)
& (da_counts['None'] != 10))

tab_new_da_with_imp = vaep.pandas.combine_value_counts(
tab_new_da_with_imp = njab.pandas.combine_value_counts(
da_counts
.loc[mask_new_da_with_imputation]
).fillna(0).astype(int)
Expand Down
14 changes: 9 additions & 5 deletions project/10_7_ald_reduced_dataset_plots.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import njab\n",
"import pandas as pd\n",
"\n",
"import vaep\n",
"\n",
"plt.rcParams['figure.figsize'] = [4, 2] # [16.0, 7.0] , [4, 3]\n",
"vaep.plotting.make_large_descriptors(6)\n",
"\n",
Expand Down Expand Up @@ -79,14 +82,15 @@
"cell_type": "code",
"execution_count": null,
"id": "676766a0",
"metadata": {},
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"files_out = dict()\n",
"fname = out_folder / 'ald_reduced_dataset_plots.xlsx'\n",
"files_out[fname.name] = fname.as_posix()\n",
"writer = pd.ExcelWriter(fname)\n",
"\n"
"writer = pd.ExcelWriter(fname)"
]
},
{
Expand Down Expand Up @@ -237,7 +241,7 @@
" 1: 'TP'}\n",
").droplevel(-1, axis=1)\n",
")\n",
"da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts)\n",
"da_target_sel_counts = njab.pandas.combine_value_counts(da_target_sel_counts)\n",
"ax = da_target_sel_counts.T.plot.bar(ylabel='count')\n",
"ax.locator_params(axis='y', integer=True)\n",
"fname = out_folder / 'lost_signal_da_counts.pdf'\n",
Expand Down Expand Up @@ -308,7 +312,7 @@
" 1: 'FP'}\n",
").droplevel(-1, axis=1)\n",
")\n",
"da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts)\n",
"da_target_sel_counts = njab.pandas.combine_value_counts(da_target_sel_counts)\n",
"ax = da_target_sel_counts.T.plot.bar(ylabel='count')\n",
"ax.locator_params(axis='y', integer=True)\n",
"fname = out_folder / 'gained_signal_da_counts.pdf'\n",
Expand Down
8 changes: 5 additions & 3 deletions project/10_7_ald_reduced_dataset_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@

# %%
from pathlib import Path

import matplotlib.pyplot as plt
import njab
import pandas as pd

import vaep

plt.rcParams['figure.figsize'] = [4, 2] # [16.0, 7.0] , [4, 3]
vaep.plotting.make_large_descriptors(6)

Expand Down Expand Up @@ -54,7 +57,6 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05,
writer = pd.ExcelWriter(fname)



# %%


Expand Down Expand Up @@ -124,7 +126,7 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05,
1: 'TP'}
).droplevel(-1, axis=1)
)
da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts)
da_target_sel_counts = njab.pandas.combine_value_counts(da_target_sel_counts)
ax = da_target_sel_counts.T.plot.bar(ylabel='count')
ax.locator_params(axis='y', integer=True)
fname = out_folder / 'lost_signal_da_counts.pdf'
Expand Down Expand Up @@ -168,7 +170,7 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05,
1: 'FP'}
).droplevel(-1, axis=1)
)
da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts)
da_target_sel_counts = njab.pandas.combine_value_counts(da_target_sel_counts)
ax = da_target_sel_counts.T.plot.bar(ylabel='count')
ax.locator_params(axis='y', integer=True)
fname = out_folder / 'gained_signal_da_counts.pdf'
Expand Down
32 changes: 1 addition & 31 deletions vaep/pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,15 @@
import collections.abc
from collections import namedtuple


from types import SimpleNamespace

from typing import Iterable

import numpy as np
import pandas as pd
import omegaconf
import pandas as pd

from .calc_errors import calc_errors_per_feat, get_absolute_error


def combine_value_counts(X: pd.DataFrame, dropna: bool = True) -> pd.DataFrame:
    """Combine the value counts of several columns into one table.

    No checks are performed. Make sure the scale of the variables
    you pass is comparable.

    Parameters
    ----------
    X : pandas.DataFrame
        A DataFrame of several columns with values in a similar range.
    dropna : bool, optional
        Exclude NA values from counting, by default True

    Returns
    -------
    pandas.DataFrame
        DataFrame of combined value counts: one column per column of ``X``,
        indexed by the sorted union of the observed values. A value that
        does not occur in a column is NaN in that column.
    """
    # Name every count Series after its source column explicitly. Since
    # pandas 2.0 ``Series.value_counts`` names its result "count" for every
    # column, so relying on the default name makes the combined columns
    # collide.
    counts = [X[col].value_counts(dropna=dropna).rename(col)
              for col in X.columns]
    if not counts:
        # ``pd.concat`` raises on an empty list; keep returning an empty
        # frame for an input without columns.
        return pd.DataFrame()
    # Column-wise concat aligns on the union of all observed values.
    return pd.concat(counts, axis=1).sort_index()


def unique_cols(s: pd.Series) -> bool:
"""Check all entries are equal in pandas.Series
Expand Down

0 comments on commit fd0b8fa

Please sign in to comment.