Skip to content

Commit

Permalink
✨ add biomarker discovery tutorial
Browse files Browse the repository at this point in the history
- tutorial based on 3_log_reg in cirrhosis_project:
   https://github.com/RasmussenLab/hela_qc_mnt_data
- use alzheimer data from OmicLearn
   https://github.com/MannLabs/OmicLearn
- start moving further functionality still in cirrhosis_project to
  stand-alone package.

ToDo: Test and adapt for colab
  • Loading branch information
Henry committed Oct 30, 2023
1 parent ffddd4a commit eddb192
Show file tree
Hide file tree
Showing 9 changed files with 2,995 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/njab/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
"""
from importlib.metadata import version

from . import stats, sklearn, plotting
from . import stats, sklearn, plotting, pandas

__version__ = version('njab')

__all__ = ['stats', 'sklearn', 'plotting']
__all__ = ['stats', 'sklearn', 'plotting', 'pandas']
50 changes: 50 additions & 0 deletions src/njab/pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import logging
import typing

import pandas as pd
import omegaconf

logger = logging.getLogger(__name__)


def replace_with(string_key: str, replace: str = "()/", replace_with: str = '') -> str:
    """Substitute each character listed in ``replace`` within ``string_key``.

    The characters of ``replace`` are handled one after another, in order;
    every occurrence of the current character is swapped for ``replace_with``
    (an empty string by default, i.e. the character is removed).

    Parameters
    ----------
    string_key:
        Text to clean.
    replace:
        Characters to substitute; each character is treated individually.
    replace_with:
        Replacement inserted for every matched character.

    Returns
    -------
    str
        The text after all substitutions have been applied.
    """
    cleaned = string_key
    for char in replace:
        cleaned = cleaned.replace(char, replace_with)
    return cleaned


def get_colums_accessor(df: pd.DataFrame, all_lower_case=False) -> omegaconf.OmegaConf:
    """Map sanitized column names to the original column names of ``df``.

    Each column name has spaces and hyphens turned into underscores and is
    then passed through ``replace_with`` (using its defaults). The sanitized
    name becomes the key and the untouched column name the value. The mapping
    is handed to ``omegaconf.OmegaConf.create`` so that keys can be used with
    attribute access.

    Parameters
    ----------
    df:
        DataFrame whose columns are mapped.
    all_lower_case:
        If true, the sanitized keys are additionally lower-cased.

    Returns
    -------
    The object built by ``omegaconf.OmegaConf.create`` from the mapping.
    """
    accessor = {}
    for original_name in df.columns:
        underscored = original_name.replace(' ', '_').replace('-', '_')
        accessor[replace_with(underscored)] = original_name
    if all_lower_case:
        # Lower-casing is a separate pass over the finished mapping so that
        # colliding keys are resolved exactly as before.
        accessor = {key.lower(): original_name
                    for key, original_name in accessor.items()}
    return omegaconf.OmegaConf.create(accessor)


def col_isin_df(cols: typing.Union[list, str], df: pd.DataFrame) -> list:
    """Keep only those entries of ``cols`` that are columns of ``df``.

    A warning is logged for every requested entry that is missing from
    ``df.columns``; missing entries are dropped from the result.

    Parameters
    ----------
    cols:
        Column names to check. A string is split on ``','`` into individual
        names (no whitespace stripping is performed).
    df:
        DataFrame whose columns are checked.

    Returns
    -------
    list
        The requested names that exist in ``df``, in the order given.
    """
    if isinstance(cols, str):
        cols = cols.split(',')
    ret = []
    for _var in cols:
        if _var not in df.columns:
            # Use the default ``stacklevel``: a value of 0 is not a valid
            # frame offset and, on Python >= 3.11, attributes the log record
            # to the logging module itself instead of this function.
            logger.warning(f"Desired variable not found: {_var}")
            continue
        ret.append(_var)
    return ret


def value_counts_with_margins(y: pd.Series) -> pd.DataFrame:
    """Tabulate absolute and relative frequencies of the values in ``y``.

    Parameters
    ----------
    y:
        Series whose values are counted.

    Returns
    -------
    pd.DataFrame
        One row per distinct value with the columns ``counts`` (number of
        occurrences) and ``prop`` (share of all counted values). The index
        is named after ``y``.
    """
    proportions = y.value_counts(normalize=True)
    table = y.value_counts().to_frame('counts')
    table.index.name = y.name
    # Assignment aligns on the index, i.e. on the distinct values.
    table['prop'] = proportions
    return table
1 change: 1 addition & 0 deletions tutorial/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
alzheimer
11 changes: 11 additions & 0 deletions tutorial/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Alzheimer Data

> Used for logistic regression tutorial `log_reg.ipynb`
Proteome Profiling in Cerebrospinal Fluid Reveals Novel Biomarkers of Alzheimer's Disease

- [PXD016278](https://www.ebi.ac.uk/pride/archive/projects/PXD016278)
- [publication](https://www.embopress.org/doi/full/10.15252/msb.20199356)
- [curated data version from omiclearn](https://github.com/MannLabs/OmicLearn/tree/master/omiclearn/data)
- download `Alzheimer.xlsx` from the repository and process it using `data/process_Alzheimer_excel.py`

211 changes: 211 additions & 0 deletions tutorial/data/alzheimer/meta.csv

Large diffs are not rendered by default.

211 changes: 211 additions & 0 deletions tutorial/data/alzheimer/proteome.csv

Large diffs are not rendered by default.

66 changes: 66 additions & 0 deletions tutorial/data/process_Alzheimer_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# %%
from pathlib import Path
import pandas as pd

# %%
# Input workbook, resolved relative to the current working directory.
fname_curated_data = 'Alzheimer.xlsx'

# Output folder for all dumps; created on demand.
FOLDER = Path('alzheimer').absolute()
FOLDER.mkdir(exist_ok=True, parents=True)
files_out = {}  # file stem -> path of every file written below

# %%
data = pd.read_excel(fname_curated_data)
# Swap the row labels for zero-padded sample identifiers.
data.index = pd.Index([f'Sample_{i:03d}' for i in data.index],
                      name='Sample ID')
data.head()

# %%
# Meta data: all columns whose name contains an underscore.
meta = data.filter(like='_', axis=1)
meta.sample(5)

# %% [markdown]
# # Meta data dump
# - N=197 samples with complete data for
# - age
# - gender
# - primary biochemical Alzheimer disease classification

# %%
cols = [
    '_age at CSF collection',
    '_gender',
    '_primary biochemical AD classification',
]
meta[cols].dropna().describe(include='all')

# %%
fname = FOLDER / 'meta.csv'
files_out[fname.stem] = fname
meta.to_csv(fname)

# %% [markdown]
# # Proteome dump
# %%
# Keep everything except the last 11 columns.
# NOTE(review): presumably these 11 trailing columns are the meta data
# selected above via the underscore filter -- confirm against the workbook.
data = data.iloc[:, :-11]
data.sample(5)

# %%
# Box plot of the number of non-missing values per column.
ax = data.notna().sum(axis=0).plot.box()

# %%
fname = FOLDER / 'proteome.csv'
files_out[fname.stem] = fname
data.to_csv(fname)

# %% [markdown]
# Protein Group - Gene Mapping

# %%

protein_groups = data.columns.to_list()
# Column names are split on ';' and the distinct parts are collected.
proteins_unique = set()
for pg in protein_groups:
    proteins_unique.update(pg.split(';'))
# proteins_unique

# %%
files_out

# %%
Loading

0 comments on commit eddb192

Please sign in to comment.