✨ add biomarker discovery tutorial

- tutorial based on 3_log_reg in cirrhosis_project: https://github.com/RasmussenLab/hela_qc_mnt_data
- use alzheimer data from OmicLearn: https://github.com/MannLabs/OmicLearn
- start moving further functionality still in cirrhosis_project to stand-alone package

ToDo: Test and adapt for colab
RasmussenLab · Oct 30, 2023 · eddb192 · eddb192
1 parent ffddd4a
commit eddb192
Show file tree

Hide file tree

Showing 9 changed files with 2,995 additions and 2 deletions.
diff --git a/src/njab/__init__.py b/src/njab/__init__.py
@@ -6,8 +6,8 @@
 """
 from importlib.metadata import version
 
-from . import stats, sklearn, plotting
+from . import stats, sklearn, plotting, pandas
 
 __version__ = version('njab')
 
-__all__ = ['stats', 'sklearn', 'plotting']
+__all__ = ['stats', 'sklearn', 'plotting', 'pandas']
diff --git a/src/njab/pandas/__init__.py b/src/njab/pandas/__init__.py
@@ -0,0 +1,50 @@
+import logging
+import typing
+
+import pandas as pd
+import omegaconf
+
+logger = logging.getLogger(__name__)
+
+
+def replace_with(string_key: str, replace: str = "()/", replace_with: str = '') -> str:
+    """Replace characters in a string with a replacement."""
+    for symbol in replace:
+        string_key = string_key.replace(symbol, replace_with)
+    return string_key
+
+
+def get_colums_accessor(df: pd.DataFrame, all_lower_case=False) -> omegaconf.OmegaConf:
+    """Get an dictionary augmented with attribute access of column name as key
+       with white spaces replaced and the original column name as values."""
+    cols = {replace_with(col.replace(' ', '_').replace(
+        '-', '_')): col for col in df.columns}
+    if all_lower_case:
+        cols = {k.lower(): v for k, v in cols.items()}
+    return omegaconf.OmegaConf.create(cols)
+
+
+def col_isin_df(cols: typing.Union[list, str], df: pd.DataFrame) -> list:
+    """Remove item (column) from passed list if not in DataFrame.
+       Warning is issued for missing items.
+
+       cols can be a comma-separated string of column names.
+    """
+    if isinstance(cols, str):
+        cols = cols.split(',')
+    ret = list()
+    for _var in cols:
+        if _var not in df.columns:
+            logger.warning(
+                f"Desired variable not found: {_var}", stacklevel=0)
+            continue
+        ret.append(_var)
+    return ret
+
+
+def value_counts_with_margins(y: pd.Series) -> pd.DataFrame:
+    """Value counts of Series with proportion as margins."""
+    ret = y.value_counts().to_frame('counts')
+    ret.index.name = y.name
+    ret['prop'] = y.value_counts(normalize=True)
+    return ret
diff --git a/tutorial/.gitignore b/tutorial/.gitignore
@@ -0,0 +1 @@
+alzheimer
diff --git a/tutorial/data/README.md b/tutorial/data/README.md
@@ -0,0 +1,11 @@
+# Alzheimer Data
+
+> Used for logistic regression tutorial `log_reg.ipynb`
+
+Proteome Profiling in Cerebrospinal Fluid Reveals Novel Biomarkers of Alzheimer's Disease
+
+- [PXD016278](https://www.ebi.ac.uk/pride/archive/projects/PXD016278)
+- [publication](https://www.embopress.org/doi/full/10.15252/msb.20199356)
+- [curated data version from omiclearn](https://github.com/MannLabs/OmicLearn/tree/master/omiclearn/data)
+  - download `Alzheimer.xlsx` from repository and process using `data/process_Alzheimer_excel.py`
+
diff --git a/tutorial/data/alzheimer/meta.csv b/tutorial/data/alzheimer/meta.csv
diff --git a/tutorial/data/alzheimer/proteome.csv b/tutorial/data/alzheimer/proteome.csv
diff --git a/tutorial/data/process_Alzheimer_excel.py b/tutorial/data/process_Alzheimer_excel.py
@@ -0,0 +1,66 @@
+# %%
+# Convert the curated Alzheimer Excel sheet into two CSV files
+# (meta data and proteome) stored in the `alzheimer` folder.
+from pathlib import Path
+import pandas as pd
+
+# %%
+# Input file, looked up relative to the current working directory.
+fname_curated_data = 'Alzheimer.xlsx'
+
+# Output folder, resolved against the current working directory.
+FOLDER = Path('alzheimer').absolute()
+FOLDER.mkdir(exist_ok=True, parents=True)
+files_out = dict()  # file stem -> path of every file written below
+
+# %%
+# Load the sheet and relabel the rows as zero-padded sample IDs built from
+# the integer index (Sample_000, Sample_001, ...).
+data = pd.read_excel(fname_curated_data)
+data.index = [f'Sample_{i:03d}' for i in data.index]
+data.index.name = 'Sample ID'
+data.head()
+
+# %%
+# Columns whose name contains an underscore are taken as meta data.
+# NOTE(review): `like='_'` matches the character anywhere in the name -
+# confirm that no protein column contains an underscore.
+meta = data.filter(like='_', axis=1)
+meta.sample(5)
+
+# %% [markdown]
+# # Meta data dump
+# - N=197 samples with complete data for
+#   - age
+#   - gender
+#   - primary biochemical Alzheimer disease classification
+
+# %%
+# Summary restricted to samples with all three meta data columns filled in.
+cols = ['_age at CSF collection', '_gender', '_primary biochemical AD classification']
+meta[cols].dropna().describe(include='all')
+
+# %%
+# Write the meta data (all selected columns, all samples) and record the path.
+fname = FOLDER / 'meta.csv'
+files_out[fname.stem] = fname
+meta.to_csv(fname)
+
+# %% [markdown]
+# # Proteome dump
+# %%
+# Drop the last 11 columns and keep the rest as proteome measurements.
+# NOTE(review): presumably the 11 trailing columns are the meta data; the
+# count is hard-coded - confirm against the Excel file.
+data = data.iloc[:, :-11]
+data.sample(5)
+
+# %%
+# Box plot of the number of non-missing values per column.
+ax = data.notna().sum(axis=0).plot.box()
+
+# %%
+# Write the proteome table and record the path.
+fname = FOLDER / 'proteome.csv'
+files_out[fname.stem] = fname
+data.to_csv(fname)
+
+# %% [markdown]
+# Protein Group - Gene Mapping
+
+# %%
+
+# Column names are split on ';' and the parts collected into one set.
+# NOTE(review): the set is built but not used or saved further down.
+protein_groups = data.columns.to_list()
+proteins_unique = set()
+for pg in protein_groups:
+    proteins_unique |= set(pg.split(';'))
+# proteins_unique
+
+# %%
+# Show the files written by this script.
+files_out
+
+# %%