-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- tutorial based on 3_log_reg in cirrhosis_project: https://github.com/RasmussenLab/hela_qc_mnt_data - use alzheimer data from OmicLearn https://github.com/MannLabs/OmicLearn - start moving further functionality still in cirrhosis_project to stand-alone package. ToDo: Test and adapt for colab
- Loading branch information
Henry
committed
Oct 30, 2023
1 parent
ffddd4a
commit eddb192
Showing
9 changed files
with
2,995 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import logging | ||
import typing | ||
|
||
import pandas as pd | ||
import omegaconf | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def replace_with(string_key: str, replace: str = "()/", replace_with: str = '') -> str:
    """Substitute every symbol listed in ``replace`` within ``string_key``.

    The symbols are handled one at a time, in the order they appear in
    ``replace``; each occurrence is swapped for ``replace_with``. With the
    defaults the characters ``(``, ``)`` and ``/`` are removed.

    Returns the resulting string; ``string_key`` itself is not modified.
    """
    result = string_key
    for character in replace:
        result = result.replace(character, replace_with)
    return result
|
||
|
||
def get_colums_accessor(df: pd.DataFrame, all_lower_case=False) -> omegaconf.OmegaConf:
    """Build an attribute-accessible mapping from cleaned to original column names.

    Each column name of ``df`` is turned into a key by replacing spaces and
    hyphens with underscores and passing the result through ``replace_with``
    (using its defaults). The value stored under a key is the original column
    name. If ``all_lower_case`` is true, the keys are lower-cased afterwards.

    The mapping is handed to ``omegaconf.OmegaConf.create`` and that object is
    returned.
    """
    accessor = {}
    for original_name in df.columns:
        key = original_name.replace(' ', '_').replace('-', '_')
        accessor[replace_with(key)] = original_name

    if all_lower_case:
        # Lower-casing is a separate pass over the finished mapping so that
        # colliding keys are resolved in the same order as before.
        lowered = {}
        for key, original_name in accessor.items():
            lowered[key.lower()] = original_name
        accessor = lowered

    return omegaconf.OmegaConf.create(accessor)
|
||
|
||
def col_isin_df(cols: typing.Union[list, str], df: pd.DataFrame) -> list:
    """Return the requested columns that are present in the DataFrame.

    Parameters
    ----------
    cols:
        Column names to look up. Either a list of names or a single string of
        comma-separated names. The string is split on ``,`` only; surrounding
        white space is kept as part of each name.
    df:
        DataFrame whose ``columns`` are checked for membership.

    Returns
    -------
    list
        The names from ``cols`` found in ``df.columns``, in the order given.
        A warning is logged for every name that is missing.
    """
    if isinstance(cols, str):
        cols = cols.split(',')
    ret = []
    for _var in cols:
        if _var not in df.columns:
            # No explicit ``stacklevel``: the documented default (1) attributes
            # the record to this function. The former ``stacklevel=0`` is
            # outside the documented range and, on recent Python versions,
            # makes the record point into the logging module itself.
            logger.warning("Desired variable not found: %s", _var)
            continue
        ret.append(_var)
    return ret
|
||
|
||
def value_counts_with_margins(y: pd.Series) -> pd.DataFrame:
    """Tabulate the values of ``y`` with absolute and relative frequencies.

    The returned DataFrame is indexed by the values found in ``y`` (the index
    is named after ``y.name``) and has two columns: ``counts`` with the number
    of occurrences and ``prop`` with the proportion of each value.
    """
    absolute = y.value_counts()
    relative = y.value_counts(normalize=True)

    table = absolute.to_frame('counts')
    table.index.name = y.name
    # Column assignment aligns the proportions on the index of the counts.
    table['prop'] = relative
    return table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
alzheimer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Alzheimer Data | ||
|
||
> Used for logistic regression tutorial `log_reg.ipynb` | ||
Proteome Profiling in Cerebrospinal Fluid Reveals Novel Biomarkers of Alzheimer's Disease | ||
|
||
- [PXD016278](https://www.ebi.ac.uk/pride/archive/projects/PXD016278) | ||
- [publication](https://www.embopress.org/doi/full/10.15252/msb.20199356) | ||
- [curated data version from omiclearn](https://github.com/MannLabs/OmicLearn/tree/master/omiclearn/data) | ||
- download `Alzheimer.xlsx` from the repository and process it using `data/prepare_alzheimer_excel.py` | ||
|
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# %%
# Read the curated Alzheimer Excel sheet and dump its meta data and its
# proteome measurements as two separate CSV files.
from pathlib import Path
import pandas as pd

# %%
# Input file; a relative path, so it is resolved against the current working
# directory.
fname_curated_data = 'Alzheimer.xlsx'

# Output folder (created if it does not exist) and a registry mapping the stem
# of each written file to its path.
FOLDER = Path('alzheimer').absolute()
FOLDER.mkdir(exist_ok=True, parents=True)
files_out = dict()

# %%
data = pd.read_excel(fname_curated_data)
# Replace the row labels by zero-padded sample identifiers built from the
# existing index values (the ``03d`` format requires them to be integers).
data.index = [f'Sample_{i:03d}' for i in data.index]
data.index.name = 'Sample ID'
data.head()

# %%
# Keep the columns whose label contains an underscore.
# NOTE(review): presumably these are exactly the meta data columns - confirm
# against the Excel sheet.
meta = data.filter(like='_', axis=1)
meta.sample(5)

# %% [markdown]
# # Meta data dump
# - N=197 samples with complete data for
#   - age
#   - gender
#   - primary biochemical Alzheimer disease classification

# %%
# Summary statistics restricted to the rows without missing values in the
# three selected meta data columns.
cols = ['_age at CSF collection', '_gender', '_primary biochemical AD classification']
meta[cols].dropna().describe(include='all')

# %%
fname = FOLDER / 'meta.csv'
files_out[fname.stem] = fname
meta.to_csv(fname)

# %% [markdown]
# # Proteome dump
# %%
# Drop the last 11 columns.
# NOTE(review): presumably these are the meta data columns selected above -
# verify that the count matches the sheet.
data = data.iloc[:, :-11]
data.sample(5)

# %%
# Box plot of the number of non-missing values per column.
ax = data.notna().sum(axis=0).plot.box()

# %%
fname = FOLDER / 'proteome.csv'
files_out[fname.stem] = fname
data.to_csv(fname)

# %% [markdown]
# Protein Group - Gene Mapping

# %%

# Collect the unique identifiers over all column labels; a label may hold
# several identifiers separated by ';'.
protein_groups = data.columns.to_list()
proteins_unique = set()
for pg in protein_groups:
    proteins_unique |= set(pg.split(';'))
# proteins_unique

# %%
# Show which files were written.
files_out

# %%
Oops, something went wrong.