Skip to content

Commit

Permalink
✨ dump and rename peptide files with PEP score
Browse files Browse the repository at this point in the history
create a new intermediate dump of 7,444 HeLa runs
  • Loading branch information
Henry Webel committed Sep 13, 2023
1 parent d9d6f1a commit e2e7521
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 20 deletions.
98 changes: 81 additions & 17 deletions project/erda_02_mq_count_features.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
Expand Down Expand Up @@ -53,7 +55,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml')\n",
Expand All @@ -73,32 +77,47 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n",
"df_ids = pd.read_csv(fn_id_old_new)\n",
"df_ids"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Select files and create list of folders"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']}\n",
"# folders_dict = {p.stem : p.parent / p.stem for p in folders_dict}\n",
"# folders_dict"
"# folders_dict\n",
"folders = [Path(folder_path) for folder_path in folders_dict.values()]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"OVERWRITE = False\n",
"OVERWRITE = True\n",
"\n",
"from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n",
"\n",
Expand All @@ -115,7 +134,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import random\n",
Expand All @@ -129,7 +150,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"use_columns = mq_output.peptides.columns[33:45]\n",
Expand All @@ -140,7 +163,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"df_json_string = df.to_json(orient='index', indent=4)\n",
Expand All @@ -150,7 +175,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"df_csv = df.to_csv()\n",
Expand All @@ -160,7 +187,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"pd.read_json(df_json_string, orient='index')"
Expand All @@ -169,7 +198,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands"
Expand All @@ -185,7 +216,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"peptide_counter = PeptideCounter(FNAME_C_PEPTIDES, overwrite=OVERWRITE)\n",
Expand All @@ -195,7 +228,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"if peptide_counter.loaded:\n",
Expand All @@ -222,10 +257,37 @@
"outputs": [],
"source": [
"%%time\n",
"# folders = [Path(folder_path) for folder_path in folders_dict.values()]\n",
"c = peptide_counter.sum_over_files(folders=folders)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"for k, v in peptide_counter.dumps.items():\n",
" old_name = v\n",
" new_name = v.parent / (df_ids.loc[k, 'new_sample_id'] + '.csv')\n",
" try:\n",
" os.rename(old_name, new_name)\n",
" except FileNotFoundError:\n",
" logging.warning(f\"File not found: {old_name}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"new_name"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -238,7 +300,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# To share as python file\n",
Expand Down Expand Up @@ -761,7 +825,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
"version": "3.10.12"
}
},
"nbformat": 4,
Expand Down
21 changes: 19 additions & 2 deletions project/erda_02_mq_count_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.14.5
# jupytext_version: 1.15.1
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
Expand Down Expand Up @@ -67,13 +67,19 @@
df_ids = pd.read_csv(fn_id_old_new)
df_ids

# %% [markdown]
# Select files and create list of folders

# %%
folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']}
# folders_dict = {p.stem : p.parent / p.stem for p in folders_dict}
# folders_dict
folders = [Path(folder_path) for folder_path in folders_dict.values()]


# %%
OVERWRITE = False
OVERWRITE = True

from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES

Expand Down Expand Up @@ -129,9 +135,20 @@

# %%
# %%time
# folders = [Path(folder_path) for folder_path in folders_dict.values()]
c = peptide_counter.sum_over_files(folders=folders)

# %%
# Rename every dumped per-sample peptide file to its new sample ID.
# `peptide_counter.dumps` is iterated as a mapping: key -> path of the dump.
# NOTE(review): in the visible code `df_ids` is read with `pd.read_csv`
# without an index column, so `.loc[k, ...]` only resolves if the keys of
# `dumps` match the index of `df_ids` -- confirm against the hidden cells.
for k, v in peptide_counter.dumps.items():
    old_name = v
    # Keep the dump in its current folder; only the file name changes.
    new_name = v.parent / (df_ids.loc[k, 'new_sample_id'] + '.csv')
    try:
        os.rename(old_name, new_name)
    except FileNotFoundError:
        # Best effort: a missing dump is reported and the loop continues.
        logging.warning(f"File not found: {old_name}")

# %%
new_name

# %%
c.most_common(10) # peptide_counter.counter.most_common(10)

Expand Down
3 changes: 2 additions & 1 deletion vaep/io/data_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ def __call__(self, folders,
### aggregated peptides

# # check df for redundant information (same feature value for all entries)
usecols = mq.COLS_ + ['Potential contaminant', mq.mq_col.SEQUENCE]
usecols = mq.COLS_ + ['Potential contaminant', mq.mq_col.SEQUENCE, 'PEP']


def count_peptides(folders: List[Path], dump=True,
Expand All @@ -426,6 +426,7 @@ def count_peptides(folders: List[Path], dump=True,
d_dtypes_training_sample = {
'Sequence': pd.StringDtype(),
'Proteins': pd.StringDtype(),
'PEP': pd.Float32Dtype(),
'Leading razor protein': pd.StringDtype(),
'Gene names': pd.StringDtype(),
'Intensity': pd.Int64Dtype()
Expand Down

0 comments on commit e2e7521

Please sign in to comment.