✨ dump and rename peptide files with PEP score

create a new intermediate dump of 7,444 HeLa runs
RasmussenLab · Sep 13, 2023 · e2e7521 · e2e7521
1 parent d9d6f1a
commit e2e7521
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 20 deletions.
diff --git a/project/erda_02_mq_count_features.ipynb b/project/erda_02_mq_count_features.ipynb
@@ -10,7 +10,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "import os\n",
@@ -53,7 +55,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml')\n",
@@ -73,32 +77,47 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n",
     "df_ids = pd.read_csv(fn_id_old_new)\n",
     "df_ids"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Select files and create list of folders"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']}\n",
     "# folders_dict = {p.stem : p.parent / p.stem for p in folders_dict}\n",
-    "# folders_dict"
+    "# folders_dict\n",
+    "folders = [Path(folder_path) for folder_path in folders_dict.values()]\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "OVERWRITE = False\n",
+    "OVERWRITE = True\n",
     "\n",
     "from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n",
     "\n",
@@ -115,7 +134,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "import random\n",
@@ -129,7 +150,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "use_columns = mq_output.peptides.columns[33:45]\n",
@@ -140,7 +163,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "df_json_string = df.to_json(orient='index', indent=4)\n",
@@ -150,7 +175,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "df_csv = df.to_csv()\n",
@@ -160,7 +187,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "pd.read_json(df_json_string, orient='index')"
@@ -169,7 +198,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands"
@@ -185,7 +216,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "peptide_counter = PeptideCounter(FNAME_C_PEPTIDES, overwrite=OVERWRITE)\n",
@@ -195,7 +228,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "if peptide_counter.loaded:\n",
@@ -222,10 +257,37 @@
    "outputs": [],
    "source": [
     "%%time\n",
-    "# folders = [Path(folder_path) for folder_path in folders_dict.values()]\n",
     "c = peptide_counter.sum_over_files(folders=folders)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "for k, v in peptide_counter.dumps.items():\n",
+    "    old_name = v\n",
+    "    new_name = v.parent / (df_ids.loc[k, 'new_sample_id'] + '.csv')\n",
+    "    try:\n",
+    "        os.rename(old_name, new_name)\n",
+    "    except FileNotFoundError:\n",
+    "        logging.warning(f\"File not found: {old_name}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "new_name"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -238,7 +300,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# To share as python file\n",
@@ -761,7 +825,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.15"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

diff --git a/project/erda_02_mq_count_features.py b/project/erda_02_mq_count_features.py
@@ -5,7 +5,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.15.1
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python
@@ -67,13 +67,19 @@
 df_ids = pd.read_csv(fn_id_old_new)
 df_ids
 
+# %% [markdown]
+# Select files and create list of folders
+
 # %%
 folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']}
 # folders_dict = {p.stem : p.parent / p.stem for p in folders_dict}
 # folders_dict
+folders = [Path(folder_path) for folder_path in folders_dict.values()]
+
 
 # %%
 OVERWRITE = False
+OVERWRITE = True
 
 from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES
 
@@ -129,9 +135,20 @@
 
 # %%
 # %%time
-# folders = [Path(folder_path) for folder_path in folders_dict.values()]
 c = peptide_counter.sum_over_files(folders=folders)
 
+# %%
+for k, v in peptide_counter.dumps.items():
+    old_name = v
+    new_name = v.parent / (df_ids.loc[k, 'new_sample_id'] + '.csv')
+    try:
+        os.rename(old_name, new_name)
+    except FileNotFoundError:
+        logging.warning(f"File not found: {old_name}")
+
+# %%
+new_name
+
 # %%
 c.most_common(10) # peptide_counter.counter.most_common(10)
 

diff --git a/vaep/io/data_objects.py b/vaep/io/data_objects.py
@@ -399,7 +399,7 @@ def __call__(self, folders,
 ### aggregated peptides
 
 # # check df for redundant information (same feature value for all entries)
-usecols = mq.COLS_ + ['Potential contaminant', mq.mq_col.SEQUENCE]
+usecols = mq.COLS_ + ['Potential contaminant', mq.mq_col.SEQUENCE, 'PEP']
 
 
 def count_peptides(folders: List[Path], dump=True,
@@ -426,6 +426,7 @@ def count_peptides(folders: List[Path], dump=True,
 d_dtypes_training_sample = {
     'Sequence': pd.StringDtype(),
     'Proteins': pd.StringDtype(),
+    'PEP': pd.Float32Dtype(),
     'Leading razor protein': pd.StringDtype(),
     'Gene names': pd.StringDtype(),
     'Intensity': pd.Int64Dtype()