📝✨ Allow users to download large HeLa protein groups dataset easily

RasmussenLab · May 31, 2024 · 21d7a43 · 21d7a43
1 parent 83d0aa0
commit 21d7a43
Show file tree

Hide file tree

Showing 2 changed files with 88 additions and 13 deletions.
diff --git a/project/data/README.md b/project/data/README.md
@@ -4,7 +4,17 @@
 
 ## Download development dataset
 
-The large development data set can be obtained from PRIDE:
+The large development data sets can be obtained from PRIDE. An example for the protein
+groups level data is provided below and as an executable script.
+
+### Download large development dataset
+Execute the script to download and save the large HeLa protein groups data for instrument 6070:
+
+```bash
+python download_dev_dataset.py
+```
+
+This script contains the following code:
 
 ```python
 import io
@@ -14,25 +24,48 @@ from pathlib import Path
 import pandas as pd
 import requests
 
-ftp_folder = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
-file = 'pride_metadata.csv'
-
-meta = pd.read_csv(f'{ftp_folder}/{file}', index_col=0)
+FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
+FILE = 'pride_metadata.csv'
+print(f'Fetch metadata: {FTP_FOLDER}/{FILE}')
+meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0)
 meta.sample(5, random_state=42).sort_index()
 idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index
-idx_6070
 
-file = 'geneGroups_aggregated.zip'
-r = requests.get(f'{ftp_folder}/{file}')
+FILE = 'geneGroups_aggregated.zip'
+print(f"Fetch archive:  {FTP_FOLDER}/{FILE}")
+r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900)
 with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive:
-    fname = Path('geneGroups/intensities_wide_selected_N07444_M04547.csv')
-    with zip_archive.open(fname) as f:
+    print('available files in archive:\n - ' + '\n - '.join(zip_archive.namelist()))
+    FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv'
+    print('\nread file:', FNAME)
+    with zip_archive.open(FNAME) as f:
         df = pd.read_csv(f, index_col=0)
-    fname.parent.mkdir(parents=True, exist_ok=True)
 
-# save protein groups data for instrument 6070
-df.loc[idx_6070].to_csv(fname.parent / 'geneGroups_6070.csv')
+# dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl
+FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long')
+FOLDER.mkdir(parents=True, exist_ok=True)
+fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv'
+df.loc[idx_6070].to_csv(fname)
+print(f'saved data to: {fname}')
+df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl'))
+print(f'saved data to: {fname.with_suffix(".pkl")}')
+# save metadata:
+fname = FOLDER / 'metadata.csv'
+meta.loc[idx_6070].to_csv(fname)
+print(f'saved metadata to: {fname}')
+```
+### Run snakemake workflow
+
+Then you will be able to run the snakemake workflow for the larger development
+dataset (`-n` only performs a dry run; remove it to actually execute the workflow):
+
+```bash
+snakemake --configfile config/single_dev_dataset/proteinGroups/config.yaml -c1 -n
 ```
 
 The smaller development data set on the protein groups level is also shipped with this
 repository and can be found in the [`dev_datasets/HeLa_6070`](dev_datasets/HeLa_6070/) folder.
+
+```bash
+snakemake -c1 -n
+```
diff --git a/project/data/download_dev_dataset.py b/project/data/download_dev_dataset.py
@@ -0,0 +1,42 @@
+"""Download the development dataset of HeLa cells from PRIDE.
+
+Instrument: Q_Exactive_HF_X_Orbitrap_6070
+
+Can be adapted to save all instruments or other datasets.
+"""
+import io
+import zipfile
+from pathlib import Path
+
+import pandas as pd
+import requests
+
+FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
+FILE = 'pride_metadata.csv'
+print(f'Fetch metadata: {FTP_FOLDER}/{FILE}')
+meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0)
+idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index
+
+FILE = 'geneGroups_aggregated.zip'
+print(f"Fetch archive:  {FTP_FOLDER}/{FILE}")
+r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900)  # 900 s = 15 min
+r.raise_for_status()  # stop on HTTP errors instead of passing an error body to ZipFile
+with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive:
+    print('available files in archive:\n - ' + '\n - '.join(zip_archive.namelist()))
+    FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv'
+    print('\nread file:', FNAME)
+    with zip_archive.open(FNAME) as f:
+        df = pd.read_csv(f, index_col=0)
+
+# dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl
+FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long')
+FOLDER.mkdir(parents=True, exist_ok=True)
+fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv'
+df.loc[idx_6070].to_csv(fname)
+print(f'saved data to: {fname}')
+df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl'))
+print(f'saved data to: {fname.with_suffix(".pkl")}')
+# save metadata:
+fname = FOLDER / 'metadata.csv'
+meta.loc[idx_6070].to_csv(fname)
+print(f'saved metadata to: {fname}')