Skip to content

Commit

Permalink
📝✨ Allow users to download large HeLa protein groups dataset easily
Browse files Browse the repository at this point in the history
  • Loading branch information
Henry Webel committed May 31, 2024
1 parent 83d0aa0 commit 21d7a43
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 13 deletions.
59 changes: 46 additions & 13 deletions project/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,17 @@
## Download development dataset

The large development data set can be obtained from PRIDE:
The large development data sets can be obtained from PRIDE. An example for the protein
groups level data is provided below and as an executable script.

### Download large development dataset
Execute the script to download and save the large HeLa protein groups data for instrument 6070:

```bash
python download_dev_dataset.py
```

This script contains the following code:

```python
import io
Expand All @@ -14,25 +24,48 @@ from pathlib import Path
import pandas as pd
import requests

ftp_folder = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
file = 'pride_metadata.csv'

meta = pd.read_csv(f'{ftp_folder}/{file}', index_col=0)
FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
FILE = 'pride_metadata.csv'
print(f'Fetch metadata: {FTP_FOLDER}/{FILE}')
meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0)
meta.sample(5, random_state=42).sort_index()
idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index
idx_6070

file = 'geneGroups_aggregated.zip'
r = requests.get(f'{ftp_folder}/{file}')
FILE = 'geneGroups_aggregated.zip'
print(f"Fetch archive: {FTP_FOLDER}/{FILE}")
r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900)
with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive:
fname = Path('geneGroups/intensities_wide_selected_N07444_M04547.csv')
with zip_archive.open(fname) as f:
# Print a header, then one archive member per line. The header and the separator
# must be joined explicitly: adjacent string literals are concatenated *before*
# `.join` is called, which would turn the header text into the separator.
print('available files in archive:\n - ' + '\n - '.join(zip_archive.namelist()))
FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv'
print('\nread file:', FNAME)
with zip_archive.open(FNAME) as f:
df = pd.read_csv(f, index_col=0)
fname.parent.mkdir(parents=True, exist_ok=True)

# save protein groups data for instrument 6070
df.loc[idx_6070].to_csv(fname.parent / 'geneGroups_6070.csv')
# dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl
FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long')
FOLDER.mkdir(parents=True, exist_ok=True)
fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv'
df.loc[idx_6070].to_csv(fname)
print(f'saved data to: {fname}')
df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl'))
print(f'saved data to: {fname.with_suffix(".pkl")}')
# save metadata:
fname = FOLDER / 'metadata.csv'
meta.loc[idx_6070].to_csv(fname)
print(f'saved metadata to: {fname}')
```
### Run snakemake workflow

Then you will be able to run the snakemake workflow for the larger
development dataset:

```bash
snakemake --configfile config/single_dev_dataset/proteinGroups/config.yaml -c1 -n
```

The smaller development data set on the protein groups level is also shipped with this
repository and can be found in the [`dev_datasets/HeLa_6070`](dev_datasets/HeLa_6070/) folder.

```bash
snakemake -c1 -n
```
42 changes: 42 additions & 0 deletions project/data/download_dev_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Download the development dataset of HeLa cells from PRIDE.
Instrument: Q_Exactive_HF_X_Orbitrap_6070
Can be adapted to save all instruments or other datasets.
"""
import io
import zipfile
from pathlib import Path

import pandas as pd
import requests

FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'

# Fetch the metadata table; its index is used below to select rows of the
# intensity table.
FILE = 'pride_metadata.csv'
print(f'Fetch metadata: {FTP_FOLDER}/{FILE}')
meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0)
# Index labels of all metadata rows whose `instrument serial number` contains "#6070".
idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index

# Download the zip archive into memory (generous timeout: the archive is large).
FILE = 'geneGroups_aggregated.zip'
print(f"Fetch archive: {FTP_FOLDER}/{FILE}")
r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900)
# Surface HTTP errors directly instead of failing later with a misleading
# `zipfile.BadZipFile` when an error page is handed to ZipFile.
r.raise_for_status()
with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive:
    # Header and separator are joined explicitly: adjacent string literals are
    # concatenated *before* `.join` is called, which would otherwise make the
    # header text the separator between file names.
    print('available files in archive:\n - ' + '\n - '.join(zip_archive.namelist()))
    FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv'
    print('\nread file:', FNAME)
    with zip_archive.open(FNAME) as f:
        df = pd.read_csv(f, index_col=0)

# Select the rows for instrument 6070 once and write them as csv and pickle:
# dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl
FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long')
FOLDER.mkdir(parents=True, exist_ok=True)
df_6070 = df.loc[idx_6070]
fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv'
df_6070.to_csv(fname)
print(f'saved data to: {fname}')
fname_pkl = fname.with_suffix('.pkl')
df_6070.to_pickle(fname_pkl)
print(f'saved data to: {fname_pkl}')
# save metadata:
fname = FOLDER / 'metadata.csv'
meta.loc[idx_6070].to_csv(fname)
print(f'saved metadata to: {fname}')

0 comments on commit 21d7a43

Please sign in to comment.