diff --git a/project/data/README.md b/project/data/README.md
index 75c4be06e..b518521a4 100644
--- a/project/data/README.md
+++ b/project/data/README.md
@@ -4,7 +4,17 @@
 
 ## Download development dataset
 
-The large development data set can be obtained from PRIDE:
+The large development data sets can be obtained from PRIDE. An example for the protein
+groups level data is provided below and as an executable script.
+
+### Download large development dataset
+Execute the script to download and save the large HeLa protein group data for instrument 6070:
+
+```bash
+python download_dev_dataset.py
+```
+
+This script contains the following code:
 
 ```python
 import io
@@ -14,25 +24,49 @@ from pathlib import Path
 import pandas as pd
 import requests
 
-ftp_folder = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
-file = 'pride_metadata.csv'
-
-meta = pd.read_csv(f'{ftp_folder}/{file}', index_col=0)
+FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
+FILE = 'pride_metadata.csv'
+print(f'Fetch metadata: {FTP_FOLDER}/{FILE}')
+meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0)
 meta.sample(5, random_state=42).sort_index()
 idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index
-idx_6070
 
-file = 'geneGroups_aggregated.zip'
-r = requests.get(f'{ftp_folder}/{file}')
+FILE = 'geneGroups_aggregated.zip'
+print(f"Fetch archive: {FTP_FOLDER}/{FILE}")
+r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900)
+r.raise_for_status()  # fail here on an HTTP error instead of parsing an error page as zip
 with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive:
-    fname = Path('geneGroups/intensities_wide_selected_N07444_M04547.csv')
-    with zip_archive.open(fname) as f:
+    print('available files in archive:\n - ' + '\n - '.join(zip_archive.namelist()))
+    FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv'
+    print('\nread file:', FNAME)
+    with zip_archive.open(FNAME) as f:
         df = pd.read_csv(f, index_col=0)
-    fname.parent.mkdir(parents=True, exist_ok=True)
-# save protein groups data for instrument 6070
-df.loc[idx_6070].to_csv(fname.parent / 'geneGroups_6070.csv')
+# save to dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.{csv,pkl}
+FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long')
+FOLDER.mkdir(parents=True, exist_ok=True)
+fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv'
+df.loc[idx_6070].to_csv(fname)
+print(f'saved data to: {fname}')
+df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl'))
+print(f'saved data to: {fname.with_suffix(".pkl")}')
+# save metadata:
+fname = FOLDER / 'metadata.csv'
+meta.loc[idx_6070].to_csv(fname)
+print(f'saved metadata to: {fname}')
+```
+### Run snakemake workflow
+
+Then you will be able to run the snakemake workflow for the larger
+development dataset:
+
+```bash
+snakemake --configfile config/single_dev_dataset/proteinGroups/config.yaml -c1 -n
 ```
 
 The smaller development data set on the protein groups level is also shipped with this
 repository and can be found in the [`dev_datasets/HeLa_6070`](dev_datasets/HeLa_6070/)
 folder.
+
+```bash
+snakemake -c1 -n
+```
diff --git a/project/data/download_dev_dataset.py b/project/data/download_dev_dataset.py
new file mode 100644
index 000000000..dec94bdde
--- /dev/null
+++ b/project/data/download_dev_dataset.py
@@ -0,0 +1,43 @@
+"""Download the development dataset of HeLa cells from PRIDE.
+
+Instrument: Q_Exactive_HF_X_Orbitrap_6070
+
+Can be adapted to save all instruments or other datasets.
+"""
+import io
+import zipfile
+from pathlib import Path
+
+import pandas as pd
+import requests
+
+FTP_FOLDER = 'https://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD042233'
+FILE = 'pride_metadata.csv'
+print(f'Fetch metadata: {FTP_FOLDER}/{FILE}')
+meta = pd.read_csv(f'{FTP_FOLDER}/{FILE}', index_col=0)
+meta.sample(5, random_state=42).sort_index()
+idx_6070 = meta.query('`instrument serial number`.str.contains("#6070")').index
+
+FILE = 'geneGroups_aggregated.zip'
+print(f"Fetch archive: {FTP_FOLDER}/{FILE}")
+r = requests.get(f'{FTP_FOLDER}/{FILE}', timeout=900)
+r.raise_for_status()  # fail here on an HTTP error instead of parsing an error page as zip
+with zipfile.ZipFile(io.BytesIO(r.content), 'r') as zip_archive:
+    print('available files in archive:\n - ' + '\n - '.join(zip_archive.namelist()))
+    FNAME = 'geneGroups/intensities_wide_selected_N07444_M04547.csv'
+    print('\nread file:', FNAME)
+    with zip_archive.open(FNAME) as f:
+        df = pd.read_csv(f, index_col=0)
+
+# save to dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.{csv,pkl}
+FOLDER = Path('dev_datasets/df_intensities_proteinGroups_long')
+FOLDER.mkdir(parents=True, exist_ok=True)
+fname = FOLDER / 'Q_Exactive_HF_X_Orbitrap_6070.csv'
+df.loc[idx_6070].to_csv(fname)
+print(f'saved data to: {fname}')
+df.loc[idx_6070].to_pickle(fname.with_suffix('.pkl'))
+print(f'saved data to: {fname.with_suffix(".pkl")}')
+# save metadata:
+fname = FOLDER / 'metadata.csv'
+meta.loc[idx_6070].to_csv(fname)
+print(f'saved metadata to: {fname}')