Skip to content

Commit

Permalink
🎨 format notebooks
Browse files Browse the repository at this point in the history
- just revisited to check some details
  • Loading branch information
Henry committed Aug 31, 2023
1 parent f789d2c commit e7350fc
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 75 deletions.
6 changes: 4 additions & 2 deletions project/00_0_0_lftp_upload_commands.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
"execution_count": null,
"id": "9869ac5e-fab3-4c66-a32c-48ae4fadc0a3",
"metadata": {
"lines_to_next_cell": 2,
"tags": []
},
"outputs": [],
Expand All @@ -100,8 +101,9 @@
"df_meta[date_col] = pd.to_datetime(\n",
" df_meta[date_col])\n",
"df_meta.sort_values(date_col, inplace=True)\n",
"df_meta\n",
"msg = f\"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser.\" "
"msg = f\"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser.\"\n",
"print(msg)\n",
"df_meta"
]
},
{
Expand Down
6 changes: 4 additions & 2 deletions project/00_0_0_lftp_upload_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.14.5
# jupytext_version: 1.15.0
# kernelspec:
# display_name: Python 3
# language: python
Expand Down Expand Up @@ -65,8 +65,10 @@ def rename(fname, new_sample_id, new_folder=None, ext=None):
df_meta[date_col] = pd.to_datetime(
df_meta[date_col])
df_meta.sort_values(date_col, inplace=True)
msg = f"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser."
print(msg)
df_meta
msg = f"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser."


# %%
meta_stats = df_meta.describe(include='all', datetime_is_numeric=True)
Expand Down
86 changes: 50 additions & 36 deletions project/00_0_hela_metadata_rawfiles.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@
},
"outputs": [],
"source": [
"fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n",
"fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n",
"# outputs\n",
"fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)\n",
"fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n",
"fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)"
"fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)\n",
"fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n",
"fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)"
]
},
{
Expand All @@ -78,8 +78,18 @@
"df_meta_rawfiles[date_col] = pd.to_datetime(\n",
" df_meta_rawfiles[date_col])\n",
"df_meta_rawfiles.sort_values(date_col, inplace=True)\n",
"df_meta_rawfiles\n",
"msg = f\"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser.\" "
"df_meta_rawfiles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32b42511",
"metadata": {},
"outputs": [],
"source": [
"msg = f\"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser.\"\n",
"print(msg)"
]
},
{
Expand Down Expand Up @@ -108,7 +118,7 @@
"metadata": {},
"outputs": [],
"source": [
"meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T"
"meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T"
]
},
{
Expand All @@ -118,7 +128,8 @@
"metadata": {},
"outputs": [],
"source": [
"df_meta_rawfiles_columns = df_meta_rawfiles.columns # needs to go to Config which is not overwriteable by attribute selection\n",
"# needs to go to Config which is not overwriteable by attribute selection\n",
"df_meta_rawfiles_columns = df_meta_rawfiles.columns\n",
"meta_raw_names = df_meta_rawfiles.columns.droplevel()\n",
"assert meta_raw_names.is_unique\n",
"df_meta_rawfiles.columns = meta_raw_names\n",
Expand All @@ -133,24 +144,24 @@
"outputs": [],
"source": [
"meta_raw_selected = [\n",
" 'Content Creation Date', \n",
" 'Thermo Scientific instrument model',\n",
" 'instrument serial number',\n",
" 'Software Version', \n",
" 'Number of MS1 spectra',\n",
" 'Number of MS2 spectra', \n",
" 'Number of scans',\n",
" 'MS max charge',\n",
" 'MS max RT',\n",
" 'MS min MZ',\n",
" 'MS max MZ',\n",
" 'MS scan range', \n",
" 'mass resolution',\n",
" 'Retention time range',\n",
" 'Mz range',\n",
" 'beam-type collision-induced dissociation', \n",
" 'injection volume setting',\n",
" 'dilution factor',\n",
" 'Content Creation Date',\n",
" 'Thermo Scientific instrument model',\n",
" 'instrument serial number',\n",
" 'Software Version',\n",
" 'Number of MS1 spectra',\n",
" 'Number of MS2 spectra',\n",
" 'Number of scans',\n",
" 'MS max charge',\n",
" 'MS max RT',\n",
" 'MS min MZ',\n",
" 'MS max MZ',\n",
" 'MS scan range',\n",
" 'mass resolution',\n",
" 'Retention time range',\n",
" 'Mz range',\n",
" 'beam-type collision-induced dissociation',\n",
" 'injection volume setting',\n",
" 'dilution factor',\n",
"]\n",
"df_meta_rawfiles[meta_raw_selected].describe(percentiles=np.linspace(0.05, 0.95, 10))"
]
Expand Down Expand Up @@ -183,7 +194,9 @@
"metadata": {},
"outputs": [],
"source": [
"MetaRawSettings = namedtuple('MetaRawSettings', 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor') \n",
"MetaRawSettings = namedtuple(\n",
" 'MetaRawSettings',\n",
" 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor')\n",
"meta_raw_settings = [\n",
" 'Thermo Scientific instrument model',\n",
" 'instrument attribute',\n",
Expand Down Expand Up @@ -219,8 +232,7 @@
"view without `MS max charge`:\n",
" - software can be updated\n",
" - variation by `injection volume setting` and instrument over time\n",
" - missing `dilution factor`\n",
" "
" - missing `dilution factor`\n"
]
},
{
Expand All @@ -231,7 +243,9 @@
"outputs": [],
"source": [
"to_drop = ['MS max charge']\n",
"# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=False) # index gives first example with this combination\n",
"# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop,\n",
"# axis=1).drop_duplicates(ignore_index=False) # index gives first example\n",
"# with this combination\n",
"df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=True)"
]
},
Expand All @@ -250,7 +264,8 @@
"metadata": {},
"outputs": [],
"source": [
"df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[meta_raw_settings.ms_model].count().sort_values().tail(10)"
"df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[\n",
" meta_raw_settings.ms_model].count().sort_values().tail(10)"
]
},
{
Expand All @@ -273,8 +288,7 @@
"grouping = df_meta_rawfiles.groupby(list(meta_raw_settings[:3]))\n",
"instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()\n",
"msg += (f\" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the instrument name, attributs and serial number)\"\n",
" f\", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.\"\n",
" )\n",
" f\", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.\")\n",
"instrument_counts"
]
},
Expand Down Expand Up @@ -375,9 +389,9 @@
"grouping = df_meta_rawfiles.loc[files_selected['files']].groupby(list(meta_raw_settings[:3]))\n",
"instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()\n",
"N = 500\n",
"msg += (f\" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs\"\n",
" f\", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.\"\n",
" )\n",
"msg += (\n",
" f\" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs\"\n",
" f\", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.\")\n",
"instrument_counts.to_csv('data/files_selected_per_instrument_counts.csv')\n",
"instrument_counts.to_frame('No. samples')"
]
Expand Down
78 changes: 43 additions & 35 deletions project/00_0_hela_metadata_rawfiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.14.5
# jupytext_version: 1.15.0
# kernelspec:
# display_name: Python 3
# language: python
Expand All @@ -32,11 +32,11 @@
# ## Arguments

# %% tags=["parameters"]
fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow
# outputs
fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)
fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides
fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)
fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)
fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides
fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)

# %% [markdown]
# ### Machine metadata
Expand All @@ -50,7 +50,10 @@
df_meta_rawfiles[date_col])
df_meta_rawfiles.sort_values(date_col, inplace=True)
df_meta_rawfiles
msg = f"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser."

# %%
msg = f"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser."
print(msg)

# %%
meta_stats = df_meta_rawfiles.describe(include='all', datetime_is_numeric=True)
Expand All @@ -60,35 +63,36 @@
# subset with variation

# %%
meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T
meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T

# %%
df_meta_rawfiles_columns = df_meta_rawfiles.columns # needs to go to Config which is not overwriteable by attribute selection
# needs to go to Config which is not overwritable by attribute selection
df_meta_rawfiles_columns = df_meta_rawfiles.columns
meta_raw_names = df_meta_rawfiles.columns.droplevel()
assert meta_raw_names.is_unique
df_meta_rawfiles.columns = meta_raw_names
df_meta_rawfiles

# %%
meta_raw_selected = [
'Content Creation Date',
'Thermo Scientific instrument model',
'instrument serial number',
'Software Version',
'Number of MS1 spectra',
'Number of MS2 spectra',
'Number of scans',
'MS max charge',
'MS max RT',
'MS min MZ',
'MS max MZ',
'MS scan range',
'mass resolution',
'Retention time range',
'Mz range',
'beam-type collision-induced dissociation',
'injection volume setting',
'dilution factor',
'Content Creation Date',
'Thermo Scientific instrument model',
'instrument serial number',
'Software Version',
'Number of MS1 spectra',
'Number of MS2 spectra',
'Number of scans',
'MS max charge',
'MS max RT',
'MS min MZ',
'MS max MZ',
'MS scan range',
'mass resolution',
'Retention time range',
'Mz range',
'beam-type collision-induced dissociation',
'injection volume setting',
'dilution factor',
]
df_meta_rawfiles[meta_raw_selected].describe(percentiles=np.linspace(0.05, 0.95, 10))

Expand All @@ -104,7 +108,9 @@
# - quite some variation due to `MS max charge`: Is it a parameter?

# %%
MetaRawSettings = namedtuple('MetaRawSettings', 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor')
MetaRawSettings = namedtuple(
'MetaRawSettings',
'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor')
meta_raw_settings = [
'Thermo Scientific instrument model',
'instrument attribute',
Expand All @@ -129,18 +135,21 @@
# - software can be updated
# - variation by `injection volume setting` and instrument over time
# - missing `dilution factor`
#
#

# %%
to_drop = ['MS max charge']
# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=False) # index gives first example with this combination
# Alternative: with ignore_index=False the index gives the first example with this combination:
# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=False)
df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=True)

# %% [markdown]
# Relatively big samples for different machines of the same kind running with the same firmware:

# %%
df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[meta_raw_settings.ms_model].count().sort_values().tail(10)
df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[
meta_raw_settings.ms_model].count().sort_values().tail(10)

# %% [markdown]
# Ignoring instrument software
Expand All @@ -149,8 +158,7 @@
grouping = df_meta_rawfiles.groupby(list(meta_raw_settings[:3]))
instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()
msg += (f" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the instrument name, attributs and serial number)"
f", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements."
)
f", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.")
instrument_counts

# %%
Expand Down Expand Up @@ -194,9 +202,9 @@
grouping = df_meta_rawfiles.loc[files_selected['files']].groupby(list(meta_raw_settings[:3]))
instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()
N = 500
msg += (f" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs"
f", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them."
)
msg += (
f" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs"
f", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.")
instrument_counts.to_csv('data/files_selected_per_instrument_counts.csv')
instrument_counts.to_frame('No. samples')

Expand Down

0 comments on commit e7350fc

Please sign in to comment.