🎨 format notebooks

- just revisited to check some details
RasmussenLab · Aug 31, 2023 · e7350fc · e7350fc
1 parent f789d2c
commit e7350fc
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 75 deletions.
diff --git a/project/00_0_0_lftp_upload_commands.ipynb b/project/00_0_0_lftp_upload_commands.ipynb
@@ -91,6 +91,7 @@
    "execution_count": null,
    "id": "9869ac5e-fab3-4c66-a32c-48ae4fadc0a3",
    "metadata": {
+    "lines_to_next_cell": 2,
     "tags": []
    },
    "outputs": [],
@@ -100,8 +101,9 @@
     "df_meta[date_col] = pd.to_datetime(\n",
     "    df_meta[date_col])\n",
     "df_meta.sort_values(date_col, inplace=True)\n",
-    "df_meta\n",
-    "msg = f\"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser.\" "
+    "msg = f\"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser.\"\n",
+    "print(msg)\n",
+    "df_meta"
    ]
   },
   {

diff --git a/project/00_0_0_lftp_upload_commands.py b/project/00_0_0_lftp_upload_commands.py
@@ -5,7 +5,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.15.0
 #   kernelspec:
 #     display_name: Python 3
 #     language: python
@@ -65,8 +65,10 @@ def rename(fname, new_sample_id, new_folder=None, ext=None):
 df_meta[date_col] = pd.to_datetime(
     df_meta[date_col])
 df_meta.sort_values(date_col, inplace=True)
+msg = f"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser."
+print(msg)
 df_meta
-msg = f"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser." 
+
 
 # %%
 meta_stats = df_meta.describe(include='all', datetime_is_numeric=True)

diff --git a/project/00_0_hela_metadata_rawfiles.ipynb b/project/00_0_hela_metadata_rawfiles.ipynb
@@ -47,11 +47,11 @@
    },
    "outputs": [],
    "source": [
-    "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n",
+    "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv'  # Machine parsed metadata from rawfile workflow\n",
     "# outputs\n",
-    "fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)\n",
-    "fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n",
-    "fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)"
+    "fn_files_per_instrument: str = 'data/files_per_instrument.yaml'  # All parsed raw files nested by instrument (model, attribute, serial number)\n",
+    "fn_files_selected: str = 'data/samples_selected.yaml'  # selected files based on threshold of identified peptides\n",
+    "fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml'  # Selected parsed raw files nested by instrument (model, attribute, serial number)"
    ]
   },
   {
@@ -78,8 +78,18 @@
     "df_meta_rawfiles[date_col] = pd.to_datetime(\n",
     "    df_meta_rawfiles[date_col])\n",
     "df_meta_rawfiles.sort_values(date_col, inplace=True)\n",
-    "df_meta_rawfiles\n",
-    "msg = f\"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser.\" "
+    "df_meta_rawfiles"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32b42511",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "msg = f\"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser.\"\n",
+    "print(msg)"
    ]
   },
   {
@@ -108,7 +118,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "meta_stats.loc[:, (meta_stats.loc['unique'] > 1) |  (meta_stats.loc['std'] > 0.1)].T"
+    "meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T"
    ]
   },
   {
@@ -118,7 +128,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_meta_rawfiles_columns = df_meta_rawfiles.columns # needs to go to Config which is not overwriteable by attribute selection\n",
+    "# needs to go to Config which is not overwritable by attribute selection\n",
+    "df_meta_rawfiles_columns = df_meta_rawfiles.columns\n",
     "meta_raw_names = df_meta_rawfiles.columns.droplevel()\n",
     "assert meta_raw_names.is_unique\n",
     "df_meta_rawfiles.columns = meta_raw_names\n",
@@ -133,24 +144,24 @@
    "outputs": [],
    "source": [
     "meta_raw_selected = [\n",
-    " 'Content Creation Date', \n",
-    " 'Thermo Scientific instrument model',\n",
-    " 'instrument serial number',\n",
-    " 'Software Version', \n",
-    " 'Number of MS1 spectra',\n",
-    " 'Number of MS2 spectra', \n",
-    " 'Number of scans',\n",
-    " 'MS max charge',\n",
-    " 'MS max RT',\n",
-    " 'MS min MZ',\n",
-    " 'MS max MZ',\n",
-    " 'MS scan range', \n",
-    " 'mass resolution',\n",
-    " 'Retention time range',\n",
-    " 'Mz range',\n",
-    " 'beam-type collision-induced dissociation', \n",
-    " 'injection volume setting',\n",
-    " 'dilution factor',\n",
+    "    'Content Creation Date',\n",
+    "    'Thermo Scientific instrument model',\n",
+    "    'instrument serial number',\n",
+    "    'Software Version',\n",
+    "    'Number of MS1 spectra',\n",
+    "    'Number of MS2 spectra',\n",
+    "    'Number of scans',\n",
+    "    'MS max charge',\n",
+    "    'MS max RT',\n",
+    "    'MS min MZ',\n",
+    "    'MS max MZ',\n",
+    "    'MS scan range',\n",
+    "    'mass resolution',\n",
+    "    'Retention time range',\n",
+    "    'Mz range',\n",
+    "    'beam-type collision-induced dissociation',\n",
+    "    'injection volume setting',\n",
+    "    'dilution factor',\n",
     "]\n",
     "df_meta_rawfiles[meta_raw_selected].describe(percentiles=np.linspace(0.05, 0.95, 10))"
    ]
@@ -183,7 +194,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "MetaRawSettings = namedtuple('MetaRawSettings', 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor') \n",
+    "MetaRawSettings = namedtuple(\n",
+    "    'MetaRawSettings',\n",
+    "    'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor')\n",
     "meta_raw_settings = [\n",
     "    'Thermo Scientific instrument model',\n",
     "    'instrument attribute',\n",
@@ -219,8 +232,7 @@
     "view without `MS max charge`:\n",
     "  - software can be updated\n",
     "  - variation by `injection volume setting` and instrument over time\n",
-    "  - missing `dilution factor`\n",
-    "  "
+    "  - missing `dilution factor`\n"
    ]
   },
   {
@@ -231,7 +243,9 @@
    "outputs": [],
    "source": [
     "to_drop = ['MS max charge']\n",
-    "# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=False) # index gives first example with this combination\n",
+    "# Variant keeping the original index (it gives the first example with this combination):\n",
+    "# df_meta_rawfiles[list(meta_raw_settings)].drop(\n",
+    "#     to_drop, axis=1).drop_duplicates(ignore_index=False)\n",
     "df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=True)"
    ]
   },
@@ -250,7 +264,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[meta_raw_settings.ms_model].count().sort_values().tail(10)"
+    "df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[\n",
+    "    meta_raw_settings.ms_model].count().sort_values().tail(10)"
    ]
   },
   {
@@ -273,8 +288,7 @@
     "grouping = df_meta_rawfiles.groupby(list(meta_raw_settings[:3]))\n",
     "instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()\n",
     "msg += (f\" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the instrument name, attributs and serial number)\"\n",
-    "        f\", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.\"\n",
-    "       )\n",
+    "        f\", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.\")\n",
     "instrument_counts"
    ]
   },
@@ -375,9 +389,9 @@
     "grouping = df_meta_rawfiles.loc[files_selected['files']].groupby(list(meta_raw_settings[:3]))\n",
     "instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()\n",
     "N = 500\n",
-    "msg += (f\" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs\"\n",
-    "        f\", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.\"\n",
-    "       )\n",
+    "msg += (\n",
+    "    f\" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs\"\n",
+    "    f\", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.\")\n",
     "instrument_counts.to_csv('data/files_selected_per_instrument_counts.csv')\n",
     "instrument_counts.to_frame('No. samples')"
    ]

diff --git a/project/00_0_hela_metadata_rawfiles.py b/project/00_0_hela_metadata_rawfiles.py
@@ -5,7 +5,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.15.0
 #   kernelspec:
 #     display_name: Python 3
 #     language: python
@@ -32,11 +32,11 @@
 # ## Arguments
 
 # %% tags=["parameters"]
-fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow
+fn_rawfile_metadata: str = 'data/rawfile_metadata.csv'  # Machine parsed metadata from rawfile workflow
 # outputs
-fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)
-fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides
-fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)
+fn_files_per_instrument: str = 'data/files_per_instrument.yaml'  # All parsed raw files nested by instrument (model, attribute, serial number)
+fn_files_selected: str = 'data/samples_selected.yaml'  # selected files based on threshold of identified peptides
+fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml'  # Selected parsed raw files nested by instrument (model, attribute, serial number)
 
 # %% [markdown]
 # ### Machine metadata
@@ -50,7 +50,10 @@
     df_meta_rawfiles[date_col])
 df_meta_rawfiles.sort_values(date_col, inplace=True)
 df_meta_rawfiles
-msg = f"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser." 
+
+# %%
+msg = f"A total of {len(df_meta_rawfiles)} raw files could be read using the ThermoFisherRawFileParser."
+print(msg)
 
 # %%
 meta_stats = df_meta_rawfiles.describe(include='all', datetime_is_numeric=True)
@@ -60,35 +63,36 @@
 # subset with variation
 
 # %%
-meta_stats.loc[:, (meta_stats.loc['unique'] > 1) |  (meta_stats.loc['std'] > 0.1)].T
+meta_stats.loc[:, (meta_stats.loc['unique'] > 1) | (meta_stats.loc['std'] > 0.1)].T
 
 # %%
-df_meta_rawfiles_columns = df_meta_rawfiles.columns # needs to go to Config which is not overwriteable by attribute selection
+# needs to go to Config which is not overwritable by attribute selection
+df_meta_rawfiles_columns = df_meta_rawfiles.columns
 meta_raw_names = df_meta_rawfiles.columns.droplevel()
 assert meta_raw_names.is_unique
 df_meta_rawfiles.columns = meta_raw_names
 df_meta_rawfiles
 
 # %%
 meta_raw_selected = [
- 'Content Creation Date', 
- 'Thermo Scientific instrument model',
- 'instrument serial number',
- 'Software Version', 
- 'Number of MS1 spectra',
- 'Number of MS2 spectra', 
- 'Number of scans',
- 'MS max charge',
- 'MS max RT',
- 'MS min MZ',
- 'MS max MZ',
- 'MS scan range', 
- 'mass resolution',
- 'Retention time range',
- 'Mz range',
- 'beam-type collision-induced dissociation', 
- 'injection volume setting',
- 'dilution factor',
+    'Content Creation Date',
+    'Thermo Scientific instrument model',
+    'instrument serial number',
+    'Software Version',
+    'Number of MS1 spectra',
+    'Number of MS2 spectra',
+    'Number of scans',
+    'MS max charge',
+    'MS max RT',
+    'MS min MZ',
+    'MS max MZ',
+    'MS scan range',
+    'mass resolution',
+    'Retention time range',
+    'Mz range',
+    'beam-type collision-induced dissociation',
+    'injection volume setting',
+    'dilution factor',
 ]
 df_meta_rawfiles[meta_raw_selected].describe(percentiles=np.linspace(0.05, 0.95, 10))
 
@@ -104,7 +108,9 @@
 #   - quite some variation due to `MS max charge`: Is it a parameter?
 
 # %%
-MetaRawSettings = namedtuple('MetaRawSettings', 'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor') 
+MetaRawSettings = namedtuple(
+    'MetaRawSettings',
+    'ms_model ms_attr ms_sn ms_firmware max_charge mass_res cid_type inject_volume dill_factor')
 meta_raw_settings = [
     'Thermo Scientific instrument model',
     'instrument attribute',
@@ -129,18 +135,21 @@
 #   - software can be updated
 #   - variation by `injection volume setting` and instrument over time
 #   - missing `dilution factor`
-#   
+#
 
 # %%
 to_drop = ['MS max charge']
-# df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=False) # index gives first example with this combination
+# Variant keeping the original index (it gives the first example with this combination):
+# df_meta_rawfiles[list(meta_raw_settings)].drop(
+#     to_drop, axis=1).drop_duplicates(ignore_index=False)
 df_meta_rawfiles[list(meta_raw_settings)].drop(to_drop, axis=1).drop_duplicates(ignore_index=True)
 
 # %% [markdown]
 # Relatively big samples for different machines of the same kind running with the same firmware:
 
 # %%
-df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[meta_raw_settings.ms_model].count().sort_values().tail(10)
+df_meta_rawfiles.groupby([meta_raw_settings.ms_model, meta_raw_settings.ms_firmware])[
+    meta_raw_settings.ms_model].count().sort_values().tail(10)
 
 # %% [markdown]
 # Ignoring instrument software
@@ -149,8 +158,7 @@
 grouping = df_meta_rawfiles.groupby(list(meta_raw_settings[:3]))
 instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()
 msg += (f" There are a total of {len(instrument_counts)} unique instruments in the entire dataset (based on the instrument name, attributs and serial number)"
-        f", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements."
-       )
+        f", of which at least {(instrument_counts >= 1000).sum()} have 1,000 raw files assigned to them. Note that the entire dataset contains fractionated measurements.")
 instrument_counts
 
 # %%
@@ -194,9 +202,9 @@
 grouping = df_meta_rawfiles.loc[files_selected['files']].groupby(list(meta_raw_settings[:3]))
 instrument_counts = grouping[meta_raw_settings.ms_model].count().sort_values()
 N = 500
-msg += (f" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs"
-        f", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them."
-       )
+msg += (
+    f" Among the {len(files_selected['files'])} raw files with a minimum of {files_selected['threshold']:,d} identified peptides there are a total of {len(instrument_counts)} unique instruments with quantified runs"
+    f", of which {(instrument_counts >= N).sum()} have at least {N:,d} rawfiles assigned to them.")
 instrument_counts.to_csv('data/files_selected_per_instrument_counts.csv')
 instrument_counts.to_frame('No. samples')