diff --git a/project/00_0_0_lftp_upload_commands.ipynb b/project/00_0_0_lftp_upload_commands.ipynb index 81dd8c796..1d9ae4337 100644 --- a/project/00_0_0_lftp_upload_commands.ipynb +++ b/project/00_0_0_lftp_upload_commands.ipynb @@ -56,11 +56,11 @@ }, "outputs": [], "source": [ - "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n", - "fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files\n", - "fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n", - "out_folder: str = 'data/rename' # output folder\n", - "fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files" + "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n", + "fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files\n", + "fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n", + "out_folder: str = 'data/rename' # output folder\n", + "fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files" ] }, { @@ -133,14 +133,14 @@ "outputs": [], "source": [ "cols_identifies = [('FileProperties', 'Pathname'),\n", - " ('FileProperties', 'Version'),\n", - " ('FileProperties', 'Content Creation Date'),\n", - " ('InstrumentProperties', 'Thermo Scientific instrument model'),\n", - " ('InstrumentProperties', 'instrument attribute'),\n", - " ('InstrumentProperties', 'instrument serial number'),\n", - " ('InstrumentProperties', 'Software Version'),\n", - " ('InstrumentProperties', 'firmware version'),\n", - "]\n", + " ('FileProperties', 'Version'),\n", + " ('FileProperties', 'Content Creation Date'),\n", + " ('InstrumentProperties', 'Thermo Scientific instrument model'),\n", + " ('InstrumentProperties', 'instrument attribute'),\n", + " ('InstrumentProperties', 'instrument serial number'),\n", + " ('InstrumentProperties', 'Software Version'),\n", + " ('InstrumentProperties', 'firmware version'),\n", + " ]\n", "\n", "df_meta = df_meta[cols_identifies]\n", "df_meta.columns = [t[-1] for t in cols_identifies]\n", @@ -198,9 +198,9 @@ "source": [ "date_col = \"Content Creation Date\"\n", "idx_all = (pd.to_datetime(df_meta[date_col]).dt.strftime(\"%Y_%m_%d_%H_%M\")\n", - " + '_'\n", - " + df_meta[\"Instrument_name\"]\n", - ").str.replace(' ', '-')\n", + " + '_'\n", + " + df_meta[\"Instrument_name\"]\n", + " ).str.replace(' ', '-')\n", "\n", "mask = idx_all.duplicated(keep=False)\n", "duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps\n", @@ -214,8 +214,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_meta['new_sample_id'] = idx_all\n", - "\n", + "df_meta['new_sample_id'] = idx_all\n", "\n", "\n", "_n = df_meta.groupby(\"new_sample_id\").cumcount().astype('string').str.replace('0', '')\n", @@ -340,10 +339,10 @@ " if string_ not in used_before:\n", " ret += f'_{string_}'\n", " used_before |= set(strings_)\n", - " ret = (ret[1:] # remove _ from start\n", + " ret = (ret[1:] # remove _ from start\n", " .replace('Slot_#', '')\n", " .replace('slot_#', '')\n", - " )\n", + " )\n", " return ret\n", "\n", "\n", @@ -353,7 +352,7 @@ " \"instrument attribute\",\n", " \"instrument serial number\",\n", " ]\n", - " ]\n", + "]\n", " .sample(20)\n", " .apply(build_instrument_name, axis=1)\n", ")" @@ -401,8 +400,8 @@ " .loc[selected, \"Path_old\"]\n", " .iloc[:3]\n", " .to_csv(out_folder / 
'rawfiles_to_checksum.txt',\n", - " index=False,\n", - " header=False)\n", + " index=False,\n", + " header=False)\n", " )" ] }, @@ -453,7 +452,7 @@ "```\n", "to allow parallell commands, use the runtime setting\n", "```bash\n", - ">>> cat ~/.lftprc \n", + ">>> cat ~/.lftprc\n", "set cmd:parallel 2\n", "```" ] @@ -501,11 +500,11 @@ "source": [ "commands = df_meta.loc[selected]\n", "commands = (\n", - " 'put ' \n", + " 'put '\n", " + commands['Path_old'].astype('string')\n", - " + ' -o ' \n", - " + \"./raw_files/\" \n", - " + commands[\"Instrument_name\"] \n", + " + ' -o '\n", + " + \"./raw_files/\"\n", + " + commands[\"Instrument_name\"]\n", " + '/'\n", " + commands['new_sample_id'] + '.raw'\n", ")\n", @@ -559,9 +558,9 @@ "source": [ "commands = df_meta.loc[selected]\n", "commands = (\n", - " \"mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf \" # command\n", - " + \"mq_out/\" + commands.index # source\n", - " + \" ./MQ_tables/\" + commands[\"Instrument_name\"]+ \"/\" + commands[\"new_sample_id\"] # dest\n", + " \"mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf \" # command\n", + " + \"mq_out/\" + commands.index # source\n", + " + \" ./MQ_tables/\" + commands[\"Instrument_name\"] + \"/\" + commands[\"new_sample_id\"] # dest\n", ")\n", "\n", "print(commands.sample(10).to_csv(header=False, index=False))" @@ -579,9 +578,7 @@ "cell_type": "code", "execution_count": null, "id": "83c04b90-0c4e-4fe7-88f6-ed02cef93a23", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "fname = out_folder / 'lftp_commands_mq_output.txt'\n", diff --git a/project/00_0_0_lftp_upload_commands.py b/project/00_0_0_lftp_upload_commands.py index 921a5733f..17cc26be2 100644 --- a/project/00_0_0_lftp_upload_commands.py +++ b/project/00_0_0_lftp_upload_commands.py @@ -42,11 +42,11 @@ def rename(fname, new_sample_id, new_folder=None, ext=None): # ## Arguments # %% tags=["parameters"] -fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow -fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files -fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides -out_folder: str = 'data/rename' # output folder -fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files +fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow +fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files +fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides +out_folder: str = 'data/rename' # output folder +fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files # %% out_folder = Path(out_folder) @@ -79,14 +79,14 @@ def rename(fname, new_sample_id, new_folder=None, ext=None): # %% cols_identifies = [('FileProperties', 'Pathname'), - ('FileProperties', 'Version'), - ('FileProperties', 'Content Creation Date'), - ('InstrumentProperties', 'Thermo Scientific instrument model'), - ('InstrumentProperties', 'instrument attribute'), - ('InstrumentProperties', 'instrument serial number'), - ('InstrumentProperties', 'Software Version'), - ('InstrumentProperties', 'firmware version'), -] + ('FileProperties', 'Version'), + ('FileProperties', 'Content Creation Date'), + ('InstrumentProperties', 'Thermo Scientific instrument model'), + ('InstrumentProperties', 
'instrument attribute'), + ('InstrumentProperties', 'instrument serial number'), + ('InstrumentProperties', 'Software Version'), + ('InstrumentProperties', 'firmware version'), + ] df_meta = df_meta[cols_identifies] df_meta.columns = [t[-1] for t in cols_identifies] @@ -113,17 +113,16 @@ def rename(fname, new_sample_id, new_folder=None, ext=None): # %% date_col = "Content Creation Date" idx_all = (pd.to_datetime(df_meta[date_col]).dt.strftime("%Y_%m_%d_%H_%M") - + '_' - + df_meta["Instrument_name"] -).str.replace(' ', '-') + + '_' + + df_meta["Instrument_name"] + ).str.replace(' ', '-') mask = idx_all.duplicated(keep=False) duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps duplicated_sample_idx # %% -df_meta['new_sample_id'] = idx_all - +df_meta['new_sample_id'] = idx_all _n = df_meta.groupby("new_sample_id").cumcount().astype('string').str.replace('0', '') @@ -182,10 +181,10 @@ def build_instrument_name(s): if string_ not in used_before: ret += f'_{string_}' used_before |= set(strings_) - ret = (ret[1:] # remove _ from start + ret = (ret[1:] # remove _ from start .replace('Slot_#', '') .replace('slot_#', '') - ) + ) return ret @@ -195,7 +194,7 @@ def build_instrument_name(s): "instrument attribute", "instrument serial number", ] - ] +] .sample(20) .apply(build_instrument_name, axis=1) ) @@ -217,8 +216,8 @@ def build_instrument_name(s): .loc[selected, "Path_old"] .iloc[:3] .to_csv(out_folder / 'rawfiles_to_checksum.txt', - index=False, - header=False) + index=False, + header=False) ) # %% [markdown] @@ -247,7 +246,7 @@ def build_instrument_name(s): # ``` # to allow parallell commands, use the runtime setting # ```bash -# >>> cat ~/.lftprc +# >>> cat ~/.lftprc # set cmd:parallel 2 # ``` @@ -269,11 +268,11 @@ def build_instrument_name(s): # %% commands = df_meta.loc[selected] commands = ( - 'put ' + 'put ' + commands['Path_old'].astype('string') - + ' -o ' - + "./raw_files/" - + commands["Instrument_name"] + + ' -o ' + + "./raw_files/" + + commands["Instrument_name"] + '/' + commands['new_sample_id'] + '.raw' ) @@ -299,9 +298,9 @@ def build_instrument_name(s): # %% commands = df_meta.loc[selected] commands = ( - "mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf " # command - + "mq_out/" + commands.index # source - + " ./MQ_tables/" + commands["Instrument_name"]+ "/" + commands["new_sample_id"] # dest + "mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf " # command + + "mq_out/" + commands.index # source + + " ./MQ_tables/" + commands["Instrument_name"] + "/" + commands["new_sample_id"] # dest ) print(commands.sample(10).to_csv(header=False, index=False)) @@ -312,4 +311,3 @@ def build_instrument_name(s): # %% fname = out_folder / 'lftp_commands_mq_output.txt' commands.to_csv(fname, header=False, index=False) - diff --git a/project/00_0_1_check_filesizes.ipynb b/project/00_0_1_check_filesizes.ipynb index 36b937a6b..549a0b1ef 100644 --- a/project/00_0_1_check_filesizes.ipynb +++ b/project/00_0_1_check_filesizes.ipynb @@ -153,7 +153,7 @@ "source": [ "mask = (entries['size_pride'] - entries['size_erda']).abs() > 5\n", "to_redo = entries.loc[mask].reset_index()\n", - "to_redo " + "to_redo" ] }, { @@ -172,7 +172,7 @@ "id": "b6087751", "metadata": {}, "source": [ - "## Check MaxQuant output filesizes " + "## Check MaxQuant output filesizes" ] }, { @@ -207,7 +207,7 @@ " files.append(entry)\n", " if entry.id_old not in folder:\n", " folder.add(entry.id_old)\n", - " \n", + "\n", "print(f\"{len(folder) =: }\")\n", 
"print(f\"{len(files) =: }\")\n", "files[:3]" @@ -235,11 +235,11 @@ "outputs": [], "source": [ "files['path_pride'] = ('MQ_tables/'\n", - " + files['Instrument_name']\n", - " + '/' \n", - " + files[\"new_sample_id\"]\n", - " + '/'\n", - " + files[\"filename\"])\n", + " + files['Instrument_name']\n", + " + '/'\n", + " + files[\"new_sample_id\"]\n", + " + '/'\n", + " + files[\"filename\"])\n", "files['path_pride'].iloc[:4].to_list()" ] }, @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "files['filename'].value_counts() # except mqpar.xml all present on erda" + "files['filename'].value_counts() # except mqpar.xml all present on erda" ] }, { @@ -359,9 +359,7 @@ "cell_type": "code", "execution_count": null, "id": "3fc22aef", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "to_do = pd.concat([missing_on_pride, files_redo])\n", diff --git a/project/00_0_1_check_filesizes.py b/project/00_0_1_check_filesizes.py index 64fc10c7f..e8df8a58a 100644 --- a/project/00_0_1_check_filesizes.py +++ b/project/00_0_1_check_filesizes.py @@ -74,14 +74,14 @@ # %% mask = (entries['size_pride'] - entries['size_erda']).abs() > 5 to_redo = entries.loc[mask].reset_index() -to_redo +to_redo # %% commands = 'put ' + to_redo['fname'] + ' -o ' + to_redo['path_pride'] print(commands.to_csv(header=False, index=False)) # %% [markdown] -# ## Check MaxQuant output filesizes +# ## Check MaxQuant output filesizes # %% df_meta = df_meta.reset_index().set_index('Sample ID') @@ -101,7 +101,7 @@ files.append(entry) if entry.id_old not in folder: folder.add(entry.id_old) - + print(f"{len(folder) =: }") print(f"{len(files) =: }") files[:3] @@ -113,16 +113,16 @@ # %% files['path_pride'] = ('MQ_tables/' - + files['Instrument_name'] - + '/' - + files["new_sample_id"] - + '/' - + files["filename"]) + + files['Instrument_name'] + + '/' + + files["new_sample_id"] + + '/' + + files["filename"]) files['path_pride'].iloc[:4].to_list() # %% -files['filename'].value_counts() # except mqpar.xml all present on erda +files['filename'].value_counts() # except mqpar.xml all present on erda # %% files_pride = list() @@ -163,4 +163,3 @@ to_do = pd.concat([missing_on_pride, files_redo]) commands = 'put -e \'' + to_do['path_erda'] + "' -o '" + to_do.index + "'" commands.to_csv(FOLDER / 'mq_out_remaining.txt', header=False, index=False) - diff --git a/project/00_0_2_mqout_renaming.ipynb b/project/00_0_2_mqout_renaming.ipynb index 2f56c1303..e32643f4b 100644 --- a/project/00_0_2_mqout_renaming.ipynb +++ b/project/00_0_2_mqout_renaming.ipynb @@ -36,7 +36,7 @@ "source": [ "FOLDER = Path('data/rename')\n", "meta_in = FOLDER / 'selected_old_new_id_mapping.csv'\n", - "fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files" + "fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files" ] }, { @@ -71,10 +71,10 @@ "outputs": [], "source": [ "files_types = [\"modificationSpecificPeptides.txt\",\n", - "\"mqpar.xml\",\n", - "\"mzRange.txt\",\n", - "\"Oxidation (M)Sites.txt\",\n", - "\"summary.txt\",]" + " \"mqpar.xml\",\n", + " \"mzRange.txt\",\n", + " \"Oxidation (M)Sites.txt\",\n", + " \"summary.txt\",]" ] }, { @@ -109,7 +109,7 @@ " new_name=new_name,\n", " fn=fname)\n", " to_rename.append(command)\n", - " \n", + "\n", " counter[fname.name] += 1\n", "len(to_rename)" ] @@ -122,7 +122,7 @@ "outputs": [], "source": [ "# mqpar.xml missing in some folders\n", - "pd.Series(counter) # maybe one folder has some missing?" 
+ "pd.Series(counter) # maybe one folder has some missing?" ] }, { @@ -133,7 +133,7 @@ "outputs": [], "source": [ "with open(FOLDER / 'sed_rename_commands.sh', 'w') as f:\n", - " f.writelines('\\n'.join(to_rename))" + " f.writelines('\\n'.join(to_rename))" ] } ], diff --git a/project/00_0_2_mqout_renaming.py b/project/00_0_2_mqout_renaming.py index b61e127e9..0a223751f 100644 --- a/project/00_0_2_mqout_renaming.py +++ b/project/00_0_2_mqout_renaming.py @@ -15,7 +15,7 @@ # %% FOLDER = Path('data/rename') meta_in = FOLDER / 'selected_old_new_id_mapping.csv' -fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files +fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files # %% df_meta = pd.read_csv(meta_in, index_col='new_sample_id') @@ -27,10 +27,10 @@ # %% files_types = ["modificationSpecificPeptides.txt", -"mqpar.xml", -"mzRange.txt", -"Oxidation (M)Sites.txt", -"summary.txt",] + "mqpar.xml", + "mzRange.txt", + "Oxidation (M)Sites.txt", + "summary.txt",] # %% name_lookup = df_meta["Sample ID"].reset_index().set_index("new_sample_id") @@ -51,14 +51,14 @@ new_name=new_name, fn=fname) to_rename.append(command) - + counter[fname.name] += 1 len(to_rename) # %% # mqpar.xml missing in some folders -pd.Series(counter) # maybe one folder has some missing? +pd.Series(counter) # maybe one folder has some missing? # %% with open(FOLDER / 'sed_rename_commands.sh', 'w') as f: - f.writelines('\n'.join(to_rename)) + f.writelines('\n'.join(to_rename)) diff --git a/project/00_0_3_create_sdrf.ipynb b/project/00_0_3_create_sdrf.ipynb index e08e3e973..eec8de40d 100644 --- a/project/00_0_3_create_sdrf.ipynb +++ b/project/00_0_3_create_sdrf.ipynb @@ -55,7 +55,7 @@ }, "outputs": [], "source": [ - "sdrf = pd.DataFrame() # pd.read_table(fn_sdrf_cellline_template)\n", + "sdrf = pd.DataFrame() # pd.read_table(fn_sdrf_cellline_template)\n", "sdrf['source name'] = df_meta.index\n", "sdrf = sdrf.set_index('source name')\n", "sdrf['characteristics[organism]'] = 'Homo sapiens'\n", diff --git a/project/00_0_3_create_sdrf.py b/project/00_0_3_create_sdrf.py index 14e033b34..dd6d5178b 100644 --- a/project/00_0_3_create_sdrf.py +++ b/project/00_0_3_create_sdrf.py @@ -17,7 +17,7 @@ df_meta # %% -sdrf = pd.DataFrame() # pd.read_table(fn_sdrf_cellline_template) +sdrf = pd.DataFrame() # pd.read_table(fn_sdrf_cellline_template) sdrf['source name'] = df_meta.index sdrf = sdrf.set_index('source name') sdrf['characteristics[organism]'] = 'Homo sapiens' diff --git a/project/00_0_4_create_submission_folder.ipynb b/project/00_0_4_create_submission_folder.ipynb index 891af6559..59360c84f 100644 --- a/project/00_0_4_create_submission_folder.ipynb +++ b/project/00_0_4_create_submission_folder.ipynb @@ -19,6 +19,8 @@ }, "outputs": [], "source": [ + "import pandas as pd\n", + "import numpy as np\n", "from collections import defaultdict\n", "from pathlib import Path, PurePosixPath" ] @@ -136,8 +138,6 @@ }, "outputs": [], "source": [ - "import numpy as np\n", - "import pandas as pd\n", "files = pd.DataFrame(columns='FMH\tfile_id\tfile_type\tfile_path\tfile_mapping'.split('\\t'))\n", "files['file_path'] = pd.read_csv(file, header=None)\n", "files['FMH'] = 'FMH'\n", @@ -165,9 +165,7 @@ { "cell_type": "markdown", "id": "9107e219", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "source": [ "Some manuel adding of the last files still required..." 
] diff --git a/project/00_0_4_create_submission_folder.py b/project/00_0_4_create_submission_folder.py index 3078981f5..91ad14fdf 100644 --- a/project/00_0_4_create_submission_folder.py +++ b/project/00_0_4_create_submission_folder.py @@ -3,6 +3,8 @@ # %% +import pandas as pd +import numpy as np from collections import defaultdict from pathlib import Path, PurePosixPath @@ -62,8 +64,6 @@ '.tsv': 'EXPERIMENTAL_DESIGN'} # %% -import numpy as np -import pandas as pd files = pd.DataFrame(columns='FMH file_id file_type file_path file_mapping'.split('\t')) files['file_path'] = pd.read_csv(file, header=None) files['FMH'] = 'FMH' @@ -78,4 +78,3 @@ files.to_csv(FOLDER / 'submiss.px_to_add.tsv', sep='\t', index=False) # %% [markdown] # Some manuel adding of the last files still required... - diff --git a/project/00_0_hela_metadata_rawfiles.ipynb b/project/00_0_hela_metadata_rawfiles.ipynb index 820e3bfec..3971259e4 100644 --- a/project/00_0_hela_metadata_rawfiles.ipynb +++ b/project/00_0_hela_metadata_rawfiles.ipynb @@ -49,9 +49,11 @@ "source": [ "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n", "# outputs\n", - "fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number)\n", + "# All parsed raw files nested by instrument (model, attribute, serial number)\n", + "fn_files_per_instrument: str = 'data/files_per_instrument.yaml'\n", "fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n", - "fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number)" + "# Selected parsed raw files nested by instrument (model, attribute, serial number)\n", + "fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml'" ] }, { diff --git a/project/00_0_hela_metadata_rawfiles.py b/project/00_0_hela_metadata_rawfiles.py index b8f1248df..03a501db0 100644 --- a/project/00_0_hela_metadata_rawfiles.py +++ b/project/00_0_hela_metadata_rawfiles.py @@ -34,9 +34,11 @@ # %% tags=["parameters"] fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow # outputs -fn_files_per_instrument: str = 'data/files_per_instrument.yaml' # All parsed raw files nested by instrument (model, attribute, serial number) +# All parsed raw files nested by instrument (model, attribute, serial number) +fn_files_per_instrument: str = 'data/files_per_instrument.yaml' fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides -fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # Selected parsed raw files nested by instrument (model, attribute, serial number) +# Selected parsed raw files nested by instrument (model, attribute, serial number) +fn_files_per_instrument_selected: str = 'data/files_selected_per_instrument.yaml' # %% [markdown] # ### Machine metadata diff --git a/project/00_1_hela_MQ_summaries.ipynb b/project/00_1_hela_MQ_summaries.ipynb index 9cd1dee49..4b563a2ad 100644 --- a/project/00_1_hela_MQ_summaries.ipynb +++ b/project/00_1_hela_MQ_summaries.ipynb @@ -22,7 +22,10 @@ "import yaml\n", "import numpy as np\n", "import pandas as pd\n", + "\n", "import vaep\n", + "from vaep.pandas import get_unique_non_unique_columns\n", + "from vaep.pandas import unique_cols\n", "\n", "from config import 
FN_ALL_SUMMARIES\n", "print(f\"{FN_ALL_SUMMARIES = }\")" @@ -39,7 +42,7 @@ }, "outputs": [], "source": [ - "FN_ALL_SUMMARIES: str = 'data/mq_summaries.csv' # MqAllSummaries json" + "FN_ALL_SUMMARIES: str = 'data/mq_summaries.csv' # MqAllSummaries json" ] }, { @@ -65,7 +68,6 @@ "metadata": {}, "outputs": [], "source": [ - "from vaep.pandas import unique_cols\n", "unique_cols(mq_all_summaries.Multiplicity), unique_cols(\n", " mq_all_summaries[\"Variable modifications first search\"]) # int, NA" ] @@ -76,7 +78,6 @@ "metadata": {}, "outputs": [], "source": [ - "from vaep.pandas import get_unique_non_unique_columns\n", "columns = get_unique_non_unique_columns(mq_all_summaries)\n", "mq_all_summaries[columns.unique]" ] @@ -114,19 +115,23 @@ "source": [ "class col_summary:\n", " MS1 = 'MS'\n", - " MS2 = 'MS/MS' \n", - " MS2_identified = 'MS/MS Identified'\n", + " MS2 = 'MS/MS'\n", + " MS2_identified = 'MS/MS Identified'\n", " peptides_identified = 'Peptide Sequences Identified'\n", "\n", + "\n", "if mq_all_summaries is None:\n", " raise ValueError(\"No data assigned\")\n", - " \n", - "MS_spectra = mq_all_summaries[[col_summary.MS1, col_summary.MS2, col_summary.MS2_identified, col_summary.peptides_identified]]\n", + "\n", + "MS_spectra = mq_all_summaries[[col_summary.MS1, col_summary.MS2,\n", + " col_summary.MS2_identified, col_summary.peptides_identified]]\n", + "\n", "\n", "def compute_summary(threshold_identified):\n", - " mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified\n", + " mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified\n", " display(MS_spectra.loc[mask].describe(np.linspace(0.05, 0.95, 10)))\n", "\n", + "\n", "w_ions_range = widgets.IntSlider(value=15_000, min=15_000, max=MS_spectra[col_summary.peptides_identified].max())\n", "display(widgets.interactive(compute_summary, threshold_identified=w_ions_range))" ] @@ -160,9 +165,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "dump_dict = {'threshold': int(w_ions_range.value)}\n", diff --git a/project/00_1_hela_MQ_summaries.py b/project/00_1_hela_MQ_summaries.py index 496b0da07..2a27a89d4 100644 --- a/project/00_1_hela_MQ_summaries.py +++ b/project/00_1_hela_MQ_summaries.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: vaep # language: python @@ -25,13 +25,16 @@ import yaml import numpy as np import pandas as pd + import vaep +from vaep.pandas import get_unique_non_unique_columns +from vaep.pandas import unique_cols from config import FN_ALL_SUMMARIES print(f"{FN_ALL_SUMMARIES = }") # %% tags=["parameters"] -FN_ALL_SUMMARIES: str = 'data/mq_summaries.csv' # MqAllSummaries json +FN_ALL_SUMMARIES: str = 'data/mq_summaries.csv' # MqAllSummaries json # %% @@ -42,12 +45,10 @@ # Find unique columns, see [post](https://stackoverflow.com/a/54405767/9684872) # %% -from vaep.pandas import unique_cols unique_cols(mq_all_summaries.Multiplicity), unique_cols( mq_all_summaries["Variable modifications first search"]) # int, NA # %% -from vaep.pandas import get_unique_non_unique_columns columns = get_unique_non_unique_columns(mq_all_summaries) mq_all_summaries[columns.unique] @@ -64,19 +65,23 @@ # %% class col_summary: MS1 = 'MS' - MS2 = 'MS/MS' - MS2_identified = 'MS/MS Identified' + MS2 = 'MS/MS' + MS2_identified = 'MS/MS Identified' peptides_identified = 'Peptide Sequences Identified' + if 
mq_all_summaries is None: raise ValueError("No data assigned") - -MS_spectra = mq_all_summaries[[col_summary.MS1, col_summary.MS2, col_summary.MS2_identified, col_summary.peptides_identified]] + +MS_spectra = mq_all_summaries[[col_summary.MS1, col_summary.MS2, + col_summary.MS2_identified, col_summary.peptides_identified]] + def compute_summary(threshold_identified): - mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified + mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified display(MS_spectra.loc[mask].describe(np.linspace(0.05, 0.95, 10))) + w_ions_range = widgets.IntSlider(value=15_000, min=15_000, max=MS_spectra[col_summary.peptides_identified].max()) display(widgets.interactive(compute_summary, threshold_identified=w_ions_range)) @@ -99,4 +104,3 @@ def compute_summary(threshold_identified): with open('data/samples_selected.yaml', 'w') as f: yaml.dump(dump_dict, stream=f) - diff --git a/project/00_2_hela_all_raw_files.ipynb b/project/00_2_hela_all_raw_files.ipynb index 40ff06b32..384a5aa09 100644 --- a/project/00_2_hela_all_raw_files.ipynb +++ b/project/00_2_hela_all_raw_files.ipynb @@ -35,7 +35,7 @@ "find . -name '*.raw' -ls > all_raw_files_dump_2021_10_27.txt\n", "```\n", "\n", - "which was executed in the " + "which was executed in the" ] }, { @@ -82,7 +82,7 @@ "# FN_ALL_RAW_FILES = config.FOLDER_DATA / config.FN_ALL_RAW_FILES\n", "FN_ALL_RAW_FILES: str = config.FOLDER_DATA / 'all_raw_files_dump_2021_10_29.txt'\n", "FN_ALL_SUMMARIES: str = config.FN_ALL_SUMMARIES\n", - "FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07285_M01000' " + "FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07285_M01000'" ] }, { @@ -107,7 +107,7 @@ "data = []\n", "with open(cfg.FN_ALL_RAW_FILES) as f:\n", " for line in f:\n", - " line = line.split(maxsplit=8) # ignore white spaces in file names, example:\n", + " line = line.split(maxsplit=8) # ignore white spaces in file names, example:\n", " #'-rw-r--r--. 
1 501 501 282917566 Dec 3 2022 ./share_hela_raw/MNT_202220220921_EXLP1_Evo1_LiNi_ - Copy1.raw'\n", " path = Path(line[-1].strip())\n", " data.append(RawFile(path.stem, path, int(line[4])))\n", @@ -158,7 +158,7 @@ "mask_non_unique = data.reset_index().duplicated(subset=['name', 'bytes'])\n", "mask_non_unique.index = data.index\n", "idx_non_unique = data.loc[mask_non_unique].index.unique()\n", - "idx_non_unique # min number of files to remove" + "idx_non_unique # min number of files to remove" ] }, { @@ -178,6 +178,7 @@ " print(f'Number of files with more than 2 duplicates: {(non_unique > 2).sum()}')\n", " return non_unique\n", "\n", + "\n", "non_unique = check_for_duplicates(df=data)\n", "non_unique" ] @@ -196,7 +197,7 @@ "outputs": [], "source": [ "data.loc[\n", - " non_unique.index.difference(idx_non_unique) ]" + " non_unique.index.difference(idx_non_unique)]" ] }, { @@ -220,14 +221,15 @@ " non_unique_remaining = pd.DataFrame()\n", " for idx, g in _data_to_remove.groupby(level=0):\n", " mask = ['\\\\MNT' in str(x) for x in g.path]\n", - " assert len(mask) != sum(mask) , f'All files in MNT subfolders: {idx}'\n", + " assert len(mask) != sum(mask), f'All files in MNT subfolders: {idx}'\n", " data_in_MNT_to_remove = data_in_MNT_to_remove.append(g[mask])\n", - " non_unique_remaining = non_unique_remaining.append(g[[x!=True for x in mask]])\n", + " non_unique_remaining = non_unique_remaining.append(g[[x != True for x in mask]])\n", "\n", " del _data_to_remove, mask, idx, g\n", "\n", "assert len(data.loc[idx_non_unique]) == len(non_unique_remaining) + len(data_in_MNT_to_remove)\n", - "assert len(non_unique_remaining.loc[['\\\\MNT' in str(x) for x in non_unique_remaining.path]]) == 0, \"There are files in MNT folder left\"\n", + "assert len(non_unique_remaining.loc[['\\\\MNT' in str(x)\n", + " for x in non_unique_remaining.path]]) == 0, \"There are files in MNT folder left\"\n", "data_in_MNT_to_remove" ] }, @@ -264,7 +266,7 @@ "mask_non_unique_remaining = non_unique_remaining.reset_index().duplicated(subset=['name', 'bytes'])\n", "mask_non_unique_remaining.index = non_unique_remaining.index\n", "data_to_remove = data_in_MNT_to_remove.append(\n", - " non_unique_remaining.loc[mask_non_unique_remaining]\n", + " non_unique_remaining.loc[mask_non_unique_remaining]\n", ")\n", "data_to_remove" ] @@ -284,7 +286,8 @@ "metadata": {}, "outputs": [], "source": [ - "data_unique = data.reset_index().set_index('num_index').drop(data_to_remove.set_index('num_index').index).set_index('name')\n", + "data_unique = data.reset_index().set_index('num_index').drop(\n", + " data_to_remove.set_index('num_index').index).set_index('name')\n", "data_unique" ] }, @@ -310,7 +313,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert len(data_unique) + len(data_to_remove) == len(data)" + "assert len(data_unique) + len(data_to_remove) == len(data)" ] }, { @@ -343,7 +346,9 @@ "metadata": {}, "outputs": [], "source": [ - "cfg.FN_ALL_RAW_FILES_UNIQUE = utils.append_to_filepath(cfg.FN_ALL_RAW_FILES, config.build_df_fname(data_unique, 'unique'), new_suffix='csv')\n", + "cfg.FN_ALL_RAW_FILES_UNIQUE = utils.append_to_filepath(\n", + " cfg.FN_ALL_RAW_FILES, config.build_df_fname(\n", + " data_unique, 'unique'), new_suffix='csv')\n", "data_unique.to_csv(cfg.FN_ALL_RAW_FILES_UNIQUE)" ] }, @@ -474,13 +479,13 @@ " # continue with samples below 2019 (select in DropDown below)\n", " '20180508_QE3_nLC5_DBJ_DIAprot_HELA_500ng_GPF',\n", " '20180528_QE5_Evo2_DBJ_DIAprot_HeLa_500ng',\n", - " '20190108_QE7_Evo1_DBJ_SA_LFQpho_HELA_PACs_200ug', # 
s mssing in LFQphos\n", + " '20190108_QE7_Evo1_DBJ_SA_LFQpho_HELA_PACs_200ug', # s mssing in LFQphos\n", " '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_200ug',\n", " '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_300ug',\n", " '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_400ug',\n", " '20190212_QE5_Evo1_DBJ_LFQprot',\n", " '20190314_QE3_DBJ_Evo2_LFQphos_Hela_200ug_StageTip',\n", - " '20190314_QE3_DBJ_Evo2_LFQphos_Hela_380ug_StageTip', # first t missing in StagetTip\n", + " '20190314_QE3_DBJ_Evo2_LFQphos_Hela_380ug_StageTip', # first t missing in StagetTip\n", " '20190314_QE3_DBJ_Evo2_LFQphos_Hela_380ug_StagetTip',\n", " '20190402_QE3_Evo1_DBJ_DIAprot_HELA',\n", " '20190402_QE3_Evo1_DBJ_LFQprot_HELA',\n", @@ -489,7 +494,7 @@ " '20190507_QE5_Evo1_DBJ_LFQprot_Subcell_HeLa_Ctrl',\n", " '20190507_QE5_Evo1_DBJ_LFQprot_Subcell_library_HeLa_Ctrl_Ani_Mix',\n", " '20190622_EXP1_Evo1_AMV_SubCell-library-HeLa_21min-30000',\n", - " '20190628_EXP1_Evo1_AMV_SubCell-library-HeLa_21min-30000', \n", + " '20190628_EXP1_Evo1_AMV_SubCell-library-HeLa_21min-30000',\n", "]\n", "\n", "# exclude keys and handle separately. Remaining keys can be used directly to create list of inputs.\n", @@ -505,7 +510,7 @@ "w_data = widgets.Dropdown(options=frac_unique, index=0)\n", "show_fractions_frac = partial(show_fractions, df=df_selected)\n", "out_sel = widgets.interactive_output(show_fractions_frac, {'stub': w_data})\n", - "widgets.VBox([w_data, out_sel]) # repr of class\n", + "widgets.VBox([w_data, out_sel]) # repr of class\n", "#stub, export" ] }, @@ -715,7 +720,7 @@ "metadata": {}, "outputs": [], "source": [ - "analysis = AnalyzePeptides.from_csv(cfg.FN_ALL_RAW_FILES_UNIQUE,index_col='name') # ToDo: Add numbers to file names\n", + "analysis = AnalyzePeptides.from_csv(cfg.FN_ALL_RAW_FILES_UNIQUE, index_col='name') # ToDo: Add numbers to file names\n", "analysis.df" ] }, @@ -745,7 +750,7 @@ "metadata": {}, "outputs": [], "source": [ - "analysis.df.loc[analysis.df.index.duplicated(False)] # keep the larger one" + "analysis.df.loc[analysis.df.index.duplicated(False)] # keep the larger one" ] }, { @@ -761,7 +766,7 @@ "metadata": {}, "outputs": [], "source": [ - "vars(cfg) # return a dict which is rendered differently in ipython" + "vars(cfg) # return a dict which is rendered differently in ipython" ] }, { diff --git a/project/00_2_hela_all_raw_files.py b/project/00_2_hela_all_raw_files.py index bde0f37db..06949d392 100644 --- a/project/00_2_hela_all_raw_files.py +++ b/project/00_2_hela_all_raw_files.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -44,7 +44,7 @@ # find . -name '*.raw' -ls > all_raw_files_dump_2021_10_27.txt # ``` # -# which was executed in the +# which was executed in the # %% from pathlib import Path, PurePosixPath @@ -75,7 +75,7 @@ # FN_ALL_RAW_FILES = config.FOLDER_DATA / config.FN_ALL_RAW_FILES FN_ALL_RAW_FILES: str = config.FOLDER_DATA / 'all_raw_files_dump_2021_10_29.txt' FN_ALL_SUMMARIES: str = config.FN_ALL_SUMMARIES -FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07285_M01000' +FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07285_M01000' # %% cfg.FN_ALL_RAW_FILES = FN_ALL_RAW_FILES @@ -88,7 +88,7 @@ data = [] with open(cfg.FN_ALL_RAW_FILES) as f: for line in f: - line = line.split(maxsplit=8) # ignore white spaces in file names, example: + line = line.split(maxsplit=8) # ignore white spaces in file names, example: #'-rw-r--r--. 
1 501 501 282917566 Dec 3 2022 ./share_hela_raw/MNT_202220220921_EXLP1_Evo1_LiNi_ - Copy1.raw' path = Path(line[-1].strip()) data.append(RawFile(path.stem, path, int(line[4]))) @@ -117,7 +117,7 @@ mask_non_unique = data.reset_index().duplicated(subset=['name', 'bytes']) mask_non_unique.index = data.index idx_non_unique = data.loc[mask_non_unique].index.unique() -idx_non_unique # min number of files to remove +idx_non_unique # min number of files to remove # %% @@ -132,6 +132,7 @@ def check_for_duplicates(df): print(f'Number of files with more than 2 duplicates: {(non_unique > 2).sum()}') return non_unique + non_unique = check_for_duplicates(df=data) non_unique @@ -140,7 +141,7 @@ def check_for_duplicates(df): # %% data.loc[ - non_unique.index.difference(idx_non_unique) ] + non_unique.index.difference(idx_non_unique)] # %% [markdown] # For same sized groups, remove first the onces in the `MNT` folder: @@ -154,14 +155,15 @@ def check_for_duplicates(df): non_unique_remaining = pd.DataFrame() for idx, g in _data_to_remove.groupby(level=0): mask = ['\\MNT' in str(x) for x in g.path] - assert len(mask) != sum(mask) , f'All files in MNT subfolders: {idx}' + assert len(mask) != sum(mask), f'All files in MNT subfolders: {idx}' data_in_MNT_to_remove = data_in_MNT_to_remove.append(g[mask]) - non_unique_remaining = non_unique_remaining.append(g[[x!=True for x in mask]]) + non_unique_remaining = non_unique_remaining.append(g[[x != True for x in mask]]) del _data_to_remove, mask, idx, g assert len(data.loc[idx_non_unique]) == len(non_unique_remaining) + len(data_in_MNT_to_remove) -assert len(non_unique_remaining.loc[['\\MNT' in str(x) for x in non_unique_remaining.path]]) == 0, "There are files in MNT folder left" +assert len(non_unique_remaining.loc[['\\MNT' in str(x) + for x in non_unique_remaining.path]]) == 0, "There are files in MNT folder left" data_in_MNT_to_remove # %% [markdown] @@ -178,7 +180,7 @@ def check_for_duplicates(df): mask_non_unique_remaining = non_unique_remaining.reset_index().duplicated(subset=['name', 'bytes']) mask_non_unique_remaining.index = non_unique_remaining.index data_to_remove = data_in_MNT_to_remove.append( - non_unique_remaining.loc[mask_non_unique_remaining] + non_unique_remaining.loc[mask_non_unique_remaining] ) data_to_remove @@ -186,7 +188,8 @@ def check_for_duplicates(df): print(f"Save {data_to_remove['size_gb'].sum():1.0f} GB disk space by deleting {len(data_to_remove)} files.") # %% -data_unique = data.reset_index().set_index('num_index').drop(data_to_remove.set_index('num_index').index).set_index('name') +data_unique = data.reset_index().set_index('num_index').drop( + data_to_remove.set_index('num_index').index).set_index('name') data_unique # %% [markdown] @@ -196,7 +199,7 @@ def check_for_duplicates(df): data_unique.loc[data_to_remove.index.unique()] # %% -assert len(data_unique) + len(data_to_remove) == len(data) +assert len(data_unique) + len(data_to_remove) == len(data) # %% [markdown] # Show files which are duplicated, but have different sizes: @@ -209,7 +212,9 @@ def check_for_duplicates(df): # Save unique files # %% -cfg.FN_ALL_RAW_FILES_UNIQUE = utils.append_to_filepath(cfg.FN_ALL_RAW_FILES, config.build_df_fname(data_unique, 'unique'), new_suffix='csv') +cfg.FN_ALL_RAW_FILES_UNIQUE = utils.append_to_filepath( + cfg.FN_ALL_RAW_FILES, config.build_df_fname( + data_unique, 'unique'), new_suffix='csv') data_unique.to_csv(cfg.FN_ALL_RAW_FILES_UNIQUE) # %% [markdown] @@ -280,13 +285,13 @@ def check_for_duplicates(df): # continue with samples below 2019 
(select in DropDown below) '20180508_QE3_nLC5_DBJ_DIAprot_HELA_500ng_GPF', '20180528_QE5_Evo2_DBJ_DIAprot_HeLa_500ng', - '20190108_QE7_Evo1_DBJ_SA_LFQpho_HELA_PACs_200ug', # s mssing in LFQphos + '20190108_QE7_Evo1_DBJ_SA_LFQpho_HELA_PACs_200ug', # s mssing in LFQphos '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_200ug', '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_300ug', '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_400ug', '20190212_QE5_Evo1_DBJ_LFQprot', '20190314_QE3_DBJ_Evo2_LFQphos_Hela_200ug_StageTip', - '20190314_QE3_DBJ_Evo2_LFQphos_Hela_380ug_StageTip', # first t missing in StagetTip + '20190314_QE3_DBJ_Evo2_LFQphos_Hela_380ug_StageTip', # first t missing in StagetTip '20190314_QE3_DBJ_Evo2_LFQphos_Hela_380ug_StagetTip', '20190402_QE3_Evo1_DBJ_DIAprot_HELA', '20190402_QE3_Evo1_DBJ_LFQprot_HELA', @@ -295,7 +300,7 @@ def check_for_duplicates(df): '20190507_QE5_Evo1_DBJ_LFQprot_Subcell_HeLa_Ctrl', '20190507_QE5_Evo1_DBJ_LFQprot_Subcell_library_HeLa_Ctrl_Ani_Mix', '20190622_EXP1_Evo1_AMV_SubCell-library-HeLa_21min-30000', - '20190628_EXP1_Evo1_AMV_SubCell-library-HeLa_21min-30000', + '20190628_EXP1_Evo1_AMV_SubCell-library-HeLa_21min-30000', ] # exclude keys and handle separately. Remaining keys can be used directly to create list of inputs. @@ -305,7 +310,7 @@ def check_for_duplicates(df): w_data = widgets.Dropdown(options=frac_unique, index=0) show_fractions_frac = partial(show_fractions, df=df_selected) out_sel = widgets.interactive_output(show_fractions_frac, {'stub': w_data}) -widgets.VBox([w_data, out_sel]) # repr of class +widgets.VBox([w_data, out_sel]) # repr of class #stub, export # %% [markdown] @@ -406,7 +411,7 @@ def check_for_duplicates(df): # ### From file name # %% -analysis = AnalyzePeptides.from_csv(cfg.FN_ALL_RAW_FILES_UNIQUE,index_col='name') # ToDo: Add numbers to file names +analysis = AnalyzePeptides.from_csv(cfg.FN_ALL_RAW_FILES_UNIQUE, index_col='name') # ToDo: Add numbers to file names analysis.df # %% @@ -416,12 +421,12 @@ def check_for_duplicates(df): # Metadata has fewer cases due to duplicates with differnt file sizes ( see above) # %% -analysis.df.loc[analysis.df.index.duplicated(False)] # keep the larger one +analysis.df.loc[analysis.df.index.duplicated(False)] # keep the larger one # %% [markdown] # ## cfg # %% -vars(cfg) # return a dict which is rendered differently in ipython +vars(cfg) # return a dict which is rendered differently in ipython # %% diff --git a/project/00_3_0_pride_metadata_creation.ipynb b/project/00_3_0_pride_metadata_creation.ipynb index 624189c69..a074c91f5 100644 --- a/project/00_3_0_pride_metadata_creation.ipynb +++ b/project/00_3_0_pride_metadata_creation.ipynb @@ -7,7 +7,7 @@ "source": [ "# Selected files\n", "\n", - "- document metadata and file sizes of published dataset in Scientific Data Report \n", + "- document metadata and file sizes of published dataset in Scientific Data Report\n", "\n", "## Contents\n", "\n", @@ -45,11 +45,11 @@ "metadata": {}, "outputs": [], "source": [ - "fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n", - "fn_raw_file_size: str = 'processed/all_raw_file_sizes.csv' # raw file sizes\n", + "fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n", + "fn_raw_file_size: str = 'processed/all_raw_file_sizes.csv' # raw file sizes\n", "fn_rawfile_metadata: str = 'data/rawfile_metadata.csv'\n", - "fn_summaries:str = 'data/processed/all_summaries.json'\n", - "date_col:str = 'Content 
Creation Date'\n", + "fn_summaries: str = 'data/processed/all_summaries.json'\n", + "date_col: str = 'Content Creation Date'\n", "out_folder: str = 'data/dev_datasets/pride_upload'" ] }, @@ -140,7 +140,6 @@ "metadata": {}, "outputs": [], "source": [ - "from pathlib import Path\n", "df_raw_file_size['path'] = df_raw_file_size['path'].apply(lambda x: Path(x).as_posix())\n", "df_raw_file_size = df_raw_file_size.reset_index().set_index('path')\n", "df_raw_file_size" @@ -195,7 +194,7 @@ "outputs": [], "source": [ "df_meta = df_meta.loc[df_ids.index]\n", - "df_meta.columns = df_meta.columns.droplevel() # remove top level name\n", + "df_meta.columns = df_meta.columns.droplevel() # remove top level name\n", "df_meta" ] }, @@ -239,7 +238,7 @@ " .join(df_raw_file_size)\n", " .join(df_meta)\n", " .join(df_summaries)\n", - " )\n", + " )\n", "df_meta" ] }, @@ -263,11 +262,11 @@ "source": [ "df_meta = (df_meta\n", " .drop(['Path_old', 'Pathname', 'path'], axis=1)\n", - " .rename({'Path_new':'Pathname'}, axis=1)\n", + " .rename({'Path_new': 'Pathname'}, axis=1)\n", " .dropna(how='all', axis=1)\n", " .convert_dtypes()\n", " .assign(**{date_col: lambda df_meta: pd.to_datetime(df_meta[date_col])})\n", - ")\n", + " )\n", "df_meta" ] }, @@ -323,12 +322,12 @@ "dtypes = pd.read_json(\n", " files_out['pride_metadata_schema.json'],\n", " orient='index'\n", - " ).squeeze()\n", - "mask_dates = dtypes.str.contains('datetime') # date columns need to be provide separately\n", + ").squeeze()\n", + "mask_dates = dtypes.str.contains('datetime') # date columns need to be provide separately\n", "pd.read_csv(files_out['pride_metadata.csv'],\n", " parse_dates=mask_dates.loc[mask_dates].index.to_list(),\n", " dtype=dtypes.loc[~mask_dates].to_dict()\n", - ").dtypes" + " ).dtypes" ] }, { diff --git a/project/00_3_0_pride_metadata_creation.py b/project/00_3_0_pride_metadata_creation.py index 17594453d..9165f16b2 100644 --- a/project/00_3_0_pride_metadata_creation.py +++ b/project/00_3_0_pride_metadata_creation.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -15,7 +15,7 @@ # %% [markdown] # # Selected files # -# - document metadata and file sizes of published dataset in Scientific Data Report +# - document metadata and file sizes of published dataset in Scientific Data Report # # ## Contents # @@ -33,11 +33,11 @@ # ## PARAMETERS # %% -fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id -fn_raw_file_size: str = 'processed/all_raw_file_sizes.csv' # raw file sizes +fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id +fn_raw_file_size: str = 'processed/all_raw_file_sizes.csv' # raw file sizes fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' -fn_summaries:str = 'data/processed/all_summaries.json' -date_col:str = 'Content Creation Date' +fn_summaries: str = 'data/processed/all_summaries.json' +date_col: str = 'Content Creation Date' out_folder: str = 'data/dev_datasets/pride_upload' # %% [markdown] @@ -71,7 +71,6 @@ df_raw_file_size.index.is_unique # %% -from pathlib import Path df_raw_file_size['path'] = df_raw_file_size['path'].apply(lambda x: Path(x).as_posix()) df_raw_file_size = df_raw_file_size.reset_index().set_index('path') df_raw_file_size @@ -93,7 +92,7 @@ # %% df_meta = df_meta.loc[df_ids.index] -df_meta.columns = df_meta.columns.droplevel() # remove 
top level name +df_meta.columns = df_meta.columns.droplevel() # remove top level name df_meta # %% [markdown] @@ -113,7 +112,7 @@ .join(df_raw_file_size) .join(df_meta) .join(df_summaries) - ) + ) df_meta # %% @@ -123,11 +122,11 @@ # %% df_meta = (df_meta .drop(['Path_old', 'Pathname', 'path'], axis=1) - .rename({'Path_new':'Pathname'}, axis=1) + .rename({'Path_new': 'Pathname'}, axis=1) .dropna(how='all', axis=1) .convert_dtypes() .assign(**{date_col: lambda df_meta: pd.to_datetime(df_meta[date_col])}) -) + ) df_meta # %% [markdown] @@ -152,12 +151,12 @@ dtypes = pd.read_json( files_out['pride_metadata_schema.json'], orient='index' - ).squeeze() -mask_dates = dtypes.str.contains('datetime') # date columns need to be provide separately +).squeeze() +mask_dates = dtypes.str.contains('datetime') # date columns need to be provide separately pd.read_csv(files_out['pride_metadata.csv'], parse_dates=mask_dates.loc[mask_dates].index.to_list(), dtype=dtypes.loc[~mask_dates].to_dict() -).dtypes + ).dtypes # %% diff --git a/project/00_3_1_pride_metadata_analysis.ipynb b/project/00_3_1_pride_metadata_analysis.ipynb index 75f62cdf8..d68e970a9 100644 --- a/project/00_3_1_pride_metadata_analysis.ipynb +++ b/project/00_3_1_pride_metadata_analysis.ipynb @@ -110,41 +110,41 @@ "# Orbitrap Fusion Lumos MS:1002732\n", "\n", "instrument_labels = {'Q-Exactive-Orbitrap_1': 'Q Exactive 1',\n", - " 'Q-Exactive-Plus-Orbitrap_1': 'Exactive Plus 1',\n", - " 'Q-Exactive-HF-Orbitrap_206': 'Q Exactive HF 206',\n", - " 'Q-Exactive-Plus-Orbitrap_143': 'Exactive Plus 143',\n", - " 'Q-Exactive-HF-Orbitrap_1': 'Q Exactive HF 1',\n", - " 'Q-Exactive-HF-Orbitrap_147': 'Q Exactive HF 147',\n", - " 'Q-Exactive-HF-Orbitrap_204': 'Q Exactive HF 204',\n", - " 'Q-Exactive-HF-Orbitrap_148': 'Q Exactive HF 148',\n", - " 'Q-Exactive-HF-Orbitrap_207': 'Q Exactive HF 207',\n", - " 'Q-Exactive-HF-Orbitrap_143': 'Q Exactive HF 143',\n", - " 'Orbitrap-Fusion-Lumos_FSN20115': 'Orbitrap Fusion Lumos FSN20115',\n", - " 'Q-Exactive-HF-Orbitrap_2612': 'Q Exactive HF 2612',\n", - " 'Q-Exactive-HF-X-Orbitrap_6016': 'Q Exactive HF-X 6016',\n", - " 'Q-Exactive-HF-X-Orbitrap_6004': 'Q Exactive HF-X 6004',\n", - " 'Q-Exactive-HF-X-Orbitrap_6075': 'Q Exactive HF-X 6075',\n", - " 'Q-Exactive-HF-X-Orbitrap_6078': 'Q Exactive HF-X 6078',\n", - " 'Q-Exactive-HF-X-Orbitrap_6070': 'Q Exactive HF-X 6070',\n", - " 'Q-Exactive-HF-X-Orbitrap_6071': 'Q Exactive HF-X 6071',\n", - " 'Q-Exactive-HF-X-Orbitrap_6011': 'Q Exactive HF-X 6011',\n", - " 'Q-Exactive-HF-X-Orbitrap_6073': 'Q Exactive HF-X 6073',\n", - " 'Q-Exactive-HF-X-Orbitrap_6101': 'Q Exactive HF-X 6101',\n", - " 'Q-Exactive-HF-X-Orbitrap_6096': 'Q Exactive HF-X 6096',\n", - " 'Exactive-Series-Orbitrap_6004': 'Exactive Series 6004',\n", - " 'Q-Exactive-HF-X-Orbitrap_6043': 'Q Exactive HF-X 6043',\n", - " 'Q-Exactive-HF-X-Orbitrap_6025': 'Q Exactive HF-X 6025',\n", - " 'Q-Exactive-HF-X-Orbitrap_6022': 'Q Exactive HF-X 6022',\n", - " 'Q-Exactive-HF-X-Orbitrap_6023': 'Q Exactive HF-X 6023',\n", - " 'Q-Exactive-HF-X-Orbitrap_6028': 'Q Exactive HF-X 6028',\n", - " 'Q-Exactive-HF-X-Orbitrap_6013': 'Q Exactive HF-X 6013',\n", - " 'Q-Exactive-HF-X-Orbitrap_6044': 'Q Exactive HF-X 6044',\n", - " 'Q-Exactive-HF-X-Orbitrap_6324': 'Q Exactive HF-X 6324',\n", - " 'Orbitrap-Exploris-480_Invalid_SN_0001': 'Orbitrap Exploris 480 Invalid SN 0001',\n", - " 'Orbitrap-Exploris-480_MA10134C': 'Orbitrap Exploris 480 MA10134C',\n", - " 'Orbitrap-Exploris-480_MA10132C': 'Orbitrap Exploris 480 MA10132C',\n", - " 
'Orbitrap-Exploris-480_MA10130C': 'Orbitrap Exploris 480 MA10130C',\n", - " 'Orbitrap-Exploris-480_MA10215C': 'Orbitrap Exploris 480 MA10215C'}\n", + " 'Q-Exactive-Plus-Orbitrap_1': 'Exactive Plus 1',\n", + " 'Q-Exactive-HF-Orbitrap_206': 'Q Exactive HF 206',\n", + " 'Q-Exactive-Plus-Orbitrap_143': 'Exactive Plus 143',\n", + " 'Q-Exactive-HF-Orbitrap_1': 'Q Exactive HF 1',\n", + " 'Q-Exactive-HF-Orbitrap_147': 'Q Exactive HF 147',\n", + " 'Q-Exactive-HF-Orbitrap_204': 'Q Exactive HF 204',\n", + " 'Q-Exactive-HF-Orbitrap_148': 'Q Exactive HF 148',\n", + " 'Q-Exactive-HF-Orbitrap_207': 'Q Exactive HF 207',\n", + " 'Q-Exactive-HF-Orbitrap_143': 'Q Exactive HF 143',\n", + " 'Orbitrap-Fusion-Lumos_FSN20115': 'Orbitrap Fusion Lumos FSN20115',\n", + " 'Q-Exactive-HF-Orbitrap_2612': 'Q Exactive HF 2612',\n", + " 'Q-Exactive-HF-X-Orbitrap_6016': 'Q Exactive HF-X 6016',\n", + " 'Q-Exactive-HF-X-Orbitrap_6004': 'Q Exactive HF-X 6004',\n", + " 'Q-Exactive-HF-X-Orbitrap_6075': 'Q Exactive HF-X 6075',\n", + " 'Q-Exactive-HF-X-Orbitrap_6078': 'Q Exactive HF-X 6078',\n", + " 'Q-Exactive-HF-X-Orbitrap_6070': 'Q Exactive HF-X 6070',\n", + " 'Q-Exactive-HF-X-Orbitrap_6071': 'Q Exactive HF-X 6071',\n", + " 'Q-Exactive-HF-X-Orbitrap_6011': 'Q Exactive HF-X 6011',\n", + " 'Q-Exactive-HF-X-Orbitrap_6073': 'Q Exactive HF-X 6073',\n", + " 'Q-Exactive-HF-X-Orbitrap_6101': 'Q Exactive HF-X 6101',\n", + " 'Q-Exactive-HF-X-Orbitrap_6096': 'Q Exactive HF-X 6096',\n", + " 'Exactive-Series-Orbitrap_6004': 'Exactive Series 6004',\n", + " 'Q-Exactive-HF-X-Orbitrap_6043': 'Q Exactive HF-X 6043',\n", + " 'Q-Exactive-HF-X-Orbitrap_6025': 'Q Exactive HF-X 6025',\n", + " 'Q-Exactive-HF-X-Orbitrap_6022': 'Q Exactive HF-X 6022',\n", + " 'Q-Exactive-HF-X-Orbitrap_6023': 'Q Exactive HF-X 6023',\n", + " 'Q-Exactive-HF-X-Orbitrap_6028': 'Q Exactive HF-X 6028',\n", + " 'Q-Exactive-HF-X-Orbitrap_6013': 'Q Exactive HF-X 6013',\n", + " 'Q-Exactive-HF-X-Orbitrap_6044': 'Q Exactive HF-X 6044',\n", + " 'Q-Exactive-HF-X-Orbitrap_6324': 'Q Exactive HF-X 6324',\n", + " 'Orbitrap-Exploris-480_Invalid_SN_0001': 'Orbitrap Exploris 480 Invalid SN 0001',\n", + " 'Orbitrap-Exploris-480_MA10134C': 'Orbitrap Exploris 480 MA10134C',\n", + " 'Orbitrap-Exploris-480_MA10132C': 'Orbitrap Exploris 480 MA10132C',\n", + " 'Orbitrap-Exploris-480_MA10130C': 'Orbitrap Exploris 480 MA10130C',\n", + " 'Orbitrap-Exploris-480_MA10215C': 'Orbitrap Exploris 480 MA10215C'}\n", "\n", "df_meta[\"instrument_label\"] = df_meta[\"instrument_label\"].replace(instrument_labels)" ] @@ -219,10 +219,10 @@ "\n", "counts_instrument = counts_instrument.join(\n", " (df_meta\n", - " [[*thermo_raw_files.cols_instrument, 'instrument_label']]\n", - " .drop_duplicates()\n", - " .set_index(thermo_raw_files.cols_instrument)\n", - " )\n", + " [[*thermo_raw_files.cols_instrument, 'instrument_label']]\n", + " .drop_duplicates()\n", + " .set_index(thermo_raw_files.cols_instrument)\n", + " )\n", " .set_index('instrument_label', append=True)\n", ")\n", "counts_instrument.to_excel(\n", @@ -286,11 +286,11 @@ "source": [ "fig, ax = plt.subplots()\n", "ax = (counts_instrument\n", - " .plot\n", - " .bar(\n", - " ax=ax,\n", - " )\n", - ")\n", + " .plot\n", + " .bar(\n", + " ax=ax,\n", + " )\n", + " )\n", "ax.set_xlabel('')\n", "ax.set_ylabel('number of samples (runs)')\n", "fname = out_folder / 'number_of_samples_per_instrument.pdf'\n", @@ -331,7 +331,7 @@ "ax = (df_meta\n", " .loc[mask, cols]\n", " .plot\n", - " .scatter(cols[0], cols[1],\n", + " .scatter(cols[0], cols[1],\n", " color='orange',\n", " 
label='normal files',\n", " ylabel='filesize (in GB)',\n", @@ -362,7 +362,9 @@ "cols = vaep.pandas.get_columns_accessor_from_iterable(cols)\n", "\n", "view = df_meta.loc[mask_top10_instruments]\n", - "view[\"instrument_label+N\"] = view[\"instrument_label\"].replace(counts_instrument.to_frame().apply( lambda s: f\"{s.name} (N={s['count']:03d})\" , axis=1))\n", + "view[\"instrument_label+N\"] = view[\"instrument_label\"].replace(\n", + " counts_instrument.to_frame().apply(\n", + " lambda s: f\"{s.name} (N={s['count']:03d})\", axis=1))\n", "view" ] }, @@ -391,7 +393,7 @@ " title='instrument label',\n", " loc='upper right',\n", " # alignment='left',\n", - ")\n", + " )\n", "ax.xaxis.set_major_formatter(\"{x:,.0f}\")\n", "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", "fname = out_folder / 'ms1_to_ms2_top10_instruments.pdf'\n", @@ -457,7 +459,7 @@ "\n", "fig, ax = plt.subplots()\n", "\n", - "ax = ax = seaborn.scatterplot(\n", + "ax = seaborn.scatterplot(\n", " view,\n", " x=cols.MS_max_RT,\n", " y=cols.Peptide_Sequences_Identified,\n", diff --git a/project/00_3_1_pride_metadata_analysis.py b/project/00_3_1_pride_metadata_analysis.py index 2ec9a8fae..c6ad55d12 100644 --- a/project/00_3_1_pride_metadata_analysis.py +++ b/project/00_3_1_pride_metadata_analysis.py @@ -55,41 +55,41 @@ # Orbitrap Fusion Lumos MS:1002732 instrument_labels = {'Q-Exactive-Orbitrap_1': 'Q Exactive 1', - 'Q-Exactive-Plus-Orbitrap_1': 'Exactive Plus 1', - 'Q-Exactive-HF-Orbitrap_206': 'Q Exactive HF 206', - 'Q-Exactive-Plus-Orbitrap_143': 'Exactive Plus 143', - 'Q-Exactive-HF-Orbitrap_1': 'Q Exactive HF 1', - 'Q-Exactive-HF-Orbitrap_147': 'Q Exactive HF 147', - 'Q-Exactive-HF-Orbitrap_204': 'Q Exactive HF 204', - 'Q-Exactive-HF-Orbitrap_148': 'Q Exactive HF 148', - 'Q-Exactive-HF-Orbitrap_207': 'Q Exactive HF 207', - 'Q-Exactive-HF-Orbitrap_143': 'Q Exactive HF 143', - 'Orbitrap-Fusion-Lumos_FSN20115': 'Orbitrap Fusion Lumos FSN20115', - 'Q-Exactive-HF-Orbitrap_2612': 'Q Exactive HF 2612', - 'Q-Exactive-HF-X-Orbitrap_6016': 'Q Exactive HF-X 6016', - 'Q-Exactive-HF-X-Orbitrap_6004': 'Q Exactive HF-X 6004', - 'Q-Exactive-HF-X-Orbitrap_6075': 'Q Exactive HF-X 6075', - 'Q-Exactive-HF-X-Orbitrap_6078': 'Q Exactive HF-X 6078', - 'Q-Exactive-HF-X-Orbitrap_6070': 'Q Exactive HF-X 6070', - 'Q-Exactive-HF-X-Orbitrap_6071': 'Q Exactive HF-X 6071', - 'Q-Exactive-HF-X-Orbitrap_6011': 'Q Exactive HF-X 6011', - 'Q-Exactive-HF-X-Orbitrap_6073': 'Q Exactive HF-X 6073', - 'Q-Exactive-HF-X-Orbitrap_6101': 'Q Exactive HF-X 6101', - 'Q-Exactive-HF-X-Orbitrap_6096': 'Q Exactive HF-X 6096', - 'Exactive-Series-Orbitrap_6004': 'Exactive Series 6004', - 'Q-Exactive-HF-X-Orbitrap_6043': 'Q Exactive HF-X 6043', - 'Q-Exactive-HF-X-Orbitrap_6025': 'Q Exactive HF-X 6025', - 'Q-Exactive-HF-X-Orbitrap_6022': 'Q Exactive HF-X 6022', - 'Q-Exactive-HF-X-Orbitrap_6023': 'Q Exactive HF-X 6023', - 'Q-Exactive-HF-X-Orbitrap_6028': 'Q Exactive HF-X 6028', - 'Q-Exactive-HF-X-Orbitrap_6013': 'Q Exactive HF-X 6013', - 'Q-Exactive-HF-X-Orbitrap_6044': 'Q Exactive HF-X 6044', - 'Q-Exactive-HF-X-Orbitrap_6324': 'Q Exactive HF-X 6324', - 'Orbitrap-Exploris-480_Invalid_SN_0001': 'Orbitrap Exploris 480 Invalid SN 0001', - 'Orbitrap-Exploris-480_MA10134C': 'Orbitrap Exploris 480 MA10134C', - 'Orbitrap-Exploris-480_MA10132C': 'Orbitrap Exploris 480 MA10132C', - 'Orbitrap-Exploris-480_MA10130C': 'Orbitrap Exploris 480 MA10130C', - 'Orbitrap-Exploris-480_MA10215C': 'Orbitrap Exploris 480 MA10215C'} + 'Q-Exactive-Plus-Orbitrap_1': 'Exactive Plus 1', + 
'Q-Exactive-HF-Orbitrap_206': 'Q Exactive HF 206', + 'Q-Exactive-Plus-Orbitrap_143': 'Exactive Plus 143', + 'Q-Exactive-HF-Orbitrap_1': 'Q Exactive HF 1', + 'Q-Exactive-HF-Orbitrap_147': 'Q Exactive HF 147', + 'Q-Exactive-HF-Orbitrap_204': 'Q Exactive HF 204', + 'Q-Exactive-HF-Orbitrap_148': 'Q Exactive HF 148', + 'Q-Exactive-HF-Orbitrap_207': 'Q Exactive HF 207', + 'Q-Exactive-HF-Orbitrap_143': 'Q Exactive HF 143', + 'Orbitrap-Fusion-Lumos_FSN20115': 'Orbitrap Fusion Lumos FSN20115', + 'Q-Exactive-HF-Orbitrap_2612': 'Q Exactive HF 2612', + 'Q-Exactive-HF-X-Orbitrap_6016': 'Q Exactive HF-X 6016', + 'Q-Exactive-HF-X-Orbitrap_6004': 'Q Exactive HF-X 6004', + 'Q-Exactive-HF-X-Orbitrap_6075': 'Q Exactive HF-X 6075', + 'Q-Exactive-HF-X-Orbitrap_6078': 'Q Exactive HF-X 6078', + 'Q-Exactive-HF-X-Orbitrap_6070': 'Q Exactive HF-X 6070', + 'Q-Exactive-HF-X-Orbitrap_6071': 'Q Exactive HF-X 6071', + 'Q-Exactive-HF-X-Orbitrap_6011': 'Q Exactive HF-X 6011', + 'Q-Exactive-HF-X-Orbitrap_6073': 'Q Exactive HF-X 6073', + 'Q-Exactive-HF-X-Orbitrap_6101': 'Q Exactive HF-X 6101', + 'Q-Exactive-HF-X-Orbitrap_6096': 'Q Exactive HF-X 6096', + 'Exactive-Series-Orbitrap_6004': 'Exactive Series 6004', + 'Q-Exactive-HF-X-Orbitrap_6043': 'Q Exactive HF-X 6043', + 'Q-Exactive-HF-X-Orbitrap_6025': 'Q Exactive HF-X 6025', + 'Q-Exactive-HF-X-Orbitrap_6022': 'Q Exactive HF-X 6022', + 'Q-Exactive-HF-X-Orbitrap_6023': 'Q Exactive HF-X 6023', + 'Q-Exactive-HF-X-Orbitrap_6028': 'Q Exactive HF-X 6028', + 'Q-Exactive-HF-X-Orbitrap_6013': 'Q Exactive HF-X 6013', + 'Q-Exactive-HF-X-Orbitrap_6044': 'Q Exactive HF-X 6044', + 'Q-Exactive-HF-X-Orbitrap_6324': 'Q Exactive HF-X 6324', + 'Orbitrap-Exploris-480_Invalid_SN_0001': 'Orbitrap Exploris 480 Invalid SN 0001', + 'Orbitrap-Exploris-480_MA10134C': 'Orbitrap Exploris 480 MA10134C', + 'Orbitrap-Exploris-480_MA10132C': 'Orbitrap Exploris 480 MA10132C', + 'Orbitrap-Exploris-480_MA10130C': 'Orbitrap Exploris 480 MA10130C', + 'Orbitrap-Exploris-480_MA10215C': 'Orbitrap Exploris 480 MA10215C'} df_meta["instrument_label"] = df_meta["instrument_label"].replace(instrument_labels) @@ -126,10 +126,10 @@ counts_instrument = counts_instrument.join( (df_meta - [[*thermo_raw_files.cols_instrument, 'instrument_label']] - .drop_duplicates() - .set_index(thermo_raw_files.cols_instrument) - ) + [[*thermo_raw_files.cols_instrument, 'instrument_label']] + .drop_duplicates() + .set_index(thermo_raw_files.cols_instrument) + ) .set_index('instrument_label', append=True) ) counts_instrument.to_excel( @@ -164,11 +164,11 @@ # %% fig, ax = plt.subplots() ax = (counts_instrument - .plot - .bar( - ax=ax, - ) -) + .plot + .bar( + ax=ax, + ) + ) ax.set_xlabel('') ax.set_ylabel('number of samples (runs)') fname = out_folder / 'number_of_samples_per_instrument.pdf' @@ -197,7 +197,7 @@ ax = (df_meta .loc[mask, cols] .plot - .scatter(cols[0], cols[1], + .scatter(cols[0], cols[1], color='orange', label='normal files', ylabel='filesize (in GB)', @@ -221,7 +221,9 @@ cols = vaep.pandas.get_columns_accessor_from_iterable(cols) view = df_meta.loc[mask_top10_instruments] -view["instrument_label+N"] = view["instrument_label"].replace(counts_instrument.to_frame().apply( lambda s: f"{s.name} (N={s['count']:03d})" , axis=1)) +view["instrument_label+N"] = view["instrument_label"].replace( + counts_instrument.to_frame().apply( + lambda s: f"{s.name} (N={s['count']:03d})", axis=1)) view # %% @@ -241,7 +243,7 @@ title='instrument label', loc='upper right', # alignment='left', -) + ) ax.xaxis.set_major_formatter("{x:,.0f}") 
ax.yaxis.set_major_formatter("{x:,.0f}") fname = out_folder / 'ms1_to_ms2_top10_instruments.pdf' diff --git a/project/00_4_development_dataset_support.py b/project/00_4_development_dataset_support.py index c6f34a9dd..31c671c70 100644 --- a/project/00_4_development_dataset_support.py +++ b/project/00_4_development_dataset_support.py @@ -20,13 +20,14 @@ import pandas as pd import plotly.express as px -import vaep # set formatting defaults +import vaep # set formatting defaults # %% [markdown] # ## Parameters # %% tags=["parameters"] -support_json: str = 'data\dev_datasets\df_intensities_proteinGroups_long\Q_Exactive_HF_X_Orbitrap_6070_support.json' # Path to json support file +# Path to json support file +support_json: str = 'data\\dev_datasets\\df_intensities_proteinGroups_long\\Q_Exactive_HF_X_Orbitrap_6070_support.json' # %% [markdown] # ## Completeness of samples @@ -36,10 +37,10 @@ support.head() # %% -support.describe(percentiles=np.linspace(0.1,1,10)) +support.describe(percentiles=np.linspace(0.1, 1, 10)) # %% -ax = support.plot(rot=90, figsize=(20,10), legend=False) +ax = support.plot(rot=90, figsize=(20, 10), legend=False) ax.set_ylabel('number of features') ax.yaxis.set_major_formatter("{x:,.0f}") diff --git a/project/00_4_hela_development_dataset_splitting.ipynb b/project/00_4_hela_development_dataset_splitting.ipynb index 2a1a337cf..bdf50a5cd 100644 --- a/project/00_4_hela_development_dataset_splitting.ipynb +++ b/project/00_4_hela_development_dataset_splitting.ipynb @@ -71,10 +71,10 @@ "source": [ "N_MIN_INSTRUMENT = 300\n", "META_DATA: str = 'data/files_selected_metadata.csv'\n", - "FILE_EXT = 'pkl' # 'csv' or 'pkl'\n", + "FILE_EXT = 'pkl' # 'csv' or 'pkl'\n", "SAMPLE_ID = 'Sample ID'\n", "\n", - "DUMP: str = erda_dumps.FN_PROTEIN_GROUPS # Filepath to erda dump\n", + "DUMP: str = erda_dumps.FN_PROTEIN_GROUPS # Filepath to erda dump\n", "OUT_NAME = 'protein group' # for legends labels\n", "# DUMP: str = erda_dumps.FN_PEPTIDES\n", "# OUT_NAME = 'peptide' # for legends labels\n", @@ -181,7 +181,7 @@ "# feat_name = list(data.index.names)\n", "# feat_name.remove(SAMPLE_ID)\n", "feat_name = (OUT_NAME,)\n", - "feat_name # index name(s) which are not the sample index" + "feat_name # index name(s) which are not the sample index" ] }, { @@ -209,7 +209,7 @@ "outputs": [], "source": [ "# sample_ids = data.index.levels[0] # assume first index position is Sample ID?\n", - "sample_ids = data.index.unique() #.get_level_values(SAMPLE_ID).unique() # more explict\n", + "sample_ids = data.index.unique() # .get_level_values(SAMPLE_ID).unique() # more explict\n", "sample_ids" ] }, @@ -258,10 +258,10 @@ "outputs": [], "source": [ "idx_all = (pd.to_datetime(df_meta[\"Content Creation Date\"]).dt.strftime(\"%Y_%m_%d_%H_%M\")\n", - " + '_'\n", - " + df_meta[\"Thermo Scientific instrument model\"].str.replace(' ', '-')\n", - " + '_'\n", - " + df_meta[\"instrument serial number\"].str.split('#').str[-1])\n", + " + '_'\n", + " + df_meta[\"Thermo Scientific instrument model\"].str.replace(' ', '-')\n", + " + '_'\n", + " + df_meta[\"instrument serial number\"].str.split('#').str[-1])\n", "\n", "mask = idx_all.duplicated(keep=False)\n", "duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps\n", @@ -276,9 +276,9 @@ "metadata": {}, "outputs": [], "source": [ - "data_duplicates = data.loc[duplicated_sample_idx.index] #.unstack()\n", + "data_duplicates = data.loc[duplicated_sample_idx.index] # .unstack()\n", "# data_duplicates.T.corr() # same samples are have corr. 
of 1\n", - "data_duplicates.sum(axis=1) # keep only one seems okay" + "data_duplicates.sum(axis=1) # keep only one seems okay" ] }, { @@ -342,7 +342,7 @@ }, "outputs": [], "source": [ - "counts = data.count(axis=1) # wide format\n", + "counts = data.count(axis=1) # wide format\n", "N = len(counts)\n", "fname = FOLDER_DATASETS / 'support_all.json'\n", "files_out[fname.name] = fname\n", @@ -373,7 +373,7 @@ }, "outputs": [], "source": [ - "counts = data.count(axis=0) # wide format\n", + "counts = data.count(axis=0) # wide format\n", "counts.to_json(FOLDER_DATASETS / 'feat_completeness_all.json', indent=4)\n", "ax = (counts\n", " .sort_values() # will raise an error with a DataFrame\n", @@ -592,7 +592,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Summary statistics for top 5 instruments " + "## Summary statistics for top 5 instruments" ] }, { @@ -646,7 +646,7 @@ "metadata": {}, "outputs": [], "source": [ - "top_5_meta = df_meta.loc[mask_top_5] \n", + "top_5_meta = df_meta.loc[mask_top_5]\n", "top_5_meta[['injection volume setting', 'dilution factor']].describe()" ] }, @@ -664,7 +664,7 @@ "outputs": [], "source": [ "for _instrument, _df_meta_instrument in top_5_meta.groupby(by=thermo_raw_files.cols_instrument):\n", - " print('#'* 80, ' - '.join(_instrument), sep='\\n')\n", + " print('#' * 80, ' - '.join(_instrument), sep='\\n')\n", " display(_df_meta_instrument.describe())\n", " display(_df_meta_instrument['injection volume setting'].value_counts())\n", " break" @@ -728,7 +728,7 @@ "\n", " # calculate support\n", " counts = dataset.count(axis=1).squeeze()\n", - " ## to disk\n", + " # to disk\n", " fname_support = vaep.io.get_fname_from_keys(values,\n", " folder='.',\n", " file_ext=\"\")\n", @@ -736,7 +736,7 @@ " (fname_support.stem + '_support.json').replace('Exactive_Series_slot_#', ''))\n", " files_out[fname_support.name] = fname_support\n", " logger.info(f\"Dump support to: {fname_support.as_posix()}\")\n", - " \n", + "\n", " counts.to_json(fname_support, indent=4)\n", "\n", " # very slow alternative, but 100% correct\n", @@ -757,7 +757,7 @@ " ))\n", " vaep.plotting.add_prop_as_second_yaxis(ax, M)\n", " fig.tight_layout()\n", - " fname_support = fname_support.with_suffix('.pdf') \n", + " fname_support = fname_support.with_suffix('.pdf')\n", " files_out[fname_support.name] = fname_support\n", " vaep.plotting.savefig(fig, name=fname_support)" ] diff --git a/project/00_4_hela_development_dataset_splitting.py b/project/00_4_hela_development_dataset_splitting.py index bafc8dac3..5cba7438c 100644 --- a/project/00_4_hela_development_dataset_splitting.py +++ b/project/00_4_hela_development_dataset_splitting.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -54,10 +54,10 @@ # %% tags=["parameters"] N_MIN_INSTRUMENT = 300 META_DATA: str = 'data/files_selected_metadata.csv' -FILE_EXT = 'pkl' # 'csv' or 'pkl' +FILE_EXT = 'pkl' # 'csv' or 'pkl' SAMPLE_ID = 'Sample ID' -DUMP: str = erda_dumps.FN_PROTEIN_GROUPS # Filepath to erda dump +DUMP: str = erda_dumps.FN_PROTEIN_GROUPS # Filepath to erda dump OUT_NAME = 'protein group' # for legends labels # DUMP: str = erda_dumps.FN_PEPTIDES # OUT_NAME = 'peptide' # for legends labels @@ -118,7 +118,7 @@ # feat_name = list(data.index.names) # feat_name.remove(SAMPLE_ID) feat_name = (OUT_NAME,) -feat_name # index name(s) which are not the sample index +feat_name # index name(s) which are not the 
sample index # %% # M = len(data.index.levels[-1]) @@ -130,7 +130,7 @@ # %% # sample_ids = data.index.levels[0] # assume first index position is Sample ID? -sample_ids = data.index.unique() #.get_level_values(SAMPLE_ID).unique() # more explict +sample_ids = data.index.unique() # .get_level_values(SAMPLE_ID).unique() # more explict sample_ids # %% [markdown] @@ -151,10 +151,10 @@ # - drop metadata (entire) # %% idx_all = (pd.to_datetime(df_meta["Content Creation Date"]).dt.strftime("%Y_%m_%d_%H_%M") - + '_' - + df_meta["Thermo Scientific instrument model"].str.replace(' ', '-') - + '_' - + df_meta["instrument serial number"].str.split('#').str[-1]) + + '_' + + df_meta["Thermo Scientific instrument model"].str.replace(' ', '-') + + '_' + + df_meta["instrument serial number"].str.split('#').str[-1]) mask = idx_all.duplicated(keep=False) duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps @@ -162,9 +162,9 @@ # # %% -data_duplicates = data.loc[duplicated_sample_idx.index] #.unstack() +data_duplicates = data.loc[duplicated_sample_idx.index] # .unstack() # data_duplicates.T.corr() # same samples are have corr. of 1 -data_duplicates.sum(axis=1) # keep only one seems okay +data_duplicates.sum(axis=1) # keep only one seems okay # %% idx_unique = idx_all.drop_duplicates() @@ -191,7 +191,7 @@ # ## Support per sample in entire data set # %% -counts = data.count(axis=1) # wide format +counts = data.count(axis=1) # wide format N = len(counts) fname = FOLDER_DATASETS / 'support_all.json' files_out[fname.name] = fname @@ -215,7 +215,7 @@ # %% -counts = data.count(axis=0) # wide format +counts = data.count(axis=0) # wide format counts.to_json(FOLDER_DATASETS / 'feat_completeness_all.json', indent=4) ax = (counts .sort_values() # will raise an error with a DataFrame @@ -358,7 +358,7 @@ vaep.savefig(fig, name=fname) # %% [markdown] -# ## Summary statistics for top 5 instruments +# ## Summary statistics for top 5 instruments # %% fig, ax = plt.subplots(1, 1, figsize=(6, 6)) @@ -399,7 +399,7 @@ # %% -top_5_meta = df_meta.loc[mask_top_5] +top_5_meta = df_meta.loc[mask_top_5] top_5_meta[['injection volume setting', 'dilution factor']].describe() # %% [markdown] @@ -407,7 +407,7 @@ # %% for _instrument, _df_meta_instrument in top_5_meta.groupby(by=thermo_raw_files.cols_instrument): - print('#'* 80, ' - '.join(_instrument), sep='\n') + print('#' * 80, ' - '.join(_instrument), sep='\n') display(_df_meta_instrument.describe()) display(_df_meta_instrument['injection volume setting'].value_counts()) break @@ -455,7 +455,7 @@ # calculate support counts = dataset.count(axis=1).squeeze() - ## to disk + # to disk fname_support = vaep.io.get_fname_from_keys(values, folder='.', file_ext="") @@ -463,7 +463,7 @@ (fname_support.stem + '_support.json').replace('Exactive_Series_slot_#', '')) files_out[fname_support.name] = fname_support logger.info(f"Dump support to: {fname_support.as_posix()}") - + counts.to_json(fname_support, indent=4) # very slow alternative, but 100% correct @@ -484,7 +484,7 @@ )) vaep.plotting.add_prop_as_second_yaxis(ax, M) fig.tight_layout() - fname_support = fname_support.with_suffix('.pdf') + fname_support = fname_support.with_suffix('.pdf') files_out[fname_support.name] = fname_support vaep.plotting.savefig(fig, name=fname_support) diff --git a/project/00_6_0_permute_data.ipynb b/project/00_6_0_permute_data.ipynb index d8b6493db..c16637bc1 100644 --- a/project/00_6_0_permute_data.ipynb +++ b/project/00_6_0_permute_data.ipynb @@ -23,6 +23,7 @@ "import numpy as np\n", "import 
vaep\n", "import vaep.analyzers.analyzers\n", + "from vaep.utils import create_random_df\n", "\n", "logger = vaep.logging.setup_nb_logger()\n", "logger.info(\"Split data and make diagnostic plots\")" @@ -35,7 +36,6 @@ "metadata": {}, "outputs": [], "source": [ - "from vaep.utils import create_random_df\n", "t = create_random_df(N=10, M=3)\n", "t = t.apply(lambda x: np.arange(len(x)))\n", "t" @@ -77,12 +77,11 @@ }, "outputs": [], "source": [ - "FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensiites for features (columns)\n", - "index_col: Union[str, int] = 0 # Can be either a string or position (typical 0 for first column), or a list of these.\n", - "# wide_format: bool = False # intensities in wide format (more memory efficient of csv). Default is long_format (more precise)\n", - "column_names: List[str] = [\"Gene Names\"] # Manuelly set column names (of Index object in columns)\n", - "out_folder: str = '' # Output folder for permuted data, optional. Default is to save with suffix '_permuted' in same folder as input data\n", - "random_seed: int = 42 # Random seed for reproducibility\n", + "FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensiites for features (columns)\n", + "index_col: Union[str, int] = 0 # Can be either a string or position (typical 0 for first column), or a list of these.\n", + "column_names: List[str] = [\"Gene Names\"] # Manuelly set column names (of Index object in columns)\n", + "out_folder: str = '' # Output folder for permuted data, optional. Default is to save with suffix '_permuted' in same folder as input data\n", + "random_seed: int = 42 # Random seed for reproducibility\n", "file_format: str = 'pkl'" ] }, @@ -149,9 +148,9 @@ "\n", "\n", "FILE_FORMAT_TO_CONSTRUCTOR_IN = {'csv': 'from_csv',\n", - " 'pkl': 'from_pickle',\n", - " 'pickle': 'from_pickle',\n", - " }\n", + " 'pkl': 'from_pickle',\n", + " 'pickle': 'from_pickle',\n", + " }\n", "\n", "FILE_EXT = Path(args.FN_INTENSITIES).suffix[1:]\n", "logger.info(f\"File format (extension): {FILE_EXT} (!specifies data loading function!)\")" @@ -168,10 +167,10 @@ "source": [ "constructor = getattr(\n", " vaep.analyzers.analyzers.AnalyzePeptides,\n", - " FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) #AnalyzePeptides.from_csv \n", + " FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) # AnalyzePeptides.from_csv\n", "analysis = constructor(fname=args.FN_INTENSITIES,\n", - " index_col=args.index_col,\n", - " )" + " index_col=args.index_col,\n", + " )" ] }, { @@ -215,7 +214,7 @@ "\n", "method = getattr(df, FILE_FORMAT_TO_CONSTRUCTOR.get(FILE_EXT))\n", "\n", - "fname = vaep.utils.append_to_filepath(args.FN_INTENSITIES , 'permuted')\n", + "fname = vaep.utils.append_to_filepath(args.FN_INTENSITIES, 'permuted')\n", "method(fname)" ] }, @@ -228,10 +227,10 @@ "source": [ "constructor = getattr(\n", " vaep.analyzers.analyzers.AnalyzePeptides,\n", - " FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) #AnalyzePeptides.from_csv \n", + " FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) # AnalyzePeptides.from_csv\n", "analysis = constructor(fname=args.FN_INTENSITIES,\n", - " index_col=args.index_col,\n", - " )" + " index_col=args.index_col,\n", + " )" ] } ], @@ -257,7 +256,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/project/00_6_0_permute_data.py b/project/00_6_0_permute_data.py index 
bc33b0774..9c6612d2e 100644 --- a/project/00_6_0_permute_data.py +++ b/project/00_6_0_permute_data.py @@ -9,12 +9,12 @@ import numpy as np import vaep import vaep.analyzers.analyzers +from vaep.utils import create_random_df logger = vaep.logging.setup_nb_logger() logger.info("Split data and make diagnostic plots") # %% -from vaep.utils import create_random_df t = create_random_df(N=10, M=3) t = t.apply(lambda x: np.arange(len(x))) t @@ -30,12 +30,11 @@ # %% tags=["parameters"] -FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensiites for features (columns) -index_col: Union[str, int] = 0 # Can be either a string or position (typical 0 for first column), or a list of these. -# wide_format: bool = False # intensities in wide format (more memory efficient of csv). Default is long_format (more precise) -column_names: List[str] = ["Gene Names"] # Manuelly set column names (of Index object in columns) -out_folder: str = '' # Output folder for permuted data, optional. Default is to save with suffix '_permuted' in same folder as input data -random_seed: int = 42 # Random seed for reproducibility +FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensiites for features (columns) +index_col: Union[str, int] = 0 # Can be either a string or position (typical 0 for first column), or a list of these. +column_names: List[str] = ["Gene Names"] # Manuelly set column names (of Index object in columns) +out_folder: str = '' # Output folder for permuted data, optional. Default is to save with suffix '_permuted' in same folder as input data +random_seed: int = 42 # Random seed for reproducibility file_format: str = 'pkl' # %% @@ -63,9 +62,9 @@ FILE_FORMAT_TO_CONSTRUCTOR_IN = {'csv': 'from_csv', - 'pkl': 'from_pickle', - 'pickle': 'from_pickle', - } + 'pkl': 'from_pickle', + 'pickle': 'from_pickle', + } FILE_EXT = Path(args.FN_INTENSITIES).suffix[1:] logger.info(f"File format (extension): {FILE_EXT} (!specifies data loading function!)") @@ -73,10 +72,10 @@ # %% constructor = getattr( vaep.analyzers.analyzers.AnalyzePeptides, - FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) #AnalyzePeptides.from_csv + FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) # AnalyzePeptides.from_csv analysis = constructor(fname=args.FN_INTENSITIES, - index_col=args.index_col, - ) + index_col=args.index_col, + ) # %% @@ -95,12 +94,12 @@ method = getattr(df, FILE_FORMAT_TO_CONSTRUCTOR.get(FILE_EXT)) -fname = vaep.utils.append_to_filepath(args.FN_INTENSITIES , 'permuted') +fname = vaep.utils.append_to_filepath(args.FN_INTENSITIES, 'permuted') method(fname) # %% constructor = getattr( vaep.analyzers.analyzers.AnalyzePeptides, - FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) #AnalyzePeptides.from_csv + FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT]) # AnalyzePeptides.from_csv analysis = constructor(fname=args.FN_INTENSITIES, - index_col=args.index_col, - ) + index_col=args.index_col, + ) diff --git a/project/00_6_hela_training_data_exploration.ipynb b/project/00_6_hela_training_data_exploration.ipynb index ab393060d..856f3919d 100644 --- a/project/00_6_hela_training_data_exploration.ipynb +++ b/project/00_6_hela_training_data_exploration.ipynb @@ -2,159 +2,103 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "source": [ - "# Inspect data using plots\n", - "- spread of intensities between samples\n", - "- spread of intensities within samples\n", - "- missing 
data plots: violin, box and histogram - both for features and samples\n", - " - optionally: plot proposed cutoffs (on per default)\n", - "- correlation analysis: can linear correlation be picked up?\n", - "-\n", + "# Peptides\n", "\n", - "Does not save filtered data, this is done by splitting notebook. Only visualisations." + "Load peptides selected for training" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "from __future__ import annotations\n", - "import json\n", + "from datetime import datetime\n", + "from functools import partial\n", "from pathlib import Path\n", + "from random import sample\n", "\n", + "import ipywidgets as w\n", "import numpy as np\n", "import pandas as pd\n", + "import matplotlib\n", "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", "\n", - "import matplotlib\n", + "# from sklearn import preprocessing\n", + "# from sklearn.decomposition import PCA\n", + "import seaborn as sns\n", "\n", "import vaep\n", - "from vaep import plotting\n", - "from vaep.pandas import missing_data\n", - "import vaep.data_handling\n", + "from vaep.data_handling import coverage\n", + "from vaep.plotting import _savefig\n", + "\n", + "import config\n", "from vaep.analyzers import analyzers\n", + "from vaep.io.data_objects import PeptideCounter\n", + "from vaep.transform import log\n", + "\n", + "pd.options.display.max_columns = 100\n", + "pd.options.display.min_rows = 30" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Descriptive Statistics (Linear case)\n", "\n", - "logger = vaep.logging.setup_nb_logger()\n", - "\n", - "matplotlib.rcParams.update({'font.size': 5,\n", - " 'figure.figsize': [4.0, 2.0]})\n", - "\n", - "\n", - "def only_every_x_ticks(ax, x=2, axis=None):\n", - " \"\"\"Sparse out ticks on both axis by factor x\"\"\"\n", - " if axis is None:\n", - " ax.set_xticks(ax.get_xticks()[::x])\n", - " ax.set_yticks(ax.get_yticks()[::x])\n", - " else:\n", - " if axis == 0:\n", - " ax.set_xticks(ax.get_xticks()[::x])\n", - " elif axis == 1:\n", - " ax.set_yticks(ax.get_yticks()[::x])\n", - " else:\n", - " raise ValueError(f'axis must be 0 or 1, got {axis}')\n", - " return ax\n", - "\n", - "\n", - "def use_first_n_chars_in_labels(ax, x=2):\n", - " \"\"\"Take first N characters of labels and use them as new labels\"\"\"\n", - " # xaxis\n", - " _new_labels = [l.get_text()[:x]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - " # yaxis\n", - " _new_labels = [l.get_text()[:x] for l in ax.get_yticklabels()]\n", - " _ = ax.set_yticklabels(_new_labels)\n", - " return ax\n", - "\n", - "\n", - "def split_xticklabels(ax, PG_SEPARATOR=';'):\n", - " \"\"\"Split labels by PG_SEPARATOR and only use first part\"\"\"\n", - " if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - " return ax\n", - "\n", - "\n", - "def get_clustermap(data,\n", - " figsize=(8, 8),\n", - " cbar_pos: tuple[float, float, float, float] = (\n", - " 0.02, 0.83, 0.03, 0.15),\n", - " **kwargs):\n", - " from sklearn.impute import SimpleImputer\n", - " from vaep.pandas import _add_indices\n", - " X = SimpleImputer().fit_transform(data)\n", - " X = _add_indices(X, data)\n", - " cg = sns.clustermap(X,\n", - " z_score=0,\n", - " cmap=\"vlag\",\n", - " center=0,\n", - " cbar_pos=cbar_pos,\n", - " figsize=figsize,\n", - " **kwargs\n", - " 
)\n", - " return cg\n", - "\n", - "\n", - "def get_dynamic_range(min_max):\n", - " dynamic_range = pd.DataFrame(range(*min_max), columns=['x'])\n", - " dynamic_range['$2^x$'] = dynamic_range.x.apply(lambda x: 2**x)\n", - " dynamic_range.set_index('x', inplace=True)\n", - " dynamic_range.index.name = ''\n", - " dynamic_range = dynamic_range.T\n", - " return dynamic_range" + "- spread of peptide quantifications between samples\n", + "- spread of quantifications within samples\n", + "- correlation analysis: can linear correlation be picked up?\n" ] }, { "cell_type": "markdown", + "id": "8b4a827b", "metadata": {}, "source": [ - "## Parameters" + "### Peptides" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 2, - "tags": [ - "parameters" - ] - }, + "metadata": {}, + "outputs": [], + "source": [ + "FN_PEPTIDE_INTENSITIES = Path(\n", + " 'data/dev_datasets/df_intensities_proteinGroups_long_2017_2018_2019_2020_N05015_M04547/Q_Exactive_HF_X_Orbitrap_Exactive_Series_slot_#6070.csv')\n", + "FIGUREFOLDER = FN_PEPTIDE_INTENSITIES.parent / 'figures' / FN_PEPTIDE_INTENSITIES.stem\n", + "FIGUREFOLDER.mkdir(exist_ok=True, parents=True)\n", + "FIGUREFOLDER" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "858f0520", + "metadata": {}, "outputs": [], "source": [ - "FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", - "FOLDER_EXPERIMENT: str = 'runs/example/data_inspection'\n", "N_FIRST_ROWS = None # possibility to select N first rows\n", - "LOG_TRANSFORM: bool = True # log transform data\n", - "# list of integers or string denoting the index columns (used for csv)\n", - "INDEX_COL: list = [0]\n", - "COL_INDEX_NAME: str = 'Protein groups' # name of column index, can be None\n", - "LONG_FORMAT: bool = False # if True, the data is expected to be in long format\n", - "# Threshold used later for data filtering (here only for visualisation)\n", - "COMPLETENESS_OVER_SAMPLES = 0.25 # 25% of samples have to have that features\n", - "MIN_FEAT_PER_SAMPLE = .4 # 40% of features selected in first step\n", - "# protein group separator, e.g.';' (could also be gene groups)\n", - "PG_SEPARATOR: str = ';'\n", - "SAMPLE_FIRST_N_CHARS: int = 16 # number of characters used for sample names\n", - "# if True, do not use tick on heatmap - only label\n", - "NO_TICK_LABELS_ON_HEATMAP: bool = True" + "analysis = analyzers.AnalyzePeptides.from_csv(fname=FN_PEPTIDE_INTENSITIES, index_col=[0, 1], nrows=N_FIRST_ROWS)\n", + "df = analysis.to_wide_format()\n", + "analysis.describe_peptides(sample_n=30)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Load and check data\n", + "### Peptide frequency: sellect the N most common peptides\n", "\n", - "- supported for now: pickle and comma separated\n", - "- transform long to wide data?\n", - "- log transform data using logarithm of two?\n", - "- remove entirely missing columns (features) or rows (samples)" + "- N most common peptides between samples" ] }, { @@ -163,29 +107,20 @@ "metadata": {}, "outputs": [], "source": [ - "FOLDER_EXPERIMENT = Path(FOLDER_EXPERIMENT)\n", - "FN_INTENSITIES = Path(FN_INTENSITIES)\n", + "N = 10\n", "\n", - "FIGUREFOLDER = FOLDER_EXPERIMENT / 'figures'\n", - "FIGUREFOLDER.mkdir(exist_ok=True, parents=True)\n", - "FIGUREFOLDER\n", - "\n", - "files_out = dict()" + "peptide_counter = PeptideCounter(config.FNAME_C_PEPTIDES)\n", + "peptide_counter.counter.most_common(N)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - 
"lines_to_next_cell": 0 - }, + "metadata": {}, "outputs": [], "source": [ - "if FN_INTENSITIES.suffix == '.pkl':\n", - " data = pd.read_pickle(FN_INTENSITIES)\n", - "elif FN_INTENSITIES.suffix == '.csv':\n", - " data = pd.read_csv(FN_INTENSITIES, index_col=INDEX_COL, nrows=N_FIRST_ROWS)\n", - "data" + "counts = analysis.df.count().sort_values(ascending=False)\n", + "counts.iloc[:N]" ] }, { @@ -194,47 +129,44 @@ "metadata": {}, "outputs": [], "source": [ - "if LONG_FORMAT:\n", - " data = data.squeeze().unstack()\n", - "if LOG_TRANSFORM:\n", - " data = np.log2(data).astype(float)\n", - "\n", - "\n", - "# drop entrily missing rows or columns\n", - "data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')\n", - "data" + "analysis.df[counts.iloc[:N].index]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correlation between peptides\n", + "- linear correlation as indicator that there is some variation which could be used by models (or other heuristics)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ - "if len(data.columns.names) > 1:\n", - " _levels_dropped = data.columns.names[1:]\n", - " data.columns = data.columns.droplevel(_levels_dropped)\n", - " logger.warning(\"Drop multiindex level, kepp only first. Dropped: \"\n", - " f\"{_levels_dropped}\")\n", - "# allows overwriting of index name, also to None\n", - "data.columns.name = COL_INDEX_NAME" + "sample = analysis.df.sample(n=30, axis=1)\n", + "# ToDo func is assigned to df\n", + "corr_lower_triangle = analyzers.corr_lower_triangle(sample)\n", + "corr_lower_triangle" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, + "id": "826033b0", "metadata": {}, + "outputs": [], "source": [ - "## Calculate cutoffs for visualization and stats" + "fig, axes = analyzers.plot_corr_histogram(corr_lower_triangle, bins=40)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- filtering based on many other samples?\n", - "- low feature completeness threshold in comparison to other approaches" + "### Samples" ] }, { @@ -243,12 +175,14 @@ "metadata": {}, "outputs": [], "source": [ - "min_samples_per_feat = int(len(data) * COMPLETENESS_OVER_SAMPLES)\n", - "print(f\"{min_samples_per_feat = }\")\n", - "mask = data.notna().sum(axis=0) > min_samples_per_feat\n", - "print(f\"drop = {(~mask).sum()} features\")\n", - "selected = data.loc[:, mask]\n", - "selected.shape" + "analysis.df.sample(30, axis=0).T.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Peptides (all)" ] }, { @@ -257,70 +191,74 @@ "metadata": {}, "outputs": [], "source": [ - "min_feat_per_sample = int(selected.shape[-1] * MIN_FEAT_PER_SAMPLE)\n", - "print(f\"{min_feat_per_sample = }\")\n", - "samples_selected = selected.notna().sum(axis=1) >= min_feat_per_sample\n", - "print(f\"drop = {(~samples_selected).sum()} samples\")\n", - "selected = selected[samples_selected]\n", - "selected.shape" + "stats = analysis.describe_peptides()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, + "id": "f0b01a33", "metadata": {}, + "outputs": [], "source": [ - "### Update records if cutoffs would be used" + "# biological coefficient of variation: standard deviation (variation) w.r.t mean\n", + "_ = stats.loc['CV'].hist(figsize=(10, 4))" ] }, { "cell_type": "code", "execution_count": null, + "id": "1e84e975", "metadata": {}, "outputs": [], "source": [ - "records = 
dict(inital=missing_data.get_record(data))\n", - "records" + "_ = stats.loc['count'].hist(figsize=(10, 4))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "id": "8dca5410", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "records.update(\n", - " dict(filtered=missing_data.get_record(selected)))\n", - "records.update({'params':\n", - " dict(MIN_FEAT_PER_SAMPLE=float(MIN_FEAT_PER_SAMPLE),\n", - " COMPLETENESS_OVER_SAMPLES=float(\n", - " COMPLETENESS_OVER_SAMPLES),\n", - " min_feat_per_sample=int(min_feat_per_sample),\n", - " min_samples_per_feat=int(min_samples_per_feat),)\n", - " })\n", - "records" + "INDEX_NAME = 'Sample ID'\n", + "analysis.df.index.name = INDEX_NAME" ] }, { "cell_type": "code", "execution_count": null, + "id": "0da26061", "metadata": { - "lines_to_next_cell": 2 + "Collapsed": "false" }, "outputs": [], "source": [ - "fname = FOLDER_EXPERIMENT / 'records.json'\n", - "files_out[fname.name] = fname\n", - "with open(fname, 'w') as f:\n", - " json.dump(records, f, indent=4)" + "analysis.df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60215da2", + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "N_MIN_OBS = analysis.df.shape[0] * 0.7 # here: present in 70% of the samples\n", + "mask_min_obsevation = analysis.df.notna().sum() >= N_MIN_OBS\n", + "mask_min_obsevation.sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Plot basic distribution present-absent pattern of features and samples\n", - "\n", - "### Line plots" + "Reference analysis.df as `X`" ] }, { @@ -329,59 +267,77 @@ "metadata": {}, "outputs": [], "source": [ - "fig = plotting.data.plot_missing_dist_highdim(data,\n", - " min_feat_per_sample=min_feat_per_sample,\n", - " min_samples_per_feat=min_samples_per_feat)\n", - "fname = FIGUREFOLDER / f'dist_all_lineplot_w_cutoffs.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" + "X = analysis.df" + ] + }, + { + "cell_type": "markdown", + "id": "c9f1411e", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "## Completeness of peptides" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "fig = plotting.data.plot_missing_dist_highdim(data)\n", - "fname = FIGUREFOLDER / f'dist_all_lineplot_wo_cutoffs.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" + "%time not_missing = vaep.data_handling.get_sorted_not_missing(X)\n", + "not_missing.iloc[:, -10:].describe()" ] }, { "cell_type": "code", "execution_count": null, - "id": "f891da5c", - "metadata": {}, + "id": "d83e6998", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "fig = plotting.data.plot_missing_pattern_histogram(data,\n", - " min_feat_per_sample=min_feat_per_sample,\n", - " min_samples_per_feat=min_samples_per_feat)\n", - "fname = FIGUREFOLDER / f'dist_all_histogram_w_cutoffs.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" + "sample_completeness = not_missing.sum(axis=1).sort_values() / X.shape[-1]\n", + "sample_completeness" ] }, { "cell_type": "code", "execution_count": null, - "id": "1f38e2d9", - "metadata": {}, + "id": "b70f867c", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "fig = plotting.data.plot_missing_pattern_histogram(data)\n", - "fname = FIGUREFOLDER / f'dist_all_histogram_wo_cutoffs.pdf'\n", - "files_out[fname.name] = fname\n", - 
"vaep.savefig(fig, name=fname)" + "N_MOST_COMMON_PEPTIDES = 300\n", + "data_to_visualize = not_missing.iloc[:, -N_MOST_COMMON_PEPTIDES:]\n", + "data_to_visualize = data_to_visualize.loc[sample_completeness.index]\n", + "print(f\"Look at missingness pattern of {N_MOST_COMMON_PEPTIDES} most common peptides across sample.\\n\"\n", + " f\"Data matrix dimension used for printing: { data_to_visualize.shape}\")\n", + "\n", + "\n", + "fig_heatmap_missing, axes_heatmap_missing = plt.subplots(\n", + " 1, 1, figsize=(12, 8))\n", + "USE_CBAR = False\n", + "\n", + "axes_heatmap_missing = sns.heatmap(data_to_visualize,\n", + " ax=axes_heatmap_missing,\n", + " cbar=USE_CBAR,\n", + " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Boxplots" + "White patches indicates that a peptide has been measured, black means it was not measured. Some samples (rows) have few of the most common peptides. This suggests to set a minimum of total peptides in a sample, which is common pratice.\n", + "\n", + "> An algorithm should work with the most common peptides and base it's inference capabilities after training on these." ] }, { @@ -390,371 +346,530 @@ "metadata": {}, "outputs": [], "source": [ - "fig = plotting.data.plot_missing_dist_boxplots(data)\n", - "fname = FIGUREFOLDER / f'dist_all_boxplots.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" + "data_to_visualize.sum(axis=1).nsmallest(20) # Samplest with the fewest measurements out of the seletion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# # This currently crashes if you want to have a pdf\n", + "datetime_now = datetime.now()\n", + "_savefig = partial(_savefig, folder=FIGUREFOLDER)\n", + "\n", + "_savefig(fig_heatmap_missing,\n", + " f'peptides_heatmap_missing_{datetime_now:%y%m%d}', pdf=False)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "source": [ - "### Violinplots" + "## Sample stats" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "fig = plotting.data.plot_missing_pattern_violinplot(\n", - " data, min_feat_per_sample, min_samples_per_feat)\n", - "fname = FIGUREFOLDER / f'dist_all_violin_plot.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" + "TYPE = 'peptides'\n", + "COL_NO_MISSING, COL_NO_IDENTIFIED = f'no_missing_{TYPE}', f'no_identified_{TYPE}'\n", + "COL_PROP_SAMPLES = 'prop_samples'\n", + "\n", + "\n", + "sample_stats = vaep.data_handling.compute_stats_missing(not_missing, COL_NO_MISSING, COL_NO_IDENTIFIED)\n", + "sample_stats" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": { - "lines_to_next_cell": 0 + "Collapsed": "false" }, + "outputs": [], "source": [ - "## Feature medians over prop. 
of missing of feature" + "fig_ident = sns.relplot(\n", + " x='SampleID_int', y=COL_NO_IDENTIFIED, data=sample_stats)\n", + "fig_ident.set_axis_labels('Sample ID', f'Frequency of identified {TYPE}')\n", + "fig_ident.fig.suptitle(f'Frequency of identified {TYPE} by sample id', y=1.03)\n", + "_savefig(fig_ident, f'identified_{TYPE}_by_sample', folder=FIGUREFOLDER)\n", + "\n", + "fig_ident_dist = sns.relplot(\n", + " x=COL_PROP_SAMPLES, y=COL_NO_IDENTIFIED, data=sample_stats)\n", + "fig_ident_dist.set_axis_labels(\n", + " 'Proportion of samples (sorted by frequency)', f'Frequency of identified {TYPE}')\n", + "fig_ident_dist.fig.suptitle(\n", + " f'Frequency of identified {TYPE} groups by sample id', y=1.03)\n", + "_savefig(fig_ident_dist, f'identified_{TYPE}_ordered', folder=FIGUREFOLDER)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "ax = plotting.data.plot_feat_median_over_prop_missing(\n", - " data=data, type='scatter', s=1)\n", - "fname = FIGUREFOLDER / 'intensity_median_vs_prop_missing_scatter'\n", - "files_out[fname.stem] = fname\n", - "vaep.savefig(ax.get_figure(), fname)" + "COL_NO_MISSING_PROP = COL_NO_MISSING + '_PROP'\n", + "sample_stats[COL_NO_MISSING_PROP] = sample_stats[COL_NO_MISSING] / \\\n", + " float(X.shape[1])\n", + "\n", + "# from ggplot import *\n", + "# ggplot(aes(x='nan_proc'), data = nonnan) + geom_histogram(binwidth = 0.005) #+ ylim(0,0.025)\n", + "sns.set(style=\"darkgrid\")\n", + "g = sns.relplot(x='prop_samples', y=COL_NO_MISSING_PROP, data=sample_stats)\n", + "plt.subplots_adjust(top=0.9)\n", + "g.set_axis_labels(\n", + " \"Proportion of samples (sorted by frequency)\", \"proportion missing\")\n", + "g.fig.suptitle(f'Proportion of missing {TYPE} ordered')\n", + "_savefig(g, \"proportion_proteins_missing\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "## Look at sequences\n", + "\n", + "Shows mainly that from a 6-7 AA on, peptides sequences are nearly unique.\n", + "\n", + "> Overlapping peptides (from the start or the end) could still be interesting to find" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2 + "Collapsed": "false" }, "outputs": [], "source": [ - "ax = plotting.data.plot_feat_median_over_prop_missing(\n", - " data=data, type='boxplot', s=.8)\n", - "fname = FIGUREFOLDER / 'intensity_median_vs_prop_missing_boxplot'\n", - "files_out[fname.stem] = fname\n", - "vaep.savefig(ax.get_figure(), fname)" + "class SequenceAnalyser():\n", + "\n", + " def __init__(self, sequences: pd.Series):\n", + " if not isinstance(sequences, pd.Series):\n", + " raise ValueError(\n", + " \"Please provide a pandas.Series, not {}\".format(type(sequences)))\n", + " self.sequences = sequences\n", + "\n", + " def calc_counts(self, n_characters):\n", + " return self.sequences.str[:n_characters].value_counts()\n", + "\n", + " def length(self):\n", + " return self.sequences.str.len().sort_values()" ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ - "## Correlation between peptides\n", - "- linear correlation as indicator that there is some variation which could be used by models (or other heuristics)" + "sequences = SequenceAnalyser(X.columns.to_series())\n", + "sequences.length()" ] }, { "cell_type": "code", "execution_count": null, + "id": "b7979950", "metadata": 
{ - "lines_to_next_cell": 2 + "Collapsed": "false" }, "outputs": [], "source": [ - "%%time\n", - "corr_lower_triangle = analyzers.corr_lower_triangle(data)\n", - "fig, axes = analyzers.plot_corr_histogram(corr_lower_triangle, bins=40)\n", - "fname = FIGUREFOLDER / f'corr_histogram_feat.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname)" + "_ = w.interact(sequences.calc_counts,\n", + " n_characters=w.IntSlider(value=4, min=1, max=55))" ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "id": "431b9221", + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ - "### Coefficient of variation (CV) of features" + "sequences_p4 = sequences.calc_counts(4)\n", + "display(sequences_p4.head())" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "id": "a00e631b", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "cv = data.std() / data.mean()\n", - "# biological coefficient of variation: standard deviation (variation) w.r.t mean\n", - "ax = cv.hist(bins=30)\n", - "fname = FIGUREFOLDER / f'CV_histogram_features.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(ax.get_figure(), name=fname)" + "sequences_p4.loc[sequences_p4.isin(('CON_', 'REV_'))].sort_index()" ] }, { "cell_type": "markdown", - "metadata": {}, + "id": "0bc4e272", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "What to do when\n", + "\n", + "\n", + "```\n", + "AAAAAAAAAAGAAGGRGSGPGR\n", + "AAAAAAAAAAGAAGGRGSGPGRR\n", + "\n", + "AAAANSGSSLPLFDCPTWAGKPPPGLHLDVVK\n", + "AAAANSGSSLPLFDCPTWAGKPPPGLHLDVVKGDK\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "821af58a", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "## Select Training Data" + ] + }, + { + "cell_type": "markdown", + "id": "e83f0238", + "metadata": { + "Collapsed": "false" + }, "source": [ - "## Clustermap and heatmaps of missing values" + "### Minumum required sample quality\n", + "First define the minum requirement of a sample to be kept in" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "id": "b2517983", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "# needs to deal with duplicates\n", - "# notna = data.notna().T.drop_duplicates().T\n", - "# get index and column names\n", - "cg = sns.clustermap(data.notna(), cbar_pos=None)\n", - "ax = cg.ax_heatmap\n", - "if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "fname = FIGUREFOLDER / 'clustermap_present_absent_pattern.png'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(cg.fig,\n", - " name=fname,\n", - " pdf=False)" + "range_peps = (0, max(sample_stats[COL_NO_IDENTIFIED]))\n", + "MIN_DEPTH_SAMPLE = int(range_peps[1] * 0.6)\n", + "w_min_depth_sample = w.IntSlider(\n", + " value=MIN_DEPTH_SAMPLE, min=0, max=range_peps[1])\n", + "print(f'Minimum {TYPE} per sample observed:')\n", + "w_min_depth_sample" ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "id": "d4b59bd2", + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ - "based on cluster, plot heatmaps of features and samples" + "mask_samples = sample_stats[COL_NO_IDENTIFIED] >= w_min_depth_sample.value\n", + "print(f\"Selected 
{mask_samples.sum()} samples\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "id": "e75668aa", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "assert (len(cg.dendrogram_row.reordered_ind), len(\n", - " cg.dendrogram_col.reordered_ind)) == data.shape" + "x_50 = coverage(X.loc[mask_samples], coverage_col=0.5, coverage_row=0.2)\n", + "# x_50_pca = log_z_zeroone_na(x_50) # there is a huge difference if NA is set to low value or mean!!\n", + "x_90 = coverage(X.loc[mask_samples], 0.9, 0.9)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "id": "1e3135a6", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "vaep.plotting.make_large_descriptors(5)\n", - "ax = sns.heatmap(\n", - " data.iloc[cg.dendrogram_row.reordered_ind,\n", - " cg.dendrogram_col.reordered_ind],\n", - ")\n", - "only_every_x_ticks(ax, x=2)\n", - "use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)\n", - "if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "fname = FIGUREFOLDER / 'heatmap_intensities_ordered_by_missing_pattern.png'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(ax.get_figure(), name=fname, pdf=False)\n", - "# ax.get_figure().savefig(fname, dpi=300)" + "x_50.shape, x_90.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5be5099", + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "x_90.sample()" ] }, { "cell_type": "markdown", + "id": "df2816dd", "metadata": {}, "source": [ - "### Heatmap of sample and feature correlation" + "Data selection should be done for each experiment, so it is not resaved here" ] }, { "cell_type": "code", "execution_count": null, + "id": "864cc51c", "metadata": {}, "outputs": [], "source": [ - "fig, ax = plt.subplots(figsize=(4, 4))\n", - "ax = sns.heatmap(\n", - " analyzers.corr_lower_triangle(\n", - " data.iloc[:, cg.dendrogram_col.reordered_ind]),\n", - " ax=ax,\n", - " square=True,\n", - ")\n", - "_ = only_every_x_ticks(ax, x=2)\n", - "_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)\n", - "if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "fname = FIGUREFOLDER / 'heatmap_feature_correlation.png'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname, pdf=False)" + "#from vaep.io.data_objects import get_fname\n", + "# fname = config.FOLDER_DATA / get_fname(*x_90.shape)\n", + "# print(fname)\n", + "# x_90.to_csv(fname)\n", + "# fname = config.FOLDER_DATA / get_fname(*x_50.shape)\n", + "# print(fname)\n", + "# x_50.to_csv(fname)" + ] + }, + { + "cell_type": "markdown", + "id": "3d8ea98b", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Distribution of Intensity values\n", + "- comparing non-transformed to $\\log_{10}$ transformed\n", + "- log transformation makes data more normal distributed\n", + "\n", + "> log10 or log2 or ln" + ] + }, + { + "cell_type": "markdown", + "id": "6a9f9f88", + "metadata": {}, + "source": [ + "#### Sample with all peptides" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "id": "cd813441", + "metadata": 
{ + "Collapsed": "false" + }, "outputs": [], "source": [ - "fig, ax = plt.subplots(figsize=(4, 4))\n", - "ax = sns.heatmap(\n", - " analyzers.corr_lower_triangle(\n", - " data.T.iloc[:, cg.dendrogram_row.reordered_ind]),\n", - " ax=ax,\n", - " square=True,\n", - ")\n", - "_ = only_every_x_ticks(ax, x=2)\n", - "_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "fname = FIGUREFOLDER / 'heatmap_sample_correlation.png'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig, name=fname, pdf=False)" + "sample = x_50.sample().iloc[0]\n", + "sample_id = sample.name\n", + "print(\"Sample ID:\", sample_id)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "id": "83097013", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "kwargs = dict()\n", - "if NO_TICK_LABELS_ON_HEATMAP:\n", - " kwargs['xticklabels'] = False\n", - " kwargs['yticklabels'] = False\n", - "cg = get_clustermap(data, **kwargs)\n", - "ax = cg.ax_heatmap\n", - "if PG_SEPARATOR is not None:\n", - " _new_labels = [l.get_text().split(PG_SEPARATOR)[0]\n", - " for l in ax.get_xticklabels()]\n", - " _ = ax.set_xticklabels(_new_labels)\n", - "_ = only_every_x_ticks(ax, x=2, axis=0)\n", - "_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)\n", "\n", - "fname = FIGUREFOLDER / 'clustermap_intensities_normalized.png'\n", - "files_out[fname.name] = fname\n", - "cg.fig.savefig(fname, dpi=300) # avoid tight_layout\n", - "# tight_layout makes the cbar a bit ugly\n", - "# vaep.savefig(cg.fig,\n", - "# name=fname,\n", - "# pdf=False)" + "sns.set(style=\"darkgrid\")\n", + "\n", + "\n", + "def plot_dist_comparison(\n", + " sample: pd.Series, figsize=(12, 5),\n", + " log=np.log, log_name=None,\n", + ") -> matplotlib.figure.Figure:\n", + " fig, axes = plt.subplots(1, 2, figsize=figsize)\n", + "\n", + " sns.histplot(sample, bins=100, ax=axes[0])\n", + " axes[0].set_title(\"Unnormalized distribution\")\n", + "\n", + " sample_log = log(sample)\n", + " sns.histplot(sample_log, bins=100, ax=axes[1])\n", + " if not log_name:\n", + " log_name = str(log).split(\"'\")[1]\n", + " axes[1].set_title(f\"{log_name} normalized distribution\")\n", + " sample_id = sample.name\n", + " _ = fig.suptitle(f\"Dynamic Range of measured intensities in sample {sample_id}\")\n", + " fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n", + " return fig\n", + "\n", + "\n", + "fig = plot_dist_comparison(sample)\n", + "_savefig(fig, f\"distribution_sample_peptides_{str(sample_id)}_ln\")" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, + "id": "cbcff3de", "metadata": {}, + "outputs": [], "source": [ - "## Sample stats" + "fig = plot_dist_comparison(sample, log=np.log2)\n", + "_savefig(fig, f\"distribution_peptides_sample_{str(sample_id)}_log2\")" ] }, { "cell_type": "code", "execution_count": null, + "id": "46fea6ba", "metadata": {}, "outputs": [], "source": [ - "TYPE = 'feat'\n", - "COL_NO_MISSING, COL_NO_IDENTIFIED = f'no_missing_{TYPE}', f'no_identified_{TYPE}'\n", - "COL_PROP_SAMPLES = 'prop_samples'\n", - "\n", - "sample_stats = vaep.data_handling.compute_stats_missing(\n", - " data.notna(), COL_NO_MISSING, COL_NO_IDENTIFIED)\n", - "sample_stats" + "sample_log_stats = np.log2(sample).describe().to_frame('log2')\n", + "sample_log_stats['ln'] = np.log(sample).describe()\n", + "sample_log_stats" ] }, { "cell_type": "code", "execution_count": null, + "id": "6e1f2979", "metadata": {}, "outputs": 
[], "source": [ - "fig_ident = sns.relplot(\n", - " x='SampleID_int', y=COL_NO_IDENTIFIED, data=sample_stats)\n", - "fig_ident.set_axis_labels('Sample ID', f'Frequency of identified {TYPE}')\n", - "fig_ident.fig.suptitle(f'Frequency of identified {TYPE} by sample id', y=1.03)\n", - "vaep.savefig(fig_ident, f'identified_{TYPE}_by_sample', folder=FIGUREFOLDER)\n", + "print(f\"Factor for log2 to ln: {1 / np.log2(np.e) = :.3f}\")\n", + "c = 1 / np.log2(np.e)" + ] + }, + { + "cell_type": "markdown", + "id": "1de60bb5", + "metadata": {}, + "source": [ + "If $ log2(x) \\sim \\mathcal{N}\\big(\\mu_{log2}, \\sigma_{log2}^2 \\big) $, then $ ln(x) \\sim \\mathcal{N}\\big(0.693 \\cdot \\mu_{log2}, 0.693^2 \\cdot \\sigma_{log2}^2 \\big) $.\n", "\n", - "fig_ident_dist = sns.relplot(\n", - " x=COL_PROP_SAMPLES, y=COL_NO_IDENTIFIED, data=sample_stats)\n", - "fig_ident_dist.set_axis_labels(\n", - " 'Proportion of samples (sorted by frequency)', f'Frequency of identified {TYPE}')\n", - "fig_ident_dist.fig.suptitle(\n", - " f'Frequency of identified {TYPE} groups by sample id', y=1.03)\n", - "fname = FIGUREFOLDER / f'identified_{TYPE}_ordered.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(fig_ident_dist, fname)" + "> Question: Is a wider or narrower distribtion important, or does only be \"normal\"" ] }, { "cell_type": "code", "execution_count": null, + "id": "a233ca42", "metadata": {}, "outputs": [], "source": [ - "COL_NO_MISSING_PROP = COL_NO_MISSING + '_PROP'\n", - "sample_stats[COL_NO_MISSING_PROP] = sample_stats[COL_NO_MISSING] / \\\n", - " float(data.shape[1])\n", - "sns.set(style=\"white\")\n", - "g = sns.relplot(x='prop_samples', y=COL_NO_MISSING_PROP, data=sample_stats)\n", - "plt.subplots_adjust(top=0.9)\n", - "plt.ylim(0, 1)\n", - "g.set_axis_labels(\n", - " \"Proportion of samples (sorted by frequency)\", \"proportion missing\")\n", - "g.fig.suptitle(f'Proportion of missing {TYPE} ordered')\n", - "\n", - "fname = FIGUREFOLDER / 'proportion_feat_missing.pdf'\n", - "files_out[fname.name] = fname\n", - "vaep.savefig(g, fname)" + "print(f\"mean: {sample_log_stats.loc['mean','log2'] * c = : .3f}\")\n", + "print(f\"std : {sample_log_stats.loc['std' ,'log2'] * c = : .3f}\")" ] }, { "cell_type": "markdown", + "id": "fb999f0b", "metadata": {}, "source": [ - "### Reference table intensities (log2)" + "#### One Peptide, all samples" ] }, { "cell_type": "code", "execution_count": null, + "id": "495cd93c", "metadata": { - "lines_to_next_cell": 2 + "Collapsed": "false" }, "outputs": [], "source": [ - "min_max = int(data.min().min()), int(data.max().max()) + 1\n", - "dynamic_range = None\n", - "if min_max[1] < 100:\n", - " dynamic_range = get_dynamic_range(min_max)\n", - "dynamic_range" + "sample = x_50.sample(axis=1).squeeze()\n", + "peptide = sample.name\n", + "\n", + "fig = plot_dist_comparison(sample)\n", + "_savefig(fig, f\"distribution_peptide_samples_{str(peptide)}_ln\")" + ] + }, + { + "cell_type": "markdown", + "id": "56898125", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Reference table intensities (natural logarithm)\n", + "\n", + "14 to 23 spans a dynamic range of 3 orders of base 10" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "id": "8b08e367", + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ - "files_out" + "dynamic_range = pd.DataFrame(range(14, 24), columns=['x'])\n", + "dynamic_range['$e^x$'] = dynamic_range.x.apply(np.exp)\n", + "dynamic_range.set_index('x', inplace=True)\n", + "dynamic_range.index.name = 
''\n", + "dynamic_range.T" ] + }, + { + "cell_type": "markdown", + "id": "6d57a983", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "## Next UP" + ] + }, + { + "cell_type": "markdown", + "id": "0bc1af1f", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "id": "9a6a80d7", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Find Protein of Peptides\n", + "- check with some reference list of peptides: This is created in `project\\FASTA_tryptic_digest.ipynb`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7abf7a7f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/project/00_6_hela_training_data_exploration.py b/project/00_6_hela_training_data_exploration.py index 913a81401..84cc616fc 100644 --- a/project/00_6_hela_training_data_exploration.py +++ b/project/00_6_hela_training_data_exploration.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: vaep # language: python @@ -21,9 +21,12 @@ from datetime import datetime from functools import partial from pathlib import Path +from random import sample +import ipywidgets as w import numpy as np import pandas as pd +import matplotlib import matplotlib.pyplot as plt # from sklearn import preprocessing @@ -37,6 +40,7 @@ import config from vaep.analyzers import analyzers from vaep.io.data_objects import PeptideCounter +from vaep.transform import log pd.options.display.max_columns = 100 pd.options.display.min_rows = 30 @@ -53,14 +57,15 @@ # ### Peptides # %% -FN_PEPTIDE_INTENSITIES = Path('data/dev_datasets/df_intensities_proteinGroups_long_2017_2018_2019_2020_N05015_M04547/Q_Exactive_HF_X_Orbitrap_Exactive_Series_slot_#6070.csv') +FN_PEPTIDE_INTENSITIES = Path( + 'data/dev_datasets/df_intensities_proteinGroups_long_2017_2018_2019_2020_N05015_M04547/Q_Exactive_HF_X_Orbitrap_Exactive_Series_slot_#6070.csv') FIGUREFOLDER = FN_PEPTIDE_INTENSITIES.parent / 'figures' / FN_PEPTIDE_INTENSITIES.stem FIGUREFOLDER.mkdir(exist_ok=True, parents=True) FIGUREFOLDER # %% -N_FIRST_ROWS = None # possibility to select N first rows -analysis = analyzers.AnalyzePeptides.from_csv(fname=FN_PEPTIDE_INTENSITIES, index_col=[0,1],nrows=N_FIRST_ROWS) +N_FIRST_ROWS = None # possibility to select N first rows +analysis = analyzers.AnalyzePeptides.from_csv(fname=FN_PEPTIDE_INTENSITIES, index_col=[0, 1], nrows=N_FIRST_ROWS) df = analysis.to_wide_format() analysis.describe_peptides(sample_n=30) @@ -108,10 +113,11 @@ stats = analysis.describe_peptides() # %% -_ = stats.loc['CV'].hist(figsize=(10, 4)) # biological coefficient of variation: standard deviation (variation) w.r.t mean +# biological coefficient of variation: standard deviation (variation) w.r.t mean +_ = stats.loc['CV'].hist(figsize=(10, 4)) # %% -_ = stats.loc['count'].hist(figsize=(10,4)) +_ = stats.loc['count'].hist(figsize=(10, 4)) # %% Collapsed="false" INDEX_NAME = 'Sample ID' @@ -121,7 +127,7 @@ analysis.df # %% Collapsed="false" -N_MIN_OBS = analysis.df.shape[0] * 0.7 # here: present in 70% of the samples +N_MIN_OBS = analysis.df.shape[0] * 0.7 # here: present in 70% of the samples mask_min_obsevation = analysis.df.notna().sum() >= N_MIN_OBS mask_min_obsevation.sum() @@ -156,16 +162,16 @@ axes_heatmap_missing = sns.heatmap(data_to_visualize, ax=axes_heatmap_missing, - cbar = USE_CBAR, - ) + cbar=USE_CBAR, + ) # %% [markdown] -# White patches indicates that a peptide has been measured, black means it 
was not measured. Some samples (rows) have few of the most common peptides. This suggests to set a minimum of total peptides in a sample, which is common pratice. +# White patches indicates that a peptide has been measured, black means it was not measured. Some samples (rows) have few of the most common peptides. This suggests to set a minimum of total peptides in a sample, which is common pratice. # # > An algorithm should work with the most common peptides and base it's inference capabilities after training on these. # %% -data_to_visualize.sum(axis=1).nsmallest(20) # Samplest with the fewest measurements out of the seletion +data_to_visualize.sum(axis=1).nsmallest(20) # Samplest with the fewest measurements out of the seletion # %% Collapsed="false" # # This currently crashes if you want to have a pdf @@ -184,7 +190,7 @@ COL_PROP_SAMPLES = 'prop_samples' -sample_stats = vaep.data_handling.compute_stats_missing(not_missing, COL_NO_MISSING, COL_NO_IDENTIFIED ) +sample_stats = vaep.data_handling.compute_stats_missing(not_missing, COL_NO_MISSING, COL_NO_IDENTIFIED) sample_stats # %% Collapsed="false" @@ -246,9 +252,8 @@ def length(self): sequences.length() # %% Collapsed="false" -import ipywidgets as w _ = w.interact(sequences.calc_counts, - n_characters=w.IntSlider(value=4, min=1, max=55)) + n_characters=w.IntSlider(value=4, min=1, max=55)) # %% Collapsed="false" sequences_p4 = sequences.calc_counts(4) @@ -258,7 +263,7 @@ def length(self): sequences_p4.loc[sequences_p4.isin(('CON_', 'REV_'))].sort_index() # %% [markdown] Collapsed="false" -# What to do when +# What to do when # # # ``` @@ -276,11 +281,10 @@ def length(self): # %% [markdown] Collapsed="false" # ### Minumum required sample quality -# First define the minum requirement of a sample to be kept in +# First define the minum requirement of a sample to be kept in # %% Collapsed="false" -import ipywidgets as w -range_peps = (0, max(sample_stats[COL_NO_IDENTIFIED])) +range_peps = (0, max(sample_stats[COL_NO_IDENTIFIED])) MIN_DEPTH_SAMPLE = int(range_peps[1] * 0.6) w_min_depth_sample = w.IntSlider( value=MIN_DEPTH_SAMPLE, min=0, max=range_peps[1]) @@ -326,11 +330,10 @@ def length(self): # %% Collapsed="false" sample = x_50.sample().iloc[0] -sample_id = sample.name +sample_id = sample.name print("Sample ID:", sample_id) # %% Collapsed="false" -import matplotlib sns.set(style="darkgrid") @@ -363,8 +366,8 @@ def plot_dist_comparison( _savefig(fig, f"distribution_peptides_sample_{str(sample_id)}_log2") # %% -sample_log_stats = np.log2(sample).describe().to_frame('log2') -sample_log_stats['ln'] = np.log (sample).describe() +sample_log_stats = np.log2(sample).describe().to_frame('log2') +sample_log_stats['ln'] = np.log(sample).describe() sample_log_stats # %% @@ -384,8 +387,6 @@ def plot_dist_comparison( # #### One Peptide, all samples # %% Collapsed="false" -from vaep.transform import log -from random import sample sample = x_50.sample(axis=1).squeeze() peptide = sample.name @@ -412,6 +413,6 @@ def plot_dist_comparison( # %% [markdown] Collapsed="false" # ### Find Protein of Peptides -# - check with some reference list of peptides: This is created in `project\FASTA_tryptic_digest.ipynb` +# - check with some reference list of peptides: This is created in `project\FASTA_tryptic_digest.ipynb` # %% diff --git a/project/01_0_transform_data_to_wide_format.ipynb b/project/01_0_transform_data_to_wide_format.ipynb index f8ccd842c..4d5266a1d 100644 --- a/project/01_0_transform_data_to_wide_format.ipynb +++ 
b/project/01_0_transform_data_to_wide_format.ipynb @@ -145,7 +145,7 @@ "source": [ "fname = params.data / 'sample_annotation_placeholder.csv'\n", "annotation.to_csv(fname)\n", - "fname " + "fname" ] }, { @@ -164,14 +164,15 @@ "cell_type": "code", "execution_count": null, "id": "ce749fdb", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "fname = params.data / 'data_wide_sample_cols.csv'\n", - "# fillna('Filtered') \n", + "# fillna('Filtered')\n", "train_data.T.to_csv(fname)\n", - "fname\n", - "\n" + "fname" ] }, { diff --git a/project/01_0_transform_data_to_wide_format.py b/project/01_0_transform_data_to_wide_format.py index 7fe6f7293..b23bf8154 100644 --- a/project/01_0_transform_data_to_wide_format.py +++ b/project/01_0_transform_data_to_wide_format.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -70,18 +70,17 @@ # %% fname = params.data / 'sample_annotation_placeholder.csv' annotation.to_csv(fname) -fname +fname # %% [markdo] # Save with samples in columns # %% fname = params.data / 'data_wide_sample_cols.csv' -# fillna('Filtered') +# fillna('Filtered') train_data.T.to_csv(fname) fname - # %% # 'data_wide_sample_cols.csv' diff --git a/project/01_1_train_CF.ipynb b/project/01_1_train_CF.ipynb index f67d3e710..d0fba2b54 100644 --- a/project/01_1_train_CF.ipynb +++ b/project/01_1_train_CF.ipynb @@ -15,7 +15,10 @@ "metadata": {}, "outputs": [], "source": [ + "\n", + "\n", "import logging\n", + "\n", "from pprint import pprint\n", "\n", "from fastai.basics import *\n", @@ -26,22 +29,23 @@ "from fastai.tabular.all import *\n", "from fastai.collab import *\n", "\n", - "# overwriting Recorder callback with custom plot_loss\n", - "from vaep.models import plot_loss, RecorderDump\n", - "from fastai import learner\n", - "learner.Recorder.plot_loss = plot_loss\n", - "# import fastai.callback.hook # Learner.summary\n", - "\n", - "\n", "import vaep\n", "import vaep.model\n", "import vaep.models as models\n", - "from vaep.io import datasplits\n", - "from vaep import sampling\n", - "\n", + "from vaep.models import plot_loss, RecorderDump\n", "\n", "import vaep.nb\n", + "from vaep import sampling\n", + "from vaep.io import datasplits\n", + "\n", "from vaep.logging import setup_logger\n", + "\n", + "# overwriting Recorder callback with custom plot_loss\n", + "from fastai import learner\n", + "learner.Recorder.plot_loss = plot_loss\n", + "# import fastai.callback.hook # Learner.summary\n", + "\n", + "\n", "logger = setup_logger(logger=logging.getLogger('vaep'))\n", "logger.info(\n", " \"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n", @@ -96,7 +100,7 @@ "# model\n", "# Dimensionality of encoding dimension (latent space of model)\n", "latent_dim: int = 10\n", - "# hidden_layers:str = '128_64' # A space separated string of layers, '50 20' for the encoder, reverse will be use for decoder\n", + "# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder\n", "sample_idx_position: int = 0 # position of index which is sample ID\n", "model: str = 'CF' # model name\n", "model_key: str = 'CF' # potentially alternative key for model (grid search)\n", @@ -201,6 +205,17 @@ "data.train_X.sample(5)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3311709", + "metadata": {}, + "outputs": [], + "source": [ + "! 
add check that specified data is available\n", + "# silent error in fastai if e.g. target column is not available" + ] + }, { "cell_type": "markdown", "id": "6045414b", @@ -344,7 +359,7 @@ " target_column='intensity',\n", " model_kwargs=dict(n_factors=args.latent_dim,\n", " y_range=(int(data.train_X.min()),\n", - " int(data.train_X.max())+1)\n", + " int(data.train_X.max()) + 1)\n", " ),\n", " batch_size=args.batch_size)" ] @@ -586,7 +601,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { diff --git a/project/01_1_train_CF.py b/project/01_1_train_CF.py index 642bcb02a..de18d7276 100644 --- a/project/01_1_train_CF.py +++ b/project/01_1_train_CF.py @@ -17,7 +17,10 @@ # # Collaborative Filtering # %% + + import logging + from pprint import pprint from fastai.basics import * @@ -28,22 +31,23 @@ from fastai.tabular.all import * from fastai.collab import * -# overwriting Recorder callback with custom plot_loss -from vaep.models import plot_loss, RecorderDump -from fastai import learner -learner.Recorder.plot_loss = plot_loss -# import fastai.callback.hook # Learner.summary - - import vaep import vaep.model import vaep.models as models -from vaep.io import datasplits -from vaep import sampling - +from vaep.models import plot_loss, RecorderDump import vaep.nb +from vaep import sampling +from vaep.io import datasplits + from vaep.logging import setup_logger + +# overwriting Recorder callback with custom plot_loss +from fastai import learner +learner.Recorder.plot_loss = plot_loss +# import fastai.callback.hook # Learner.summary + + logger = setup_logger(logger=logging.getLogger('vaep')) logger.info( "Experiment 03 - Analysis of latent spaces and performance comparisions") @@ -75,7 +79,7 @@ # model # Dimensionality of encoding dimension (latent space of model) latent_dim: int = 10 -# hidden_layers:str = '128_64' # A space separated string of layers, '50 20' for the encoder, reverse will be use for decoder +# hidden_layers:str = '128_64' # Underscore separated string of layers, '128 64' for the encoder, reversed for decoder sample_idx_position: int = 0 # position of index which is sample ID model: str = 'CF' # model name model_key: str = 'CF' # potentially alternative key for model (grid search) @@ -122,6 +126,10 @@ # %% data.train_X.sample(5) +# %% +# ! add check that specified data is available +# silent error in fastai if e.g. target column is not available + # %% [markdown] # Infer index names from long format @@ -192,7 +200,7 @@ target_column='intensity', model_kwargs=dict(n_factors=args.latent_dim, y_range=(int(data.train_X.min()), - int(data.train_X.max())+1) + int(data.train_X.max()) + 1) ), batch_size=args.batch_size) @@ -317,7 +325,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. 
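> Editor's note: the CF hunks above pass `n_factors=args.latent_dim` and a `y_range` clamped to `(int(min), int(max) + 1)` into the project's collab wrapper. As orientation only, here is a minimal sketch of the same idea directly in fastai; the column names, toy data, batch size and epoch count are illustrative assumptions, not values from this repository.

```python
import pandas as pd
from fastai.collab import CollabDataLoaders, collab_learner

# toy long-format intensities; column names are illustrative, not the project's
train_long = pd.DataFrame({
    "Sample ID": ["S1", "S1", "S2", "S2", "S3", "S3"],
    "peptide":   ["A", "B", "A", "C", "B", "C"],
    "intensity": [24.1, 30.2, 23.8, 27.5, 29.9, 27.1],
})

dls = CollabDataLoaders.from_df(
    train_long,
    user_name="Sample ID",    # "user" = sample
    item_name="peptide",      # "item" = feature
    rating_name="intensity",  # value to reconstruct
    bs=4,
)

# clamp predictions to the observed range, mirroring the hunk above
y_range = (int(train_long["intensity"].min()),
           int(train_long["intensity"].max()) + 1)
learn = collab_learner(dls, n_factors=10, y_range=y_range)
learn.fit_one_cycle(1)
```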
# %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') diff --git a/project/01_1_train_DAE.ipynb b/project/01_1_train_DAE.ipynb index 01f3397db..f8991e229 100644 --- a/project/01_1_train_DAE.ipynb +++ b/project/01_1_train_DAE.ipynb @@ -24,30 +24,31 @@ "from fastai.callback.all import *\n", "from fastai.torch_basics import *\n", "\n", + "import sklearn\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "import vaep\n", + "from vaep import sampling\n", + "from vaep.io import datasplits\n", + "from vaep.models import ae\n", + "import vaep.models as models\n", + "import vaep.model\n", + "from vaep.analyzers import analyzers\n", + "\n", "# overwriting Recorder callback with custom plot_loss\n", "from vaep.models import plot_loss\n", "from fastai import learner\n", + "\n", "learner.Recorder.plot_loss = plot_loss\n", "# import fastai.callback.hook # Learner.summary\n", "\n", - "import sklearn\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.preprocessing import MinMaxScaler\n", "\n", - "import vaep\n", - "from vaep.analyzers import analyzers\n", - "import vaep.model\n", - "import vaep.models as models\n", - "from vaep.models import ae\n", "# from vaep.models import collab as vaep_collab\n", "# from vaep.io.datasets import DatasetWithTarget\n", "# from vaep.transform import VaepPipeline\n", - "from vaep.io import datasplits\n", "# from vaep.io.dataloaders import get_dls, get_test_dl\n", - "from vaep import sampling\n", "\n", - "import vaep.nb as config\n", "logger = vaep.logging.setup_logger(logging.getLogger('vaep'))\n", "logger.info(\n", " \"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n", @@ -470,7 +471,9 @@ "id": "35704935-c739-48f5-9912-1c1ab1e6c4d3", "metadata": {}, "source": [ - "Adding a `EarlyStoppingCallback` results in an error. Potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in current version. Try again later" + "Adding a `EarlyStoppingCallback` results in an error. Potential fix in\n", + "[PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in\n", + "current version. Try again later" ] }, { @@ -776,7 +779,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." 
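> Editor's note: the DAE hunks reorder the `StandardScaler` and `SimpleImputer` imports. A common pattern for feeding a matrix with missing values to an autoencoder, sketched below on toy data, is to fill the gaps with a placeholder and standardize before training; the project's own preprocessing (e.g. the commented-out `vaep.transform` pipeline) may differ.

```python
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# wide matrix: rows = samples, columns = features, NaN = not measured
X = pd.DataFrame(np.random.rand(8, 5))
X.iloc[0, 1] = np.nan

# placeholder-impute (median) and standardize before the autoencoder;
# the network is then trained to reconstruct the observed entries
prep = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])
X_in = prep.fit_transform(X)

# after training, model outputs are mapped back to the intensity scale
X_back = prep.named_steps["scale"].inverse_transform(X_in)
```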
] }, { diff --git a/project/01_1_train_DAE.py b/project/01_1_train_DAE.py index f336a1d30..afe60065f 100644 --- a/project/01_1_train_DAE.py +++ b/project/01_1_train_DAE.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -24,30 +24,31 @@ from fastai.callback.all import * from fastai.torch_basics import * +import sklearn +from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer + +import vaep +from vaep import sampling +from vaep.io import datasplits +from vaep.models import ae +import vaep.models as models +import vaep.model +from vaep.analyzers import analyzers + # overwriting Recorder callback with custom plot_loss from vaep.models import plot_loss from fastai import learner + learner.Recorder.plot_loss = plot_loss # import fastai.callback.hook # Learner.summary -import sklearn -from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler -from sklearn.preprocessing import MinMaxScaler -import vaep -from vaep.analyzers import analyzers -import vaep.model -import vaep.models as models -from vaep.models import ae # from vaep.models import collab as vaep_collab # from vaep.io.datasets import DatasetWithTarget # from vaep.transform import VaepPipeline -from vaep.io import datasplits # from vaep.io.dataloaders import get_dls, get_test_dl -from vaep import sampling -import vaep.nb as config logger = vaep.logging.setup_logger(logging.getLogger('vaep')) logger.info( "Experiment 03 - Analysis of latent spaces and performance comparisions") @@ -257,7 +258,9 @@ analysis.learn.show_training_loop() # %% [markdown] -# Adding a `EarlyStoppingCallback` results in an error. Potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in current version. Try again later +# Adding a `EarlyStoppingCallback` results in an error. Potential fix in +# [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in +# current version. Try again later # %% # learn.summary() @@ -393,7 +396,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. 
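> Editor's note: independent of the exact architecture, a denoising autoencoder trained on data with missing values typically scores the reconstruction only on entries that were actually observed (or deliberately masked). The PyTorch fragment below is a generic illustration of that masked loss, not the implementation in `vaep.models.ae`.

```python
import torch
from torch import nn


def masked_mse(recon, target, mask):
    """MSE over observed (mask == True) entries only."""
    diff = (recon - target) ** 2
    return diff[mask].mean()


# toy batch: 4 samples, 6 features, roughly 30% missing
target = torch.randn(4, 6)
mask = torch.rand(4, 6) > 0.3                     # True where a value was measured
inputs = torch.where(mask, target, torch.zeros_like(target))

model = nn.Sequential(nn.Linear(6, 3), nn.ReLU(), nn.Linear(3, 6))
loss = masked_mse(model(inputs), target, mask)
loss.backward()
```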
# %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') diff --git a/project/01_1_train_Median.ipynb b/project/01_1_train_Median.ipynb index ab7e1965c..3c2933e5d 100644 --- a/project/01_1_train_Median.ipynb +++ b/project/01_1_train_Median.ipynb @@ -66,17 +66,17 @@ "outputs": [], "source": [ "# files and folders\n", - "folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment\n", - "file_format: str = 'csv' # file format of create splits, default pickle (pkl)\n", - "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow\n", + "folder_experiment: str = 'runs/example' # Datasplit folder with data for experiment\n", + "file_format: str = 'csv' # file format of create splits, default pickle (pkl)\n", + "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Metadata for samples\n", "# model\n", - "sample_idx_position: int = 0 # position of index which is sample ID\n", - "model_key: str = 'Median' # model key (lower cased version will be used for file names)\n", - "model: str = 'Median' # model name\n", - "save_pred_real_na: bool = True # Save all predictions for real na\n", + "sample_idx_position: int = 0 # position of index which is sample ID\n", + "model_key: str = 'Median' # model key (lower cased version will be used for file names)\n", + "model: str = 'Median' # model name\n", + "save_pred_real_na: bool = True # Save all predictions for real na\n", "# metadata -> defaults for metadata extracted from machine data\n", - "meta_date_col: str = None # date column in meta data\n", - "meta_cat_col: str = None # category column in meta data" + "meta_date_col: str = None # date column in meta data\n", + "meta_cat_col: str = None # category column in meta data" ] }, { @@ -146,7 +146,7 @@ "metadata": {}, "outputs": [], "source": [ - "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) " + "data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)" ] }, { @@ -172,7 +172,7 @@ "id": "fa7dcd09", "metadata": {}, "source": [ - "Infer index names from long format " + "Infer index names from long format" ] }, { @@ -184,7 +184,7 @@ "source": [ "index_columns = list(data.train_X.index.names)\n", "sample_id = index_columns.pop(args.sample_idx_position)\n", - "if len(index_columns) == 1: \n", + "if len(index_columns) == 1:\n", " index_column = index_columns.pop()\n", " index_columns = None\n", " logger.info(f\"{sample_id = }, single feature: {index_column = }\")\n", @@ -241,7 +241,7 @@ "outputs": [], "source": [ "freq_feat = vaep.io.datasplits.load_freq(args.data)\n", - "freq_feat.head() # training data" + "freq_feat.head() # training data" ] }, { @@ -257,7 +257,7 @@ "id": "23ac9141", "metadata": {}, "source": [ - "The validation fake NA is used to by all models to evaluate training performance. " + "The validation fake NA is used to by all models to evaluate training performance." 
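> Editor's note: the cell above states that the validation fake NAs are used by every model to track training performance. In the notebooks this is collected via `d_metrics.add_metrics(...)`; the plain-pandas sketch below shows what such a comparison boils down to, with illustrative numbers and model columns.

```python
import pandas as pd

# predictions table as built in the notebooks: one row per fake NA,
# an 'observed' column plus one column per model (values are made up)
val_pred_fake_na = pd.DataFrame({
    "observed": [25.1, 27.3, 24.8],
    "Median":   [25.0, 26.0, 25.5],
    "CF":       [25.2, 27.0, 24.9],
})

errors = val_pred_fake_na.drop(columns="observed").sub(
    val_pred_fake_na["observed"], axis=0)
metrics = pd.DataFrame({"MAE": errors.abs().mean(),
                        "MSE": (errors ** 2).mean()})
print(metrics)
```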
] }, { @@ -275,12 +275,13 @@ "cell_type": "code", "execution_count": null, "id": "68ea1649", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "test_pred_fake_na = data.test_y.to_frame(name='observed')\n", - "test_pred_fake_na.describe()\n", - "\n" + "test_pred_fake_na.describe()" ] }, { @@ -322,7 +323,7 @@ "metadata": {}, "outputs": [], "source": [ - "# interpolated = vaep.pandas.interpolate(wide_df = data.train_X) \n", + "# interpolated = vaep.pandas.interpolate(wide_df = data.train_X)\n", "# val_pred_fake_na['interpolated'] = interpolated\n", "# test_pred_fake_na['interpolated'] = interpolated\n", "# del interpolated\n", @@ -414,7 +415,7 @@ "metadata": {}, "outputs": [], "source": [ - "feat_freq_val.value_counts().sort_index().head() # require more than one feat?" + "feat_freq_val.value_counts().sort_index().head() # require more than one feat?" ] }, { @@ -429,9 +430,11 @@ "errors_val = errors_val.join(freq_feat).sort_values(by='freq', ascending=True)\n", "\n", "\n", - "errors_val_smoothed = errors_val.copy() #.loc[feat_freq_val > 1]\n", - "errors_val_smoothed[errors_val.columns[:-1]] = errors_val[errors_val.columns[:-1]].rolling(window=200, min_periods=1).mean()\n", - "ax = errors_val_smoothed.plot(x='freq', figsize=(15,10) )\n", + "errors_val_smoothed = errors_val.copy() # .loc[feat_freq_val > 1]\n", + "errors_val_smoothed[errors_val.columns[:-\n", + " 1]] = errors_val[errors_val.columns[:-\n", + " 1]].rolling(window=200, min_periods=1).mean()\n", + "ax = errors_val_smoothed.plot(x='freq', figsize=(15, 10))\n", "# errors_val_smoothed" ] }, @@ -463,8 +466,8 @@ "source": [ "## Comparisons\n", "\n", - "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) \n", - "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) \n", + "> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE)\n", + "> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n", "> Could be changed." ] }, @@ -477,7 +480,7 @@ "\n", "- all measured (identified, observed) peptides in validation data\n", "\n", - "> Does not make too much sense to compare collab and AEs, \n", + "> Does not make too much sense to compare collab and AEs,\n", "> as the setup differs of training and validation data differs" ] }, @@ -518,7 +521,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { @@ -597,7 +602,7 @@ "source": [ "# val\n", "fname = args.out_preds / f\"pred_val_{args.model_key}.csv\"\n", - "setattr(args, fname.stem, fname.as_posix()) # add [] assignment?\n", + "setattr(args, fname.stem, fname.as_posix()) # add [] assignment?\n", "val_pred_fake_na.to_csv(fname)\n", "# test\n", "fname = args.out_preds / f\"pred_test_{args.model_key}.csv\"\n", @@ -620,7 +625,7 @@ "metadata": {}, "outputs": [], "source": [ - "figures # switch to fnames?" + "figures # switch to fnames?" 
] }, { @@ -630,7 +635,7 @@ "metadata": {}, "outputs": [], "source": [ - "args.dump(fname=args.out_models/ f\"model_config_{args.model_key}.yaml\")\n", + "args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n", "args" ] } diff --git a/project/01_1_train_Median.py b/project/01_1_train_Median.py index 33d169b61..72a7cf562 100644 --- a/project/01_1_train_Median.py +++ b/project/01_1_train_Median.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -43,17 +43,17 @@ # %% tags=["parameters"] # files and folders -folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment -file_format: str = 'csv' # file format of create splits, default pickle (pkl) -fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow +folder_experiment: str = 'runs/example' # Datasplit folder with data for experiment +file_format: str = 'csv' # file format of create splits, default pickle (pkl) +fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Metadata for samples # model -sample_idx_position: int = 0 # position of index which is sample ID -model_key: str = 'Median' # model key (lower cased version will be used for file names) -model: str = 'Median' # model name -save_pred_real_na: bool = True # Save all predictions for real na +sample_idx_position: int = 0 # position of index which is sample ID +model_key: str = 'Median' # model key (lower cased version will be used for file names) +model: str = 'Median' # model name +save_pred_real_na: bool = True # Save all predictions for real na # metadata -> defaults for metadata extracted from machine data -meta_date_col: str = None # date column in meta data -meta_cat_col: str = None # category column in meta data +meta_date_col: str = None # date column in meta data +meta_cat_col: str = None # category column in meta data # %% [markdown] @@ -79,7 +79,7 @@ # ## Load data in long format # %% -data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) +data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) # %% [markdown] # data is loaded in long format @@ -88,12 +88,12 @@ data.train_X.sample(5) # %% [markdown] -# Infer index names from long format +# Infer index names from long format # %% index_columns = list(data.train_X.index.names) sample_id = index_columns.pop(args.sample_idx_position) -if len(index_columns) == 1: +if len(index_columns) == 1: index_column = index_columns.pop() index_columns = None logger.info(f"{sample_id = }, single feature: {index_column = }") @@ -126,13 +126,13 @@ # %% freq_feat = vaep.io.datasplits.load_freq(args.data) -freq_feat.head() # training data +freq_feat.head() # training data # %% [markdown] # ### Produce some addional fake samples # %% [markdown] -# The validation fake NA is used to by all models to evaluate training performance. +# The validation fake NA is used to by all models to evaluate training performance. 
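> Editor's note: for the Median baseline the "model" is just one median per feature, learned on the training split and broadcast to every held-out coordinate. A minimal sketch with made-up sample and feature names:

```python
import pandas as pd

# wide training data: rows = samples, columns = features (illustrative)
train_X = pd.DataFrame({"pep_A": [24.0, 25.0, None],
                        "pep_B": [30.0, None, 31.0]})

# the "model": one median per feature, learned on the training split
feature_medians = train_X.median()

# fake NAs indexed by (sample, feature) in long format
val_y = pd.Series({("S1", "pep_A"): 24.6, ("S2", "pep_B"): 30.4},
                  name="observed")
pred = val_y.index.get_level_values(-1).map(feature_medians)
val_pred = val_y.to_frame().assign(Median=pred.to_numpy())
print(val_pred)
```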
# %% val_pred_fake_na = data.val_y.to_frame(name='observed') @@ -143,7 +143,6 @@ test_pred_fake_na.describe() - # %% [markdown] # ## Data in wide format # @@ -159,7 +158,7 @@ # ### Add interpolation performance # %% -# interpolated = vaep.pandas.interpolate(wide_df = data.train_X) +# interpolated = vaep.pandas.interpolate(wide_df = data.train_X) # val_pred_fake_na['interpolated'] = interpolated # test_pred_fake_na['interpolated'] = interpolated # del interpolated @@ -207,7 +206,7 @@ # freq_feat.to_frame('overall').join(feat_freq_val).plot.scatter(x='overall', y='freq_val') # %% -feat_freq_val.value_counts().sort_index().head() # require more than one feat? +feat_freq_val.value_counts().sort_index().head() # require more than one feat? # %% errors_val = val_pred_fake_na.drop('observed', axis=1).sub(val_pred_fake_na['observed'], axis=0) @@ -215,9 +214,11 @@ errors_val = errors_val.join(freq_feat).sort_values(by='freq', ascending=True) -errors_val_smoothed = errors_val.copy() #.loc[feat_freq_val > 1] -errors_val_smoothed[errors_val.columns[:-1]] = errors_val[errors_val.columns[:-1]].rolling(window=200, min_periods=1).mean() -ax = errors_val_smoothed.plot(x='freq', figsize=(15,10) ) +errors_val_smoothed = errors_val.copy() # .loc[feat_freq_val > 1] +errors_val_smoothed[errors_val.columns[:- + 1]] = errors_val[errors_val.columns[:- + 1]].rolling(window=200, min_periods=1).mean() +ax = errors_val_smoothed.plot(x='freq', figsize=(15, 10)) # errors_val_smoothed # %% @@ -230,8 +231,8 @@ # %% [markdown] # ## Comparisons # -# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) -# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) +# > Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) +# > The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) # > Could be changed. # %% [markdown] @@ -239,7 +240,7 @@ # # - all measured (identified, observed) peptides in validation data # -# > Does not make too much sense to compare collab and AEs, +# > Does not make too much sense to compare collab and AEs, # > as the setup differs of training and validation data differs # %% @@ -256,7 +257,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. # %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') @@ -285,7 +288,7 @@ # %% # val fname = args.out_preds / f"pred_val_{args.model_key}.csv" -setattr(args, fname.stem, fname.as_posix()) # add [] assignment? +setattr(args, fname.stem, fname.as_posix()) # add [] assignment? val_pred_fake_na.to_csv(fname) # test fname = args.out_preds / f"pred_test_{args.model_key}.csv" @@ -296,8 +299,8 @@ # ## Config # %% -figures # switch to fnames? +figures # switch to fnames? 
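> Editor's note: the error-by-frequency cell in the hunk above sorts per-prediction errors by how often each feature was observed and smooths the model columns with a rolling mean (window 200). The autoformatted column-slicing line is hard to read; the sketch below shows the same idea on synthetic data with the column selection pulled out into a variable.

```python
import numpy as np
import pandas as pd

# per-prediction absolute errors plus how often each feature was observed
# (column and variable names are illustrative)
errors_val = pd.DataFrame({
    "Median": np.random.rand(500),
    "CF": np.random.rand(500),
    "freq": np.random.randint(10, 450, size=500),
})

# sort by feature frequency, then smooth the error columns with a
# rolling mean so the trend over frequency becomes visible
smoothed = errors_val.sort_values("freq").copy()
model_cols = smoothed.columns.drop("freq")
smoothed[model_cols] = smoothed[model_cols].rolling(window=200, min_periods=1).mean()
ax = smoothed.plot(x="freq", figsize=(15, 10))
```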
# %% -args.dump(fname=args.out_models/ f"model_config_{args.model_key}.yaml") +args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml") args diff --git a/project/01_1_train_RSN.ipynb b/project/01_1_train_RSN.ipynb index cee641bd3..95b208ebf 100644 --- a/project/01_1_train_RSN.ipynb +++ b/project/01_1_train_RSN.ipynb @@ -75,8 +75,8 @@ "# model\n", "sample_idx_position: int = 0 # position of index which is sample ID\n", "# model key (lower cased version will be used for file names)\n", - "axis: int = 1 # impute per row/sample (1) or per column/feat (0). \n", - "completeness = 0.6 # fractio of non missing values for row/sample (axis=0) or column/feat (axis=1)\n", + "axis: int = 1 # impute per row/sample (1) or per column/feat (0).\n", + "completeness = 0.6 # fractio of non missing values for row/sample (axis=0) or column/feat (axis=1)\n", "model_key: str = 'RSN'\n", "model: str = 'RSN' # model name\n", "save_pred_real_na: bool = True # Save all predictions for real na\n", @@ -458,7 +458,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." ] }, { diff --git a/project/01_1_train_RSN.py b/project/01_1_train_RSN.py index 578b2be15..73643f02a 100644 --- a/project/01_1_train_RSN.py +++ b/project/01_1_train_RSN.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -52,8 +52,8 @@ # model sample_idx_position: int = 0 # position of index which is sample ID # model key (lower cased version will be used for file names) -axis: int = 1 # impute per row/sample (1) or per column/feat (0). -completeness = 0.6 # fractio of non missing values for row/sample (axis=0) or column/feat (axis=1) +axis: int = 1 # impute per row/sample (1) or per column/feat (0). +completeness = 0.6 # fractio of non missing values for row/sample (axis=0) or column/feat (axis=1) model_key: str = 'RSN' model: str = 'RSN' # model name save_pred_real_na: bool = True # Save all predictions for real na @@ -224,7 +224,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. 
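> Editor's note: the RSN parameters above (`axis`, `completeness`) point to imputation by drawing from a random shifted normal distribution per sample or per feature. The sketch below illustrates the per-sample (axis=1) variant; the downshift of 1.8 standard deviations and width of 0.3 are common defaults for this kind of imputation and are assumptions here, not values taken from the repository.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# wide matrix: rows = samples, columns = features, NaN = missing
df = pd.DataFrame(rng.normal(25, 2, size=(5, 8))).mask(rng.random((5, 8)) < 0.3)

# per-sample (axis=1) statistics of the observed values
mean = df.mean(axis=1)
std = df.std(axis=1)

# draw replacements from a down-shifted, narrowed normal distribution
shift, width = 1.8, 0.3  # assumed illustrative defaults
imputed = df.copy()
for sample in df.index:
    missing = df.loc[sample].isna()
    imputed.loc[sample, missing] = rng.normal(
        loc=mean[sample] - shift * std[sample],
        scale=width * std[sample],
        size=missing.sum(),
    )
```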
# %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') diff --git a/project/01_1_train_VAE.ipynb b/project/01_1_train_VAE.ipynb index a100b8b74..d1346cc9c 100644 --- a/project/01_1_train_VAE.ipynb +++ b/project/01_1_train_VAE.ipynb @@ -17,6 +17,7 @@ }, "outputs": [], "source": [ + "\n", "import logging\n", "\n", "\n", @@ -27,24 +28,27 @@ "\n", "from torch.nn import Sigmoid\n", "\n", - "# overwriting Recorder callback with custom plot_loss\n", - "from vaep.models import plot_loss\n", - "from fastai import learner\n", - "learner.Recorder.plot_loss = plot_loss\n", - "\n", "import pandas as pd\n", + "\n", "import sklearn\n", - "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", "\n", "import vaep\n", - "from vaep.analyzers import analyzers\n", - "import vaep.model\n", - "import vaep.models as models\n", - "from vaep.models import ae\n", + "import vaep.nb\n", "from vaep.io import datasplits\n", + "from vaep.models import ae\n", + "import vaep.models as models\n", + "import vaep.model\n", + "from vaep.analyzers import analyzers\n", + "\n", + "\n", + "# overwriting Recorder callback with custom plot_loss\n", + "from vaep.models import plot_loss\n", + "from fastai import learner\n", + "learner.Recorder.plot_loss = plot_loss\n", + "\n", "\n", - "import vaep.nb\n", "logger = vaep.logging.setup_logger(logging.getLogger('vaep'))\n", "logger.info(\n", " \"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n", @@ -502,7 +506,9 @@ "id": "2231b67e", "metadata": {}, "source": [ - "Adding a `EarlyStoppingCallback` results in an error. Potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in current version. Try again later" + "Adding a `EarlyStoppingCallback` results in an error. Potential fix in\n", + "[PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in\n", + "current version. Try again later" ] }, { @@ -882,7 +888,9 @@ "source": [ "### Test Datasplit\n", "\n", - "Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction." + "Fake NAs : Artificially created NAs. Some data was sampled and set\n", + "explicitly to misssing before it was fed to the model for\n", + "reconstruction." 
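> Editor's note: the "Fake NAs" definition recurs in every model notebook: some observed values are sampled, hidden from the model, and later compared against the reconstruction. The project's split logic lives in `vaep.io.datasplits`; the sketch below only illustrates the principle on a toy matrix with a trivial placeholder prediction.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# wide matrix: rows = samples, columns = features, NaN = truly missing
df = pd.DataFrame(rng.normal(25, 2, size=(10, 6)))
df = df.mask(rng.random(df.shape) < 0.2)

observed = df.stack()                                 # long format of measured values
fake_na = observed.sample(frac=0.1, random_state=42)  # held-out "fake NAs"
train = observed.drop(fake_na.index)                  # what the model is allowed to see

# any model that can predict the held-out coordinates is scored like this
pred = pd.Series(train.mean(), index=fake_na.index)   # trivial placeholder prediction
mae = (fake_na - pred).abs().mean()
print(f"MAE on fake NAs: {mae:.3f}")
```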
] }, { diff --git a/project/01_1_train_VAE.py b/project/01_1_train_VAE.py index 4943c9d0f..6e5dac203 100644 --- a/project/01_1_train_VAE.py +++ b/project/01_1_train_VAE.py @@ -17,6 +17,7 @@ # # Variational Autoencoder # %% + import logging @@ -27,24 +28,27 @@ from torch.nn import Sigmoid -# overwriting Recorder callback with custom plot_loss -from vaep.models import plot_loss -from fastai import learner -learner.Recorder.plot_loss = plot_loss - import pandas as pd + import sklearn -from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer import vaep -from vaep.analyzers import analyzers -import vaep.model -import vaep.models as models -from vaep.models import ae +import vaep.nb from vaep.io import datasplits +from vaep.models import ae +import vaep.models as models +import vaep.model +from vaep.analyzers import analyzers + + +# overwriting Recorder callback with custom plot_loss +from vaep.models import plot_loss +from fastai import learner +learner.Recorder.plot_loss = plot_loss + -import vaep.nb logger = vaep.logging.setup_logger(logging.getLogger('vaep')) logger.info( "Experiment 03 - Analysis of latent spaces and performance comparisions") @@ -264,7 +268,9 @@ analysis.learn.show_training_loop() # %% [markdown] -# Adding a `EarlyStoppingCallback` results in an error. Potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in current version. Try again later +# Adding a `EarlyStoppingCallback` results in an error. Potential fix in +# [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in +# current version. Try again later # %% # learn.summary() @@ -438,7 +444,9 @@ # %% [markdown] # ### Test Datasplit # -# Fake NAs : Artificially created NAs. Some data was sampled and set explicitly to misssing before it was fed to the model for reconstruction. +# Fake NAs : Artificially created NAs. Some data was sampled and set +# explicitly to misssing before it was fed to the model for +# reconstruction. 
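> Editor's note: for the VAE the latent code is sampled rather than computed deterministically. The fragment below sketches the reparameterization trick and the corresponding KL term in plain PyTorch; the project's actual model in `vaep.models.ae` may differ in architecture and loss weighting.

```python
import torch
from torch import nn


class TinyVAEHead(nn.Module):
    """Maps an encoder output to a latent sample via the reparameterization trick."""

    def __init__(self, hidden, latent_dim):
        super().__init__()
        self.mu = nn.Linear(hidden, latent_dim)
        self.logvar = nn.Linear(hidden, latent_dim)

    def forward(self, h):
        mu, logvar = self.mu(h), self.logvar(h)
        eps = torch.randn_like(mu)
        z = mu + eps * torch.exp(0.5 * logvar)  # differentiable sample
        # KL divergence to a standard normal prior, summed over latent dims
        kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=-1)
        return z, kl


head = TinyVAEHead(hidden=64, latent_dim=10)
z, kl = head(torch.randn(4, 64))
```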
# %% added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na') diff --git a/project/01_1_transfer_NAGuideR_pred.py b/project/01_1_transfer_NAGuideR_pred.py index 3b85de686..18058df57 100644 --- a/project/01_1_transfer_NAGuideR_pred.py +++ b/project/01_1_transfer_NAGuideR_pred.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python diff --git a/project/02_1_aggregate_metrics.py.py b/project/02_1_aggregate_metrics.py.py index 3c7945550..ea11f334a 100644 --- a/project/02_1_aggregate_metrics.py.py +++ b/project/02_1_aggregate_metrics.py.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python diff --git a/project/02_1_join_metrics.py.ipynb b/project/02_1_join_metrics.py.ipynb index 387a722f6..561f06525 100644 --- a/project/02_1_join_metrics.py.ipynb +++ b/project/02_1_join_metrics.py.ipynb @@ -38,9 +38,7 @@ "cell_type": "code", "execution_count": 4, "id": "df472356", - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "filepath_out" @@ -52,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "## Example \n", + "## Example\n", "\n", "- first file" ] @@ -64,6 +62,8 @@ "metadata": {}, "outputs": [], "source": [ + "\n", + "\n", "def process(fpath: str) -> pd.DataFrame:\n", " df = pd.read_csv(fpath, index_col=POS_INDEX_COL, header=list(range(N_HEADER_COLS)))\n", " return df\n", diff --git a/project/02_1_join_metrics.py.py b/project/02_1_join_metrics.py.py index ec2aae537..8b395c187 100644 --- a/project/02_1_join_metrics.py.py +++ b/project/02_1_join_metrics.py.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -30,11 +30,13 @@ filepath_out # %% [markdown] -# ## Example +# ## Example # # - first file # %% + + def process(fpath: str) -> pd.DataFrame: df = pd.read_csv(fpath, index_col=POS_INDEX_COL, header=list(range(N_HEADER_COLS))) return df diff --git a/project/02_2_aggregate_configs.py.ipynb b/project/02_2_aggregate_configs.py.ipynb index 2d36b6b43..cdb9d77fb 100644 --- a/project/02_2_aggregate_configs.py.ipynb +++ b/project/02_2_aggregate_configs.py.ipynb @@ -20,10 +20,12 @@ "source": [ "from pathlib import Path\n", "import pandas as pd\n", - "pd.options.display.max_columns = 30 \n", "\n", - "from vaep.models.collect_dumps import collect_configs\n", "from vaep.logging import setup_nb_logger\n", + "from vaep.models.collect_dumps import collect_configs\n", + "\n", + "pd.options.display.max_columns = 30\n", + "\n", "logger = setup_nb_logger()" ] }, diff --git a/project/02_2_aggregate_configs.py.py b/project/02_2_aggregate_configs.py.py index f820ef03e..dc8ba3a3a 100644 --- a/project/02_2_aggregate_configs.py.py +++ b/project/02_2_aggregate_configs.py.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -20,10 +20,12 @@ # %% from pathlib import Path import pandas as pd -pd.options.display.max_columns = 30 -from vaep.models.collect_dumps import collect_configs from vaep.logging import setup_nb_logger +from vaep.models.collect_dumps import collect_configs + 
+pd.options.display.max_columns = 30 + logger = setup_nb_logger() # %% diff --git a/project/02_2_join_configs.py.ipynb b/project/02_2_join_configs.py.ipynb index 8b51442fb..4d1b871f5 100644 --- a/project/02_2_join_configs.py.ipynb +++ b/project/02_2_join_configs.py.ipynb @@ -103,7 +103,7 @@ ], "metadata": { "kernelspec": { - "display_name": "vaep", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/project/02_2_join_configs.py.py b/project/02_2_join_configs.py.py index 36a18ccc4..d8381e119 100644 --- a/project/02_2_join_configs.py.py +++ b/project/02_2_join_configs.py.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.0 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python diff --git a/project/02_3_grid_search_analysis.ipynb b/project/02_3_grid_search_analysis.ipynb index b47d75474..440b99b21 100644 --- a/project/02_3_grid_search_analysis.ipynb +++ b/project/02_3_grid_search_analysis.ipynb @@ -21,18 +21,18 @@ "import plotly.express as px\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", "\n", - "\n", + "import vaep.plotting.plotly as px_vaep\n", + "from vaep.analyzers import compare_predictions\n", + "from vaep import sampling\n", + "from vaep.io import datasplits\n", + "import vaep.utils\n", + "import vaep.pandas\n", + "import vaep.io\n", "import vaep.nb\n", "matplotlib.rcParams['figure.figsize'] = [12.0, 6.0]\n", "\n", - "import vaep.io\n", - "import vaep.pandas\n", - "import vaep.utils\n", - "from vaep.io import datasplits\n", - "from vaep import sampling\n", - "from vaep.analyzers import compare_predictions\n", - "import vaep.plotting.plotly as px_vaep\n", "\n", "pd.options.display.max_columns = 45\n", "pd.options.display.max_rows = 100\n", @@ -70,8 +70,8 @@ }, "outputs": [], "source": [ - "metrics_csv:str = \"path/to/all_metrics.csv\" # file path to metrics\n", - "configs_csv:str = \"path/to/all_configs.csv\" # file path to configs (\"meta data\")" + "metrics_csv: str = \"path/to/all_metrics.csv\" # file path to metrics\n", + "configs_csv: str = \"path/to/all_configs.csv\" # file path to configs (\"meta data\")" ] }, { @@ -170,7 +170,7 @@ "source": [ "# ToDo: integrate as parameters\n", "metric_columns = ['MSE', 'MAE']\n", - "model_keys = metrics.stack('model').index.levels[-1].unique().to_list() # not used\n", + "model_keys = metrics.stack('model').index.levels[-1].unique().to_list() # not used\n", "subset = metrics.columns.levels[0][0]\n", "print(f\"{subset = }\")" ] @@ -204,16 +204,16 @@ "meta['hidden_layers'] = (meta\n", " .loc[meta['hidden_layers'].notna(), 'hidden_layers']\n", " .apply(lambda x: tuple(eval(x)))\n", - ")\n", + " )\n", "meta['n_hidden_layers'] = (meta\n", " .loc[meta['hidden_layers'].notna(), 'hidden_layers']\n", " .apply(len)\n", - ")\n", + " )\n", "meta['n_hidden_layers'] = (meta\n", " ['n_hidden_layers']\n", " .fillna(0)\n", " .astype(int)\n", - ")\n", + " )\n", "meta.loc[meta['hidden_layers'].isna(), 'hidden_layers'] = None\n", "meta = meta.set_index('id')\n", "meta" @@ -225,7 +225,8 @@ "id": "b4dd468f-8995-403d-a389-6c4e4e912cd5", "metadata": {}, "source": [ - "Batch size for collab models depends on a factor (as the data in long format has roughly N samples * M features entries)." + "Batch size for collab models depends on a factor (as the data in long\n", + "format has roughly N samples * M features entries)." 
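> Editor's note: the cell above states that the batch size for collab models depends on a factor because the long format holds roughly N samples times M features entries. Assuming the factor simply scales a per-sample batch size by the number of features (the repository's exact convention is not shown in this hunk), the arithmetic looks like this:

```python
# assumed convention: batch_size is specified per sample (wide-format rows),
# while the collaborative-filtering model iterates over long-format rows
n_samples, n_features = 500, 4_000
batch_size = 64                                # per-sample batch size

long_entries = n_samples * n_features          # approximate rows in long format
batch_size_collab = batch_size * n_features    # same number of samples per step
print(long_entries, batch_size_collab)
```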
] }, { @@ -271,19 +272,19 @@ "source": [ "# ToDo: To make it cleaner: own config for each model (interpolated and median)\n", "metrics_styled = (metrics\n", - " .set_index(\n", - " pd.MultiIndex\n", - " .from_frame(\n", - " meta\n", - " .loc[metrics.index, ['latent_dim', 'hidden_layers', 'batch_size']]\n", - " # .loc[metrics.index]\n", - " )\n", - " )\n", - " .sort_index()\n", - " .stack('model')\n", - " .drop_duplicates()\n", - " .style.background_gradient(cmap)\n", - ")\n", + " .set_index(\n", + " pd.MultiIndex\n", + " .from_frame(\n", + " meta\n", + " .loc[metrics.index, ['latent_dim', 'hidden_layers', 'batch_size']]\n", + " # .loc[metrics.index]\n", + " )\n", + " )\n", + " .sort_index()\n", + " .stack('model')\n", + " .drop_duplicates()\n", + " .style.background_gradient(cmap)\n", + " )\n", "\n", "metrics = metrics_styled.data\n", "metrics_styled" @@ -354,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "metrics_long = pd.read_csv(path_metrics, index_col=[0], header=[0,1,2])\n", + "metrics_long = pd.read_csv(path_metrics, index_col=[0], header=[0, 1, 2])\n", "# columns_names = ['subset', 'data_split', 'model', 'metric_name']\n", "columns_names = list(metrics_long.columns.names)\n", "metrics_long.sample(5) if len(metrics_long) > 15 else metrics_long" @@ -395,13 +396,13 @@ "outputs": [], "source": [ "metrics_prop = (metrics_long\n", - " .loc[:, pd.IndexSlice[:, :, 'prop']]\n", - " .stack(['data_split', 'model'])\n", - " .reset_index()\n", - " .drop_duplicates()\n", - " .set_index(['id', 'data_split', 'model'])\n", - " .astype(int)\n", - " )\n", + " .loc[:, pd.IndexSlice[:, :, 'prop']]\n", + " .stack(['data_split', 'model'])\n", + " .reset_index()\n", + " .drop_duplicates()\n", + " .set_index(['id', 'data_split', 'model'])\n", + " .astype(int)\n", + " )\n", "metrics_prop" ] }, @@ -427,7 +428,7 @@ " .to_frame('metric_value')\n", " .reset_index('metric_name')\n", " .join(metrics_N)\n", - ")\n", + " )\n", "metrics_long" ] }, @@ -450,7 +451,7 @@ "metrics_long = (metrics_long\n", " .reset_index(['data_split'])\n", " .join(meta.set_index('model', append=True))\n", - " ).reset_index('model')\n", + " ).reset_index('model')\n", "# metrics_long.index.name = 'id'\n", "metrics_long.sample(5)" ] @@ -549,7 +550,7 @@ "id": "c4607c64-2e90-4ed6-b337-8e210d7c37de", "metadata": {}, "source": [ - "# Collection of Performance plots \n", + "# Collection of Performance plots\n", "\n", "- specify `labels_dict` for plotly plotting\n", "\n" @@ -607,7 +608,6 @@ }, "outputs": [], "source": [ - "import seaborn as sns\n", "plt.rcParams['figure.figsize'] = (8, 4)\n", "plt.rcParams['lines.linewidth'] = 2\n", "plt.rcParams['lines.markersize'] = 3\n", @@ -620,9 +620,9 @@ " x='n_params',\n", " y='metric_value',\n", " col=\"data_split\",\n", - " col_order = col_order,\n", + " col_order=col_order,\n", " row=\"metric_name\",\n", - " row_order = row_order,\n", + " row_order=row_order,\n", " hue=\"model\",\n", " # style=\"day\",\n", " palette=vaep.plotting.defaults.color_model_mapping,\n", @@ -633,11 +633,11 @@ "fg.fig.get_size_inches()\n", "\n", "(ax_00, ax_01), (ax_10, ax_11) = fg.axes\n", - "ax_00.set_ylabel('MAE')\n", - "ax_10.set_ylabel('MSE')\n", - "_ = ax_00.set_title('validation data')\n", - "_ = ax_01.set_title('test data')\n", - "ax_10.set_xlabel('number of parameters')\n", + "ax_00.set_ylabel(row_order[0])\n", + "ax_10.set_ylabel(row_order[1])\n", + "_ = ax_00.set_title('validation data') # col_order[0]\n", + "_ = ax_01.set_title('test data') # col_order[1]\n", + "ax_10.set_xlabel('number of parameters') # 
n_params\n", "ax_11.set_xlabel('number of parameters')\n", "ax_10.xaxis.set_major_formatter(\"{x:,.0f}\")\n", "ax_11.xaxis.set_major_formatter(\"{x:,.0f}\")\n", @@ -686,6 +686,7 @@ " yaxis={'title': {'standoff': 6}})\n", " return fig\n", "\n", + "\n", "dataset = \"test_fake_na\"\n", "fig = plot_by_params(dataset)\n", "fname = FOLDER / f\"hyperpar_{dataset}_results_by_parameters.pdf\"\n", @@ -730,8 +731,8 @@ "source": [ "group_by = ['data_split', 'latent_dim', 'metric_name', 'model']\n", "metrics_long_sel_min = metrics_long.reset_index(\n", - " ).groupby(by=group_by\n", - " ).apply(lambda df: df.sort_values(by='metric_value').iloc[0])\n", + ").groupby(by=group_by\n", + " ).apply(lambda df: df.sort_values(by='metric_value').iloc[0])\n", "metrics_long_sel_min" ] }, @@ -813,7 +814,7 @@ "source": [ "dataset = 'valid_fake_na'\n", "group_by = ['data_split', 'metric_name', 'model', 'latent_dim']\n", - "METRIC = 'MAE' # params.metric\n", + "METRIC = 'MAE' # params.metric\n", "selected = (metrics_long\n", " .reset_index()\n", " .groupby(by=group_by)\n", @@ -864,11 +865,11 @@ "outputs": [], "source": [ "min_latent = (selected\n", - " .loc[METRIC]\n", - " .loc[model_with_latent]\n", - " .groupby(level='latent_dim')\n", - " .agg({'metric_value': 'mean'})\n", - " .sort_values('metric_value')\n", + " .loc[METRIC]\n", + " .loc[model_with_latent]\n", + " .groupby(level='latent_dim')\n", + " .agg({'metric_value': 'mean'})\n", + " .sort_values('metric_value')\n", " )\n", "min_latent" ] @@ -1034,10 +1035,10 @@ " .value_counts()\n", " .sort_index()\n", " .plot(style='.',\n", - " xlabel='number of samples',\n", - " ylabel='observations')\n", - ")\n", - "vaep.savefig(ax.get_figure(), files_out[f'n_obs_error_counts_{dataset}.pdf'])" + " xlabel='number of samples',\n", + " ylabel='observations')\n", + " )\n", + "vaep.savefig(ax.get_figure(), files_out[f'n_obs_error_counts_{dataset}.pdf'])" ] }, { @@ -1117,8 +1118,8 @@ "msg_annotation = f\"(Latend dim: {min_latent}, No. 
of feat: {M_feat}, window_size: {window_size})\"\n", "print(msg_annotation)\n", "\n", - "files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'] = (FOLDER /\n", - " f'best_models_ld_{min_latent}_rolling_errors_by_freq')\n", + "files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'] = (\n", + " FOLDER / f'best_models_ld_{min_latent}_rolling_errors_by_freq')\n", "vaep.savefig(\n", " ax.get_figure(),\n", " name=files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'])" @@ -1165,8 +1166,8 @@ " )\n", "fig = px_vaep.apply_default_layout(fig)\n", "fig.update_layout(legend_title_text='') # remove legend title\n", - "files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'] = (FOLDER /\n", - " f'best_models_ld_{min_latent}_errors_by_freq_plotly.html')\n", + "files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'] = (\n", + " FOLDER / f'best_models_ld_{min_latent}_errors_by_freq_plotly.html')\n", "fig.write_html(\n", " files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'])\n", "fig" @@ -1205,8 +1206,8 @@ " # title='mean error for features averaged for each frequency'\n", " xlim=(FREQ_MIN, freq_feat.max())\n", ")\n", - "files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'] = (FOLDER /\n", - " f'best_models_ld_{min_latent}_errors_by_freq_averaged')\n", + "files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'] = (\n", + " FOLDER / f'best_models_ld_{min_latent}_errors_by_freq_averaged')\n", "vaep.savefig(\n", " ax.get_figure(),\n", " files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'])" diff --git a/project/02_3_grid_search_analysis.py b/project/02_3_grid_search_analysis.py index cd36ae19a..24c0ce374 100644 --- a/project/02_3_grid_search_analysis.py +++ b/project/02_3_grid_search_analysis.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -22,18 +22,18 @@ import plotly.express as px import matplotlib import matplotlib.pyplot as plt +import seaborn as sns - +import vaep.plotting.plotly as px_vaep +from vaep.analyzers import compare_predictions +from vaep import sampling +from vaep.io import datasplits +import vaep.utils +import vaep.pandas +import vaep.io import vaep.nb matplotlib.rcParams['figure.figsize'] = [12.0, 6.0] -import vaep.io -import vaep.pandas -import vaep.utils -from vaep.io import datasplits -from vaep import sampling -from vaep.analyzers import compare_predictions -import vaep.plotting.plotly as px_vaep pd.options.display.max_columns = 45 pd.options.display.max_rows = 100 @@ -48,8 +48,8 @@ # papermill parameters: # %% tags=["parameters"] -metrics_csv:str = "path/to/all_metrics.csv" # file path to metrics -configs_csv:str = "path/to/all_configs.csv" # file path to configs ("meta data") +metrics_csv: str = "path/to/all_metrics.csv" # file path to metrics +configs_csv: str = "path/to/all_configs.csv" # file path to configs ("meta data") # %% try: @@ -92,7 +92,7 @@ # %% # ToDo: integrate as parameters metric_columns = ['MSE', 'MAE'] -model_keys = metrics.stack('model').index.levels[-1].unique().to_list() # not used +model_keys = metrics.stack('model').index.levels[-1].unique().to_list() # not used subset = metrics.columns.levels[0][0] print(f"{subset = }") @@ -107,22 +107,23 @@ meta['hidden_layers'] = (meta .loc[meta['hidden_layers'].notna(), 'hidden_layers'] .apply(lambda x: tuple(eval(x))) -) + ) meta['n_hidden_layers'] = (meta 
.loc[meta['hidden_layers'].notna(), 'hidden_layers'] .apply(len) -) + ) meta['n_hidden_layers'] = (meta ['n_hidden_layers'] .fillna(0) .astype(int) -) + ) meta.loc[meta['hidden_layers'].isna(), 'hidden_layers'] = None meta = meta.set_index('id') meta # %% [markdown] -# Batch size for collab models depends on a factor (as the data in long format has roughly N samples * M features entries). +# Batch size for collab models depends on a factor (as the data in long +# format has roughly N samples * M features entries). # %% [markdown] # ## Colorcoded metrics @@ -141,19 +142,19 @@ # %% # ToDo: To make it cleaner: own config for each model (interpolated and median) metrics_styled = (metrics - .set_index( - pd.MultiIndex - .from_frame( - meta - .loc[metrics.index, ['latent_dim', 'hidden_layers', 'batch_size']] - # .loc[metrics.index] - ) - ) - .sort_index() - .stack('model') - .drop_duplicates() - .style.background_gradient(cmap) -) + .set_index( + pd.MultiIndex + .from_frame( + meta + .loc[metrics.index, ['latent_dim', 'hidden_layers', 'batch_size']] + # .loc[metrics.index] + ) + ) + .sort_index() + .stack('model') + .drop_duplicates() + .style.background_gradient(cmap) + ) metrics = metrics_styled.data metrics_styled @@ -189,7 +190,7 @@ # Rebuild metrics from dictionary # %% -metrics_long = pd.read_csv(path_metrics, index_col=[0], header=[0,1,2]) +metrics_long = pd.read_csv(path_metrics, index_col=[0], header=[0, 1, 2]) # columns_names = ['subset', 'data_split', 'model', 'metric_name'] columns_names = list(metrics_long.columns.names) metrics_long.sample(5) if len(metrics_long) > 15 else metrics_long @@ -210,13 +211,13 @@ # %% metrics_prop = (metrics_long - .loc[:, pd.IndexSlice[:, :, 'prop']] - .stack(['data_split', 'model']) - .reset_index() - .drop_duplicates() - .set_index(['id', 'data_split', 'model']) - .astype(int) - ) + .loc[:, pd.IndexSlice[:, :, 'prop']] + .stack(['data_split', 'model']) + .reset_index() + .drop_duplicates() + .set_index(['id', 'data_split', 'model']) + .astype(int) + ) metrics_prop # %% [markdown] @@ -229,7 +230,7 @@ .to_frame('metric_value') .reset_index('metric_name') .join(metrics_N) -) + ) metrics_long # %% [markdown] @@ -239,7 +240,7 @@ metrics_long = (metrics_long .reset_index(['data_split']) .join(meta.set_index('model', append=True)) - ).reset_index('model') + ).reset_index('model') # metrics_long.index.name = 'id' metrics_long.sample(5) @@ -286,7 +287,7 @@ logger.info(f"Saved metrics in long format: {fname}") # %% [markdown] -# # Collection of Performance plots +# # Collection of Performance plots # # - specify `labels_dict` for plotly plotting # @@ -316,7 +317,6 @@ hover_data['metric_value'] = ':.4f' # %% -import seaborn as sns plt.rcParams['figure.figsize'] = (8, 4) plt.rcParams['lines.linewidth'] = 2 plt.rcParams['lines.markersize'] = 3 @@ -329,9 +329,9 @@ x='n_params', y='metric_value', col="data_split", - col_order = col_order, + col_order=col_order, row="metric_name", - row_order = row_order, + row_order=row_order, hue="model", # style="day", palette=vaep.plotting.defaults.color_model_mapping, @@ -344,9 +344,9 @@ (ax_00, ax_01), (ax_10, ax_11) = fg.axes ax_00.set_ylabel(row_order[0]) ax_10.set_ylabel(row_order[1]) -_ = ax_00.set_title('validation data') # col_order[0] -_ = ax_01.set_title('test data') # col_order[1] -ax_10.set_xlabel('number of parameters') # n_params +_ = ax_00.set_title('validation data') # col_order[0] +_ = ax_01.set_title('test data') # col_order[1] +ax_10.set_xlabel('number of parameters') # n_params ax_11.set_xlabel('number of 
parameters') ax_10.xaxis.set_major_formatter("{x:,.0f}") ax_11.xaxis.set_major_formatter("{x:,.0f}") @@ -388,6 +388,7 @@ def plot_by_params(data_split: str = '', subset: str = ''): yaxis={'title': {'standoff': 6}}) return fig + dataset = "test_fake_na" fig = plot_by_params(dataset) fname = FOLDER / f"hyperpar_{dataset}_results_by_parameters.pdf" @@ -412,8 +413,8 @@ def plot_by_params(data_split: str = '', subset: str = ''): # %% group_by = ['data_split', 'latent_dim', 'metric_name', 'model'] metrics_long_sel_min = metrics_long.reset_index( - ).groupby(by=group_by - ).apply(lambda df: df.sort_values(by='metric_value').iloc[0]) +).groupby(by=group_by + ).apply(lambda df: df.sort_values(by='metric_value').iloc[0]) metrics_long_sel_min @@ -469,7 +470,7 @@ def get_plotly_figure(dataset: str, x='latent_dim'): # %% dataset = 'valid_fake_na' group_by = ['data_split', 'metric_name', 'model', 'latent_dim'] -METRIC = 'MAE' # params.metric +METRIC = 'MAE' # params.metric selected = (metrics_long .reset_index() .groupby(by=group_by) @@ -494,11 +495,11 @@ def get_plotly_figure(dataset: str, x='latent_dim'): # %% min_latent = (selected - .loc[METRIC] - .loc[model_with_latent] - .groupby(level='latent_dim') - .agg({'metric_value': 'mean'}) - .sort_values('metric_value') + .loc[METRIC] + .loc[model_with_latent] + .groupby(level='latent_dim') + .agg({'metric_value': 'mean'}) + .sort_values('metric_value') ) min_latent @@ -581,10 +582,10 @@ def get_plotly_figure(dataset: str, x='latent_dim'): .value_counts() .sort_index() .plot(style='.', - xlabel='number of samples', - ylabel='observations') -) -vaep.savefig(ax.get_figure(), files_out[f'n_obs_error_counts_{dataset}.pdf']) + xlabel='number of samples', + ylabel='observations') + ) +vaep.savefig(ax.get_figure(), files_out[f'n_obs_error_counts_{dataset}.pdf']) # %% ax = errors.plot.scatter('freq', 'n_obs') @@ -621,8 +622,8 @@ def get_plotly_figure(dataset: str, x='latent_dim'): msg_annotation = f"(Latend dim: {min_latent}, No. 
of feat: {M_feat}, window_size: {window_size})" print(msg_annotation) -files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'] = (FOLDER / - f'best_models_ld_{min_latent}_rolling_errors_by_freq') +files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq'] = ( + FOLDER / f'best_models_ld_{min_latent}_rolling_errors_by_freq') vaep.savefig( ax.get_figure(), name=files_out[f'best_models_ld_{min_latent}_rolling_errors_by_freq']) @@ -647,8 +648,8 @@ def get_plotly_figure(dataset: str, x='latent_dim'): ) fig = px_vaep.apply_default_layout(fig) fig.update_layout(legend_title_text='') # remove legend title -files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'] = (FOLDER / - f'best_models_ld_{min_latent}_errors_by_freq_plotly.html') +files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html'] = ( + FOLDER / f'best_models_ld_{min_latent}_errors_by_freq_plotly.html') fig.write_html( files_out[f'best_models_ld_{min_latent}_errors_by_freq_plotly.html']) fig @@ -673,8 +674,8 @@ def get_plotly_figure(dataset: str, x='latent_dim'): # title='mean error for features averaged for each frequency' xlim=(FREQ_MIN, freq_feat.max()) ) -files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'] = (FOLDER / - f'best_models_ld_{min_latent}_errors_by_freq_averaged') +files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged'] = ( + FOLDER / f'best_models_ld_{min_latent}_errors_by_freq_averaged') vaep.savefig( ax.get_figure(), files_out[f'best_models_ld_{min_latent}_errors_by_freq_averaged']) diff --git a/project/02_4_best_models_over_all_data.ipynb b/project/02_4_best_models_over_all_data.ipynb index 17c3f0dde..f252973d2 100644 --- a/project/02_4_best_models_over_all_data.ipynb +++ b/project/02_4_best_models_over_all_data.ipynb @@ -78,7 +78,7 @@ "source": [ "# snakemake.params.folder\n", "try:\n", - " models = snakemake.params.models # snakefile would need to be\n", + " models = snakemake.params.models # snakefile would need to be\n", "except AttributeError:\n", " models = ['Median', 'interpolated', 'CF', 'DAE', 'VAE']\n", "models" @@ -155,9 +155,9 @@ "outputs": [], "source": [ "_unique = metrics_long[\"data level\"].unique()\n", - "order_categories['data level'] = [l for l in order_categories['data level'] if l in _unique] #ensure predefined order\n", + "order_categories['data level'] = [l for l in order_categories['data level'] if l in _unique] # ensure predefined order\n", "_unique = metrics_long['model'].unique()\n", - "order_categories['model'] = [m for m in order_categories['model'] if m in _unique] #ensure predefined order\n", + "order_categories['model'] = [m for m in order_categories['model'] if m in _unique] # ensure predefined order\n", "\n", "semi_supervised = [m for m in ['CF', 'DAE', 'VAE'] if m in _unique]\n", "reference = [m for m in ['median', 'interpolated'] if m in _unique]\n", @@ -216,11 +216,11 @@ "source": [ "# select best model of top N with least parameters\n", "sel_on_val = (sel_on_val\n", - " .groupby(by=group_by)\n", - " .apply(\n", - " lambda df: df.sort_values(by='n_params').iloc[0]\n", - " )\n", - " ).loc[\n", + " .groupby(by=group_by)\n", + " .apply(\n", + " lambda df: df.sort_values(by='n_params').iloc[0]\n", + " )\n", + " ).loc[\n", " pd.IndexSlice[dataset, IDX_ORDER[0], 'MAE', IDX_ORDER[1]],\n", " selected_cols]\n", "sel_on_val.to_excel(writer, sheet_name=f'selected')\n", @@ -248,13 +248,13 @@ "idx = sel_on_val.droplevel(level='data_split').index\n", "sel_on_val = sel_on_val.reset_index(['latent_dim', 'hidden_layers', 'id'])\n", 
"\n", - "test_results = ( metrics_long\n", - " .query('data_split == \"test_fake_na\"')\n", - " .reset_index().set_index(idx.names)\n", - " .loc[idx]\n", - " .reset_index(['latent_dim', 'hidden_layers', 'id'])\n", - " .set_index('data_split', append=True)\n", - ")[selected_cols]\n", + "test_results = (metrics_long\n", + " .query('data_split == \"test_fake_na\"')\n", + " .reset_index().set_index(idx.names)\n", + " .loc[idx]\n", + " .reset_index(['latent_dim', 'hidden_layers', 'id'])\n", + " .set_index('data_split', append=True)\n", + " )[selected_cols]\n", "test_results" ] }, @@ -329,7 +329,7 @@ "metadata": {}, "outputs": [], "source": [ - "### Validation data results " + "### Validation data results" ] }, { @@ -344,7 +344,7 @@ "_to_plot = sel_on_val.reset_index(level=['data level', 'model']).loc[[('valid_fake_na', METRIC), ]]\n", "\n", "_to_plot = _to_plot.set_index(['data level', 'model'])[['metric_value', 'text']]\n", - "_to_plot = _to_plot.loc[IDX_ORDER,:]\n", + "_to_plot = _to_plot.loc[IDX_ORDER, :]\n", "_to_plot.index.name = ''\n", "# text = test_results['text'].unstack().loc[IDX_ORDER].unstack()\n", "_to_plot = _to_plot['metric_value'].unstack().loc[IDX_ORDER]\n", @@ -383,7 +383,8 @@ "fname = 'best_models_1_val_plotly'\n", "_to_plot = sel_on_val.reset_index(level=['data level', 'model']).loc[[('valid_fake_na', METRIC), ]]\n", "_to_plot = _to_plot.set_index(['data level', 'model'])\n", - "_to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']] = _to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']].fillna('-')\n", + "_to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']] = _to_plot[[\n", + " 'metric_value', 'latent_dim', 'hidden_layers', 'text']].fillna('-')\n", "\n", "_to_plot = _to_plot.loc[pd.IndexSlice[IDX_ORDER], :]\n", "_to_plot.to_csv(FOLDER / f\"{fname}.csv\")\n", @@ -401,7 +402,7 @@ "fig = px.bar(_to_plot.reset_index(),\n", " x='data level',\n", " y='metric_value',\n", - " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", + " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", " color='model',\n", " barmode=\"group\",\n", " text='text',\n", @@ -439,8 +440,8 @@ " f' & metric_name == \"{METRIC}\"')\n", "\n", "best_on_average = metrics_long_sel.reset_index(\n", - " ).groupby(by=group_by\n", - " )['metric_value'].mean().sort_values().reset_index(level=group_by[1:])\n", + ").groupby(by=group_by\n", + " )['metric_value'].mean().sort_values().reset_index(level=group_by[1:])\n", "best_on_average" ] }, @@ -477,21 +478,21 @@ "data_split = 'test_fake_na'\n", "\n", "metrics_long_sel_test = metrics_long.query(f'data_split == \"{data_split}\"'\n", - " f' & metric_name == \"{METRIC}\"')\n", + " f' & metric_name == \"{METRIC}\"')\n", "\n", "to_plot = (metrics_long_sel_test\n", - " .reset_index().set_index(group_by)\n", - " .loc[best_on_average.index]\n", - " .reset_index().set_index(['model', 'data level'])\n", - " .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :])\n", + " .reset_index().set_index(group_by)\n", + " .loc[best_on_average.index]\n", + " .reset_index().set_index(['model', 'data level'])\n", + " .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :])\n", "\n", "\n", "to_plot = to_plot.reset_index()\n", "to_plot['model annotated'] = to_plot['model'] + ' - ' + to_plot['text']\n", - "order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation\n", + "order_model = to_plot['model annotated'].drop_duplicates().to_list() # 
model name with annotation\n", "\n", "to_plot = to_plot.drop_duplicates(subset=['model', 'data level', 'metric_value'])\n", - "to_plot.to_csv(FOLDER /f\"{fname}.csv\")\n", + "to_plot.to_csv(FOLDER / f\"{fname}.csv\")\n", "to_plot" ] }, @@ -502,7 +503,7 @@ "metadata": {}, "outputs": [], "source": [ - "figsize= (10,8) # None # (10,8)\n", + "figsize = (10, 8) # None # (10,8)\n", "fig, ax = plt.subplots(figsize=figsize)\n", "to_plot.columns.name = ''\n", "ax = (to_plot\n", @@ -517,7 +518,19 @@ " width=.8,\n", " ax=ax,\n", " # colormap=\"Paired\",\n", - " color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']\n", + " color=[\n", + " '#a6cee3',\n", + " '#1f78b4',\n", + " '#b2df8a',\n", + " '#33a02c',\n", + " '#fb9a99',\n", + " '#e31a1c',\n", + " '#fdbf6f',\n", + " '#ff7f00',\n", + " '#cab2d6',\n", + " '#6a3d9a',\n", + " '#ffff99',\n", + " '#b15928']\n", " )\n", " )\n", "ax = vaep.plotting.add_height_to_barplot(ax, size=11)\n", @@ -546,7 +559,7 @@ " x='model',\n", " y='metric_value',\n", " color='data level',\n", - " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", + " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", " barmode=\"group\",\n", " color_discrete_sequence=px.colors.colorbrewer.Paired,\n", " # color_discrete_sequence=['#a6cee3', '#1f78b4', '#b2df8a'],\n", @@ -583,14 +596,14 @@ " .loc[best_on_average.index].reset_index()\n", " .set_index(['model', 'data level'])\n", " .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :]\n", - " )\n", + " )\n", "\n", "to_plot = to_plot.reset_index()\n", "to_plot['model annotated'] = to_plot['model'] + ' - ' + to_plot['text']\n", - "order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation\n", + "order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation\n", "\n", "to_plot = to_plot.drop_duplicates(subset=['model', 'data level', 'metric_value'])\n", - "to_plot.to_csv(FOLDER /f\"{fname}.csv\")\n", + "to_plot.to_csv(FOLDER / f\"{fname}.csv\")\n", "to_plot" ] }, @@ -601,7 +614,7 @@ "metadata": {}, "outputs": [], "source": [ - "figsize= (10,8) # None # (10,8)\n", + "figsize = (10, 8) # None # (10,8)\n", "fig, ax = plt.subplots(figsize=figsize)\n", "to_plot.columns.name = ''\n", "ax = (to_plot\n", @@ -616,7 +629,19 @@ " width=.8,\n", " ax=ax,\n", " # colormap=\"Paired\",\n", - " color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']\n", + " color=[\n", + " '#a6cee3',\n", + " '#1f78b4',\n", + " '#b2df8a',\n", + " '#33a02c',\n", + " '#fb9a99',\n", + " '#e31a1c',\n", + " '#fdbf6f',\n", + " '#ff7f00',\n", + " '#cab2d6',\n", + " '#6a3d9a',\n", + " '#ffff99',\n", + " '#b15928']\n", " )\n", " )\n", "ax = vaep.plotting.add_height_to_barplot(ax, size=11)\n", @@ -645,7 +670,7 @@ " x='model',\n", " y='metric_value',\n", " color='data level',\n", - " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", + " hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data\n", " barmode=\"group\",\n", " color_discrete_sequence=px.colors.colorbrewer.Paired,\n", " # color_discrete_sequence=['#a6cee3', '#1f78b4', '#b2df8a'],\n", diff --git a/project/02_4_best_models_over_all_data.py b/project/02_4_best_models_over_all_data.py index ffb4b941f..3aea8ecae 100644 --- a/project/02_4_best_models_over_all_data.py +++ b/project/02_4_best_models_over_all_data.py @@ -6,7 +6,7 @@ # 
extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -59,7 +59,7 @@ # %% # snakemake.params.folder try: - models = snakemake.params.models # snakefile would need to be + models = snakemake.params.models # snakefile would need to be except AttributeError: models = ['Median', 'interpolated', 'CF', 'DAE', 'VAE'] models @@ -96,9 +96,9 @@ # %% _unique = metrics_long["data level"].unique() -order_categories['data level'] = [l for l in order_categories['data level'] if l in _unique] #ensure predefined order +order_categories['data level'] = [l for l in order_categories['data level'] if l in _unique] # ensure predefined order _unique = metrics_long['model'].unique() -order_categories['model'] = [m for m in order_categories['model'] if m in _unique] #ensure predefined order +order_categories['model'] = [m for m in order_categories['model'] if m in _unique] # ensure predefined order semi_supervised = [m for m in ['CF', 'DAE', 'VAE'] if m in _unique] reference = [m for m in ['median', 'interpolated'] if m in _unique] @@ -137,11 +137,11 @@ # %% # select best model of top N with least parameters sel_on_val = (sel_on_val - .groupby(by=group_by) - .apply( - lambda df: df.sort_values(by='n_params').iloc[0] - ) - ).loc[ + .groupby(by=group_by) + .apply( + lambda df: df.sort_values(by='n_params').iloc[0] + ) + ).loc[ pd.IndexSlice[dataset, IDX_ORDER[0], 'MAE', IDX_ORDER[1]], selected_cols] sel_on_val.to_excel(writer, sheet_name=f'selected') @@ -156,13 +156,13 @@ idx = sel_on_val.droplevel(level='data_split').index sel_on_val = sel_on_val.reset_index(['latent_dim', 'hidden_layers', 'id']) -test_results = ( metrics_long - .query('data_split == "test_fake_na"') - .reset_index().set_index(idx.names) - .loc[idx] - .reset_index(['latent_dim', 'hidden_layers', 'id']) - .set_index('data_split', append=True) -)[selected_cols] +test_results = (metrics_long + .query('data_split == "test_fake_na"') + .reset_index().set_index(idx.names) + .loc[idx] + .reset_index(['latent_dim', 'hidden_layers', 'id']) + .set_index('data_split', append=True) + )[selected_cols] test_results # %% [markdown] @@ -197,7 +197,7 @@ vaep.savefig(fig, fname, folder=FOLDER) # %% [markdown] -# ### Validation data results +# ### Validation data results # %% fname = 'best_models_1_val_mpl' @@ -205,7 +205,7 @@ _to_plot = sel_on_val.reset_index(level=['data level', 'model']).loc[[('valid_fake_na', METRIC), ]] _to_plot = _to_plot.set_index(['data level', 'model'])[['metric_value', 'text']] -_to_plot = _to_plot.loc[IDX_ORDER,:] +_to_plot = _to_plot.loc[IDX_ORDER, :] _to_plot.index.name = '' # text = test_results['text'].unstack().loc[IDX_ORDER].unstack() _to_plot = _to_plot['metric_value'].unstack().loc[IDX_ORDER] @@ -230,7 +230,8 @@ fname = 'best_models_1_val_plotly' _to_plot = sel_on_val.reset_index(level=['data level', 'model']).loc[[('valid_fake_na', METRIC), ]] _to_plot = _to_plot.set_index(['data level', 'model']) -_to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']] = _to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']].fillna('-') +_to_plot[['metric_value', 'latent_dim', 'hidden_layers', 'text']] = _to_plot[[ + 'metric_value', 'latent_dim', 'hidden_layers', 'text']].fillna('-') _to_plot = _to_plot.loc[pd.IndexSlice[IDX_ORDER], :] _to_plot.to_csv(FOLDER / f"{fname}.csv") @@ -241,7 +242,7 @@ fig = px.bar(_to_plot.reset_index(), x='data level', y='metric_value', - hover_data={'N': ':,d', 'n_params': 
':,d'}, # format hover data + hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data color='model', barmode="group", text='text', @@ -266,8 +267,8 @@ f' & metric_name == "{METRIC}"') best_on_average = metrics_long_sel.reset_index( - ).groupby(by=group_by - )['metric_value'].mean().sort_values().reset_index(level=group_by[1:]) +).groupby(by=group_by + )['metric_value'].mean().sort_values().reset_index(level=group_by[1:]) best_on_average # %% @@ -284,25 +285,25 @@ data_split = 'test_fake_na' metrics_long_sel_test = metrics_long.query(f'data_split == "{data_split}"' - f' & metric_name == "{METRIC}"') + f' & metric_name == "{METRIC}"') to_plot = (metrics_long_sel_test - .reset_index().set_index(group_by) - .loc[best_on_average.index] - .reset_index().set_index(['model', 'data level']) - .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :]) + .reset_index().set_index(group_by) + .loc[best_on_average.index] + .reset_index().set_index(['model', 'data level']) + .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :]) to_plot = to_plot.reset_index() to_plot['model annotated'] = to_plot['model'] + ' - ' + to_plot['text'] -order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation +order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation to_plot = to_plot.drop_duplicates(subset=['model', 'data level', 'metric_value']) -to_plot.to_csv(FOLDER /f"{fname}.csv") +to_plot.to_csv(FOLDER / f"{fname}.csv") to_plot # %% -figsize= (10,8) # None # (10,8) +figsize = (10, 8) # None # (10,8) fig, ax = plt.subplots(figsize=figsize) to_plot.columns.name = '' ax = (to_plot @@ -317,7 +318,19 @@ width=.8, ax=ax, # colormap="Paired", - color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928'] + color=[ + '#a6cee3', + '#1f78b4', + '#b2df8a', + '#33a02c', + '#fb9a99', + '#e31a1c', + '#fdbf6f', + '#ff7f00', + '#cab2d6', + '#6a3d9a', + '#ffff99', + '#b15928'] ) ) ax = vaep.plotting.add_height_to_barplot(ax, size=11) @@ -333,7 +346,7 @@ x='model', y='metric_value', color='data level', - hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data + hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data barmode="group", color_discrete_sequence=px.colors.colorbrewer.Paired, # color_discrete_sequence=['#a6cee3', '#1f78b4', '#b2df8a'], @@ -357,18 +370,18 @@ .loc[best_on_average.index].reset_index() .set_index(['model', 'data level']) .loc[pd.IndexSlice[order_categories['model'], order_categories['data level']], :] - ) + ) to_plot = to_plot.reset_index() to_plot['model annotated'] = to_plot['model'] + ' - ' + to_plot['text'] -order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation +order_model = to_plot['model annotated'].drop_duplicates().to_list() # model name with annotation to_plot = to_plot.drop_duplicates(subset=['model', 'data level', 'metric_value']) -to_plot.to_csv(FOLDER /f"{fname}.csv") +to_plot.to_csv(FOLDER / f"{fname}.csv") to_plot # %% -figsize= (10,8) # None # (10,8) +figsize = (10, 8) # None # (10,8) fig, ax = plt.subplots(figsize=figsize) to_plot.columns.name = '' ax = (to_plot @@ -383,7 +396,19 @@ width=.8, ax=ax, # colormap="Paired", - color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928'] + color=[ + '#a6cee3', + '#1f78b4', + '#b2df8a', + '#33a02c', + '#fb9a99', + 
'#e31a1c', + '#fdbf6f', + '#ff7f00', + '#cab2d6', + '#6a3d9a', + '#ffff99', + '#b15928'] ) ) ax = vaep.plotting.add_height_to_barplot(ax, size=11) @@ -399,7 +424,7 @@ x='model', y='metric_value', color='data level', - hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data + hover_data={'N': ':,d', 'n_params': ':,d'}, # format hover data barmode="group", color_discrete_sequence=px.colors.colorbrewer.Paired, # color_discrete_sequence=['#a6cee3', '#1f78b4', '#b2df8a'], diff --git a/project/03_1_best_models_comparison.ipynb b/project/03_1_best_models_comparison.ipynb index aff217903..0a02134a1 100644 --- a/project/03_1_best_models_comparison.ipynb +++ b/project/03_1_best_models_comparison.ipynb @@ -150,8 +150,8 @@ " split,\n", " :, 'MAE']].stack(1)\n", "view_long = (selected.stack()\n", - " .to_frame('MAE')\n", - " .reset_index())\n", + " .to_frame('MAE')\n", + " .reset_index())\n", "view_long" ] }, diff --git a/project/03_1_best_models_comparison.py b/project/03_1_best_models_comparison.py index 7061b53e2..00bcb8ada 100644 --- a/project/03_1_best_models_comparison.py +++ b/project/03_1_best_models_comparison.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -94,8 +94,8 @@ split, :, 'MAE']].stack(1) view_long = (selected.stack() - .to_frame('MAE') - .reset_index()) + .to_frame('MAE') + .reset_index()) view_long # %% diff --git a/project/03_2_best_models_comparison_fig2.py b/project/03_2_best_models_comparison_fig2.py index 1af9ae1a7..f5e66baca 100644 --- a/project/03_2_best_models_comparison_fig2.py +++ b/project/03_2_best_models_comparison_fig2.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python diff --git a/project/03_3_combine_experiment_result_tables.ipynb b/project/03_3_combine_experiment_result_tables.ipynb index a0b3a2d69..476023e23 100644 --- a/project/03_3_combine_experiment_result_tables.ipynb +++ b/project/03_3_combine_experiment_result_tables.ipynb @@ -34,7 +34,7 @@ "metadata": {}, "outputs": [], "source": [ - "files = {Path(f).parent.name: f for f in snakemake.input }\n", + "files = {Path(f).parent.name: f for f in snakemake.input}\n", "files" ] }, @@ -47,7 +47,7 @@ "source": [ "table = []\n", "for key, file in files.items():\n", - " df = pd.read_excel(file, sheet_name='cp_mean_perf', index_col=0)\n", + " df = pd.read_excel(file, sheet_name=-1, index_col=0)\n", " df.columns = pd.MultiIndex.from_tuples([(key, x) for x in df.columns])\n", " table.append(df)\n", "\n", @@ -71,10 +71,10 @@ "outputs": [], "source": [ "order = (table\n", - " .loc[:, pd.IndexSlice[:, 'val']]\n", - " .mean(axis=1)\n", - " .sort_values()\n", - ")\n", + " .loc[:, pd.IndexSlice[:, 'val']]\n", + " .mean(axis=1)\n", + " .sort_values()\n", + " )\n", "order" ] }, diff --git a/project/03_3_combine_experiment_result_tables.py b/project/03_3_combine_experiment_result_tables.py index 7ba8472ba..37dd49f26 100644 --- a/project/03_3_combine_experiment_result_tables.py +++ b/project/03_3_combine_experiment_result_tables.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -23,7 +23,7 @@ # Use parent folder name as key # %% -files = {Path(f).parent.name: f for f in 
snakemake.input } +files = {Path(f).parent.name: f for f in snakemake.input} files # %% @@ -41,10 +41,10 @@ # %% order = (table - .loc[:, pd.IndexSlice[:, 'val']] - .mean(axis=1) - .sort_values() -) + .loc[:, pd.IndexSlice[:, 'val']] + .mean(axis=1) + .sort_values() + ) order # %% diff --git a/project/03_4_join_tables.py b/project/03_4_join_tables.py index 88e1004f1..f27f29253 100644 --- a/project/03_4_join_tables.py +++ b/project/03_4_join_tables.py @@ -7,7 +7,7 @@ # %% fname = 'runs/appl_ald_data/plasma/proteinGroups_all/01_2_performance_summary.xlsx' ald_pg_perf = pd.read_excel(fname, sheet_name=-1, index_col=0) -ald_pg_perf.columns = pd.MultiIndex.from_tuples([('ALD data','protein groups', x) for x in ald_pg_perf.columns]) +ald_pg_perf.columns = pd.MultiIndex.from_tuples([('ALD data', 'protein groups', x) for x in ald_pg_perf.columns]) ald_pg_perf # %% @@ -29,10 +29,10 @@ # %% order = (table - .loc[:, pd.IndexSlice[:, :, 'val']] - .mean(axis=1) - .sort_values() -) + .loc[:, pd.IndexSlice[:, :, 'val']] + .mean(axis=1) + .sort_values() + ) order # %% @@ -63,10 +63,10 @@ # %% # %% order = (table - .loc[:, pd.IndexSlice[:, 'val']] - .mean(axis=1) - .sort_values() -) + .loc[:, pd.IndexSlice[:, 'val']] + .mean(axis=1) + .sort_values() + ) order # %% diff --git a/project/03_5_join_benchmarks.py b/project/03_5_join_benchmarks.py index 976e3e339..493dff1b1 100644 --- a/project/03_5_join_benchmarks.py +++ b/project/03_5_join_benchmarks.py @@ -17,6 +17,8 @@ # %% # find folders in root folder and get files with tsv extension + + def find_tsv_benchmarks(root_folder: Path): """Find snakemake benchmark files in subfolders of root_folder (pimms workflow) @@ -39,6 +41,7 @@ def find_tsv_benchmarks(root_folder: Path): if file.suffix == '.tsv': yield file + files = find_tsv_benchmarks(root_folder) # %% @@ -47,7 +50,7 @@ def find_tsv_benchmarks(root_folder: Path): # files = (x for x in files if x.is_file()) # %% -COL = 'h:m:s' # 's' for seconds +COL = 'h:m:s' # 's' for seconds SPLIT_TERM = '_train_' data = dict() for file in files: @@ -64,7 +67,7 @@ def find_tsv_benchmarks(root_folder: Path): data = (pd .DataFrame(data) .drop('PRED') -) + ) data # %% diff --git a/project/04_1_train_pimms_models.ipynb b/project/04_1_train_pimms_models.ipynb index 21ac7cd0f..5f305f5d6 100644 --- a/project/04_1_train_pimms_models.ipynb +++ b/project/04_1_train_pimms_models.ipynb @@ -188,7 +188,7 @@ "id": "a76ba4ce", "metadata": {}, "source": [ - "Let's set up collaborative filtering without a validation or test set, using \n", + "Let's set up collaborative filtering without a validation or test set, using\n", "all the data there is." 
] }, @@ -257,10 +257,10 @@ "metadata": {}, "outputs": [], "source": [ - "df_imputed = df_imputed.stack() # long-format\n", + "df_imputed = df_imputed.stack() # long-format\n", "observed = df_imputed.loc[df.index]\n", "imputed = df_imputed.loc[df_imputed.index.difference(df.index)]\n", - "df_imputed = df_imputed.unstack() # back to wide-format\n", + "df_imputed = df_imputed.unstack() # back to wide-format\n", "# some checks\n", "assert len(df) == len(observed)\n", "assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)" @@ -273,30 +273,30 @@ "metadata": {}, "outputs": [], "source": [ - "fig, axes = plt.subplots(2, figsize=(8,4))\n", + "fig, axes = plt.subplots(2, figsize=(8, 4))\n", "\n", "min_max = vaep.plotting.data.get_min_max_iterable(\n", " [observed, imputed])\n", "label_template = '{method} (N={n:,d})'\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", - " observed,\n", - " ax=axes[0],\n", - " min_max=min_max,\n", - " label=label_template.format(method='measured',\n", - " n=len(observed),\n", - " ),\n", - " color='grey',\n", - " alpha=1)\n", + " observed,\n", + " ax=axes[0],\n", + " min_max=min_max,\n", + " label=label_template.format(method='measured',\n", + " n=len(observed),\n", + " ),\n", + " color='grey',\n", + " alpha=1)\n", "_ = ax.legend()\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", - " imputed,\n", - " ax=axes[1],\n", - " min_max=min_max,\n", - " label=label_template.format(method='CF imputed',\n", - " n=len(imputed),\n", - " ),\n", - " color=color_model_mapping['CF'],\n", - " alpha=1)\n", + " imputed,\n", + " ax=axes[1],\n", + " min_max=min_max,\n", + " label=label_template.format(method='CF imputed',\n", + " n=len(imputed),\n", + " ),\n", + " color=color_model_mapping['CF'],\n", + " alpha=1)\n", "_ = ax.legend()" ] }, @@ -330,8 +330,8 @@ "metadata": {}, "source": [ "The AutoEncoder model currently need validation data for training.\n", - "We will use 10% of the training data for validation. \n", - "> Expect this limitation to be dropped in the next release. It will still be recommended \n", + "We will use 10% of the training data for validation.\n", + "> Expect this limitation to be dropped in the next release. It will still be recommended\n", "> to use validation data for early stopping." ] }, @@ -352,7 +352,7 @@ "metadata": {}, "source": [ "We will use the `sampling` module to sample the validation data from the training data.\n", - "Could be split differently by providing another `weights` vector. " + "Could be split differently by providing another `weights` vector." 
] }, { @@ -423,7 +423,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_selected = 'VAE' # 'DAE'\n", + "model_selected = 'VAE' # 'DAE'\n", "model = AETransformer(\n", " model=model_selected,\n", " hidden_layers=[512,],\n", @@ -535,8 +535,8 @@ "metadata": {}, "outputs": [], "source": [ - "df = df.stack() # long-format\n", - "df_imputed = df_imputed.stack() # long-format\n", + "df = df.stack() # long-format\n", + "df_imputed = df_imputed.stack() # long-format\n", "observed = df_imputed.loc[df.index]\n", "imputed = df_imputed.loc[df_imputed.index.difference(df.index)]" ] @@ -550,30 +550,30 @@ }, "outputs": [], "source": [ - "fig, axes = plt.subplots(2, figsize=(8,4))\n", + "fig, axes = plt.subplots(2, figsize=(8, 4))\n", "\n", "min_max = vaep.plotting.data.get_min_max_iterable(\n", " [observed, imputed])\n", "label_template = '{method} (N={n:,d})'\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", - " observed,\n", - " ax=axes[0],\n", - " min_max=min_max,\n", - " label=label_template.format(method='measured',\n", - " n=len(observed),\n", - " ),\n", - " color='grey',\n", - " alpha=1)\n", + " observed,\n", + " ax=axes[0],\n", + " min_max=min_max,\n", + " label=label_template.format(method='measured',\n", + " n=len(observed),\n", + " ),\n", + " color='grey',\n", + " alpha=1)\n", "_ = ax.legend()\n", "ax, _ = vaep.plotting.data.plot_histogram_intensities(\n", - " imputed,\n", - " ax=axes[1],\n", - " min_max=min_max,\n", - " label=label_template.format(method=f'{model_selected} imputed',\n", - " n=len(imputed),\n", - " ),\n", - " color=color_model_mapping[model_selected],\n", - " alpha=1)\n", + " imputed,\n", + " ax=axes[1],\n", + " min_max=min_max,\n", + " label=label_template.format(method=f'{model_selected} imputed',\n", + " n=len(imputed),\n", + " ),\n", + " color=color_model_mapping[model_selected],\n", + " alpha=1)\n", "_ = ax.legend()" ] }, diff --git a/project/04_1_train_pimms_models.py b/project/04_1_train_pimms_models.py index fea106857..3ddae8b24 100644 --- a/project/04_1_train_pimms_models.py +++ b/project/04_1_train_pimms_models.py @@ -86,7 +86,7 @@ # # # # CollaborativeFilteringTransformer? # %% [markdown] -# Let's set up collaborative filtering without a validation or test set, using +# Let's set up collaborative filtering without a validation or test set, using # all the data there is. 
# %% @@ -117,39 +117,39 @@ # Let's plot the distribution of the imputed values vs the ones used for training: # %% -df_imputed = df_imputed.stack() # long-format +df_imputed = df_imputed.stack() # long-format observed = df_imputed.loc[df.index] imputed = df_imputed.loc[df_imputed.index.difference(df.index)] -df_imputed = df_imputed.unstack() # back to wide-format +df_imputed = df_imputed.unstack() # back to wide-format # some checks assert len(df) == len(observed) assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed) # %% -fig, axes = plt.subplots(2, figsize=(8,4)) +fig, axes = plt.subplots(2, figsize=(8, 4)) min_max = vaep.plotting.data.get_min_max_iterable( [observed, imputed]) label_template = '{method} (N={n:,d})' ax, _ = vaep.plotting.data.plot_histogram_intensities( - observed, - ax=axes[0], - min_max=min_max, - label=label_template.format(method='measured', - n=len(observed), - ), - color='grey', - alpha=1) + observed, + ax=axes[0], + min_max=min_max, + label=label_template.format(method='measured', + n=len(observed), + ), + color='grey', + alpha=1) _ = ax.legend() ax, _ = vaep.plotting.data.plot_histogram_intensities( - imputed, - ax=axes[1], - min_max=min_max, - label=label_template.format(method='CF imputed', - n=len(imputed), - ), - color=color_model_mapping['CF'], - alpha=1) + imputed, + ax=axes[1], + min_max=min_max, + label=label_template.format(method='CF imputed', + n=len(imputed), + ), + color=color_model_mapping['CF'], + alpha=1) _ = ax.legend() # %% [markdown] @@ -166,8 +166,8 @@ # %% [markdown] # The AutoEncoder model currently need validation data for training. -# We will use 10% of the training data for validation. -# > Expect this limitation to be dropped in the next release. It will still be recommended +# We will use 10% of the training data for validation. +# > Expect this limitation to be dropped in the next release. It will still be recommended # > to use validation data for early stopping. # %% @@ -176,7 +176,7 @@ # %% [markdown] # We will use the `sampling` module to sample the validation data from the training data. -# Could be split differently by providing another `weights` vector. +# Could be split differently by providing another `weights` vector. 
# %% val_X, train_X = vaep.sampling.sample_data(df.stack(), @@ -204,7 +204,7 @@ # Select either `DAE` or `VAE` model: # %% -model_selected = 'VAE' # 'DAE' +model_selected = 'VAE' # 'DAE' model = AETransformer( model=model_selected, hidden_layers=[512,], @@ -257,36 +257,36 @@ df_imputed = df_imputed.replace(val_X) # %% -df = df.stack() # long-format -df_imputed = df_imputed.stack() # long-format +df = df.stack() # long-format +df_imputed = df_imputed.stack() # long-format observed = df_imputed.loc[df.index] imputed = df_imputed.loc[df_imputed.index.difference(df.index)] # %% -fig, axes = plt.subplots(2, figsize=(8,4)) +fig, axes = plt.subplots(2, figsize=(8, 4)) min_max = vaep.plotting.data.get_min_max_iterable( [observed, imputed]) label_template = '{method} (N={n:,d})' ax, _ = vaep.plotting.data.plot_histogram_intensities( - observed, - ax=axes[0], - min_max=min_max, - label=label_template.format(method='measured', - n=len(observed), - ), - color='grey', - alpha=1) + observed, + ax=axes[0], + min_max=min_max, + label=label_template.format(method='measured', + n=len(observed), + ), + color='grey', + alpha=1) _ = ax.legend() ax, _ = vaep.plotting.data.plot_histogram_intensities( - imputed, - ax=axes[1], - min_max=min_max, - label=label_template.format(method=f'{model_selected} imputed', - n=len(imputed), - ), - color=color_model_mapping[model_selected], - alpha=1) + imputed, + ax=axes[1], + min_max=min_max, + label=label_template.format(method=f'{model_selected} imputed', + n=len(imputed), + ), + color=color_model_mapping[model_selected], + alpha=1) _ = ax.legend() diff --git a/project/10_0_ald_data.ipynb b/project/10_0_ald_data.ipynb index 01027e035..e006f94e4 100644 --- a/project/10_0_ald_data.ipynb +++ b/project/10_0_ald_data.ipynb @@ -47,14 +47,14 @@ "print(*(folder_data.iterdir()), sep='\\n')\n", "\n", "fnames = dict(\n", - "plasma_proteinGroups = folder_data / 'Protein_ALDupgrade_Report.csv',\n", - "plasma_aggPeptides = folder_data / 'ald_proteome_spectronaut.tsv',\n", - "liver_proteinGroups = folder_data / 'Protein_20200221_121354_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv',\n", - "liver_aggPeptides = folder_data / 'Peptide_20220819_100847_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv',\n", - "annotations = folder_data / 'ald_experiment_annotations.csv',\n", - "clinic = folder_data / 'labtest_integrated_numeric.csv',\n", - "raw_meta = folder_data / 'ald_metadata_rawfiles.csv')\n", - "fnames =vaep.nb.Config.from_dict(fnames) # could be handeled kwargs as in normal dict" + " plasma_proteinGroups=folder_data / 'Protein_ALDupgrade_Report.csv',\n", + " plasma_aggPeptides=folder_data / 'ald_proteome_spectronaut.tsv',\n", + " liver_proteinGroups=folder_data / 'Protein_20200221_121354_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv',\n", + " liver_aggPeptides=folder_data / 'Peptide_20220819_100847_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv',\n", + " annotations=folder_data / 'ald_experiment_annotations.csv',\n", + " clinic=folder_data / 'labtest_integrated_numeric.csv',\n", + " raw_meta=folder_data / 'ald_metadata_rawfiles.csv')\n", + "fnames = vaep.nb.Config.from_dict(fnames) # could be handeled kwargs as in normal dict" ] }, { @@ -128,7 +128,7 @@ "metadata": {}, "outputs": [], "source": [ - "annotations['Participant ID'].value_counts().value_counts() # some only have a blood sample, some both" + "annotations['Participant ID'].value_counts().value_counts() # some only have a blood sample, some both" ] }, { @@ -306,7 +306,7 @@ "metadata": {}, "outputs": [], 
"source": [ - "clinic.loc[idx_overlap_plasma].to_csv(folder_data_out /'ald_metadata_cli.csv')" + "clinic.loc[idx_overlap_plasma].to_csv(folder_data_out / 'ald_metadata_cli.csv')" ] }, { @@ -456,7 +456,7 @@ "id": "cfe1c458-dc61-4890-b430-6efa7eb89e72", "metadata": {}, "source": [ - "## (Aggregated) Peptide Data " + "## (Aggregated) Peptide Data" ] }, { @@ -603,7 +603,7 @@ "metadata": {}, "outputs": [], "source": [ - "id_mappings = [\"PEP.StrippedSequence\", \"PG.ProteinAccessions\", \"PG.Genes\"]\n", + "id_mappings = [\"PEP.StrippedSequence\", \"PG.ProteinAccessions\", \"PG.Genes\"]\n", "id_mappings = meta[id_mappings].drop_duplicates()\n", "id_mappings.to_csv(folder_data_out / 'ald_plasma_aggPeptides_id_mappings.csv')\n", "id_mappings" @@ -618,25 +618,26 @@ "\n", "taken from [Spectronaut manuel](https://biognosys.com/resources/spectronaut-manual/)\n", "\n", - "feature | description \n", + "feature | description\n", "--- | ---\n", "PEP.IsProteinGroupSpecific | True or False. Tells you whether the peptide only belongs to one Protein Group.\n", "PEP.StrippedSequence | -\n", "PEP.IsProteotypic | -\n", "PEP.PeptidePosition | -\n", - "PG.Cscore | - \n", + "PG.Cscore | -\n", "PG.ProteinAccessions | -\n", - "PG.Genes | - \n", + "PG.Genes | -\n", "PEP.Quantity | The quantitative value for that peptide as defined in the settings.\n", - "EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] \n", + "EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge]\n", "EG.Qvalue | The q-value (FDR) of the EG.\n", - "EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. \n", + "EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings.\n", "\n", - "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. \n", + "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious.\n", "\n", - "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. \n", + "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious.\n", "\n", - "After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated Peptide. Duplicated entries are just to drop" + "After discussing with Lili, `PEP.Quantity` is the fitting entity for\n", + "each unique aggregated Peptide. 
Duplicated entries are just to drop" ] }, { @@ -646,7 +647,7 @@ "metadata": {}, "outputs": [], "source": [ - "sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity'] # selected quantity in last position\n", + "sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity'] # selected quantity in last position\n", "df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:2])\n", "df" ] @@ -691,7 +692,7 @@ "id": "b823acaf-2610-4b0a-91d8-2d6dd6ff4182", "metadata": {}, "source": [ - "- rawfile metadata -> keep " + "- rawfile metadata -> keep" ] }, { @@ -732,7 +733,8 @@ "id": "5dddafbb-edd7-4ef0-9787-3120b24d7f79", "metadata": {}, "source": [ - "For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`)" + "For one raw file no metadata could be extracted (`ERROR: Unable to\n", + "access the RAW file using the native Thermo library.`)" ] }, { @@ -875,7 +877,7 @@ "metadata": {}, "outputs": [], "source": [ - "id_mappings = [\"PG.ProteinAccessions\", \"PG.Genes\"]\n", + "id_mappings = [\"PG.ProteinAccessions\", \"PG.Genes\"]\n", "id_mappings = meta[id_mappings].drop_duplicates()\n", "id_mappings.to_csv(folder_data_out / 'ald_plasma_proteinGroups_id_mappings.csv', index=False)\n", "id_mappings" @@ -900,7 +902,7 @@ "outputs": [], "source": [ "column_types = ['.'.join(x for x in tup) for tup in list(column_types.unique())]\n", - "column_types # 'PG.Quantity' expected" + "column_types # 'PG.Quantity' expected" ] }, { @@ -929,11 +931,12 @@ "metadata": {}, "outputs": [], "source": [ - "def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list):\n", - " to_drop = [x for x in df.index.names if not x in idx_to_keep]\n", + "def find_idx_to_drop(df: pd.DataFrame, idx_to_keep: list):\n", + " to_drop = [x for x in df.index.names if x not in idx_to_keep]\n", " logger.info(\"Columnns to drop: {}\".format(\",\".join((str(x) for x in to_drop))))\n", " return to_drop\n", - " \n", + "\n", + "\n", "to_drop = find_idx_to_drop(df, idx_cols)\n", "df = df.reset_index(level=to_drop, drop=True)\n", "df.head()" @@ -1135,7 +1138,7 @@ " 'title': 'protein group measurement distribution'}\n", "\n", "ax = vaep.plotting.plot_counts(des_data.T.sort_values(by='count', ascending=False).reset_index(\n", - "), feat_col_name='count', n_samples=len(df), ax=None, min_feat_prop=.0,**kwargs)\n", + "), feat_col_name='count', n_samples=len(df), ax=None, min_feat_prop=.0, **kwargs)\n", "\n", "fig = ax.get_figure()\n", "fig.tight_layout()\n", @@ -1447,7 +1450,7 @@ }, "outputs": [], "source": [ - "id_mappings = [\"PEP.StrippedSequence\", \"PG.ProteinAccessions\", \"PG.Genes\"]\n", + "id_mappings = [\"PEP.StrippedSequence\", \"PG.ProteinAccessions\", \"PG.Genes\"]\n", "id_mappings = meta[id_mappings].drop_duplicates()\n", "id_mappings.to_csv(folder_data_out / 'ald_liver_aggPeptides_id_mappings.csv')\n", "id_mappings" @@ -1462,25 +1465,26 @@ "\n", "taken from [Spectronaut manuel](https://biognosys.com/resources/spectronaut-manual/)\n", "\n", - "feature | description \n", + "feature | description\n", "--- | ---\n", "PEP.IsProteinGroupSpecific | True or False. 
Tells you whether the peptide only belongs to one Protein Group.\n", "PEP.StrippedSequence | -\n", "PEP.IsProteotypic | -\n", "PEP.PeptidePosition | -\n", - "PG.Cscore | - \n", + "PG.Cscore | -\n", "PG.ProteinAccessions | -\n", - "PG.Genes | - \n", + "PG.Genes | -\n", "PEP.Quantity | The quantitative value for that peptide as defined in the settings.\n", - "EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] \n", + "EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge]\n", "EG.Qvalue | The q-value (FDR) of the EG.\n", - "EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. \n", + "EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings.\n", "\n", - "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. \n", + "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious.\n", "\n", - "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. \n", + "> Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious.\n", "\n", - "After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated Peptide. Duplicated entries are just to drop" + "After discussing with Lili, `PEP.Quantity` is the fitting entity for\n", + "each unique aggregated Peptide. Duplicated entries are just to drop" ] }, { @@ -1490,7 +1494,7 @@ "metadata": {}, "outputs": [], "source": [ - "sel_cols = ['Sample ID', 'PEP.StrippedSequence', VAR_PEP] # selected quantity in last position\n", + "sel_cols = ['Sample ID', 'PEP.StrippedSequence', VAR_PEP] # selected quantity in last position\n", "df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:2]).squeeze()\n", "df" ] @@ -1513,7 +1517,7 @@ "metadata": {}, "source": [ "Select entry with maximum intensity of `duplicated entries`\n", - " \n", + "\n", "> change of variable and many duplicates -> could be PSM table? 
(close to evidence?)" ] }, @@ -1535,7 +1539,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = vaep.pandas.select_max_by(df=df.reset_index(), grouping_columns=sel_cols[:-1], selection_column=sel_cols[-1]).set_index(sel_cols[:-1])" + "df = vaep.pandas.select_max_by(df=df.reset_index(),\n", + " grouping_columns=sel_cols[:-1],\n", + " selection_column=sel_cols[-1]).set_index(sel_cols[:-1])" ] }, { @@ -1545,7 +1551,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert df.index.duplicated(False).sum() == 0 , \"Still missing values\"" + "assert df.index.duplicated(False).sum() == 0, \"Still missing values\"" ] }, { @@ -1577,7 +1583,7 @@ "id": "529fa0e7-7ad4-4c72-91b1-d37587835ce5", "metadata": {}, "source": [ - "- rawfile metadata -> keep " + "- rawfile metadata -> keep" ] }, { @@ -1614,7 +1620,7 @@ "%%time\n", "# des_data = df.describe() unnecessary computation which take too long\n", "des_data = df.isna().sum().to_frame('count').T\n", - "des_data " + "des_data" ] }, { @@ -1630,7 +1636,8 @@ "id": "44616770-fcc2-4a97-86f4-e0eadc98bb7a", "metadata": {}, "source": [ - "For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`)" + "For one raw file no metadata could be extracted (`ERROR: Unable to\n", + "access the RAW file using the native Thermo library.`)" ] }, { @@ -1777,7 +1784,7 @@ "metadata": {}, "outputs": [], "source": [ - "id_mappings = [\"PG.ProteinAccessions\", \"PG.Genes\"]\n", + "id_mappings = [\"PG.ProteinAccessions\", \"PG.Genes\"]\n", "id_mappings = meta[id_mappings].drop_duplicates()\n", "id_mappings.to_csv(folder_data_out / 'ald_liver_proteinGroups_id_mappings.csv')\n", "id_mappings" @@ -1802,7 +1809,7 @@ "outputs": [], "source": [ "column_types = ['.'.join(x for x in tup) for tup in list(column_types.unique())]\n", - "column_types # 'PG.Quantity' expected" + "column_types # 'PG.Quantity' expected" ] }, { @@ -1900,7 +1907,7 @@ "metadata": {}, "outputs": [], "source": [ - "sel_cols = ['PG.ProteinAccessions', 'PG.Genes', 'Sample ID', VAR_PG] # last one gives quantity\n", + "sel_cols = ['PG.ProteinAccessions', 'PG.Genes', 'Sample ID', VAR_PG] # last one gives quantity\n", "df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:-1])" ] }, diff --git a/project/10_0_ald_data.py b/project/10_0_ald_data.py index f0c54f0d7..1e179b55e 100644 --- a/project/10_0_ald_data.py +++ b/project/10_0_ald_data.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -38,14 +38,14 @@ print(*(folder_data.iterdir()), sep='\n') fnames = dict( -plasma_proteinGroups = folder_data / 'Protein_ALDupgrade_Report.csv', -plasma_aggPeptides = folder_data / 'ald_proteome_spectronaut.tsv', -liver_proteinGroups = folder_data / 'Protein_20200221_121354_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv', -liver_aggPeptides = folder_data / 'Peptide_20220819_100847_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv', -annotations = folder_data / 'ald_experiment_annotations.csv', -clinic = folder_data / 'labtest_integrated_numeric.csv', -raw_meta = folder_data / 'ald_metadata_rawfiles.csv') -fnames =vaep.nb.Config.from_dict(fnames) # could be handeled kwargs as in normal dict + plasma_proteinGroups=folder_data / 'Protein_ALDupgrade_Report.csv', + plasma_aggPeptides=folder_data / 'ald_proteome_spectronaut.tsv', + liver_proteinGroups=folder_data / 
'Protein_20200221_121354_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv', + liver_aggPeptides=folder_data / 'Peptide_20220819_100847_20200218_ALD_LiverTissue_PlateS1_Atlaslib_Report.csv', + annotations=folder_data / 'ald_experiment_annotations.csv', + clinic=folder_data / 'labtest_integrated_numeric.csv', + raw_meta=folder_data / 'ald_metadata_rawfiles.csv') +fnames = vaep.nb.Config.from_dict(fnames) # could be handeled kwargs as in normal dict # %% @@ -77,7 +77,7 @@ annotations # %% -annotations['Participant ID'].value_counts().value_counts() # some only have a blood sample, some both +annotations['Participant ID'].value_counts().value_counts() # some only have a blood sample, some both # %% [markdown] # ### Select ALD subcohort @@ -147,7 +147,7 @@ clinic["kleiner"].value_counts() # %% -clinic.loc[idx_overlap_plasma].to_csv(folder_data_out /'ald_metadata_cli.csv') +clinic.loc[idx_overlap_plasma].to_csv(folder_data_out / 'ald_metadata_cli.csv') # %% [markdown] # ## Rawfile information @@ -212,7 +212,7 @@ # > see section below # %% [markdown] -# ## (Aggregated) Peptide Data +# ## (Aggregated) Peptide Data # %% df = pd.read_table(fnames.plasma_aggPeptides, low_memory=False) @@ -263,7 +263,7 @@ meta.describe(include='all') # %% -id_mappings = ["PEP.StrippedSequence", "PG.ProteinAccessions", "PG.Genes"] +id_mappings = ["PEP.StrippedSequence", "PG.ProteinAccessions", "PG.Genes"] id_mappings = meta[id_mappings].drop_duplicates() id_mappings.to_csv(folder_data_out / 'ald_plasma_aggPeptides_id_mappings.csv') id_mappings @@ -273,28 +273,29 @@ # # taken from [Spectronaut manuel](https://biognosys.com/resources/spectronaut-manual/) # -# feature | description +# feature | description # --- | --- # PEP.IsProteinGroupSpecific | True or False. Tells you whether the peptide only belongs to one Protein Group. # PEP.StrippedSequence | - # PEP.IsProteotypic | - # PEP.PeptidePosition | - -# PG.Cscore | - +# PG.Cscore | - # PG.ProteinAccessions | - -# PG.Genes | - +# PG.Genes | - # PEP.Quantity | The quantitative value for that peptide as defined in the settings. -# EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] +# EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] # EG.Qvalue | The q-value (FDR) of the EG. -# EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. +# EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. # -# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. +# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. # -# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. +# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. # -# After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated Peptide. Duplicated entries are just to drop +# After discussing with Lili, `PEP.Quantity` is the fitting entity for +# each unique aggregated Peptide. 
Duplicated entries are just to drop # %% -sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity'] # selected quantity in last position +sel_cols = ['Sample ID', 'PEP.StrippedSequence', 'PEP.Quantity'] # selected quantity in last position df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:2]) df @@ -313,7 +314,7 @@ idx.describe() # %% [markdown] -# - rawfile metadata -> keep +# - rawfile metadata -> keep # %% df = df.set_index(idx) @@ -330,7 +331,8 @@ # ### Check for metadata from rawfile overlap # %% [markdown] -# For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`) +# For one raw file no metadata could be extracted (`ERROR: Unable to +# access the RAW file using the native Thermo library.`) # %% idx_diff = df.index.difference(raw_meta.index) @@ -390,7 +392,7 @@ meta.describe(include='all') # %% -id_mappings = ["PG.ProteinAccessions", "PG.Genes"] +id_mappings = ["PG.ProteinAccessions", "PG.Genes"] id_mappings = meta[id_mappings].drop_duplicates() id_mappings.to_csv(folder_data_out / 'ald_plasma_proteinGroups_id_mappings.csv', index=False) id_mappings @@ -401,7 +403,7 @@ # %% column_types = ['.'.join(x for x in tup) for tup in list(column_types.unique())] -column_types # 'PG.Quantity' expected +column_types # 'PG.Quantity' expected # %% df = df.set_index(list(df.columns[:N_FRIST_META])).sort_index(axis=1) @@ -412,11 +414,12 @@ # Drop index columns which are not selected # %% -def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): - to_drop = [x for x in df.index.names if not x in idx_to_keep] +def find_idx_to_drop(df: pd.DataFrame, idx_to_keep: list): + to_drop = [x for x in df.index.names if x not in idx_to_keep] logger.info("Columnns to drop: {}".format(",".join((str(x) for x in to_drop)))) return to_drop - + + to_drop = find_idx_to_drop(df, idx_cols) df = df.reset_index(level=to_drop, drop=True) df.head() @@ -503,7 +506,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): 'title': 'protein group measurement distribution'} ax = vaep.plotting.plot_counts(des_data.T.sort_values(by='count', ascending=False).reset_index( -), feat_col_name='count', n_samples=len(df), ax=None, min_feat_prop=.0,**kwargs) +), feat_col_name='count', n_samples=len(df), ax=None, min_feat_prop=.0, **kwargs) fig = ax.get_figure() fig.tight_layout() @@ -623,7 +626,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): meta # %% -id_mappings = ["PEP.StrippedSequence", "PG.ProteinAccessions", "PG.Genes"] +id_mappings = ["PEP.StrippedSequence", "PG.ProteinAccessions", "PG.Genes"] id_mappings = meta[id_mappings].drop_duplicates() id_mappings.to_csv(folder_data_out / 'ald_liver_aggPeptides_id_mappings.csv') id_mappings @@ -634,28 +637,29 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): # # taken from [Spectronaut manuel](https://biognosys.com/resources/spectronaut-manual/) # -# feature | description +# feature | description # --- | --- # PEP.IsProteinGroupSpecific | True or False. Tells you whether the peptide only belongs to one Protein Group. # PEP.StrippedSequence | - # PEP.IsProteotypic | - # PEP.PeptidePosition | - -# PG.Cscore | - +# PG.Cscore | - # PG.ProteinAccessions | - -# PG.Genes | - +# PG.Genes | - # PEP.Quantity | The quantitative value for that peptide as defined in the settings. -# EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] +# EG.PrecursorId | Unique Id for the precursor: [modified sequence] plus [charge] # EG.Qvalue | The q-value (FDR) of the EG. 
-# EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. +# EG.TotalQuantity (Settings) | The quantitative value for that EG as defined in the settings. # -# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. +# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. # -# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. +# > Headers related to Peptides (PEP) as defined in the settings. Many headers related to Peptides are self-explanatory. Here are the most relevant and some which are not too obvious. # -# After discussing with Lili, `PEP.Quantity` is the fitting entity for each unique aggregated Peptide. Duplicated entries are just to drop +# After discussing with Lili, `PEP.Quantity` is the fitting entity for +# each unique aggregated Peptide. Duplicated entries are just to drop # %% -sel_cols = ['Sample ID', 'PEP.StrippedSequence', VAR_PEP] # selected quantity in last position +sel_cols = ['Sample ID', 'PEP.StrippedSequence', VAR_PEP] # selected quantity in last position df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:2]).squeeze() df @@ -666,7 +670,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): # %% [markdown] # Select entry with maximum intensity of `duplicated entries` -# +# # > change of variable and many duplicates -> could be PSM table? (close to evidence?) # %% @@ -674,10 +678,12 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): df.loc[mask_idx_duplicated].sort_index() # %% -df = vaep.pandas.select_max_by(df=df.reset_index(), grouping_columns=sel_cols[:-1], selection_column=sel_cols[-1]).set_index(sel_cols[:-1]) +df = vaep.pandas.select_max_by(df=df.reset_index(), + grouping_columns=sel_cols[:-1], + selection_column=sel_cols[-1]).set_index(sel_cols[:-1]) # %% -assert df.index.duplicated(False).sum() == 0 , "Still missing values" +assert df.index.duplicated(False).sum() == 0, "Still missing values" # %% df = df.unstack() @@ -690,7 +696,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): idx.describe() # %% [markdown] -# - rawfile metadata -> keep +# - rawfile metadata -> keep # %% df = df.set_index(idx) @@ -706,13 +712,14 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): # %%time # des_data = df.describe() unnecessary computation which take too long des_data = df.isna().sum().to_frame('count').T -des_data +des_data # %% [markdown] # ### Check for metadata from rawfile overlap # %% [markdown] -# For one raw file no metadata could be extracted (`ERROR: Unable to access the RAW file using the native Thermo library.`) +# For one raw file no metadata could be extracted (`ERROR: Unable to +# access the RAW file using the native Thermo library.`) # %% # idx_diff = df.index.difference(raw_meta.index) @@ -773,7 +780,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): meta.describe(include='all') # %% -id_mappings = ["PG.ProteinAccessions", "PG.Genes"] +id_mappings = ["PG.ProteinAccessions", "PG.Genes"] id_mappings = meta[id_mappings].drop_duplicates() id_mappings.to_csv(folder_data_out / 'ald_liver_proteinGroups_id_mappings.csv') id_mappings @@ -784,7 +791,7 @@ def 
find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): # %% column_types = ['.'.join(x for x in tup) for tup in list(column_types.unique())] -column_types # 'PG.Quantity' expected +column_types # 'PG.Quantity' expected # %% df = df.set_index(list(df.columns[:N_FRIST_META])).sort_index(axis=1) @@ -824,7 +831,7 @@ def find_idx_to_drop(df:pd.DataFrame, idx_to_keep:list): df # %% -sel_cols = ['PG.ProteinAccessions', 'PG.Genes', 'Sample ID', VAR_PG] # last one gives quantity +sel_cols = ['PG.ProteinAccessions', 'PG.Genes', 'Sample ID', VAR_PG] # last one gives quantity df = df.reset_index()[sel_cols].drop_duplicates().set_index(sel_cols[:-1]) # %% diff --git a/project/10_0_ald_data_3v3.py b/project/10_0_ald_data_3v3.py new file mode 100644 index 000000000..3d75c14c4 --- /dev/null +++ b/project/10_0_ald_data_3v3.py @@ -0,0 +1,31 @@ +# %% +from pathlib import Path +import pandas as pd + +# %% +FN_INTENSITIES = "data/ALD_study/processed/ald_plasma_proteinGroups.pkl" +fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv" + +FN_INTENSITIES = Path(FN_INTENSITIES) + +# %% +df = pd.read_pickle(FN_INTENSITIES) +df + +# %% +meta = pd.read_csv(fn_clinical_data, index_col=0) +meta + +# %% +sel = pd.concat( + [df.loc[meta['kleiner'] == 0].sample(3), + df.loc[meta['kleiner'] == 4].sample(3), + ]) +sel + +# %% +fname = FN_INTENSITIES.parent / f'{FN_INTENSITIES.stem}_3v3.pkl' +sel.to_pickle(fname) +fname.as_posix() + +# %% diff --git a/project/10_1_ald_diff_analysis.ipynb b/project/10_1_ald_diff_analysis.ipynb index 59b6d06ae..95ca6552e 100644 --- a/project/10_1_ald_diff_analysis.ipynb +++ b/project/10_1_ald_diff_analysis.ipynb @@ -8,7 +8,7 @@ "\n", "- load missing values predictions\n", "- leave all other values as they were\n", - "- compare missing values predicition by model with baseline method \n", + "- compare missing values predicition by model with baseline method\n", " (default: draw from shifted normal distribution. 
short RSN)" ] }, @@ -66,19 +66,19 @@ "folder_experiment = \"runs/appl_ald_data/plasma/proteinGroups\"\n", "folder_data: str = '' # specify data directory if needed\n", "fn_clinical_data = \"data/ALD_study/processed/ald_metadata_cli.csv\"\n", - "fn_qc_samples = '' #'data/ALD_study/processed/qc_plasma_proteinGroups.pkl'\n", + "fn_qc_samples = '' # 'data/ALD_study/processed/qc_plasma_proteinGroups.pkl'\n", "f_annotations = 'data/ALD_study/processed/ald_plasma_proteinGroups_id_mappings.csv'\n", "\n", "\n", "target: str = 'kleiner'\n", - "covar:str = 'age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num'\n", + "covar: str = 'age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num'\n", "\n", "file_format = \"csv\"\n", - "model_key = 'VAE' # model(s) to evaluate\n", - "model = None # default same as model_key, but could be overwritten (edge case)\n", - "value_name='intensity'\n", - "out_folder='diff_analysis'\n", - "template_pred = 'pred_real_na_{}.csv' # fixed, do not change" + "model_key = 'VAE' # model(s) to evaluate\n", + "model = None # default same as model_key, but could be overwritten (edge case)\n", + "value_name = 'intensity'\n", + "out_folder = 'diff_analysis'\n", + "template_pred = 'pred_real_na_{}.csv' # fixed, do not change" ] }, { @@ -180,7 +180,7 @@ "source": [ "df_clinic = pd.read_csv(args.fn_clinical_data, index_col=0)\n", "df_clinic = df_clinic.loc[observed.index.levels[0]]\n", - "cols_clinic = vaep.pandas.get_columns_accessor(df_clinic) # pick Berlin as reference?\n", + "cols_clinic = vaep.pandas.get_columns_accessor(df_clinic)\n", "df_clinic[[args.target, *args.covar]].describe()" ] }, @@ -321,7 +321,7 @@ "DATA_COMPLETENESS = 0.6\n", "# MIN_N_PROTEIN_GROUPS: int = 200\n", "FRAC_PROTEIN_GROUPS: int = 0.622\n", - "CV_QC_SAMPLE: float = 0.4 # Coef. of variation on 13 QC samples\n", + "CV_QC_SAMPLE: float = 0.4 # Coef. 
of variation on 13 QC samples\n", "\n", "ald_study, cutoffs = vaep.analyzers.diff_analysis.select_raw_data(observed.unstack(\n", "), data_completeness=DATA_COMPLETENESS, frac_protein_groups=FRAC_PROTEIN_GROUPS)\n", @@ -340,14 +340,14 @@ " qc_samples = pd.read_pickle(args.fn_qc_samples)\n", " qc_cv_feat = qc_samples.std() / qc_samples.mean()\n", " qc_cv_feat = qc_cv_feat.rename(qc_samples.columns.name)\n", - " fig, ax = plt.subplots(figsize=(4,7))\n", + " fig, ax = plt.subplots(figsize=(4, 7))\n", " ax = qc_cv_feat.plot.box(ax=ax)\n", " ax.set_ylabel('Coefficient of Variation')\n", " vaep.savefig(fig, name='cv_qc_samples', folder=args.out_figures)\n", " print((qc_cv_feat < CV_QC_SAMPLE).value_counts())\n", " # only to ald_study data\n", " ald_study = ald_study[vaep.analyzers.diff_analysis.select_feat(qc_samples[ald_study.columns])]\n", - " \n", + "\n", "ald_study" ] }, @@ -360,8 +360,8 @@ "outputs": [], "source": [ "fig, axes = vaep.plotting.plot_cutoffs(observed.unstack(),\n", - " feat_completness_over_samples=cutoffs.feat_completness_over_samples,\n", - " min_feat_in_sample=cutoffs.min_feat_in_sample)\n", + " feat_completness_over_samples=cutoffs.feat_completness_over_samples,\n", + " min_feat_in_sample=cutoffs.min_feat_in_sample)\n", "vaep.savefig(fig, name='tresholds_normal_imputation', folder=args.out_figures)" ] }, @@ -399,7 +399,7 @@ "outputs": [], "source": [ "fname = args.out_preds / args.template_pred.format(args.model)\n", - "fname " + "fname" ] }, { @@ -462,7 +462,7 @@ "def plot_distributions(observed: pd.Series,\n", " imputation: pd.Series = None,\n", " model_key: str = 'MODEL',\n", - " figsize=(4,3),\n", + " figsize=(4, 3),\n", " sharex=True):\n", " \"\"\"Plots distributions of intensities provided as dictionary of labels to pd.Series.\"\"\"\n", " series_ = [observed, imputation] if imputation is not None else [observed]\n", @@ -474,8 +474,8 @@ " else:\n", " fig, ax = plt.subplots(1, figsize=figsize, sharex=sharex)\n", "\n", - " bins = range(min_bin, max_bin+1, 1)\n", - " \n", + " bins = range(min_bin, max_bin + 1, 1)\n", + "\n", " label = 'observed measurments'\n", " ax = observed.hist(ax=ax, bins=bins, color='grey')\n", " ax.set_title(f'{label} (N={len(observed):,d})')\n", @@ -483,14 +483,13 @@ " ax.locator_params(axis='y', integer=True)\n", " ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", "\n", - "\n", " if imputation is not None:\n", " ax = axes[1]\n", " label = f'Missing values imputed using {model_key.upper()}'\n", " color = vaep.plotting.defaults.color_model_mapping.get(model_key, None)\n", " if color is None:\n", " color = f'C{1}'\n", - " ax = imputation.hist(ax=ax,bins=bins, color=color)\n", + " ax = imputation.hist(ax=ax, bins=bins, color=color)\n", " ax.set_title(f'{label} (N={len(imputation):,d})')\n", " ax.set_ylabel('observations')\n", " ax.locator_params(axis='y', integer=True)\n", @@ -522,7 +521,7 @@ "source": [ "if pred_real_na is not None:\n", " shifts = (vaep.imputation.compute_moments_shift(observed, pred_real_na,\n", - " names=('observed', args.model_key)))\n", + " names=('observed', args.model_key)))\n", " display(pd.DataFrame(shifts).T)" ] }, @@ -543,8 +542,8 @@ " index_level = 0 # per sample\n", " mean_by_sample = pd.DataFrame(\n", " {'observed': vaep.imputation.stats_by_level(observed, index_level=index_level),\n", - " args.model_key: vaep.imputation.stats_by_level(pred_real_na, index_level=index_level)\n", - " })\n", + " args.model_key: vaep.imputation.stats_by_level(pred_real_na, index_level=index_level)\n", + " })\n", " 
mean_by_sample.loc['mean_shift'] = (mean_by_sample.loc['mean', 'observed'] -\n", " mean_by_sample.loc['mean']).abs() / mean_by_sample.loc['std', 'observed']\n", " mean_by_sample.loc['std shrinkage'] = mean_by_sample.loc['std'] / \\\n", @@ -602,10 +601,10 @@ "outputs": [], "source": [ "scores = vaep.stats.diff_analysis.analyze(df_proteomics=df,\n", - " df_clinic=df_clinic,\n", - " target=args.target,\n", - " covar=args.covar,\n", - " value_name=args.value_name)\n", + " df_clinic=df_clinic,\n", + " target=args.target,\n", + " covar=args.covar,\n", + " value_name=args.value_name)\n", "\n", "scores" ] @@ -623,7 +622,7 @@ " scores = (scores\n", " .join(gene_to_PG)\n", " .set_index(gene_to_PG.columns.to_list(), append=True)\n", - " )\n", + " )\n", "scores" ] }, @@ -648,7 +647,7 @@ }, "outputs": [], "source": [ - "fname = args.out_folder/ 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl'\n", + "fname = args.out_folder / 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl'\n", "files_out[fname.name] = fname.as_posix()\n", "fname.parent.mkdir(exist_ok=True, parents=True)\n", "scores.to_pickle(fname)\n", diff --git a/project/10_1_ald_diff_analysis.py b/project/10_1_ald_diff_analysis.py index cc5c3b6bb..15893ff5c 100644 --- a/project/10_1_ald_diff_analysis.py +++ b/project/10_1_ald_diff_analysis.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -17,7 +17,7 @@ # # - load missing values predictions # - leave all other values as they were -# - compare missing values predicition by model with baseline method +# - compare missing values predicition by model with baseline method # (default: draw from shifted normal distribution. short RSN) # %% @@ -48,19 +48,19 @@ folder_experiment = "runs/appl_ald_data/plasma/proteinGroups" folder_data: str = '' # specify data directory if needed fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv" -fn_qc_samples = '' #'data/ALD_study/processed/qc_plasma_proteinGroups.pkl' +fn_qc_samples = '' # 'data/ALD_study/processed/qc_plasma_proteinGroups.pkl' f_annotations = 'data/ALD_study/processed/ald_plasma_proteinGroups_id_mappings.csv' target: str = 'kleiner' -covar:str = 'age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num' +covar: str = 'age,bmi,gender_num,nas_steatosis_ordinal,abstinent_num' file_format = "csv" -model_key = 'VAE' # model(s) to evaluate -model = None # default same as model_key, but could be overwritten (edge case) -value_name='intensity' -out_folder='diff_analysis' -template_pred = 'pred_real_na_{}.csv' # fixed, do not change +model_key = 'VAE' # model(s) to evaluate +model = None # default same as model_key, but could be overwritten (edge case) +value_name = 'intensity' +out_folder = 'diff_analysis' +template_pred = 'pred_real_na_{}.csv' # fixed, do not change # %% @@ -109,7 +109,7 @@ # %% df_clinic = pd.read_csv(args.fn_clinical_data, index_col=0) df_clinic = df_clinic.loc[observed.index.levels[0]] -cols_clinic = vaep.pandas.get_columns_accessor(df_clinic) # pick Berlin as reference? +cols_clinic = vaep.pandas.get_columns_accessor(df_clinic) df_clinic[[args.target, *args.covar]].describe() @@ -183,7 +183,7 @@ DATA_COMPLETENESS = 0.6 # MIN_N_PROTEIN_GROUPS: int = 200 FRAC_PROTEIN_GROUPS: int = 0.622 -CV_QC_SAMPLE: float = 0.4 # Coef. of variation on 13 QC samples +CV_QC_SAMPLE: float = 0.4 # Coef. 
of variation on 13 QC samples ald_study, cutoffs = vaep.analyzers.diff_analysis.select_raw_data(observed.unstack( ), data_completeness=DATA_COMPLETENESS, frac_protein_groups=FRAC_PROTEIN_GROUPS) @@ -196,20 +196,20 @@ qc_samples = pd.read_pickle(args.fn_qc_samples) qc_cv_feat = qc_samples.std() / qc_samples.mean() qc_cv_feat = qc_cv_feat.rename(qc_samples.columns.name) - fig, ax = plt.subplots(figsize=(4,7)) + fig, ax = plt.subplots(figsize=(4, 7)) ax = qc_cv_feat.plot.box(ax=ax) ax.set_ylabel('Coefficient of Variation') vaep.savefig(fig, name='cv_qc_samples', folder=args.out_figures) print((qc_cv_feat < CV_QC_SAMPLE).value_counts()) # only to ald_study data ald_study = ald_study[vaep.analyzers.diff_analysis.select_feat(qc_samples[ald_study.columns])] - + ald_study # %% fig, axes = vaep.plotting.plot_cutoffs(observed.unstack(), - feat_completness_over_samples=cutoffs.feat_completness_over_samples, - min_feat_in_sample=cutoffs.min_feat_in_sample) + feat_completness_over_samples=cutoffs.feat_completness_over_samples, + min_feat_in_sample=cutoffs.min_feat_in_sample) vaep.savefig(fig, name='tresholds_normal_imputation', folder=args.out_figures) @@ -225,7 +225,7 @@ # %% fname = args.out_preds / args.template_pred.format(args.model) -fname +fname # %% [markdown] # Baseline comparison @@ -262,7 +262,7 @@ def plot_distributions(observed: pd.Series, imputation: pd.Series = None, model_key: str = 'MODEL', - figsize=(4,3), + figsize=(4, 3), sharex=True): """Plots distributions of intensities provided as dictionary of labels to pd.Series.""" series_ = [observed, imputation] if imputation is not None else [observed] @@ -274,8 +274,8 @@ def plot_distributions(observed: pd.Series, else: fig, ax = plt.subplots(1, figsize=figsize, sharex=sharex) - bins = range(min_bin, max_bin+1, 1) - + bins = range(min_bin, max_bin + 1, 1) + label = 'observed measurments' ax = observed.hist(ax=ax, bins=bins, color='grey') ax.set_title(f'{label} (N={len(observed):,d})') @@ -283,14 +283,13 @@ def plot_distributions(observed: pd.Series, ax.locator_params(axis='y', integer=True) ax.yaxis.set_major_formatter("{x:,.0f}") - if imputation is not None: ax = axes[1] label = f'Missing values imputed using {model_key.upper()}' color = vaep.plotting.defaults.color_model_mapping.get(model_key, None) if color is None: color = f'C{1}' - ax = imputation.hist(ax=ax,bins=bins, color=color) + ax = imputation.hist(ax=ax, bins=bins, color=color) ax.set_title(f'{label} (N={len(imputation):,d})') ax.set_ylabel('observations') ax.locator_params(axis='y', integer=True) @@ -312,7 +311,7 @@ def plot_distributions(observed: pd.Series, # %% if pred_real_na is not None: shifts = (vaep.imputation.compute_moments_shift(observed, pred_real_na, - names=('observed', args.model_key))) + names=('observed', args.model_key))) display(pd.DataFrame(shifts).T) # %% [markdown] @@ -323,8 +322,8 @@ def plot_distributions(observed: pd.Series, index_level = 0 # per sample mean_by_sample = pd.DataFrame( {'observed': vaep.imputation.stats_by_level(observed, index_level=index_level), - args.model_key: vaep.imputation.stats_by_level(pred_real_na, index_level=index_level) - }) + args.model_key: vaep.imputation.stats_by_level(pred_real_na, index_level=index_level) + }) mean_by_sample.loc['mean_shift'] = (mean_by_sample.loc['mean', 'observed'] - mean_by_sample.loc['mean']).abs() / mean_by_sample.loc['std', 'observed'] mean_by_sample.loc['std shrinkage'] = mean_by_sample.loc['std'] / \ @@ -352,10 +351,10 @@ def plot_distributions(observed: pd.Series, # Targets - Clinical 
variables # %% scores = vaep.stats.diff_analysis.analyze(df_proteomics=df, - df_clinic=df_clinic, - target=args.target, - covar=args.covar, - value_name=args.value_name) + df_clinic=df_clinic, + target=args.target, + covar=args.covar, + value_name=args.value_name) scores @@ -366,7 +365,7 @@ def plot_distributions(observed: pd.Series, scores = (scores .join(gene_to_PG) .set_index(gene_to_PG.columns.to_list(), append=True) - ) + ) scores # %% @@ -376,7 +375,7 @@ def plot_distributions(observed: pd.Series, # %% -fname = args.out_folder/ 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl' +fname = args.out_folder / 'scores' / f'diff_analysis_scores_{str(args.model_key)}.pkl' files_out[fname.name] = fname.as_posix() fname.parent.mkdir(exist_ok=True, parents=True) scores.to_pickle(fname) diff --git a/project/10_2_ald_compare_methods.ipynb b/project/10_2_ald_compare_methods.ipynb index e10ecbc66..68ad9f949 100644 --- a/project/10_2_ald_compare_methods.ipynb +++ b/project/10_2_ald_compare_methods.ipynb @@ -28,7 +28,7 @@ "logger = vaep.logging.setup_nb_logger()\n", "\n", "plt.rcParams['figure.figsize'] = (2, 2)\n", - "fontsize= 5\n", + "fontsize = 5\n", "vaep.plotting.make_large_descriptors(fontsize)" ] }, @@ -121,7 +121,7 @@ "source": [ "files_in = {\n", " 'freq_features_observed.csv': args.folder_experiment / 'freq_features_observed.csv',\n", - " }\n", + "}\n", "files_in" ] }, @@ -152,7 +152,7 @@ "source": [ "writer_args = dict(float_format='%.3f')\n", "\n", - "fname = args.out_folder / 'diff_analysis_compare_methods.xlsx'\n", + "fname = args.out_folder / 'diff_analysis_compare_methods.xlsx'\n", "files_out[fname.name] = fname\n", "writer = pd.ExcelWriter(fname)\n", "fname" @@ -163,7 +163,7 @@ "id": "770d1f76-e86f-4ae3-9d7b-ceef9b9e9a22", "metadata": {}, "source": [ - "# Load scores " + "# Load scores" ] }, { @@ -183,7 +183,7 @@ "metadata": {}, "outputs": [], "source": [ - "fname =args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl'\n", + "fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl'\n", "scores_baseline = pd.read_pickle(fname)\n", "scores_baseline" ] @@ -259,11 +259,12 @@ "cell_type": "code", "execution_count": null, "id": "53bd5597-221c-4d54-abf2-82956db42594", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ - "scores.describe(include=['bool', 'O'])\n", - "\n" + "scores.describe(include=['bool', 'O'])" ] }, { @@ -305,8 +306,8 @@ " .dropna()\n", " .reset_index(-1, drop=True)\n", " ).join(\n", - " freq_feat, how='left'\n", - " )\n", + " freq_feat, how='left'\n", + ")\n", "scores_common" ] }, @@ -323,7 +324,7 @@ "\n", "annotations = None\n", "for model, model_column in models.items():\n", - " if not annotations is None:\n", + " if annotations is not None:\n", " annotations += ' - '\n", " annotations += annotate_decision(scores_common,\n", " model=model, model_column=model_column)\n", @@ -395,7 +396,7 @@ "outputs": [], "source": [ "# should it be possible to run not only RSN?\n", - "to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs()\n", + "to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs()\n", "to_plot.loc[mask_different].sort_values('diff_qvalue', ascending=False)" ] }, @@ -430,7 +431,7 @@ "_ = ax.legend(fontsize=fontsize,\n", " title_fontsize=fontsize,\n", " markerscale=0.4,\n", - " title='',\n", + " title='',\n", " )\n", "ax.set_xlabel(f\"qvalue for {x_col}\")\n", "ax.set_ylabel(f\"qvalue for {y_col}\")\n", @@ -466,7 +467,7 
@@ " y=to_plot.columns[1],\n", " size='frequency',\n", " s=size,\n", - " sizes=(5,20),\n", + " sizes=(5, 20),\n", " hue='Differential Analysis Comparison')\n", "_ = ax.legend(fontsize=fontsize,\n", " title_fontsize=fontsize,\n", @@ -504,7 +505,7 @@ " .loc[\n", " scores_model_only.index.difference(\n", " scores_common.index),\n", - " args.model_key]\n", + " args.model_key]\n", " .sort_values(by='qvalue', ascending=True)\n", " .join(freq_feat)\n", " )\n", @@ -592,7 +593,7 @@ "metadata": {}, "outputs": [], "source": [ - "feat_name = scores.index.names[0] # first index level is feature name\n", + "feat_name = scores.index.names[0] # first index level is feature name\n", "if args.annotaitons_gene_col in scores.index.names:\n", " logger.info(f\"Found gene annotation in scores index: {scores.index.names}\")\n", "else:\n", @@ -610,13 +611,13 @@ "outputs": [], "source": [ "gene_to_PG = (scores.droplevel(\n", - " list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col})\n", - " )\n", - " .index\n", - " .to_frame()\n", - " .reset_index(drop=True)\n", - " .set_index(args.annotaitons_gene_col)\n", - " )\n", + " list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col})\n", + ")\n", + " .index\n", + " .to_frame()\n", + " .reset_index(drop=True)\n", + " .set_index(args.annotaitons_gene_col)\n", + ")\n", "gene_to_PG.head()" ] }, diff --git a/project/10_2_ald_compare_methods.py b/project/10_2_ald_compare_methods.py index 5eac1e1ba..55037a3de 100644 --- a/project/10_2_ald_compare_methods.py +++ b/project/10_2_ald_compare_methods.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -29,7 +29,7 @@ logger = vaep.logging.setup_nb_logger() plt.rcParams['figure.figsize'] = (2, 2) -fontsize= 5 +fontsize = 5 vaep.plotting.make_large_descriptors(fontsize) # %% @@ -76,7 +76,7 @@ # %% files_in = { 'freq_features_observed.csv': args.folder_experiment / 'freq_features_observed.csv', - } +} files_in # %% [markdown] @@ -88,19 +88,19 @@ # %% writer_args = dict(float_format='%.3f') -fname = args.out_folder / 'diff_analysis_compare_methods.xlsx' +fname = args.out_folder / 'diff_analysis_compare_methods.xlsx' files_out[fname.name] = fname writer = pd.ExcelWriter(fname) fname # %% [markdown] -# # Load scores +# # Load scores # %% [x for x in args.scores_folder.iterdir() if 'scores' in str(x)] # %% -fname =args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl' +fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl' scores_baseline = pd.read_pickle(fname) scores_baseline @@ -133,7 +133,6 @@ scores.describe(include=['bool', 'O']) - # %% [markdown] # ## Load frequencies of observed features @@ -150,8 +149,8 @@ .dropna() .reset_index(-1, drop=True) ).join( - freq_feat, how='left' - ) + freq_feat, how='left' +) scores_common @@ -162,7 +161,7 @@ def annotate_decision(scores, model, model_column): annotations = None for model, model_column in models.items(): - if not annotations is None: + if annotations is not None: annotations += ' - ' annotations += annotate_decision(scores_common, model=model, model_column=model_column) @@ -199,7 +198,7 @@ def annotate_decision(scores, model, model_column): # %% # should it be possible to run not only RSN? 
-to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs() +to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs() to_plot.loc[mask_different].sort_values('diff_qvalue', ascending=False) # %% [markdown] @@ -222,7 +221,7 @@ def annotate_decision(scores, model, model_column): _ = ax.legend(fontsize=fontsize, title_fontsize=fontsize, markerscale=0.4, - title='', + title='', ) ax.set_xlabel(f"qvalue for {x_col}") ax.set_ylabel(f"qvalue for {y_col}") @@ -246,7 +245,7 @@ def annotate_decision(scores, model, model_column): y=to_plot.columns[1], size='frequency', s=size, - sizes=(5,20), + sizes=(5, 20), hue='Differential Analysis Comparison') _ = ax.legend(fontsize=fontsize, title_fontsize=fontsize, @@ -272,7 +271,7 @@ def annotate_decision(scores, model, model_column): .loc[ scores_model_only.index.difference( scores_common.index), - args.model_key] + args.model_key] .sort_values(by='qvalue', ascending=True) .join(freq_feat) ) @@ -307,7 +306,7 @@ def annotate_decision(scores, model, model_column): # %% # %% -feat_name = scores.index.names[0] # first index level is feature name +feat_name = scores.index.names[0] # first index level is feature name if args.annotaitons_gene_col in scores.index.names: logger.info(f"Found gene annotation in scores index: {scores.index.names}") else: @@ -318,13 +317,13 @@ def annotate_decision(scores, model, model_column): # %% gene_to_PG = (scores.droplevel( - list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col}) - ) - .index - .to_frame() - .reset_index(drop=True) - .set_index(args.annotaitons_gene_col) - ) + list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col}) +) + .index + .to_frame() + .reset_index(drop=True) + .set_index(args.annotaitons_gene_col) +) gene_to_PG.head() # %% diff --git a/project/10_3_ald_ml_new_feat.ipynb b/project/10_3_ald_ml_new_feat.ipynb index dbb096e61..d3d7dd763 100644 --- a/project/10_3_ald_ml_new_feat.ipynb +++ b/project/10_3_ald_ml_new_feat.ipynb @@ -40,8 +40,8 @@ "\n", "plt.rcParams['figure.figsize'] = (2.5, 2.5)\n", "plt.rcParams['lines.linewidth'] = 1\n", - "fontsize= 5\n", - "figsize= (2.5, 2.5)\n", + "fontsize = 5\n", + "figsize = (2.5, 2.5)\n", "vaep.plotting.make_large_descriptors(fontsize)\n", "\n", "\n", @@ -89,7 +89,7 @@ "cutoff_target: int = 2 # => for binarization target >= cutoff_target\n", "file_format = \"csv\"\n", "out_folder = 'diff_analysis'\n", - "fn_qc_samples = '' #'data/ALD_study/processed/qc_plasma_proteinGroups.pkl'\n", + "fn_qc_samples = '' # 'data/ALD_study/processed/qc_plasma_proteinGroups.pkl'\n", "\n", "baseline = 'RSN' # default is RSN, as this was used in the original ALD Niu. et. al 2022\n", "template_pred = 'pred_real_na_{}.csv' # fixed, do not change" @@ -272,7 +272,7 @@ "outputs": [], "source": [ "fname = args.out_preds / args.template_pred.format(args.baseline)\n", - "pred_real_na_baseline = load_single_csv_pred_file(fname) #.loc[mask_has_target]\n", + "pred_real_na_baseline = load_single_csv_pred_file(fname) # .loc[mask_has_target]\n", "pred_real_na_baseline" ] }, @@ -291,7 +291,9 @@ "Repeat general approach for\n", " 1. all original ald data: all features justed in original ALD study\n", " 2. all model data: all features available my using the self supervised deep learning model\n", - " 3. newly available feat only: the subset of features available from the self supervised deep learning model which were newly retained using the new approach" + "3. 
newly available feat only: the subset of features available from the\n", + "self supervised deep learning model which were newly retained using the\n", + "new approach" ] }, { diff --git a/project/10_3_ald_ml_new_feat.py b/project/10_3_ald_ml_new_feat.py index fca637f80..5ac862558 100644 --- a/project/10_3_ald_ml_new_feat.py +++ b/project/10_3_ald_ml_new_feat.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -41,8 +41,8 @@ plt.rcParams['figure.figsize'] = (2.5, 2.5) plt.rcParams['lines.linewidth'] = 1 -fontsize= 5 -figsize= (2.5, 2.5) +fontsize = 5 +figsize = (2.5, 2.5) vaep.plotting.make_large_descriptors(fontsize) @@ -66,7 +66,7 @@ cutoff_target: int = 2 # => for binarization target >= cutoff_target file_format = "csv" out_folder = 'diff_analysis' -fn_qc_samples = '' #'data/ALD_study/processed/qc_plasma_proteinGroups.pkl' +fn_qc_samples = '' # 'data/ALD_study/processed/qc_plasma_proteinGroups.pkl' baseline = 'RSN' # default is RSN, as this was used in the original ALD Niu. et. al 2022 template_pred = 'pred_real_na_{}.csv' # fixed, do not change @@ -160,7 +160,7 @@ # %% fname = args.out_preds / args.template_pred.format(args.baseline) -pred_real_na_baseline = load_single_csv_pred_file(fname) #.loc[mask_has_target] +pred_real_na_baseline = load_single_csv_pred_file(fname) # .loc[mask_has_target] pred_real_na_baseline # %% [markdown] @@ -174,7 +174,9 @@ # Repeat general approach for # 1. all original ald data: all features justed in original ALD study # 2. all model data: all features available my using the self supervised deep learning model -# 3. newly available feat only: the subset of features available from the self supervised deep learning model which were newly retained using the new approach +# 3. 
newly available feat only: the subset of features available from the +# self supervised deep learning model which were newly retained using the +# new approach # %% X = pd.concat([data, pred_real_na]).unstack() diff --git a/project/10_4_ald_compare_single_pg.ipynb b/project/10_4_ald_compare_single_pg.ipynb index a5e0612f1..80ea04a41 100644 --- a/project/10_4_ald_compare_single_pg.ipynb +++ b/project/10_4_ald_compare_single_pg.ipynb @@ -72,7 +72,7 @@ "source": [ "folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups'\n", "fn_clinical_data = \"data/ALD_study/processed/ald_metadata_cli.csv\"\n", - "make_plots = True # create histograms and swarmplots of diverging results\n", + "make_plots = True # create histograms and swarmplots of diverging results\n", "model_key = 'VAE'\n", "sample_id_col = 'Sample ID'\n", "target = 'kleiner'\n", @@ -81,7 +81,7 @@ "file_format = 'csv'\n", "baseline = 'RSN' # default is RSN, but could be any other trained model\n", "template_pred = 'pred_real_na_{}.csv' # fixed, do not change\n", - "ref_method_score = None # filepath to reference method score" + "ref_method_score = None # filepath to reference method score" ] }, { @@ -164,7 +164,7 @@ "outputs": [], "source": [ "# Reference dump\n", - "if args.ref_method_score: \n", + "if args.ref_method_score:\n", " scores_reference = (pd\n", " .read_pickle(args.ref_method_score)\n", " .rename({'None': 'None (100%)'},\n", @@ -207,7 +207,7 @@ " ).set_index(\n", " ('data', 'frequency'), append=True)\n", "qvalues.index.names = qvalues.index.names[:-1] + ['frequency']\n", - "fname = args.out_folder / 'qvalues_target.pkl'\n", + "fname = args.out_folder / 'qvalues_target.pkl'\n", "files_out[fname.name] = fname.as_posix()\n", "qvalues.to_pickle(fname)\n", "qvalues.to_excel(writer, sheet_name='qvalues_all')\n", @@ -227,7 +227,7 @@ " ).set_index(\n", " ('data', 'frequency'), append=True)\n", "pvalues.index.names = pvalues.index.names[:-1] + ['frequency']\n", - "fname = args.out_folder / 'pvalues_target.pkl'\n", + "fname = args.out_folder / 'pvalues_target.pkl'\n", "files_out[fname.name] = fname.as_posix()\n", "pvalues.to_pickle(fname)\n", "pvalues.to_excel(writer, sheet_name='pvalues_all')\n", @@ -244,7 +244,7 @@ "da_target = scores.loc[pd.IndexSlice[:, args.target],\n", " pd.IndexSlice[:, 'rejected']\n", " ].join(freq_feat\n", - " ).set_index(\n", + " ).set_index(\n", " ('data', 'frequency'), append=True)\n", "da_target.index.names = da_target.index.names[:-1] + ['frequency']\n", "fname = args.out_folder / 'equality_rejected_target.pkl'\n", @@ -677,7 +677,7 @@ " fig, ax = plt.subplots()\n", "\n", " # dummy plots, just to get the Path objects\n", - " tmp_dot = ax.scatter([1,2],[3,4], marker='X')\n", + " tmp_dot = ax.scatter([1, 2], [3, 4], marker='X')\n", " new_mk, = tmp_dot.get_paths()\n", " tmp_dot.remove()\n", "\n", diff --git a/project/10_4_ald_compare_single_pg.py b/project/10_4_ald_compare_single_pg.py index 7b3c9279c..925568114 100644 --- a/project/10_4_ald_compare_single_pg.py +++ b/project/10_4_ald_compare_single_pg.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -49,7 +49,7 @@ # %% tags=["parameters"] folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups' fn_clinical_data = "data/ALD_study/processed/ald_metadata_cli.csv" -make_plots = True # create histograms and swarmplots of diverging results +make_plots = True # create histograms and swarmplots of diverging 
results model_key = 'VAE' sample_id_col = 'Sample ID' target = 'kleiner' @@ -58,7 +58,7 @@ file_format = 'csv' baseline = 'RSN' # default is RSN, but could be any other trained model template_pred = 'pred_real_na_{}.csv' # fixed, do not change -ref_method_score = None # filepath to reference method score +ref_method_score = None # filepath to reference method score # %% @@ -99,7 +99,7 @@ # %% # Reference dump -if args.ref_method_score: +if args.ref_method_score: scores_reference = (pd .read_pickle(args.ref_method_score) .rename({'None': 'None (100%)'}, @@ -123,7 +123,7 @@ ).set_index( ('data', 'frequency'), append=True) qvalues.index.names = qvalues.index.names[:-1] + ['frequency'] -fname = args.out_folder / 'qvalues_target.pkl' +fname = args.out_folder / 'qvalues_target.pkl' files_out[fname.name] = fname.as_posix() qvalues.to_pickle(fname) qvalues.to_excel(writer, sheet_name='qvalues_all') @@ -136,7 +136,7 @@ ).set_index( ('data', 'frequency'), append=True) pvalues.index.names = pvalues.index.names[:-1] + ['frequency'] -fname = args.out_folder / 'pvalues_target.pkl' +fname = args.out_folder / 'pvalues_target.pkl' files_out[fname.name] = fname.as_posix() pvalues.to_pickle(fname) pvalues.to_excel(writer, sheet_name='pvalues_all') @@ -146,7 +146,7 @@ da_target = scores.loc[pd.IndexSlice[:, args.target], pd.IndexSlice[:, 'rejected'] ].join(freq_feat - ).set_index( + ).set_index( ('data', 'frequency'), append=True) da_target.index.names = da_target.index.names[:-1] + ['frequency'] fname = args.out_folder / 'equality_rejected_target.pkl' @@ -378,7 +378,7 @@ fig, ax = plt.subplots() # dummy plots, just to get the Path objects - tmp_dot = ax.scatter([1,2],[3,4], marker='X') + tmp_dot = ax.scatter([1, 2], [3, 4], marker='X') new_mk, = tmp_dot.get_paths() tmp_dot.remove() diff --git a/project/10_5_comp_diff_analysis_repetitions.ipynb b/project/10_5_comp_diff_analysis_repetitions.ipynb index 961dd02c0..0d09578be 100644 --- a/project/10_5_comp_diff_analysis_repetitions.ipynb +++ b/project/10_5_comp_diff_analysis_repetitions.ipynb @@ -27,9 +27,7 @@ "cell_type": "code", "execution_count": null, "id": "8bef6cd3-fef6-4499-85cb-63bd524c9edc", - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "files_out = dict()\n", @@ -46,15 +44,18 @@ "metadata": {}, "outputs": [], "source": [ - "def _load_pickle(pfath, run:int):\n", + "\n", + "\n", + "def _load_pickle(pfath, run: int):\n", " df = pd.read_pickle(pfath)\n", " df['run'] = f'run{run:02d}'\n", " df = df.set_index('run', append=True)\n", " return df\n", "\n", + "\n", "df_long_qvalues = pd.concat(\n", - " [_load_pickle(f,i) for i,f in enumerate(pickled_qvalues)]\n", - " )\n", + " [_load_pickle(f, i) for i, f in enumerate(pickled_qvalues)]\n", + ")\n", "df_long_qvalues" ] }, @@ -148,7 +149,7 @@ " [~da_target_same]\n", " .index\n", " .get_level_values(0)\n", - ")" + " )" ] }, { @@ -181,7 +182,7 @@ "qvalue_stats = (qvalue_stats\n", " .loc[idx_different]\n", " .sort_values(('None', 'qvalue', 'mean'))\n", - ")\n", + " )\n", "qvalue_stats" ] }, @@ -254,15 +255,15 @@ "source": [ "# pgs included in original ald study\n", "tab_diff_rejec_counts_old = (da_counts\n", - " .loc[mask_pgs_included_in_ald_study]\n", - " .reset_index()\n", - " .groupby(\n", - " by=da_counts.columns.to_list())\n", - " .size()\n", - " .to_frame('N')\n", - ")\n", + " .loc[mask_pgs_included_in_ald_study]\n", + " .reset_index()\n", + " .groupby(\n", + " by=da_counts.columns.to_list())\n", + " .size()\n", + " .to_frame('N')\n", + " )\n", 
"tab_diff_rejec_counts_old.to_excel(writer,\n", - " sheet_name='tab_diff_rejec_counts_old')\n", + " sheet_name='tab_diff_rejec_counts_old')\n", "tab_diff_rejec_counts_old" ] }, @@ -292,17 +293,16 @@ "source": [ "# new pgs\n", "tab_diff_rejec_counts_new = (da_counts\n", - " .loc[~mask_pgs_included_in_ald_study]\n", - " .reset_index()\n", - " .drop('RSN', axis=1)\n", - " .groupby(\n", - " by=\n", - " [m for m in da_counts.columns if m != 'RSN'])\n", - " .size()\n", - " .to_frame('N')\n", - ")\n", + " .loc[~mask_pgs_included_in_ald_study]\n", + " .reset_index()\n", + " .drop('RSN', axis=1)\n", + " .groupby(\n", + " by=[m for m in da_counts.columns if m != 'RSN'])\n", + " .size()\n", + " .to_frame('N')\n", + " )\n", "tab_diff_rejec_counts_new.to_excel(writer,\n", - " sheet_name='tab_diff_rejec_counts_new')\n", + " sheet_name='tab_diff_rejec_counts_new')\n", "tab_diff_rejec_counts_new" ] }, @@ -331,7 +331,7 @@ "outputs": [], "source": [ "mask_new_da_with_imp = mask_new_da_with_imputation = ((~mask_pgs_included_in_ald_study)\n", - " & (da_counts['None'] != 10))\n", + " & (da_counts['None'] != 10))\n", "\n", "tab_new_da_with_imp = vaep.pandas.combine_value_counts(\n", " da_counts\n", diff --git a/project/10_5_comp_diff_analysis_repetitions.py b/project/10_5_comp_diff_analysis_repetitions.py index 65df744be..909c940fd 100644 --- a/project/10_5_comp_diff_analysis_repetitions.py +++ b/project/10_5_comp_diff_analysis_repetitions.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: vaep # language: python @@ -29,15 +29,18 @@ fname # %% -def _load_pickle(pfath, run:int): + + +def _load_pickle(pfath, run: int): df = pd.read_pickle(pfath) df['run'] = f'run{run:02d}' df = df.set_index('run', append=True) return df + df_long_qvalues = pd.concat( - [_load_pickle(f,i) for i,f in enumerate(pickled_qvalues)] - ) + [_load_pickle(f, i) for i, f in enumerate(pickled_qvalues)] +) df_long_qvalues # %% [markdown] @@ -77,7 +80,7 @@ def _load_pickle(pfath, run:int): [~da_target_same] .index .get_level_values(0) -) + ) # %% da_counts = da_counts.loc[idx_different] @@ -90,7 +93,7 @@ def _load_pickle(pfath, run:int): qvalue_stats = (qvalue_stats .loc[idx_different] .sort_values(('None', 'qvalue', 'mean')) -) + ) qvalue_stats # %% [markdown] @@ -123,15 +126,15 @@ def _load_pickle(pfath, run:int): # %% # pgs included in original ald study tab_diff_rejec_counts_old = (da_counts - .loc[mask_pgs_included_in_ald_study] - .reset_index() - .groupby( - by=da_counts.columns.to_list()) - .size() - .to_frame('N') -) + .loc[mask_pgs_included_in_ald_study] + .reset_index() + .groupby( + by=da_counts.columns.to_list()) + .size() + .to_frame('N') + ) tab_diff_rejec_counts_old.to_excel(writer, - sheet_name='tab_diff_rejec_counts_old') + sheet_name='tab_diff_rejec_counts_old') tab_diff_rejec_counts_old # %% @@ -147,17 +150,16 @@ def _load_pickle(pfath, run:int): # %% # new pgs tab_diff_rejec_counts_new = (da_counts - .loc[~mask_pgs_included_in_ald_study] - .reset_index() - .drop('RSN', axis=1) - .groupby( - by= - [m for m in da_counts.columns if m != 'RSN']) - .size() - .to_frame('N') -) + .loc[~mask_pgs_included_in_ald_study] + .reset_index() + .drop('RSN', axis=1) + .groupby( + by=[m for m in da_counts.columns if m != 'RSN']) + .size() + .to_frame('N') + ) tab_diff_rejec_counts_new.to_excel(writer, - sheet_name='tab_diff_rejec_counts_new') + sheet_name='tab_diff_rejec_counts_new') tab_diff_rejec_counts_new # %% @@ -172,7 
+174,7 @@ def _load_pickle(pfath, run:int): # %% mask_new_da_with_imp = mask_new_da_with_imputation = ((~mask_pgs_included_in_ald_study) - & (da_counts['None'] != 10)) + & (da_counts['None'] != 10)) tab_new_da_with_imp = vaep.pandas.combine_value_counts( da_counts diff --git a/project/10_6_interpret_repeated_ald_da.py b/project/10_6_interpret_repeated_ald_da.py index b9cc384ed..bb685cb21 100644 --- a/project/10_6_interpret_repeated_ald_da.py +++ b/project/10_6_interpret_repeated_ald_da.py @@ -24,7 +24,7 @@ def load_pred_from_run(run_folder: Path, # %% reps_folder = 'runs/appl_ald_data/plasma/proteinGroups/reps' template_pred = 'pred_real_na_{}.csv' # fixed, do not change -model_keys = ['CF', 'DAE', 'KNN', 'Median', 'RSN', 'VAE','rf'] +model_keys = ['CF', 'DAE', 'KNN', 'Median', 'RSN', 'VAE', 'rf'] # %% @@ -52,12 +52,12 @@ def load_pred_from_run(run_folder: Path, for method in model_keys: pred_real_na_cvs[method] = pred_real_na[( method, 'std')] / pred_real_na[(method, 'mean')] - + pred_real_na_cvs.to_excel(writer, float_format='%.3f', sheet_name='CVs') ax = pred_real_na_cvs.plot.hist(bins=15, color=vaep.plotting.defaults.assign_colors(model_keys), - alpha=0.5) + alpha=0.5) ax.yaxis.set_major_formatter('{x:,.0f}') ax.set_xlabel(f'Coefficient of variation of imputed intensites (N={len(pred_real_na):,d})') fname = reps_folder / 'pred_real_na_cvs.png' diff --git a/project/10_7_ald_reduced_dataset_plots.ipynb b/project/10_7_ald_reduced_dataset_plots.ipynb index af290974c..3c6f8b974 100644 --- a/project/10_7_ald_reduced_dataset_plots.ipynb +++ b/project/10_7_ald_reduced_dataset_plots.ipynb @@ -25,6 +25,7 @@ "\n", "COLORS_TO_USE_MAPPTING = vaep.plotting.defaults.color_model_mapping\n", "\n", + "\n", "def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05,\n", " alpha=1.0, style='.', markersize=3):\n", " ax = df.plot.line(x=x,\n", @@ -40,14 +41,15 @@ " linestyles='dashed',\n", " color='grey',\n", " linewidth=1)\n", - " return ax\n", - "\n" + " return ax" ] }, { "cell_type": "markdown", "id": "9d21e1a1-7a46-49d4-8976-bc2031652ee4", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 0 + }, "source": [ "DA analysis" ] @@ -219,7 +221,7 @@ "source": [ "mask_lost_sign = (\n", " (da_target_sel['None'] == False)\n", - " & (da_target_sel[REF_MODEL] == True)\n", + " & (da_target_sel[REF_MODEL])\n", ")\n", "sel = qvalues_sel.loc[mask_lost_sign.squeeze()]\n", "sel.columns = sel.columns.droplevel(-1)\n", @@ -238,12 +240,12 @@ "# 0: FN\n", "# 1: TP\n", "da_target_sel_counts = (da_target_sel[ORDER_MODELS]\n", - " .loc[mask_lost_sign.squeeze()]\n", - " .astype(int)\n", - " .replace(\n", - " {0: 'FN',\n", - " 1: 'TP'}\n", - " ).droplevel(-1, axis=1)\n", + " .loc[mask_lost_sign.squeeze()]\n", + " .astype(int)\n", + " .replace(\n", + " {0: 'FN',\n", + " 1: 'TP'}\n", + ").droplevel(-1, axis=1)\n", ")\n", "da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts)\n", "ax = da_target_sel_counts.T.plot.bar()\n", @@ -290,7 +292,7 @@ "outputs": [], "source": [ "mask_gained_signal = (\n", - " (da_target_sel['None'] == True)\n", + " (da_target_sel['None'])\n", " & (da_target_sel[REF_MODEL] == False)\n", ")\n", "sel = qvalues_sel.loc[mask_gained_signal.squeeze()]\n", @@ -308,12 +310,12 @@ "outputs": [], "source": [ "da_target_sel_counts = (da_target_sel[ORDER_MODELS]\n", - " .loc[mask_gained_signal.squeeze()]\n", - " .astype(int)\n", - " .replace(\n", - " {0: 'TN',\n", - " 1: 'FP'}\n", - " ).droplevel(-1, axis=1)\n", + " .loc[mask_gained_signal.squeeze()]\n", + " .astype(int)\n", + " 
.replace(\n", + " {0: 'TN',\n", + " 1: 'FP'}\n", + ").droplevel(-1, axis=1)\n", ")\n", "da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts)\n", "ax = da_target_sel_counts.T.plot.bar()\n", diff --git a/project/10_7_ald_reduced_dataset_plots.py b/project/10_7_ald_reduced_dataset_plots.py index 670f3a057..e62ffe87a 100644 --- a/project/10_7_ald_reduced_dataset_plots.py +++ b/project/10_7_ald_reduced_dataset_plots.py @@ -12,6 +12,7 @@ COLORS_TO_USE_MAPPTING = vaep.plotting.defaults.color_model_mapping + def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, alpha=1.0, style='.', markersize=3): ax = df.plot.line(x=x, @@ -30,10 +31,8 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, return ax - # %% [markdown] # DA analysis - # %% out_folder = 'runs/appl_ald_data/plasma/proteinGroups_80%_dataset/diff_analysis/kleiner/' out_folder = Path(out_folder) @@ -102,7 +101,7 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # %% mask_lost_sign = ( (da_target_sel['None'] == False) - & (da_target_sel[REF_MODEL] == True) + & (da_target_sel[REF_MODEL]) ) sel = qvalues_sel.loc[mask_lost_sign.squeeze()] sel.columns = sel.columns.droplevel(-1) @@ -114,12 +113,12 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # 0: FN # 1: TP da_target_sel_counts = (da_target_sel[ORDER_MODELS] - .loc[mask_lost_sign.squeeze()] - .astype(int) - .replace( - {0: 'FN', - 1: 'TP'} - ).droplevel(-1, axis=1) + .loc[mask_lost_sign.squeeze()] + .astype(int) + .replace( + {0: 'FN', + 1: 'TP'} +).droplevel(-1, axis=1) ) da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts) ax = da_target_sel_counts.T.plot.bar() @@ -146,7 +145,7 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # %% mask_gained_signal = ( - (da_target_sel['None'] == True) + (da_target_sel['None']) & (da_target_sel[REF_MODEL] == False) ) sel = qvalues_sel.loc[mask_gained_signal.squeeze()] @@ -157,12 +156,12 @@ def plot_qvalues(df, x: str, y: list, ax=None, cutoff=0.05, # %% da_target_sel_counts = (da_target_sel[ORDER_MODELS] - .loc[mask_gained_signal.squeeze()] - .astype(int) - .replace( - {0: 'TN', - 1: 'FP'} - ).droplevel(-1, axis=1) + .loc[mask_gained_signal.squeeze()] + .astype(int) + .replace( + {0: 'TN', + 1: 'FP'} +).droplevel(-1, axis=1) ) da_target_sel_counts = vaep.pandas.combine_value_counts(da_target_sel_counts) ax = da_target_sel_counts.T.plot.bar() diff --git a/project/erda_00_maxquant_file_reader.ipynb b/project/erda_00_maxquant_file_reader.ipynb index 26b0e9ac8..d4bcd42d9 100644 --- a/project/erda_00_maxquant_file_reader.ipynb +++ b/project/erda_00_maxquant_file_reader.ipynb @@ -50,14 +50,14 @@ "from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED\n", "from config import FOLDER_KEY # defines how filenames are parsed for use as indices\n", "\n", - "from config import FOLDER_DATA # project folder for storing the data\n", + "from config import FOLDER_DATA # project folder for storing the data\n", "print(f\"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}\")\n", "\n", "##################\n", "### Logging ######\n", "##################\n", "\n", - "#Delete Jupyter notebook root logger handler\n", + "# Delete Jupyter notebook root logger handler\n", "root_logger = logging.getLogger()\n", "root_logger.handlers = []\n", "\n", @@ -73,7 +73,7 @@ "metadata": {}, "outputs": [], "source": [ - "folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir()]" + "folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if 
folder.is_dir()]" ] }, { @@ -82,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "folders_dict = {folder.name: folder for folder in sorted(folders) }\n", + "folders_dict = {folder.name: folder for folder in sorted(folders)}\n", "assert len(folders_dict) == len(folders), \"Non unique file names\"" ] }, @@ -113,7 +113,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Results will be saved in a subfolder under `vaep/project/data` using the name of the specified input-folder per default. Change to your liking:" + "Results will be saved in a subfolder under `vaep/project/data` using the\n", + "name of the specified input-folder per default. Change to your liking:" ] }, { @@ -172,7 +173,7 @@ "source": [ "### Summaries\n", "\n", - "- aggregated in `vaep/project/erda_01_mq_aggregate_summaries.ipynb` \n", + "- aggregated in `vaep/project/erda_01_mq_aggregate_summaries.ipynb`\n", " - file selection based on summaries for further analysis thereafter" ] }, @@ -258,8 +259,8 @@ "source": [ "mqpar_files = (Path(FOLDER_DATA) / 'mqpar_files')\n", "\n", - "mqpar_files = [file for file in mqpar_files.iterdir() if file.suffix == '.xml']\n", - "len(mqpar_files) # nested search needed" + "mqpar_files = [file for file in mqpar_files.iterdir() if file.suffix == '.xml']\n", + "len(mqpar_files) # nested search needed" ] }, { @@ -296,8 +297,8 @@ "d_mqpar = dict()\n", "for file in tqdm(mqpar_files):\n", " d_mqpar[file.stem] = load_mqpar_xml(file)['MaxQuantParams']\n", - " \n", - "df_mqpar = pd.DataFrame(d_mqpar.values() , index=d_mqpar.keys()).convert_dtypes()\n", + "\n", + "df_mqpar = pd.DataFrame(d_mqpar.values(), index=d_mqpar.keys()).convert_dtypes()\n", "df_mqpar" ] }, @@ -340,7 +341,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "in order to see if there are different setting based on the string columns, drop duplicates \n", + "in order to see if there are different setting based on the string columns, drop duplicates\n", "\n", "- only one should remain" ] @@ -396,7 +397,7 @@ "metadata": {}, "outputs": [], "source": [ - "mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands" + "mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands" ] }, { diff --git a/project/erda_00_maxquant_file_reader.py b/project/erda_00_maxquant_file_reader.py index f441a31ac..74e572164 100644 --- a/project/erda_00_maxquant_file_reader.py +++ b/project/erda_00_maxquant_file_reader.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -51,14 +51,14 @@ from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED from config import FOLDER_KEY # defines how filenames are parsed for use as indices -from config import FOLDER_DATA # project folder for storing the data +from config import FOLDER_DATA # project folder for storing the data print(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}") ################## ### Logging ###### ################## -#Delete Jupyter notebook root logger handler +# Delete Jupyter notebook root logger handler root_logger = logging.getLogger() root_logger.handlers = [] @@ -68,10 +68,10 @@ logger.info('Start with handlers: \n' + "\n".join(f"- {repr(log_)}" for log_ in logger.handlers)) # %% -folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir()] +folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir()] # %% -folders_dict = 
{folder.name: folder for folder in sorted(folders) } +folders_dict = {folder.name: folder for folder in sorted(folders)} assert len(folders_dict) == len(folders), "Non unique file names" # %% Collapsed="false" @@ -84,7 +84,8 @@ mq_output # %% [markdown] -# Results will be saved in a subfolder under `vaep/project/data` using the name of the specified input-folder per default. Change to your liking: +# Results will be saved in a subfolder under `vaep/project/data` using the +# name of the specified input-folder per default. Change to your liking: # %% [markdown] # > Go to the block you are interested in! @@ -107,7 +108,7 @@ # %% [markdown] Collapsed="false" # ### Summaries # -# - aggregated in `vaep/project/erda_01_mq_aggregate_summaries.ipynb` +# - aggregated in `vaep/project/erda_01_mq_aggregate_summaries.ipynb` # - file selection based on summaries for further analysis thereafter # %% @@ -152,8 +153,8 @@ # %% mqpar_files = (Path(FOLDER_DATA) / 'mqpar_files') -mqpar_files = [file for file in mqpar_files.iterdir() if file.suffix == '.xml'] -len(mqpar_files) # nested search needed +mqpar_files = [file for file in mqpar_files.iterdir() if file.suffix == '.xml'] +len(mqpar_files) # nested search needed # %% Collapsed="false" w_file = widgets.Dropdown(options=mqpar_files, description='Select a file') @@ -168,8 +169,8 @@ d_mqpar = dict() for file in tqdm(mqpar_files): d_mqpar[file.stem] = load_mqpar_xml(file)['MaxQuantParams'] - -df_mqpar = pd.DataFrame(d_mqpar.values() , index=d_mqpar.keys()).convert_dtypes() + +df_mqpar = pd.DataFrame(d_mqpar.values(), index=d_mqpar.keys()).convert_dtypes() df_mqpar # %% [markdown] @@ -188,7 +189,7 @@ df_mqpar.iloc[0].loc['fastaFiles'] # %% [markdown] -# in order to see if there are different setting based on the string columns, drop duplicates +# in order to see if there are different setting based on the string columns, drop duplicates # # - only one should remain @@ -212,7 +213,7 @@ mq_output.evidence # %% -mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands +mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands # %% [markdown] Collapsed="false" # ### Create peptide intensity dumps for each MQ outputfolder diff --git a/project/erda_01_mq_select_runs.ipynb b/project/erda_01_mq_select_runs.ipynb index af7dd493a..875fcb525 100644 --- a/project/erda_01_mq_select_runs.ipynb +++ b/project/erda_01_mq_select_runs.ipynb @@ -23,11 +23,22 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", "import logging\n", "from pathlib import Path, PurePosixPath\n", "import yaml\n", - "import random\n", + "\n", + "import ipywidgets as widgets\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from vaep.io.data_objects import MqAllSummaries\n", + "from vaep import plotting\n", + "from vaep.io.mq import MaxQuantOutputDynamic\n", + "\n", + "import config\n", + "from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED\n", + "\n", "\n", "##################\n", "### Logging ######\n", @@ -40,31 +51,18 @@ "\n", "logging.info('Start with handlers: \\n' + \"\\n\".join(f\"- {repr(log_)}\" for log_ in logger.handlers))\n", "\n", - "### Other imports\n", + "# Other imports\n", "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import ipywidgets as widgets\n", - "\n", - "from vaep.io.mq import MaxQuantOutputDynamic\n", - "from vaep import plotting\n", - "\n", - "from vaep.io import data_objects\n", - "from 
vaep.io.data_objects import MqAllSummaries \n", "\n", "##################\n", "##### CONFIG #####\n", "##################\n", - "import config\n", - "from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED\n", "\n", "ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml')\n", "MAP_FOLDER_PATH = Path('config/file_paths')\n", "FPATH_ALL_SUMMARIES = FOLDER_PROCESSED / 'all_summaries.json'\n", "FN_RAWFILE_METADATA = 'data/rawfile_metadata.csv'\n", "\n", - "from config import FOLDER_DATA # project folder for storing the data\n", "logger.info(f\"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}\")" ] }, @@ -76,7 +74,8 @@ }, "outputs": [], "source": [ - "folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir() and not folder.name.startswith('.')]" + "folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir()\n", + " and not folder.name.startswith('.')]" ] }, { @@ -91,7 +90,7 @@ "assert len(folders_dict) == len(folders), \"Non unique file names\"\n", "\n", "with open(MAP_FOLDER_PATH, 'w') as f:\n", - " yaml.dump({ k: str(PurePosixPath(v)) for k, v in folders_dict.items()} , f)\n", + " yaml.dump({k: str(PurePosixPath(v)) for k, v in folders_dict.items()}, f)\n", "logger.info(f\"Save map of file names to file paths to: {str(MAP_FOLDER_PATH)}\")\n", "\n", "# w_file = widgets.Dropdown(options=[folder for folder in folders], description='View files')\n", @@ -199,18 +198,19 @@ "source": [ "class col_summary:\n", " MS1 = 'MS'\n", - " MS2 = 'MS/MS' \n", - " MS2_identified = 'MS/MS Identified'\n", - " peptides_identified = 'Peptide Sequences Identified' # 'peptides.txt' should have this number of peptides\n", + " MS2 = 'MS/MS'\n", + " MS2_identified = 'MS/MS Identified'\n", + " peptides_identified = 'Peptide Sequences Identified' # 'peptides.txt' should have this number of peptides\n", + "\n", "\n", "df = mq_all_summaries.df\n", "if df is not None:\n", " MS_spectra = df[[col_summary.MS1, col_summary.MS2, col_summary.MS2_identified, col_summary.peptides_identified]]\n", "\n", " def compute_summary(threshold_identified):\n", - " mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified\n", + " mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified\n", " display(MS_spectra.loc[mask].describe(np.linspace(0.05, 0.95, 10)))\n", - " \n", + "\n", " w_ions_range = widgets.IntSlider(value=15_000, min=.0, max=MS_spectra[col_summary.peptides_identified].max())\n", " display(widgets.interactive(compute_summary, threshold_identified=w_ions_range))" ] @@ -221,10 +221,10 @@ "metadata": {}, "outputs": [], "source": [ - "mask = MS_spectra[col_summary.peptides_identified] >= w_ions_range.value\n", + "mask = MS_spectra[col_summary.peptides_identified] >= w_ions_range.value\n", "logger.warning(f\"Save {mask.sum()} file names to configuration file of selected samples: \"\n", - "f\"{ELIGABLE_FILES_YAML} \"\n", - "f\"based on a minimum of {w_ions_range.value} peptides.\")\n", + " f\"{ELIGABLE_FILES_YAML} \"\n", + " f\"based on a minimum of {w_ions_range.value} peptides.\")\n", "idx_selected = MS_spectra.loc[mask].index\n", "MS_spectra.loc[idx_selected]" ] @@ -258,7 +258,9 @@ "metadata": {}, "outputs": [], "source": [ - "w_date_range = widgets.SelectionRangeSlider(options=df_meta_rawfiles[date_col], value=[min(df_meta_rawfiles[date_col]),max(df_meta_rawfiles[date_col]) ] )\n", + "w_date_range = widgets.SelectionRangeSlider(options=df_meta_rawfiles[date_col], value=[\n", + " min(df_meta_rawfiles[date_col]), 
max(df_meta_rawfiles[date_col])])\n", + "\n", "\n", "def show(range):\n", " mask = df_meta_rawfiles[date_col].between(*range)\n", @@ -315,11 +317,11 @@ "outputs": [], "source": [ "_max = MS_spectra[col_summary.peptides_identified].max() + 10_001\n", - "fig, ax = plt.subplots(figsize=(10,10))\n", + "fig, ax = plt.subplots(figsize=(10, 10))\n", "_ = MS_spectra[col_summary.peptides_identified].hist(\n", - " bins=range(0,_max, 10_000),\n", + " bins=range(0, _max, 10_000),\n", " legend=True,\n", - " ax = ax)\n", + " ax=ax)\n", "fig.suptitle('Number of samples, binned in 10K steps.')\n", "fig.tight_layout()" ] @@ -330,7 +332,8 @@ "metadata": {}, "outputs": [], "source": [ - "MS_spectra[col_summary.peptides_identified].mean(), MS_spectra[col_summary.peptides_identified].std() # including folders with 0 identified peptides" + "# including folders with 0 identified peptides\n", + "MS_spectra[col_summary.peptides_identified].mean(), MS_spectra[col_summary.peptides_identified].std()" ] }, { @@ -348,7 +351,8 @@ "\n", "\n", "# calc_cutoff()\n", - "display(widgets.interactive(calc_cutoff, threshold=widgets.IntSlider(value=10000.0, min=.0, max=MS_spectra[col_summary.peptides_identified].max())))" + "display(widgets.interactive(calc_cutoff, threshold=widgets.IntSlider(\n", + " value=10000.0, min=.0, max=MS_spectra[col_summary.peptides_identified].max())))" ] }, { @@ -357,23 +361,32 @@ "metadata": {}, "outputs": [], "source": [ - "fig, axes = plt.subplots(2,2, figsize=(20,20), sharex=True)\n", + "fig, axes = plt.subplots(2, 2, figsize=(20, 20), sharex=True)\n", "\n", - "ylim_hist = (0,600)\n", + "ylim_hist = (0, 600)\n", "xlim_dens = (0, 70_000)\n", "\n", - "ax = axes[0,0]\n", - "ax = mq_all_summaries.df[col_summary.peptides_identified].plot(kind='hist', bins=50, title=\"Histogram including samples with zero identified peptides\", grid=True, ax=ax, ylim=ylim_hist)\n", - "ax = axes[1,0]\n", - "_ = mq_all_summaries.df[col_summary.peptides_identified].astype(float).plot.kde(ax=ax, title=\"Density plot including samples with zero identified peptides.\", xlim=xlim_dens)\n", + "ax = axes[0, 0]\n", + "ax = mq_all_summaries.df[col_summary.peptides_identified].plot(\n", + " kind='hist', bins=50, title=\"Histogram including samples with zero identified peptides\", grid=True, ax=ax, ylim=ylim_hist)\n", + "ax = axes[1, 0]\n", + "_ = mq_all_summaries.df[col_summary.peptides_identified].astype(float).plot.kde(\n", + " ax=ax, title=\"Density plot including samples with zero identified peptides.\", xlim=xlim_dens)\n", "\n", "threshold_m2_identified = 15_000\n", "mask = mq_all_summaries.df[col_summary.peptides_identified] >= threshold_m2_identified\n", "\n", - "ax = axes[0,1]\n", - "ax = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].plot(kind='hist', bins=40, title=f\"Histogram including samples with {threshold_m2_identified:,d} and more identified peptides\", grid=True, ax=ax, ylim=ylim_hist)\n", - "ax = axes[1,1]\n", - "_ = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].astype(float).plot.kde(ax=ax, title=f\"Density plot including samples with {threshold_m2_identified:,d} and more identified peptides.\", xlim=xlim_dens)\n", + "ax = axes[0, 1]\n", + "ax = mq_all_summaries.df.loc[mask,\n", + " col_summary.peptides_identified].plot(kind='hist',\n", + " bins=40,\n", + " title=f\"Histogram including samples with {threshold_m2_identified:,d} and more identified peptides\",\n", + " grid=True,\n", + " ax=ax,\n", + " ylim=ylim_hist)\n", + "ax = axes[1, 1]\n", + "_ = mq_all_summaries.df.loc[mask, 
col_summary.peptides_identified].astype(float).plot.kde(\n", + " ax=ax, title=f\"Density plot including samples with {threshold_m2_identified:,d} and more identified peptides.\", xlim=xlim_dens)\n", "\n", "plotting._savefig(fig, name='distribution_peptides_in_samples', folder=config.FIGUREFOLDER)" ] diff --git a/project/erda_01_mq_select_runs.py b/project/erda_01_mq_select_runs.py index 070de215e..6ae571352 100644 --- a/project/erda_01_mq_select_runs.py +++ b/project/erda_01_mq_select_runs.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -24,11 +24,22 @@ # There is are many files more, where several files seem to be available in several times in different formats. # %% -import sys import logging from pathlib import Path, PurePosixPath import yaml -import random + +import ipywidgets as widgets +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +from vaep.io.data_objects import MqAllSummaries +from vaep import plotting +from vaep.io.mq import MaxQuantOutputDynamic + +import config +from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED + ################## ### Logging ###### @@ -41,42 +52,30 @@ logging.info('Start with handlers: \n' + "\n".join(f"- {repr(log_)}" for log_ in logger.handlers)) -### Other imports +# Other imports -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import ipywidgets as widgets - -from vaep.io.mq import MaxQuantOutputDynamic -from vaep import plotting - -from vaep.io import data_objects -from vaep.io.data_objects import MqAllSummaries ################## ##### CONFIG ##### ################## -import config -from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml') MAP_FOLDER_PATH = Path('config/file_paths') FPATH_ALL_SUMMARIES = FOLDER_PROCESSED / 'all_summaries.json' FN_RAWFILE_METADATA = 'data/rawfile_metadata.csv' -from config import FOLDER_DATA # project folder for storing the data logger.info(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}") # %% Collapsed="false" -folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir() and not folder.name.startswith('.')] +folders = [folder for folder in Path(FOLDER_MQ_TXT_DATA).iterdir() if folder.is_dir() + and not folder.name.startswith('.')] # %% Collapsed="false" folders_dict = {folder.name: folder for folder in sorted(folders)} assert len(folders_dict) == len(folders), "Non unique file names" with open(MAP_FOLDER_PATH, 'w') as f: - yaml.dump({ k: str(PurePosixPath(v)) for k, v in folders_dict.items()} , f) + yaml.dump({k: str(PurePosixPath(v)) for k, v in folders_dict.items()}, f) logger.info(f"Save map of file names to file paths to: {str(MAP_FOLDER_PATH)}") # w_file = widgets.Dropdown(options=[folder for folder in folders], description='View files') @@ -126,26 +125,27 @@ # %% class col_summary: MS1 = 'MS' - MS2 = 'MS/MS' - MS2_identified = 'MS/MS Identified' - peptides_identified = 'Peptide Sequences Identified' # 'peptides.txt' should have this number of peptides + MS2 = 'MS/MS' + MS2_identified = 'MS/MS Identified' + peptides_identified = 'Peptide Sequences Identified' # 'peptides.txt' should have this number of peptides + df = mq_all_summaries.df if df is not None: MS_spectra = df[[col_summary.MS1, col_summary.MS2, col_summary.MS2_identified, col_summary.peptides_identified]] def compute_summary(threshold_identified): - 
mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified + mask = MS_spectra[col_summary.peptides_identified] >= threshold_identified display(MS_spectra.loc[mask].describe(np.linspace(0.05, 0.95, 10))) - + w_ions_range = widgets.IntSlider(value=15_000, min=.0, max=MS_spectra[col_summary.peptides_identified].max()) display(widgets.interactive(compute_summary, threshold_identified=w_ions_range)) # %% -mask = MS_spectra[col_summary.peptides_identified] >= w_ions_range.value +mask = MS_spectra[col_summary.peptides_identified] >= w_ions_range.value logger.warning(f"Save {mask.sum()} file names to configuration file of selected samples: " -f"{ELIGABLE_FILES_YAML} " -f"based on a minimum of {w_ions_range.value} peptides.") + f"{ELIGABLE_FILES_YAML} " + f"based on a minimum of {w_ions_range.value} peptides.") idx_selected = MS_spectra.loc[mask].index MS_spectra.loc[idx_selected] @@ -163,7 +163,9 @@ def compute_summary(threshold_identified): df_meta_rawfiles.sort_values(date_col, inplace=True) # %% -w_date_range = widgets.SelectionRangeSlider(options=df_meta_rawfiles[date_col], value=[min(df_meta_rawfiles[date_col]),max(df_meta_rawfiles[date_col]) ] ) +w_date_range = widgets.SelectionRangeSlider(options=df_meta_rawfiles[date_col], value=[ + min(df_meta_rawfiles[date_col]), max(df_meta_rawfiles[date_col])]) + def show(range): mask = df_meta_rawfiles[date_col].between(*range) @@ -194,16 +196,17 @@ def show(range): # %% _max = MS_spectra[col_summary.peptides_identified].max() + 10_001 -fig, ax = plt.subplots(figsize=(10,10)) +fig, ax = plt.subplots(figsize=(10, 10)) _ = MS_spectra[col_summary.peptides_identified].hist( - bins=range(0,_max, 10_000), + bins=range(0, _max, 10_000), legend=True, - ax = ax) + ax=ax) fig.suptitle('Number of samples, binned in 10K steps.') fig.tight_layout() # %% -MS_spectra[col_summary.peptides_identified].mean(), MS_spectra[col_summary.peptides_identified].std() # including folders with 0 identified peptides +# including folders with 0 identified peptides +MS_spectra[col_summary.peptides_identified].mean(), MS_spectra[col_summary.peptides_identified].std() # %% @@ -216,25 +219,35 @@ def calc_cutoff(threshold=1): # calc_cutoff() -display(widgets.interactive(calc_cutoff, threshold=widgets.IntSlider(value=10000.0, min=.0, max=MS_spectra[col_summary.peptides_identified].max()))) +display(widgets.interactive(calc_cutoff, threshold=widgets.IntSlider( + value=10000.0, min=.0, max=MS_spectra[col_summary.peptides_identified].max()))) # %% -fig, axes = plt.subplots(2,2, figsize=(20,20), sharex=True) +fig, axes = plt.subplots(2, 2, figsize=(20, 20), sharex=True) -ylim_hist = (0,600) +ylim_hist = (0, 600) xlim_dens = (0, 70_000) -ax = axes[0,0] -ax = mq_all_summaries.df[col_summary.peptides_identified].plot(kind='hist', bins=50, title="Histogram including samples with zero identified peptides", grid=True, ax=ax, ylim=ylim_hist) -ax = axes[1,0] -_ = mq_all_summaries.df[col_summary.peptides_identified].astype(float).plot.kde(ax=ax, title="Density plot including samples with zero identified peptides.", xlim=xlim_dens) +ax = axes[0, 0] +ax = mq_all_summaries.df[col_summary.peptides_identified].plot( + kind='hist', bins=50, title="Histogram including samples with zero identified peptides", grid=True, ax=ax, ylim=ylim_hist) +ax = axes[1, 0] +_ = mq_all_summaries.df[col_summary.peptides_identified].astype(float).plot.kde( + ax=ax, title="Density plot including samples with zero identified peptides.", xlim=xlim_dens) threshold_m2_identified = 15_000 mask = 
mq_all_summaries.df[col_summary.peptides_identified] >= threshold_m2_identified -ax = axes[0,1] -ax = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].plot(kind='hist', bins=40, title=f"Histogram including samples with {threshold_m2_identified:,d} and more identified peptides", grid=True, ax=ax, ylim=ylim_hist) -ax = axes[1,1] -_ = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].astype(float).plot.kde(ax=ax, title=f"Density plot including samples with {threshold_m2_identified:,d} and more identified peptides.", xlim=xlim_dens) +ax = axes[0, 1] +ax = mq_all_summaries.df.loc[mask, + col_summary.peptides_identified].plot(kind='hist', + bins=40, + title=f"Histogram including samples with {threshold_m2_identified:,d} and more identified peptides", + grid=True, + ax=ax, + ylim=ylim_hist) +ax = axes[1, 1] +_ = mq_all_summaries.df.loc[mask, col_summary.peptides_identified].astype(float).plot.kde( + ax=ax, title=f"Density plot including samples with {threshold_m2_identified:,d} and more identified peptides.", xlim=xlim_dens) plotting._savefig(fig, name='distribution_peptides_in_samples', folder=config.FIGUREFOLDER) diff --git a/project/erda_02_mq_count_features.ipynb b/project/erda_02_mq_count_features.ipynb index 1d50f1f7d..2dd052693 100644 --- a/project/erda_02_mq_count_features.ipynb +++ b/project/erda_02_mq_count_features.ipynb @@ -15,33 +15,27 @@ }, "outputs": [], "source": [ + "from collections import Counter\n", "import os\n", - "import sys\n", "import logging\n", "from pathlib import Path\n", "import random\n", "import yaml\n", - "import json\n", "\n", "import pandas as pd\n", - "import ipywidgets as widgets\n", - "\n", - "### Logging setup ######\n", - "from vaep.logging import setup_nb_logger\n", - "setup_nb_logger()\n", "\n", - "### vaep imports ######\n", - "from vaep.io.mq import MaxQuantOutputDynamic\n", - "from vaep.io.data_objects import MqAllSummaries\n", - "from vaep.io.data_objects import PeptideCounter\n", "import vaep.pandas\n", + "from vaep.io.data_objects import PeptideCounter\n", + "from vaep.io.mq import MaxQuantOutputDynamic\n", "\n", - "##################\n", "##### CONFIG #####\n", - "##################\n", "from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED\n", + "from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n", + "\n", + "### Logging setup ######\n", + "from vaep.logging import setup_nb_logger\n", + "setup_nb_logger()\n", "\n", - "from config import FOLDER_DATA # project folder for storing the data\n", "logging.info(f\"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}\")" ] }, @@ -82,7 +76,7 @@ }, "outputs": [], "source": [ - "fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n", + "fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n", "df_ids = pd.read_csv(fn_id_old_new)\n", "df_ids" ] @@ -98,14 +92,15 @@ "cell_type": "code", "execution_count": null, "metadata": { + "lines_to_next_cell": 2, "tags": [] }, "outputs": [], "source": [ - "folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']}\n", + "folders_dict = {sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']}\n", "# folders_dict = {p.stem : p.parent / p.stem for p in folders_dict}\n", "# folders_dict\n", - "folders = [Path(folder_path) for folder_path in folders_dict.values()]\n" + "folders = [Path(folder_path) for folder_path in folders_dict.values()]" ] }, 
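The counting step in this notebook aggregates, over all selected runs, how many runs identified each peptide sequence (see `c.most_common(10)` further down). A minimal sketch of that pattern with `collections.Counter`; the folder layout, the tab-separated `peptides.txt` and its `Sequence` column are assumptions based on standard MaxQuant output, not the project's `PeptideCounter` helper itself:

```python
from collections import Counter
from pathlib import Path

import pandas as pd

# Sketch only: count in how many of the selected runs each peptide sequence
# was identified. Assumes one MaxQuant output folder per sample, each with a
# tab-separated 'peptides.txt' containing a 'Sequence' column.
folders = [p for p in Path('data/mq_out').iterdir() if p.is_dir()]  # assumed root folder

peptide_counter = Counter()
for folder in folders:
    fpath = folder / 'peptides.txt'
    if not fpath.exists():
        continue
    sequences = pd.read_csv(fpath, sep='\t', usecols=['Sequence'])['Sequence']
    # update with a set so each run contributes at most one count per peptide
    peptide_counter.update(set(sequences))

peptide_counter.most_common(10)
```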
{ @@ -119,7 +114,6 @@ "OVERWRITE = False\n", "OVERWRITE = True\n", "\n", - "from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n", "\n", "FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES" ] @@ -139,7 +133,6 @@ }, "outputs": [], "source": [ - "import random\n", "pd.set_option('display.max_columns', 60)\n", "random_folder, random_path = random.sample(folders_dict.items(), 1)[0]\n", "mq_output = MaxQuantOutputDynamic(random_path)\n", @@ -156,7 +149,7 @@ "outputs": [], "source": [ "use_columns = mq_output.peptides.columns[33:45]\n", - "df = mq_output.peptides[use_columns].convert_dtypes() #.to_json('test.json')\n", + "df = mq_output.peptides[use_columns].convert_dtypes() # .to_json('test.json')\n", "df" ] }, @@ -203,7 +196,7 @@ }, "outputs": [], "source": [ - "mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands" + "mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands" ] }, { @@ -294,7 +287,7 @@ "metadata": {}, "outputs": [], "source": [ - "c.most_common(10) # peptide_counter.counter.most_common(10)" + "c.most_common(10) # peptide_counter.counter.most_common(10)" ] }, { @@ -309,15 +302,15 @@ "N = 1000\n", "with open(FOLDER_PROCESSED / f'most_common_{10}_peptides.py', 'w') as f:\n", " f.write('import pandas as pd\\n\\n')\n", - " \n", - " #pprint.pformat list -> do this using standardlibrary\n", + "\n", + " # pprint.pformat list -> do this using standardlibrary\n", " # https://docs.python.org/3/library/pprint.html\n", " f.write(f\"most_common = [\\n \")\n", " f.write(',\\n '.join(f\"{str(t)}\" for t in c.most_common(N)))\n", " f.write(\"\\n]\\n\\n\")\n", - " \n", - " #peptide_counter.loaded()\n", - " \n", + "\n", + " # peptide_counter.loaded()\n", + "\n", " f.write(\"pd.DataFrame.from_records(most_common, index='Sequence', columns=['Sequence', 'counts'])\\n\")" ] }, @@ -339,7 +332,7 @@ "outputs": [], "source": [ "evidence_cols = vaep.pandas.get_columns_accessor(mq_output.evidence.reset_index())\n", - "evidence_cols # vaep.mq get this list" + "evidence_cols # vaep.mq get this list" ] }, { @@ -394,7 +387,7 @@ "metadata": {}, "outputs": [], "source": [ - "mask = evidence[evidence_cols.Intensity].isna()\n", + "mask = evidence[evidence_cols.Intensity].isna()\n", "evidence.loc[mask, evidence_cols.Type].value_counts()" ] }, @@ -405,7 +398,12 @@ "outputs": [], "source": [ "evidence_cols = vaep.io.data_objects.evidence_cols\n", - "use_cols = [evidence_cols.mz, evidence_cols.Protein_group_IDs, evidence_cols.Intensity, evidence_cols.Score, evidence_cols.Potential_contaminant]\n", + "use_cols = [\n", + " evidence_cols.mz,\n", + " evidence_cols.Protein_group_IDs,\n", + " evidence_cols.Intensity,\n", + " evidence_cols.Score,\n", + " evidence_cols.Potential_contaminant]\n", "\n", "evidence_selected = vaep.io.data_objects.select_evidence(evidence[use_cols])\n", "evidence_selected" @@ -427,7 +425,9 @@ "metadata": {}, "outputs": [], "source": [ - "evidence_selected = vaep.pandas.select_max_by(evidence_selected.reset_index(), [evidence_cols.Sequence, evidence_cols.Charge], evidence_cols.Score)\n", + "evidence_selected = vaep.pandas.select_max_by(\n", + " evidence_selected.reset_index(), [\n", + " evidence_cols.Sequence, evidence_cols.Charge], evidence_cols.Score)\n", "evidence_selected" ] }, @@ -437,7 +437,6 @@ "metadata": {}, "outputs": [], "source": [ - "from collections import Counter\n", "c = Counter()\n", "c.update(evidence.index)\n", "c.most_common(10)" @@ -516,7 +515,7 @@ "\n", "- protein groups between 
files\n", " - aggregate by GENE ?\n", - " - " + " -" ] }, { @@ -545,21 +544,22 @@ "outputs": [], "source": [ "use_cols = [\n", - "# pg_cols.Protein_IDs,\n", - " pg_cols.Majority_protein_IDs,\n", - " pg_cols.Gene_names,\n", - " pg_cols.Evidence_IDs,\n", - " pg_cols.Q_value,\n", - " pg_cols.Score,\n", - " pg_cols.Only_identified_by_site,\n", - " pg_cols.Reverse,\n", - " pg_cols.Potential_contaminant,\n", - " pg_cols.Intensity,\n", + " # pg_cols.Protein_IDs,\n", + " pg_cols.Majority_protein_IDs,\n", + " pg_cols.Gene_names,\n", + " pg_cols.Evidence_IDs,\n", + " pg_cols.Q_value,\n", + " pg_cols.Score,\n", + " pg_cols.Only_identified_by_site,\n", + " pg_cols.Reverse,\n", + " pg_cols.Potential_contaminant,\n", + " pg_cols.Intensity,\n", "]\n", "\n", "pd.options.display.max_rows = 100\n", "pd.options.display.min_rows = 40\n", - "mask = mq_output.proteinGroups[[pg_cols.Only_identified_by_site, pg_cols.Reverse, pg_cols.Potential_contaminant]].notna().sum(axis=1) > 0\n", + "mask = mq_output.proteinGroups[[pg_cols.Only_identified_by_site,\n", + " pg_cols.Reverse, pg_cols.Potential_contaminant]].notna().sum(axis=1) > 0\n", "mq_output.proteinGroups.loc[mask, use_cols]" ] }, @@ -571,7 +571,7 @@ "source": [ "msg = \"Omitting the data drops {0:.3f} % of the data.\"\n", "print(msg.format(\n", - "mask.sum() / len(mask) * 100\n", + " mask.sum() / len(mask) * 100\n", "))" ] }, @@ -582,7 +582,7 @@ "outputs": [], "source": [ "selection = mq_output.proteinGroups.loc[~mask, use_cols]\n", - "gene_counts = selection[pg_cols.Gene_names].value_counts() # Gene Names not unique\n", + "gene_counts = selection[pg_cols.Gene_names].value_counts() # Gene Names not unique\n", "msg = 'proportion of entries with non-unique genes: {:.3f}'\n", "print(msg.format(gene_counts.loc[gene_counts > 1].sum() / gene_counts.sum()))\n", "gene_counts.head(20)" @@ -594,7 +594,7 @@ "metadata": {}, "outputs": [], "source": [ - "mask = selection.Intensity > 0 \n", + "mask = selection.Intensity > 0\n", "msg = \"Proportion of non-zero Intensities: {:.3f} (zero_ count = {})\"\n", "print(msg.format(mask.sum() / len(mask), (~mask).sum()))\n", "selection.loc[~mask]" @@ -616,7 +616,7 @@ "Some Proteins have no gene annotation\n", " - P56181 -> mitochondrial\n", "\n", - "In the online version of Uniprot these seems to be annotated (brief check). \n", + "In the online version of Uniprot these seems to be annotated (brief check).\n", "So latest version probably has a gene annotation, so therefore these files are kept" ] }, @@ -629,7 +629,7 @@ "gene_set = selection[pg_cols.Gene_names].str.split(';')\n", "\n", "col_loc_gene_names = selection.columns.get_loc(pg_cols.Gene_names)\n", - "_ = selection.insert(col_loc_gene_names+1, 'Number of Genes', gene_set.apply(vaep.pandas.length))\n", + "_ = selection.insert(col_loc_gene_names + 1, 'Number of Genes', gene_set.apply(vaep.pandas.length))\n", "\n", "mask = gene_set.isna()\n", "selection.loc[mask]" @@ -651,7 +651,7 @@ "metadata": {}, "source": [ "Most `proteinGroups` have single genes assigned to them. If one only looks at gene sets,\n", - "one can increase uniquely identified `proteinGroups` further. \n", + "one can increase uniquely identified `proteinGroups` further.\n", "\n", "> Can `geneGroups` (sets of `Gene Names`) be used instead of `proteinGroups`?" 
] @@ -722,7 +722,8 @@ "metadata": {}, "outputs": [], "source": [ - "selection = vaep.pandas.select_max_by(df=selection.loc[~mask_no_gene].reset_index(), grouping_columns=[pg_cols.Gene_names], selection_column=pg_cols.Score)\n", + "selection = vaep.pandas.select_max_by(df=selection.loc[~mask_no_gene].reset_index(), grouping_columns=[\n", + " pg_cols.Gene_names], selection_column=pg_cols.Score)\n", "logging.info(f\"Selection shape after dropping duplicates by gene: {selection.shape}\")\n", "selection = selection.set_index(pg_cols.Protein_IDs)\n", "mask = selection[cols.Gene_names].isin(non_unique_genes)\n", @@ -756,7 +757,7 @@ "metadata": {}, "outputs": [], "source": [ - "vaep.pandas.counts_with_proportion(pd.Series(c)) # Most proteinGroups are unique" + "vaep.pandas.counts_with_proportion(pd.Series(c)) # Most proteinGroups are unique" ] }, { @@ -766,7 +767,7 @@ "### Count genes\n", "Genes sets could be used to identify common features.\n", "\n", - "> The assignment of isoforms to one proteinGroup or another might be volatile. \n", + "> The assignment of isoforms to one proteinGroup or another might be volatile.\n", "> A single (unique) peptide could lead to different assignments.\n", "> Imputation on the evidence level could be a way to alleviate this problem\n", "\n", @@ -782,14 +783,14 @@ "gene_counter = vaep.io.data_objects.GeneCounter(FNAME_C_GENES, overwrite=OVERWRITE)\n", "\n", "if not gene_counter.dumps:\n", - " #empty dict, replace\n", - " gene_counter.dumps = dict(protein_groups_counter.dumps) # prot proteinGroups files to GeneCounter\n", + " # empty dict, replace\n", + " gene_counter.dumps = dict(protein_groups_counter.dumps) # prot proteinGroups files to GeneCounter\n", "pg_dumps = list(gene_counter.dumps.values())\n", "\n", "c_genes = gene_counter.sum_over_files(folders=pg_dumps)\n", "\n", "c_genes = pd.Series(c_genes)\n", - "vaep.pandas.counts_with_proportion(c_genes) # Most proteinGroups are unique" + "vaep.pandas.counts_with_proportion(c_genes) # Most proteinGroups are unique" ] }, { diff --git a/project/erda_02_mq_count_features.py b/project/erda_02_mq_count_features.py index 80821096b..a92c3a098 100644 --- a/project/erda_02_mq_count_features.py +++ b/project/erda_02_mq_count_features.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.15.1 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -16,33 +16,27 @@ # # Count peptides over all files # %% +from collections import Counter import os -import sys import logging from pathlib import Path import random import yaml -import json import pandas as pd -import ipywidgets as widgets - -### Logging setup ###### -from vaep.logging import setup_nb_logger -setup_nb_logger() -### vaep imports ###### -from vaep.io.mq import MaxQuantOutputDynamic -from vaep.io.data_objects import MqAllSummaries -from vaep.io.data_objects import PeptideCounter import vaep.pandas +from vaep.io.data_objects import PeptideCounter +from vaep.io.mq import MaxQuantOutputDynamic -################## ##### CONFIG ##### -################## from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED +from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES + +### Logging setup ###### +from vaep.logging import setup_nb_logger +setup_nb_logger() -from config import FOLDER_DATA # project folder for storing the data logging.info(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}") # %% [markdown] @@ -63,7 +57,7 @@ assert len(files) == 
len(folders_dict) == len(folders) # %% -fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id +fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id df_ids = pd.read_csv(fn_id_old_new) df_ids @@ -71,7 +65,7 @@ # Select files and create list of folders # %% -folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']} +folders_dict = {sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']} # folders_dict = {p.stem : p.parent / p.stem for p in folders_dict} # folders_dict folders = [Path(folder_path) for folder_path in folders_dict.values()] @@ -81,7 +75,6 @@ OVERWRITE = False OVERWRITE = True -from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES @@ -89,7 +82,6 @@ # ## Random example # %% -import random pd.set_option('display.max_columns', 60) random_folder, random_path = random.sample(folders_dict.items(), 1)[0] mq_output = MaxQuantOutputDynamic(random_path) @@ -98,7 +90,7 @@ # %% use_columns = mq_output.peptides.columns[33:45] -df = mq_output.peptides[use_columns].convert_dtypes() #.to_json('test.json') +df = mq_output.peptides[use_columns].convert_dtypes() # .to_json('test.json') df # %% @@ -113,7 +105,7 @@ pd.read_json(df_json_string, orient='index') # %% -mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands +mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands # %% [markdown] # ## Count aggregated peptides @@ -150,22 +142,22 @@ new_name # %% -c.most_common(10) # peptide_counter.counter.most_common(10) +c.most_common(10) # peptide_counter.counter.most_common(10) # %% # To share as python file N = 1000 with open(FOLDER_PROCESSED / f'most_common_{10}_peptides.py', 'w') as f: f.write('import pandas as pd\n\n') - - #pprint.pformat list -> do this using standardlibrary + + # pprint.pformat list -> do this using standardlibrary # https://docs.python.org/3/library/pprint.html f.write(f"most_common = [\n ") f.write(',\n '.join(f"{str(t)}" for t in c.most_common(N))) f.write("\n]\n\n") - - #peptide_counter.loaded() - + + # peptide_counter.loaded() + f.write("pd.DataFrame.from_records(most_common, index='Sequence', columns=['Sequence', 'counts'])\n") # %% [markdown] Collapsed="false" @@ -175,7 +167,7 @@ # %% evidence_cols = vaep.pandas.get_columns_accessor(mq_output.evidence.reset_index()) -evidence_cols # vaep.mq get this list +evidence_cols # vaep.mq get this list # %% evidence = mq_output.evidence.set_index(evidence_cols.Charge, append=True) @@ -198,12 +190,17 @@ # These are apparently peptides identified by an MS2 spectrum but which could not be quantified by a MS1 scans # %% -mask = evidence[evidence_cols.Intensity].isna() +mask = evidence[evidence_cols.Intensity].isna() evidence.loc[mask, evidence_cols.Type].value_counts() # %% evidence_cols = vaep.io.data_objects.evidence_cols -use_cols = [evidence_cols.mz, evidence_cols.Protein_group_IDs, evidence_cols.Intensity, evidence_cols.Score, evidence_cols.Potential_contaminant] +use_cols = [ + evidence_cols.mz, + evidence_cols.Protein_group_IDs, + evidence_cols.Intensity, + evidence_cols.Score, + evidence_cols.Potential_contaminant] evidence_selected = vaep.io.data_objects.select_evidence(evidence[use_cols]) evidence_selected @@ -213,11 +210,12 @@ evidence_selected # %% -evidence_selected = 
vaep.pandas.select_max_by(evidence_selected.reset_index(), [evidence_cols.Sequence, evidence_cols.Charge], evidence_cols.Score) +evidence_selected = vaep.pandas.select_max_by( + evidence_selected.reset_index(), [ + evidence_cols.Sequence, evidence_cols.Charge], evidence_cols.Score) evidence_selected # %% -from collections import Counter c = Counter() c.update(evidence.index) c.most_common(10) @@ -254,7 +252,7 @@ # # - protein groups between files # - aggregate by GENE ? -# - +# - # %% mq_output.proteinGroups.describe(include='all') @@ -265,38 +263,39 @@ # %% use_cols = [ -# pg_cols.Protein_IDs, - pg_cols.Majority_protein_IDs, - pg_cols.Gene_names, - pg_cols.Evidence_IDs, - pg_cols.Q_value, - pg_cols.Score, - pg_cols.Only_identified_by_site, - pg_cols.Reverse, - pg_cols.Potential_contaminant, - pg_cols.Intensity, + # pg_cols.Protein_IDs, + pg_cols.Majority_protein_IDs, + pg_cols.Gene_names, + pg_cols.Evidence_IDs, + pg_cols.Q_value, + pg_cols.Score, + pg_cols.Only_identified_by_site, + pg_cols.Reverse, + pg_cols.Potential_contaminant, + pg_cols.Intensity, ] pd.options.display.max_rows = 100 pd.options.display.min_rows = 40 -mask = mq_output.proteinGroups[[pg_cols.Only_identified_by_site, pg_cols.Reverse, pg_cols.Potential_contaminant]].notna().sum(axis=1) > 0 +mask = mq_output.proteinGroups[[pg_cols.Only_identified_by_site, + pg_cols.Reverse, pg_cols.Potential_contaminant]].notna().sum(axis=1) > 0 mq_output.proteinGroups.loc[mask, use_cols] # %% msg = "Omitting the data drops {0:.3f} % of the data." print(msg.format( -mask.sum() / len(mask) * 100 + mask.sum() / len(mask) * 100 )) # %% selection = mq_output.proteinGroups.loc[~mask, use_cols] -gene_counts = selection[pg_cols.Gene_names].value_counts() # Gene Names not unique +gene_counts = selection[pg_cols.Gene_names].value_counts() # Gene Names not unique msg = 'proportion of entries with non-unique genes: {:.3f}' print(msg.format(gene_counts.loc[gene_counts > 1].sum() / gene_counts.sum())) gene_counts.head(20) # %% -mask = selection.Intensity > 0 +mask = selection.Intensity > 0 msg = "Proportion of non-zero Intensities: {:.3f} (zero_ count = {})" print(msg.format(mask.sum() / len(mask), (~mask).sum())) selection.loc[~mask] @@ -308,14 +307,14 @@ # Some Proteins have no gene annotation # - P56181 -> mitochondrial # -# In the online version of Uniprot these seems to be annotated (brief check). +# In the online version of Uniprot these seems to be annotated (brief check). # So latest version probably has a gene annotation, so therefore these files are kept # %% gene_set = selection[pg_cols.Gene_names].str.split(';') col_loc_gene_names = selection.columns.get_loc(pg_cols.Gene_names) -_ = selection.insert(col_loc_gene_names+1, 'Number of Genes', gene_set.apply(vaep.pandas.length)) +_ = selection.insert(col_loc_gene_names + 1, 'Number of Genes', gene_set.apply(vaep.pandas.length)) mask = gene_set.isna() selection.loc[mask] @@ -327,7 +326,7 @@ # %% [markdown] # Most `proteinGroups` have single genes assigned to them. If one only looks at gene sets, -# one can increase uniquely identified `proteinGroups` further. +# one can increase uniquely identified `proteinGroups` further. # # > Can `geneGroups` (sets of `Gene Names`) be used instead of `proteinGroups`? 
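The question above, whether sets of `Gene Names` can replace `proteinGroups`, amounts to collapsing the `Gene names` column into hashable gene sets and comparing unique counts. A minimal sketch on toy data (column layout follows `proteinGroups.txt`; keeping the best-scoring duplicate mirrors what `select_max_by` is used for in the notebook):

```python
import pandas as pd

# Toy proteinGroups-like table; real data comes from MaxQuant's proteinGroups.txt,
# where 'Gene names' holds ';'-separated gene symbols per protein group.
protein_groups = pd.DataFrame({
    'Protein IDs': ['P1;P2', 'P3', 'P4;P5', 'P6'],
    'Gene names': ['ACTB;ACTG1', 'ACTB;ACTG1', 'ALB', 'ALB'],
    'Score': [120.3, 80.1, 250.0, 10.5],
})

# A "gene group" is the set of gene names assigned to a protein group.
gene_groups = (protein_groups['Gene names']
               .str.split(';')
               .apply(frozenset))

print('protein groups:', protein_groups['Protein IDs'].nunique())
print('distinct gene groups:', gene_groups.nunique())

# If several protein groups collapse to the same gene group, one option
# (analogous to select_max_by in the notebook) is to keep the highest-scoring one.
best_per_gene_group = (protein_groups
                       .assign(gene_group=gene_groups)
                       .sort_values('Score')
                       .drop_duplicates('gene_group', keep='last'))
best_per_gene_group
```

Frozensets are used so the gene groups stay hashable for `nunique` and `drop_duplicates`.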
@@ -360,7 +359,8 @@ selection_no_gene # %% -selection = vaep.pandas.select_max_by(df=selection.loc[~mask_no_gene].reset_index(), grouping_columns=[pg_cols.Gene_names], selection_column=pg_cols.Score) +selection = vaep.pandas.select_max_by(df=selection.loc[~mask_no_gene].reset_index(), grouping_columns=[ + pg_cols.Gene_names], selection_column=pg_cols.Score) logging.info(f"Selection shape after dropping duplicates by gene: {selection.shape}") selection = selection.set_index(pg_cols.Protein_IDs) mask = selection[cols.Gene_names].isin(non_unique_genes) @@ -374,13 +374,13 @@ c = protein_groups_counter.sum_over_files(folders=folders) # %% -vaep.pandas.counts_with_proportion(pd.Series(c)) # Most proteinGroups are unique +vaep.pandas.counts_with_proportion(pd.Series(c)) # Most proteinGroups are unique # %% [markdown] # ### Count genes # Genes sets could be used to identify common features. # -# > The assignment of isoforms to one proteinGroup or another might be volatile. +# > The assignment of isoforms to one proteinGroup or another might be volatile. # > A single (unique) peptide could lead to different assignments. # > Imputation on the evidence level could be a way to alleviate this problem # @@ -390,14 +390,14 @@ gene_counter = vaep.io.data_objects.GeneCounter(FNAME_C_GENES, overwrite=OVERWRITE) if not gene_counter.dumps: - #empty dict, replace - gene_counter.dumps = dict(protein_groups_counter.dumps) # prot proteinGroups files to GeneCounter + # empty dict, replace + gene_counter.dumps = dict(protein_groups_counter.dumps) # prot proteinGroups files to GeneCounter pg_dumps = list(gene_counter.dumps.values()) c_genes = gene_counter.sum_over_files(folders=pg_dumps) c_genes = pd.Series(c_genes) -vaep.pandas.counts_with_proportion(c_genes) # Most proteinGroups are unique +vaep.pandas.counts_with_proportion(c_genes) # Most proteinGroups are unique # %% [markdown] Collapsed="false" # ## Theoretial Peptides from used fasta-file diff --git a/project/erda_03_training_data.ipynb b/project/erda_03_training_data.ipynb index 269f55bc7..1047d6ec9 100644 --- a/project/erda_03_training_data.ipynb +++ b/project/erda_03_training_data.ipynb @@ -32,15 +32,10 @@ "\n", "import vaep\n", "\n", - "import config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "import config\n", + "from config.training_data import peptides as cfg\n", + "\n", + "\n", "def join_as_str(seq):\n", " ret = \"_\".join(str(x) for x in seq)\n", " return ret" @@ -63,10 +58,10 @@ "outputs": [], "source": [ "RANDOM_SEED: int = 42 # Random seed for reproducibility\n", - "FEAT_COMPLETNESS_CUTOFF = 0.25 # Minimal proportion of samples which have to share a feature\n", + "FEAT_COMPLETNESS_CUTOFF = 0.25 # Minimal proportion of samples which have to share a feature\n", "SAMPLE_COL = 'Sample ID'\n", "OUT_FOLDER = 'data/selected/'\n", - "FN_ID_OLD_NEW: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id" + "FN_ID_OLD_NEW: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id" ] }, { @@ -85,7 +80,6 @@ "outputs": [], "source": [ "# options = ['peptides', 'evidence', 'proteinGroups']\n", - "from config.training_data import peptides as cfg\n", "# from config.training_data import evidence as cfg\n", "# from config.training_data import proteinGroups as cfg\n", "\n", @@ -178,7 +172,7 @@ "outputs": [], "source": [ "if TYPES_COUNT:\n", - " counts = counts.convert_dtypes().astype({'Charge': int}) #\n", + " 
counts = counts.convert_dtypes().astype({'Charge': int})\n", "mask = counts['proportion'] >= FEAT_COMPLETNESS_CUTOFF\n", "counts.loc[mask]" ] @@ -250,14 +244,16 @@ "source": [ "def load_fct(path):\n", " s = (\n", - " pd.read_csv(path, index_col=cfg.IDX_COLS_LONG[1:], usecols=[*cfg.IDX_COLS_LONG[1:], \"Intensity\"])\n", - " .squeeze()\n", - " .astype(pd.Int64Dtype())\n", + " pd.read_csv(path, index_col=cfg.IDX_COLS_LONG[1:], usecols=[*cfg.IDX_COLS_LONG[1:], \"Intensity\"])\n", + " .squeeze()\n", + " .astype(pd.Int64Dtype())\n", " )\n", " if len(cfg.IDX_COLS_LONG[1:]) > 1:\n", " s.index = s.index.map(join_as_str)\n", - " \n", + "\n", " return s\n", + "\n", + "\n", "load_fct(selected_dumps[0][-1])" ] }, @@ -288,7 +284,7 @@ " logging.warning(f\"Empty file: {path}\")\n", " failed.append((id, path))\n", " pbar.update(1)\n", - " \n", + "\n", " return all" ] }, @@ -305,7 +301,7 @@ "metadata": {}, "outputs": [], "source": [ - "all = None # free memory\n", + "all = None # free memory\n", "\n", "collect_intensities = partial(collect, index=IDX_selected, load_fct=load_fct)\n", "\n", @@ -316,10 +312,10 @@ " tqdm_notebook(\n", " p.imap(collect_intensities,\n", " np.array_split(selected_dumps, N_WORKERS)),\n", - " total=N_WORKERS,\n", + " total=N_WORKERS,\n", " )\n", - " ) \n", - " \n", + " )\n", + "\n", "all = pd.concat(all, axis=1)\n", "all" ] @@ -351,7 +347,7 @@ "outputs": [], "source": [ "%%time\n", - "fname = out_folder / config.insert_shape(all, 'intensities_wide_selected{}.pkl') \n", + "fname = out_folder / config.insert_shape(all, 'intensities_wide_selected{}.pkl')\n", "all.to_pickle(fname)\n", "fname" ] diff --git a/project/erda_03_training_data.py b/project/erda_03_training_data.py index 1d32f85d4..09bfdcc41 100644 --- a/project/erda_03_training_data.py +++ b/project/erda_03_training_data.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -36,9 +36,9 @@ import vaep import config +from config.training_data import peptides as cfg -# %% def join_as_str(seq): ret = "_".join(str(x) for x in seq) return ret @@ -49,10 +49,10 @@ def join_as_str(seq): # %% [tag=parameters] RANDOM_SEED: int = 42 # Random seed for reproducibility -FEAT_COMPLETNESS_CUTOFF = 0.25 # Minimal proportion of samples which have to share a feature +FEAT_COMPLETNESS_CUTOFF = 0.25 # Minimal proportion of samples which have to share a feature SAMPLE_COL = 'Sample ID' OUT_FOLDER = 'data/selected/' -FN_ID_OLD_NEW: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id +FN_ID_OLD_NEW: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id # %% [markdown] @@ -60,7 +60,6 @@ def join_as_str(seq): # %% # options = ['peptides', 'evidence', 'proteinGroups'] -from config.training_data import peptides as cfg # from config.training_data import evidence as cfg # from config.training_data import proteinGroups as cfg @@ -111,7 +110,7 @@ def join_as_str(seq): # %% if TYPES_COUNT: - counts = counts.convert_dtypes().astype({'Charge': int}) # + counts = counts.convert_dtypes().astype({'Charge': int}) mask = counts['proportion'] >= FEAT_COMPLETNESS_CUTOFF counts.loc[mask] @@ -148,14 +147,16 @@ def join_as_str(seq): # %% def load_fct(path): s = ( - pd.read_csv(path, index_col=cfg.IDX_COLS_LONG[1:], usecols=[*cfg.IDX_COLS_LONG[1:], "Intensity"]) - .squeeze() - .astype(pd.Int64Dtype()) + pd.read_csv(path, 
index_col=cfg.IDX_COLS_LONG[1:], usecols=[*cfg.IDX_COLS_LONG[1:], "Intensity"]) + .squeeze() + .astype(pd.Int64Dtype()) ) if len(cfg.IDX_COLS_LONG[1:]) > 1: s.index = s.index.map(join_as_str) - + return s + + load_fct(selected_dumps[0][-1]) @@ -181,7 +182,7 @@ def collect(folders, index, load_fct): logging.warning(f"Empty file: {path}") failed.append((id, path)) pbar.update(1) - + return all @@ -189,7 +190,7 @@ def collect(folders, index, load_fct): # ## Collect intensities in parallel # %% -all = None # free memory +all = None # free memory collect_intensities = partial(collect, index=IDX_selected, load_fct=load_fct) @@ -200,10 +201,10 @@ def collect(folders, index, load_fct): tqdm_notebook( p.imap(collect_intensities, np.array_split(selected_dumps, N_WORKERS)), - total=N_WORKERS, + total=N_WORKERS, ) - ) - + ) + all = pd.concat(all, axis=1) all @@ -217,7 +218,7 @@ def collect(folders, index, load_fct): # %% # %%time -fname = out_folder / config.insert_shape(all, 'intensities_wide_selected{}.pkl') +fname = out_folder / config.insert_shape(all, 'intensities_wide_selected{}.pkl') all.to_pickle(fname) fname diff --git a/project/erda_04_transpose_file.ipynb b/project/erda_04_transpose_file.ipynb index 6ddcf659e..a8e1c6f1d 100644 --- a/project/erda_04_transpose_file.ipynb +++ b/project/erda_04_transpose_file.ipynb @@ -38,13 +38,13 @@ "metadata": {}, "outputs": [], "source": [ - "# out_folder = Path('data/selected/proteinGroups') \n", + "# out_folder = Path('data/selected/proteinGroups')\n", "# fname = out_folder / 'intensities_wide_selected_N04550_M07444.pkl'\n", "\n", - "# out_folder = Path('data/selected/peptides') \n", + "# out_folder = Path('data/selected/peptides')\n", "# fname = out_folder / 'intensities_wide_selected_N42881_M07441.pkl'\n", "\n", - "out_folder = Path('data/selected/evidence') \n", + "out_folder = Path('data/selected/evidence')\n", "fname = out_folder / 'intensities_wide_selected_N49560_M07444.pkl'" ] }, @@ -60,9 +60,11 @@ " stem = fname.stem.split(split)[0]\n", " return f\"{stem}{{}}{ext}\"\n", "\n", + "\n", "def memory_usage_in_mb(df):\n", " return df.memory_usage(deep=True).sum() / (2**20)\n", "\n", + "\n", "template = get_template(fname)\n", "template" ] @@ -119,7 +121,7 @@ "source": [ "# %%time\n", "# df = pd.read_csv(fname.with_suffix('.csv'), index_col=0)\n", - "# df.memory_usage(deep=True).sum() / (2**20) " + "# df.memory_usage(deep=True).sum() / (2**20)" ] }, { @@ -223,7 +225,7 @@ "outputs": [], "source": [ "%%time\n", - "fname = out_folder / config.insert_shape(df, 'absent_0_present_1_selected{}.pkl')\n", + "fname = out_folder / config.insert_shape(df, 'absent_0_present_1_selected{}.pkl')\n", "\n", "files_out[fname.name] = fname.as_posix()\n", "df.to_pickle(fname)" diff --git a/project/erda_04_transpose_file.py b/project/erda_04_transpose_file.py index c9c6db02c..e26aa156e 100644 --- a/project/erda_04_transpose_file.py +++ b/project/erda_04_transpose_file.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -27,13 +27,13 @@ # Paramters # %% -# out_folder = Path('data/selected/proteinGroups') +# out_folder = Path('data/selected/proteinGroups') # fname = out_folder / 'intensities_wide_selected_N04550_M07444.pkl' -# out_folder = Path('data/selected/peptides') +# out_folder = Path('data/selected/peptides') # fname = out_folder / 'intensities_wide_selected_N42881_M07441.pkl' -out_folder = 
Path('data/selected/evidence') +out_folder = Path('data/selected/evidence') fname = out_folder / 'intensities_wide_selected_N49560_M07444.pkl' @@ -43,9 +43,11 @@ def get_template(fname, split='_N'): stem = fname.stem.split(split)[0] return f"{stem}{{}}{ext}" + def memory_usage_in_mb(df): return df.memory_usage(deep=True).sum() / (2**20) + template = get_template(fname) template @@ -69,7 +71,7 @@ def memory_usage_in_mb(df): # %% # # %%time # df = pd.read_csv(fname.with_suffix('.csv'), index_col=0) -# df.memory_usage(deep=True).sum() / (2**20) +# df.memory_usage(deep=True).sum() / (2**20) # %% # %%time @@ -119,7 +121,7 @@ def memory_usage_in_mb(df): # %% # %%time -fname = out_folder / config.insert_shape(df, 'absent_0_present_1_selected{}.pkl') +fname = out_folder / config.insert_shape(df, 'absent_0_present_1_selected{}.pkl') files_out[fname.name] = fname.as_posix() df.to_pickle(fname) diff --git a/project/erda_05_parse_paramter_files.ipynb b/project/erda_05_parse_paramter_files.ipynb index 60a05b72d..ba59184a3 100644 --- a/project/erda_05_parse_paramter_files.ipynb +++ b/project/erda_05_parse_paramter_files.ipynb @@ -34,6 +34,12 @@ }, "outputs": [], "source": [ + "import logging\n", + "\n", + "import xml.etree.ElementTree as ET\n", + "\n", + "logger = logging.getLogger()\n", + "\n", "test_file = 'data/mqpar_example.xml'" ] }, @@ -90,7 +96,7 @@ }, "outputs": [], "source": [ - "import xml.etree.ElementTree as ET\n", + "\n", "\n", "def add_record(data, tag, record):\n", " if tag in data:\n", diff --git a/project/erda_05_parse_paramter_files.py b/project/erda_05_parse_paramter_files.py index 7f3cb7603..eb108d4f5 100644 --- a/project/erda_05_parse_paramter_files.py +++ b/project/erda_05_parse_paramter_files.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.15.1 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 (ipykernel) # language: python diff --git a/project/erda_12_explore_raw_MQ_data.ipynb b/project/erda_12_explore_raw_MQ_data.ipynb index 03dfa8716..d2ffa8b9a 100644 --- a/project/erda_12_explore_raw_MQ_data.ipynb +++ b/project/erda_12_explore_raw_MQ_data.ipynb @@ -43,18 +43,14 @@ "import vaep.io.mq as mq\n", "from vaep.io.mq import mq_col\n", "\n", - "\n", - "from vaep.logging import setup_nb_logger\n", - "logger = setup_nb_logger()\n", - "\n", - "##################\n", "##### CONFIG #####\n", - "##################\n", - "\n", "import config\n", - "from config import FIGUREFOLDER\n", - "# from config import FOLDER_RAW_DATA\n", "from config import FOLDER_MQ_TXT_DATA as FOLDER_RAW_DATA\n", + "from config import FIGUREFOLDER\n", + "\n", + "\n", + "from vaep.logging import setup_nb_logger\n", + "logger = setup_nb_logger()\n", "\n", "\n", "print(f\"Search Raw-Files on path: {FOLDER_RAW_DATA}\")" @@ -138,7 +134,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Not all peptides are associated with a Protein or Gene by MQ, although there is evidence for the peptide. This is due to potential `CON_`taminants in the medium which is encouded by default by MQ." + "Not all peptides are associated with a Protein or Gene by MQ, although\n", + "there is evidence for the peptide. This is due to potential\n", + "`CON_`taminants in the medium which is encouded by default by MQ." 
] }, { @@ -154,7 +152,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## `evidence.txt` \n", + "## `evidence.txt`\n", "\n", "contains\n", "- retention time for peptides\n", @@ -302,7 +300,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's see several quartiles for both median and standard deviation (the columns are independent from each other) for the retention time" + "Let's see several quartiles for both median and standard deviation (the\n", + "columns are independent from each other) for the retention time" ] }, { @@ -368,7 +367,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Model evaluation possibility: Discard samples with several measurements from an experiment and predict value. See which intensity measurement corresponds more closely. " + "Model evaluation possibility: Discard samples with several measurements\n", + "from an experiment and predict value. See which intensity measurement\n", + "corresponds more closely." ] }, { @@ -438,7 +439,8 @@ "## Differences in intensities b/w peptides.txt and evidence.txt\n", "\n", "\n", - "The intensity reported in `peptides.txt` corresponds to roughly to the sum of the intensities found in different scans:" + "The intensity reported in `peptides.txt` corresponds to roughly to the\n", + "sum of the intensities found in different scans:" ] }, { @@ -530,7 +532,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Several smaller and larger differences in an intensity range way below the detection limit arise for some sequences. " + "Several smaller and larger differences in an intensity range way below the detection limit arise for some sequences." ] }, { @@ -837,7 +839,7 @@ "aggregators = [\"Sequence\", \"Score\", mq_col.INTENSITY]\n", "mask_intensity_not_na = mq_output.evidence.Intensity.notna()\n", "seq_max_score_max_intensity = mq_output.evidence.loc[mask_intensity_not_na].reset_index(\n", - ")[aggregators+[\"Proteins\", \"Gene names\"]].sort_values(by=aggregators).set_index(\"Sequence\").groupby(level=0).last()\n", + ")[aggregators + [\"Proteins\", \"Gene names\"]].sort_values(by=aggregators).set_index(\"Sequence\").groupby(level=0).last()\n", "seq_max_score_max_intensity" ] }, @@ -882,8 +884,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "These might be a candiate for evaluating predictions, as the information is measured, but unknown. 
\n", - "If they cannot be assigned, the closest fit on different genes with model predictions could be a criterion for selection" + "These might be a candiate for evaluating predictions, as the information is measured, but unknown.\n", + "If they cannot be assigned, the closest fit on different genes with\n", + "model predictions could be a criterion for selection" ] }, { @@ -923,7 +926,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Some hundred peptides map to more than two genes " + "Some hundred peptides map to more than two genes" ] }, { @@ -955,7 +958,7 @@ "- multiple genes:\n", " - select first and add reference in others\n", " - split and dump repeatedly\n", - " \n", + "\n", "Load fasta-file information" ] }, @@ -1025,7 +1028,8 @@ "outputs": [], "source": [ "mask = mq_output.peptides[mq_col.LEADING_RAZOR_PROTEIN].isin(set_proteins_to_remove)\n", - "mq_output.peptides.loc[mask, 'Potential contaminant'].value_counts() # ToDo: Remove potential contaminants, check evidence.txt" + "# ToDo: Remove potential contaminants, check evidence.txt\n", + "mq_output.peptides.loc[mask, 'Potential contaminant'].value_counts()" ] }, { @@ -1178,8 +1182,7 @@ "\n", "Does a group of peptide only assigns unique set of genes? Genes can have more than one protein.\n", " - first build groups\n", - " - then see matches (see further below)\n", - " " + " - then see matches (see further below)\n" ] }, { @@ -1235,9 +1238,8 @@ "metadata": {}, "outputs": [], "source": [ - "_mask_con = peptides_with_single_gene.loc[mask, mq_col.PROTEINS].str.split(\";\"\n", - " ).apply(lambda x: [True if \"CON_\" in item else False for item in x]\n", - " ).apply(all)\n", + "_mask_con = peptides_with_single_gene.loc[mask, mq_col.PROTEINS].str.split(\";\").apply(\n", + " lambda x: [True if \"CON_\" in item else False for item in x]).apply(all)\n", "\n", "assert _mask_con.sum() == 0, \"There are peptides resulting only from possible confounders: {}\".format(\n", " \", \".join(str(x) for x in peptides_with_single_gene.loc[mask, mq_col.PROTEINS].loc[_mask_con].index))" @@ -1294,7 +1296,7 @@ "metadata": {}, "outputs": [], "source": [ - "gene_data[mq_col.PROTEINS].value_counts() # combine? select first in case of a CON_ as leading razor protein?" + "gene_data[mq_col.PROTEINS].value_counts() # combine? select first in case of a CON_ as leading razor protein?" 
] }, { @@ -1354,7 +1356,7 @@ "peps_in_data = gene_data.index\n", "\n", "mq.calculate_completness_for_sample(\n", - " peps_exact_cleaved=peps_exact_cleaved, \n", + " peps_exact_cleaved=peps_exact_cleaved,\n", " peps_in_data=peps_in_data)" ] }, @@ -1548,7 +1550,7 @@ "metadata": {}, "outputs": [], "source": [ - "s_completeness = pd.Series(completeness_per_gene, name='completenes_by_gene')\n", + "s_completeness = pd.Series(completeness_per_gene, name='completenes_by_gene')\n", "s_completeness.describe()" ] }, @@ -1559,13 +1561,21 @@ "outputs": [], "source": [ "N_BINS = 20\n", - "ax = s_completeness.plot(kind='hist',\n", - " bins=N_BINS,\n", - " xticks=[x/100 for x in range(0, 101, 5)],\n", - " figsize=(10, 5),\n", - " rot=90,\n", - " title=f\"Frequency of proportion of observed exact peptides (completness) per razor protein from 0 to 1 in {N_BINS} bins\"\n", - " f\"\\nin sample {mq_output.folder.stem}\")\n", + "ax = s_completeness.plot(\n", + " kind='hist',\n", + " bins=N_BINS,\n", + " xticks=[\n", + " x /\n", + " 100 for x in range(\n", + " 0,\n", + " 101,\n", + " 5)],\n", + " figsize=(\n", + " 10,\n", + " 5),\n", + " rot=90,\n", + " title=f\"Frequency of proportion of observed exact peptides (completness) per razor protein from 0 to 1 in {N_BINS} bins\"\n", + " f\"\\nin sample {mq_output.folder.stem}\")\n", "\n", "_ = ax.set_xlabel(\n", " \"Proportion of exactly observed peptides (including up to 2 mis-cleavages)\")\n", diff --git a/project/erda_12_explore_raw_MQ_data.py b/project/erda_12_explore_raw_MQ_data.py index fc788ad17..13b0c6d0d 100644 --- a/project/erda_12_explore_raw_MQ_data.py +++ b/project/erda_12_explore_raw_MQ_data.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3.8.13 ('vaep') # language: python @@ -46,18 +46,14 @@ import vaep.io.mq as mq from vaep.io.mq import mq_col - -from vaep.logging import setup_nb_logger -logger = setup_nb_logger() - -################## ##### CONFIG ##### -################## - import config -from config import FIGUREFOLDER -# from config import FOLDER_RAW_DATA from config import FOLDER_MQ_TXT_DATA as FOLDER_RAW_DATA +from config import FIGUREFOLDER + + +from vaep.logging import setup_nb_logger +logger = setup_nb_logger() print(f"Search Raw-Files on path: {FOLDER_RAW_DATA}") @@ -95,13 +91,15 @@ intensities # %% [markdown] -# Not all peptides are associated with a Protein or Gene by MQ, although there is evidence for the peptide. This is due to potential `CON_`taminants in the medium which is encouded by default by MQ. +# Not all peptides are associated with a Protein or Gene by MQ, although +# there is evidence for the peptide. This is due to potential +# `CON_`taminants in the medium which is encouded by default by MQ. 
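Peptides whose only supporting proteins are contaminant entries end up without a gene annotation. A small, self-contained sketch of the corresponding mask (the accession strings are made up; the `CON_` prefix and the all-contaminants check follow the `_mask_con` logic used elsewhere in this script):

```python
import pandas as pd

# Toy 'Proteins' column as found in peptides.txt / evidence.txt: ';'-separated
# protein accessions, contaminants prefixed with 'CON_' (values are illustrative).
proteins = pd.Series([
    'P60709;CON_P60712',   # real protein plus a contaminant
    'CON_P02769',          # only a contaminant -> no gene annotation expected
    'Q8WZ42',
    None,                  # peptide without any protein assignment
], name='Proteins')

# True where every assigned protein is a contaminant
only_contaminants = (proteins
                     .fillna('')
                     .str.split(';')
                     .apply(lambda prots: bool(prots[0])
                            and all(p.startswith('CON_') for p in prots)))
only_contaminants
```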
# %% mq_output.peptides[FASTA_KEYS].isna().sum() # %% [markdown] -# ## `evidence.txt` +# ## `evidence.txt` # # contains # - retention time for peptides @@ -170,7 +168,8 @@ rt_summary # %% [markdown] -# Let's see several quartiles for both median and standard deviation (the columns are independent from each other) for the retention time +# Let's see several quartiles for both median and standard deviation (the +# columns are independent from each other) for the retention time # %% rt_summary.describe(percentiles=[0.8, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99]) @@ -196,7 +195,9 @@ mq_output.evidence.loc[mask_indices] # %% [markdown] -# Model evaluation possibility: Discard samples with several measurements from an experiment and predict value. See which intensity measurement corresponds more closely. +# Model evaluation possibility: Discard samples with several measurements +# from an experiment and predict value. See which intensity measurement +# corresponds more closely. # %% _peptide = random.choice(mask_indices) @@ -228,7 +229,8 @@ # ## Differences in intensities b/w peptides.txt and evidence.txt # # -# The intensity reported in `peptides.txt` corresponds to roughly to the sum of the intensities found in different scans: +# The intensity reported in `peptides.txt` corresponds to roughly to the +# sum of the intensities found in different scans: # %% col_intensity = mq_col.INTENSITY @@ -270,7 +272,7 @@ _diff[mask_diff].describe() # %% [markdown] -# Several smaller and larger differences in an intensity range way below the detection limit arise for some sequences. +# Several smaller and larger differences in an intensity range way below the detection limit arise for some sequences. # %% [markdown] # ### Ideas on source of difference @@ -411,7 +413,7 @@ aggregators = ["Sequence", "Score", mq_col.INTENSITY] mask_intensity_not_na = mq_output.evidence.Intensity.notna() seq_max_score_max_intensity = mq_output.evidence.loc[mask_intensity_not_na].reset_index( -)[aggregators+["Proteins", "Gene names"]].sort_values(by=aggregators).set_index("Sequence").groupby(level=0).last() +)[aggregators + ["Proteins", "Gene names"]].sort_values(by=aggregators).set_index("Sequence").groupby(level=0).last() seq_max_score_max_intensity # %% @@ -430,8 +432,9 @@ seq_max_score_max_intensity.loc[mask_seq_selected_not_assigned] # %% [markdown] -# These might be a candiate for evaluating predictions, as the information is measured, but unknown. -# If they cannot be assigned, the closest fit on different genes with model predictions could be a criterion for selection +# These might be a candiate for evaluating predictions, as the information is measured, but unknown. 
+# If they cannot be assigned, the closest fit on different genes with +# model predictions could be a criterion for selection # %% [markdown] # ## Create dumps of intensities in `peptides.txt` @@ -447,7 +450,7 @@ # ## Create dumps per gene # %% [markdown] -# Some hundred peptides map to more than two genes +# Some hundred peptides map to more than two genes # %% seq_max_score_max_intensity[mq_col.GENE_NAMES].str.split(";" @@ -465,7 +468,7 @@ # - multiple genes: # - select first and add reference in others # - split and dump repeatedly -# +# # Load fasta-file information # %% @@ -497,7 +500,8 @@ # %% mask = mq_output.peptides[mq_col.LEADING_RAZOR_PROTEIN].isin(set_proteins_to_remove) -mq_output.peptides.loc[mask, 'Potential contaminant'].value_counts() # ToDo: Remove potential contaminants, check evidence.txt +# ToDo: Remove potential contaminants, check evidence.txt +mq_output.peptides.loc[mask, 'Potential contaminant'].value_counts() # %% [markdown] # ### `id_map`: Find genes based on fasta file @@ -597,7 +601,7 @@ # Does a group of peptide only assigns unique set of genes? Genes can have more than one protein. # - first build groups # - then see matches (see further below) -# +# # %% peptides_with_single_gene = mq.get_peptides_with_single_gene( @@ -619,9 +623,8 @@ peptides_with_single_gene.loc[mask] # %% -_mask_con = peptides_with_single_gene.loc[mask, mq_col.PROTEINS].str.split(";" - ).apply(lambda x: [True if "CON_" in item else False for item in x] - ).apply(all) +_mask_con = peptides_with_single_gene.loc[mask, mq_col.PROTEINS].str.split(";").apply( + lambda x: [True if "CON_" in item else False for item in x]).apply(all) assert _mask_con.sum() == 0, "There are peptides resulting only from possible confounders: {}".format( ", ".join(str(x) for x in peptides_with_single_gene.loc[mask, mq_col.PROTEINS].loc[_mask_con].index)) @@ -650,7 +653,7 @@ set_of_proteins # %% -gene_data[mq_col.PROTEINS].value_counts() # combine? select first in case of a CON_ as leading razor protein? +gene_data[mq_col.PROTEINS].value_counts() # combine? select first in case of a CON_ as leading razor protein? 
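`mq.get_peptides_with_single_gene` is a project helper; the sketch below shows, on toy data, the kind of filtering it presumably performs: keep peptides whose `Gene names` entry resolves to exactly one gene, then group them per gene as a basis for the per-gene dumps. Column names and values here are illustrative only:

```python
import pandas as pd

# Toy peptides-like table; the real notebook works on MaxQuant output tables.
peptides = pd.DataFrame({
    'Sequence': ['AAAAK', 'CCCCR', 'DDDDK', 'EEEER'],
    'Gene names': ['ACTB', 'ACTB;ACTG1', 'ALB', None],
    'Intensity': [1.2e6, 3.4e6, 5.6e6, 7.8e6],
}).set_index('Sequence')

# number of genes assigned to each peptide (NaN for unassigned peptides)
n_genes = peptides['Gene names'].str.split(';').str.len()

# keep only peptides that map unambiguously to a single gene
single_gene = peptides.loc[n_genes == 1]

# one intensity vector per gene
for gene, gene_data in single_gene.groupby('Gene names'):
    print(gene, gene_data['Intensity'].to_dict())
```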
# %% protein_id = set_of_proteins.pop() @@ -678,7 +681,7 @@ peps_in_data = gene_data.index mq.calculate_completness_for_sample( - peps_exact_cleaved=peps_exact_cleaved, + peps_exact_cleaved=peps_exact_cleaved, peps_in_data=peps_in_data) # %% [markdown] @@ -800,18 +803,26 @@ def __repr__(self): # #### Descriptics # %% -s_completeness = pd.Series(completeness_per_gene, name='completenes_by_gene') +s_completeness = pd.Series(completeness_per_gene, name='completenes_by_gene') s_completeness.describe() # %% N_BINS = 20 -ax = s_completeness.plot(kind='hist', - bins=N_BINS, - xticks=[x/100 for x in range(0, 101, 5)], - figsize=(10, 5), - rot=90, - title=f"Frequency of proportion of observed exact peptides (completness) per razor protein from 0 to 1 in {N_BINS} bins" - f"\nin sample {mq_output.folder.stem}") +ax = s_completeness.plot( + kind='hist', + bins=N_BINS, + xticks=[ + x / + 100 for x in range( + 0, + 101, + 5)], + figsize=( + 10, + 5), + rot=90, + title=f"Frequency of proportion of observed exact peptides (completness) per razor protein from 0 to 1 in {N_BINS} bins" + f"\nin sample {mq_output.folder.stem}") _ = ax.set_xlabel( "Proportion of exactly observed peptides (including up to 2 mis-cleavages)") diff --git a/project/erda_data_available.ipynb b/project/erda_data_available.ipynb index c9e9f2283..8213b7ae1 100644 --- a/project/erda_data_available.ipynb +++ b/project/erda_data_available.ipynb @@ -7,6 +7,7 @@ "metadata": {}, "outputs": [], "source": [ + "from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n", "import logging\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", @@ -18,7 +19,6 @@ "from vaep.logging import setup_nb_logger\n", "setup_nb_logger(level=logging.INFO)\n", "\n", - "from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n", "\n", "FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES" ] @@ -60,7 +60,7 @@ "outputs": [], "source": [ "peptide_counts = peptide_counter.get_df_counts()\n", - "# peptide_counts.index += 1 \n", + "# peptide_counts.index += 1\n", "peptide_counts.head()" ] }, @@ -71,7 +71,7 @@ "metadata": {}, "outputs": [], "source": [ - "peptide_counts.describe(percentiles=np.linspace(0.1,1,10))" + "peptide_counts.describe(percentiles=np.linspace(0.1, 1, 10))" ] }, { @@ -165,7 +165,7 @@ "source": [ "gene_counter = data_objects.GeneCounter(FNAME_C_GENES)\n", "gene_count = gene_counter.get_df_counts()\n", - "gene_count.head() # remove NaN entry" + "gene_count.head() # remove NaN entry" ] }, { @@ -176,7 +176,7 @@ "outputs": [], "source": [ "gene_count = gene_count.iloc[1:]\n", - "gene_count.head() " + "gene_count.head()" ] }, { @@ -186,7 +186,7 @@ "metadata": {}, "outputs": [], "source": [ - "ax = gene_counter.plot_counts(df_counts=gene_count) # provide manuelly manipulated gene counts" + "ax = gene_counter.plot_counts(df_counts=gene_count) # provide manuelly manipulated gene counts" ] } ], diff --git a/project/erda_data_available.py b/project/erda_data_available.py index 81d787108..f7decce5c 100644 --- a/project/erda_data_available.py +++ b/project/erda_data_available.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: vaep # language: python @@ -13,6 +13,7 @@ # --- # %% +from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES import logging import matplotlib import matplotlib.pyplot as plt @@ -24,7 +25,6 @@ from vaep.logging import setup_nb_logger 
setup_nb_logger(level=logging.INFO) -from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES @@ -40,11 +40,11 @@ # %% peptide_counts = peptide_counter.get_df_counts() -# peptide_counts.index += 1 +# peptide_counts.index += 1 peptide_counts.head() # %% -peptide_counts.describe(percentiles=np.linspace(0.1,1,10)) +peptide_counts.describe(percentiles=np.linspace(0.1, 1, 10)) # %% vaep.plotting.make_large_descriptors() @@ -80,12 +80,12 @@ # %% gene_counter = data_objects.GeneCounter(FNAME_C_GENES) gene_count = gene_counter.get_df_counts() -gene_count.head() # remove NaN entry +gene_count.head() # remove NaN entry # %% gene_count = gene_count.iloc[1:] -gene_count.head() +gene_count.head() # %% -ax = gene_counter.plot_counts(df_counts=gene_count) # provide manuelly manipulated gene counts +ax = gene_counter.plot_counts(df_counts=gene_count) # provide manuelly manipulated gene counts diff --git a/project/misc_FASTA_data_agg_by_gene.ipynb b/project/misc_FASTA_data_agg_by_gene.ipynb index c4733a5c7..fc31a5697 100644 --- a/project/misc_FASTA_data_agg_by_gene.ipynb +++ b/project/misc_FASTA_data_agg_by_gene.ipynb @@ -14,7 +14,9 @@ "outputs": [], "source": [ "from collections import defaultdict\n", + "import itertools\n", "import json\n", + "from pprint import pprint\n", "from tqdm.notebook import tqdm\n", "\n", "import numpy as np\n", @@ -33,7 +35,7 @@ "outputs": [], "source": [ "with open(FN_FASTA_DB) as f:\n", - " data_fasta = json.load(f)#, indent=4, sort_keys=False)\n", + " data_fasta = json.load(f) # , indent=4, sort_keys=False)\n", "len(data_fasta)" ] }, @@ -61,7 +63,7 @@ "metadata": {}, "outputs": [], "source": [ - "gene = 'ACTG1' # Actin as a contaminant protein\n", + "gene = 'ACTG1' # Actin as a contaminant protein\n", "gene_isotopes[gene]" ] }, @@ -71,7 +73,6 @@ "metadata": {}, "outputs": [], "source": [ - "from pprint import pprint\n", "for isotope in gene_isotopes[gene]:\n", " pprint(data_fasta[isotope])" ] @@ -129,7 +130,8 @@ "metadata": {}, "outputs": [], "source": [ - "alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3I0']) # Identical? Maybe check if this is more than once the case?\n", + "# Identical? 
Maybe check if this is more than once the case?\n", + "alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3I0'])\n", "for alignment in alignments:\n", " print(alignment)" ] @@ -149,7 +151,7 @@ "metadata": {}, "outputs": [], "source": [ - "alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3R2']) # Identical?\n", + "alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3R2']) # Identical?\n", "for alignment in alignments:\n", " print(alignment)\n", " break" @@ -161,7 +163,7 @@ "metadata": {}, "outputs": [], "source": [ - "alignments = aligner.align(sequences.loc['P63261'], sequences.loc['K7EM38']) # Identical?\n", + "alignments = aligner.align(sequences.loc['P63261'], sequences.loc['K7EM38']) # Identical?\n", "for alignment in alignments:\n", " print(alignment)\n", " break" @@ -180,13 +182,12 @@ "metadata": {}, "outputs": [], "source": [ - "import itertools\n", "peptides = {}\n", "for isotope in gene_isotopes[gene]:\n", " sequences[isotope] = data_fasta[isotope][fasta_keys.peptides][0]\n", "\n", "for peptides in itertools.zip_longest(*sequences.values, fillvalue=''):\n", - " if len(set(peptides)) == 1: \n", + " if len(set(peptides)) == 1:\n", " print(f'all identical: {peptides[0]}')\n", " else:\n", " print('\\t'.join(peptides))" @@ -199,7 +200,7 @@ "outputs": [], "source": [ "for j, peptides in enumerate(sequences.values):\n", - " if j==0:\n", + " if j == 0:\n", " set_overlap = set(peptides)\n", " else:\n", " set_overlap = set_overlap.intersection(peptides)\n", diff --git a/project/misc_FASTA_data_agg_by_gene.py b/project/misc_FASTA_data_agg_by_gene.py index 60d7a8888..072b02b90 100644 --- a/project/misc_FASTA_data_agg_by_gene.py +++ b/project/misc_FASTA_data_agg_by_gene.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.0 +# jupytext_version: 1.15.0 # kernelspec: # display_name: vaep # language: python @@ -17,7 +17,9 @@ # %% from collections import defaultdict +import itertools import json +from pprint import pprint from tqdm.notebook import tqdm import numpy as np @@ -30,7 +32,7 @@ # %% with open(FN_FASTA_DB) as f: - data_fasta = json.load(f)#, indent=4, sort_keys=False) + data_fasta = json.load(f) # , indent=4, sort_keys=False) len(data_fasta) # %% @@ -46,11 +48,10 @@ print(f"#{len(protein_wo_gene)} proteins have not gene associated: {', '.join(protein_wo_gene[:10])}, ...") # %% -gene = 'ACTG1' # Actin as a contaminant protein +gene = 'ACTG1' # Actin as a contaminant protein gene_isotopes[gene] # %% -from pprint import pprint for isotope in gene_isotopes[gene]: pprint(data_fasta[isotope]) @@ -74,7 +75,8 @@ aligner = Align.PairwiseAligner() # %% -alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3I0']) # Identical? Maybe check if this is more than once the case? +# Identical? Maybe check if this is more than once the case? +alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3I0']) for alignment in alignments: print(alignment) @@ -82,13 +84,13 @@ data_fasta['I3L1U9'][fasta_keys.seq] == data_fasta['I3L3I0'][fasta_keys.seq] # %% -alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3R2']) # Identical? +alignments = aligner.align(sequences.loc['I3L1U9'], sequences.loc['I3L3R2']) # Identical? for alignment in alignments: print(alignment) break # %% -alignments = aligner.align(sequences.loc['P63261'], sequences.loc['K7EM38']) # Identical? +alignments = aligner.align(sequences.loc['P63261'], sequences.loc['K7EM38']) # Identical? 
for alignment in alignments: print(alignment) break @@ -97,20 +99,19 @@ # ## Unique Peptides # %% -import itertools peptides = {} for isotope in gene_isotopes[gene]: sequences[isotope] = data_fasta[isotope][fasta_keys.peptides][0] for peptides in itertools.zip_longest(*sequences.values, fillvalue=''): - if len(set(peptides)) == 1: + if len(set(peptides)) == 1: print(f'all identical: {peptides[0]}') else: print('\t'.join(peptides)) # %% for j, peptides in enumerate(sequences.values): - if j==0: + if j == 0: set_overlap = set(peptides) else: set_overlap = set_overlap.intersection(peptides) diff --git a/project/misc_FASTA_tryptic_digest.ipynb b/project/misc_FASTA_tryptic_digest.ipynb index a2d970059..a45249d10 100644 --- a/project/misc_FASTA_tryptic_digest.ipynb +++ b/project/misc_FASTA_tryptic_digest.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "# Process FASTA files\n", - "> uses only the provided fasta files in `src.config.py` by `FOLDER_FASTA` \n", + "> uses only the provided fasta files in `src.config.py` by `FOLDER_FASTA`\n", "\n", "- create theoretically considered peptides considered by search engines\n", "- dump results as human readable json to `FN_FASTA_DB` file specifed in src.config.\n", @@ -19,6 +19,7 @@ "metadata": {}, "outputs": [], "source": [ + "\n", "from collections import defaultdict, namedtuple\n", "import os\n", "import json\n", @@ -41,6 +42,7 @@ "from vaep.fasta import cleave_to_tryptic\n", "from vaep.fasta import iterFlatten\n", "from vaep.fasta import count_peptide_matches\n", + "from vaep.fasta import read_fasta\n", "from vaep.io import search_files\n", "from vaep.pandas import combine_value_counts\n", "from vaep.databases.uniprot import query_uniprot_id_mapping\n", @@ -58,7 +60,9 @@ "from config import FIGUREFOLDER\n", "from config import FN_ID_MAP\n", "from config import FN_PROT_GENE_MAP\n", - "from config import FN_PEP_TO_PROT" + "from config import FN_PEP_TO_PROT\n", + "\n", + "from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_GENE_NAME, KEY_PEPTIDES" ] }, { @@ -80,7 +84,10 @@ "test_data = {\n", " \"meta\": \">tr|A0A024R1R8|A0A024R1R8_HUMAN HCG2014768, isoform CRA_a OS=Homo sapiens OX=9606 GN=hCG_2014768 PE=4 SV=1\",\n", " \"seq\": \"MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAKVVGKGPLATGGIKKSGKK\",\n", - " \"peptides\": [\"MSSHEGGK\", \"EMDEEEK\", \"GPLATGGIK\"],\n", + " \"peptides\": [\n", + " \"MSSHEGGK\",\n", + " \"EMDEEEK\",\n", + " \"GPLATGGIK\"],\n", "}" ] }, @@ -110,7 +117,7 @@ "\n", "- map peptide set of peptides (how to deal with mis-cleavages?)\n", " - mis-cleavages can happen both to the peptide before and after.\n", - " > `pep1, pep2, pep3, pep4, pep5` \n", + " > `pep1, pep2, pep3, pep4, pep5`\n", " > `pep1pep2, pep2pep3, pep3pep4, pep4pep5`\n", " - sliding windows can pass trough the list of peptides - should work with recursion" ] @@ -129,7 +136,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`add_rxk` should add pattern of starting R and trailing K ? " + "`add_rxk` should add pattern of starting R and trailing K ?" ] }, { @@ -203,7 +210,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "rdx peptides are a subset of two missed cleavage sites peptides. There are omitted when two and more cleavage site can be skipped." + "rdx peptides are a subset of two missed cleavage sites peptides. There\n", + "are omitted when two and more cleavage site can be skipped." ] }, { @@ -223,7 +231,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Data Structure is no a list of list. Maybe this could be improved. 
Information what kind of type the peptide is from, is still interesting." + "Data Structure is no a list of list. Maybe this could be improved.\n", + "Information what kind of type the peptide is from, is still interesting." ] }, { @@ -252,9 +261,9 @@ "source": [ "### Define Setup\n", "\n", - "Set input FASTA, Output .txt name, lower legth cutoff, missed cleavages and if to report reverse. \n", + "Set input FASTA, Output .txt name, lower legth cutoff, missed cleavages and if to report reverse.\n", "\n", - "Tryptic digest of Fastas to Peptides >6 in list for matching with measured peptides " + "Tryptic digest of Fastas to Peptides >6 in list for matching with measured peptides" ] }, { @@ -313,9 +322,7 @@ "metadata": {}, "outputs": [], "source": [ - "from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_GENE_NAME, KEY_PEPTIDES\n", "\n", - "from vaep.fasta import read_fasta\n", "\n", "data_fasta = {}\n", "\n", @@ -328,7 +335,7 @@ "# }\n", "# # or dataclass\n", "# from dataclasses import make_dataclass\n", - "# FastaEntry = make_dataclass(cls_name='FastaEntry', \n", + "# FastaEntry = make_dataclass(cls_name='FastaEntry',\n", "# fields=[\n", "# (KEY_FASTA_HEADER, 'str'),\n", "# (KEY_GENE_NAME, 'str'),\n", @@ -431,6 +438,7 @@ "source": [ "test_series = pd.Series({\"A\": 4, \"B\": 1, \"C\": 0, \"D\": 4})\n", "\n", + "\n", "def get_indices_with_value(s: pd.Series, value):\n", " \"\"\"Return indices for with the value is true\"\"\"\n", " return s[s == value].index\n", @@ -443,7 +451,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Boolean Indexing, remember to set [parantheses](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing)" + "Boolean Indexing, remember to set\n", + "[parantheses](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing)" ] }, { @@ -577,7 +586,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Possible to join \"isoforms\" by joining all variants to one. Isoforms are numbered from the second on by appending `-i` for $i>1$, i.e. starting with `-2`. The gene name of which the protein (isoform) originate can be obtained by using [id mapping](https://www.uniprot.org/help/api_idmapping). Isoforms are not mapped automatically by Uniprot to its GENENAME, i.e. you have to strip all `-i`, e.g `-2`, `-3`, for querying. Here the protein, gene pairs are mapped to the unique protein identifiers." + "Possible to join \"isoforms\" by joining all variants to one. Isoforms are\n", + "numbered from the second on by appending `-i` for $i>1$, i.e. starting\n", + "with `-2`. The gene name of which the protein (isoform) originate can be\n", + "obtained by using [id\n", + "mapping](https://www.uniprot.org/help/api_idmapping). Isoforms are not\n", + "mapped automatically by Uniprot to its GENENAME, i.e. you have to strip\n", + "all `-i`, e.g `-2`, `-3`, for querying. Here the protein, gene pairs are\n", + "mapped to the unique protein identifiers." 
] }, { @@ -722,7 +738,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Add gene names from UniProt to `id_map` DataFrame by an outer join (keeping all information based on the protein names shared by isotopes)" + "Add gene names from UniProt to `id_map` DataFrame by an outer join\n", + "(keeping all information based on the protein names shared by isotopes)" ] }, { @@ -759,10 +776,10 @@ "outputs": [], "source": [ "genes_fasta_offline = pd.DataFrame(\n", - " ((_key, _data[KEY_GENE_NAME]) for _key, _data in data_fasta.items()),\n", - " columns=[\"prot_id\", \"gene_fasta\"],\n", - " ).set_index(\"prot_id\"\n", - " ).replace('', np.nan)\n", + " ((_key, _data[KEY_GENE_NAME]) for _key, _data in data_fasta.items()),\n", + " columns=[\"prot_id\", \"gene_fasta\"],\n", + ").set_index(\"prot_id\"\n", + " ).replace('', np.nan)\n", "genes_fasta_offline.loc[genes_fasta_offline.gene_fasta.isna()]" ] }, @@ -795,7 +812,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using the genes from the fasta file header reduces the number of missing genes, but additionally other differences arise in the comparison to the lastest version." + "Using the genes from the fasta file header reduces the number of missing\n", + "genes, but additionally other differences arise in the comparison to the\n", + "lastest version." ] }, { @@ -846,7 +865,8 @@ "source": [ "### Isotopes mapping\n", "\n", - "Isotopes are mapped now to a protein with the same name. The same can be achieved by just discarding everything behind the hypen `-`" + "Isotopes are mapped now to a protein with the same name. The same can be\n", + "achieved by just discarding everything behind the hypen `-`" ] }, { @@ -928,7 +948,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Map peptide to either identifier, common protein or gene \n" + "### Map peptide to either identifier, common protein or gene\n" ] }, { @@ -980,12 +1000,14 @@ "source": [ "### Plot histograms for different levels of abstraction\n", "\n", - "Plot counts of matched \n", + "Plot counts of matched\n", " 1. protein IDs\n", " 2. proteins (joining isoforms)\n", " 3. genes\n", - " \n", - "to their peptides. See how many unique peptides exist. The number of peptides should stay the same, so the counts do not have to be normalized." + "\n", + "to their peptides. See how many unique peptides exist. The number of\n", + "peptides should stay the same, so the counts do not have to be\n", + "normalized." 
] }, { @@ -1114,7 +1136,7 @@ "ax.set_ylabel(\"peptide counts\")\n", "ax.set_xlabel(\"number of matched levels\")\n", "# ax.yaxis.set_major_formatter(\"{x:,}\")\n", - "_y_ticks = ax.set_yticks(list(range(0, 3_500_000, 500_000))) # is there a ways to transform float to int in matplotlib?\n", + "_y_ticks = ax.set_yticks(list(range(0, 3_500_000, 500_000))) # is there a ways to transform float to int in matplotlib?\n", "_y_ticks_labels = ax.set_yticklabels([f\"{x:,}\" for x in range(0, 3_500_000, 500_000)])\n", "\n", "_savefig(fig, folder=\"figures\", name=\"fasta_top4\")" @@ -1147,7 +1169,7 @@ "\n", "axes = axes.reshape((2, 2))\n", "\n", - "pad = 5 # in point\n", + "pad = 5 # in point\n", "for i in range(2):\n", " axes[-1, i].set_xlabel(\"Count of number of matches for a peptide\")\n", " axes[i, 0].set_ylabel(\"number of peptides\")\n", diff --git a/project/misc_FASTA_tryptic_digest.py b/project/misc_FASTA_tryptic_digest.py index 80dde87eb..58a49f179 100644 --- a/project/misc_FASTA_tryptic_digest.py +++ b/project/misc_FASTA_tryptic_digest.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.0 +# jupytext_version: 1.15.0 # kernelspec: # display_name: vaep # language: python @@ -14,7 +14,7 @@ # %% [markdown] # # Process FASTA files -# > uses only the provided fasta files in `src.config.py` by `FOLDER_FASTA` +# > uses only the provided fasta files in `src.config.py` by `FOLDER_FASTA` # # - create theoretically considered peptides considered by search engines # - dump results as human readable json to `FN_FASTA_DB` file specifed in src.config. @@ -22,6 +22,7 @@ # > Based on notebook received by [Annelaura Bach](https://www.cpr.ku.dk/staff/mann-group/?pure=en/persons/443836) and created by Johannes B. Müller \[[scholar](https://scholar.google.com/citations?user=Rn1OS8oAAAAJ&hl=de), [MPI Biochemistry](https://www.biochem.mpg.de/person/93696/2253)\] # %% + from collections import defaultdict, namedtuple import os import json @@ -38,6 +39,7 @@ from vaep.fasta import cleave_to_tryptic from vaep.fasta import iterFlatten from vaep.fasta import count_peptide_matches +from vaep.fasta import read_fasta from vaep.io import search_files from vaep.pandas import combine_value_counts from vaep.databases.uniprot import query_uniprot_id_mapping @@ -51,6 +53,8 @@ from config import FN_PROT_GENE_MAP from config import FN_PEP_TO_PROT +from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_GENE_NAME, KEY_PEPTIDES + # %% [markdown] # ## Core Functionality - Example # @@ -61,7 +65,10 @@ test_data = { "meta": ">tr|A0A024R1R8|A0A024R1R8_HUMAN HCG2014768, isoform CRA_a OS=Homo sapiens OX=9606 GN=hCG_2014768 PE=4 SV=1", "seq": "MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAKVVGKGPLATGGIKKSGKK", - "peptides": ["MSSHEGGK", "EMDEEEK", "GPLATGGIK"], + "peptides": [ + "MSSHEGGK", + "EMDEEEK", + "GPLATGGIK"], } # %% [markdown] @@ -77,7 +84,7 @@ # # - map peptide set of peptides (how to deal with mis-cleavages?) # - mis-cleavages can happen both to the peptide before and after. -# > `pep1, pep2, pep3, pep4, pep5` +# > `pep1, pep2, pep3, pep4, pep5` # > `pep1pep2, pep2pep3, pep3pep4, pep4pep5` # - sliding windows can pass trough the list of peptides - should work with recursion @@ -86,7 +93,7 @@ l_peptides # %% [markdown] -# `add_rxk` should add pattern of starting R and trailing K ? +# `add_rxk` should add pattern of starting R and trailing K ? 
# %% last_pep = "" @@ -128,7 +135,8 @@ print("".join(example_peptides_fasta[0]), *example_peptides_fasta, sep="\n") # %% [markdown] -# rdx peptides are a subset of two missed cleavage sites peptides. There are omitted when two and more cleavage site can be skipped. +# rdx peptides are a subset of two missed cleavage sites peptides. There +# are omitted when two and more cleavage site can be skipped. # %% example_peptides_fasta = cleave_to_tryptic( @@ -138,7 +146,8 @@ example_peptides_fasta[-1] # %% [markdown] -# Data Structure is no a list of list. Maybe this could be improved. Information what kind of type the peptide is from, is still interesting. +# Data Structure is no a list of list. Maybe this could be improved. +# Information what kind of type the peptide is from, is still interesting. # %% [markdown] # ## Process Fasta Files @@ -153,9 +162,9 @@ # %% [markdown] # ### Define Setup # -# Set input FASTA, Output .txt name, lower legth cutoff, missed cleavages and if to report reverse. +# Set input FASTA, Output .txt name, lower legth cutoff, missed cleavages and if to report reverse. # -# Tryptic digest of Fastas to Peptides >6 in list for matching with measured peptides +# Tryptic digest of Fastas to Peptides >6 in list for matching with measured peptides # %% CUTOFF_LEN_PEP = 7 @@ -190,9 +199,7 @@ # ### Schema for single fasta entry # %% -from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_GENE_NAME, KEY_PEPTIDES -from vaep.fasta import read_fasta data_fasta = {} @@ -205,7 +212,7 @@ # } # # or dataclass # from dataclasses import make_dataclass -# FastaEntry = make_dataclass(cls_name='FastaEntry', +# FastaEntry = make_dataclass(cls_name='FastaEntry', # fields=[ # (KEY_FASTA_HEADER, 'str'), # (KEY_GENE_NAME, 'str'), @@ -266,6 +273,7 @@ # %% test_series = pd.Series({"A": 4, "B": 1, "C": 0, "D": 4}) + def get_indices_with_value(s: pd.Series, value): """Return indices for with the value is true""" return s[s == value].index @@ -274,7 +282,8 @@ def get_indices_with_value(s: pd.Series, value): get_indices_with_value(test_series, 4) # %% [markdown] -# Boolean Indexing, remember to set [parantheses](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing) +# Boolean Indexing, remember to set +# [parantheses](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing) # %% MIN_AA_IN_SEQ = 10 @@ -348,7 +357,14 @@ def get_indices_with_value(s: pd.Series, value): # ### Proteins' Isoforms # %% [markdown] -# Possible to join "isoforms" by joining all variants to one. Isoforms are numbered from the second on by appending `-i` for $i>1$, i.e. starting with `-2`. The gene name of which the protein (isoform) originate can be obtained by using [id mapping](https://www.uniprot.org/help/api_idmapping). Isoforms are not mapped automatically by Uniprot to its GENENAME, i.e. you have to strip all `-i`, e.g `-2`, `-3`, for querying. Here the protein, gene pairs are mapped to the unique protein identifiers. +# Possible to join "isoforms" by joining all variants to one. Isoforms are +# numbered from the second on by appending `-i` for $i>1$, i.e. starting +# with `-2`. The gene name of which the protein (isoform) originate can be +# obtained by using [id +# mapping](https://www.uniprot.org/help/api_idmapping). Isoforms are not +# mapped automatically by Uniprot to its GENENAME, i.e. you have to strip +# all `-i`, e.g `-2`, `-3`, for querying. Here the protein, gene pairs are +# mapped to the unique protein identifiers. 
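As a concrete illustration of the isoform handling described above, the sketch below strips the `-i` suffixes before querying and maps the result back to every isoform. It relies on `query_uniprot_id_mapping` from `vaep.databases.uniprot` (its diff appears further down), which, per the usage example at the bottom of that module, returns an accession-to-gene dict; the accession list here is purely illustrative.

```python
# Sketch of the stripping step: isoform suffixes ('-2', '-3', ...) are removed
# before querying UniProt, then gene names are mapped back to every isoform.
# query_uniprot_id_mapping is the helper from vaep.databases.uniprot (see its
# diff further down); the accession list is a hypothetical example.
from vaep.databases.uniprot import query_uniprot_id_mapping

prot_ids = ['A0A075B6I0', 'P02768', 'P02768-2']        # hypothetical input
canonical = {p: p.split('-')[0] for p in prot_ids}     # 'P02768-2' -> 'P02768'
genes = query_uniprot_id_mapping(sorted(set(canonical.values())))

# map every isoform back to the gene of its canonical accession
prot_to_gene = {p: genes.get(base) for p, base in canonical.items()}
prot_to_gene
```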
# %% prot_ids = list(data_fasta.keys()) @@ -425,7 +441,8 @@ def get_indices_with_value(s: pd.Series, value): ), f"The number of proteins associated to a gene found on 11.11.2020 was 72471, now it's {len(genes)}" # %% [markdown] -# Add gene names from UniProt to `id_map` DataFrame by an outer join (keeping all information based on the protein names shared by isotopes) +# Add gene names from UniProt to `id_map` DataFrame by an outer join +# (keeping all information based on the protein names shared by isotopes) # %% id_map = id_map.merge(genes, how="outer", left_on="protein", right_index=True) @@ -440,10 +457,10 @@ def get_indices_with_value(s: pd.Series, value): # %% genes_fasta_offline = pd.DataFrame( - ((_key, _data[KEY_GENE_NAME]) for _key, _data in data_fasta.items()), - columns=["prot_id", "gene_fasta"], - ).set_index("prot_id" - ).replace('', np.nan) + ((_key, _data[KEY_GENE_NAME]) for _key, _data in data_fasta.items()), + columns=["prot_id", "gene_fasta"], +).set_index("prot_id" + ).replace('', np.nan) genes_fasta_offline.loc[genes_fasta_offline.gene_fasta.isna()] # %% @@ -460,7 +477,9 @@ def get_indices_with_value(s: pd.Series, value): id_map.loc[mask_no_gene] # %% [markdown] -# Using the genes from the fasta file header reduces the number of missing genes, but additionally other differences arise in the comparison to the lastest version. +# Using the genes from the fasta file header reduces the number of missing +# genes, but additionally other differences arise in the comparison to the +# lastest version. # %% mask_gene_diffs = id_map.gene != id_map.gene_fasta @@ -483,7 +502,8 @@ def get_indices_with_value(s: pd.Series, value): # %% [markdown] # ### Isotopes mapping # -# Isotopes are mapped now to a protein with the same name. The same can be achieved by just discarding everything behind the hypen `-` +# Isotopes are mapped now to a protein with the same name. The same can be +# achieved by just discarding everything behind the hypen `-` # %% id_map.loc[id_map.index.str.contains("-")] @@ -523,7 +543,7 @@ def get_indices_with_value(s: pd.Series, value): f"Proteins are mapped to a total number of genes of {len(set(dict_protein_to_gene.values()))}" # %% [markdown] -# ### Map peptide to either identifier, common protein or gene +# ### Map peptide to either identifier, common protein or gene # # %% @@ -548,12 +568,14 @@ def get_indices_with_value(s: pd.Series, value): # %% [markdown] # ### Plot histograms for different levels of abstraction # -# Plot counts of matched +# Plot counts of matched # 1. protein IDs # 2. proteins (joining isoforms) # 3. genes -# -# to their peptides. See how many unique peptides exist. The number of peptides should stay the same, so the counts do not have to be normalized. +# +# to their peptides. See how many unique peptides exist. The number of +# peptides should stay the same, so the counts do not have to be +# normalized. # %% USE_OFFLINE_FASTA_GENES = True @@ -610,7 +632,7 @@ def get_indices_with_value(s: pd.Series, value): ax.set_ylabel("peptide counts") ax.set_xlabel("number of matched levels") # ax.yaxis.set_major_formatter("{x:,}") -_y_ticks = ax.set_yticks(list(range(0, 3_500_000, 500_000))) # is there a ways to transform float to int in matplotlib? +_y_ticks = ax.set_yticks(list(range(0, 3_500_000, 500_000))) # is there a ways to transform float to int in matplotlib? 
_y_ticks_labels = ax.set_yticklabels([f"{x:,}" for x in range(0, 3_500_000, 500_000)]) _savefig(fig, folder="figures", name="fasta_top4") @@ -637,7 +659,7 @@ def get_indices_with_value(s: pd.Series, value): axes = axes.reshape((2, 2)) -pad = 5 # in point +pad = 5 # in point for i in range(2): axes[-1, i].set_xlabel("Count of number of matches for a peptide") axes[i, 0].set_ylabel("number of peptides") diff --git a/project/misc_id_mapper.py b/project/misc_id_mapper.py new file mode 100644 index 000000000..944d938bc --- /dev/null +++ b/project/misc_id_mapper.py @@ -0,0 +1,458 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.0 +# kernelspec: +# display_name: vaep +# language: python +# name: vaep +# --- + +# %% [markdown] +# # Analyse peptides +# +# ## Specification +# - access different levels of peptides easily +# - select training data per gene easily +# + +# %% +import json +import logging +logging.basicConfig(level=logging.INFO) # configures root logger +logger = logging.getLogger() +logger.info("test") + +# %% +import pandas as pd +from config import FN_FASTA_DB, FN_ID_MAP, FN_PEPTIDE_INTENSITIES + +# %% +id_map = pd.read_json(FN_ID_MAP, orient="split") + +mask_no_gene = id_map.gene.isna() +id_map.loc[mask_no_gene, "gene"] = "-" + + +with open(FN_FASTA_DB) as f: + data_fasta = json.load(f) + +# %% +data_peptides = pd.read_pickle(FN_PEPTIDE_INTENSITIES) + +# %% +set_peptides = set(data_peptides.columns) + +# %% [markdown] +# - switch between list of proteins with any support and non +# - set threshold of number of peptides per protein over all samples (some peptides uniquely matched to one protein in on sample is just noise -> check razor peptides) +# - show support + +# %% +from collections import defaultdict +import ipywidgets as w +from config import KEY_FASTA_HEADER, KEY_FASTA_SEQ, KEY_PEPTIDES, KEY_GENE_NAME, KEY_GENE_NAME_FASTA + +TGREEN = "\033[32m" # Green Text +RESET = "\033[0;0m" + +w_first_letter = w.Dropdown( + options=id_map[KEY_GENE_NAME_FASTA].str[0].unique()) +w_genes = w.Dropdown( + options=id_map.gene.loc[id_map[KEY_GENE_NAME_FASTA].str[0] == w_first_letter.value].unique(), + value='ACTB' +) + +mask = id_map.gene == w_genes.value +selected = id_map.loc[mask, "protein"] + + +w_proteins_ids = w.Dropdown(options=selected.index) +w_protein = w.Dropdown(options=selected.unique()) + + +def update_gene_list(first_letter): + """Update proteins when new gene is selected""" + mask_selected_genes = id_map[KEY_GENE_NAME_FASTA].str[0] == w_first_letter.value + w_genes.options = id_map.gene.loc[mask_selected_genes].unique() + + +_ = w.interactive_output(update_gene_list, {"first_letter": w_first_letter}) + + +def update_protein_list(gene): + mask = id_map[KEY_GENE_NAME_FASTA] == gene + selected = id_map.loc[mask, "protein"] + w_protein.options = selected.unique() +# w_proteins_ids.options = selected.loc[selected == w_protein.value].index + + +_ = w.interactive_output(update_protein_list, {"gene": w_genes}) + + +def update_protein_id_list(protein): + """Update isotope list when protein is selected""" + mask = id_map.protein == w_protein.value + selected = id_map.protein.loc[mask] + w_proteins_ids.options = selected.index + +_ = w.interactive_output(update_protein_id_list, {'protein': w_protein}) + +d_peptides_observed_prot_id = defaultdict(list) + +def show_sequences(prot_id): + _data = data_fasta[prot_id] + print(f"Protein_ID on Uniport: {prot_id}") + print(f"HEADER: 
{_data[KEY_FASTA_HEADER]}") +# print(f"Seq : {_data[KEY_FASTA_SEQ]}") + annotate_seq = "Peptides: " + global d_peptides_observed_prot_id + for i, _l in enumerate(_data[KEY_PEPTIDES]): + annotate_seq += f"\nNo. of missed K or R: {i}" + prot_seq_annotated = _data[KEY_FASTA_SEQ] + for j, _pep in enumerate(_l): + if _pep in set_peptides: + d_peptides_observed_prot_id[prot_id].append(_pep) + _pep_in_green = TGREEN + f"{_pep}" + RESET + prot_seq_annotated = prot_seq_annotated.replace(_pep, _pep_in_green) + _pep = _pep_in_green + if j==0: + annotate_seq += "\n\t" + _pep + else: + annotate_seq += ",\n\t" + _pep + print(f"Seq {i}: {prot_seq_annotated}") + print(annotate_seq) + + + display(data_peptides[d_peptides_observed_prot_id[prot_id]].dropna(how='all')) + +w_out = w.interactive_output(show_sequences, {"prot_id": w_proteins_ids}) + +label_first_letter = w.Label(value='First letter of Gene') +label_genes = w.Label('Gene') +label_protein = w.Label('Protein') +label_proteins_ids = w.Label('Protein Isotopes') + +panel_levels = w.VBox([ + w.HBox([ + w.VBox([label_first_letter, w_first_letter]), + w.VBox([label_genes, w_genes]), + w.VBox([label_protein, w_protein]), + w.VBox([label_proteins_ids, w_proteins_ids]) + ]), + w_out] +) +panel_levels + +# %% [markdown] +# - relatively short peptides resulting from one missed cleaveage, do not appear in the upper part. + +# %% [markdown] +# - `gene` `->` `Protein_ID` (contains information of `gene` `->` `protein_isotopes` +# - `protein_ID` `->` `sequences` (`FN_FASTA_DB`) + +# %% +import pickle +from tqdm.notebook import tqdm +from config import FN_PROTEIN_SUPPORT_MAP, FN_PROTEIN_SUPPORT_FREQ +try: + df_protein_support = pd.read_pickle(FN_PROTEIN_SUPPORT_MAP) + with open(FN_PROTEIN_SUPPORT_FREQ, 'rb') as f: + d_protein_support_freq = pickle.load(f) +except FileNotFoundError: + from vaep.utils import sample_iterable + d_protein_support = {} + d_protein_support_freq = {} + for prot_id in tqdm(data_fasta.keys()): + _data = data_fasta[prot_id] + peptides_measured = [] + for i, _l in enumerate(_data[KEY_PEPTIDES]): + for _pep in _l: + if _pep in set_peptides: + peptides_measured.append(_pep) + _d_protein_support = {} + _df_support_protein = data_peptides[peptides_measured].dropna(how='all') + + _n_samples = len(_df_support_protein) + if _n_samples > 0: + _d_protein_support['N_samples'] = _n_samples + d_protein_support_freq[prot_id] = _df_support_protein.notna().sum().to_dict() + d_protein_support[prot_id] = _d_protein_support + else: + d_protein_support[prot_id] = None + + df_protein_support = pd.DataFrame(d_protein_support).T.dropna() + df_protein_support = df_protein_support.join(id_map) + df_protein_support.to_pickle(FN_PROTEIN_SUPPORT_MAP) + + with open(FN_PROTEIN_SUPPORT_FREQ, 'wb') as f: + pickle.dump(d_protein_support_freq, f) + +# %% +l_proteins_good_support = df_protein_support.sort_values(by='N_samples').tail(100).index.to_list() + +# %% +d_protein_support_freq['I3L3I0'] + +# %% [markdown] +# ## Connect to experimental peptide data +# +# Check if counts by `data_fasta`. 
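Before the full loop over `data_fasta` in the next cell, a single-protein version makes the check concrete: count, per missed-cleavage level, how many of the theoretical peptides were actually measured. This is a minimal sketch using the `data_fasta`, `KEY_PEPTIDES` and `set_peptides` objects loaded earlier in this notebook; the protein id `'I3L3I0'` is reused only as an illustrative key.

```python
# Sketch: for one protein, count how many theoretical peptides per
# missed-cleavage level are present in the measured peptide set.
_peptides_by_level = data_fasta['I3L3I0'][KEY_PEPTIDES]   # list of lists
observed_per_level = {
    level: sum(pep in set_peptides for pep in peps)
    for level, peps in enumerate(_peptides_by_level)
}
observed_per_level  # e.g. {0: n0, 1: n1, 2: n2}
```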
+ +# %% +from tqdm.notebook import tqdm + +counts_observed_by_missed_cleavages = {} +for _protein_id, _data in tqdm(data_fasta.items()): + _peptides = _data[KEY_PEPTIDES] + _counts = {} + for i, _l in enumerate(_peptides): + _counts[i] = 0 + for _pep in _l: + if _pep in set_peptides: + _counts[i] += 1 + counts_observed_by_missed_cleavages[_protein_id] = _counts + +# %% +df_counts_observed_by_missed_cleavages = pd.DataFrame( + counts_observed_by_missed_cleavages +).T + +# %% +import matplotlib.pyplot as plt +from matplotlib import table + +fig, axes = plt.subplots(ncols=2, gridspec_kw={"width_ratios": [5, 1], "wspace": 0.2}, figsize=(10,4)) + +_counts_summed = df_counts_observed_by_missed_cleavages.sum() +_counts_summed.name = "frequency" + +ax = axes[0] +_ = _counts_summed.plot(kind="bar", ax=ax) +ax.set_xlabel("peptides from n miscleavages") +ax.set_ylabel("frequency") + +ax = axes[1] +ax.axis("off") +_ = pd.plotting.table(ax=ax, data=_counts_summed, loc="best", colWidths=[1], edges='open') +_ = fig.suptitle('Peptides frequencies') + +# %% [markdown] +# These are unnormalized counts in the meaning of that _razor_ peptides are counted as often as they are matched. + +# %% +mask = df_counts_observed_by_missed_cleavages != 0 +df_prot_observed = df_counts_observed_by_missed_cleavages.replace(0, pd.NA) + +# %% +df_prot_observed = df_prot_observed.dropna(axis=0, how="all") +df_prot_observed = df_prot_observed.fillna(0) +df_prot_observed = df_prot_observed.convert_dtypes() + +# %% +from vaep.pandas import combine_value_counts + +combine_value_counts(df_prot_observed) + +# %% +freq_pep_mapped_to_protID = df_prot_observed.sum(axis=1).value_counts() +freq_pep_mapped_to_protID = freq_pep_mapped_to_protID.sort_index() + +# %% +freq_pep_mapped_to_protID + +# %% [markdown] +# ### Genes with support in data +# +# try software to identify the _most likely_ protein. OpenMS or russian alternative? + +# %% + +# %% [markdown] +# ## Imputation: Train model +# +# > Select Gene or Protein +# +# As the samples are all obtained from the same biological sample (in principal), the single run should somehow be comparable. +# An description of variablity (from the Data Scientist perspective) can highlight some commenly known facts about proteomics experiments: +# - batch effects: Measurements on consecutive days are have to be normalized to each other +# - scoring: PSM are assigned to a peptide based on a score. Small variations can lead to different assignments +# +# Can a complex representation of a sample level out experimental variation on an in principle comparable data. +# +# ### Strategy +# - first start using peptides from single Protein_IDs +# - then move to all models from genes +# - explore structure + +# %% +import torch + +# %% +d_peptides_observed_prot_id + +# %% +w_select_proteins_good_support = w.Dropdown(options=l_proteins_good_support) +w_select_proteins_queried = w.Dropdown(options=list(d_peptides_observed_prot_id.keys())) +w.HBox( + [ + w.VBox( + [ + w.Label(f"Top {len(l_proteins_good_support)} covered proteins"), + w_select_proteins_good_support, + ] + ), + w.VBox([w.Label("Queried proteins from above"), w_select_proteins_queried]), + ] +) +# select from top100 or above selection + +# %% [markdown] +# Idea: Select a protein which leads to training. Each selection will create a dump of the selected data, which can be used in the `XZY.ipynb` for model fine-tuning. 
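For reference, reading one of these per-protein dumps back in a downstream notebook is a one-liner. A sketch under the assumptions that the same `PROTEIN_DUMPS` config path is importable and that the files stay JSON despite their `.pkl` suffix (they are written via `DataFrame.to_json` in the cells below); the protein id is only an example value.

```python
# Sketch: load a per-protein dump written by the selection step below.
# PROTEIN_DUMPS comes from config; the protein id is illustrative, and the
# files are JSON despite the .pkl extension (written with DataFrame.to_json).
import pandas as pd
from config import PROTEIN_DUMPS

prot_id = 'P00338'  # hypothetical selection
df_selected = pd.read_json(PROTEIN_DUMPS / f"{prot_id}.pkl")
df_selected.head()
```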
+ +# %% +prot_id = w_select_proteins_good_support.value +id_map.loc[prot_id] + +# %% +prot_id = 'P00338' # 'I3L3I0' # w_select_proteins_queried.value # +_protein, _gene, _ = id_map.loc[prot_id] +# _gene_fasta + +# %% +w_first_letter.value = _gene[0] +w_genes.value = _gene +w_protein.value = _protein +w_proteins_ids.value = prot_id + +# %% +peptides_measured = d_peptides_observed_prot_id[prot_id] +n_peptides_in_selection = len(peptides_measured) +print(f"Selected a total of {n_peptides_in_selection} peptides.") + +# %% +data_peptides[peptides_measured].notna().sum(axis=1).value_counts().sort_index() + +# %% +PROP_DATA_COMPLETENESS = 0.75 +mask_samples_selected = data_peptides[peptides_measured].notna().sum(axis=1) >= int(n_peptides_in_selection * 0.75) +print(f"Using a share of at least {PROP_DATA_COMPLETENESS}, i.e. at least {int(n_peptides_in_selection * 0.75)} out of {n_peptides_in_selection}.", + f"In total {mask_samples_selected.sum()} samples.", sep="\n") + +# %% +from config import PROTEIN_DUMPS +_ = data_peptides.loc[mask_samples_selected, peptides_measured] +_.to_json(PROTEIN_DUMPS / f"{prot_id}.pkl") +_ + +# %% +import vaep +from vaep.transform import log + +peptides_selected_log10 = data_peptides.loc[mask_samples_selected, peptides_measured].apply(log) # selected in widget overview above +peptides_selected_log10 + +# %% [markdown] +# > The data to be seen here should be **assigned** peptides. Razor peptides are for now not put to one or the other protein (focus only on unique peptides?). + +# %% [markdown] +# ### Hyperparameters + +# %% +n_samples, n_features = peptides_selected_log10.shape + +# %% +from vaep.models.cmd import parser + +BATCH_SIZE = 16 +EPOCHS = 600 +args = ['--batch-size', str(BATCH_SIZE), '--seed', '43', '--epochs', str(EPOCHS), '--log-interval', str(BATCH_SIZE)] +args = parser.parse_args(args) +args.cuda = not args.no_cuda and torch.cuda.is_available() +args + +# %% +torch.manual_seed(args.seed) +device = torch.device("cuda" if args.cuda else "cpu") +device = torch.device("cpu") + +# %% +# torch.device? + +# %% [markdown] +# ### Dataset and DataLoader +# +# The `torch.utils.data.Dataset` can load data into memory, or just create a mapping to data somewhere to be continously loaded by the `torch.utils.data.DataLoader`. 
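The next cell implements the in-memory variant. Purely for contrast, a lazy counterpart, a rough sketch that is not part of this repository, would keep only file paths and load each sample inside `__getitem__`, so the `DataLoader` streams from disk instead of RAM; the file layout and format assumed here are illustrative.

```python
# Rough sketch of the "mapping to data somewhere" alternative: hold paths only
# and read one sample per __getitem__, letting the DataLoader stream from disk.
# Entirely illustrative; one JSON-serialized intensity vector per file is assumed.
from pathlib import Path
from typing import List

import pandas as pd
import torch
from torch.utils.data import Dataset


class PeptideDatasetOnDisk(Dataset):
    """One intensity vector per file; NaNs mark unobserved peptides."""

    def __init__(self, sample_files: List[Path], fill_na: float = 0.0):
        self.sample_files = list(sample_files)
        self.fill_na = fill_na

    def __len__(self):
        return len(self.sample_files)

    def __getitem__(self, idx):
        s = pd.read_json(self.sample_files[idx], typ='series')
        mask_obs = torch.from_numpy(s.notna().values)
        values = torch.from_numpy(s.fillna(self.fill_na).to_numpy(dtype='float64'))
        return values, mask_obs
```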
+ +# %% +peptide_intensities = peptides_selected_log10 +detection_limit = float(int(peptide_intensities.min().min())) +detection_limit + +# %% +# from vaep.model import PeptideDatasetInMemory + +from torch.utils.data import Dataset +class PeptideDatasetInMemory(Dataset): + """Peptide Dataset fully in memory.""" + + def __init__(self, data: pd.DataFrame, fill_na=0): + self.mask_obs = torch.from_numpy(data.notna().values) + data = data.fillna(fill_na) + self.peptides = torch.from_numpy(data.values) + self.length_ = len(data) + + def __len__(self): + return self.length_ + + def __getitem__(self, idx): + return self.peptides[idx], self.mask_obs[idx] + + +dataset_in_memory = PeptideDatasetInMemory(peptide_intensities.copy(), detection_limit) + +# %% +kwargs = {'num_workers': 1, 'pin_memory': True} if device=='cuda' else {} +train_loader = torch.utils.data.DataLoader( + dataset=dataset_in_memory, + batch_size=args.batch_size, shuffle=True, **kwargs) + +# %% +for i, (data, mask) in enumerate(train_loader): + print("Nummber of samples in mini-batch: {}".format(len(data)), + "\tObject-Type: {}".format(type(mask))) +# print(data) +# print(mask) + break + +# %% +data[~mask] = 0 +plt.imshow(data) + +# %% [markdown] +# create logged information for tensorboard, see tutorial and docs. + +# %% +from datetime import datetime +from torch.utils.tensorboard import SummaryWriter +writer = SummaryWriter(f'runs/{prot_id}_{format(datetime.now(), "%y%m%d_%H%M")}') + +# %% +writer.add_image(f'{len(data)} samples heatmap', data, dataformats='HW') + +# %% +# import importlib; importlib.reload(vaep.model) +from IPython.core.debugger import set_trace + +from torch import optim +from vaep.models.ae import VAE +from vaep.models.ae import loss_function + +model = VAE(n_features=n_features, n_neurons=30).double().to(device) +writer.add_graph(model, input_to_model=data) + +optimizer = optim.Adam(model.parameters(), lr=1e-4) diff --git a/project/misc_illustrations.ipynb b/project/misc_illustrations.ipynb index dd7e6350b..3e09a695d 100644 --- a/project/misc_illustrations.ipynb +++ b/project/misc_illustrations.ipynb @@ -36,9 +36,9 @@ "source": [ "plt.rcParams.update({'xtick.labelsize': 'xx-large',\n", " 'ytick.labelsize': 'xx-large',\n", - " 'axes.titlesize' : 'xx-large',\n", - " 'axes.labelsize' : 'xx-large',\n", - " })\n", + " 'axes.titlesize': 'xx-large',\n", + " 'axes.labelsize': 'xx-large',\n", + " })\n", "# {k:v for k,v in plt.rcParams.items() if 'tick' in k and 'size' in k}" ] }, @@ -63,16 +63,16 @@ "mu = 25.0\n", "stddev = 1.0\n", "\n", - "x = np.linspace(mu -5, mu + 5, num=101)\n", + "x = np.linspace(mu - 5, mu + 5, num=101)\n", "\n", "y_normal = scipy.stats.norm.pdf(x, loc=mu, scale=stddev)\n", "\n", - "mu_shifted = mu - (1.8*stddev)\n", - "stddev_shifted = 0.3*stddev\n", + "mu_shifted = mu - (1.8 * stddev)\n", + "stddev_shifted = 0.3 * stddev\n", "print(f\"Downshifted: {mu_shifted = }, {stddev_shifted = }\")\n", - "y_impute = scipy.stats.norm.pdf(x, loc=mu - (1.8*stddev), scale=0.3*stddev)\n", + "y_impute = scipy.stats.norm.pdf(x, loc=mu - (1.8 * stddev), scale=0.3 * stddev)\n", "\n", - "colors = plt.cm.viridis([0.25,0.75]) \n", + "colors = plt.cm.viridis([0.25, 0.75])\n", "\n", "fig, ax = plt.subplots(1, 1, figsize=(5, 4))\n", "\n", @@ -106,7 +106,8 @@ "\n", "- what does log2 transformation mean for the error\n", "\n", - "If the error is calculated in log2 space, the larger values have to be predicted with higher precision (in comparison to the original space)" + "If the error is calculated in log2 space, the 
larger values have to be\n", + "predicted with higher precision (in comparison to the original space)" ] }, { @@ -115,22 +116,23 @@ "metadata": {}, "outputs": [], "source": [ - "def get_original_error_log2(x:float, error_log2:float):\n", - " return 2 ** (np.log2(x) + error_log2) - x \n", + "def get_original_error_log2(x: float, error_log2: float):\n", + " return 2 ** (np.log2(x) + error_log2) - x\n", + "\n", "\n", "print(\n", " f\"{get_original_error_log2(1e9, 0.5) = :,.1f}\",\n", " f\"{get_original_error_log2(1e8, 0.5) = :,.1f}\",\n", " sep='\\n'\n", - " )" + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If we try to find the rel log2 error equalling the original error, this can be done by \n", - "equating: \n", + "If we try to find the rel log2 error equalling the original error, this can be done by\n", + "equating:\n", "\n", "$$ \\exp(\\ln(a)+e) - a = \\exp(\\ln(a)+e^*) - b $$\n", "\n", @@ -147,12 +149,13 @@ "source": [ "def rel_error(measurment, log_error, other_measurment):\n", " numerator = 2 ** (np.log2(measurment) + log_error)\n", - " numerator-=measurment\n", - " numerator+=other_measurment\n", - " \n", + " numerator -= measurment\n", + " numerator += other_measurment\n", + "\n", " denominator = other_measurment\n", " return np.log2(numerator / denominator)\n", "\n", + "\n", "rel_error = rel_error(1.e9, 0.5, 1e8)\n", "print(f\"{rel_error = :.3f}\")" ] @@ -167,14 +170,14 @@ " f\"0.500 rel to 1e9: {get_original_error_log2(1e9, 0.5) :,.1f}\",\n", " f\"{rel_error:.3f} rel to 1e8: {get_original_error_log2(1e8, rel_error) :,.1f}\",\n", " sep='\\n'\n", - " )" + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "So the relative error of 0.5 for $10^9$ is five times larger for $10^8$ in the logspace, \n", + "So the relative error of 0.5 for $10^9$ is five times larger for $10^8$ in the logspace,\n", "whereas the error in the original space is the same" ] }, diff --git a/project/misc_illustrations.py b/project/misc_illustrations.py index 55328e65d..e63c68278 100644 --- a/project/misc_illustrations.py +++ b/project/misc_illustrations.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.0 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -27,9 +27,9 @@ # %% plt.rcParams.update({'xtick.labelsize': 'xx-large', 'ytick.labelsize': 'xx-large', - 'axes.titlesize' : 'xx-large', - 'axes.labelsize' : 'xx-large', - }) + 'axes.titlesize': 'xx-large', + 'axes.labelsize': 'xx-large', + }) # {k:v for k,v in plt.rcParams.items() if 'tick' in k and 'size' in k} # %% [markdown] @@ -42,16 +42,16 @@ mu = 25.0 stddev = 1.0 -x = np.linspace(mu -5, mu + 5, num=101) +x = np.linspace(mu - 5, mu + 5, num=101) y_normal = scipy.stats.norm.pdf(x, loc=mu, scale=stddev) -mu_shifted = mu - (1.8*stddev) -stddev_shifted = 0.3*stddev +mu_shifted = mu - (1.8 * stddev) +stddev_shifted = 0.3 * stddev print(f"Downshifted: {mu_shifted = }, {stddev_shifted = }") -y_impute = scipy.stats.norm.pdf(x, loc=mu - (1.8*stddev), scale=0.3*stddev) +y_impute = scipy.stats.norm.pdf(x, loc=mu - (1.8 * stddev), scale=0.3 * stddev) -colors = plt.cm.viridis([0.25,0.75]) +colors = plt.cm.viridis([0.25, 0.75]) fig, ax = plt.subplots(1, 1, figsize=(5, 4)) @@ -75,22 +75,24 @@ # # - what does log2 transformation mean for the error # -# If the error is calculated in log2 space, the larger values have to be predicted with higher precision (in comparison to the original space) +# If the error is calculated in log2 space, the 
larger values have to be +# predicted with higher precision (in comparison to the original space) # %% -def get_original_error_log2(x:float, error_log2:float): - return 2 ** (np.log2(x) + error_log2) - x +def get_original_error_log2(x: float, error_log2: float): + return 2 ** (np.log2(x) + error_log2) - x + print( f"{get_original_error_log2(1e9, 0.5) = :,.1f}", f"{get_original_error_log2(1e8, 0.5) = :,.1f}", sep='\n' - ) +) # %% [markdown] -# If we try to find the rel log2 error equalling the original error, this can be done by -# equating: +# If we try to find the rel log2 error equalling the original error, this can be done by +# equating: # # $$ \exp(\ln(a)+e) - a = \exp(\ln(a)+e^*) - b $$ # @@ -101,12 +103,13 @@ def get_original_error_log2(x:float, error_log2:float): # %% def rel_error(measurment, log_error, other_measurment): numerator = 2 ** (np.log2(measurment) + log_error) - numerator-=measurment - numerator+=other_measurment - + numerator -= measurment + numerator += other_measurment + denominator = other_measurment return np.log2(numerator / denominator) + rel_error = rel_error(1.e9, 0.5, 1e8) print(f"{rel_error = :.3f}") @@ -115,10 +118,10 @@ def rel_error(measurment, log_error, other_measurment): f"0.500 rel to 1e9: {get_original_error_log2(1e9, 0.5) :,.1f}", f"{rel_error:.3f} rel to 1e8: {get_original_error_log2(1e8, rel_error) :,.1f}", sep='\n' - ) +) # %% [markdown] -# So the relative error of 0.5 for $10^9$ is five times larger for $10^8$ in the logspace, +# So the relative error of 0.5 for $10^9$ is five times larger for $10^8$ in the logspace, # whereas the error in the original space is the same # %% [markdown] diff --git a/vaep/analyzers/__init__.py b/vaep/analyzers/__init__.py index 4274cfd4d..c50b18b25 100644 --- a/vaep/analyzers/__init__.py +++ b/vaep/analyzers/__init__.py @@ -1,9 +1,10 @@ from types import SimpleNamespace -from . import diff_analysis +from . import diff_analysis from . import compare_predictions __all__ = ['diff_analysis', 'compare_predictions', 'Analysis'] + class Analysis(SimpleNamespace): - pass \ No newline at end of file + pass diff --git a/vaep/analyzers/analyzers.py b/vaep/analyzers/analyzers.py index 3f0609dfa..7dbd95917 100644 --- a/vaep/analyzers/analyzers.py +++ b/vaep/analyzers/analyzers.py @@ -31,22 +31,22 @@ def verify_df(df, - fname, - index_col:str, # could be potentially 0 for the first column - verify_fname: bool = False, - usecols=None, - ): + fname, + index_col: str, # could be potentially 0 for the first column + verify_fname: bool = False, + usecols=None, + ): if usecols and isinstance(index_col, str): assert index_col in usecols, 'Add index_col to usecols Sequence' if verify_fname: if not len(df.shape) == 2: raise ValueError(f"Expected 2 -dimensional array, not {len(df.shape)} -dimensional," - f" of type: {type(df)}") + f" of type: {type(df)}") N, M = df.shape assert f'N{N:05d}' in str(fname) and f'M{M:05d}' in str(fname), \ ("Filename number don't match loaded numbers: " f"{fname} should contain N{N} and M{M}") - + class AnalyzePeptides(SimpleNamespace): """Namespace for current analysis @@ -61,7 +61,7 @@ class AnalyzePeptides(SimpleNamespace): Many more attributes are set dynamically depending on the concrete analysis. 
""" - def __init__(self, data:pd.DataFrame, + def __init__(self, data: pd.DataFrame, is_log_transformed: bool = False, is_wide_format: bool = True, ind_unstack: str = '',): if not is_wide_format: @@ -115,7 +115,7 @@ def from_pickle(cls, fname: str, def get_consecutive_dates(self, n_samples, seed=42): """Select n consecutive samples using a seed. - + Updated the original DataFrame attribute: df """ self.df.sort_index(inplace=True) @@ -138,7 +138,11 @@ def df_long(self): return self._df_long return self.to_long_format(colname_values='intensity', index_name=self.index_col) - def to_long_format(self, colname_values: str = 'intensity', index_name: str = 'Sample ID', inplace: str = False) -> pd.DataFrame: + def to_long_format( + self, + colname_values: str = 'intensity', + index_name: str = 'Sample ID', + inplace: str = False) -> pd.DataFrame: """[summary] Parameters @@ -178,7 +182,11 @@ def to_long_format(self, colname_values: str = 'intensity', index_name: str = 'S def df_wide(self): return self.to_wide_format() - def to_wide_format(self, columns: str = 'Sample ID', name_values: str = 'intensity', inplace: bool = False) -> pd.DataFrame: + def to_wide_format( + self, + columns: str = 'Sample ID', + name_values: str = 'intensity', + inplace: bool = False) -> pd.DataFrame: """[summary] Parameters @@ -197,11 +205,11 @@ def to_wide_format(self, columns: str = 'Sample ID', name_values: str = 'intensi """ """Build wide data view. - + Return df attribute in case this is in wide-format. If df attribute is in long-format this is used. If df is wide, but long-format exist, then the wide format is build. - - + + """ if self.is_wide_format: return self.df @@ -264,15 +272,14 @@ def get_PCA(self, n_components=2, imputer=SimpleImputer): def calculate_PCs(self, new_df, is_wide=True): if not is_wide: - new_df = new_df.unstack(new_df.index.names[1:]) - + new_df = new_df.unstack(new_df.index.names[1:]) + X = self.imputer_.transform(new_df) X = _add_indices(X, new_df) PCs = self.pca_.transform(X) PCs = _add_indices(PCs, new_df, index_only=True) PCs.columns = [f'PC {i+1}' for i in range(PCs.shape[-1])] - return PCs - + return PCs def plot_pca(self,): """Create principal component plot with three heatmaps showing @@ -294,7 +301,8 @@ def plot_pca(self,): self.dim = Dim(*self.df.shape) fig.suptitle( - f'First two Principal Components of {self.dim.M} most abundant peptides \n for {self.dim.N} samples', fontsize=30) + f'First two Principal Components of {self.dim.M} most abundant peptides \n for {self.dim.N} samples', + fontsize=30) # by instrument ax = axes[0] @@ -401,14 +409,14 @@ def _plot(self, fct, meta_key: str, save: bool = True): title=f'{self.model_name} latent space PCA of {self.latent_dim} dimensions by {meta_key}') if save: vaep.plotting._savefig(fig, name=f'{self.model_name}_latent_by_{meta_key}', - folder=self.folder) + folder=self.folder) return fig, ax # def read_csv(fname:str, nrows:int, index_col:str=None)-> pd.DataFrame: # return pd.read_csv(fname, index_col=index_col, low_memory=False, nrows=nrows) -def build_metadata_df(filenames:pd.Index) -> pd.DataFrame: +def build_metadata_df(filenames: pd.Index) -> pd.DataFrame: """Build a DataFrame based on a list of strings (an Index) to parse. Is strongly coupled to the analysis context. @@ -422,17 +430,18 @@ def build_metadata_df(filenames:pd.Index) -> pd.DataFrame: pd.DataFrame A DataFrame with the parsed metadata. 
""" - + d_meta = metadata.get_metadata_from_filenames(filenames) df_meta = pd.DataFrame.from_dict(d_meta, orient='index') df_meta.index.name = filenames.name return df_meta + def get_consecutive_data_indices(df, n_samples): index = df.sort_index().index start_sample = len(index) - n_samples start_sample = random.randint(0, start_sample) - return df.loc[index[start_sample:start_sample+n_samples]] + return df.loc[index[start_sample:start_sample + n_samples]] def corr_lower_triangle(df): @@ -453,13 +462,13 @@ def plot_corr_histogram(corr_lower_triangle, bins=10): ax.yaxis.set_major_formatter("{x:,.0f}") ax = axes[1] plt.axis('off') - data = values.describe(percentiles=np.linspace(0.1,1,10)).round(2) + data = values.describe(percentiles=np.linspace(0.1, 1, 10)).round(2) data.name = '' _ = pd.plotting.table(ax=ax, data=data, loc="best", edges="open") return fig, axes -def run_pca(df_wide:pd.DataFrame, n_components:int=2) -> Tuple[pd.DataFrame, PCA]: +def run_pca(df_wide: pd.DataFrame, n_components: int = 2) -> Tuple[pd.DataFrame, PCA]: """Run PCA on DataFrame and return result. Parameters @@ -531,10 +540,10 @@ def seaborn_scatter(df, ax, seaborn.scatterplot(x=df[cols[0]], y=df[cols[1]], hue=meta, ax=ax, palette='deep', s=size, alpha=alpha) _ = ax.legend(fontsize=fontsize, - title_fontsize=fontsize, - markerscale=0.4, - title=meta.name, - ) + title_fontsize=fontsize, + markerscale=0.4, + title=meta.name, + ) ax.set_title(title, fontsize=fontsize) return ax @@ -546,9 +555,9 @@ def scatter_plot_w_dates(ax, df, size=2): """plot first vs. second column in DataFrame. Use dates to color data. - - - + + + errors : {'ignore', 'raise', 'coerce'}, default 'raise' Passed on to pandas.to_datetime - If 'raise', then invalid parsing will raise an exception. @@ -576,7 +585,7 @@ def scatter_plot_w_dates(ax, df, def add_date_colorbar(mappable, ax): loc = mdates.AutoDateLocator() cbar = ax.get_figure().colorbar(mappable, ax=ax, ticks=loc, - format=mdates.AutoDateFormatter(loc)) + format=mdates.AutoDateFormatter(loc)) return cbar diff --git a/vaep/analyzers/compare_predictions.py b/vaep/analyzers/compare_predictions.py index ea0d16935..75878537d 100644 --- a/vaep/analyzers/compare_predictions.py +++ b/vaep/analyzers/compare_predictions.py @@ -24,7 +24,7 @@ def load_predictions(pred_files: List, shared_columns=['observed']): def load_split_prediction_by_modelkey(experiment_folder: Path, split: str, - model_keys:list[str], + model_keys: list[str], allow_missing=False, shared_columns: list[str] = None): """Load predictions from a list of models. @@ -63,7 +63,7 @@ def load_split_prediction_by_modelkey(experiment_folder: Path, return load_predictions(pred_files, shared_columns=shared_columns) -def load_single_csv_pred_file(fname:str|Path, value_name:str='intensity') -> pd.Series: +def load_single_csv_pred_file(fname: str | Path, value_name: str = 'intensity') -> pd.Series: """Load a single pred file from a single model. Last column are measurments, other are index. @@ -79,7 +79,7 @@ def load_single_csv_pred_file(fname:str|Path, value_name:str='intensity') -> pd. 
pd.Series measurments as a single column with set indices """ - pred = pd.read_csv(fname) # getattr for other file formats + pred = pd.read_csv(fname) # getattr for other file formats pred = pred.set_index(pred.columns[:-1].tolist()) pred = pred.squeeze() pred.name = value_name diff --git a/vaep/analyzers/diff_analysis.py b/vaep/analyzers/diff_analysis.py index deafe4311..6b115f6ba 100644 --- a/vaep/analyzers/diff_analysis.py +++ b/vaep/analyzers/diff_analysis.py @@ -31,7 +31,7 @@ def select_raw_data(df: pd.DataFrame, return df, Cutoffs(min_sample_for_feat, min_feat_per_sample) -def select_feat(df_qc:pd.DataFrame, threshold:float=0.4, axis:int=0): +def select_feat(df_qc: pd.DataFrame, threshold: float = 0.4, axis: int = 0): qc_cv_feat = df_qc.std(axis=axis) / df_qc.mean(axis=axis) mask = qc_cv_feat < threshold - return qc_cv_feat.loc[mask].index \ No newline at end of file + return qc_cv_feat.loc[mask].index diff --git a/vaep/analyzers/metadata.py b/vaep/analyzers/metadata.py index 27cb60906..4e843093d 100644 --- a/vaep/analyzers/metadata.py +++ b/vaep/analyzers/metadata.py @@ -37,7 +37,7 @@ assert re.search(regex_not_researcher, 'MA').group() == 'MA' assert re.search(regex_not_researcher, 'QC').group() == 'QC' assert re.search(regex_not_researcher, 'MA_OFF').group() == 'MA' -assert re.search(regex_not_researcher, '_LiNi_') == None +assert re.search(regex_not_researcher, '_LiNi_') is None type_run = {'MA': 'MNT', diff --git a/vaep/databases/__init__.py b/vaep/databases/__init__.py index 1a1fb4a49..e28e2c7c8 100644 --- a/vaep/databases/__init__.py +++ b/vaep/databases/__init__.py @@ -1,2 +1 @@ from . import uniprot - diff --git a/vaep/databases/diseases.py b/vaep/databases/diseases.py index edea11912..f4800f76f 100644 --- a/vaep/databases/diseases.py +++ b/vaep/databases/diseases.py @@ -3,7 +3,8 @@ logger = logging.getLogger(__name__) -def get_disease_association(doid:int, limit:int=1000): + +def get_disease_association(doid: int, limit: int = 1000): params = {'type1': -26, 'type2': 'value2', 'id1': f'DOID:{doid}', @@ -11,8 +12,8 @@ def get_disease_association(doid:int, limit:int=1000): 'limit': limit, 'format': 'json'} diseases_url_all = 'https://api.jensenlab.org/Integration' - - r = requests.get(diseases_url_all, params=params) + + r = requests.get(diseases_url_all, params=params) if r.status_code == 200: data, is_there_more = r.json() else: diff --git a/vaep/databases/uniprot.py b/vaep/databases/uniprot.py index 123c14ae2..65c03270b 100644 --- a/vaep/databases/uniprot.py +++ b/vaep/databases/uniprot.py @@ -179,11 +179,11 @@ def query_uniprot_id_mapping(query_list: list, FROM='UniProtKB_AC-ID', TO='Gene_ """Query Uniprot ID mappings programatically (utility function) See availabe mappings: https://www.uniprot.org/help/api_idmapping Function is programmed to query gene IDs based on protein IDs. - + Parameters ---------- query_list : list - list of strings containing queries in format specified + list of strings containing queries in format specified in FROM parameter. 
FROM : str, optional Format of string-ids in query_list, by default 'ACC+ID' @@ -191,7 +191,7 @@ def query_uniprot_id_mapping(query_list: list, FROM='UniProtKB_AC-ID', TO='Gene_ Format to which strings-ids should be matched with, by default 'GENENAME' FORMAT : str, optional Separator for Uniprot-ID, by default 'tab' - + Returns ------- list: @@ -208,7 +208,7 @@ def query_uniprot_id_mapping(query_list: list, FROM='UniProtKB_AC-ID', TO='Gene_ if __name__ == "__main__": - ids= ['A0A075B6I0', 'A0A075B6I1', 'A0A075B6I6', 'A0A075B6I9',] + ids = ['A0A075B6I0', 'A0A075B6I1', 'A0A075B6I6', 'A0A075B6I9',] results = query_uniprot_id_mapping(ids) print(results) # {'A0A075B6I0': 'IGLV8-61', 'A0A075B6I1': 'IGLV4-60', 'A0A075B6I6': 'IGLV1-50', 'A0A075B6I9': 'IGLV7-46'} diff --git a/vaep/io/__init__.py b/vaep/io/__init__.py index 5371ec649..21cfc5518 100644 --- a/vaep/io/__init__.py +++ b/vaep/io/__init__.py @@ -15,9 +15,10 @@ logger = logging.getLogger(__name__) logger.info(f"Calling from {__name__}") + def search_files(path='.', query='.txt'): - """Uses Pathlib to find relative to path files - with the query text in their file names. Returns + """Uses Pathlib to find relative to path files + with the query text in their file names. Returns the path relative to the specified path. Parameters @@ -30,7 +31,7 @@ def search_files(path='.', query='.txt'): Returns ------- list - list with files as string containig query key. + list with files as string containig query key. """ path = Path(path) files = [] @@ -71,11 +72,11 @@ def get_subfolders(path): return directories -def resolve_path(path:Union[str, Path], to:Union[str, Path]='.')-> Path: +def resolve_path(path: Union[str, Path], to: Union[str, Path] = '.') -> Path: """Resolve a path partly overlapping with to another path.""" pwd = Path(to).absolute() pwd = [p for p in pwd.parts] - ret = [p for p in Path(path).parts if p not in pwd] + ret = [p for p in Path(path).parts if p not in pwd] return Path('/'.join(ret)) @@ -144,8 +145,8 @@ def parse_dict(input_dict: dict, d = dict() for k, v in input_dict.items(): for (old_type, fct) in types: - if isinstance(v, old_type): - v = fct(v) + if isinstance(v, old_type): + v = fct(v) d[k] = v return d @@ -170,4 +171,4 @@ def extend_name(fname: Union[str, Path], extend_by: str, ext: str = None) -> Pat ext = fname.suffix fname = fname.parent / f"{fname.stem}{extend_by}" fname = fname.with_suffix(ext) - return fname \ No newline at end of file + return fname diff --git a/vaep/io/data_objects.py b/vaep/io/data_objects.py index 8ffb62340..62d81ac71 100644 --- a/vaep/io/data_objects.py +++ b/vaep/io/data_objects.py @@ -21,6 +21,8 @@ from vaep.plotting import plot_feat_counts # from .config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED +from fastcore.imports import IN_IPYTHON, IN_JUPYTER, IN_COLAB, IN_NOTEBOOK + logger = logging.getLogger(__name__) logger.info(f"Calling from {__name__}") @@ -39,7 +41,6 @@ DEFAULTS.COUNT_ALL_PEPTIDES = FOLDER_PROCESSED / 'count_all_peptides.json' # fastcore.imports has in_notebook, etc functionality -from fastcore.imports import IN_IPYTHON, IN_JUPYTER, IN_COLAB, IN_NOTEBOOK # IN_IPYTHON,IN_JUPYTER,IN_COLAB,IN_NOTEBOOK = in_ipython(),in_jupyter(),in_colab(),in_notebook() N_WORKERS_DEFAULT = os.cpu_count() - 1 if os.cpu_count() <= 16 else 16 @@ -172,6 +173,8 @@ def get_files_w_min_MS2(self, threshold=10_000, relativ_to=FOLDER_MQ_TXT_DATA): return [Path(relativ_to) / folder for folder in self.df.loc[mask].index] # maybe move functions related to fnames + + def get_fname(N, M): """Helper function to 
get file for intensities""" return f'df_intensities_N{N:05d}_M{M:05d}' @@ -185,8 +188,7 @@ def create_parent_folder_name(folder: Path) -> str: return folder.stem[:4] -## plotting function for value_counts from FeatureCounter.get_df_counts - +# plotting function for value_counts from FeatureCounter.get_df_counts def collect_in_chuncks(paths: Iterable[Union[str, Path]], @@ -196,7 +198,7 @@ def collect_in_chuncks(paths: Iterable[Union[str, Path]], desc='Run chunks in parallel') -> List: """collect the results from process_chunk_fct (chunk of files to loop over). The idea is that process_chunk_fct creates a more memory-efficient intermediate - result than possible if only callling single fpaths in paths. + result than possible if only callling single fpaths in paths. Parameters ---------- @@ -216,8 +218,8 @@ def collect_in_chuncks(paths: Iterable[Union[str, Path]], if n_workers > 1: with multiprocessing.Pool(n_workers) as p: collected = list(tqdm(p.imap(process_chunk_fct, paths_splits), - total=len(paths_splits), - desc=desc)) + total=len(paths_splits), + desc=desc)) else: collected = map(process_chunk_fct, paths_splits) return collected @@ -225,9 +227,9 @@ def collect_in_chuncks(paths: Iterable[Union[str, Path]], class FeatureCounter(): def __init__(self, fp_counter: str, counting_fct: Callable[[List], Counter], - idx_names:Union[List, None]=None, - feature_name='feature', - overwrite=False): + idx_names: Union[List, None] = None, + feature_name='feature', + overwrite=False): self.fp = Path(fp_counter) self.counting_fct = counting_fct self.idx_names = idx_names @@ -238,7 +240,7 @@ def __init__(self, fp_counter: str, counting_fct: Callable[[List], Counter], self.loaded = set(folder for folder in d['based_on']) self.dumps = d['dumps'] else: - self.loaded = set() # None + self.loaded = set() # None self.counter = Counter() self.dumps = dict() @@ -262,10 +264,10 @@ def sum_over_files(self, folders: List[Path], n_workers=N_WORKERS_DEFAULT, save= if folders: list_of_sample_dicts = collect_in_chuncks(folders, - process_chunk_fct=self.counting_fct, - n_workers = n_workers, - chunks=n_workers*3, - desc = 'Count features in 100 chunks') + process_chunk_fct=self.counting_fct, + n_workers=n_workers, + chunks=n_workers * 3, + desc='Count features in 100 chunks') for d in tqdm(list_of_sample_dicts, total=len(list_of_sample_dicts), @@ -286,7 +288,6 @@ def sum_over_files(self, folders: List[Path], n_workers=N_WORKERS_DEFAULT, save= @property def n_samples(self): return len(self.loaded) - def get_df_counts(self) -> pd.DataFrame: """Counted features as DataFrame with proportion values. 
@@ -297,14 +298,14 @@ def get_df_counts(self) -> pd.DataFrame: _description_ """ feat_counts = (pd.Series(self.counter) - .sort_values(ascending=False) - .to_frame('counts')) + .sort_values(ascending=False) + .to_frame('counts')) feat_counts['proportion'] = feat_counts / self.n_samples - if self.idx_names: + if self.idx_names: feat_counts.index.names = self.idx_names feat_counts.reset_index(inplace=True) feat_counts.index.name = 'consecutive count' - return feat_counts + return feat_counts def plot_counts(self, df_counts: pd.DataFrame = None, ax=None, prop_feat=0.25, min_feat_prop=.01): """Plot counts based on get_df_counts.""" @@ -348,8 +349,8 @@ def save(self): } """ d = {'counter': self.counter, - 'based_on': list(self.loaded), - 'dumps': {k: str(v) for k, v in self.dumps.items()}} + 'based_on': list(self.loaded), + 'dumps': {k: str(v) for k, v in self.dumps.items()}} logger.info(f"Save to: {self.fp}") dump_json(d, filename=self.fp) @@ -357,7 +358,7 @@ def load(self, fp): with open(self.fp) as f: d = json.load(f) d['counter'] = Counter(d['counter']) - d['dumps'] = {k: Path(v) for k,v in d['dumps'].items()} + d['dumps'] = {k: Path(v) for k, v in d['dumps'].items()} return d def load_dump(self, fpath, fct=pd.read_csv, use_cols=None): @@ -392,11 +393,12 @@ def __call__(self, folders, c.update(df.index) if self.dump: fpath_dict[folder.stem] = dump_to_csv(df, folder=folder, outfolder=self.outfolder, - parent_folder_fct=self.parent_folder_fct) + parent_folder_fct=self.parent_folder_fct) ret = {'counter': c, 'dumps': fpath_dict} return ret -### aggregated peptides +# aggregated peptides + # # check df for redundant information (same feature value for all entries) usecols = mq.COLS_ + ['Potential contaminant', mq.mq_col.SEQUENCE, 'PEP'] @@ -418,11 +420,12 @@ def count_peptides(folders: List[Path], dump=True, c.update(peptides.index) if dump: fpath_dict[folder.stem] = dump_to_csv(peptides.drop('Potential contaminant', axis=1), - folder=folder, outfolder=outfolder, - parent_folder_fct=parent_folder_fct) + folder=folder, outfolder=outfolder, + parent_folder_fct=parent_folder_fct) ret = {'counter': c, 'dumps': fpath_dict} return ret + d_dtypes_training_sample = { 'Sequence': pd.StringDtype(), 'Proteins': pd.StringDtype(), @@ -438,6 +441,7 @@ def load_agg_peptide_dump(fpath): peptides = pd.read_csv(fpath, index_col=0, dtype=d_dtypes_training_sample) return peptides + @delegates() class PeptideCounter(FeatureCounter): @@ -446,7 +450,7 @@ def __init__(self, counting_fct: Callable[[List], Counter] = count_peptides, idx_names=['Sequence'], feature_name='aggregated peptide', - **kwargs): + **kwargs): super().__init__(fp_counter, counting_fct=counting_fct, idx_names=idx_names, feature_name=feature_name, **kwargs) @@ -455,8 +459,7 @@ def load_dump(fpath): return load_agg_peptide_dump(fpath) - -### Evidence +# Evidence evidence_cols = mq.mq_evidence_cols @@ -504,7 +507,7 @@ def count_evidence(folders: List[Path], c.update(evidence.index) if dump: fpath_dict[folder.stem] = dump_to_csv(evidence, folder=folder, outfolder=outfolder, - parent_folder_fct=parent_folder_fct) + parent_folder_fct=parent_folder_fct) ret = {'counter': c, 'dumps': fpath_dict} return ret @@ -530,7 +533,7 @@ def save(self): } """ d = {'counter': vaep.pandas.create_dict_of_dicts(self.counter), - 'based_on': list(self.loaded), + 'based_on': list(self.loaded), 'dumps': {k: str(v) for k, v in self.dumps.items()}} print(f"Save to: {self.fp}") dump_json(d, filename=self.fp) @@ -540,7 +543,7 @@ def load(self, fp): d = json.load(f) d['counter'] = 
Counter( vaep.pandas.flatten_dict_of_dicts(d['counter'])) - d['dumps'] = {k: Path(v) for k,v in d['dumps'].items()} + d['dumps'] = {k: Path(v) for k, v in d['dumps'].items()} return d @@ -548,7 +551,7 @@ def load_evidence_dump(fpath, index_col=['Sequence', 'Charge']): df = pd.read_csv(fpath, index_col=index_col) return df -### Protein Groups +# Protein Groups pg_cols = mq.mq_protein_groups_cols @@ -557,7 +560,7 @@ def load_evidence_dump(fpath, index_col=['Sequence', 'Charge']): def load_and_process_proteinGroups(folder: Union[str, Path], - #use_cols not really a parameter (or needs asserts?) + # use_cols not really a parameter (or needs asserts?) use_cols: List = [ pg_cols.Protein_IDs, pg_cols.Majority_protein_IDs, @@ -580,7 +583,7 @@ def load_and_process_proteinGroups(folder: Union[str, Path], pg = pg.loc[mask] gene_set = pg[pg_cols.Gene_names].str.split(';') col_loc_gene_names = pg.columns.get_loc(pg_cols.Gene_names) - _ = pg.insert(col_loc_gene_names+1, 'Number of Genes', + _ = pg.insert(col_loc_gene_names + 1, 'Number of Genes', gene_set.apply(vaep.pandas.length)) mask_no_gene = pg[pg_cols.Gene_names].isna() pg_no_gene = pg.loc[mask_no_gene] @@ -593,9 +596,6 @@ def load_and_process_proteinGroups(folder: Union[str, Path], return pg - - - count_protein_groups = Count(load_and_process_proteinGroups, use_cols=[ pg_cols.Protein_IDs, @@ -619,7 +619,7 @@ class ProteinGroupsCounter(FeatureCounter): def __init__(self, fp_counter: str, counting_fct: Callable[[List], Counter] = count_protein_groups, - idx_names=[pg_cols.Protein_IDs], # mq_specfic + idx_names=[pg_cols.Protein_IDs], # mq_specfic feature_name='protein group', **kwargs): super().__init__(fp_counter, counting_fct, idx_names=idx_names, @@ -631,9 +631,10 @@ def load_pg_dump(folder, use_cols=None): df = pd.read_csv(folder, index_col=pg_cols.Protein_IDs, usecols=use_cols) return df -## Gene Counter +# Gene Counter + -def pg_idx_gene_fct(folder:Union[str, Path], use_cols=None): +def pg_idx_gene_fct(folder: Union[str, Path], use_cols=None): folder = Path(folder) logger.debug(f"Load: {folder}") df = pd.read_csv(folder, index_col=pg_cols.Gene_names, usecols=use_cols) @@ -650,7 +651,7 @@ def pg_idx_gene_fct(folder:Union[str, Path], use_cols=None): dump=False) -#summing needs to be done over processed proteinGroup dumps +# summing needs to be done over processed proteinGroup dumps @delegates() class GeneCounter(FeatureCounter): """Gene Counter to count gene in dumped proteinGroups.""" diff --git a/vaep/io/dataloaders.py b/vaep/io/dataloaders.py index e98d2dd03..7443d28d6 100644 --- a/vaep/io/dataloaders.py +++ b/vaep/io/dataloaders.py @@ -33,7 +33,7 @@ def __init__(self, scaler : [type] A pipeline of transform to apply to the dataset. DataSetClass : torch.utils.data.Dataset - Type of dataset to use for generating single samples based on + Type of dataset to use for generating single samples based on DataFrames. batch_size : int Batch size to use. 
@@ -49,7 +49,10 @@ def __init__(self, self.scaler = scaler self.batch_size = batch_size - def get_dls(self, shuffle_train: bool = True, **kwargs) -> Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]: + def get_dls(self, + shuffle_train: bool = True, + **kwargs) -> Tuple[torch.utils.data.DataLoader, + torch.utils.data.DataLoader]: self.shuffle_train = shuffle_train dl_train = DataLoader( dataset=self.data_train, @@ -106,7 +109,7 @@ def get_dls(train_X: pandas.DataFrame, transforms = VaepPipeline(df_train=train_X, encode=dae_default_pipeline, decode=['normalize']) - dls = get_dls(train_X, val_X, transforms, bs=4) + dls = get_dls(train_X, val_X, transforms, bs=4) """ train_ds = datasets.DatasetWithTarget(df=train_X, transformer=transformer) diff --git a/vaep/io/datasets.py b/vaep/io/datasets.py index 9f854e1ce..53839626d 100644 --- a/vaep/io/datasets.py +++ b/vaep/io/datasets.py @@ -9,6 +9,7 @@ DEFAULT_DTYPE = torch.get_default_dtype() + class PeptideDatasetInMemory(Dataset): """Peptide Dataset fully in memory.""" @@ -23,7 +24,7 @@ def __init__(self, data: np.array, mask: np.array = None, fill_na=0.0): Peptide data for training, potentially with missings. mask : [type], optional Mask selecting values for evaluation from data(y), by default None - If no mask is provided, all non-missing values from `data`-array + If no mask is provided, all non-missing values from `data`-array will be used. fill_na : int, optional value to replace missing values with, by default 0 @@ -83,19 +84,21 @@ def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]: mask_isna = self.mask_isna.iloc[idx] data = self.data.iloc[idx] mask_isna, data = to_tensor(mask_isna), to_tensor(data) - return mask_isna, data + return mask_isna, data + class DatasetWithTarget(DatasetWithMaskAndNoTarget): def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: mask, data = super().__getitem__(idx) - return mask, data, data + return mask, data, data + class DatasetWithTargetSpecifyTarget(DatasetWithMaskAndNoTarget): - def __init__(self, df: pd.DataFrame, targets:pd.DataFrame, + def __init__(self, df: pd.DataFrame, targets: pd.DataFrame, transformer: sklearn.pipeline.Pipeline = None): - """Create a dataset for validation. + """Create a dataset for validation. Parameters ---------- @@ -113,7 +116,7 @@ def __init__(self, df: pd.DataFrame, targets:pd.DataFrame, self.columns = df.columns self.transformer = transformer - self.target = df.fillna(targets) # not really necessary, without mask would not be needed + self.target = df.fillna(targets) # not really necessary, without mask would not be needed if transformer: if hasattr(transformer, 'transform'): @@ -125,16 +128,16 @@ def __init__(self, df: pd.DataFrame, targets:pd.DataFrame, self.data = df self.length_ = len(self.data) - def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]: - mask_isna, data = super().__getitem__(idx) - target = to_tensor(self.target.iloc[idx]) + mask_isna, data = super().__getitem__(idx) + target = to_tensor(self.target.iloc[idx]) return mask_isna, data, target + class PeptideDatasetInMemoryMasked(DatasetWithMaskAndNoTarget): """Peptide Dataset fully in memory. - + Dataset: torch.utils.data.Dataset """ @@ -155,7 +158,7 @@ def __init__(self, *args, fill_na=0, **kwargs): class PeptideDatasetInMemoryNoMissings(Dataset): """Peptide Dataset fully in memory. 
- + Dataset: torch.utils.data.Dataset """ diff --git a/vaep/io/datasplits.py b/vaep/io/datasplits.py index a6de3ace5..daae63bc2 100644 --- a/vaep/io/datasplits.py +++ b/vaep/io/datasplits.py @@ -17,6 +17,7 @@ # 'pickle': 'to_pickle', 'csv': ('to_csv', 'read_csv')} + def long_format(df: pd.DataFrame, colname_values: str = 'intensity', # index_name: str = 'Sample ID' @@ -49,7 +50,6 @@ class DataSplits(): train_X: pd.DataFrame = None val_y: pd.DataFrame = None test_y: pd.DataFrame = None - def __post_init__(self): self._items = sorted(self.__dict__) @@ -64,12 +64,12 @@ def __dir__(self): return ['dump', 'from_folder', 'interpolate', 'load', 'test_X', 'test_y', 'to_long_format', 'to_wide_format', 'train_X', 'val_X', 'val_y'] - def dump(self, folder='data', file_format='csv')-> dict: + def dump(self, folder='data', file_format='csv') -> dict: """dump in long format.""" folder = Path(folder) folder.mkdir(parents=True, exist_ok=True) - if not file_format in FILE_FORMAT_TO_DUMP_FCT: + if file_format not in FILE_FORMAT_TO_DUMP_FCT: raise ValueError(f"Select one of these formats: {', '.join(FILE_FORMAT_TO_DUMP_FCT.keys())}") dumps = {} n_dumped = 0 @@ -128,11 +128,11 @@ def to_wide_format(self): _df = _series.unstack() setattr(self, _attr, _df) self._is_wide = True - - def to_long_format(self, name_values:str='intensity'): - if not self._is_wide: + + def to_long_format(self, name_values: str = 'intensity'): + if not self._is_wide: return - + for _attr, _df in self: if _df is None: continue @@ -142,7 +142,7 @@ def to_long_format(self, name_values:str='intensity'): self._is_wide = False # singledispatch possible - def interpolate(self, dataset:Union[str, pd.DataFrame]): + def interpolate(self, dataset: Union[str, pd.DataFrame]): if issubclass(type(dataset), pd.DataFrame): ds = dataset elif issubclass(type(dataset), pd.Series): @@ -152,7 +152,7 @@ def interpolate(self, dataset:Union[str, pd.DataFrame]): ds = getattr(self, dataset) except AttributeError: raise AttributeError(f"Please provide a valid attribute, not '{dataset}'. " - "Valid attributes are {}".format(', '.join(x for x in self._items))) + "Valid attributes are {}".format(', '.join(x for x in self._items))) if dataset[-1] in ['y', 'Y']: logger.warning( f'Attempting to interpolate target: {dataset} ' @@ -160,15 +160,13 @@ def interpolate(self, dataset:Union[str, pd.DataFrame]): if ds is None: raise ValueError(f'Attribute is None: {dataset!r}.') if not self._is_wide: - ds = ds.unstack() # series is unstack to DataFrame + ds = ds.unstack() # series is unstack to DataFrame else: raise TypeError(f"Unknown type: {classname(dataset)}." - f" None of str, {class_full_module(pd.DataFrame)}, {class_full_module(pd.Series)}" - ) - - return interpolate(wide_df=ds) - + f" None of str, {class_full_module(pd.DataFrame)}, {class_full_module(pd.Series)}" + ) + return interpolate(wide_df=ds) def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv') -> dict: @@ -184,7 +182,8 @@ def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv read_fct = getattr(pd, FILE_FORMAT_TO_DUMP_FCT[file_format][1]) _df = read_fct(fname) # logic below is suited for csv reader -> maybe split up loading and saving later? 
- if len(_df.shape) == 1: _df = _df.to_frame().reset_index() # in case Series was pickled + if len(_df.shape) == 1: + _df = _df.to_frame().reset_index() # in case Series was pickled cols = list(_df.columns) if use_wide_format: _df = wide_format(_df.set_index(cols[1:-1]), columns=cols[0], name_values=cols[-1]) @@ -196,7 +195,7 @@ def load_items(folder: str, items: dict, use_wide_format=False, file_format='csv # set default file name -> intergrate into DataSplits? -def load_freq(folder:str, file='freq_features.pkl'): +def load_freq(folder: str, file='freq_features.pkl'): folder = Path(folder) fname = folder / file if fname.suffix == '.json': @@ -206,4 +205,4 @@ def load_freq(folder:str, file='freq_features.pkl'): freq_per_feature = pd.read_pickle(fname) else: raise ValueError(f"Unknown Fileextension: {fname.suffix}") - return freq_per_feature \ No newline at end of file + return freq_per_feature diff --git a/vaep/io/filenames.py b/vaep/io/filenames.py index ee9c122fa..d6338e6d7 100644 --- a/vaep/io/filenames.py +++ b/vaep/io/filenames.py @@ -4,11 +4,13 @@ logger = logging.getLogger(__name__) -def read_number_from_str(fname:str, regex:str='M[0-9]*', strip:int=1) -> int: + +def read_number_from_str(fname: str, regex: str = 'M[0-9]*', strip: int = 1) -> int: M = re.search(regex, fname).group() logger.info(f"Found: {M}") M = int(M[strip:]) return M + read_M_features = functools.partial(read_number_from_str, regex='M[0-9]*', strip=1) read_N_samples = functools.partial(read_number_from_str, regex='N[0-9]*', strip=1) diff --git a/vaep/io/format.py b/vaep/io/format.py index f316f8b5f..8ac18df95 100644 --- a/vaep/io/format.py +++ b/vaep/io/format.py @@ -11,7 +11,7 @@ def classname(obj): """ Return entire object's class name (repr notation) as str. Source: https://gist.github.com/clbarnes/edd28ea32010eb159b34b075687bb49e - + Parameters ---------- obj : object diff --git a/vaep/io/mq.py b/vaep/io/mq.py index daa843b0e..3bae23ff1 100644 --- a/vaep/io/mq.py +++ b/vaep/io/mq.py @@ -210,10 +210,10 @@ class MaxQuantOutput(): Attributes ---------- - self.files : list + self.files : list list of files in `folder`. _inital_attritubutes : list - Initial set of non-magic attributes + Initial set of non-magic attributes NAME_FILE_MAP : dict Keys for known MaxQuant output files. """ @@ -319,7 +319,7 @@ class MaxQuantOutputDynamic(MaxQuantOutput): name_file_map : dict Keys for known MaxQuant output files. _inital_attritubutes : list - Initial set of non-magic attributes + Initial set of non-magic attributes """ def __init__(self, folder): @@ -368,7 +368,7 @@ def check_df(df, columns): missing = [] for col in columns: - if not col in df: + if col not in df: missing.append(col) if missing: @@ -380,7 +380,7 @@ def check_df(df, columns): def get_peptides_with_single_gene(peptides, keep_columns=COLS_, gene_column=mq_col.GENE_NAMES): """Get long-data-format. Ungroup gene names. Peptides "shared" by genes - are assigned individual rows. retains only cases with full list of + are assigned individual rows. retains only cases with full list of features provided by `keep_columns`. Parameters @@ -388,7 +388,7 @@ def get_peptides_with_single_gene(peptides, keep_columns=COLS_, gene_column=mq_c peptides: pandas.DataFrame MaxQuant txt output loaded as `pandas.DataFrame`. 
keep_columns: list - List of columns to keep from the `peptides`.txt, default + List of columns to keep from the `peptides`.txt, default {cols_} gene_column: str Column containing group information of format "group1;group2", @@ -474,7 +474,7 @@ def count_genes_in_sets(gene_sets, sep=';'): def get_identifier_from_column(df: pd.DataFrame, identifier_col: str): - """Get unique identifier in a column of a DataFrame. + """Get unique identifier in a column of a DataFrame. Parameters ---------- @@ -579,7 +579,7 @@ def find_exact_cleaved_peptides_for_razor_protein(gene_data, fasta_db, gene_id: # pendants (e.g. Keratin: Q04695;CON__Q04695) # exclude potential other contaminents protein_sets = [ - x for x in proteins_shared_by_all if not 'CON__' in x] # .sorted() + x for x in proteins_shared_by_all if 'CON__' not in x] # .sorted() if len(protein_sets) == 0: # raise KeyError("No other overall protein found for sequences.") logger.warning( @@ -606,7 +606,7 @@ def calculate_completness_for_sample( peps_exact_cleaved : Iterable[str] Iterable of peptides exactly cleaved peps_in_data : Iterable[str] - Iterable of peptides found during a run / in a sample. Check if peptides + Iterable of peptides found during a run / in a sample. Check if peptides overlap with any of the exact peptides. Returns diff --git a/vaep/io/rawfiles.py b/vaep/io/rawfiles.py index 7fd0b897b..04df932cf 100644 --- a/vaep/io/rawfiles.py +++ b/vaep/io/rawfiles.py @@ -18,7 +18,7 @@ def find_indices_containing_query(query, X): def get_unique_stem(query, index: pd.Index): """Gets stem filename, by splitting filename left of query and remove last underscore _. - + Fractionated samples seem to be named by fraction type. Last field indicates fraction. """ ret = index.str.split(query).str[0].str.rsplit("_", n=1).str[0] @@ -34,7 +34,7 @@ def show_fractions(stub: str, df): class RawFileViewer: - def __init__(self, df:pd.DataFrame, start_query: str="[Ff]rac", outputfolder: str='.', path_col='path'): + def __init__(self, df: pd.DataFrame, start_query: str = "[Ff]rac", outputfolder: str = '.', path_col='path'): """Indices are used.""" self.df = df self.file_names = df.index @@ -42,7 +42,7 @@ def __init__(self, df:pd.DataFrame, start_query: str="[Ff]rac", outputfolder: st self.w_query = widgets.Text(start_query) self.query = start_query - + self.save_button = widgets.Button(description='Save current files.') self.save_button.on_click(self.save_current_files) @@ -59,10 +59,10 @@ def get_options(self, query): sub_df = self.find_indices_containing_query(query) ret = get_unique_stem(query, sub_df.index) return ret - except: + except BaseException: print(f"Not a valid query: {query} ") return () - + def save_current_files(self, button): """Save files in current views as txt file. 
""" @@ -74,12 +74,12 @@ def save_current_files(self, button): with open(fname, 'w') as f: f.write(f'-lmkdir {self.stub}\n') for _path in files: - _local_path = PurePosixPath(self.stub)/_path.name + _local_path = PurePosixPath(self.stub) / _path.name _remote_path = PurePosixPath(_path) line = line_template.format(remote_path=_remote_path, local_path=_local_path) f.write(f'{line}\n') print(f"Saved file paths to: {fname}") - + def viewer(self, query, stub: str): if query != self.query: self.query = query @@ -91,7 +91,7 @@ def viewer(self, query, stub: str): print(f"Nothing to display for QUERY: {query}") stub = None # find_indices_containing_query = partial(find_indices_containing_query, X=data_unique) - if stub and stub!=self.stub: + if stub and stub != self.stub: try: subset = self.df[self.df.index.str.contains(stub)] print('current stub: ', repr(stub)) diff --git a/vaep/io/thermo_raw_files.py b/vaep/io/thermo_raw_files.py index 2238d5afc..4bebfb58a 100644 --- a/vaep/io/thermo_raw_files.py +++ b/vaep/io/thermo_raw_files.py @@ -23,4 +23,3 @@ 'injection volume setting', 'dilution factor', ] - diff --git a/vaep/io/types.py b/vaep/io/types.py index b738d04a7..5f807113a 100644 --- a/vaep/io/types.py +++ b/vaep/io/types.py @@ -1,7 +1,9 @@ """ papermill strategy to determine type -see: https://github.com/nteract/papermill/blob/76906a882bb5b3e719ad113c7b2447e0ddffb2c7/papermill/cli.py#L275-L307 +see: https://github.com/nteract/papermill/blob/76906a882bb5b3e719ad113c7b2447e0ddffb2c7/papermill/cli.py#L275-L307 """ + + def resolve_type(value): if value == "True": return True @@ -34,4 +36,4 @@ def _is_float(value): except ValueError: return False else: - return True \ No newline at end of file + return True diff --git a/vaep/model.py b/vaep/model.py index 6d0e39c24..74e947643 100644 --- a/vaep/model.py +++ b/vaep/model.py @@ -13,17 +13,6 @@ logger = logging.getLogger(__name__) - - - - - - - - - - - def build_df_from_pred_batches(pred, scaler=None, index=None, columns=None): pred = np.vstack(pred) if scaler: @@ -32,10 +21,10 @@ def build_df_from_pred_batches(pred, scaler=None, index=None, columns=None): return pred -def get_latent_space(model_method_call:callable, - dl:torch.utils.data.DataLoader, - dl_index:pd.Index, - latent_tuple_pos:int=0) -> pd.DataFrame: +def get_latent_space(model_method_call: callable, + dl: torch.utils.data.DataLoader, + dl_index: pd.Index, + latent_tuple_pos: int = 0) -> pd.DataFrame: """Create a DataFrame of the latent space based on the model method call to be used (here: the model encoder or a latent space helper method) @@ -60,7 +49,7 @@ def get_latent_space(model_method_call:callable, for b in dl: model_input = b[1] res = model_method_call(model_input) - #if issubclass(type(res), torch.Tensor): + # if issubclass(type(res), torch.Tensor): if isinstance(res, tuple): res = res[latent_tuple_pos] res = res.detach().numpy() @@ -74,8 +63,6 @@ def get_latent_space(model_method_call:callable, return latent_space - - # # Defining the model manuelly # import torch.nn as nn diff --git a/vaep/models/__init__.py b/vaep/models/__init__.py index 55ac20670..c3c34de02 100644 --- a/vaep/models/__init__.py +++ b/vaep/models/__init__.py @@ -10,7 +10,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -import torch +import torch from fastcore.foundation import L from fastai import learner import sklearn.metrics as sklm @@ -26,8 +26,8 @@ def plot_loss(recorder: learner.Recorder, - norm_train:np.int64=np.int64(1), - norm_val:np.int64=np.int64(1), + norm_train: np.int64 = 
np.int64(1), + norm_val: np.int64 = np.int64(1), skip_start: int = 5, with_valid: bool = True, ax: plt.Axes = None) -> plt.Axes: @@ -39,9 +39,9 @@ def plot_loss(recorder: learner.Recorder, recorder : learner.Recorder fastai Recorder object, learn.recorder norm_train: np.int64, optional - Normalize epoch loss by number of training samples, by default 1 + Normalize epoch loss by number of training samples, by default 1 norm_val: np.int64, optional - Normalize epoch loss by number of validation samples, by default 1 + Normalize epoch loss by number of validation samples, by default 1 skip_start : int, optional Skip N first batch metrics, by default 5 with_valid : bool, optional @@ -61,7 +61,7 @@ def plot_loss(recorder: learner.Recorder, if with_valid: idx = (np.array(recorder.iters) < skip_start).sum() ax.plot(recorder.iters[idx:], L( - recorder.values[idx:]).itemgot(1) / norm_val , label='valid') + recorder.values[idx:]).itemgot(1) / norm_val, label='valid') ax.legend() return ax @@ -70,7 +70,7 @@ def plot_training_losses(learner: learner.Learner, name: str, ax=None, save_recorder: bool = True, - norm_factors = np.array([1,1], dtype='int'), + norm_factors=np.array([1, 1], dtype='int'), folder='figures', figsize=(15, 8)): if ax is None: @@ -96,6 +96,7 @@ def calc_net_weight_count(model: torch.nn.modules.module.Module) -> int: weight_count += np.prod(param.size()) return int(weight_count) + class RecorderDump: """Simple Class to hold fastai Recorder Callback data for serialization using pickle. """ @@ -120,8 +121,6 @@ def load(cls, filepath, name): plot_loss = plot_loss - - def split_prediction_by_mask(pred: pd.DataFrame, mask: pd.DataFrame, check_keeps_all: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]: @@ -152,17 +151,17 @@ def split_prediction_by_mask(pred: pd.DataFrame, def compare_indices(first_index: pd.Index, second_index: pd.Index) -> pd.Index: """Show difference of indices in other index wrt. to first. First should be the larger collection wrt to the second. This is the set difference of two Index objects. - + If second index is a superset of indices of the first, the set will be empty, although there are differences (default behaviour in pandas). - + Parameters ---------- first_index : pd.Index Index, should be superset second_index : pd.Index Index, should be the subset - + Returns ------- pd.Index @@ -181,8 +180,7 @@ def compare_indices(first_index: pd.Index, second_index: pd.Index) -> pd.Index: ('MAE', sklm.mean_absolute_error)] - -def collect_metrics(metrics_jsons:List, key_fct: Callable) -> dict: +def collect_metrics(metrics_jsons: List, key_fct: Callable) -> dict: """Collect and aggregate a bunch of json metrics. 
Parameters @@ -207,7 +205,7 @@ def collect_metrics(metrics_jsons:List, key_fct: Callable) -> dict: fname = Path(fname) logger.info(f"Load file: {fname = }") - key = key_fct(fname) # level, repeat + key = key_fct(fname) # level, repeat logger.debug(f"{key = }") with open(fname) as f: diff --git a/vaep/models/ae.py b/vaep/models/ae.py index 29cc99dff..1c59157a2 100644 --- a/vaep/models/ae.py +++ b/vaep/models/ae.py @@ -28,8 +28,6 @@ logger = logging.getLogger(__name__) - - def get_preds_from_df(df: pd.DataFrame, learn: fastai.learner.Learner, transformer: vaep.transform.VaepPipeline, @@ -60,7 +58,7 @@ def get_preds_from_df(df: pd.DataFrame, dl = vaep.io.dataloaders.get_test_dl(df=df, transformer=transformer, dataset=dataset) - res = learn.get_preds(dl=dl) # -> dl could be int + res = learn.get_preds(dl=dl) # -> dl could be int if position_pred_tuple is not None and issubclass(type(res[0]), tuple): res = (res[0][position_pred_tuple], *res[1:]) res = L(res).map(lambda x: pd.DataFrame( @@ -69,9 +67,9 @@ def get_preds_from_df(df: pd.DataFrame, return res - leaky_relu_default = nn.LeakyReLU(.1) + class Autoencoder(nn.Module): """Autoencoder base class. @@ -116,8 +114,8 @@ def build_layer(in_feat, out_feat): # Encoder self.encoder = [] - for i in range(len(self.layers)-1): - in_feat, out_feat = self.layers[i:i+2] + for i in range(len(self.layers) - 1): + in_feat, out_feat = self.layers[i:i + 2] self.encoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) self.encoder.append(nn.Linear(out_feat, dim_latent)) @@ -133,11 +131,11 @@ def build_layer(in_feat, out_feat): out_feat=out_feat) i = -1 # in case a single hidden layer is passed - for i in range(len(self.layers_decoder)-2): - in_feat, out_feat = self.layers_decoder[i:i+2] + for i in range(len(self.layers_decoder) - 2): + in_feat, out_feat = self.layers_decoder[i:i + 2] self.decoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) - in_feat, out_feat = self.layers_decoder[i+1:i+3] + in_feat, out_feat = self.layers_decoder[i + 1:i + 3] self.decoder.append(nn.Linear(in_feat, out_feat)) if last_decoder_activation is not None: @@ -159,7 +157,7 @@ def get_missing_values(df_train_wide: pd.DataFrame, Parameters ---------- df_train_wide : pd.DataFrame - Training data in wide format. + Training data in wide format. val_idx : pd.Index Indices (MultiIndex of Sample and Feature) of validation split test_idx : pd.Index @@ -170,7 +168,7 @@ def get_missing_values(df_train_wide: pd.DataFrame, Returns ------- pd.Series - Multiindex series of missing values in training data which are not + Multiindex series of missing values in training data which are not in validiation and test split. """ # all idx missing in training data @@ -215,7 +213,7 @@ def get_missing_values(df_train_wide: pd.DataFrame, # assert self.layers_decoder is not self.layers # assert out_feat == self.layers_decoder[0] # self.decoder = [nn.Linear(self.dim_latent, out_feat), -# activation(), +# activation(), # nn.BatchNorm1d(out_feat)] # for i in range(len(self.layers_decoder)-1): # in_feat, out_feat = self.layers_decoder[i:i+2] @@ -278,9 +276,9 @@ def after_pred(self): class ModelAdapterFlatPred(DatasetWithTargetAdapter): - """Models forward only expects on input matrix. + """Models forward only expects on input matrix. Apply mask from dataloader to both pred and targets. - + Return only predictions and target for non NA inputs. """ @@ -302,9 +300,9 @@ def after_pred(self): class ModelAdapter(ModelAdapterFlatPred): - """Models forward only expects on input matrix. 
+ """Models forward only expects on input matrix. Apply mask from dataloader to both pred and targets. - + Keep original dimension, i.e. also predictions for NA.""" def after_pred(self): @@ -321,7 +319,7 @@ def after_loss(self): class ModelAdapterVAEFlat(DatasetWithTargetAdapter): - """Models forward method only expects one input matrix. + """Models forward method only expects one input matrix. Apply mask from dataloader to both pred and targets.""" def before_batch(self): @@ -336,7 +334,7 @@ def after_pred(self): pred, mu, logvar = self.pred # return predictions self.learn.pred = (pred[self._mask], mu, logvar) # is this flat? elif len(self.pred) == 4: - x_mu,x_logvar, z_mu, z_logvar = self.pred + x_mu, x_logvar, z_mu, z_logvar = self.pred self.learn.pred = (x_mu[self._mask], x_logvar[self._mask], z_mu, z_logvar) # same as ModelAdapter. Inheritence is limiting composition here @@ -356,43 +354,39 @@ def after_loss(self): self.learn.yb = (self._all_y,) - - - class AutoEncoderAnalysis(analysis.ModelAnalysis): def __init__(self, - train_df:pd.DataFrame, - val_df:pd.DataFrame, # values to use for validation - model:torch.nn.modules.module.Module, - model_kwargs:dict, + train_df: pd.DataFrame, + val_df: pd.DataFrame, # values to use for validation + model: torch.nn.modules.module.Module, + model_kwargs: dict, transform: sklearn.pipeline.Pipeline, decode: List[str], bs=64 ): - self.transform = vaep.transform.VaepPipeline( - df_train=train_df, - encode=transform, - decode=decode) + self.transform = vaep.transform.VaepPipeline( + df_train=train_df, + encode=transform, + decode=decode) self.dls = vaep.io.dataloaders.get_dls( - train_X=train_df, - valid_X=val_df, - transformer=self.transform, bs=bs) + train_X=train_df, + valid_X=val_df, + transformer=self.transform, bs=bs) # M = data.train_X.shape[-1] self.kwargs_model = model_kwargs self.params = dict(self.kwargs_model) self.model = model(**self.kwargs_model) - + self.n_params_ae = vaep.models.calc_net_weight_count(self.model) self.params['n_parameters'] = self.n_params_ae self.learn = None - - def get_preds_from_df(self, df_wide:pd.DataFrame) -> pd.DataFrame: - if self.learn is None: raise ValueError("Assign Learner first as learn attribute.") - return get_preds_from_df(df=df_wide, learn=self.learn, transformer=self.transform) - - def get_test_dl(self, df_wide:pd.DataFrame, bs:int=64) -> pd.DataFrame: - return vaep.io.dataloaders.get_test_dl(df=df_wide, transformer=self.transform, bs=bs) + def get_preds_from_df(self, df_wide: pd.DataFrame) -> pd.DataFrame: + if self.learn is None: + raise ValueError("Assign Learner first as learn attribute.") + return get_preds_from_df(df=df_wide, learn=self.learn, transformer=self.transform) + def get_test_dl(self, df_wide: pd.DataFrame, bs: int = 64) -> pd.DataFrame: + return vaep.io.dataloaders.get_test_dl(df=df_wide, transformer=self.transform, bs=bs) diff --git a/vaep/models/analysis.py b/vaep/models/analysis.py index d0d6a1dbf..93d8a2aaa 100644 --- a/vaep/models/analysis.py +++ b/vaep/models/analysis.py @@ -5,11 +5,12 @@ from vaep.analyzers import Analysis + class ModelAnalysis(Analysis): """Class describing what an ModelAnalysis is supposed to have as attributes.""" model: torch.nn.Module dls: fastai.data.core.DataLoaders - learn: fastai.learner.Learner + learn: fastai.learner.Learner params: dict - transform: vaep.transform.VaepPipeline \ No newline at end of file + transform: vaep.transform.VaepPipeline diff --git a/vaep/models/cmd.py b/vaep/models/cmd.py index c0893401a..c31003c61 100644 --- 
a/vaep/models/cmd.py +++ b/vaep/models/cmd.py @@ -30,16 +30,16 @@ def create_argparser(): BATCH_SIZE = 16 EPOCHS = 600 + def get_args(batch_size=BATCH_SIZE, epochs=EPOCHS, log_interval=10, no_cuda=False): """Helper function to create arg.""" - args = ['--batch-size', str(batch_size), - '--seed', '43', - '--epochs', str(epochs), + args = ['--batch-size', str(batch_size), + '--seed', '43', + '--epochs', str(epochs), '--log-interval', str(log_interval)] if no_cuda: args.append('--no-cuda') args = parser.parse_args(args) args.cuda = torch.cuda.is_available() and not args.no_cuda return args - diff --git a/vaep/models/collab.py b/vaep/models/collab.py index 3d045a26c..bb37a9b46 100644 --- a/vaep/models/collab.py +++ b/vaep/models/collab.py @@ -38,7 +38,7 @@ def forward(self, x): def combine_data(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Tuple[pd.DataFrame, float]: - """Helper function to combine training and validation data in long-format. The + """Helper function to combine training and validation data in long-format. The training and validation data will be mixed up in CF training as the sample embeddings have to be trained for all samples. The returned frac can be used to have the same number of (non-missing) validation samples as before. @@ -57,7 +57,7 @@ def combine_data(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Tuple[pd.DataF Fraction of samples originally in validation data. """ X = train_df.append(val_df).reset_index() - frac = len(val_df) / (len(train_df)+len(val_df)) + frac = len(val_df) / (len(train_df) + len(val_df)) return X, frac @@ -100,7 +100,7 @@ def collab_dot_product(sample_embeddings: torch.tensor, sample_bias: torch.tenso res = res.detach() if y_range is None: return res - return torch.sigmoid(res) * (y_range[1]-y_range[0]) + y_range[0] + return torch.sigmoid(res) * (y_range[1] - y_range[0]) + y_range[0] def collab_prediction(idx_samples: torch.tensor, @@ -112,20 +112,20 @@ def collab_prediction(idx_samples: torch.tensor, Parameters ---------- idx_samples : torch.tensor - An array containing the neighreast neighbors in the training data for + An array containing the neighreast neighbors in the training data for set of list of test samples. Normallay obtained from a sklearn KNN search. learn : fastai.learner.Learner The learner used for collab training index_samples : pd.Index, optional The pandas.Index for the training samples. If no index_samples is provided, the samples will just be numbered, - by default None + by default None Returns ------- pd.DataFrame predictions as DataFrame for all features encoded by the model for all samples. 
- + """ # Matrix multiplication way test_sample_embeddings = learn.u_weight( @@ -141,7 +141,7 @@ def collab_prediction(idx_samples: torch.tensor, res = res + feat_biases.T + test_sample_biases if learn.y_range is not None: - res = torch.sigmoid(res) * (learn.y_range[1]-learn.y_range[0] + res = torch.sigmoid(res) * (learn.y_range[1] - learn.y_range[0] ) + learn.y_range[0] res = pd.DataFrame(res, @@ -162,7 +162,7 @@ def __init__(self, batch_size=64): if datasplits.val_y is not None: self.X, self.frac = combine_data(datasplits.train_X, - datasplits.val_y) + datasplits.val_y) else: self.X, self.frac = datasplits.train_X.reset_index(), 0.0 self.batch_size = batch_size @@ -172,16 +172,23 @@ def __init__(self, item_name=item_column, rating_name=target_column, bs=self.batch_size) - user_name=sample_column - item_name=item_column - rating_name=target_column - cat_names = [user_name,item_name] + user_name = sample_column + item_name = item_column + rating_name = target_column + cat_names = [user_name, item_name] ratings = self.X splits = None if datasplits.val_y is not None: - idx_splitter = IndexSplitter(list(range(len(datasplits.train_X), len(datasplits.train_X)+ len(datasplits.val_y) ))) + idx_splitter = IndexSplitter( + list(range(len(datasplits.train_X), len(datasplits.train_X) + len(datasplits.val_y)))) splits = idx_splitter(self.X) - to = TabularCollab(ratings, [Categorify], cat_names, y_names=[rating_name], y_block=TransformBlock(), splits=splits) + to = TabularCollab( + ratings, + [Categorify], + cat_names, + y_names=[rating_name], + y_block=TransformBlock(), + splits=splits) self.dls = to.dataloaders(path='.', bs=self.batch_size) self.params = {} self.model_kwargs = model_kwargs diff --git a/vaep/models/collect_dumps.py b/vaep/models/collect_dumps.py index 1854bd8f2..d0359ac22 100644 --- a/vaep/models/collect_dumps.py +++ b/vaep/models/collect_dumps.py @@ -55,4 +55,4 @@ def collect(paths: Iterable, collect_configs = partial(collect, load_fn=load_config_file, ) -collect_configs = update_wrapper(collect_configs, collect) \ No newline at end of file +collect_configs = update_wrapper(collect_configs, collect) diff --git a/vaep/models/vae.py b/vaep/models/vae.py index e7e2e8401..6395773e7 100644 --- a/vaep/models/vae.py +++ b/vaep/models/vae.py @@ -15,6 +15,7 @@ leaky_relu_default = nn.LeakyReLU(.1) + class VAE(nn.Module): def __init__(self, n_features: int, @@ -39,11 +40,11 @@ def build_layer(in_feat, out_feat): # Encoder self.encoder = [] - for i in range(len(self.layers)-1): - in_feat, out_feat = self.layers[i:i+2] + for i in range(len(self.layers) - 1): + in_feat, out_feat = self.layers[i:i + 2] self.encoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) - self.encoder.append(nn.Linear(out_feat, dim_latent*2)) + self.encoder.append(nn.Linear(out_feat, dim_latent * 2)) self.encoder = nn.Sequential(*self.encoder) @@ -56,13 +57,13 @@ def build_layer(in_feat, out_feat): out_feat=out_feat) i = -1 # in case a single hidden layer is passed - for i in range(len(self.layers_decoder)-2): - in_feat, out_feat = self.layers_decoder[i:i+2] + for i in range(len(self.layers_decoder) - 2): + in_feat, out_feat = self.layers_decoder[i:i + 2] self.decoder.extend(build_layer(in_feat=in_feat, out_feat=out_feat)) - in_feat, out_feat = self.layers_decoder[i+1:i+3] + in_feat, out_feat = self.layers_decoder[i + 1:i + 3] - self.decoder.append(nn.Linear(in_feat, out_feat*2)) + self.decoder.append(nn.Linear(in_feat, out_feat * 2)) if last_decoder_activation is not None: self.append(last_decoder_activation) @@ 
-84,7 +85,7 @@ def decode(self, z): return x_mu, x_logvar def reparameterize(self, mu, logvar): - std = torch.exp(0.5*logvar) + std = torch.exp(0.5 * logvar) return mu + torch.randn_like(std) * std def forward(self, x): @@ -95,11 +96,11 @@ def forward(self, x): def compute_kld(z_mu, z_logvar): - return 0.5*(z_mu**2 + torch.exp(z_logvar) - 1 - z_logvar) + return 0.5 * (z_mu**2 + torch.exp(z_logvar) - 1 - z_logvar) def gaussian_log_prob(z, mu, logvar): - return -0.5*(math.log(2*math.pi) + logvar + (z-mu)**2/torch.exp(logvar)) + return -0.5 * (math.log(2 * math.pi) + logvar + (z - mu)**2 / torch.exp(logvar)) def loss_fct(pred, y, reduction='sum', results: List = None, freebits=0.1): @@ -108,7 +109,7 @@ def loss_fct(pred, y, reduction='sum', results: List = None, freebits=0.1): l_rec = -torch.sum(gaussian_log_prob(batch, x_mu, x_logvar)) l_reg = torch.sum(F.relu(compute_kld(z_mu, z_logvar) - - freebits*math.log(2))+freebits*math.log(2), 1) + freebits * math.log(2)) + freebits * math.log(2), 1) if results is not None: results.append((l_rec.item(), torch.mean(l_reg).item())) diff --git a/vaep/nb.py b/vaep/nb.py index a8a4246e7..0d13104b7 100644 --- a/vaep/nb.py +++ b/vaep/nb.py @@ -9,14 +9,15 @@ class Config(): - """Config class with a setter enforcing that config entries cannot + """Config class with a setter enforcing that config entries cannot be overwritten. Can contain configs, which are itself configs: keys, paths, - + """ + def __setattr__(self, entry, value): """Set if attribute not in instance.""" if hasattr(self, entry) and getattr(self, entry) != value: @@ -45,7 +46,7 @@ def dump(self, fname=None): logger.info(f"Dumped config to: {fname}") @classmethod - def from_dict(cls, d:dict): + def from_dict(cls, d: dict): cfg = cls() for k, v in d.items(): setattr(cfg, k, v) @@ -57,17 +58,18 @@ def update_from_dict(self, params: dict): setattr(self, k, v) except AttributeError: logger.info(f"Already set attribute: {k} has value {v}") - + def keys(self): return vars(self).keys() def items(self): return vars(self).items() - + def values(self): return vars(self).values() -def get_params(args:dict.keys, globals, remove=True) -> dict: + +def get_params(args: dict.keys, globals, remove=True) -> dict: params = {k: v for k, v in globals.items() if k not in args and k[0] != '_'} if not remove: return params diff --git a/vaep/pandas/__init__.py b/vaep/pandas/__init__.py index bdbfa97f3..b10ce763a 100644 --- a/vaep/pandas/__init__.py +++ b/vaep/pandas/__init__.py @@ -12,6 +12,7 @@ from .calc_errors import calc_errors_per_feat, get_absolute_error + def combine_value_counts(X: pd.DataFrame, dropna=True) -> pd.DataFrame: """Pass a selection of columns to combine it's value counts. @@ -40,9 +41,9 @@ def combine_value_counts(X: pd.DataFrame, dropna=True) -> pd.DataFrame: def counts_with_proportion(s: pd.Series) -> pd.DataFrame: - """Counts with proportion of counts(!). - - Note: In case of missing values the proportion is not based on the total number of + """Counts with proportion of counts(!). + + Note: In case of missing values the proportion is not based on the total number of rows in the DataFrame. 
""" s = s.value_counts() @@ -111,11 +112,13 @@ def replace_with(string_key: str, replace: str = "()/", replace_with: str = '') string_key = string_key.replace(symbol, replace_with) return string_key -def index_to_dict(index:pd.Index) -> dict: + +def index_to_dict(index: pd.Index) -> dict: cols = {replace_with(col.replace(' ', '_').replace( '-', '_')): col for col in index} return cols + def get_columns_accessor(df: pd.DataFrame, all_lower_case=False) -> omegaconf.OmegaConf: if isinstance(df.columns, pd.MultiIndex): raise ValueError("MultiIndex not supported.") @@ -132,6 +135,7 @@ def get_columns_accessor_from_iterable(cols: Iterable[str], cols = {k.lower(): v for k, v in cols.items()} return omegaconf.OmegaConf.create(cols) + def select_max_by(df: pd.DataFrame, grouping_columns: list, selection_column: str) -> pd.DataFrame: df = df.sort_values(by=[*grouping_columns, selection_column], ascending=False) df = df.drop_duplicates(subset=grouping_columns, @@ -189,7 +193,7 @@ def _add_indices(array: np.array, original_df: pd.DataFrame, def interpolate(wide_df: pd.DataFrame, name='interpolated') -> pd.DataFrame: """Interpolate NA values with the values before and after. Uses n=3 replicates. - First rows replicates are the two following. + First rows replicates are the two following. Last rows replicates are the two preceding. Parameters @@ -219,7 +223,7 @@ def interpolate(wide_df: pd.DataFrame, name='interpolated') -> pd.DataFrame: ret.iloc[0] = first_row ret.iloc[-1] = last_row - ret = ret[mask].stack().dropna().squeeze() # does not work with MultiIndex columns + ret = ret[mask].stack().dropna().squeeze() # does not work with MultiIndex columns ret.rename(name, inplace=True) return ret @@ -236,7 +240,7 @@ def create_dict_of_dicts(d: dict, verbose=False, print(f"current key: {str(keys):90}: {len(v):>5}") current_dict = ret for k in keys[:-1]: - if not k in current_dict: + if k not in current_dict: current_dict[k] = dict() current_dict = current_dict[k] last_key = keys[-1] @@ -319,13 +323,13 @@ def length(x): Otherwise return length of list, pandas.Series, numpy.array, dict, etc.""" try: return len(x) - except: + except BaseException: return 0 def get_last_index_matching_proportion(df_counts: pd.DataFrame, - prop:float=0.25, - prop_col:str='proportion') -> object: + prop: float = 0.25, + prop_col: str = 'proportion') -> object: """df_counts needs to be sorted by "prop_col" (descending). 
Parameters @@ -349,8 +353,8 @@ def get_last_index_matching_proportion(df_counts: pd.DataFrame, return idx_cutoff -def get_lower_whiskers(df:pd.DataFrame, factor:float=1.5) -> pd.Series: +def get_lower_whiskers(df: pd.DataFrame, factor: float = 1.5) -> pd.Series: ret = df.describe() iqr = ret.loc['75%'] - ret.loc['25%'] - ret = ret.loc['25%'] - iqr*factor - return ret \ No newline at end of file + ret = ret.loc['25%'] - iqr * factor + return ret diff --git a/vaep/pandas/missing_data.py b/vaep/pandas/missing_data.py index b810895e5..7bd62e0ae 100644 --- a/vaep/pandas/missing_data.py +++ b/vaep/pandas/missing_data.py @@ -4,12 +4,15 @@ import pandas as pd + def percent_missing(df: pd.DataFrame) -> float: return df.isna().sum().sum() / math.prod(df.shape) + def percent_non_missing(df: pd.DataFrame) -> float: return df.notna().sum().sum() / math.prod(df.shape) + def list_files(folder='.') -> list[str]: return [f.as_posix() for f in Path(folder).iterdir()] @@ -29,4 +32,3 @@ def get_record(data: pd.DataFrame, columns_sample=False) -> dict: N_mis=int(N_mis), missing=float(missing), ) return record - diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py index ebbb8def2..a1fab486a 100644 --- a/vaep/plotting/__init__.py +++ b/vaep/plotting/__init__.py @@ -1,4 +1,5 @@ from __future__ import annotations + import numpy as np import pandas as pd import matplotlib @@ -9,22 +10,22 @@ import vaep.pandas +from .errors import plot_rolling_error +from . import errors +from . import data +from . import plotly +from . defaults import order_categories, labels_dict, IDX_ORDER + seaborn.set_style("whitegrid") # seaborn.set_theme() -plt.rcParams['figure.figsize'] = [16.0, 7.0] # [4, 2], [4, 3] +plt.rcParams['figure.figsize'] = [16.0, 7.0] # [4, 2], [4, 3] plt.rcParams['pdf.fonttype'] = 42 plt.rcParams['ps.fonttype'] = 42 plt.rcParams['figure.dpi'] = 147 -from . defaults import order_categories, labels_dict, IDX_ORDER -from . import plotly -from . import data -from . import errors -from .errors import plot_rolling_error - logger = logging.getLogger(__name__) __all__ = ['plotly', @@ -40,6 +41,7 @@ 'plot_cutoffs', ] + def _savefig(fig, name, folder: pathlib.Path = '.', pdf=True, dpi=300 # default 'figure' @@ -107,9 +109,9 @@ def select_dates(date_series: pd.Series, max_ticks=30) -> np.array: def make_large_descriptors(size='xx-large'): - """Helper function to have very large titles, labes and tick texts for + """Helper function to have very large titles, labes and tick texts for matplotlib plots per default. - + size: str fontsize or allowed category. 
Change default if necessary, default 'xx-large' """ @@ -141,13 +143,13 @@ def add_prop_as_second_yaxis(ax: matplotlib.axes.Axes, n_samples: int, ax2 = ax.twinx() n_min, n_max = np.round(ax.get_ybound()) logger.info(f"{n_min = }, {n_max = }") - lower_prop = n_min/n_samples + (ax.get_ybound()[0] - n_min) / n_samples - upper_prop = n_max/n_samples + (ax.get_ybound()[1] - n_max) / n_samples + lower_prop = n_min / n_samples + (ax.get_ybound()[0] - n_min) / n_samples + upper_prop = n_max / n_samples + (ax.get_ybound()[1] - n_max) / n_samples logger.info(f'{lower_prop = }, {upper_prop = }') ax2.set_ybound(lower_prop, upper_prop) # _ = ax2.set_yticks(np.linspace(n_min/n_samples, # n_max /n_samples, len(ax.get_yticks())-2)) - _ = ax2.set_yticks(ax.get_yticks()[1:-1]/n_samples) + _ = ax2.set_yticks(ax.get_yticks()[1:-1] / n_samples) ax2.yaxis.set_major_formatter( matplotlib.ticker.StrMethodFormatter(format_str)) return ax2 @@ -160,7 +162,7 @@ def add_height_to_barplot(ax, size=5): ax.annotate(text=format(bar.get_height(), '.2f'), xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()), - xytext=(0, int(size/2)), + xytext=(0, int(size / 2)), ha='center', va='center', size=size, @@ -208,7 +210,7 @@ def format_large_numbers(ax: matplotlib.axes.Axes, return ax -def plot_feat_counts(df_counts:pd.DataFrame, feat_name:str, n_samples:int, +def plot_feat_counts(df_counts: pd.DataFrame, feat_name: str, n_samples: int, ax=None, figsize=(15, 10), count_col='counts', **kwargs): @@ -218,7 +220,7 @@ def plot_feat_counts(df_counts:pd.DataFrame, feat_name:str, n_samples:int, title=f'Count and proportion of {len(df_counts):,d} {feat_name}s over {n_samples:,d} samples', ) args.update(kwargs) - + ax = df_counts[count_col].plot( figsize=figsize, @@ -236,8 +238,8 @@ def plot_feat_counts(df_counts:pd.DataFrame, feat_name:str, n_samples:int, def plot_counts(df_counts: pd.DataFrame, n_samples, - feat_col_name:str='count', - feature_name=None, + feat_col_name: str = 'count', + feature_name=None, ax=None, prop_feat=0.25, min_feat_prop=.01, **kwargs): """Plot counts based on get_df_counts.""" @@ -251,7 +253,7 @@ def plot_counts(df_counts: pd.DataFrame, n_samples, ax=ax, **kwargs) df_counts['prop'] = df_counts[feat_col_name] / n_samples n_feat_cutoff = vaep.pandas.get_last_index_matching_proportion( - df_counts=df_counts, prop=prop_feat, prop_col='prop') + df_counts=df_counts, prop=prop_feat, prop_col='prop') n_samples_cutoff = df_counts.loc[n_feat_cutoff, feat_col_name] logger.info(f'{n_feat_cutoff = }, {n_samples_cutoff = }') x_lim_max = vaep.pandas.get_last_index_matching_proportion( @@ -307,5 +309,3 @@ def plot_cutoffs(df: pd.DataFrame, if min_feat_in_sample is not None: ax.axhline(min_feat_in_sample) return fig, axes - - diff --git a/vaep/sampling.py b/vaep/sampling.py index a5c5f5f29..9716c14f1 100644 --- a/vaep/sampling.py +++ b/vaep/sampling.py @@ -16,11 +16,11 @@ def feature_frequency(df_wide: pd.DataFrame, measure_name: str = 'freq') -> pd.S Returns ------- pd.Series - Frequency on non-missing entries per feature (column). + Frequency on non-missing entries per feature (column). """ # if hasattr(df_wide.columns, "levels"): # is columns.names always set? 
# is listed as attribute: https://pandas.pydata.org/docs/reference/api/pandas.Index.html - _df_feat = df_wide.stack(df_wide.columns.names) # ensure that columns are named + _df_feat = df_wide.stack(df_wide.columns.names) # ensure that columns are named _df_feat = _df_feat.to_frame(measure_name) # implicit as stack puts column index in the last position (here: 1) @@ -61,8 +61,8 @@ def sample_data(series: pd.Series, sample_index_to_drop: Union[str, int], Parameters ---------- series : pd.Series - Long-format data in pd.Series. Index name is feature name. 2 dimensional - MultiIndex. + Long-format data in pd.Series. Index name is feature name. 2 dimensional + MultiIndex. sample_index_to_drop : Union[str, int] Sample index (as str or integer Index position). Unit to group by (i.e. Samples) frac : float, optional diff --git a/vaep/stats/__init__.py b/vaep/stats/__init__.py index 37992dbc5..3b7233754 100644 --- a/vaep/stats/__init__.py +++ b/vaep/stats/__init__.py @@ -1 +1 @@ -from . import diff_analysis \ No newline at end of file +from . import diff_analysis diff --git a/vaep/stats/diff_analysis.py b/vaep/stats/diff_analysis.py index d5ca991d2..13f411693 100644 --- a/vaep/stats/diff_analysis.py +++ b/vaep/stats/diff_analysis.py @@ -11,11 +11,11 @@ def ancova_pg(df_long: pd.DataFrame, feat_col: str, dv: str, between: str, - covar: list[str]|str, + covar: list[str] | str, fdr=0.05) -> pd.DataFrame: """ Analysis of covariance (ANCOVA) using pg.ancova https://pingouin-stats.org/generated/pingouin.ancova.html - + Adds multiple hypothesis testing correction by Benjamini-Hochberg (qvalue, rejected) @@ -64,7 +64,7 @@ def ancova_pg(df_long: pd.DataFrame, scores['-Log10 pvalue'] = -np.log10(scores['p-unc']) scores = scores[scores.Source != 'Residual'] - #FDR correction + # FDR correction scores = add_fdr_scores(scores, random_seed=123) return scores @@ -83,7 +83,7 @@ def analyze(df_proteomics: pd.DataFrame, df_clinic: pd.DataFrame, target: str, covar: list[str], - value_name: str='intensity') -> pd.DataFrame: + value_name: str = 'intensity') -> pd.DataFrame: """apply ancova and multiple test correction. 
Parameters diff --git a/vaep/tests/io/test_data_objects.py b/vaep/tests/io/test_data_objects.py index b36acf491..78ffa67cf 100644 --- a/vaep/tests/io/test_data_objects.py +++ b/vaep/tests/io/test_data_objects.py @@ -24,7 +24,7 @@ expected = """ Sequence,Charge,m/z,Protein group IDs,Intensity,Score YYYIPQYK,2,569.2844,3745,147680000.0,83.801 -YYVTIIDAPGHR,3,468.91386,2873,8630000000.0,131.83 +YYVTIIDAPGHR,3,468.91386,2873,8630000000.0,131.83 YYVTIIDAPGHR,2,702.867151,2873,2458400000.0,70.028 YYVLNALK,2,492.28166,3521,147430000.0,58.687 """ diff --git a/vaep/tests/io/test_dataloaders.py b/vaep/tests/io/test_dataloaders.py index 67206bd8b..a801beeba 100644 --- a/vaep/tests/io/test_dataloaders.py +++ b/vaep/tests/io/test_dataloaders.py @@ -10,7 +10,7 @@ def test_get_dls(): N, M = 23, 11 X_train = create_random_df(N, M) - N_valid = int(N*0.3) + N_valid = int(N * 0.3) X_valid = create_random_df( N_valid, M, prop_na=.1, start_idx=len(X_train)) @@ -18,13 +18,11 @@ def test_get_dls(): [('normalize', StandardScaler()), ('impute', SimpleImputer(add_indicator=False))]) transforms = VaepPipeline(df_train=X_train, - encode=dae_default_pipeline, - decode=['normalize']) + encode=dae_default_pipeline, + decode=['normalize']) BS = 4 dls = get_dls(train_X=X_train, valid_X=X_valid, transformer=transforms, bs=BS) assert len(dls.train_ds) == N assert len(dls.valid_ds) == N batch = dls.one_batch() assert batch[0].shape == (BS, M) - - diff --git a/vaep/tests/io/test_dataset.py b/vaep/tests/io/test_dataset.py index ae9da5a40..7fc12d431 100644 --- a/vaep/tests/io/test_dataset.py +++ b/vaep/tests/io/test_dataset.py @@ -6,7 +6,8 @@ from vaep.io.datasets import DatasetWithMaskAndNoTarget -def test_DatasetWithMaskAndNoTarget(): + +def test_DatasetWithMaskAndNoTarget(): with pytest.raises(ValueError): DatasetWithMaskAndNoTarget(df=np.random.rand(10, 5)) @@ -14,4 +15,4 @@ def test_DatasetWithMaskAndNoTarget(): data = helpers.create_DataFrame() ds = DatasetWithMaskAndNoTarget(df=data) assert all(ds[-1][1] == torch.tensor([95, 96, 97, 98, 99], dtype=torch.int32)) - assert all(ds[-1][0] == torch.tensor([False, False, False, False, False])) \ No newline at end of file + assert all(ds[-1][0] == torch.tensor([False, False, False, False, False])) diff --git a/vaep/tests/io/test_datasplits.py b/vaep/tests/io/test_datasplits.py index acae6f5c7..d9a4ebde0 100644 --- a/vaep/tests/io/test_datasplits.py +++ b/vaep/tests/io/test_datasplits.py @@ -8,14 +8,14 @@ X = np.random.rand(N, M) df = (pd.DataFrame(X, - index=[f'sample_{i}' for i in range(N)], - columns=(f'feat_{i}' for i in range(M))) - .rename_axis('Sample ID') - .rename_axis('Feature Name', axis=1)) + index=[f'sample_{i}' for i in range(N)], + columns=(f'feat_{i}' for i in range(M))) + .rename_axis('Sample ID') + .rename_axis('Feature Name', axis=1)) -_splits = {'train_X': df.iloc[:int(N*0.6)], - 'val_y': df.iloc[int(N*0.6):int(N*0.8)], - 'test_y': df.iloc[int(N*0.8):]} +_splits = {'train_X': df.iloc[:int(N * 0.6)], + 'val_y': df.iloc[int(N * 0.6):int(N * 0.8)], + 'test_y': df.iloc[int(N * 0.8):]} def test_DataSplits_iter(): @@ -54,11 +54,12 @@ def test_dump_load(tmp_path): splits = DataSplits(is_wide_format=None) splits.load(folder=tmp_path, use_wide_format=True) assert splits.train_X is not _splits['train_X'] - + npt.assert_almost_equal(_splits['train_X'].values, splits.train_X) # #ToDo: Index and Column names are not yet correctly set # assert splits.train_X.equals(_splits['train_X']) + def test_to_long_format(tmp_path): splits = DataSplits(**_splits, is_wide_format=True) 
splits.dump(folder=tmp_path) @@ -72,6 +73,7 @@ def test_to_long_format(tmp_path): assert splits.val_y is not expected assert splits.val_y.equals(expected) + def test_to_wide_format(tmp_path): splits = DataSplits(**_splits, is_wide_format=True) splits.dump(folder=tmp_path) @@ -85,9 +87,10 @@ def test_to_wide_format(tmp_path): assert splits.val_y is not expected assert splits.val_y.equals(expected) + def test_interpolate(): splits = DataSplits(**_splits, is_wide_format=True) - splits._is_wide = True # ToDo. Is not correctly set when init is called. + splits._is_wide = True # ToDo. Is not correctly set when init is called. with pytest.raises(AttributeError): _ = splits.interpolate('non-existing') diff --git a/vaep/tests/models/__pycache__/test_collect_dumps.py b/vaep/tests/models/__pycache__/test_collect_dumps.py index fd585f567..5dbc9b986 100644 --- a/vaep/tests/models/__pycache__/test_collect_dumps.py +++ b/vaep/tests/models/__pycache__/test_collect_dumps.py @@ -7,7 +7,3 @@ def test_select_content(): 'model_metrics_collab'] for test_case in test_cases: assert select_content(test_case, first_split='metrics_') == test_case.split('metrics_')[1] - - - - diff --git a/vaep/tests/pandas/test_calc_errors.py b/vaep/tests/pandas/test_calc_errors.py index c56e20bea..0749cd2d7 100644 --- a/vaep/tests/pandas/test_calc_errors.py +++ b/vaep/tests/pandas/test_calc_errors.py @@ -6,7 +6,7 @@ @fixture def example_data(): - """Example data with duplicated index values. Normally MulitIndex is used with + """Example data with duplicated index values. Normally MulitIndex is used with unique combination of sample and feat values.""" data = [[25.47317633, 27.23206642, 26.43510602, 28.40661375, 27.6536975], [30.57866718, 30.17035425, 30.22881888, 29.82725333, 30.1177242], @@ -21,7 +21,7 @@ def example_data(): data = pd.DataFrame(data, index=(f'feat_{i}' for i in [ 0, 0, 1, 1, 1, 2, 3, 4, 5, 6]), - columns=['observed'] + ['model_' + str(i+1) for i in range(4)]) + columns=['observed'] + ['model_' + str(i + 1) for i in range(4)]) data.columns.name = 'model' data.index.name = 'feat' data['freq_feat'] = [4, 5, 5, 4, 6, 7, 7, 9, 8, 6] diff --git a/vaep/tests/test_ae.py b/vaep/tests/test_ae.py index 1b30424f5..319f91e81 100644 --- a/vaep/tests/test_ae.py +++ b/vaep/tests/test_ae.py @@ -19,6 +19,7 @@ ) )""" + def test_basic_repr(): model = ae.Autoencoder(n_features=100, n_neurons=30) actual_repr = repr(model) @@ -26,6 +27,3 @@ def test_basic_repr(): assert model.dim_latent == 10 assert model.n_neurons == [30] assert model.n_features == 100 - - - diff --git a/vaep/tests/test_collab.py b/vaep/tests/test_collab.py index 42c4e4cc2..1e770d911 100644 --- a/vaep/tests/test_collab.py +++ b/vaep/tests/test_collab.py @@ -13,16 +13,17 @@ index=[f'sample_{i}' for i in range(N)], columns=(f'feat_{i}' for i in range(M))) -data = {'train_X': df.iloc[:int(N*0.6)], - 'val_y': df.iloc[int(N*0.6):int(N*0.8)], - 'test_y': df.iloc[int(N*0.8):]} +data = {'train_X': df.iloc[:int(N * 0.6)], + 'val_y': df.iloc[int(N * 0.6):int(N * 0.8)], + 'test_y': df.iloc[int(N * 0.8):]} data = DataSplits(**data, is_wide_format=True) -assert data._is_wide +assert data._is_wide data.to_long_format() + def test_combine_data(): N_train, N_val = len(data.train_X), len(data.val_y) X, frac = collab.combine_data(data.train_X, data.val_y) assert len(X) == N_train + N_val - npt.assert_almost_equal(frac, N_val / (N_train+N_val)) + npt.assert_almost_equal(frac, N_val / (N_train + N_val)) diff --git a/vaep/tests/test_helpers.py b/vaep/tests/test_helpers.py index 
d860bc319..fcde11887 100644 --- a/vaep/tests/test_helpers.py +++ b/vaep/tests/test_helpers.py @@ -3,9 +3,9 @@ from vaep.utils import create_random_missing_data + def test_create_random_missing_data(): data = create_random_missing_data(N=43, M=13, prop_missing=0.2) assert data.shape == (43, 13) assert np.isnan(data).sum() - assert abs((float(np.isnan(data).sum()) / (43 * 13) ) - 0.2 ) < 0.05 - + assert abs((float(np.isnan(data).sum()) / (43 * 13)) - 0.2) < 0.05 diff --git a/vaep/tests/test_imputation.py b/vaep/tests/test_imputation.py index 7419705c8..747b71c80 100644 --- a/vaep/tests/test_imputation.py +++ b/vaep/tests/test_imputation.py @@ -9,7 +9,7 @@ fraction_missing = proteins.notna().mean() -data = data[data.columns[fraction_missing > 0.4]] +data = data[data.columns[fraction_missing > 0.4]] N_FEAT = 200 N_FEAT_digits = len(str(N_FEAT)) data = data.sample(N_FEAT, axis=1) @@ -56,30 +56,29 @@ def test_imputation_normal_dist(): # def test_imputation_mixed_norm_KNN(): # pass + + @pytest.mark.parametrize('axis', [0, 1]) def test_impute_shifted_normal(example_data, axis): - mean_shift=1.8 - # remove zeros as these lead to -inf + mean_shift = 1.8 + # remove zeros as these lead to -inf example_data = np.log2(example_data.replace({0.0: np.nan}) - ).dropna(thresh=10, axis=1-axis) + ).dropna(thresh=10, axis=1 - axis) N, M = example_data.shape mask_observed = example_data.notna() imputed = impute_shifted_normal(example_data, axis=axis, mean_shift=mean_shift) - assert len(imputed) == ((N*M) - len(example_data.stack())) - + assert len(imputed) == ((N * M) - len(example_data.stack())) + if axis == 1: min_N = int(len(example_data) * 0.6) selected = example_data.dropna(axis=1, thresh=min_N) elif axis == 0: min_M = int(example_data.shape[1] * 0.6) selected = example_data.dropna(axis=0, thresh=min_M) - + mean = selected.mean(axis=axis) std = selected.std(axis=axis) mean_shifted = mean - (std * mean_shift) mean_imputed = imputed.unstack().mean(axis=axis) assert (mean_shifted - mean_imputed).abs().max() < 0.35 - - - diff --git a/vaep/tests/test_io.py b/vaep/tests/test_io.py index 610746a4c..143d705c5 100644 --- a/vaep/tests/test_io.py +++ b/vaep/tests/test_io.py @@ -3,16 +3,17 @@ import numpy as np import numpy.testing as npt -import vaep.io +import vaep.io from vaep.io.datasets import PeptideDatasetInMemory -data = np.random.random(size=(10,5)) +data = np.random.random(size=(10, 5)) mask = ~(data < 0.1) data_w_na = np.where(mask, data, np.nan) assert (data != data_w_na).any() assert (~np.isnan(data_w_na) == mask).all() + def test_PeptideDatasetInMemory_wo_Mask(): train_ds = PeptideDatasetInMemory(data_w_na, fill_na=0.0) mask_isna = np.isnan(data_w_na) @@ -25,8 +26,8 @@ def test_PeptideDatasetInMemory_wo_Mask(): def test_relative_to(): fpath = Path('project/runs/experiment_name/run') - pwd = 'project/runs/' # per defaut '.' (the current working directory) - expected = Path('experiment_name/run') + pwd = 'project/runs/' # per defaut '.' (the current working directory) + expected = Path('experiment_name/run') acutal = vaep.io.resolve_path(fpath, pwd) assert expected == acutal @@ -36,4 +37,4 @@ def test_relative_to(): # pwd = 'root/home/project/runs/' # per defaut '.' 
(the current working directory) # expected = Path('root/home/project/data/file') # acutal = vaep.io.resolve_path(fpath, pwd) - # assert expected == acutal \ No newline at end of file + # assert expected == acutal diff --git a/vaep/tests/test_nb.py b/vaep/tests/test_nb.py index 906406edb..a6dddb8b5 100644 --- a/vaep/tests/test_nb.py +++ b/vaep/tests/test_nb.py @@ -6,4 +6,4 @@ def test_Config(): cfg = Config() cfg.test = 'test' with pytest.raises(AttributeError): - cfg.test = 'raise AttributeError' \ No newline at end of file + cfg.test = 'raise AttributeError' diff --git a/vaep/tests/test_pandas.py b/vaep/tests/test_pandas.py index 555772594..7f66e1431 100644 --- a/vaep/tests/test_pandas.py +++ b/vaep/tests/test_pandas.py @@ -5,12 +5,12 @@ def test_interpolate(): test_data = { - "pep1": {0: nan, 1: 27.8, 2: 28.9, 3: nan, 4: 28.7}, - "pep2": {0: 29.1, 1: nan, 2: 27.6, 3: 29.1, 4: nan}, + "pep1": {0: nan, 1: 27.8, 2: 28.9, 3: nan, 4: 28.7}, + "pep2": {0: 29.1, 1: nan, 2: 27.6, 3: 29.1, 4: nan}, # 4 values replace based on one (edge case): - "pep3": {0: nan, 1: nan, 2: 23.6, 3: nan, 4: nan}, - "pep4": {0: nan, 1: nan, 2: nan, 3: nan, 4: nan}, - "pep5": {0: 26.0, 1: 27.0, 2: nan, 3: nan, 4: nan}, + "pep3": {0: nan, 1: nan, 2: 23.6, 3: nan, 4: nan}, + "pep4": {0: nan, 1: nan, 2: nan, 3: nan, 4: nan}, + "pep5": {0: 26.0, 1: 27.0, 2: nan, 3: nan, 4: nan}, } df_test_data = pd.DataFrame(test_data) @@ -52,11 +52,12 @@ def test_flatten_dict_of_dicts(): assert expected == actual + def test_create_dict_of_dicts(): data = {('a', 'a1', 'a2'): 1, - ('a', 'a1', 'a3'): 2, - ('b', 'b1', 'b2'): 3, - ('b', 'b1', 'b3'): 4} + ('a', 'a1', 'a3'): 2, + ('b', 'b1', 'b2'): 3, + ('b', 'b1', 'b3'): 4} expected = { "a": {'a1': {'a2': 1, 'a3': 2}}, "b": {'b1': {'b2': 3, 'b3': 4}} @@ -65,9 +66,9 @@ def test_create_dict_of_dicts(): assert expected == actual data = {('a', 'a1', 'a2'): (1, 1), - ('a', 'a1', 'a3'): (2, 2), - ('b', 'b1', 'b2'): (3, 3), - ('b', 'b1', 'b3'): (4, 4)} + ('a', 'a1', 'a3'): (2, 2), + ('b', 'b1', 'b2'): (3, 3), + ('b', 'b1', 'b3'): (4, 4)} expected = { "a": {'a1': {'a2': [1, 1], 'a3': [2, 2]}}, "b": {'b1': {'b2': [3, 3], 'b3': [4, 4]}} @@ -112,4 +113,4 @@ def test_key_map(): 'gamma': ('a', 'b'), 'delta': None}} actual = vaep.pandas.key_map(d) - assert expected == actual \ No newline at end of file + assert expected == actual diff --git a/vaep/tests/test_transfrom.py b/vaep/tests/test_transfrom.py index 0aa0176e8..2a1463580 100644 --- a/vaep/tests/test_transfrom.py +++ b/vaep/tests/test_transfrom.py @@ -4,7 +4,6 @@ import numpy.testing as npt - import sklearn from sklearn import preprocessing from sklearn import impute @@ -43,7 +42,7 @@ def test_Vaep_Pipeline(): dae_default_pipeline = sklearn.pipeline.Pipeline( [ ('normalize', preprocessing.StandardScaler()), - ('impute', impute.SimpleImputer(add_indicator=False)) # True won't work + ('impute', impute.SimpleImputer(add_indicator=False)) # True won't work ] ) from random_data import data @@ -52,20 +51,20 @@ def test_Vaep_Pipeline(): # new procs, transform equal encode, inverse_transform equals decode dae_transforms = VaepPipeline(df, encode=dae_default_pipeline) res = dae_transforms.transform(df) - assert type(res) == pd.DataFrame + assert isinstance(res, pd.DataFrame) with pytest.raises(ValueError): res = dae_transforms.inverse_transform(res) # pd.DataFrame - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.iloc[0]) # pd.DataFrame - with pytest.raises(ValueError): + with 
pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.loc['sample_156']) # pd.DataFrame - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(to_tensor(res)) # torch.Tensor - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.values) # numpy.array - with pytest.raises(ValueError): + with pytest.raises(ValueError): _ = dae_transforms.inverse_transform(res.values[0]) # single sample dae_transforms = VaepPipeline(df, encode=dae_default_pipeline, decode=['normalize']) res = dae_transforms.transform(df) res = dae_transforms.inverse_transform(res) - npt.assert_array_almost_equal(df.values[mask], res.values[mask]) \ No newline at end of file + npt.assert_array_almost_equal(df.values[mask], res.values[mask]) diff --git a/vaep/tf_board.py b/vaep/tf_board.py index 90f5b6ecf..bf8959e09 100644 --- a/vaep/tf_board.py +++ b/vaep/tf_board.py @@ -5,9 +5,10 @@ class TensorboardModelNamer(): """PyTorch SummaryWriter helper class for experiments. - + Creates new SummaryWriter for an experiment """ + def __init__(self, prefix_folder, root_dir=Path('runs')): """[summary] @@ -30,7 +31,7 @@ def get_model_name(self, hidden_layers: int, name = 'model_' name += f'hl{hidden_layers:02d}' - if type(neurons) == str: + if isinstance(neurons, str): neurons = neurons.split() elif not type(neurons) in [list, tuple]: raise TypeError( @@ -39,7 +40,7 @@ def get_model_name(self, hidden_layers: int, for x in neurons: name += f'_{x}' - if type(scaler) == str: + if isinstance(scaler, str): name += f'_{scaler}' else: name += f'_{scaler!r}' diff --git a/vaep/transform.py b/vaep/transform.py index ee3b9ac87..533599161 100644 --- a/vaep/transform.py +++ b/vaep/transform.py @@ -73,6 +73,7 @@ def inverse_transform(self, X, copy=None): # # arguments, see https://fastcore.fast.ai/meta.html#Metaprogramming # # decorate() + def transform(self, X, **kwargs): res = super(self.__class__, self).transform(X, **kwargs) if isinstance(X, pd.DataFrame): @@ -140,8 +141,9 @@ def get_df_fitted_mean_std(self, index): class VaepPipeline(): """Custom Pipeline combining a pandas.DataFrame and a sklearn.pipeline.Pipleine.""" - def __init__(self, df_train:pd.DataFrame, encode:sklearn.pipeline.Pipeline, - decode:List[str] =None): + + def __init__(self, df_train: pd.DataFrame, encode: sklearn.pipeline.Pipeline, + decode: List[str] = None): """[summary] Parameters @@ -153,7 +155,7 @@ def __init__(self, df_train:pd.DataFrame, encode:sklearn.pipeline.Pipeline, decode : List[str], optional subset of transforms (their string name) as an Iterable, by default None, i.e. 
the same as encode - """ + """ self.columns = df_train.columns self.M = len(df_train.columns) self.encode = encode @@ -163,20 +165,18 @@ def __init__(self, df_train:pd.DataFrame, encode:sklearn.pipeline.Pipeline, for d in decode: self.decode.append( (d, self.encode.named_steps[d]) - ) + ) self.decode = sklearn.pipeline.Pipeline(self.decode) else: self.decode = self.encode - - - + def transform(self, X): res = self.encode.transform(X) if isinstance(X, pd.DataFrame): return pd.DataFrame(res, columns=X.columns, index=X.index) return res - + # Option: single-dispatch based on type of X def inverse_transform(self, X, index=None): columns = self.columns @@ -195,4 +195,4 @@ def inverse_transform(self, X, index=None): X = X.reshape(-1, self.M) res = self.decode.inverse_transform(X) res = pd.DataFrame(res, columns=columns, index=index) - return res \ No newline at end of file + return res diff --git a/vaep/utils.py b/vaep/utils.py index 0ddccb131..d0ac0810e 100644 --- a/vaep/utils.py +++ b/vaep/utils.py @@ -19,8 +19,8 @@ def append_to_filepath(filepath: Union[pathlib.Path, str], to_append: str, sep: str = '_', new_suffix: str = None) -> pathlib.Path: - """Append filepath with specified to_append using a seperator. - + """Append filepath with specified to_append using a seperator. + Example: `data.csv` to data_processed.csv """ filepath = pathlib.Path(filepath) @@ -59,11 +59,11 @@ def create_long_df(N: int, M: int, prop_missing=0.1): def create_random_df(N: int, M: int, - scaling_factor: float = 30.0, - prop_na: float = 0.0, - start_idx: int = 0, - name_index='Sample ID', - name_columns='peptide'): + scaling_factor: float = 30.0, + prop_na: float = 0.0, + start_idx: int = 0, + name_index='Sample ID', + name_columns='peptide'): X = np.random.rand(N, M) if prop_na > 0.0 and prop_na < 1.0: @@ -74,7 +74,7 @@ def create_random_df(N: int, M: int, X = pd.DataFrame(X, index=[f'sample_{i:0{len(str(N))}}' - for i in range(start_idx, start_idx+N)], + for i in range(start_idx, start_idx + N)], columns=(f'feat_{i:0{len(str(M))}}' for i in range(M))) X.index.name = name_index X.columns.name = name_columns
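For orientation, below is a minimal usage sketch of the `VaepPipeline` encode/decode round-trip that the reformatted `vaep/tests/test_transfrom.py` and `vaep/tests/io/test_dataloaders.py` hunks above exercise. It is not part of the diff; the import paths (`vaep.transform`, `vaep.utils`), data shapes, and keyword values are assumptions inferred from those hunks.

```python
# Not part of the diff: a sketch of the VaepPipeline round-trip from the tests above.
# Import paths, shapes and keyword values are assumed from the hunks shown.
import sklearn.pipeline
from sklearn import impute, preprocessing

from vaep.transform import VaepPipeline
from vaep.utils import create_random_df

# wide DataFrame ('Sample ID' x 'peptide') with ~10% missing values
df = create_random_df(N=50, M=10, prop_na=0.1)

encode = sklearn.pipeline.Pipeline([
    ('normalize', preprocessing.StandardScaler()),
    ('impute', impute.SimpleImputer(add_indicator=False)),  # add_indicator=True won't work
])

# decode=['normalize'] restricts inverse_transform to the scaling step,
# so imputed values are kept on the way back (as in the tests)
transforms = VaepPipeline(df_train=df, encode=encode, decode=['normalize'])

res = transforms.transform(df)            # pd.DataFrame, per test_Vaep_Pipeline
back = transforms.inverse_transform(res)  # DataFrame with the original index/columns
```

In `test_get_dls` (see the `vaep/tests/io/test_dataloaders.py` hunk above) the same kind of `VaepPipeline` instance is passed as `transformer=` to `get_dls` together with train and validation frames to build the dataloaders.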