diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb index f6df4bd16..5d0214392 100644 --- a/project/01_0_split_data.ipynb +++ b/project/01_0_split_data.ipynb @@ -16,10 +16,10 @@ "outputs": [], "source": [ "from pathlib import Path\n", - "\n", + "import logging\n", "from typing import Union, List\n", "\n", - "\n", + "from IPython.display import display\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", @@ -28,31 +28,31 @@ "\n", "import vaep\n", "from vaep.io.datasplits import DataSplits\n", - "from vaep.io import thermo_raw_files\n", "from vaep.sampling import feature_frequency, sample_data\n", "\n", "from vaep.analyzers import analyzers\n", - "from vaep.analyzers.analyzers import AnalyzePeptides\n", + "from vaep.analyzers.analyzers import AnalyzePeptides\n", "\n", "logger = vaep.logging.setup_nb_logger()\n", "logger.info(\"Split data and make diagnostic plots\")\n", + "logging.getLogger('fontTools').setLevel(logging.WARNING)\n", + "\n", "\n", - "def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame):\n", + "def add_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):\n", " try:\n", - " analysis.df = analysis.df.loc[df_meta.index]\n", + " df = df.loc[df_meta.index]\n", " except KeyError as e:\n", " logger.warning(e)\n", " logger.warning(\"Ignore missing samples in quantified samples\")\n", - " analysis.df = analysis.df.loc[analysis.df.index.intersection(\n", + " df = df.loc[df.index.intersection(\n", " df_meta.index)]\n", - "\n", - " analysis.df_meta = df_meta\n", - " return analysis\n", + " return df_meta\n", "\n", "\n", "pd.options.display.max_columns = 32\n", "plt.rcParams['figure.figsize'] = [4, 2]\n", - "vaep.plotting.make_large_descriptors(5)\n", + "\n", + "vaep.plotting.make_large_descriptors(6)\n", "\n", "figures = {} # collection of ax or figures\n", "dumps = {} # collection of data dumps" @@ -80,39 +80,35 @@ "cell_type": "code", "execution_count": null, "metadata": { - "lines_to_next_cell": 2, "tags": [ "parameters" ] }, "outputs": [], "source": [ - "# Sample (rows) intensiites for features (columns)\n", - "FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n", - "# Can be either a string or position (typical 0 for first column), or a list of these.\n", - "index_col: Union[str, int] = 0\n", - "# wide_format: bool = False # intensities in wide format (more memory efficient of csv). Default is long_format (more precise)\n", - "# Manuelly set column names (of Index object in columns)\n", - "column_names: List[str] = [\"Gene Names\"]\n", - "# Machine parsed metadata from raw file (see workflows/metadata), wide format per sample\n", - "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'\n", - "# Minimum number or fraction of feature prevalence across samples to be kept\n", - "feat_prevalence: Union[int, float] = 0.25\n", - "# Minimum number or fraction of total requested features per Sample\n", - "sample_completeness: Union[int, float] = 0.5\n", + "FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' # Sample (rows), features (columns)\n", + "index_col: Union[str, int] = 0 # Can be either a string or position (default 0 for first column), or a list of these.\n", + "column_names: List[str] = [\"Gene Names\"] # Manuelly set column names (of Index object in columns)\n", + "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # metadata for samples (rows)\n", + "feat_prevalence: Union[int, float] = 0.25 # Minimum number or fraction of feature prevalence across samples to be kept\n", + "sample_completeness: Union[int, float] = 0.5 # Minimum number or fraction of total requested features per Sample\n", "select_N: int = None # only use latest N samples\n", - "sample_N: bool = False # if select_N, sample N randomly instead of using latest?\n", + "sample_N: bool = False # if select_N, sample N randomly instead of using latest N\n", "random_state: int = 42 # random state for reproducibility of splits\n", - "# based on raw file meta data, only take samples with RT > min_RT_time\n", - "min_RT_time: Union[int, float] = None\n", - "# Log transformation of initial data (select one of the existing in numpy)\n", - "logarithm: str = 'log2'\n", - "folder_experiment: str = f'runs/example'\n", - "folder_data: str = '' # specify data directory if needed\n", + "min_RT_time: Union[int, float] = None # based on raw file meta data, only take samples with RT > min_RT_time\n", + "logarithm: str = 'log2' # Log transformation of initial data (select one of the existing in numpy)\n", + "folder_experiment: str = 'runs/example' # folder to save figures and data dumps\n", + "folder_data: str = '' # specify special data directory if needed\n", "file_format: str = 'csv' # file format of create splits, default pickle (pkl)\n", "# metadata -> defaults for metadata extracted from machine data, used for plotting\n", "meta_date_col: str = None # date column in meta data\n", - "meta_cat_col: str = None # category column in meta data" + "meta_cat_col: str = None # category column in meta data\n", + "# train, validation and test data splits\n", + "frac_non_train: float = 0.1 # fraction of non training data (validation and test split)\n", + "frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random\n", + "\n", + "meta_date_col: str = 'Content Creation Date'\n", + "meta_cat_col: str = 'Software Version'" ] }, { @@ -187,6 +183,7 @@ "metadata": {}, "outputs": [], "source": [ + "! factor out file reading to a separate module, not class\n", "# AnalyzePeptides.from_csv\n", "constructor = getattr(AnalyzePeptides, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT])\n", "analysis = constructor(fname=params.FN_INTENSITIES,\n", @@ -202,7 +199,8 @@ "log_fct = getattr(np, params.logarithm)\n", "analysis.log_transform(log_fct)\n", "logger.info(f\"{analysis = }\")\n", - "analysis.df" + "df = analysis.df\n", + "df" ] }, { @@ -213,8 +211,13 @@ }, "outputs": [], "source": [ - "ax = analysis.df.notna().sum(axis=0).to_frame(\n", - " analysis.df.columns.name).plot.box()\n", + "ax = (df\n", + " .notna()\n", + " .sum(axis=0)\n", + " .to_frame(df.columns.name)\n", + " .plot\n", + " .box()\n", + " )\n", "ax.set_ylabel('number of observation across samples')" ] }, @@ -228,7 +231,7 @@ "dumps[fname.name] = fname.as_posix()\n", "writer = pd.ExcelWriter(fname)\n", "\n", - "notna = analysis.df.notna()\n", + "notna = df.notna()\n", "data_stats_original = pd.concat(\n", " [\n", " notna.sum().describe().rename('feat_stats'),\n", @@ -258,15 +261,16 @@ " ret = \"_\".join(str(x) for x in seq)\n", " return ret\n", "\n", + "\n", "# ToDo: join multiindex samples indices (pkl dumps)\n", - "# if hasattr(analysis.df.columns, \"levels\"):\n", - "if isinstance(analysis.df.columns, pd.MultiIndex):\n", + "# if hasattr(df.columns, \"levels\"):\n", + "if isinstance(df.columns, pd.MultiIndex):\n", " logger.warning(\"combine MultiIndex columns to one feature column\")\n", - " print(analysis.df.columns[:10].map(join_as_str))\n", - " _new_name = join_as_str(analysis.df.columns.names)\n", - " analysis.df.columns = analysis.df.columns.map(join_as_str)\n", - " analysis.df.columns.name = _new_name\n", - " logger.warning(f\"New name: {analysis.df.columns.names = }\")" + " print(df.columns[:10].map(join_as_str))\n", + " _new_name = join_as_str(df.columns.names)\n", + " df.columns = df.columns.map(join_as_str)\n", + " df.columns.name = _new_name\n", + " logger.warning(f\"New name: {df.columns.names = }\")" ] }, { @@ -294,8 +298,8 @@ " if params.meta_cat_col:\n", " raise ValueError(\n", " f\"No metadata provided, but data column set: {params.meta_cat_col}\")\n", - " df_meta = pd.DataFrame(index=analysis.df.index)\n", - "df_meta = df_meta.loc[analysis.df.index.to_list()] # index is sample index\n", + " df_meta = pd.DataFrame(index=df.index)\n", + "df_meta = df_meta.loc[df.index.to_list()] # index is sample index\n", "if df_meta.index.name is None:\n", " df_meta.index.name = params.index_col[0]\n", "df_meta" @@ -304,7 +308,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "if params.meta_date_col:\n", @@ -316,20 +322,6 @@ "df_meta" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if df_meta.columns.isin(thermo_raw_files.cols_instrument).sum() == len(thermo_raw_files.cols_instrument):\n", - " display(df_meta.groupby(thermo_raw_files.cols_instrument)[\n", - " params.meta_date_col].agg(['count', 'min', 'max']))\n", - "else:\n", - " logger.info(\n", - " f\"Instrument column not found: {thermo_raw_files.cols_instrument}\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -364,7 +356,7 @@ " logger.info(msg)\n", " df_meta = df_meta.loc[mask_RT]\n", "else:\n", - " logger.warning(f\"Retention time filtering deactivated.\")" + " logger.warning(\"Retention time filtering deactivated.\")" ] }, { @@ -396,7 +388,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "try:\n", @@ -409,65 +403,13 @@ " display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)])" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Optional, if using ThermoRawFileParser: check some columns describing settings\n", - " - software can be updated: `Software Version`\n", - " - `mass resolution` setting for instrument\n", - " - colision type for MS2: `beam-type collision-induced dissocation`\n", - " - missing `dilution factor`\n", - " - omit (uncomment if needed):\n", - " - quite some variation due to `MS max charge`: omit\n", - " - variation by `injection volume setting` and instrument over time\n", - " - 500ng of peptides should be injected, based on concentration of peptides this setting is adjusted to get it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "meta_raw_settings = [\n", - " 'Thermo Scientific instrument model',\n", - " 'instrument serial number',\n", - " 'Software Version',\n", - " # 'MS max charge',\n", - " 'mass resolution',\n", - " 'beam-type collision-induced dissociation',\n", - " # 'injection volume setting',\n", - " 'dilution factor',\n", - "]\n", - "\n", - "if df_meta.columns.isin(meta_raw_settings).sum() == len(meta_raw_settings):\n", - " display(\n", - " # index gives first example with this combination\n", - " df_meta[meta_raw_settings].drop_duplicates()\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- check for variation in `software Version` and `injection volume setting`\n", - "\n", - "\n", - "Update selection of samples based on metadata (e.g. minimal retention time)\n", - "- sort data the same as sorted meta data" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "analysis = add_meta_data(analysis, df_meta=df_meta)" + "df_meta = add_meta_data(df, df_meta=df_meta)" ] }, { @@ -483,7 +425,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert analysis.df.index.is_unique, \"Duplicates in index\"" + "assert df.index.is_unique, \"Duplicates in index\"" ] }, { @@ -504,21 +446,23 @@ "outputs": [], "source": [ "if params.select_N is not None:\n", - " params.select_N = min(params.select_N, len(analysis.df_meta))\n", + " params.select_N = min(params.select_N, len(df_meta))\n", " if params.sample_N:\n", - " analysis.df_meta = analysis.df_meta.sample(params.select_N)\n", + " df_meta = df_meta.sample(params.select_N)\n", " else:\n", - " analysis.df_meta = analysis.df_meta.iloc[-params.select_N:]\n", + " df_meta = df_meta.iloc[-params.select_N:]\n", "\n", - " analysis.df = analysis.df.loc[analysis.df_meta.index].dropna(\n", + " df = df.loc[df_meta.index].dropna(\n", " how='all', axis=1)\n", - " ax = analysis.df.T.describe().loc['count'].hist()\n", + " ax = df.T.describe().loc['count'].hist()\n", " _ = ax.set_title('histogram of features for all eligable samples')" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "## First Step: Select features by prevalence\n", "- `feat_prevalence` across samples" @@ -530,9 +474,9 @@ "metadata": {}, "outputs": [], "source": [ - "freq_per_feature = analysis.df.notna().sum() # on wide format\n", + "freq_per_feature = df.notna().sum() # on wide format\n", "if isinstance(params.feat_prevalence, float):\n", - " N_samples = len(analysis.df_meta)\n", + " N_samples = len(df)\n", " logger.info(f\"Current number of samples: {N_samples}\")\n", " logger.info(\n", " f\"Feature has to be present in at least {params.feat_prevalence:.2%} of samples\")\n", @@ -546,13 +490,19 @@ "mask = freq_per_feature >= params.feat_prevalence\n", "logger.info(f\"Drop {(~mask).sum()} features\")\n", "freq_per_feature = freq_per_feature.loc[mask]\n", - "analysis.df = analysis.df.loc[:, mask]\n", - "analysis.N, analysis.M = analysis.df.shape\n", - "\n", + "df = df.loc[:, mask]\n", + "analysis.N, analysis.M = df.shape\n", "# # potentially create freq based on DataFrame\n", - "analysis.df\n", - "\n", - "notna = analysis.df.notna()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "notna = df.notna()\n", "data_stats_filtered = pd.concat(\n", " [\n", " notna.sum().describe().rename('feat_stats'),\n", @@ -587,11 +537,11 @@ " msg = f'Fraction of minimum sample completeness over all features specified with: {params.sample_completeness}\\n'\n", " # assumes df in wide format\n", " params.overwrite_entry('sample_completeness', int(\n", - " analysis.df.shape[1] * params.sample_completeness))\n", + " df.shape[1] * params.sample_completeness))\n", " msg += f'This translates to a minimum number of features per sample (to be included): {params.sample_completeness}'\n", " logger.info(msg)\n", "\n", - "sample_counts = analysis.df.notna().sum(axis=1) # if DataFrame\n", + "sample_counts = df.notna().sum(axis=1) # if DataFrame\n", "sample_counts.describe()" ] }, @@ -604,8 +554,8 @@ "mask = sample_counts > params.sample_completeness\n", "msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.'\n", "print(msg)\n", - "analysis.df = analysis.df.loc[mask]\n", - "analysis.df = analysis.df.dropna(\n", + "df = df.loc[mask]\n", + "df = df.dropna(\n", " axis=1, how='all') # drop now missing features" ] }, @@ -615,8 +565,8 @@ "metadata": {}, "outputs": [], "source": [ - "params.N, params.M = analysis.df.shape # save data dimensions\n", - "params.used_samples = analysis.df.index.to_list()" + "params.N, params.M = df.shape # save data dimensions\n", + "params.used_samples = df.index.to_list()" ] }, { @@ -632,7 +582,7 @@ "metadata": {}, "outputs": [], "source": [ - "ax = analysis.df.notna().sum(axis=1).hist()\n", + "ax = df.notna().sum(axis=1).hist()\n", "ax.set_xlabel('features per eligable sample')\n", "ax.set_ylabel('observations')\n", "fname = params.out_figures / 'hist_features_per_sample'\n", @@ -648,7 +598,7 @@ }, "outputs": [], "source": [ - "ax = analysis.df.notna().sum(axis=0).sort_values().plot()\n", + "ax = df.notna().sum(axis=0).sort_values().plot()\n", "_new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()]\n", "_ = ax.set_xticklabels(_new_labels, rotation=45,\n", " horizontalalignment='right')\n", @@ -672,9 +622,9 @@ "metadata": {}, "outputs": [], "source": [ - "min_max = vaep.plotting.data.min_max(analysis.df.stack())\n", + "min_max = vaep.plotting.data.min_max(df.stack())\n", "ax, bins = vaep.plotting.data.plot_histogram_intensities(\n", - " analysis.df.stack(), min_max=min_max)\n", + " df.stack(), min_max=min_max)\n", "\n", "fname = params.out_figures / 'intensity_distribution_overall'\n", "figures[fname.stem] = fname\n", @@ -688,7 +638,7 @@ "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", - " data=analysis.df, type='scatter')\n", + " data=df, type='scatter')\n", "fname = params.out_figures / 'intensity_median_vs_prop_missing_scatter'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" @@ -701,7 +651,7 @@ "outputs": [], "source": [ "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n", - " data=analysis.df, type='boxplot')\n", + " data=df, type='boxplot')\n", "fname = params.out_figures / 'intensity_median_vs_prop_missing_boxplot'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), fname)" @@ -714,13 +664,6 @@ "### Interactive and Single plots" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Scatter plots need to become interactive." - ] - }, { "cell_type": "code", "execution_count": null, @@ -737,9 +680,9 @@ "outputs": [], "source": [ "K = 2\n", - "analysis.df = analysis.df.astype(float)\n", + "df = df.astype(float)\n", "pcs = analysis.get_PCA(n_components=K) # should be renamed to get_PCs\n", - "pcs = pcs.iloc[:, :K].join(analysis.df_meta).join(sample_counts)\n", + "pcs = pcs.iloc[:, :K].join(df_meta).join(sample_counts)\n", "\n", "pcs_name = pcs.columns[:2]\n", "pcs_index_name = pcs.index.name\n", @@ -763,11 +706,11 @@ "outputs": [], "source": [ "if params.meta_cat_col:\n", - " fig, ax = plt.subplots(figsize=(2,2))\n", + " fig, ax = plt.subplots(figsize=(2, 2))\n", " analyzers.seaborn_scatter(\n", " pcs[pcs_name], ax, meta=pcs[params.meta_cat_col], title=f\"by {params.meta_cat_col}\")\n", " fname = (params.out_figures\n", - " / f'pca_sample_by_{\"_\".join(params.meta_cat_col.split())}')\n", + " / f'pca_sample_by_{\"_\".join(params.meta_cat_col.split())}')\n", " figures[fname.stem] = fname\n", " vaep.savefig(fig, fname)" ] @@ -791,7 +734,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "- software version: Does it make a difference?\n", "- size: number of features in a single sample" ] }, @@ -830,8 +772,8 @@ " # color=pcs['Software Version'],\n", " color=col_identified_feat,\n", " template='none',\n", - " width=1200, # 4 inches x 300 dpi\n", - " height=600 # 2 inches x 300 dpi\n", + " width=1200, # 4 inches x 300 dpi\n", + " height=600 # 2 inches x 300 dpi\n", ")\n", "fname = (params.out_figures\n", " / f'pca_sample_by_{\"_\".join(col_identified_feat.split())}_plotly.pdf')\n", @@ -853,7 +795,7 @@ "metadata": {}, "outputs": [], "source": [ - "analysis.df.head()" + "df.head()" ] }, { @@ -862,12 +804,12 @@ "metadata": {}, "outputs": [], "source": [ - "df = analysis.df\n", - "df = df.join(df_meta[params.meta_date_col])\n", - "df = df.set_index(params.meta_date_col).sort_index()\n", + "df_w_date = df.join(df_meta[params.meta_date_col])\n", + "df_w_date = df_w_date.set_index(params.meta_date_col).sort_index()\n", "if not params.meta_date_col == 'PlaceholderTime':\n", - " df.to_period('min')\n", - "df = df.T" + " df_w_date.to_period('min')\n", + "df_w_date = df_w_date.T\n", + "df_w_date" ] }, { @@ -876,13 +818,18 @@ "metadata": {}, "outputs": [], "source": [ - "ax = df.boxplot(rot=80, figsize=(8, 3), fontsize=5,\n", - " showfliers=False, showcaps=False)\n", + "ax = df_w_date.boxplot(rot=80,\n", + " figsize=(8, 3),\n", + " fontsize=6,\n", + " showfliers=False,\n", + " showcaps=False\n", + " )\n", "_ = vaep.plotting.select_xticks(ax)\n", "fig = ax.get_figure()\n", "fname = params.out_figures / 'median_boxplot'\n", "figures[fname.stem] = fname\n", - "vaep.savefig(fig, fname)" + "vaep.savefig(fig, fname)\n", + "del df_w_date" ] }, { @@ -898,7 +845,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.stack().describe(percentiles=np.linspace(0.05, 0.95, 10))" + "df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2))" ] }, { @@ -918,15 +865,14 @@ "source": [ "if not params.meta_date_col == 'PlaceholderTime':\n", " dates = df_meta[params.meta_date_col].sort_values()\n", - " # dates.name = 'date'\n", - " median_sample_intensity = (analysis.df\n", + " median_sample_intensity = (df\n", " .median(axis=1)\n", " .to_frame('median intensity'))\n", " median_sample_intensity = median_sample_intensity.join(dates)\n", "\n", " ax = median_sample_intensity.plot.scatter(x=dates.name, y='median intensity',\n", " rot=90,\n", - " fontsize='large',\n", + " # fontsize=6,\n", " figsize=(8, 2),\n", " s=5,\n", " xticks=vaep.plotting.select_dates(\n", @@ -948,40 +894,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Feature frequency in data\n", - "\n", - "- higher count, higher probability to be sampled into training data\n", - "- missing peptides are sampled both into training as well as into validation dataset\n", - "- everything not in training data is validation data\n", - "\n", - "Based on unmodified training data" + "## Feature frequency in data" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "msg = \"Total number of samples in training data split: {}\"\n", - "print(msg.format(len(analysis.df)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ - "# # analysis.splits.to_wide_format()\n", - "# assert analysis.splits is splits, \"Sanity check failed.\"" + "msg = \"Total number of samples in data: {}\"\n", + "print(msg.format(len(df)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Recalculate feature frequency after selecting some samples" + "Recalculate feature frequency after selecting samples" ] }, { @@ -990,14 +922,16 @@ "metadata": {}, "outputs": [], "source": [ - "freq_per_feature = feature_frequency(analysis.df)\n", + "freq_per_feature = feature_frequency(df)\n", "freq_per_feature" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "# freq_per_feature.name = 'Gene names freq' # name it differently?\n", @@ -1010,29 +944,29 @@ "freq_per_feature.to_pickle(fname)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Conserning sampling with frequency weights:\n", - " - larger weight -> higher probablility of being sampled\n", - " - weights need to be alignable to index of original DataFrame before grouping (same index)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Split: Train, validation and test data\n", "\n", - "- test data is in clinical language often denoted as independent validation cohort\n", - "- validation data (for model)" + "Select features as described in\n", + "> Lazar, Cosmin, Laurent Gatto, Myriam Ferro, Christophe Bruley, and Thomas Burger. 2016.\n", + "> “Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative\n", + "> Proteomics Data Sets to Compare Imputation Strategies.”\n", + "> Journal of Proteome Research 15 (4): 1116–25.\n", + "\n", + "- select `frac_mnar` based on threshold matrix on quantile of overall frac of data to be used\n", + " for validation and test data split, e.g. 0.1 = quantile(0.1)\n", + "- select frac_mnar from intensities selected using threshold matrix" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "analysis.splits = DataSplits(is_wide_format=False)\n", @@ -1045,23 +979,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Sample targets (Fake NAs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add goldstandard targets for valiation and test data\n", - "- based on same day\n", - "- same instrument" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create some target values by sampling 5% of the validation and test data." + "Create some target values by sampling X% of the validation and test data.\n", + "Simulated missing values are not used for validation and testing." ] }, { @@ -1080,15 +999,87 @@ "metadata": {}, "outputs": [], "source": [ - "fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(),\n", - " sample_index_to_drop=0,\n", - " weights=freq_per_feature,\n", - " frac=0.1,\n", - " random_state=params.random_state,)\n", - "assert len(splits.train_X) > len(fake_na)\n", - "splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state).sort_index()\n", - "splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)]\n", - "# splits" + "# if not mnar:\n", + "# fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(),\n", + "# sample_index_to_drop=0,\n", + "# # weights=freq_per_feature,\n", + "# frac=0.1,\n", + "# random_state=params.random_state,)\n", + "# assert len(splits.train_X) > len(fake_na)\n", + "! move parameter checks to start of script\n", + "if 0.0 <= params.frac_mnar <= 1.0:\n", + " fig, axes = plt.subplots(1, 2, figsize=(8, 2))\n", + " quantile_frac = analysis.df_long.quantile(params.frac_non_train)\n", + " rng = np.random.default_rng(params.random_state)\n", + " threshold = pd.Series(rng.normal(loc=float(quantile_frac),\n", + " scale=float(0.3 * analysis.df_long.std()),\n", + " size=len(analysis.df_long),\n", + " ),\n", + " index=analysis.df_long.index,\n", + " )\n", + " # plot data vs threshold data\n", + " ax = axes[0]\n", + " from functools import partial\n", + " plot_histogram_intensities = partial(vaep.plotting.data.plot_histogram_intensities,\n", + " min_max=min_max,\n", + " alpha=0.8)\n", + " plot_histogram_intensities(\n", + " analysis.df_long.squeeze(),\n", + " ax=ax,\n", + " label='observed')\n", + " plot_histogram_intensities(\n", + " threshold,\n", + " ax=ax,\n", + " label='thresholds')\n", + " ax.legend()\n", + " # select MNAR (intensity between randomly sampled threshold)\n", + " mask = analysis.df_long.squeeze() < threshold\n", + " ! subsample to have exact fraction of MNAR?\n", + " N = len(analysis.df_long)\n", + " logger.info(f\"{int(N * params.frac_non_train) = :,d}\")\n", + " N_MNAR = int(params.frac_non_train * params.frac_mnar * N)\n", + " fake_na_mnar = analysis.df_long.loc[mask]\n", + " if len(fake_na_mnar) > N_MNAR:\n", + " fake_na_mnar = fake_na_mnar.sample(N_MNAR,\n", + " random_state=params.random_state)\n", + " splits.train_X = analysis.df_long.loc[\n", + " analysis.df_long.index.difference(\n", + " fake_na_mnar.index)\n", + " ]\n", + " logger.info(f\"{len(fake_na_mnar) = :,d}\")\n", + " N_MCAR = int(N * (1 - params.frac_mnar) * params.frac_non_train)\n", + " fake_na_mcar = splits.train_X.sample(N_MCAR,\n", + " random_state=params.random_state)\n", + " logger.info(f\"{len(fake_na_mcar) = :,d}\")\n", + " splits.train_X = (splits\n", + " .train_X\n", + " .loc[splits\n", + " .train_X\n", + " .index\n", + " .difference(\n", + " fake_na_mcar.index)]\n", + " ).squeeze()\n", + " logger.info(f\"{len(splits.train_X) = :,d}\")\n", + " fake_na = pd.concat([fake_na_mcar, fake_na_mnar]).squeeze()\n", + " logger.info(f\"{len(fake_na) = :,d}\")\n", + " ax = axes[1]\n", + " plot_histogram_intensities(\n", + " fake_na_mnar.squeeze(),\n", + " ax=ax,\n", + " label=f'MNAR ({N_MNAR:,d})',\n", + " color='C2')\n", + " plot_histogram_intensities(\n", + " fake_na_mcar.squeeze(),\n", + " ax=ax,\n", + " color='C3',\n", + " label=f'MCAR ({N_MCAR:,d})')\n", + " ax.legend()\n", + " assert len(fake_na) + len(splits.train_X) == len(analysis.df_long)\n", + "else:\n", + " raise ValueError(f\"Invalid MNAR float value (should be betw. 0 and 1): {params.frac_mnar}\")\n", + "\n", + "splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state)\n", + "splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)]" ] }, { @@ -1131,14 +1122,14 @@ "# per feature are allowd.\n", "\n", "diff = (splits\n", - " .val_y\n", - " .index\n", - " .levels[-1]\n", - " .difference(splits\n", - " .train_X\n", - " .index\n", - " .levels[-1]\n", - " ).to_list())\n", + " .val_y\n", + " .index\n", + " .levels[-1]\n", + " .difference(splits\n", + " .train_X\n", + " .index\n", + " .levels[-1]\n", + " ).to_list())\n", "if diff:\n", " to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]\n", " display(to_remove)\n", @@ -1156,14 +1147,14 @@ "outputs": [], "source": [ "diff = (splits\n", - " .test_y\n", - " .index\n", - " .levels[-1]\n", - " .difference(splits\n", - " .train_X\n", - " .index\n", - " .levels[-1]\n", - " ).to_list())\n", + " .test_y\n", + " .index\n", + " .levels[-1]\n", + " .difference(splits\n", + " .train_X\n", + " .index\n", + " .levels[-1]\n", + " ).to_list())\n", "if diff:\n", " to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]\n", " display(to_remove)\n", @@ -1228,7 +1219,7 @@ "splits_df['val'] = splits.val_y\n", "splits_df['test'] = splits.test_y\n", "stats_splits = splits_df.describe()\n", - "stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f')\n", + "# stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f')\n", "stats_splits" ] }, @@ -1263,7 +1254,7 @@ " bins=bins,\n", " ax=None,\n", " color='C0',\n", - "))\n", + " ))\n", "_ = (splits\n", " .val_y\n", " .plot\n", @@ -1292,7 +1283,7 @@ " legend=False,\n", " stacked=True,\n", " color=['C0', 'C1', 'C2'],\n", - " )\n", + " )\n", "ax.legend(_legend)\n", "ax.set_xlabel('Intensity bins')\n", "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", @@ -1312,7 +1303,7 @@ " color=['C1', 'C2'],\n", " legend=False,\n", " stacked=True,\n", - " )\n", + " )\n", "ax.legend(_legend[1:])\n", "ax.set_xlabel('Intensity bins')\n", "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n", @@ -1363,6 +1354,40 @@ "vaep.savefig(ax.get_figure(), fname)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "medians = (splits\n", + " .train_X\n", + " .median()\n", + " .astype(int)\n", + " ).to_frame('median_floor')\n", + "\n", + "feat_with_median = medians.groupby('median_floor').size().rename('M feat')\n", + "medians = medians.join(feat_with_median, on='median_floor')\n", + "medians = medians.apply(lambda s: \"{:02,d} (N={:3,d})\".format(*s), axis=1)\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 2))\n", + "s = 1\n", + "s_axes = pd.DataFrame({'medians': medians,\n", + " 'validation split': splits.val_y.notna().sum(),\n", + " 'training split': splits.train_X.notna().sum()}\n", + " ).plot.box(by='medians',\n", + " subplots=True,\n", + " boxprops=dict(linewidth=s),\n", + " flierprops=dict(markersize=s),\n", + " ax=ax)\n", + "for ax in s_axes:\n", + " _ = ax.set_xticklabels(ax.get_xticklabels(),\n", + " rotation=45,\n", + " horizontalalignment='right')" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py index 48b1fa31d..9507aeb39 100644 --- a/project/01_0_split_data.py +++ b/project/01_0_split_data.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.0 # kernelspec: # display_name: Python 3 # language: python @@ -20,10 +20,10 @@ # %% from pathlib import Path - +import logging from typing import Union, List - +from IPython.display import display import numpy as np import pandas as pd import matplotlib.pyplot as plt @@ -32,31 +32,31 @@ import vaep from vaep.io.datasplits import DataSplits -from vaep.io import thermo_raw_files from vaep.sampling import feature_frequency, sample_data from vaep.analyzers import analyzers -from vaep.analyzers.analyzers import AnalyzePeptides +from vaep.analyzers.analyzers import AnalyzePeptides logger = vaep.logging.setup_nb_logger() logger.info("Split data and make diagnostic plots") +logging.getLogger('fontTools').setLevel(logging.WARNING) + -def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): +def add_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame): try: - analysis.df = analysis.df.loc[df_meta.index] + df = df.loc[df_meta.index] except KeyError as e: logger.warning(e) logger.warning("Ignore missing samples in quantified samples") - analysis.df = analysis.df.loc[analysis.df.index.intersection( + df = df.loc[df.index.intersection( df_meta.index)] - - analysis.df_meta = df_meta - return analysis + return df_meta pd.options.display.max_columns = 32 plt.rcParams['figure.figsize'] = [4, 2] -vaep.plotting.make_large_descriptors(5) + +vaep.plotting.make_large_descriptors(6) figures = {} # collection of ax or figures dumps = {} # collection of data dumps @@ -70,33 +70,29 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): args = dict(globals()).keys() # %% tags=["parameters"] -# Sample (rows) intensiites for features (columns) -FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' -# Can be either a string or position (typical 0 for first column), or a list of these. -index_col: Union[str, int] = 0 -# wide_format: bool = False # intensities in wide format (more memory efficient of csv). Default is long_format (more precise) -# Manuelly set column names (of Index object in columns) -column_names: List[str] = ["Gene Names"] -# Machine parsed metadata from raw file (see workflows/metadata), wide format per sample -fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' -# Minimum number or fraction of feature prevalence across samples to be kept -feat_prevalence: Union[int, float] = 0.25 -# Minimum number or fraction of total requested features per Sample -sample_completeness: Union[int, float] = 0.5 +FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv' # Sample (rows), features (columns) +index_col: Union[str, int] = 0 # Can be either a string or position (default 0 for first column), or a list of these. +column_names: List[str] = ["Gene Names"] # Manuelly set column names (of Index object in columns) +fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # metadata for samples (rows) +feat_prevalence: Union[int, float] = 0.25 # Minimum number or fraction of feature prevalence across samples to be kept +sample_completeness: Union[int, float] = 0.5 # Minimum number or fraction of total requested features per Sample select_N: int = None # only use latest N samples -sample_N: bool = False # if select_N, sample N randomly instead of using latest? +sample_N: bool = False # if select_N, sample N randomly instead of using latest N random_state: int = 42 # random state for reproducibility of splits -# based on raw file meta data, only take samples with RT > min_RT_time -min_RT_time: Union[int, float] = None -# Log transformation of initial data (select one of the existing in numpy) -logarithm: str = 'log2' -folder_experiment: str = f'runs/example' -folder_data: str = '' # specify data directory if needed +min_RT_time: Union[int, float] = None # based on raw file meta data, only take samples with RT > min_RT_time +logarithm: str = 'log2' # Log transformation of initial data (select one of the existing in numpy) +folder_experiment: str = 'runs/example' # folder to save figures and data dumps +folder_data: str = '' # specify special data directory if needed file_format: str = 'csv' # file format of create splits, default pickle (pkl) # metadata -> defaults for metadata extracted from machine data, used for plotting meta_date_col: str = None # date column in meta data meta_cat_col: str = None # category column in meta data +# train, validation and test data splits +frac_non_train: float = 0.1 # fraction of non training data (validation and test split) +frac_mnar: float = 0.0 # fraction of missing not at random data, rest: missing completely at random +meta_date_col: str = 'Content Creation Date' +meta_cat_col: str = 'Software Version' # %% args = vaep.nb.get_params(args, globals=globals()) @@ -132,6 +128,7 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): f"File format (extension): {FILE_EXT} (!specifies data loading function!)") # %% +# ! factor out file reading to a separate module, not class # AnalyzePeptides.from_csv constructor = getattr(AnalyzePeptides, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT]) analysis = constructor(fname=params.FN_INTENSITIES, @@ -147,11 +144,17 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): log_fct = getattr(np, params.logarithm) analysis.log_transform(log_fct) logger.info(f"{analysis = }") -analysis.df +df = analysis.df +df # %% -ax = analysis.df.notna().sum(axis=0).to_frame( - analysis.df.columns.name).plot.box() +ax = (df + .notna() + .sum(axis=0) + .to_frame(df.columns.name) + .plot + .box() + ) ax.set_ylabel('number of observation across samples') @@ -160,7 +163,7 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame): dumps[fname.name] = fname.as_posix() writer = pd.ExcelWriter(fname) -notna = analysis.df.notna() +notna = df.notna() data_stats_original = pd.concat( [ notna.sum().describe().rename('feat_stats'), @@ -181,15 +184,16 @@ def join_as_str(seq): ret = "_".join(str(x) for x in seq) return ret + # ToDo: join multiindex samples indices (pkl dumps) -# if hasattr(analysis.df.columns, "levels"): -if isinstance(analysis.df.columns, pd.MultiIndex): +# if hasattr(df.columns, "levels"): +if isinstance(df.columns, pd.MultiIndex): logger.warning("combine MultiIndex columns to one feature column") - print(analysis.df.columns[:10].map(join_as_str)) - _new_name = join_as_str(analysis.df.columns.names) - analysis.df.columns = analysis.df.columns.map(join_as_str) - analysis.df.columns.name = _new_name - logger.warning(f"New name: {analysis.df.columns.names = }") + print(df.columns[:10].map(join_as_str)) + _new_name = join_as_str(df.columns.names) + df.columns = df.columns.map(join_as_str) + df.columns.name = _new_name + logger.warning(f"New name: {df.columns.names = }") # %% [markdown] # ## Machine metadata @@ -207,8 +211,8 @@ def join_as_str(seq): if params.meta_cat_col: raise ValueError( f"No metadata provided, but data column set: {params.meta_cat_col}") - df_meta = pd.DataFrame(index=analysis.df.index) -df_meta = df_meta.loc[analysis.df.index.to_list()] # index is sample index + df_meta = pd.DataFrame(index=df.index) +df_meta = df_meta.loc[df.index.to_list()] # index is sample index if df_meta.index.name is None: df_meta.index.name = params.index_col[0] df_meta @@ -222,13 +226,6 @@ def join_as_str(seq): df_meta[params.meta_date_col] = range(len(df_meta)) df_meta -# %% -if df_meta.columns.isin(thermo_raw_files.cols_instrument).sum() == len(thermo_raw_files.cols_instrument): - display(df_meta.groupby(thermo_raw_files.cols_instrument)[ - params.meta_date_col].agg(['count', 'min', 'max'])) -else: - logger.info( - f"Instrument column not found: {thermo_raw_files.cols_instrument}") # %% df_meta.describe(datetime_is_numeric=True, @@ -249,7 +246,7 @@ def join_as_str(seq): logger.info(msg) df_meta = df_meta.loc[mask_RT] else: - logger.warning(f"Retention time filtering deactivated.") + logger.warning("Retention time filtering deactivated.") # %% df_meta = df_meta.sort_values(params.meta_date_col) @@ -271,51 +268,15 @@ def join_as_str(seq): if 'unique' in meta_stats.index: display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)]) -# %% [markdown] -# Optional, if using ThermoRawFileParser: check some columns describing settings -# - software can be updated: `Software Version` -# - `mass resolution` setting for instrument -# - colision type for MS2: `beam-type collision-induced dissocation` -# - missing `dilution factor` -# - omit (uncomment if needed): -# - quite some variation due to `MS max charge`: omit -# - variation by `injection volume setting` and instrument over time -# - 500ng of peptides should be injected, based on concentration of peptides this setting is adjusted to get it - -# %% -meta_raw_settings = [ - 'Thermo Scientific instrument model', - 'instrument serial number', - 'Software Version', - # 'MS max charge', - 'mass resolution', - 'beam-type collision-induced dissociation', - # 'injection volume setting', - 'dilution factor', -] - -if df_meta.columns.isin(meta_raw_settings).sum() == len(meta_raw_settings): - display( - # index gives first example with this combination - df_meta[meta_raw_settings].drop_duplicates() - ) - - -# %% [markdown] -# - check for variation in `software Version` and `injection volume setting` -# -# -# Update selection of samples based on metadata (e.g. minimal retention time) -# - sort data the same as sorted meta data # %% -analysis = add_meta_data(analysis, df_meta=df_meta) +df_meta = add_meta_data(df, df_meta=df_meta) # %% [markdown] # Ensure unique indices # %% -assert analysis.df.index.is_unique, "Duplicates in index" +assert df.index.is_unique, "Duplicates in index" # %% [markdown] # ## Select a subset of samples if specified (reduce the number of samples) @@ -326,25 +287,26 @@ def join_as_str(seq): # %% if params.select_N is not None: - params.select_N = min(params.select_N, len(analysis.df_meta)) + params.select_N = min(params.select_N, len(df_meta)) if params.sample_N: - analysis.df_meta = analysis.df_meta.sample(params.select_N) + df_meta = df_meta.sample(params.select_N) else: - analysis.df_meta = analysis.df_meta.iloc[-params.select_N:] + df_meta = df_meta.iloc[-params.select_N:] - analysis.df = analysis.df.loc[analysis.df_meta.index].dropna( + df = df.loc[df_meta.index].dropna( how='all', axis=1) - ax = analysis.df.T.describe().loc['count'].hist() + ax = df.T.describe().loc['count'].hist() _ = ax.set_title('histogram of features for all eligable samples') # %% [markdown] # ## First Step: Select features by prevalence # - `feat_prevalence` across samples + # %% -freq_per_feature = analysis.df.notna().sum() # on wide format +freq_per_feature = df.notna().sum() # on wide format if isinstance(params.feat_prevalence, float): - N_samples = len(analysis.df_meta) + N_samples = len(df) logger.info(f"Current number of samples: {N_samples}") logger.info( f"Feature has to be present in at least {params.feat_prevalence:.2%} of samples") @@ -358,13 +320,13 @@ def join_as_str(seq): mask = freq_per_feature >= params.feat_prevalence logger.info(f"Drop {(~mask).sum()} features") freq_per_feature = freq_per_feature.loc[mask] -analysis.df = analysis.df.loc[:, mask] -analysis.N, analysis.M = analysis.df.shape - +df = df.loc[:, mask] +analysis.N, analysis.M = df.shape # # potentially create freq based on DataFrame -analysis.df +df -notna = analysis.df.notna() +# %% +notna = df.notna() data_stats_filtered = pd.concat( [ notna.sum().describe().rename('feat_stats'), @@ -385,30 +347,30 @@ def join_as_str(seq): msg = f'Fraction of minimum sample completeness over all features specified with: {params.sample_completeness}\n' # assumes df in wide format params.overwrite_entry('sample_completeness', int( - analysis.df.shape[1] * params.sample_completeness)) + df.shape[1] * params.sample_completeness)) msg += f'This translates to a minimum number of features per sample (to be included): {params.sample_completeness}' logger.info(msg) -sample_counts = analysis.df.notna().sum(axis=1) # if DataFrame +sample_counts = df.notna().sum(axis=1) # if DataFrame sample_counts.describe() # %% mask = sample_counts > params.sample_completeness msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.' print(msg) -analysis.df = analysis.df.loc[mask] -analysis.df = analysis.df.dropna( +df = df.loc[mask] +df = df.dropna( axis=1, how='all') # drop now missing features # %% -params.N, params.M = analysis.df.shape # save data dimensions -params.used_samples = analysis.df.index.to_list() +params.N, params.M = df.shape # save data dimensions +params.used_samples = df.index.to_list() # %% [markdown] # ### Histogram of features per sample # %% -ax = analysis.df.notna().sum(axis=1).hist() +ax = df.notna().sum(axis=1).hist() ax.set_xlabel('features per eligable sample') ax.set_ylabel('observations') fname = params.out_figures / 'hist_features_per_sample' @@ -416,7 +378,7 @@ def join_as_str(seq): vaep.savefig(ax.get_figure(), fname) # %% -ax = analysis.df.notna().sum(axis=0).sort_values().plot() +ax = df.notna().sum(axis=0).sort_values().plot() _new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()] _ = ax.set_xticklabels(_new_labels, rotation=45, horizontalalignment='right') @@ -431,9 +393,9 @@ def join_as_str(seq): # ### Number off observations accross feature value # %% -min_max = vaep.plotting.data.min_max(analysis.df.stack()) +min_max = vaep.plotting.data.min_max(df.stack()) ax, bins = vaep.plotting.data.plot_histogram_intensities( - analysis.df.stack(), min_max=min_max) + df.stack(), min_max=min_max) fname = params.out_figures / 'intensity_distribution_overall' figures[fname.stem] = fname @@ -441,14 +403,14 @@ def join_as_str(seq): # %% ax = vaep.plotting.data.plot_feat_median_over_prop_missing( - data=analysis.df, type='scatter') + data=df, type='scatter') fname = params.out_figures / 'intensity_median_vs_prop_missing_scatter' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) # %% ax = vaep.plotting.data.plot_feat_median_over_prop_missing( - data=analysis.df, type='boxplot') + data=df, type='boxplot') fname = params.out_figures / 'intensity_median_vs_prop_missing_boxplot' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) @@ -456,17 +418,14 @@ def join_as_str(seq): # %% [markdown] # ### Interactive and Single plots -# %% [markdown] -# Scatter plots need to become interactive. - # %% sample_counts.name = 'identified features' # %% K = 2 -analysis.df = analysis.df.astype(float) +df = df.astype(float) pcs = analysis.get_PCA(n_components=K) # should be renamed to get_PCs -pcs = pcs.iloc[:, :K].join(analysis.df_meta).join(sample_counts) +pcs = pcs.iloc[:, :K].join(df_meta).join(sample_counts) pcs_name = pcs.columns[:2] pcs_index_name = pcs.index.name @@ -478,11 +437,11 @@ def join_as_str(seq): # %% if params.meta_cat_col: - fig, ax = plt.subplots(figsize=(2,2)) + fig, ax = plt.subplots(figsize=(2, 2)) analyzers.seaborn_scatter( pcs[pcs_name], ax, meta=pcs[params.meta_cat_col], title=f"by {params.meta_cat_col}") fname = (params.out_figures - / f'pca_sample_by_{"_".join(params.meta_cat_col.split())}') + / f'pca_sample_by_{"_".join(params.meta_cat_col.split())}') figures[fname.stem] = fname vaep.savefig(fig, fname) @@ -496,7 +455,6 @@ def join_as_str(seq): vaep.savefig(fig, fname) # %% [markdown] -# - software version: Does it make a difference? # - size: number of features in a single sample # %% @@ -523,8 +481,8 @@ def join_as_str(seq): # color=pcs['Software Version'], color=col_identified_feat, template='none', - width=1200, # 4 inches x 300 dpi - height=600 # 2 inches x 300 dpi + width=1200, # 4 inches x 300 dpi + height=600 # 2 inches x 300 dpi ) fname = (params.out_figures / f'pca_sample_by_{"_".join(col_identified_feat.split())}_plotly.pdf') @@ -536,30 +494,35 @@ def join_as_str(seq): # ## Sample Medians and percentiles # %% -analysis.df.head() +df.head() # %% -df = analysis.df -df = df.join(df_meta[params.meta_date_col]) -df = df.set_index(params.meta_date_col).sort_index() +df_w_date = df.join(df_meta[params.meta_date_col]) +df_w_date = df_w_date.set_index(params.meta_date_col).sort_index() if not params.meta_date_col == 'PlaceholderTime': - df.to_period('min') -df = df.T + df_w_date.to_period('min') +df_w_date = df_w_date.T +df_w_date # %% -ax = df.boxplot(rot=80, figsize=(8, 3), fontsize=5, - showfliers=False, showcaps=False) +ax = df_w_date.boxplot(rot=80, + figsize=(8, 3), + fontsize=6, + showfliers=False, + showcaps=False + ) _ = vaep.plotting.select_xticks(ax) fig = ax.get_figure() fname = params.out_figures / 'median_boxplot' figures[fname.stem] = fname vaep.savefig(fig, fname) +del df_w_date # %% [markdown] # Percentiles of intensities in dataset # %% -df.stack().describe(percentiles=np.linspace(0.05, 0.95, 10)) +df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2)) # %% [markdown] # ### Plot sample median over time @@ -569,15 +532,14 @@ def join_as_str(seq): # %% if not params.meta_date_col == 'PlaceholderTime': dates = df_meta[params.meta_date_col].sort_values() - # dates.name = 'date' - median_sample_intensity = (analysis.df + median_sample_intensity = (df .median(axis=1) .to_frame('median intensity')) median_sample_intensity = median_sample_intensity.join(dates) ax = median_sample_intensity.plot.scatter(x=dates.name, y='median intensity', rot=90, - fontsize='large', + # fontsize=6, figsize=(8, 2), s=5, xticks=vaep.plotting.select_dates( @@ -592,26 +554,17 @@ def join_as_str(seq): # %% [markdown] # ## Feature frequency in data -# -# - higher count, higher probability to be sampled into training data -# - missing peptides are sampled both into training as well as into validation dataset -# - everything not in training data is validation data -# -# Based on unmodified training data # %% -msg = "Total number of samples in training data split: {}" -print(msg.format(len(analysis.df))) +msg = "Total number of samples in data: {}" +print(msg.format(len(df))) -# %% -# # analysis.splits.to_wide_format() -# assert analysis.splits is splits, "Sanity check failed." # %% [markdown] -# Recalculate feature frequency after selecting some samples +# Recalculate feature frequency after selecting samples # %% -freq_per_feature = feature_frequency(analysis.df) +freq_per_feature = feature_frequency(df) freq_per_feature # %% @@ -624,16 +577,19 @@ def join_as_str(seq): dumps[fname.name] = fname freq_per_feature.to_pickle(fname) -# %% [markdown] -# Conserning sampling with frequency weights: -# - larger weight -> higher probablility of being sampled -# - weights need to be alignable to index of original DataFrame before grouping (same index) # %% [markdown] # ## Split: Train, validation and test data # -# - test data is in clinical language often denoted as independent validation cohort -# - validation data (for model) +# Select features as described in +# > Lazar, Cosmin, Laurent Gatto, Myriam Ferro, Christophe Bruley, and Thomas Burger. 2016. +# > “Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative +# > Proteomics Data Sets to Compare Imputation Strategies.” +# > Journal of Proteome Research 15 (4): 1116–25. +# +# - select `frac_mnar` based on threshold matrix on quantile of overall frac of data to be used +# for validation and test data split, e.g. 0.1 = quantile(0.1) +# - select frac_mnar from intensities selected using threshold matrix # %% analysis.splits = DataSplits(is_wide_format=False) @@ -641,31 +597,97 @@ def join_as_str(seq): print(f"{analysis.splits = }") analysis.splits.__annotations__ -# %% [markdown] -# ### Sample targets (Fake NAs) # %% [markdown] -# Add goldstandard targets for valiation and test data -# - based on same day -# - same instrument - -# %% [markdown] -# Create some target values by sampling 5% of the validation and test data. +# Create some target values by sampling X% of the validation and test data. +# Simulated missing values are not used for validation and testing. # %% analysis.to_long_format(inplace=True) analysis.df_long # %% -fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(), - sample_index_to_drop=0, - weights=freq_per_feature, - frac=0.1, - random_state=params.random_state,) -assert len(splits.train_X) > len(fake_na) -splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state).sort_index() +# if not mnar: +# fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(), +# sample_index_to_drop=0, +# # weights=freq_per_feature, +# frac=0.1, +# random_state=params.random_state,) +# assert len(splits.train_X) > len(fake_na) +# ! move parameter checks to start of script +if 0.0 <= params.frac_mnar <= 1.0: + fig, axes = plt.subplots(1, 2, figsize=(8, 2)) + quantile_frac = analysis.df_long.quantile(params.frac_non_train) + rng = np.random.default_rng(params.random_state) + threshold = pd.Series(rng.normal(loc=float(quantile_frac), + scale=float(0.3 * analysis.df_long.std()), + size=len(analysis.df_long), + ), + index=analysis.df_long.index, + ) + # plot data vs threshold data + ax = axes[0] + from functools import partial + plot_histogram_intensities = partial(vaep.plotting.data.plot_histogram_intensities, + min_max=min_max, + alpha=0.8) + plot_histogram_intensities( + analysis.df_long.squeeze(), + ax=ax, + label='observed') + plot_histogram_intensities( + threshold, + ax=ax, + label='thresholds') + ax.legend() + # select MNAR (intensity between randomly sampled threshold) + mask = analysis.df_long.squeeze() < threshold + # ! subsample to have exact fraction of MNAR? + N = len(analysis.df_long) + logger.info(f"{int(N * params.frac_non_train) = :,d}") + N_MNAR = int(params.frac_non_train * params.frac_mnar * N) + fake_na_mnar = analysis.df_long.loc[mask] + if len(fake_na_mnar) > N_MNAR: + fake_na_mnar = fake_na_mnar.sample(N_MNAR, + random_state=params.random_state) + splits.train_X = analysis.df_long.loc[ + analysis.df_long.index.difference( + fake_na_mnar.index) + ] + logger.info(f"{len(fake_na_mnar) = :,d}") + N_MCAR = int(N * (1 - params.frac_mnar) * params.frac_non_train) + fake_na_mcar = splits.train_X.sample(N_MCAR, + random_state=params.random_state) + logger.info(f"{len(fake_na_mcar) = :,d}") + splits.train_X = (splits + .train_X + .loc[splits + .train_X + .index + .difference( + fake_na_mcar.index)] + ).squeeze() + logger.info(f"{len(splits.train_X) = :,d}") + fake_na = pd.concat([fake_na_mcar, fake_na_mnar]).squeeze() + logger.info(f"{len(fake_na) = :,d}") + ax = axes[1] + plot_histogram_intensities( + fake_na_mnar.squeeze(), + ax=ax, + label=f'MNAR ({N_MNAR:,d})', + color='C2') + plot_histogram_intensities( + fake_na_mcar.squeeze(), + ax=ax, + color='C3', + label=f'MCAR ({N_MCAR:,d})') + ax.legend() + assert len(fake_na) + len(splits.train_X) == len(analysis.df_long) +else: + raise ValueError(f"Invalid MNAR float value (should be betw. 0 and 1): {params.frac_mnar}") + +splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state) splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)] -# splits # %% splits.test_y @@ -684,14 +706,14 @@ def join_as_str(seq): # per feature are allowd. diff = (splits - .val_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) + .val_y + .index + .levels[-1] + .difference(splits + .train_X + .index + .levels[-1] + ).to_list()) if diff: to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]] display(to_remove) @@ -701,14 +723,14 @@ def join_as_str(seq): # %% diff = (splits - .test_y - .index - .levels[-1] - .difference(splits - .train_X - .index - .levels[-1] - ).to_list()) + .test_y + .index + .levels[-1] + .difference(splits + .train_X + .index + .levels[-1] + ).to_list()) if diff: to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]] display(to_remove) @@ -744,7 +766,7 @@ def join_as_str(seq): splits_df['val'] = splits.val_y splits_df['test'] = splits.test_y stats_splits = splits_df.describe() -stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f') +# stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f') stats_splits # %% @@ -767,7 +789,7 @@ def join_as_str(seq): bins=bins, ax=None, color='C0', -)) + )) _ = (splits .val_y .plot @@ -790,7 +812,7 @@ def join_as_str(seq): legend=False, stacked=True, color=['C0', 'C1', 'C2'], - ) + ) ax.legend(_legend) ax.set_xlabel('Intensity bins') ax.yaxis.set_major_formatter("{x:,.0f}") @@ -804,7 +826,7 @@ def join_as_str(seq): color=['C1', 'C2'], legend=False, stacked=True, - ) + ) ax.legend(_legend[1:]) ax.set_xlabel('Intensity bins') ax.yaxis.set_major_formatter("{x:,.0f}") @@ -832,6 +854,31 @@ def join_as_str(seq): figures[fname.stem] = fname vaep.savefig(ax.get_figure(), fname) +# %% +medians = (splits + .train_X + .median() + .astype(int) + ).to_frame('median_floor') + +feat_with_median = medians.groupby('median_floor').size().rename('M feat') +medians = medians.join(feat_with_median, on='median_floor') +medians = medians.apply(lambda s: "{:02,d} (N={:3,d})".format(*s), axis=1) + +fig, ax = plt.subplots(figsize=(8, 2)) +s = 1 +s_axes = pd.DataFrame({'medians': medians, + 'validation split': splits.val_y.notna().sum(), + 'training split': splits.train_X.notna().sum()} + ).plot.box(by='medians', + subplots=True, + boxprops=dict(linewidth=s), + flierprops=dict(markersize=s), + ax=ax) +for ax in s_axes: + _ = ax.set_xticklabels(ax.get_xticklabels(), + rotation=45, + horizontalalignment='right') # %% [markdown] # ## Save parameters diff --git a/vaep/plotting/data.py b/vaep/plotting/data.py index 1fbd31d7a..f50c63606 100644 --- a/vaep/plotting/data.py +++ b/vaep/plotting/data.py @@ -1,4 +1,4 @@ -"""Plot data distribution based on pandas DataFrames or Series.""" +"""Plot data distribution based on pandas `DataFrames` or `Series`.""" from typing import Tuple, Iterable import matplotlib @@ -9,7 +9,19 @@ def min_max(s: pd.Series) -> Tuple[int]: - min_bin, max_bin = (int(s.min()), (int(s.max())+1)) + """Get the min and max as integer from a pandas.Series. + + Parameters + ---------- + s : pd.Series + Series of intensities. + + Returns + ------- + Tuple[int] + _description_ + """ + min_bin, max_bin = (int(s.min()), (int(s.max()) + 1)) return min_bin, max_bin @@ -27,10 +39,10 @@ def get_min_max_iterable(series: Iterable[pd.Series]) -> Tuple[int]: def plot_histogram_intensities(s: pd.Series, - interval_bins=1, - min_max=(15, 40), - ax=None, - **kwargs) -> Tuple[Axes, range]: + interval_bins=1, + min_max=(15, 40), + ax=None, + **kwargs) -> Tuple[Axes, range]: """Plot intensities in Series in a certain range and equally spaced intervals.""" min_bin, max_bin = min_max bins = range(min_bin, max_bin, interval_bins) @@ -90,8 +102,24 @@ def plot_observations(df: pd.DataFrame, def plot_missing_dist_highdim(data: pd.DataFrame, - min_feat_per_sample=None, - min_samples_per_feat=None) -> matplotlib.figure.Figure: + min_feat_per_sample: int = None, + min_samples_per_feat: int = None) -> matplotlib.figure.Figure: + """Plot missing distribution (cdf) in high dimensional data. + + Parameters + ---------- + data : pd.DataFrame + Intensity table with samples in rows and features in columns. + min_feat_per_sample : int, optional + Show the minimum required features a sample has to have, by default None + min_samples_per_feat : int, optional + Show the minimum required number of samples a feature has to be found in, by default None + + Returns + ------- + matplotlib.figure.Figure + Figure with two plots (Axes). + """ fig, axes = plt.subplots(1, 2, figsize=(4, 2)) not_na = data.notna() name = 'features per sample' @@ -251,8 +279,8 @@ def plot_feat_median_over_prop_missing(data: pd.DataFrame, missing_by_median['bins'] = pd.cut( missing_by_median['median feat value'], bins=bins) missing_by_median['median feat value (floor)'] = (missing_by_median['median feat value'] - .astype(int) - ) + .astype(int) + ) _counts = (missing_by_median .groupby('median feat value (floor)')['median feat value'] .count()