diff --git a/project/01_0_split_data.ipynb b/project/01_0_split_data.ipynb
index f6df4bd16..5d0214392 100644
--- a/project/01_0_split_data.ipynb
+++ b/project/01_0_split_data.ipynb
@@ -16,10 +16,10 @@
    "outputs": [],
    "source": [
     "from pathlib import Path\n",
-    "\n",
+    "import logging\n",
     "from typing import Union, List\n",
     "\n",
-    "\n",
+    "from IPython.display import display\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
@@ -28,31 +28,31 @@
     "\n",
     "import vaep\n",
     "from vaep.io.datasplits import DataSplits\n",
-    "from vaep.io import thermo_raw_files\n",
     "from vaep.sampling import feature_frequency, sample_data\n",
     "\n",
     "from vaep.analyzers import analyzers\n",
-    "from vaep.analyzers.analyzers import  AnalyzePeptides\n",
+    "from vaep.analyzers.analyzers import AnalyzePeptides\n",
     "\n",
     "logger = vaep.logging.setup_nb_logger()\n",
     "logger.info(\"Split data and make diagnostic plots\")\n",
+    "logging.getLogger('fontTools').setLevel(logging.WARNING)\n",
+    "\n",
     "\n",
-    "def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame):\n",
+    "def add_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):\n",
     "    try:\n",
-    "        analysis.df = analysis.df.loc[df_meta.index]\n",
+    "        df = df.loc[df_meta.index]\n",
     "    except KeyError as e:\n",
     "        logger.warning(e)\n",
     "        logger.warning(\"Ignore missing samples in quantified samples\")\n",
-    "        analysis.df = analysis.df.loc[analysis.df.index.intersection(\n",
+    "        df = df.loc[df.index.intersection(\n",
     "            df_meta.index)]\n",
-    "\n",
-    "    analysis.df_meta = df_meta\n",
-    "    return analysis\n",
+    "    return df_meta\n",
     "\n",
     "\n",
     "pd.options.display.max_columns = 32\n",
     "plt.rcParams['figure.figsize'] = [4, 2]\n",
-    "vaep.plotting.make_large_descriptors(5)\n",
+    "\n",
+    "vaep.plotting.make_large_descriptors(6)\n",
     "\n",
     "figures = {}  # collection of ax or figures\n",
     "dumps = {}  # collection of data dumps"
@@ -80,39 +80,35 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "lines_to_next_cell": 2,
     "tags": [
      "parameters"
     ]
    },
    "outputs": [],
    "source": [
-    "# Sample (rows) intensiites for features (columns)\n",
-    "FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'\n",
-    "# Can be either a string or position (typical 0 for first column), or a list of these.\n",
-    "index_col: Union[str, int] = 0\n",
-    "# wide_format: bool = False # intensities in wide format (more memory efficient of csv). Default is long_format (more precise)\n",
-    "# Manuelly set column names (of Index object in columns)\n",
-    "column_names: List[str] = [\"Gene Names\"]\n",
-    "# Machine parsed metadata from raw file (see workflows/metadata), wide format per sample\n",
-    "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'\n",
-    "# Minimum number or fraction of feature prevalence across samples to be kept\n",
-    "feat_prevalence: Union[int, float] = 0.25\n",
-    "# Minimum number or fraction of total requested features per Sample\n",
-    "sample_completeness: Union[int, float] = 0.5\n",
+    "FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'  # Sample (rows), features (columns)\n",
+    "index_col: Union[str, int] = 0  # Can be either a string or position (default 0 for first column), or a list of these.\n",
+    "column_names: List[str] = [\"Gene Names\"]  # Manuelly set column names (of Index object in columns)\n",
+    "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'  # metadata for samples (rows)\n",
+    "feat_prevalence: Union[int, float] = 0.25  # Minimum number or fraction of feature prevalence across samples to be kept\n",
+    "sample_completeness: Union[int, float] = 0.5  # Minimum number or fraction of total requested features per Sample\n",
     "select_N: int = None  # only use latest N samples\n",
-    "sample_N: bool = False # if select_N, sample N randomly instead of using latest?\n",
+    "sample_N: bool = False  # if select_N, sample N randomly instead of using latest N\n",
     "random_state: int = 42  # random state for reproducibility of splits\n",
-    "# based on raw file meta data, only take samples with RT > min_RT_time\n",
-    "min_RT_time: Union[int, float] = None\n",
-    "# Log transformation of initial data (select one of the existing in numpy)\n",
-    "logarithm: str = 'log2'\n",
-    "folder_experiment: str = f'runs/example'\n",
-    "folder_data: str = ''  # specify data directory if needed\n",
+    "min_RT_time: Union[int, float] = None  # based on raw file meta data, only take samples with RT > min_RT_time\n",
+    "logarithm: str = 'log2'  # Log transformation of initial data (select one of the existing in numpy)\n",
+    "folder_experiment: str = 'runs/example'  # folder to save figures and data dumps\n",
+    "folder_data: str = ''  # specify special data directory if needed\n",
     "file_format: str = 'csv'  # file format of create splits, default pickle (pkl)\n",
     "# metadata -> defaults for metadata extracted from machine data, used for plotting\n",
     "meta_date_col: str = None  # date column in meta data\n",
-    "meta_cat_col: str = None  # category column in meta data"
+    "meta_cat_col: str = None  # category column in meta data\n",
+    "# train, validation and test data splits\n",
+    "frac_non_train: float = 0.1  # fraction of non training data (validation and test split)\n",
+    "frac_mnar: float = 0.0  # fraction of missing not at random data, rest: missing completely at random\n",
+    "\n",
+    "meta_date_col: str = 'Content Creation Date'\n",
+    "meta_cat_col: str = 'Software Version'"
    ]
   },
   {
@@ -187,6 +183,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "! factor out file reading to a separate module, not class\n",
     "# AnalyzePeptides.from_csv\n",
     "constructor = getattr(AnalyzePeptides, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT])\n",
     "analysis = constructor(fname=params.FN_INTENSITIES,\n",
@@ -202,7 +199,8 @@
     "log_fct = getattr(np, params.logarithm)\n",
     "analysis.log_transform(log_fct)\n",
     "logger.info(f\"{analysis = }\")\n",
-    "analysis.df"
+    "df = analysis.df\n",
+    "df"
    ]
   },
   {
@@ -213,8 +211,13 @@
    },
    "outputs": [],
    "source": [
-    "ax = analysis.df.notna().sum(axis=0).to_frame(\n",
-    "    analysis.df.columns.name).plot.box()\n",
+    "ax = (df\n",
+    "      .notna()\n",
+    "      .sum(axis=0)\n",
+    "      .to_frame(df.columns.name)\n",
+    "      .plot\n",
+    "      .box()\n",
+    "      )\n",
     "ax.set_ylabel('number of observation across samples')"
    ]
   },
@@ -228,7 +231,7 @@
     "dumps[fname.name] = fname.as_posix()\n",
     "writer = pd.ExcelWriter(fname)\n",
     "\n",
-    "notna = analysis.df.notna()\n",
+    "notna = df.notna()\n",
     "data_stats_original = pd.concat(\n",
     "    [\n",
     "        notna.sum().describe().rename('feat_stats'),\n",
@@ -258,15 +261,16 @@
     "    ret = \"_\".join(str(x) for x in seq)\n",
     "    return ret\n",
     "\n",
+    "\n",
     "# ToDo: join multiindex samples indices (pkl dumps)\n",
-    "# if hasattr(analysis.df.columns, \"levels\"):\n",
-    "if isinstance(analysis.df.columns, pd.MultiIndex):\n",
+    "# if hasattr(df.columns, \"levels\"):\n",
+    "if isinstance(df.columns, pd.MultiIndex):\n",
     "    logger.warning(\"combine MultiIndex columns to one feature column\")\n",
-    "    print(analysis.df.columns[:10].map(join_as_str))\n",
-    "    _new_name = join_as_str(analysis.df.columns.names)\n",
-    "    analysis.df.columns = analysis.df.columns.map(join_as_str)\n",
-    "    analysis.df.columns.name = _new_name\n",
-    "    logger.warning(f\"New name: {analysis.df.columns.names = }\")"
+    "    print(df.columns[:10].map(join_as_str))\n",
+    "    _new_name = join_as_str(df.columns.names)\n",
+    "    df.columns = df.columns.map(join_as_str)\n",
+    "    df.columns.name = _new_name\n",
+    "    logger.warning(f\"New name: {df.columns.names = }\")"
    ]
   },
   {
@@ -294,8 +298,8 @@
     "    if params.meta_cat_col:\n",
     "        raise ValueError(\n",
     "            f\"No metadata provided, but data column set: {params.meta_cat_col}\")\n",
-    "    df_meta = pd.DataFrame(index=analysis.df.index)\n",
-    "df_meta = df_meta.loc[analysis.df.index.to_list()]  # index is sample index\n",
+    "    df_meta = pd.DataFrame(index=df.index)\n",
+    "df_meta = df_meta.loc[df.index.to_list()]  # index is sample index\n",
     "if df_meta.index.name is None:\n",
     "    df_meta.index.name = params.index_col[0]\n",
     "df_meta"
@@ -304,7 +308,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "outputs": [],
    "source": [
     "if params.meta_date_col:\n",
@@ -316,20 +322,6 @@
     "df_meta"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if df_meta.columns.isin(thermo_raw_files.cols_instrument).sum() == len(thermo_raw_files.cols_instrument):\n",
-    "    display(df_meta.groupby(thermo_raw_files.cols_instrument)[\n",
-    "            params.meta_date_col].agg(['count', 'min', 'max']))\n",
-    "else:\n",
-    "    logger.info(\n",
-    "        f\"Instrument column not found: {thermo_raw_files.cols_instrument}\")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -364,7 +356,7 @@
     "    logger.info(msg)\n",
     "    df_meta = df_meta.loc[mask_RT]\n",
     "else:\n",
-    "    logger.warning(f\"Retention time filtering deactivated.\")"
+    "    logger.warning(\"Retention time filtering deactivated.\")"
    ]
   },
   {
@@ -396,7 +388,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "outputs": [],
    "source": [
     "try:\n",
@@ -409,65 +403,13 @@
     "        display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)])"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Optional, if using ThermoRawFileParser: check some columns describing settings\n",
-    "  - software can be updated: `Software Version`\n",
-    "  - `mass resolution` setting for instrument\n",
-    "  - colision type for MS2: `beam-type collision-induced dissocation`\n",
-    "  - missing `dilution factor`\n",
-    "  - omit (uncomment if needed):\n",
-    "    - quite some variation due to `MS max charge`: omit\n",
-    "    - variation by `injection volume setting` and instrument over time\n",
-    "        - 500ng of peptides should be injected, based on concentration of peptides this setting is adjusted to get it"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "lines_to_next_cell": 2
-   },
-   "outputs": [],
-   "source": [
-    "meta_raw_settings = [\n",
-    "    'Thermo Scientific instrument model',\n",
-    "    'instrument serial number',\n",
-    "    'Software Version',\n",
-    "    # 'MS max charge',\n",
-    "    'mass resolution',\n",
-    "    'beam-type collision-induced dissociation',\n",
-    "    # 'injection volume setting',\n",
-    "    'dilution factor',\n",
-    "]\n",
-    "\n",
-    "if df_meta.columns.isin(meta_raw_settings).sum() == len(meta_raw_settings):\n",
-    "    display(\n",
-    "        # index gives first example with this combination\n",
-    "        df_meta[meta_raw_settings].drop_duplicates()\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "- check for variation in `software Version` and `injection volume setting`\n",
-    "\n",
-    "\n",
-    "Update selection of samples based on metadata (e.g. minimal retention time)\n",
-    "- sort data the same as sorted meta data"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "analysis = add_meta_data(analysis, df_meta=df_meta)"
+    "df_meta = add_meta_data(df, df_meta=df_meta)"
    ]
   },
   {
@@ -483,7 +425,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "assert analysis.df.index.is_unique, \"Duplicates in index\""
+    "assert df.index.is_unique, \"Duplicates in index\""
    ]
   },
   {
@@ -504,21 +446,23 @@
    "outputs": [],
    "source": [
     "if params.select_N is not None:\n",
-    "    params.select_N = min(params.select_N, len(analysis.df_meta))\n",
+    "    params.select_N = min(params.select_N, len(df_meta))\n",
     "    if params.sample_N:\n",
-    "        analysis.df_meta = analysis.df_meta.sample(params.select_N)\n",
+    "        df_meta = df_meta.sample(params.select_N)\n",
     "    else:\n",
-    "        analysis.df_meta = analysis.df_meta.iloc[-params.select_N:]\n",
+    "        df_meta = df_meta.iloc[-params.select_N:]\n",
     "\n",
-    "    analysis.df = analysis.df.loc[analysis.df_meta.index].dropna(\n",
+    "    df = df.loc[df_meta.index].dropna(\n",
     "        how='all', axis=1)\n",
-    "    ax = analysis.df.T.describe().loc['count'].hist()\n",
+    "    ax = df.T.describe().loc['count'].hist()\n",
     "    _ = ax.set_title('histogram of features for all eligable samples')"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "source": [
     "## First Step: Select features by prevalence\n",
     "- `feat_prevalence` across samples"
@@ -530,9 +474,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "freq_per_feature = analysis.df.notna().sum()  # on wide format\n",
+    "freq_per_feature = df.notna().sum()  # on wide format\n",
     "if isinstance(params.feat_prevalence, float):\n",
-    "    N_samples = len(analysis.df_meta)\n",
+    "    N_samples = len(df)\n",
     "    logger.info(f\"Current number of samples: {N_samples}\")\n",
     "    logger.info(\n",
     "        f\"Feature has to be present in at least {params.feat_prevalence:.2%} of samples\")\n",
@@ -546,13 +490,19 @@
     "mask = freq_per_feature >= params.feat_prevalence\n",
     "logger.info(f\"Drop {(~mask).sum()} features\")\n",
     "freq_per_feature = freq_per_feature.loc[mask]\n",
-    "analysis.df = analysis.df.loc[:, mask]\n",
-    "analysis.N, analysis.M = analysis.df.shape\n",
-    "\n",
+    "df = df.loc[:, mask]\n",
+    "analysis.N, analysis.M = df.shape\n",
     "# # potentially create freq based on DataFrame\n",
-    "analysis.df\n",
-    "\n",
-    "notna = analysis.df.notna()\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "notna = df.notna()\n",
     "data_stats_filtered = pd.concat(\n",
     "    [\n",
     "        notna.sum().describe().rename('feat_stats'),\n",
@@ -587,11 +537,11 @@
     "    msg = f'Fraction of minimum sample completeness over all features specified with: {params.sample_completeness}\\n'\n",
     "    # assumes df in wide format\n",
     "    params.overwrite_entry('sample_completeness', int(\n",
-    "        analysis.df.shape[1] * params.sample_completeness))\n",
+    "        df.shape[1] * params.sample_completeness))\n",
     "    msg += f'This translates to a minimum number of features per sample (to be included): {params.sample_completeness}'\n",
     "    logger.info(msg)\n",
     "\n",
-    "sample_counts = analysis.df.notna().sum(axis=1)  # if DataFrame\n",
+    "sample_counts = df.notna().sum(axis=1)  # if DataFrame\n",
     "sample_counts.describe()"
    ]
   },
@@ -604,8 +554,8 @@
     "mask = sample_counts > params.sample_completeness\n",
     "msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.'\n",
     "print(msg)\n",
-    "analysis.df = analysis.df.loc[mask]\n",
-    "analysis.df = analysis.df.dropna(\n",
+    "df = df.loc[mask]\n",
+    "df = df.dropna(\n",
     "    axis=1, how='all')  # drop now missing features"
    ]
   },
@@ -615,8 +565,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "params.N, params.M = analysis.df.shape  # save data dimensions\n",
-    "params.used_samples = analysis.df.index.to_list()"
+    "params.N, params.M = df.shape  # save data dimensions\n",
+    "params.used_samples = df.index.to_list()"
    ]
   },
   {
@@ -632,7 +582,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ax = analysis.df.notna().sum(axis=1).hist()\n",
+    "ax = df.notna().sum(axis=1).hist()\n",
     "ax.set_xlabel('features per eligable sample')\n",
     "ax.set_ylabel('observations')\n",
     "fname = params.out_figures / 'hist_features_per_sample'\n",
@@ -648,7 +598,7 @@
    },
    "outputs": [],
    "source": [
-    "ax = analysis.df.notna().sum(axis=0).sort_values().plot()\n",
+    "ax = df.notna().sum(axis=0).sort_values().plot()\n",
     "_new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()]\n",
     "_ = ax.set_xticklabels(_new_labels, rotation=45,\n",
     "                       horizontalalignment='right')\n",
@@ -672,9 +622,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "min_max = vaep.plotting.data.min_max(analysis.df.stack())\n",
+    "min_max = vaep.plotting.data.min_max(df.stack())\n",
     "ax, bins = vaep.plotting.data.plot_histogram_intensities(\n",
-    "    analysis.df.stack(), min_max=min_max)\n",
+    "    df.stack(), min_max=min_max)\n",
     "\n",
     "fname = params.out_figures / 'intensity_distribution_overall'\n",
     "figures[fname.stem] = fname\n",
@@ -688,7 +638,7 @@
    "outputs": [],
    "source": [
     "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
-    "    data=analysis.df, type='scatter')\n",
+    "    data=df, type='scatter')\n",
     "fname = params.out_figures / 'intensity_median_vs_prop_missing_scatter'\n",
     "figures[fname.stem] = fname\n",
     "vaep.savefig(ax.get_figure(), fname)"
@@ -701,7 +651,7 @@
    "outputs": [],
    "source": [
     "ax = vaep.plotting.data.plot_feat_median_over_prop_missing(\n",
-    "    data=analysis.df, type='boxplot')\n",
+    "    data=df, type='boxplot')\n",
     "fname = params.out_figures / 'intensity_median_vs_prop_missing_boxplot'\n",
     "figures[fname.stem] = fname\n",
     "vaep.savefig(ax.get_figure(), fname)"
@@ -714,13 +664,6 @@
     "### Interactive and Single plots"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Scatter plots need to become interactive."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -737,9 +680,9 @@
    "outputs": [],
    "source": [
     "K = 2\n",
-    "analysis.df = analysis.df.astype(float)\n",
+    "df = df.astype(float)\n",
     "pcs = analysis.get_PCA(n_components=K)  # should be renamed to get_PCs\n",
-    "pcs = pcs.iloc[:, :K].join(analysis.df_meta).join(sample_counts)\n",
+    "pcs = pcs.iloc[:, :K].join(df_meta).join(sample_counts)\n",
     "\n",
     "pcs_name = pcs.columns[:2]\n",
     "pcs_index_name = pcs.index.name\n",
@@ -763,11 +706,11 @@
    "outputs": [],
    "source": [
     "if params.meta_cat_col:\n",
-    "    fig, ax = plt.subplots(figsize=(2,2))\n",
+    "    fig, ax = plt.subplots(figsize=(2, 2))\n",
     "    analyzers.seaborn_scatter(\n",
     "        pcs[pcs_name], ax, meta=pcs[params.meta_cat_col], title=f\"by {params.meta_cat_col}\")\n",
     "    fname = (params.out_figures\n",
-    "              / f'pca_sample_by_{\"_\".join(params.meta_cat_col.split())}')\n",
+    "             / f'pca_sample_by_{\"_\".join(params.meta_cat_col.split())}')\n",
     "    figures[fname.stem] = fname\n",
     "    vaep.savefig(fig, fname)"
    ]
@@ -791,7 +734,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "- software version: Does it make a difference?\n",
     "- size: number of features in a single sample"
    ]
   },
@@ -830,8 +772,8 @@
     "    # color=pcs['Software Version'],\n",
     "    color=col_identified_feat,\n",
     "    template='none',\n",
-    "    width=1200, # 4 inches x 300 dpi\n",
-    "    height=600 # 2 inches x 300 dpi\n",
+    "    width=1200,  # 4 inches x 300 dpi\n",
+    "    height=600  # 2 inches x 300 dpi\n",
     ")\n",
     "fname = (params.out_figures\n",
     "         / f'pca_sample_by_{\"_\".join(col_identified_feat.split())}_plotly.pdf')\n",
@@ -853,7 +795,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "analysis.df.head()"
+    "df.head()"
    ]
   },
   {
@@ -862,12 +804,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = analysis.df\n",
-    "df = df.join(df_meta[params.meta_date_col])\n",
-    "df = df.set_index(params.meta_date_col).sort_index()\n",
+    "df_w_date = df.join(df_meta[params.meta_date_col])\n",
+    "df_w_date = df_w_date.set_index(params.meta_date_col).sort_index()\n",
     "if not params.meta_date_col == 'PlaceholderTime':\n",
-    "    df.to_period('min')\n",
-    "df = df.T"
+    "    df_w_date.to_period('min')\n",
+    "df_w_date = df_w_date.T\n",
+    "df_w_date"
    ]
   },
   {
@@ -876,13 +818,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ax = df.boxplot(rot=80, figsize=(8, 3), fontsize=5,\n",
-    "                showfliers=False, showcaps=False)\n",
+    "ax = df_w_date.boxplot(rot=80,\n",
+    "                       figsize=(8, 3),\n",
+    "                       fontsize=6,\n",
+    "                       showfliers=False,\n",
+    "                       showcaps=False\n",
+    "                       )\n",
     "_ = vaep.plotting.select_xticks(ax)\n",
     "fig = ax.get_figure()\n",
     "fname = params.out_figures / 'median_boxplot'\n",
     "figures[fname.stem] = fname\n",
-    "vaep.savefig(fig, fname)"
+    "vaep.savefig(fig, fname)\n",
+    "del df_w_date"
    ]
   },
   {
@@ -898,7 +845,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.stack().describe(percentiles=np.linspace(0.05, 0.95, 10))"
+    "df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2))"
    ]
   },
   {
@@ -918,15 +865,14 @@
    "source": [
     "if not params.meta_date_col == 'PlaceholderTime':\n",
     "    dates = df_meta[params.meta_date_col].sort_values()\n",
-    "    # dates.name = 'date'\n",
-    "    median_sample_intensity = (analysis.df\n",
+    "    median_sample_intensity = (df\n",
     "                               .median(axis=1)\n",
     "                               .to_frame('median intensity'))\n",
     "    median_sample_intensity = median_sample_intensity.join(dates)\n",
     "\n",
     "    ax = median_sample_intensity.plot.scatter(x=dates.name, y='median intensity',\n",
     "                                              rot=90,\n",
-    "                                              fontsize='large',\n",
+    "                                              #   fontsize=6,\n",
     "                                              figsize=(8, 2),\n",
     "                                              s=5,\n",
     "                                              xticks=vaep.plotting.select_dates(\n",
@@ -948,40 +894,26 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Feature frequency  in data\n",
-    "\n",
-    "- higher count, higher probability to be sampled into training data\n",
-    "- missing peptides are sampled both into training as well as into validation dataset\n",
-    "- everything not in training data is validation data\n",
-    "\n",
-    "Based on unmodified training data"
+    "## Feature frequency  in data"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "msg = \"Total number of samples in training data split: {}\"\n",
-    "print(msg.format(len(analysis.df)))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "outputs": [],
    "source": [
-    "# # analysis.splits.to_wide_format()\n",
-    "# assert analysis.splits is splits, \"Sanity check failed.\""
+    "msg = \"Total number of samples in data: {}\"\n",
+    "print(msg.format(len(df)))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Recalculate feature frequency after selecting some samples"
+    "Recalculate feature frequency after selecting samples"
    ]
   },
   {
@@ -990,14 +922,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "freq_per_feature = feature_frequency(analysis.df)\n",
+    "freq_per_feature = feature_frequency(df)\n",
     "freq_per_feature"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "outputs": [],
    "source": [
     "# freq_per_feature.name = 'Gene names freq' # name it differently?\n",
@@ -1010,29 +944,29 @@
     "freq_per_feature.to_pickle(fname)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Conserning sampling with frequency weights:\n",
-    "  - larger weight -> higher probablility of being sampled\n",
-    "  - weights need to be alignable to index of original DataFrame before grouping (same index)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Split: Train, validation and test data\n",
     "\n",
-    "- test data is in clinical language often denoted as independent validation cohort\n",
-    "- validation data (for model)"
+    "Select features as described in\n",
+    "> Lazar, Cosmin, Laurent Gatto, Myriam Ferro, Christophe Bruley, and Thomas Burger. 2016.\n",
+    "> “Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative\n",
+    "> Proteomics Data Sets to Compare Imputation Strategies.”\n",
+    "> Journal of Proteome Research 15 (4): 1116–25.\n",
+    "\n",
+    "- select `frac_mnar` based on threshold matrix on quantile of overall frac of data to be used\n",
+    "  for validation and test data split, e.g. 0.1 = quantile(0.1)\n",
+    "- select frac_mnar from intensities selected using threshold matrix"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "outputs": [],
    "source": [
     "analysis.splits = DataSplits(is_wide_format=False)\n",
@@ -1045,23 +979,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Sample targets (Fake NAs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Add goldstandard targets for valiation and test data\n",
-    "- based on same day\n",
-    "- same instrument"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create some target values by sampling 5% of the validation and test data."
+    "Create some target values by sampling X% of the validation and test data.\n",
+    "Simulated missing values are not used for validation and testing."
    ]
   },
   {
@@ -1080,15 +999,87 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(),\n",
-    "                                      sample_index_to_drop=0,\n",
-    "                                      weights=freq_per_feature,\n",
-    "                                      frac=0.1,\n",
-    "                                      random_state=params.random_state,)\n",
-    "assert len(splits.train_X) > len(fake_na)\n",
-    "splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state).sort_index()\n",
-    "splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)]\n",
-    "# splits"
+    "# if not mnar:\n",
+    "#     fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(),\n",
+    "#                                           sample_index_to_drop=0,\n",
+    "#                                           #   weights=freq_per_feature,\n",
+    "#                                           frac=0.1,\n",
+    "#                                           random_state=params.random_state,)\n",
+    "#     assert len(splits.train_X) > len(fake_na)\n",
+    "! move parameter checks to start of script\n",
+    "if 0.0 <= params.frac_mnar <= 1.0:\n",
+    "    fig, axes = plt.subplots(1, 2, figsize=(8, 2))\n",
+    "    quantile_frac = analysis.df_long.quantile(params.frac_non_train)\n",
+    "    rng = np.random.default_rng(params.random_state)\n",
+    "    threshold = pd.Series(rng.normal(loc=float(quantile_frac),\n",
+    "                                     scale=float(0.3 * analysis.df_long.std()),\n",
+    "                                     size=len(analysis.df_long),\n",
+    "                                     ),\n",
+    "                          index=analysis.df_long.index,\n",
+    "                          )\n",
+    "    # plot data vs threshold data\n",
+    "    ax = axes[0]\n",
+    "    from functools import partial\n",
+    "    plot_histogram_intensities = partial(vaep.plotting.data.plot_histogram_intensities,\n",
+    "                                         min_max=min_max,\n",
+    "                                         alpha=0.8)\n",
+    "    plot_histogram_intensities(\n",
+    "        analysis.df_long.squeeze(),\n",
+    "        ax=ax,\n",
+    "        label='observed')\n",
+    "    plot_histogram_intensities(\n",
+    "        threshold,\n",
+    "        ax=ax,\n",
+    "        label='thresholds')\n",
+    "    ax.legend()\n",
+    "    # select MNAR (intensity between randomly sampled threshold)\n",
+    "    mask = analysis.df_long.squeeze() < threshold\n",
+    "    ! subsample to have exact fraction of MNAR?\n",
+    "    N = len(analysis.df_long)\n",
+    "    logger.info(f\"{int(N * params.frac_non_train) = :,d}\")\n",
+    "    N_MNAR = int(params.frac_non_train * params.frac_mnar * N)\n",
+    "    fake_na_mnar = analysis.df_long.loc[mask]\n",
+    "    if len(fake_na_mnar) > N_MNAR:\n",
+    "        fake_na_mnar = fake_na_mnar.sample(N_MNAR,\n",
+    "                                           random_state=params.random_state)\n",
+    "    splits.train_X = analysis.df_long.loc[\n",
+    "        analysis.df_long.index.difference(\n",
+    "            fake_na_mnar.index)\n",
+    "    ]\n",
+    "    logger.info(f\"{len(fake_na_mnar) = :,d}\")\n",
+    "    N_MCAR = int(N * (1 - params.frac_mnar) * params.frac_non_train)\n",
+    "    fake_na_mcar = splits.train_X.sample(N_MCAR,\n",
+    "                                         random_state=params.random_state)\n",
+    "    logger.info(f\"{len(fake_na_mcar) = :,d}\")\n",
+    "    splits.train_X = (splits\n",
+    "                      .train_X\n",
+    "                      .loc[splits\n",
+    "                           .train_X\n",
+    "                           .index\n",
+    "                           .difference(\n",
+    "                               fake_na_mcar.index)]\n",
+    "                      ).squeeze()\n",
+    "    logger.info(f\"{len(splits.train_X) = :,d}\")\n",
+    "    fake_na = pd.concat([fake_na_mcar, fake_na_mnar]).squeeze()\n",
+    "    logger.info(f\"{len(fake_na) = :,d}\")\n",
+    "    ax = axes[1]\n",
+    "    plot_histogram_intensities(\n",
+    "        fake_na_mnar.squeeze(),\n",
+    "        ax=ax,\n",
+    "        label=f'MNAR ({N_MNAR:,d})',\n",
+    "        color='C2')\n",
+    "    plot_histogram_intensities(\n",
+    "        fake_na_mcar.squeeze(),\n",
+    "        ax=ax,\n",
+    "        color='C3',\n",
+    "        label=f'MCAR ({N_MCAR:,d})')\n",
+    "    ax.legend()\n",
+    "    assert len(fake_na) + len(splits.train_X) == len(analysis.df_long)\n",
+    "else:\n",
+    "    raise ValueError(f\"Invalid MNAR float value (should be betw. 0 and 1): {params.frac_mnar}\")\n",
+    "\n",
+    "splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state)\n",
+    "splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)]"
    ]
   },
   {
@@ -1131,14 +1122,14 @@
     "# per feature are allowd.\n",
     "\n",
     "diff = (splits\n",
-    "    .val_y\n",
-    "    .index\n",
-    "    .levels[-1]\n",
-    "    .difference(splits\n",
-    "                .train_X\n",
-    "                .index\n",
-    "                .levels[-1]\n",
-    "    ).to_list())\n",
+    "        .val_y\n",
+    "        .index\n",
+    "        .levels[-1]\n",
+    "        .difference(splits\n",
+    "                    .train_X\n",
+    "                    .index\n",
+    "                    .levels[-1]\n",
+    "                    ).to_list())\n",
     "if diff:\n",
     "    to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]\n",
     "    display(to_remove)\n",
@@ -1156,14 +1147,14 @@
    "outputs": [],
    "source": [
     "diff = (splits\n",
-    "    .test_y\n",
-    "    .index\n",
-    "    .levels[-1]\n",
-    "    .difference(splits\n",
-    "                .train_X\n",
-    "                .index\n",
-    "                .levels[-1]\n",
-    "    ).to_list())\n",
+    "        .test_y\n",
+    "        .index\n",
+    "        .levels[-1]\n",
+    "        .difference(splits\n",
+    "                    .train_X\n",
+    "                    .index\n",
+    "                    .levels[-1]\n",
+    "                    ).to_list())\n",
     "if diff:\n",
     "    to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]\n",
     "    display(to_remove)\n",
@@ -1228,7 +1219,7 @@
     "splits_df['val'] = splits.val_y\n",
     "splits_df['test'] = splits.test_y\n",
     "stats_splits = splits_df.describe()\n",
-    "stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f')\n",
+    "# stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f')\n",
     "stats_splits"
    ]
   },
@@ -1263,7 +1254,7 @@
     "          bins=bins,\n",
     "          ax=None,\n",
     "          color='C0',\n",
-    "))\n",
+    "      ))\n",
     "_ = (splits\n",
     "     .val_y\n",
     "     .plot\n",
@@ -1292,7 +1283,7 @@
     "                         legend=False,\n",
     "                         stacked=True,\n",
     "                         color=['C0', 'C1', 'C2'],\n",
-    "    )\n",
+    "                         )\n",
     "ax.legend(_legend)\n",
     "ax.set_xlabel('Intensity bins')\n",
     "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n",
@@ -1312,7 +1303,7 @@
     "                                               color=['C1', 'C2'],\n",
     "                                               legend=False,\n",
     "                                               stacked=True,\n",
-    "                        )\n",
+    "                                               )\n",
     "ax.legend(_legend[1:])\n",
     "ax.set_xlabel('Intensity bins')\n",
     "ax.yaxis.set_major_formatter(\"{x:,.0f}\")\n",
@@ -1363,6 +1354,40 @@
     "vaep.savefig(ax.get_figure(), fname)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "lines_to_next_cell": 0
+   },
+   "outputs": [],
+   "source": [
+    "medians = (splits\n",
+    "           .train_X\n",
+    "           .median()\n",
+    "           .astype(int)\n",
+    "           ).to_frame('median_floor')\n",
+    "\n",
+    "feat_with_median = medians.groupby('median_floor').size().rename('M feat')\n",
+    "medians = medians.join(feat_with_median, on='median_floor')\n",
+    "medians = medians.apply(lambda s: \"{:02,d} (N={:3,d})\".format(*s), axis=1)\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(8, 2))\n",
+    "s = 1\n",
+    "s_axes = pd.DataFrame({'medians': medians,\n",
+    "                       'validation split': splits.val_y.notna().sum(),\n",
+    "                       'training split': splits.train_X.notna().sum()}\n",
+    "                      ).plot.box(by='medians',\n",
+    "                                 subplots=True,\n",
+    "                                 boxprops=dict(linewidth=s),\n",
+    "                                 flierprops=dict(markersize=s),\n",
+    "                                 ax=ax)\n",
+    "for ax in s_axes:\n",
+    "    _ = ax.set_xticklabels(ax.get_xticklabels(),\n",
+    "                           rotation=45,\n",
+    "                           horizontalalignment='right')"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/project/01_0_split_data.py b/project/01_0_split_data.py
index 48b1fa31d..9507aeb39 100644
--- a/project/01_0_split_data.py
+++ b/project/01_0_split_data.py
@@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.5
+#       jupytext_version: 1.15.0
 #   kernelspec:
 #     display_name: Python 3
 #     language: python
@@ -20,10 +20,10 @@
 
 # %%
 from pathlib import Path
-
+import logging
 from typing import Union, List
 
-
+from IPython.display import display
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -32,31 +32,31 @@
 
 import vaep
 from vaep.io.datasplits import DataSplits
-from vaep.io import thermo_raw_files
 from vaep.sampling import feature_frequency, sample_data
 
 from vaep.analyzers import analyzers
-from vaep.analyzers.analyzers import  AnalyzePeptides
+from vaep.analyzers.analyzers import AnalyzePeptides
 
 logger = vaep.logging.setup_nb_logger()
 logger.info("Split data and make diagnostic plots")
+logging.getLogger('fontTools').setLevel(logging.WARNING)
+
 
-def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame):
+def add_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):
     try:
-        analysis.df = analysis.df.loc[df_meta.index]
+        df = df.loc[df_meta.index]
     except KeyError as e:
         logger.warning(e)
         logger.warning("Ignore missing samples in quantified samples")
-        analysis.df = analysis.df.loc[analysis.df.index.intersection(
+        df = df.loc[df.index.intersection(
             df_meta.index)]
-
-    analysis.df_meta = df_meta
-    return analysis
+    return df_meta
 
 
 pd.options.display.max_columns = 32
 plt.rcParams['figure.figsize'] = [4, 2]
-vaep.plotting.make_large_descriptors(5)
+
+vaep.plotting.make_large_descriptors(6)
 
 figures = {}  # collection of ax or figures
 dumps = {}  # collection of data dumps
@@ -70,33 +70,29 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame):
 args = dict(globals()).keys()
 
 # %% tags=["parameters"]
-# Sample (rows) intensiites for features (columns)
-FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'
-# Can be either a string or position (typical 0 for first column), or a list of these.
-index_col: Union[str, int] = 0
-# wide_format: bool = False # intensities in wide format (more memory efficient of csv). Default is long_format (more precise)
-# Manuelly set column names (of Index object in columns)
-column_names: List[str] = ["Gene Names"]
-# Machine parsed metadata from raw file (see workflows/metadata), wide format per sample
-fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
-# Minimum number or fraction of feature prevalence across samples to be kept
-feat_prevalence: Union[int, float] = 0.25
-# Minimum number or fraction of total requested features per Sample
-sample_completeness: Union[int, float] = 0.5
+FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'  # Sample (rows), features (columns)
+index_col: Union[str, int] = 0  # Can be either a string or position (default 0 for first column), or a list of these.
+column_names: List[str] = ["Gene Names"]  # Manuelly set column names (of Index object in columns)
+fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'  # metadata for samples (rows)
+feat_prevalence: Union[int, float] = 0.25  # Minimum number or fraction of feature prevalence across samples to be kept
+sample_completeness: Union[int, float] = 0.5  # Minimum number or fraction of total requested features per Sample
 select_N: int = None  # only use latest N samples
-sample_N: bool = False # if select_N, sample N randomly instead of using latest?
+sample_N: bool = False  # if select_N, sample N randomly instead of using latest N
 random_state: int = 42  # random state for reproducibility of splits
-# based on raw file meta data, only take samples with RT > min_RT_time
-min_RT_time: Union[int, float] = None
-# Log transformation of initial data (select one of the existing in numpy)
-logarithm: str = 'log2'
-folder_experiment: str = f'runs/example'
-folder_data: str = ''  # specify data directory if needed
+min_RT_time: Union[int, float] = None  # based on raw file meta data, only take samples with RT > min_RT_time
+logarithm: str = 'log2'  # Log transformation of initial data (select one of the existing in numpy)
+folder_experiment: str = 'runs/example'  # folder to save figures and data dumps
+folder_data: str = ''  # specify special data directory if needed
 file_format: str = 'csv'  # file format of create splits, default pickle (pkl)
 # metadata -> defaults for metadata extracted from machine data, used for plotting
 meta_date_col: str = None  # date column in meta data
 meta_cat_col: str = None  # category column in meta data
+# train, validation and test data splits
+frac_non_train: float = 0.1  # fraction of non training data (validation and test split)
+frac_mnar: float = 0.0  # fraction of missing not at random data, rest: missing completely at random
 
+meta_date_col: str = 'Content Creation Date'
+meta_cat_col: str = 'Software Version'
 
 # %%
 args = vaep.nb.get_params(args, globals=globals())
@@ -132,6 +128,7 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame):
     f"File format (extension): {FILE_EXT}  (!specifies data loading function!)")
 
 # %%
+# ! factor out file reading to a separate module, not class
 # AnalyzePeptides.from_csv
 constructor = getattr(AnalyzePeptides, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT])
 analysis = constructor(fname=params.FN_INTENSITIES,
@@ -147,11 +144,17 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame):
 log_fct = getattr(np, params.logarithm)
 analysis.log_transform(log_fct)
 logger.info(f"{analysis = }")
-analysis.df
+df = analysis.df
+df
 
 # %%
-ax = analysis.df.notna().sum(axis=0).to_frame(
-    analysis.df.columns.name).plot.box()
+ax = (df
+      .notna()
+      .sum(axis=0)
+      .to_frame(df.columns.name)
+      .plot
+      .box()
+      )
 ax.set_ylabel('number of observation across samples')
 
 
@@ -160,7 +163,7 @@ def add_meta_data(analysis: AnalyzePeptides, df_meta: pd.DataFrame):
 dumps[fname.name] = fname.as_posix()
 writer = pd.ExcelWriter(fname)
 
-notna = analysis.df.notna()
+notna = df.notna()
 data_stats_original = pd.concat(
     [
         notna.sum().describe().rename('feat_stats'),
@@ -181,15 +184,16 @@ def join_as_str(seq):
     ret = "_".join(str(x) for x in seq)
     return ret
 
+
 # ToDo: join multiindex samples indices (pkl dumps)
-# if hasattr(analysis.df.columns, "levels"):
-if isinstance(analysis.df.columns, pd.MultiIndex):
+# if hasattr(df.columns, "levels"):
+if isinstance(df.columns, pd.MultiIndex):
     logger.warning("combine MultiIndex columns to one feature column")
-    print(analysis.df.columns[:10].map(join_as_str))
-    _new_name = join_as_str(analysis.df.columns.names)
-    analysis.df.columns = analysis.df.columns.map(join_as_str)
-    analysis.df.columns.name = _new_name
-    logger.warning(f"New name: {analysis.df.columns.names = }")
+    print(df.columns[:10].map(join_as_str))
+    _new_name = join_as_str(df.columns.names)
+    df.columns = df.columns.map(join_as_str)
+    df.columns.name = _new_name
+    logger.warning(f"New name: {df.columns.names = }")
 
 # %% [markdown]
 # ## Machine metadata
@@ -207,8 +211,8 @@ def join_as_str(seq):
     if params.meta_cat_col:
         raise ValueError(
             f"No metadata provided, but data column set: {params.meta_cat_col}")
-    df_meta = pd.DataFrame(index=analysis.df.index)
-df_meta = df_meta.loc[analysis.df.index.to_list()]  # index is sample index
+    df_meta = pd.DataFrame(index=df.index)
+df_meta = df_meta.loc[df.index.to_list()]  # index is sample index
 if df_meta.index.name is None:
     df_meta.index.name = params.index_col[0]
 df_meta
@@ -222,13 +226,6 @@ def join_as_str(seq):
     df_meta[params.meta_date_col] = range(len(df_meta))
 df_meta
 
-# %%
-if df_meta.columns.isin(thermo_raw_files.cols_instrument).sum() == len(thermo_raw_files.cols_instrument):
-    display(df_meta.groupby(thermo_raw_files.cols_instrument)[
-            params.meta_date_col].agg(['count', 'min', 'max']))
-else:
-    logger.info(
-        f"Instrument column not found: {thermo_raw_files.cols_instrument}")
 
 # %%
 df_meta.describe(datetime_is_numeric=True,
@@ -249,7 +246,7 @@ def join_as_str(seq):
     logger.info(msg)
     df_meta = df_meta.loc[mask_RT]
 else:
-    logger.warning(f"Retention time filtering deactivated.")
+    logger.warning("Retention time filtering deactivated.")
 
 # %%
 df_meta = df_meta.sort_values(params.meta_date_col)
@@ -271,51 +268,15 @@ def join_as_str(seq):
     if 'unique' in meta_stats.index:
         display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)])
 
-# %% [markdown]
-# Optional, if using ThermoRawFileParser: check some columns describing settings
-#   - software can be updated: `Software Version`
-#   - `mass resolution` setting for instrument
-#   - colision type for MS2: `beam-type collision-induced dissocation`
-#   - missing `dilution factor`
-#   - omit (uncomment if needed):
-#     - quite some variation due to `MS max charge`: omit
-#     - variation by `injection volume setting` and instrument over time
-#         - 500ng of peptides should be injected, based on concentration of peptides this setting is adjusted to get it
-
-# %%
-meta_raw_settings = [
-    'Thermo Scientific instrument model',
-    'instrument serial number',
-    'Software Version',
-    # 'MS max charge',
-    'mass resolution',
-    'beam-type collision-induced dissociation',
-    # 'injection volume setting',
-    'dilution factor',
-]
-
-if df_meta.columns.isin(meta_raw_settings).sum() == len(meta_raw_settings):
-    display(
-        # index gives first example with this combination
-        df_meta[meta_raw_settings].drop_duplicates()
-    )
-
-
-# %% [markdown]
-# - check for variation in `software Version` and `injection volume setting`
-#
-#
-# Update selection of samples based on metadata (e.g. minimal retention time)
-# - sort data the same as sorted meta data
 
 # %%
-analysis = add_meta_data(analysis, df_meta=df_meta)
+df_meta = add_meta_data(df, df_meta=df_meta)
 
 # %% [markdown]
 # Ensure unique indices
 
 # %%
-assert analysis.df.index.is_unique, "Duplicates in index"
+assert df.index.is_unique, "Duplicates in index"
 
 # %% [markdown]
 # ## Select a subset of samples if specified (reduce the number of samples)
@@ -326,25 +287,26 @@ def join_as_str(seq):
 
 # %%
 if params.select_N is not None:
-    params.select_N = min(params.select_N, len(analysis.df_meta))
+    params.select_N = min(params.select_N, len(df_meta))
     if params.sample_N:
-        analysis.df_meta = analysis.df_meta.sample(params.select_N)
+        df_meta = df_meta.sample(params.select_N)
     else:
-        analysis.df_meta = analysis.df_meta.iloc[-params.select_N:]
+        df_meta = df_meta.iloc[-params.select_N:]
 
-    analysis.df = analysis.df.loc[analysis.df_meta.index].dropna(
+    df = df.loc[df_meta.index].dropna(
         how='all', axis=1)
-    ax = analysis.df.T.describe().loc['count'].hist()
+    ax = df.T.describe().loc['count'].hist()
     _ = ax.set_title('histogram of features for all eligable samples')
 
 # %% [markdown]
 # ## First Step: Select features by prevalence
 # - `feat_prevalence` across samples
 
+
 # %%
-freq_per_feature = analysis.df.notna().sum()  # on wide format
+freq_per_feature = df.notna().sum()  # on wide format
 if isinstance(params.feat_prevalence, float):
-    N_samples = len(analysis.df_meta)
+    N_samples = len(df)
     logger.info(f"Current number of samples: {N_samples}")
     logger.info(
         f"Feature has to be present in at least {params.feat_prevalence:.2%} of samples")
@@ -358,13 +320,13 @@ def join_as_str(seq):
 mask = freq_per_feature >= params.feat_prevalence
 logger.info(f"Drop {(~mask).sum()} features")
 freq_per_feature = freq_per_feature.loc[mask]
-analysis.df = analysis.df.loc[:, mask]
-analysis.N, analysis.M = analysis.df.shape
-
+df = df.loc[:, mask]
+analysis.N, analysis.M = df.shape
 # # potentially create freq based on DataFrame
-analysis.df
+df
 
-notna = analysis.df.notna()
+# %%
+notna = df.notna()
 data_stats_filtered = pd.concat(
     [
         notna.sum().describe().rename('feat_stats'),
@@ -385,30 +347,30 @@ def join_as_str(seq):
     msg = f'Fraction of minimum sample completeness over all features specified with: {params.sample_completeness}\n'
     # assumes df in wide format
     params.overwrite_entry('sample_completeness', int(
-        analysis.df.shape[1] * params.sample_completeness))
+        df.shape[1] * params.sample_completeness))
     msg += f'This translates to a minimum number of features per sample (to be included): {params.sample_completeness}'
     logger.info(msg)
 
-sample_counts = analysis.df.notna().sum(axis=1)  # if DataFrame
+sample_counts = df.notna().sum(axis=1)  # if DataFrame
 sample_counts.describe()
 
 # %%
 mask = sample_counts > params.sample_completeness
 msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.'
 print(msg)
-analysis.df = analysis.df.loc[mask]
-analysis.df = analysis.df.dropna(
+df = df.loc[mask]
+df = df.dropna(
     axis=1, how='all')  # drop now missing features
 
 # %%
-params.N, params.M = analysis.df.shape  # save data dimensions
-params.used_samples = analysis.df.index.to_list()
+params.N, params.M = df.shape  # save data dimensions
+params.used_samples = df.index.to_list()
 
 # %% [markdown]
 # ### Histogram of features per sample
 
 # %%
-ax = analysis.df.notna().sum(axis=1).hist()
+ax = df.notna().sum(axis=1).hist()
 ax.set_xlabel('features per eligable sample')
 ax.set_ylabel('observations')
 fname = params.out_figures / 'hist_features_per_sample'
@@ -416,7 +378,7 @@ def join_as_str(seq):
 vaep.savefig(ax.get_figure(), fname)
 
 # %%
-ax = analysis.df.notna().sum(axis=0).sort_values().plot()
+ax = df.notna().sum(axis=0).sort_values().plot()
 _new_labels = [l.get_text().split(';')[0] for l in ax.get_xticklabels()]
 _ = ax.set_xticklabels(_new_labels, rotation=45,
                        horizontalalignment='right')
@@ -431,9 +393,9 @@ def join_as_str(seq):
 # ### Number off observations accross feature value
 
 # %%
-min_max = vaep.plotting.data.min_max(analysis.df.stack())
+min_max = vaep.plotting.data.min_max(df.stack())
 ax, bins = vaep.plotting.data.plot_histogram_intensities(
-    analysis.df.stack(), min_max=min_max)
+    df.stack(), min_max=min_max)
 
 fname = params.out_figures / 'intensity_distribution_overall'
 figures[fname.stem] = fname
@@ -441,14 +403,14 @@ def join_as_str(seq):
 
 # %%
 ax = vaep.plotting.data.plot_feat_median_over_prop_missing(
-    data=analysis.df, type='scatter')
+    data=df, type='scatter')
 fname = params.out_figures / 'intensity_median_vs_prop_missing_scatter'
 figures[fname.stem] = fname
 vaep.savefig(ax.get_figure(), fname)
 
 # %%
 ax = vaep.plotting.data.plot_feat_median_over_prop_missing(
-    data=analysis.df, type='boxplot')
+    data=df, type='boxplot')
 fname = params.out_figures / 'intensity_median_vs_prop_missing_boxplot'
 figures[fname.stem] = fname
 vaep.savefig(ax.get_figure(), fname)
@@ -456,17 +418,14 @@ def join_as_str(seq):
 # %% [markdown]
 # ### Interactive and Single plots
 
-# %% [markdown]
-# Scatter plots need to become interactive.
-
 # %%
 sample_counts.name = 'identified features'
 
 # %%
 K = 2
-analysis.df = analysis.df.astype(float)
+df = df.astype(float)
 pcs = analysis.get_PCA(n_components=K)  # should be renamed to get_PCs
-pcs = pcs.iloc[:, :K].join(analysis.df_meta).join(sample_counts)
+pcs = pcs.iloc[:, :K].join(df_meta).join(sample_counts)
 
 pcs_name = pcs.columns[:2]
 pcs_index_name = pcs.index.name
@@ -478,11 +437,11 @@ def join_as_str(seq):
 
 # %%
 if params.meta_cat_col:
-    fig, ax = plt.subplots(figsize=(2,2))
+    fig, ax = plt.subplots(figsize=(2, 2))
     analyzers.seaborn_scatter(
         pcs[pcs_name], ax, meta=pcs[params.meta_cat_col], title=f"by {params.meta_cat_col}")
     fname = (params.out_figures
-              / f'pca_sample_by_{"_".join(params.meta_cat_col.split())}')
+             / f'pca_sample_by_{"_".join(params.meta_cat_col.split())}')
     figures[fname.stem] = fname
     vaep.savefig(fig, fname)
 
@@ -496,7 +455,6 @@ def join_as_str(seq):
     vaep.savefig(fig, fname)
 
 # %% [markdown]
-# - software version: Does it make a difference?
 # - size: number of features in a single sample
 
 # %%
@@ -523,8 +481,8 @@ def join_as_str(seq):
     # color=pcs['Software Version'],
     color=col_identified_feat,
     template='none',
-    width=1200, # 4 inches x 300 dpi
-    height=600 # 2 inches x 300 dpi
+    width=1200,  # 4 inches x 300 dpi
+    height=600  # 2 inches x 300 dpi
 )
 fname = (params.out_figures
          / f'pca_sample_by_{"_".join(col_identified_feat.split())}_plotly.pdf')
@@ -536,30 +494,35 @@ def join_as_str(seq):
 # ## Sample Medians and percentiles
 
 # %%
-analysis.df.head()
+df.head()
 
 # %%
-df = analysis.df
-df = df.join(df_meta[params.meta_date_col])
-df = df.set_index(params.meta_date_col).sort_index()
+df_w_date = df.join(df_meta[params.meta_date_col])
+df_w_date = df_w_date.set_index(params.meta_date_col).sort_index()
 if not params.meta_date_col == 'PlaceholderTime':
-    df.to_period('min')
-df = df.T
+    df_w_date.to_period('min')
+df_w_date = df_w_date.T
+df_w_date
 
 # %%
-ax = df.boxplot(rot=80, figsize=(8, 3), fontsize=5,
-                showfliers=False, showcaps=False)
+ax = df_w_date.boxplot(rot=80,
+                       figsize=(8, 3),
+                       fontsize=6,
+                       showfliers=False,
+                       showcaps=False
+                       )
 _ = vaep.plotting.select_xticks(ax)
 fig = ax.get_figure()
 fname = params.out_figures / 'median_boxplot'
 figures[fname.stem] = fname
 vaep.savefig(fig, fname)
+del df_w_date
 
 # %% [markdown]
 # Percentiles of intensities in dataset
 
 # %%
-df.stack().describe(percentiles=np.linspace(0.05, 0.95, 10))
+df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2))
 
 # %% [markdown]
 # ### Plot sample median over time
@@ -569,15 +532,14 @@ def join_as_str(seq):
 # %%
 if not params.meta_date_col == 'PlaceholderTime':
     dates = df_meta[params.meta_date_col].sort_values()
-    # dates.name = 'date'
-    median_sample_intensity = (analysis.df
+    median_sample_intensity = (df
                                .median(axis=1)
                                .to_frame('median intensity'))
     median_sample_intensity = median_sample_intensity.join(dates)
 
     ax = median_sample_intensity.plot.scatter(x=dates.name, y='median intensity',
                                               rot=90,
-                                              fontsize='large',
+                                              #   fontsize=6,
                                               figsize=(8, 2),
                                               s=5,
                                               xticks=vaep.plotting.select_dates(
@@ -592,26 +554,17 @@ def join_as_str(seq):
 
 # %% [markdown]
 # ## Feature frequency  in data
-#
-# - higher count, higher probability to be sampled into training data
-# - missing peptides are sampled both into training as well as into validation dataset
-# - everything not in training data is validation data
-#
-# Based on unmodified training data
 
 # %%
-msg = "Total number of samples in training data split: {}"
-print(msg.format(len(analysis.df)))
+msg = "Total number of samples in data: {}"
+print(msg.format(len(df)))
 
-# %%
-# # analysis.splits.to_wide_format()
-# assert analysis.splits is splits, "Sanity check failed."
 
 # %% [markdown]
-# Recalculate feature frequency after selecting some samples
+# Recalculate feature frequency after selecting samples
 
 # %%
-freq_per_feature = feature_frequency(analysis.df)
+freq_per_feature = feature_frequency(df)
 freq_per_feature
 
 # %%
@@ -624,16 +577,19 @@ def join_as_str(seq):
 dumps[fname.name] = fname
 freq_per_feature.to_pickle(fname)
 
-# %% [markdown]
-# Conserning sampling with frequency weights:
-#   - larger weight -> higher probablility of being sampled
-#   - weights need to be alignable to index of original DataFrame before grouping (same index)
 
 # %% [markdown]
 # ## Split: Train, validation and test data
 #
-# - test data is in clinical language often denoted as independent validation cohort
-# - validation data (for model)
+# Select features as described in
+# > Lazar, Cosmin, Laurent Gatto, Myriam Ferro, Christophe Bruley, and Thomas Burger. 2016.
+# > “Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative
+# > Proteomics Data Sets to Compare Imputation Strategies.”
+# > Journal of Proteome Research 15 (4): 1116–25.
+#
+# - select `frac_mnar` based on threshold matrix on quantile of overall frac of data to be used
+#   for validation and test data split, e.g. 0.1 = quantile(0.1)
+# - select frac_mnar from intensities selected using threshold matrix
 
 # %%
 analysis.splits = DataSplits(is_wide_format=False)
@@ -641,31 +597,97 @@ def join_as_str(seq):
 print(f"{analysis.splits = }")
 analysis.splits.__annotations__
 
-# %% [markdown]
-# ### Sample targets (Fake NAs)
 
 # %% [markdown]
-# Add goldstandard targets for valiation and test data
-# - based on same day
-# - same instrument
-
-# %% [markdown]
-# Create some target values by sampling 5% of the validation and test data.
+# Create some target values by sampling X% of the validation and test data.
+# Simulated missing values are not used for validation and testing.
 
 # %%
 analysis.to_long_format(inplace=True)
 analysis.df_long
 
 # %%
-fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(),
-                                      sample_index_to_drop=0,
-                                      weights=freq_per_feature,
-                                      frac=0.1,
-                                      random_state=params.random_state,)
-assert len(splits.train_X) > len(fake_na)
-splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state).sort_index()
+# if not mnar:
+#     fake_na, splits.train_X = sample_data(analysis.df_long.squeeze(),
+#                                           sample_index_to_drop=0,
+#                                           #   weights=freq_per_feature,
+#                                           frac=0.1,
+#                                           random_state=params.random_state,)
+#     assert len(splits.train_X) > len(fake_na)
+# ! move parameter checks to start of script
+if 0.0 <= params.frac_mnar <= 1.0:
+    fig, axes = plt.subplots(1, 2, figsize=(8, 2))
+    quantile_frac = analysis.df_long.quantile(params.frac_non_train)
+    rng = np.random.default_rng(params.random_state)
+    threshold = pd.Series(rng.normal(loc=float(quantile_frac),
+                                     scale=float(0.3 * analysis.df_long.std()),
+                                     size=len(analysis.df_long),
+                                     ),
+                          index=analysis.df_long.index,
+                          )
+    # plot data vs threshold data
+    ax = axes[0]
+    from functools import partial
+    plot_histogram_intensities = partial(vaep.plotting.data.plot_histogram_intensities,
+                                         min_max=min_max,
+                                         alpha=0.8)
+    plot_histogram_intensities(
+        analysis.df_long.squeeze(),
+        ax=ax,
+        label='observed')
+    plot_histogram_intensities(
+        threshold,
+        ax=ax,
+        label='thresholds')
+    ax.legend()
+    # select MNAR (intensity between randomly sampled threshold)
+    mask = analysis.df_long.squeeze() < threshold
+    # ! subsample to have exact fraction of MNAR?
+    N = len(analysis.df_long)
+    logger.info(f"{int(N * params.frac_non_train) = :,d}")
+    N_MNAR = int(params.frac_non_train * params.frac_mnar * N)
+    fake_na_mnar = analysis.df_long.loc[mask]
+    if len(fake_na_mnar) > N_MNAR:
+        fake_na_mnar = fake_na_mnar.sample(N_MNAR,
+                                           random_state=params.random_state)
+    splits.train_X = analysis.df_long.loc[
+        analysis.df_long.index.difference(
+            fake_na_mnar.index)
+    ]
+    logger.info(f"{len(fake_na_mnar) = :,d}")
+    N_MCAR = int(N * (1 - params.frac_mnar) * params.frac_non_train)
+    fake_na_mcar = splits.train_X.sample(N_MCAR,
+                                         random_state=params.random_state)
+    logger.info(f"{len(fake_na_mcar) = :,d}")
+    splits.train_X = (splits
+                      .train_X
+                      .loc[splits
+                           .train_X
+                           .index
+                           .difference(
+                               fake_na_mcar.index)]
+                      ).squeeze()
+    logger.info(f"{len(splits.train_X) = :,d}")
+    fake_na = pd.concat([fake_na_mcar, fake_na_mnar]).squeeze()
+    logger.info(f"{len(fake_na) = :,d}")
+    ax = axes[1]
+    plot_histogram_intensities(
+        fake_na_mnar.squeeze(),
+        ax=ax,
+        label=f'MNAR ({N_MNAR:,d})',
+        color='C2')
+    plot_histogram_intensities(
+        fake_na_mcar.squeeze(),
+        ax=ax,
+        color='C3',
+        label=f'MCAR ({N_MCAR:,d})')
+    ax.legend()
+    assert len(fake_na) + len(splits.train_X) == len(analysis.df_long)
+else:
+    raise ValueError(f"Invalid MNAR float value (should be betw. 0 and 1): {params.frac_mnar}")
+
+splits.val_y = fake_na.sample(frac=0.5, random_state=params.random_state)
 splits.test_y = fake_na.loc[fake_na.index.difference(splits.val_y.index)]
-# splits
 
 # %%
 splits.test_y
@@ -684,14 +706,14 @@ def join_as_str(seq):
 # per feature are allowd.
 
 diff = (splits
-    .val_y
-    .index
-    .levels[-1]
-    .difference(splits
-                .train_X
-                .index
-                .levels[-1]
-    ).to_list())
+        .val_y
+        .index
+        .levels[-1]
+        .difference(splits
+                    .train_X
+                    .index
+                    .levels[-1]
+                    ).to_list())
 if diff:
     to_remove = splits.val_y.loc[pd.IndexSlice[:, diff]]
     display(to_remove)
@@ -701,14 +723,14 @@ def join_as_str(seq):
 
 # %%
 diff = (splits
-    .test_y
-    .index
-    .levels[-1]
-    .difference(splits
-                .train_X
-                .index
-                .levels[-1]
-    ).to_list())
+        .test_y
+        .index
+        .levels[-1]
+        .difference(splits
+                    .train_X
+                    .index
+                    .levels[-1]
+                    ).to_list())
 if diff:
     to_remove = splits.test_y.loc[pd.IndexSlice[:, diff]]
     display(to_remove)
@@ -744,7 +766,7 @@ def join_as_str(seq):
 splits_df['val'] = splits.val_y
 splits_df['test'] = splits.test_y
 stats_splits = splits_df.describe()
-stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f')
+# stats_splits.to_excel(writer, 'stats_splits', float_format='%.2f')
 stats_splits
 
 # %%
@@ -767,7 +789,7 @@ def join_as_str(seq):
           bins=bins,
           ax=None,
           color='C0',
-))
+      ))
 _ = (splits
      .val_y
      .plot
@@ -790,7 +812,7 @@ def join_as_str(seq):
                          legend=False,
                          stacked=True,
                          color=['C0', 'C1', 'C2'],
-    )
+                         )
 ax.legend(_legend)
 ax.set_xlabel('Intensity bins')
 ax.yaxis.set_major_formatter("{x:,.0f}")
@@ -804,7 +826,7 @@ def join_as_str(seq):
                                                color=['C1', 'C2'],
                                                legend=False,
                                                stacked=True,
-                        )
+                                               )
 ax.legend(_legend[1:])
 ax.set_xlabel('Intensity bins')
 ax.yaxis.set_major_formatter("{x:,.0f}")
@@ -832,6 +854,31 @@ def join_as_str(seq):
 figures[fname.stem] = fname
 vaep.savefig(ax.get_figure(), fname)
 
+# %%
+medians = (splits
+           .train_X
+           .median()
+           .astype(int)
+           ).to_frame('median_floor')
+
+feat_with_median = medians.groupby('median_floor').size().rename('M feat')
+medians = medians.join(feat_with_median, on='median_floor')
+medians = medians.apply(lambda s: "{:02,d} (N={:3,d})".format(*s), axis=1)
+
+fig, ax = plt.subplots(figsize=(8, 2))
+s = 1
+s_axes = pd.DataFrame({'medians': medians,
+                       'validation split': splits.val_y.notna().sum(),
+                       'training split': splits.train_X.notna().sum()}
+                      ).plot.box(by='medians',
+                                 subplots=True,
+                                 boxprops=dict(linewidth=s),
+                                 flierprops=dict(markersize=s),
+                                 ax=ax)
+for ax in s_axes:
+    _ = ax.set_xticklabels(ax.get_xticklabels(),
+                           rotation=45,
+                           horizontalalignment='right')
 # %% [markdown]
 # ## Save parameters
 
diff --git a/vaep/plotting/data.py b/vaep/plotting/data.py
index 1fbd31d7a..f50c63606 100644
--- a/vaep/plotting/data.py
+++ b/vaep/plotting/data.py
@@ -1,4 +1,4 @@
-"""Plot data distribution based on pandas DataFrames or Series."""
+"""Plot data distribution based on pandas `DataFrames` or `Series`."""
 from typing import Tuple, Iterable
 
 import matplotlib
@@ -9,7 +9,19 @@
 
 
 def min_max(s: pd.Series) -> Tuple[int]:
-    min_bin, max_bin = (int(s.min()), (int(s.max())+1))
+    """Get the min and max as integer from a pandas.Series.
+
+    Parameters
+    ----------
+    s : pd.Series
+        Series of intensities.
+
+    Returns
+    -------
+    Tuple[int]
+        _description_
+    """
+    min_bin, max_bin = (int(s.min()), (int(s.max()) + 1))
     return min_bin, max_bin
 
 
@@ -27,10 +39,10 @@ def get_min_max_iterable(series: Iterable[pd.Series]) -> Tuple[int]:
 
 
 def plot_histogram_intensities(s: pd.Series,
-                              interval_bins=1,
-                              min_max=(15, 40),
-                              ax=None,
-                              **kwargs) -> Tuple[Axes, range]:
+                               interval_bins=1,
+                               min_max=(15, 40),
+                               ax=None,
+                               **kwargs) -> Tuple[Axes, range]:
     """Plot intensities in Series in a certain range and equally spaced intervals."""
     min_bin, max_bin = min_max
     bins = range(min_bin, max_bin, interval_bins)
@@ -90,8 +102,24 @@ def plot_observations(df: pd.DataFrame,
 
 
 def plot_missing_dist_highdim(data: pd.DataFrame,
-                              min_feat_per_sample=None,
-                              min_samples_per_feat=None) -> matplotlib.figure.Figure:
+                              min_feat_per_sample: int = None,
+                              min_samples_per_feat: int = None) -> matplotlib.figure.Figure:
+    """Plot missing distribution (cdf) in high dimensional data.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Intensity table with samples in rows and features in columns.
+    min_feat_per_sample : int, optional
+        Show the minimum required features a sample has to have, by default None
+    min_samples_per_feat : int, optional
+        Show the minimum required number of samples a feature has to be found in, by default None
+
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Figure with two plots (Axes).
+    """
     fig, axes = plt.subplots(1, 2, figsize=(4, 2))
     not_na = data.notna()
     name = 'features per sample'
@@ -251,8 +279,8 @@ def plot_feat_median_over_prop_missing(data: pd.DataFrame,
     missing_by_median['bins'] = pd.cut(
         missing_by_median['median feat value'], bins=bins)
     missing_by_median['median feat value (floor)'] = (missing_by_median['median feat value']
-                                                        .astype(int)
-                                                        )
+                                                      .astype(int)
+                                                      )
     _counts = (missing_by_median
                .groupby('median feat value (floor)')['median feat value']
                .count()