diff --git a/project/erda_02_mq_count_features.ipynb b/project/erda_02_mq_count_features.ipynb index bc1e89640..1d50f1f7d 100644 --- a/project/erda_02_mq_count_features.ipynb +++ b/project/erda_02_mq_count_features.ipynb @@ -10,7 +10,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import os\n", @@ -53,7 +55,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml')\n", @@ -73,7 +77,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "fn_id_old_new: str = 'data/rename/selected_old_new_id_mapping.csv' # selected samples with pride and original id\n", @@ -81,24 +87,37 @@ "df_ids" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Select files and create list of folders" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']}\n", "# folders_dict = {p.stem : p.parent / p.stem for p in folders_dict}\n", - "# folders_dict" + "# folders_dict\n", + "folders = [Path(folder_path) for folder_path in folders_dict.values()]\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "OVERWRITE = False\n", + "OVERWRITE = True\n", "\n", "from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES\n", "\n", @@ -115,7 +134,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import random\n", @@ -129,7 +150,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "use_columns = mq_output.peptides.columns[33:45]\n", @@ -140,7 +163,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "df_json_string = df.to_json(orient='index', indent=4)\n", @@ -150,7 +175,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "df_csv = df.to_csv()\n", @@ -160,7 +187,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "pd.read_json(df_json_string, orient='index')" @@ -169,7 +198,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "mq_output.peptides.Intensity # as is in peptides.txt, comma seperated thousands" @@ -185,7 +216,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "peptide_counter = PeptideCounter(FNAME_C_PEPTIDES, overwrite=OVERWRITE)\n", @@ -195,7 +228,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "if peptide_counter.loaded:\n", @@ -222,10 +257,37 @@ "outputs": [], "source": [ "%%time\n", - "# folders = [Path(folder_path) for folder_path in folders_dict.values()]\n", "c = peptide_counter.sum_over_files(folders=folders)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for k, v in peptide_counter.dumps.items():\n", + " old_name = v\n", + " new_name = v.parent / (df_ids.loc[k, 'new_sample_id'] + '.csv')\n", + " try:\n", + " os.rename(old_name, new_name)\n", + " except FileNotFoundError:\n", + " logging.warning(f\"File not found: {old_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "new_name" + ] + }, { "cell_type": "code", "execution_count": null, @@ -238,7 +300,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# To share as python file\n", @@ -761,7 +825,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.15" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/project/erda_02_mq_count_features.py b/project/erda_02_mq_count_features.py index e14a92657..80821096b 100644 --- a/project/erda_02_mq_count_features.py +++ b/project/erda_02_mq_count_features.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.1 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -67,13 +67,19 @@ df_ids = pd.read_csv(fn_id_old_new) df_ids +# %% [markdown] +# Select files and create list of folders + # %% folders_dict = { sample_id: FOLDER_MQ_TXT_DATA / sample_id for sample_id in df_ids['Sample ID']} # folders_dict = {p.stem : p.parent / p.stem for p in folders_dict} # folders_dict +folders = [Path(folder_path) for folder_path in folders_dict.values()] + # %% OVERWRITE = False +OVERWRITE = True from config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES @@ -129,9 +135,20 @@ # %% # %%time -# folders = [Path(folder_path) for folder_path in folders_dict.values()] c = peptide_counter.sum_over_files(folders=folders) +# %% +for k, v in peptide_counter.dumps.items(): + old_name = v + new_name = v.parent / (df_ids.loc[k, 'new_sample_id'] + '.csv') + try: + os.rename(old_name, new_name) + except FileNotFoundError: + logging.warning(f"File not found: {old_name}") + +# %% +new_name + # %% c.most_common(10) # peptide_counter.counter.most_common(10) diff --git a/vaep/io/data_objects.py b/vaep/io/data_objects.py index 6540ecbad..8ffb62340 100644 --- a/vaep/io/data_objects.py +++ b/vaep/io/data_objects.py @@ -399,7 +399,7 @@ def __call__(self, folders, ### aggregated peptides # # check df for redundant information (same feature value for all entries) -usecols = mq.COLS_ + ['Potential contaminant', mq.mq_col.SEQUENCE] +usecols = mq.COLS_ + ['Potential contaminant', mq.mq_col.SEQUENCE, 'PEP'] def count_peptides(folders: List[Path], dump=True, @@ -426,6 +426,7 @@ def count_peptides(folders: List[Path], dump=True, d_dtypes_training_sample = { 'Sequence': pd.StringDtype(), 'Proteins': pd.StringDtype(), + 'PEP': pd.Float32Dtype(), 'Leading razor protein': pd.StringDtype(), 'Gene names': pd.StringDtype(), 'Intensity': pd.Int64Dtype()