10_compute_correlations: pearson/spearman, compute only on whole blood

miltondp · miltondp · commit 356b9549dddf · 2023-09-06T15:33:00.000-06:00
diff --git a/nbs/10_compute_correlations/05_gtex_v8/05_01-gtex-var_pc_log2-pearson.ipynb b/nbs/10_compute_correlations/05_gtex_v8/05_01-gtex-var_pc_log2-pearson.ipynb
@@ -76,10 +76,13 @@
    },
    "outputs": [],
    "source": [
+    "from time import time\n",
+    "\n",
     "import pandas as pd\n",
     "from tqdm import tqdm\n",
     "\n",
     "from ccc import conf\n",
+    "from ccc.utils import simplify_string\n",
     "from ccc.corr import pearson"
    ]
   },
@@ -125,6 +128,38 @@
     "GENE_SELECTION_STRATEGY = \"var_pc_log2\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ac2eaa49-c49e-4f3c-83c5-4f8c321d25a7",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2022-05-24T14:46:59.207591Z",
+     "iopub.status.busy": "2022-05-24T14:46:59.207409Z",
+     "iopub.status.idle": "2022-05-24T14:46:59.210688Z",
+     "shell.execute_reply": "2022-05-24T14:46:59.210092Z"
+    },
+    "papermill": {
+     "duration": 0.010126,
+     "end_time": "2022-05-24T14:46:59.212259",
+     "exception": false,
+     "start_time": "2022-05-24T14:46:59.202133",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# tissues to process: only Whole Blood is enabled; the commented-out entries are the remaining top-5 tissues by sample size (see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb)\n",
+    "TISSUES = [\n",
+    "    # \"Muscle - Skeletal\",\n",
+    "    \"Whole Blood\",\n",
+    "    # \"Skin - Sun Exposed (Lower leg)\",\n",
+    "    # \"Adipose - Subcutaneous\",\n",
+    "    # \"Artery - Tibial\",\n",
+    "]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -341,12 +376,27 @@
      "output_type": "display_data"
     }
    ],
+   "source": [
+    "tissue_in_file_names = [f\"_data_{simplify_string(t.lower())}-\" for t in TISSUES]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd78a8c5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "input_files = sorted(list(INPUT_DIR.glob(f\"*-{GENE_SELECTION_STRATEGY}.pkl\")))\n",
+    "input_files = [\n",
+    "    f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)\n",
+    "]\n",
     "display(len(input_files))\n",
     "\n",
-    "assert len(input_files) == conf.GTEX[\"N_TISSUES\"], len(input_files)\n",
-    "display(input_files[:5])"
+    "assert len(input_files) == len(TISSUES), len(TISSUES)\n",
+    "display(input_files)"
    ]
   },
   {
@@ -914,7 +964,7 @@
     }
    ],
    "source": [
-    "%timeit CORRELATION_METHOD(test_data)"
+    "%timeit -r1 CORRELATION_METHOD(test_data)"
    ]
   },
   {
@@ -973,8 +1023,14 @@
     "    data = pd.read_pickle(tissue_data_file)\n",
     "\n",
     "    # compute correlations\n",
+    "    start_time = time()\n",
+    "\n",
     "    data_corrs = CORRELATION_METHOD(data)\n",
     "\n",
+    "    end_time = time()\n",
+    "    elapsed_time = end_time - start_time\n",
+    "    display(elapsed_time)\n",
+    "\n",
     "    # save\n",
     "    output_filename = f\"{tissue_data_file.stem}-{method_name}.pkl\"\n",
     "    data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)"
@@ -1017,7 +1073,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.9"
+   "version": "3.9.12"
   },
   "papermill": {
    "default_parameters": {},
diff --git a/nbs/10_compute_correlations/05_gtex_v8/06_01-gtex-var_pc_log2-spearman.ipynb b/nbs/10_compute_correlations/05_gtex_v8/06_01-gtex-var_pc_log2-spearman.ipynb
@@ -76,10 +76,13 @@
    },
    "outputs": [],
    "source": [
+    "from time import time\n",
+    "\n",
     "import pandas as pd\n",
     "from tqdm import tqdm\n",
     "\n",
     "from ccc import conf\n",
+    "from ccc.utils import simplify_string\n",
     "from ccc.corr import spearman"
    ]
   },
@@ -125,6 +128,25 @@
     "GENE_SELECTION_STRATEGY = \"var_pc_log2\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9b9f8b2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# tissues to process: only Whole Blood is enabled; the commented-out entries are the remaining top-5 tissues by sample size (see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb)\n",
+    "TISSUES = [\n",
+    "    # \"Muscle - Skeletal\",\n",
+    "    \"Whole Blood\",\n",
+    "    # \"Skin - Sun Exposed (Lower leg)\",\n",
+    "    # \"Adipose - Subcutaneous\",\n",
+    "    # \"Artery - Tibial\",\n",
+    "]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -341,12 +363,27 @@
      "output_type": "display_data"
     }
    ],
+   "source": [
+    "tissue_in_file_names = [f\"_data_{simplify_string(t.lower())}-\" for t in TISSUES]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fadd2c5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "input_files = sorted(list(INPUT_DIR.glob(f\"*-{GENE_SELECTION_STRATEGY}.pkl\")))\n",
+    "input_files = [\n",
+    "    f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)\n",
+    "]\n",
     "display(len(input_files))\n",
     "\n",
-    "assert len(input_files) == conf.GTEX[\"N_TISSUES\"], len(input_files)\n",
-    "display(input_files[:5])"
+    "assert len(input_files) == len(TISSUES), len(TISSUES)\n",
+    "display(input_files)"
    ]
   },
   {
@@ -914,7 +951,7 @@
     }
    ],
    "source": [
-    "%timeit CORRELATION_METHOD(test_data)"
+    "%timeit -r1 CORRELATION_METHOD(test_data)"
    ]
   },
   {
@@ -973,8 +1010,14 @@
     "    data = pd.read_pickle(tissue_data_file)\n",
     "\n",
     "    # compute correlations\n",
+    "    start_time = time()\n",
+    "\n",
     "    data_corrs = CORRELATION_METHOD(data)\n",
     "\n",
+    "    end_time = time()\n",
+    "    elapsed_time = end_time - start_time\n",
+    "    display(elapsed_time)\n",
+    "\n",
     "    # save\n",
     "    output_filename = f\"{tissue_data_file.stem}-{method_name}.pkl\"\n",
     "    data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)"
diff --git a/nbs/10_compute_correlations/05_gtex_v8/07_01-gtex-var_pc_log2-ccc.ipynb b/nbs/10_compute_correlations/05_gtex_v8/07_01-gtex-var_pc_log2-ccc.ipynb
@@ -79,6 +79,7 @@
     "from time import time\n",
     "\n",
     "import pandas as pd\n",
+    "from tqdm import tqdm\n",
     "\n",
     "from ccc import conf\n",
     "from ccc.utils import simplify_string\n",
@@ -1034,8 +1035,10 @@
     }
    ],
    "source": [
-    "for tissue_data_file in input_files:\n",
-    "    display(tissue_data_file.stem)\n",
+    "pbar = tqdm(input_files, ncols=100)\n",
+    "\n",
+    "for tissue_data_file in pbar:\n",
+    "    pbar.set_description(tissue_data_file.stem)\n",
     "\n",
     "    # read\n",
     "    data = pd.read_pickle(tissue_data_file)\n",
@@ -1074,13 +1077,7 @@
  ],
  "metadata": {
   "jupytext": {
-   "cell_metadata_filter": "all,-execution,-papermill,-trusted",
-   "text_representation": {
-    "extension": ".py",
-    "format_name": "percent",
-    "format_version": "1.3",
-    "jupytext_version": "1.11.5"
-   }
+   "cell_metadata_filter": "all,-execution,-papermill,-trusted"
   },
   "kernelspec": {
    "display_name": "Python 3 (ipykernel)",
diff --git a/nbs/10_compute_correlations/05_gtex_v8/py/05_01-gtex-var_pc_log2-pearson.py b/nbs/10_compute_correlations/05_gtex_v8/py/05_01-gtex-var_pc_log2-pearson.py
@@ -26,10 +26,13 @@
 # # Modules
 
 # %% tags=[]
+from time import time
+
 import pandas as pd
 from tqdm import tqdm
 
 from ccc import conf
+from ccc.utils import simplify_string
 from ccc.corr import pearson
 
 # %% [markdown] tags=[]
@@ -38,6 +41,16 @@
 # %% tags=[]
 GENE_SELECTION_STRATEGY = "var_pc_log2"
 
+# %% tags=[]
+# tissues to process: only Whole Blood is enabled; the commented-out entries are the remaining top-5 tissues by sample size (see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb)
+TISSUES = [
+    # "Muscle - Skeletal",
+    "Whole Blood",
+    # "Skin - Sun Exposed (Lower leg)",
+    # "Adipose - Subcutaneous",
+    # "Artery - Tibial",
+]
+
 # %% tags=[]
 CORRELATION_METHOD = pearson
 
@@ -64,12 +77,18 @@
 # %% [markdown] tags=[]
 # # Data loading
 
+# %% tags=[]
+tissue_in_file_names = [f"_data_{simplify_string(t.lower())}-" for t in TISSUES]
+
 # %% tags=[]
 input_files = sorted(list(INPUT_DIR.glob(f"*-{GENE_SELECTION_STRATEGY}.pkl")))
+input_files = [
+    f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)
+]
 display(len(input_files))
 
-assert len(input_files) == conf.GTEX["N_TISSUES"], len(input_files)
-display(input_files[:5])
+assert len(input_files) == len(TISSUES), len(TISSUES)
+display(input_files)
 
 # %% [markdown] tags=[]
 # # Compute similarity
@@ -97,7 +116,7 @@
 display(_tmp)
 
 # %% tags=[]
-# %timeit CORRELATION_METHOD(test_data)
+# %timeit -r1 CORRELATION_METHOD(test_data)
 
 # %% [markdown] tags=[]
 # ## Run
@@ -112,8 +131,14 @@
     data = pd.read_pickle(tissue_data_file)
 
     # compute correlations
+    start_time = time()
+
     data_corrs = CORRELATION_METHOD(data)
 
+    end_time = time()
+    elapsed_time = end_time - start_time
+    display(elapsed_time)
+
     # save
     output_filename = f"{tissue_data_file.stem}-{method_name}.pkl"
     data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)
diff --git a/nbs/10_compute_correlations/05_gtex_v8/py/06_01-gtex-var_pc_log2-spearman.py b/nbs/10_compute_correlations/05_gtex_v8/py/06_01-gtex-var_pc_log2-spearman.py
@@ -26,10 +26,13 @@
 # # Modules
 
 # %% tags=[]
+from time import time
+
 import pandas as pd
 from tqdm import tqdm
 
 from ccc import conf
+from ccc.utils import simplify_string
 from ccc.corr import spearman
 
 # %% [markdown] tags=[]
@@ -38,6 +41,16 @@
 # %% tags=[]
 GENE_SELECTION_STRATEGY = "var_pc_log2"
 
+# %% tags=[]
+# tissues to process: only Whole Blood is enabled; the commented-out entries are the remaining top-5 tissues by sample size (see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb)
+TISSUES = [
+    # "Muscle - Skeletal",
+    "Whole Blood",
+    # "Skin - Sun Exposed (Lower leg)",
+    # "Adipose - Subcutaneous",
+    # "Artery - Tibial",
+]
+
 # %% tags=[]
 CORRELATION_METHOD = spearman
 
@@ -64,12 +77,18 @@
 # %% [markdown] tags=[]
 # # Data loading
 
+# %% tags=[]
+tissue_in_file_names = [f"_data_{simplify_string(t.lower())}-" for t in TISSUES]
+
 # %% tags=[]
 input_files = sorted(list(INPUT_DIR.glob(f"*-{GENE_SELECTION_STRATEGY}.pkl")))
+input_files = [
+    f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)
+]
 display(len(input_files))
 
-assert len(input_files) == conf.GTEX["N_TISSUES"], len(input_files)
-display(input_files[:5])
+assert len(input_files) == len(TISSUES), len(TISSUES)
+display(input_files)
 
 # %% [markdown] tags=[]
 # # Compute similarity
@@ -97,7 +116,7 @@
 display(_tmp)
 
 # %% tags=[]
-# %timeit CORRELATION_METHOD(test_data)
+# %timeit -r1 CORRELATION_METHOD(test_data)
 
 # %% [markdown] tags=[]
 # ## Run
@@ -112,8 +131,14 @@
     data = pd.read_pickle(tissue_data_file)
 
     # compute correlations
+    start_time = time()
+
     data_corrs = CORRELATION_METHOD(data)
 
+    end_time = time()
+    elapsed_time = end_time - start_time
+    display(elapsed_time)
+
     # save
     output_filename = f"{tissue_data_file.stem}-{method_name}.pkl"
     data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)
diff --git a/nbs/10_compute_correlations/05_gtex_v8/py/07_01-gtex-var_pc_log2-ccc.py b/nbs/10_compute_correlations/05_gtex_v8/py/07_01-gtex-var_pc_log2-ccc.py