From 09a1a27f8a15f919f385ff66c4c170de5611ed5f Mon Sep 17 00:00:00 2001 From: Milton Pividori Date: Tue, 12 Sep 2023 23:24:05 -0600 Subject: [PATCH] ccc pvalue: increase the number of permutations and other improvements --- .../15-compute_pvalues_from_samples.ipynb | 902 ++---------------- .../py/15-compute_pvalues_from_samples.py | 27 +- 2 files changed, 73 insertions(+), 856 deletions(-) diff --git a/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb b/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb index 60daa718..9e876e90 100644 --- a/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb +++ b/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb @@ -31,7 +31,7 @@ "tags": [] }, "source": [ - "TODO" + "Reads the gene pair samples across different categories and computes their p-values." ] }, { @@ -53,15 +53,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "1ffa1a96-7545-40b9-ac8b-8627e13de8d4", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.411609Z", - "iopub.status.busy": "2023-09-12T22:59:49.411498Z", - "iopub.status.idle": "2023-09-12T22:59:49.837994Z", - "shell.execute_reply": "2023-09-12T22:59:49.837700Z" - }, "papermill": { "duration": 0.429643, "end_time": "2023-09-12T22:59:49.838894", @@ -80,6 +74,7 @@ "import numpy as np\n", "import pandas as pd\n", "from concurrent.futures import as_completed, ProcessPoolExecutor\n", + "from tqdm import tqdm\n", "\n", "from ccc.coef import ccc\n", "from ccc import conf" @@ -104,15 +99,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "9a154623-c787-4a31-871a-cad173f0eb9f", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.855314Z", - "iopub.status.busy": "2023-09-12T22:59:49.855247Z", - "iopub.status.idle": "2023-09-12T22:59:49.857116Z", - "shell.execute_reply": "2023-09-12T22:59:49.856940Z" - }, "papermill": { "duration": 0.004783, "end_time": "2023-09-12T22:59:49.857681", @@ -128,7 +117,7 @@ "GTEX_TISSUE = \"whole_blood\"\n", "GENE_SEL_STRATEGY = \"var_pc_log2\"\n", "\n", - "PVALUE_N_PERMS = 1000000\n", + "PVALUE_N_PERMS = 10000000\n", "\n", "RANDOM_STATE = np.random.RandomState(0)" ] @@ -152,15 +141,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "c6f73068-fa38-44be-bd0c-708f6ff450ea", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.865853Z", - "iopub.status.busy": "2023-09-12T22:59:49.865715Z", - "iopub.status.idle": "2023-09-12T22:59:49.868127Z", - "shell.execute_reply": "2023-09-12T22:59:49.867992Z" - }, "papermill": { "duration": 0.00506, "end_time": "2023-09-12T22:59:49.868624", @@ -170,17 +153,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "INPUT_GENE_EXPR_FILE = (\n", " DATASET_CONFIG[\"GENE_SELECTION_DIR\"]\n", @@ -193,15 +166,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "30cce6f5-ca1b-438c-859d-31903a42d4c6", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.872266Z", - "iopub.status.busy": "2023-09-12T22:59:49.872175Z", - "iopub.status.idle": "2023-09-12T22:59:49.873814Z", - "shell.execute_reply": "2023-09-12T22:59:49.873677Z" - }, "papermill": { "duration": 0.004102, "end_time": "2023-09-12T22:59:49.874422", @@ -211,17 +178,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (\n", " DATASET_CONFIG[\"GENE_PAIR_INTERSECTIONS\"]\n", @@ -234,15 +191,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "0122253c-99c0-41e2-8807-60df86bf0619", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.878047Z", - "iopub.status.busy": "2023-09-12T22:59:49.877936Z", - "iopub.status.idle": "2023-09-12T22:59:49.879230Z", - "shell.execute_reply": "2023-09-12T22:59:49.879101Z" - }, "papermill": { "duration": 0.00365, "end_time": "2023-09-12T22:59:49.879712", @@ -260,15 +211,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "3003ed2c-5da0-43b9-969d-9cf037d05730", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.883414Z", - "iopub.status.busy": "2023-09-12T22:59:49.883308Z", - "iopub.status.idle": "2023-09-12T22:59:49.884842Z", - "shell.execute_reply": "2023-09-12T22:59:49.884722Z" - }, "papermill": { "duration": 0.003938, "end_time": "2023-09-12T22:59:49.885310", @@ -278,18 +223,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "OUTPUT_DIR" ] @@ -313,15 +247,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "6e8ef201-6f98-4fb6-a306-180ed4b467db", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.892209Z", - "iopub.status.busy": "2023-09-12T22:59:49.892105Z", - "iopub.status.idle": "2023-09-12T22:59:49.912164Z", - "shell.execute_reply": "2023-09-12T22:59:49.911953Z" - }, "papermill": { "duration": 0.022686, "end_time": "2023-09-12T22:59:49.912927", @@ -338,15 +266,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "4d18e93e-b394-46bd-8d16-d9261a85ba06", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.917204Z", - "iopub.status.busy": "2023-09-12T22:59:49.917116Z", - "iopub.status.idle": "2023-09-12T22:59:49.918963Z", - "shell.execute_reply": "2023-09-12T22:59:49.918826Z" - }, "papermill": { "duration": 0.004607, "end_time": "2023-09-12T22:59:49.919511", @@ -356,33 +278,16 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "(5000, 755)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "data.shape" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "ea8947b9-9064-43ec-bf10-6e6ae361c451", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.923551Z", - "iopub.status.busy": "2023-09-12T22:59:49.923405Z", - "iopub.status.idle": "2023-09-12T22:59:49.931350Z", - "shell.execute_reply": "2023-09-12T22:59:49.931198Z" - }, "papermill": { "duration": 0.01065, "end_time": "2023-09-12T22:59:49.931837", @@ -392,290 +297,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
GTEX-111YS-0006-SM-5NQBEGTEX-1122O-0005-SM-5O99JGTEX-1128S-0005-SM-5P9HIGTEX-113IC-0006-SM-5NQ9CGTEX-113JC-0006-SM-5O997GTEX-117XS-0005-SM-5PNU6GTEX-117YW-0005-SM-5NQ8ZGTEX-1192W-0005-SM-5NQBQGTEX-1192X-0005-SM-5NQC3GTEX-11DXW-0006-SM-5NQ7Y...GTEX-ZVE2-0006-SM-51MRWGTEX-ZVP2-0005-SM-51MRKGTEX-ZVT2-0005-SM-57WBWGTEX-ZVT3-0006-SM-51MT9GTEX-ZVT4-0006-SM-57WB8GTEX-ZVTK-0006-SM-57WBKGTEX-ZVZP-0006-SM-51MSWGTEX-ZVZQ-0006-SM-51MR8GTEX-ZXES-0005-SM-57WCBGTEX-ZXG5-0005-SM-57WCN
gene_ens_id
ENSG00000000419.1220.650025.0507.15549.1306.1474.14305.3904.3891.15806.8240...4.407032.34018.68009.2517.8287.46033.24005.848025.76017.080
ENSG00000000938.12906.00001344.000633.500719.200392.600166.5000338.200413.20051.5400423.6000...354.80001102.000774.9000206.000620.400346.3001304.0000232.9000631.600884.500
ENSG00000001167.148.190020.01020.47021.22016.4608.619018.22016.5801.602035.6800...11.340011.25011.18009.52341.86024.5808.892013.390013.47042.640
ENSG00000001561.60.71041.7712.2346.0143.2060.39622.4451.4180.55310.7447...0.92692.5550.59763.4172.6451.8830.53910.98161.0366.729
ENSG00000002549.1222.500021.33019.290157.10029.3309.577014.17023.3301.407028.3000...4.493050.47016.210032.74018.15011.92020.100015.550011.98035.370
\n", - "

5 rows × 755 columns

\n", - "
" - ], - "text/plain": [ - " GTEX-111YS-0006-SM-5NQBE GTEX-1122O-0005-SM-5O99J \\\n", - "gene_ens_id \n", - "ENSG00000000419.12 20.6500 25.050 \n", - "ENSG00000000938.12 906.0000 1344.000 \n", - "ENSG00000001167.14 8.1900 20.010 \n", - "ENSG00000001561.6 0.7104 1.771 \n", - "ENSG00000002549.12 22.5000 21.330 \n", - "\n", - " GTEX-1128S-0005-SM-5P9HI GTEX-113IC-0006-SM-5NQ9C \\\n", - "gene_ens_id \n", - "ENSG00000000419.12 7.155 49.130 \n", - "ENSG00000000938.12 633.500 719.200 \n", - "ENSG00000001167.14 20.470 21.220 \n", - "ENSG00000001561.6 2.234 6.014 \n", - "ENSG00000002549.12 19.290 157.100 \n", - "\n", - " GTEX-113JC-0006-SM-5O997 GTEX-117XS-0005-SM-5PNU6 \\\n", - "gene_ens_id \n", - "ENSG00000000419.12 6.147 4.1430 \n", - "ENSG00000000938.12 392.600 166.5000 \n", - "ENSG00000001167.14 16.460 8.6190 \n", - "ENSG00000001561.6 3.206 0.3962 \n", - "ENSG00000002549.12 29.330 9.5770 \n", - "\n", - " GTEX-117YW-0005-SM-5NQ8Z GTEX-1192W-0005-SM-5NQBQ \\\n", - "gene_ens_id \n", - "ENSG00000000419.12 5.390 4.389 \n", - "ENSG00000000938.12 338.200 413.200 \n", - "ENSG00000001167.14 18.220 16.580 \n", - "ENSG00000001561.6 2.445 1.418 \n", - "ENSG00000002549.12 14.170 23.330 \n", - "\n", - " GTEX-1192X-0005-SM-5NQC3 GTEX-11DXW-0006-SM-5NQ7Y ... \\\n", - "gene_ens_id ... \n", - "ENSG00000000419.12 1.1580 6.8240 ... \n", - "ENSG00000000938.12 51.5400 423.6000 ... \n", - "ENSG00000001167.14 1.6020 35.6800 ... \n", - "ENSG00000001561.6 0.5531 0.7447 ... \n", - "ENSG00000002549.12 1.4070 28.3000 ... \n", - "\n", - " GTEX-ZVE2-0006-SM-51MRW GTEX-ZVP2-0005-SM-51MRK \\\n", - "gene_ens_id \n", - "ENSG00000000419.12 4.4070 32.340 \n", - "ENSG00000000938.12 354.8000 1102.000 \n", - "ENSG00000001167.14 11.3400 11.250 \n", - "ENSG00000001561.6 0.9269 2.555 \n", - "ENSG00000002549.12 4.4930 50.470 \n", - "\n", - " GTEX-ZVT2-0005-SM-57WBW GTEX-ZVT3-0006-SM-51MT9 \\\n", - "gene_ens_id \n", - "ENSG00000000419.12 18.6800 9.251 \n", - "ENSG00000000938.12 774.9000 206.000 \n", - "ENSG00000001167.14 11.1800 9.523 \n", - "ENSG00000001561.6 0.5976 3.417 \n", - "ENSG00000002549.12 16.2100 32.740 \n", - "\n", - " GTEX-ZVT4-0006-SM-57WB8 GTEX-ZVTK-0006-SM-57WBK \\\n", - "gene_ens_id \n", - "ENSG00000000419.12 7.828 7.460 \n", - "ENSG00000000938.12 620.400 346.300 \n", - "ENSG00000001167.14 41.860 24.580 \n", - "ENSG00000001561.6 2.645 1.883 \n", - "ENSG00000002549.12 18.150 11.920 \n", - "\n", - " GTEX-ZVZP-0006-SM-51MSW GTEX-ZVZQ-0006-SM-51MR8 \\\n", - "gene_ens_id \n", - "ENSG00000000419.12 33.2400 5.8480 \n", - "ENSG00000000938.12 1304.0000 232.9000 \n", - "ENSG00000001167.14 8.8920 13.3900 \n", - "ENSG00000001561.6 0.5391 0.9816 \n", - "ENSG00000002549.12 20.1000 15.5500 \n", - "\n", - " GTEX-ZXES-0005-SM-57WCB GTEX-ZXG5-0005-SM-57WCN \n", - "gene_ens_id \n", - "ENSG00000000419.12 25.760 17.080 \n", - "ENSG00000000938.12 631.600 884.500 \n", - "ENSG00000001167.14 13.470 42.640 \n", - "ENSG00000001561.6 1.036 6.729 \n", - "ENSG00000002549.12 11.980 35.370 \n", - "\n", - "[5 rows x 755 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "data.head()" ] @@ -699,15 +321,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "178a09a8-1a2e-425a-8a52-773f41c72633", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.939521Z", - "iopub.status.busy": "2023-09-12T22:59:49.939455Z", - "iopub.status.idle": "2023-09-12T22:59:49.940684Z", - "shell.execute_reply": "2023-09-12T22:59:49.940556Z" - }, "papermill": { "duration": 0.003701, "end_time": "2023-09-12T22:59:49.941066", @@ -724,15 +340,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "c42a9f4c-3672-4ab0-b9ff-c214eb40cd2f", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.944914Z", - "iopub.status.busy": "2023-09-12T22:59:49.944802Z", - "iopub.status.idle": "2023-09-12T22:59:49.950774Z", - "shell.execute_reply": "2023-09-12T22:59:49.950627Z" - }, "papermill": { "duration": 0.008321, "end_time": "2023-09-12T22:59:49.951197", @@ -749,15 +359,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "1724d63c-19eb-49a8-83fc-6c8b07585e98", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.955084Z", - "iopub.status.busy": "2023-09-12T22:59:49.954971Z", - "iopub.status.idle": "2023-09-12T22:59:49.956445Z", - "shell.execute_reply": "2023-09-12T22:59:49.956319Z" - }, "papermill": { "duration": 0.003797, "end_time": "2023-09-12T22:59:49.956831", @@ -767,33 +371,16 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "9" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(gene_pair_samples)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "99f5098f-aa01-471b-a6a2-5aabc332176b", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.960750Z", - "iopub.status.busy": "2023-09-12T22:59:49.960641Z", - "iopub.status.idle": "2023-09-12T22:59:49.962169Z", - "shell.execute_reply": "2023-09-12T22:59:49.962046Z" - }, "papermill": { "duration": 0.003951, "end_time": "2023-09-12T22:59:49.962566", @@ -803,41 +390,16 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "['all_high',\n", - " 'all_low',\n", - " 'ccc_high_and_pearson_low',\n", - " 'ccc_high_and_spearman_low',\n", - " 'ccc_high_and_spearman_pearson_low',\n", - " 'ccc_spearman_high_and_pearson_low',\n", - " 'pearson_high_and_ccc_low',\n", - " 'pearson_high_and_ccc_spearman_low',\n", - " 'selected_in_manuscript']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "sorted(gene_pair_samples.keys())" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "c60378f6-3f87-49d4-8b86-cf3ec30fc545", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.966462Z", - "iopub.status.busy": "2023-09-12T22:59:49.966348Z", - "iopub.status.idle": "2023-09-12T22:59:49.970153Z", - "shell.execute_reply": "2023-09-12T22:59:49.970022Z" - }, "papermill": { "duration": 0.006176, "end_time": "2023-09-12T22:59:49.970553", @@ -847,167 +409,17 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Pearson (high)Pearson (low)Spearman (high)Spearman (low)Clustermatch (high)Clustermatch (low)cccpearsonspearman
ENSG00000052749.13ENSG00000165025.14TrueFalseTrueFalseTrueFalse0.3623400.7094490.795566
ENSG00000102897.9ENSG00000086544.2TrueFalseTrueFalseTrueFalse0.4290920.6985370.822212
ENSG00000110628.13ENSG00000267078.1TrueFalseTrueFalseTrueFalse0.2301430.5094990.632816
ENSG00000169554.18ENSG00000132424.14TrueFalseTrueFalseTrueFalse0.5090120.7737620.878352
ENSG00000143933.16ENSG00000135378.3TrueFalseTrueFalseTrueFalse0.4718420.5311210.819382
\n", - "
" - ], - "text/plain": [ - " Pearson (high) Pearson (low) \\\n", - "ENSG00000052749.13 ENSG00000165025.14 True False \n", - "ENSG00000102897.9 ENSG00000086544.2 True False \n", - "ENSG00000110628.13 ENSG00000267078.1 True False \n", - "ENSG00000169554.18 ENSG00000132424.14 True False \n", - "ENSG00000143933.16 ENSG00000135378.3 True False \n", - "\n", - " Spearman (high) Spearman (low) \\\n", - "ENSG00000052749.13 ENSG00000165025.14 True False \n", - "ENSG00000102897.9 ENSG00000086544.2 True False \n", - "ENSG00000110628.13 ENSG00000267078.1 True False \n", - "ENSG00000169554.18 ENSG00000132424.14 True False \n", - "ENSG00000143933.16 ENSG00000135378.3 True False \n", - "\n", - " Clustermatch (high) \\\n", - "ENSG00000052749.13 ENSG00000165025.14 True \n", - "ENSG00000102897.9 ENSG00000086544.2 True \n", - "ENSG00000110628.13 ENSG00000267078.1 True \n", - "ENSG00000169554.18 ENSG00000132424.14 True \n", - "ENSG00000143933.16 ENSG00000135378.3 True \n", - "\n", - " Clustermatch (low) ccc pearson \\\n", - "ENSG00000052749.13 ENSG00000165025.14 False 0.362340 0.709449 \n", - "ENSG00000102897.9 ENSG00000086544.2 False 0.429092 0.698537 \n", - "ENSG00000110628.13 ENSG00000267078.1 False 0.230143 0.509499 \n", - "ENSG00000169554.18 ENSG00000132424.14 False 0.509012 0.773762 \n", - "ENSG00000143933.16 ENSG00000135378.3 False 0.471842 0.531121 \n", - "\n", - " spearman \n", - "ENSG00000052749.13 ENSG00000165025.14 0.795566 \n", - "ENSG00000102897.9 ENSG00000086544.2 0.822212 \n", - "ENSG00000110628.13 ENSG00000267078.1 0.632816 \n", - "ENSG00000169554.18 ENSG00000132424.14 0.878352 \n", - "ENSG00000143933.16 ENSG00000135378.3 0.819382 " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "gene_pair_samples[\"all_high\"].head()" + "_k = list(gene_pair_samples.keys())[0]\n", + "gene_pair_samples[_k].head()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "6ccae66e-e276-43c3-809c-512aa0fe795b", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.974698Z", - "iopub.status.busy": "2023-09-12T22:59:49.974579Z", - "iopub.status.idle": "2023-09-12T22:59:49.976390Z", - "shell.execute_reply": "2023-09-12T22:59:49.976266Z" - }, "papermill": { "duration": 0.00426, "end_time": "2023-09-12T22:59:49.976769", @@ -1017,29 +429,9 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('ENSG00000052749.13', 'ENSG00000165025.14'),\n", - " ('ENSG00000102897.9', 'ENSG00000086544.2'),\n", - " ('ENSG00000110628.13', 'ENSG00000267078.1'),\n", - " ('ENSG00000169554.18', 'ENSG00000132424.14'),\n", - " ('ENSG00000143933.16', 'ENSG00000135378.3'),\n", - " ('ENSG00000170776.21', 'ENSG00000155903.11'),\n", - " ('ENSG00000136111.12', 'ENSG00000065911.11'),\n", - " ('ENSG00000131042.14', 'ENSG00000141367.11'),\n", - " ('ENSG00000160703.15', 'ENSG00000231964.1'),\n", - " ('ENSG00000008394.12', 'ENSG00000101347.8')]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "[i for i in gene_pair_samples[\"all_high\"].head(10).index]" + "[i for i in gene_pair_samples[_k].head(10).index]" ] }, { @@ -1061,15 +453,9 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "62d8632e-13e0-4a78-ad30-26770172d21e", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.984650Z", - "iopub.status.busy": "2023-09-12T22:59:49.984567Z", - "iopub.status.idle": "2023-09-12T22:59:49.985738Z", - "shell.execute_reply": "2023-09-12T22:59:49.985617Z" - }, "papermill": { "duration": 0.003625, "end_time": "2023-09-12T22:59:49.986138", @@ -1086,15 +472,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "c8a85ce0-4c5a-4ed9-8ad6-24b21fb10b1e", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.990241Z", - "iopub.status.busy": "2023-09-12T22:59:49.990158Z", - "iopub.status.idle": "2023-09-12T22:59:49.991649Z", - "shell.execute_reply": "2023-09-12T22:59:49.991513Z" - }, "papermill": { "duration": 0.00395, "end_time": "2023-09-12T22:59:49.992041", @@ -1107,7 +487,7 @@ "outputs": [], "source": [ "def corr_single(x, y):\n", - " ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1)\n", + " ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=conf.GENERAL[\"N_JOBS\"])\n", " p_val, p_pval = stats.pearsonr(x, y)\n", " s_val, s_pval = stats.spearmanr(x, y)\n", "\n", @@ -1116,15 +496,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "d9838801-1f01-4316-8e29-ffedbdc2a67a", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-12T22:59:49.996242Z", - "iopub.status.busy": "2023-09-12T22:59:49.996132Z", - "iopub.status.idle": "2023-09-13T03:38:36.170632Z", - "shell.execute_reply": "2023-09-13T03:38:36.170157Z" - }, "papermill": { "duration": 16726.17747, "end_time": "2023-09-13T03:38:36.171453", @@ -1138,7 +512,10 @@ "source": [ "results = []\n", "\n", - "with ProcessPoolExecutor(max_workers=conf.GENERAL[\"N_JOBS\"]) as executor:\n", + "# I leave the ProcessPoolExecutor here in case I want to easily swith between\n", + "# parallelize across gene pairs (max_workers=conf.GENERAL[\"N_JOBS\"] and n_jobs=1 inside function corr_single)\n", + "# or across permutations for one gene pair (max_workers=1 and n_jobs=conf.GENERAL[\"N_JOBS\"])\n", + "with ProcessPoolExecutor(max_workers=1) as executor:\n", " tasks = {\n", " executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (\n", " gene0,\n", @@ -1149,7 +526,7 @@ " for gene0, gene1 in gene_pair_samples[k].index\n", " }\n", "\n", - " for t_idx, t in enumerate(as_completed(tasks)):\n", + " for t_idx, t in tqdm(enumerate(as_completed(tasks)), total=len(tasks), ncols=100):\n", " gene0, gene1, k = tasks[t]\n", " ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result()\n", "\n", @@ -1167,23 +544,17 @@ " }\n", " )\n", "\n", - " if t_idx % 10:\n", - " _df = pd.DataFrame(results)\n", - " _df[\"group\"] = _df[\"group\"].astype(\"category\")\n", - " _df.to_pickle(output_file)" + " # save\n", + " _df = pd.DataFrame(results)\n", + " _df[\"group\"] = _df[\"group\"].astype(\"category\")\n", + " _df.to_pickle(output_file)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "6f32ad1a-3b2f-4e08-8a53-35cfb68e3970", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-13T03:38:36.176634Z", - "iopub.status.busy": "2023-09-13T03:38:36.176542Z", - "iopub.status.idle": "2023-09-13T03:38:36.178678Z", - "shell.execute_reply": "2023-09-13T03:38:36.178505Z" - }, "papermill": { "duration": 0.005138, "end_time": "2023-09-13T03:38:36.179138", @@ -1193,33 +564,16 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "644" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(results)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "e68a65a5-8bba-4a79-a740-26d722dc670e", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-13T03:38:36.183638Z", - "iopub.status.busy": "2023-09-13T03:38:36.183514Z", - "iopub.status.idle": "2023-09-13T03:38:36.186504Z", - "shell.execute_reply": "2023-09-13T03:38:36.186287Z" - }, "papermill": { "duration": 0.005853, "end_time": "2023-09-13T03:38:36.187014", @@ -1237,15 +591,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "9514ebb1-f1c1-46d9-96b6-a2264e3a6b4b", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-13T03:38:36.192047Z", - "iopub.status.busy": "2023-09-13T03:38:36.191910Z", - "iopub.status.idle": "2023-09-13T03:38:36.193682Z", - "shell.execute_reply": "2023-09-13T03:38:36.193507Z" - }, "papermill": { "duration": 0.004782, "end_time": "2023-09-13T03:38:36.194305", @@ -1255,33 +603,16 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "(644, 9)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "results_df.shape" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "6110dd19-95e0-4400-847a-424a498fa63d", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-13T03:38:36.198506Z", - "iopub.status.busy": "2023-09-13T03:38:36.198414Z", - "iopub.status.idle": "2023-09-13T03:38:36.203136Z", - "shell.execute_reply": "2023-09-13T03:38:36.202842Z" - }, "papermill": { "duration": 0.007371, "end_time": "2023-09-13T03:38:36.203668", @@ -1291,125 +622,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gene0gene1groupcccccc_pvaluepearsonpearson_pvaluespearmanspearman_pvalue
0ENSG00000169554.18ENSG00000132424.14all_high0.5090129.999990e-070.7737621.893487e-1510.8783521.374455e-243
1ENSG00000136111.12ENSG00000065911.11all_high0.2301439.999990e-070.5582824.403216e-630.6561303.863872e-94
2ENSG00000170776.21ENSG00000155903.11all_high0.3249879.999990e-070.7513374.609357e-1380.7697466.110239e-149
3ENSG00000143933.16ENSG00000135378.3all_high0.4718429.999990e-070.5311213.525528e-560.8193823.815707e-184
4ENSG00000160703.15ENSG00000231964.1all_high0.3189589.999990e-070.5892059.250622e-720.7028821.639640e-113
\n", - "
" - ], - "text/plain": [ - " gene0 gene1 group ccc ccc_pvalue \\\n", - "0 ENSG00000169554.18 ENSG00000132424.14 all_high 0.509012 9.999990e-07 \n", - "1 ENSG00000136111.12 ENSG00000065911.11 all_high 0.230143 9.999990e-07 \n", - "2 ENSG00000170776.21 ENSG00000155903.11 all_high 0.324987 9.999990e-07 \n", - "3 ENSG00000143933.16 ENSG00000135378.3 all_high 0.471842 9.999990e-07 \n", - "4 ENSG00000160703.15 ENSG00000231964.1 all_high 0.318958 9.999990e-07 \n", - "\n", - " pearson pearson_pvalue spearman spearman_pvalue \n", - "0 0.773762 1.893487e-151 0.878352 1.374455e-243 \n", - "1 0.558282 4.403216e-63 0.656130 3.863872e-94 \n", - "2 0.751337 4.609357e-138 0.769746 6.110239e-149 \n", - "3 0.531121 3.525528e-56 0.819382 3.815707e-184 \n", - "4 0.589205 9.250622e-72 0.702882 1.639640e-113 " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "results_df.head()" ] @@ -1433,15 +646,9 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "bb8e28d4-3adf-4d6a-a94e-81b6763ebd61", "metadata": { - "execution": { - "iopub.execute_input": "2023-09-13T03:38:36.212308Z", - "iopub.status.busy": "2023-09-13T03:38:36.212205Z", - "iopub.status.idle": "2023-09-13T03:38:36.214108Z", - "shell.execute_reply": "2023-09-13T03:38:36.213850Z" - }, "papermill": { "duration": 0.004663, "end_time": "2023-09-13T03:38:36.214516", @@ -1477,7 +684,12 @@ "metadata": { "jupytext": { "cell_metadata_filter": "all,-execution,-papermill,-trusted", - "notebook_metadata_filter": "-jupytext.text_representation.jupytext_version" + "notebook_metadata_filter": "-jupytext.text_representation.jupytext_version", + "text_representation": { + "extension": ".py", + "format_name": "percent", + "format_version": "1.3" + } }, "kernelspec": { "display_name": "Python 3 (ipykernel)", diff --git a/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py b/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py index 6de900b3..5a27d599 100644 --- a/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py +++ b/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py @@ -17,7 +17,7 @@ # # Description # %% [markdown] tags=[] -# TODO +# Reads the gene pair samples across different categories and computes their p-values. # %% [markdown] tags=[] # # Modules loading @@ -30,6 +30,7 @@ import numpy as np import pandas as pd from concurrent.futures import as_completed, ProcessPoolExecutor +from tqdm import tqdm from ccc.coef import ccc from ccc import conf @@ -42,7 +43,7 @@ GTEX_TISSUE = "whole_blood" GENE_SEL_STRATEGY = "var_pc_log2" -PVALUE_N_PERMS = 1000000 +PVALUE_N_PERMS = 10000000 RANDOM_STATE = np.random.RandomState(0) @@ -102,10 +103,11 @@ sorted(gene_pair_samples.keys()) # %% tags=[] -gene_pair_samples["all_high"].head() +_k = list(gene_pair_samples.keys())[0] +gene_pair_samples[_k].head() # %% tags=[] -[i for i in gene_pair_samples["all_high"].head(10).index] +[i for i in gene_pair_samples[_k].head(10).index] # %% [markdown] tags=[] # # Compute pvalues on sampled gene pairs @@ -116,7 +118,7 @@ # %% tags=[] def corr_single(x, y): - ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1) + ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=conf.GENERAL["N_JOBS"]) p_val, p_pval = stats.pearsonr(x, y) s_val, s_pval = stats.spearmanr(x, y) @@ -126,7 +128,10 @@ def corr_single(x, y): # %% tags=[] results = [] -with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor: +# I leave the ProcessPoolExecutor here in case I want to easily swith between +# parallelize across gene pairs (max_workers=conf.GENERAL["N_JOBS"] and n_jobs=1 inside function corr_single) +# or across permutations for one gene pair (max_workers=1 and n_jobs=conf.GENERAL["N_JOBS"]) +with ProcessPoolExecutor(max_workers=1) as executor: tasks = { executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): ( gene0, @@ -137,7 +142,7 @@ def corr_single(x, y): for gene0, gene1 in gene_pair_samples[k].index } - for t_idx, t in enumerate(as_completed(tasks)): + for t_idx, t in tqdm(enumerate(as_completed(tasks)), total=len(tasks), ncols=100): gene0, gene1, k = tasks[t] ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result() @@ -155,10 +160,10 @@ def corr_single(x, y): } ) - if t_idx % 10: - _df = pd.DataFrame(results) - _df["group"] = _df["group"].astype("category") - _df.to_pickle(output_file) + # save + _df = pd.DataFrame(results) + _df["group"] = _df["group"].astype("category") + _df.to_pickle(output_file) # %% tags=[] len(results)