From 702fc1916b093aafce0c7eb019339d948e23a346 Mon Sep 17 00:00:00 2001 From: Milton Pividori Date: Fri, 8 Sep 2023 11:52:16 -0600 Subject: [PATCH] ccc pvalue: add notebook to compute pvalues using gene pairs --- ...-ccc_pvalue_dist-generate-gene_pairs.ipynb | 603 ++++++++++++++++++ .../01-ccc_pvalue_dist-generate-gene_pairs.py | 118 ++++ 2 files changed, 721 insertions(+) create mode 100644 nbs/25_pvalue/01-ccc_pvalue_dist-generate-gene_pairs.ipynb create mode 100644 nbs/25_pvalue/py/01-ccc_pvalue_dist-generate-gene_pairs.py diff --git a/nbs/25_pvalue/01-ccc_pvalue_dist-generate-gene_pairs.ipynb b/nbs/25_pvalue/01-ccc_pvalue_dist-generate-gene_pairs.ipynb new file mode 100644 index 00000000..dd15e537 --- /dev/null +++ b/nbs/25_pvalue/01-ccc_pvalue_dist-generate-gene_pairs.ipynb @@ -0,0 +1,603 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ec52faa3-656f-483e-9617-d7ec0f7d818c", + "metadata": { + "papermill": { + "duration": 0.003093, + "end_time": "2023-09-06T17:55:26.691143", + "exception": false, + "start_time": "2023-09-06T17:55:26.688050", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "51102f42-fcd9-4a58-9c8d-dfcd3d2d464e", + "metadata": { + "papermill": { + "duration": 0.004704, + "end_time": "2023-09-06T17:55:26.705384", + "exception": false, + "start_time": "2023-09-06T17:55:26.700680", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Generates a distribution of pvalues under the null hypothesis of no association.\n", + "\n", + "This notebook uses individual gene pairs as input for CCC and parallelizes permutations." + ] + }, + { + "cell_type": "markdown", + "id": "7006ceeb-2651-407d-bfa1-1039727649ef", + "metadata": { + "papermill": { + "duration": 0.002814, + "end_time": "2023-09-06T17:55:26.710790", + "exception": false, + "start_time": "2023-09-06T17:55:26.707976", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules loading" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1ffa1a96-7545-40b9-ac8b-8627e13de8d4", + "metadata": { + "papermill": { + "duration": 1.939954, + "end_time": "2023-09-06T17:55:28.653439", + "exception": false, + "start_time": "2023-09-06T17:55:26.713485", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy.spatial.distance import squareform\n", + "from sklearn.metrics import pairwise_distances\n", + "\n", + "from ccc.coef import ccc\n", + "from ccc import conf" + ] + }, + { + "cell_type": "markdown", + "id": "0d3cc810-4b17-4213-8f03-6fe7e97a0fe3", + "metadata": { + "papermill": { + "duration": 0.001036, + "end_time": "2023-09-06T17:55:28.655692", + "exception": false, + "start_time": "2023-09-06T17:55:28.654656", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a8dfa548-6ce1-4edd-bef2-a919fc6ad850", + "metadata": { + "papermill": { + "duration": 0.005077, + "end_time": "2023-09-06T17:55:28.662990", + "exception": false, + "start_time": "2023-09-06T17:55:28.657913", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "rs = np.random.RandomState(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fd167aff-e768-416f-a078-f926f6023a1e", + "metadata": { + "papermill": { + "duration": 0.004073, + "end_time": "2023-09-06T17:55:28.668116", + "exception": false, + "start_time": "2023-09-06T17:55:28.664043", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "DATA_N_OBJS, DATA_N_FEATURES = 100, 1000\n", + "PVALUE_N_PERMS = 1000" + ] + }, + { + "cell_type": "markdown", + "id": "26bab485-b08e-4f59-b547-1da68fd36d54", + "metadata": { + "papermill": { + "duration": 0.001018, + "end_time": "2023-09-06T17:55:28.670228", + "exception": false, + "start_time": "2023-09-06T17:55:28.669210", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Paths" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "170ee0f3-a6dd-4c8b-9a99-ec6d02df8e2e", + "metadata": { + "papermill": { + "duration": 0.003311, + "end_time": "2023-09-06T17:55:28.674559", + "exception": false, + "start_time": "2023-09-06T17:55:28.671248", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "OUTPUT_DIR = conf.RESULTS_DIR / \"ccc_null-pvalues\"\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d083d95e-247e-45cc-bc28-36cf8144383c", + "metadata": { + "papermill": { + "duration": 0.004671, + "end_time": "2023-09-06T17:55:28.680257", + "exception": false, + "start_time": "2023-09-06T17:55:28.675586", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/results/ccc_null-pvalues')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "OUTPUT_DIR" + ] + }, + { + "cell_type": "markdown", + "id": "6b593ccb-bce7-4a6b-818f-79d5378d4610", + "metadata": { + "papermill": { + "duration": 0.001047, + "end_time": "2023-09-06T17:55:28.682448", + "exception": false, + "start_time": "2023-09-06T17:55:28.681401", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Generate random data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "472ff1df-b4f6-417f-b396-58a55ce0e39a", + "metadata": { + "papermill": { + "duration": 0.003501, + "end_time": "2023-09-06T17:55:28.687033", + "exception": false, + "start_time": "2023-09-06T17:55:28.683532", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data = rs.rand(DATA_N_OBJS, DATA_N_FEATURES)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "acd7a9c0-d8a8-46f5-ab60-2478347adf36", + "metadata": { + "papermill": { + "duration": 0.003179, + "end_time": "2023-09-06T17:55:28.691323", + "exception": false, + "start_time": "2023-09-06T17:55:28.688144", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(10, 1000)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "markdown", + "id": "7c24b674-edde-4b83-817d-c7f10729cdc8", + "metadata": { + "papermill": { + "duration": 0.001073, + "end_time": "2023-09-06T17:55:28.693551", + "exception": false, + "start_time": "2023-09-06T17:55:28.692478", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run CCC" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c8a85ce0-4c5a-4ed9-8ad6-24b21fb10b1e", + "metadata": {}, + "outputs": [], + "source": [ + "def ccc_single(x, y):\n", + " return ccc(x, y, n_jobs=1, pvalue_n_perms=PVALUE_N_PERMS, pvalue_n_jobs=conf.GENERAL[\"N_JOBS\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6a5158c1-904a-42a7-9ead-4fc72ef9c720", + "metadata": {}, + "outputs": [], + "source": [ + "cm_values = []\n", + "cm_pvalues = []\n", + "\n", + "for i in range(data.shape[0] - 1):\n", + " for j in range(i+1, data.shape[0]):\n", + " v, p = ccc_single(data[i], data[j])\n", + " cm_values.append(v)\n", + " cm_pvalues.append(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "003f5e04-5e2e-477f-b66a-ea28ac1a8abc", + "metadata": {}, + "outputs": [], + "source": [ + "assert len(cm_values) == len(cm_pvalues)\n", + "assert len(cm_values) == (DATA_N_OBJS * (DATA_N_OBJS - 1)) / 2" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5525b4ef-2e2e-4338-b52a-37d8308e237d", + "metadata": { + "papermill": { + "duration": 0.012058, + "end_time": "2023-09-06T19:36:00.762002", + "exception": false, + "start_time": "2023-09-06T19:36:00.749944", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cm_values = np.array(cm_values)\n", + "cm_pvalues = np.array(cm_pvalues)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e08382ef-423a-4114-9a8f-f1b5abc48055", + "metadata": { + "papermill": { + "duration": 0.005248, + "end_time": "2023-09-06T19:36:00.769387", + "exception": false, + "start_time": "2023-09-06T19:36:00.764139", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(45,)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm_values.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "93c4f9d7-6727-4db1-8bcc-1b618ecf41fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.00254684, 0.00104179, 0.00320558, 0.00018284, 0.00186997,\n", + " 0.00147106, 0.00177705, 0.00194291, 0.00049431, 0.00425941,\n", + " 0.00148615, 0.00019465, 0.00363023, 0.0056535 , 0.00274262,\n", + " 0.00522602, 0.0022903 , 0.00320755, 0.00099358, 0.00532412,\n", + " 0.00253045, 0.00149274, 0.00629346, 0.00221865, 0.00627013,\n", + " 0.00389841, 0.00138057, 0.00221203, 0.00417506, 0.00241475,\n", + " 0.00504645, 0.00137032, 0.00529612, 0.00326284, 0.00375165,\n", + " 0.00377352, 0.00323483, 0.00277389, 0.00797598, 0.0026016 ,\n", + " 0.00238008, 0.00171082, 0.00084283, 0.0051361 , 0.00122446])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm_values" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "46e8560e-4c1b-4e2b-b373-f42ca0a59819", + "metadata": { + "papermill": { + "duration": 0.003931, + "end_time": "2023-09-06T19:36:00.774747", + "exception": false, + "start_time": "2023-09-06T19:36:00.770816", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(45,)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm_pvalues.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "31ce94b0-ce31-4de6-9848-c1644268bd2b", + "metadata": { + "papermill": { + "duration": 0.003931, + "end_time": "2023-09-06T19:36:00.774747", + "exception": false, + "start_time": "2023-09-06T19:36:00.770816", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.45454545, 0.81818182, 0.36363636, 1. , 0.81818182,\n", + " 0.72727273, 0.90909091, 0.81818182, 1. , 0.18181818,\n", + " 0.90909091, 1. , 0.09090909, 0.09090909, 0.45454545,\n", + " 0.27272727, 0.54545455, 0.36363636, 0.90909091, 0.09090909,\n", + " 0.63636364, 0.90909091, 0.09090909, 0.72727273, 0.09090909,\n", + " 0.27272727, 0.90909091, 0.81818182, 0.27272727, 0.72727273,\n", + " 0.27272727, 1. , 0.09090909, 0.54545455, 0.36363636,\n", + " 0.36363636, 0.45454545, 0.36363636, 0.09090909, 0.45454545,\n", + " 0.54545455, 0.90909091, 0.90909091, 0.09090909, 0.81818182])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm_pvalues" + ] + }, + { + "cell_type": "markdown", + "id": "d25a59fa-a22b-41e0-84a3-74414ddaad23", + "metadata": { + "papermill": { + "duration": 0.001184, + "end_time": "2023-09-06T19:36:00.777478", + "exception": false, + "start_time": "2023-09-06T19:36:00.776294", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Save" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b11f71f7-bab8-4d83-bf49-fd9419648a3d", + "metadata": { + "papermill": { + "duration": 0.003911, + "end_time": "2023-09-06T19:36:00.782652", + "exception": false, + "start_time": "2023-09-06T19:36:00.778741", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/results/ccc_null-pvalues/gene_pairs-cm_values.npy')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "output_file = OUTPUT_DIR / \"gene_pairs-cm_values.npy\"\n", + "display(output_file)\n", + "\n", + "np.save(output_file, cm_values)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "12968ead-2e56-4214-956c-08f4f02952e9", + "metadata": { + "papermill": { + "duration": 0.003367, + "end_time": "2023-09-06T19:36:00.787278", + "exception": false, + "start_time": "2023-09-06T19:36:00.783911", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "output_file = OUTPUT_DIR / \"gene_pairs-cm_pvalues.npy\"\n", + "display(output_file)\n", + "\n", + "np.save(output_file, cm_pvalues)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f57efbc-893b-44a7-bc7a-77ca6b72a60a", + "metadata": { + "papermill": { + "duration": 0.001175, + "end_time": "2023-09-06T19:36:00.789703", + "exception": false, + "start_time": "2023-09-06T19:36:00.788528", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted", + "text_representation": { + "extension": ".py", + "format_name": "percent", + "format_version": "1.3", + "jupytext_version": "1.11.5" + } + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "papermill": { + "default_parameters": {}, + "duration": 6034.996428, + "end_time": "2023-09-06T19:36:01.020079", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/25_pvalue/00-ccc_pvalue_dist-generate.ipynb", + "output_path": "nbs/25_pvalue/00-ccc_pvalue_dist-generate.run.ipynb", + "parameters": {}, + "start_time": "2023-09-06T17:55:26.023651", + "version": "2.3.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/25_pvalue/py/01-ccc_pvalue_dist-generate-gene_pairs.py b/nbs/25_pvalue/py/01-ccc_pvalue_dist-generate-gene_pairs.py new file mode 100644 index 00000000..d3ccfe1a --- /dev/null +++ b/nbs/25_pvalue/py/01-ccc_pvalue_dist-generate-gene_pairs.py @@ -0,0 +1,118 @@ +# --- +# jupyter: +# jupytext: +# cell_metadata_filter: all,-execution,-papermill,-trusted +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.11.5 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] tags=[] +# # Description + +# %% [markdown] tags=[] +# Generates a distribution of pvalues under the null hypothesis of no association. +# +# This notebook uses individual gene pairs as input for CCC and parallelizes permutations. + +# %% [markdown] tags=[] +# # Modules loading + +# %% tags=[] +import numpy as np +from scipy.spatial.distance import squareform +from sklearn.metrics import pairwise_distances + +from ccc.coef import ccc +from ccc import conf + +# %% [markdown] tags=[] +# # Settings + +# %% tags=[] +rs = np.random.RandomState(0) + +# %% tags=[] +DATA_N_OBJS, DATA_N_FEATURES = 100, 1000 +PVALUE_N_PERMS = 1000 + +# %% [markdown] tags=[] +# # Paths + +# %% tags=[] +OUTPUT_DIR = conf.RESULTS_DIR / "ccc_null-pvalues" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# %% tags=[] +OUTPUT_DIR + +# %% [markdown] tags=[] +# # Generate random data + +# %% tags=[] +data = rs.rand(DATA_N_OBJS, DATA_N_FEATURES) + +# %% tags=[] +data.shape + + +# %% [markdown] tags=[] +# # Run CCC + +# %% +def ccc_single(x, y): + return ccc(x, y, n_jobs=1, pvalue_n_perms=PVALUE_N_PERMS, pvalue_n_jobs=conf.GENERAL["N_JOBS"]) + + +# %% +cm_values = [] +cm_pvalues = [] + +for i in range(data.shape[0] - 1): + for j in range(i+1, data.shape[0]): + v, p = ccc_single(data[i], data[j]) + cm_values.append(v) + cm_pvalues.append(p) + +# %% +assert len(cm_values) == len(cm_pvalues) +assert len(cm_values) == (DATA_N_OBJS * (DATA_N_OBJS - 1)) / 2 + +# %% tags=[] +cm_values = np.array(cm_values) +cm_pvalues = np.array(cm_pvalues) + +# %% tags=[] +cm_values.shape + +# %% +cm_values + +# %% tags=[] +cm_pvalues.shape + +# %% tags=[] +cm_pvalues + +# %% [markdown] tags=[] +# # Save + +# %% tags=[] +output_file = OUTPUT_DIR / "gene_pairs-cm_values.npy" +display(output_file) + +np.save(output_file, cm_values) + +# %% tags=[] +output_file = OUTPUT_DIR / "gene_pairs-cm_pvalues.npy" +display(output_file) + +np.save(output_file, cm_pvalues) + +# %% tags=[]