Skip to content

Commit 356b954

Browse files
committed
10_compute_correlations: pearson/spearman, compute only on whole blood
1 parent f102c0f commit 356b954

6 files changed

+173
-24
lines changed

nbs/10_compute_correlations/05_gtex_v8/05_01-gtex-var_pc_log2-pearson.ipynb

+60-4
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,13 @@
7676
},
7777
"outputs": [],
7878
"source": [
79+
"from time import time\n",
80+
"\n",
7981
"import pandas as pd\n",
8082
"from tqdm import tqdm\n",
8183
"\n",
8284
"from ccc import conf\n",
85+
"from ccc.utils import simplify_string\n",
8386
"from ccc.corr import pearson"
8487
]
8588
},
@@ -125,6 +128,38 @@
125128
"GENE_SELECTION_STRATEGY = \"var_pc_log2\""
126129
]
127130
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 3,
134+
"id": "ac2eaa49-c49e-4f3c-83c5-4f8c321d25a7",
135+
"metadata": {
136+
"execution": {
137+
"iopub.execute_input": "2022-05-24T14:46:59.207591Z",
138+
"iopub.status.busy": "2022-05-24T14:46:59.207409Z",
139+
"iopub.status.idle": "2022-05-24T14:46:59.210688Z",
140+
"shell.execute_reply": "2022-05-24T14:46:59.210092Z"
141+
},
142+
"papermill": {
143+
"duration": 0.010126,
144+
"end_time": "2022-05-24T14:46:59.212259",
145+
"exception": false,
146+
"start_time": "2022-05-24T14:46:59.202133",
147+
"status": "completed"
148+
},
149+
"tags": []
150+
},
151+
"outputs": [],
152+
"source": [
153+
"# select the top 5 tissues (according to sample size, see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb)\n",
154+
"TISSUES = [\n",
155+
" # \"Muscle - Skeletal\",\n",
156+
" \"Whole Blood\",\n",
157+
" # \"Skin - Sun Exposed (Lower leg)\",\n",
158+
" # \"Adipose - Subcutaneous\",\n",
159+
" # \"Artery - Tibial\",\n",
160+
"]"
161+
]
162+
},
128163
{
129164
"cell_type": "code",
130165
"execution_count": 3,
@@ -341,12 +376,27 @@
341376
"output_type": "display_data"
342377
}
343378
],
379+
"source": [
380+
"tissue_in_file_names = [f\"_data_{simplify_string(t.lower())}-\" for t in TISSUES]"
381+
]
382+
},
383+
{
384+
"cell_type": "code",
385+
"execution_count": null,
386+
"id": "cd78a8c5",
387+
"metadata": {
388+
"tags": []
389+
},
390+
"outputs": [],
344391
"source": [
345392
"input_files = sorted(list(INPUT_DIR.glob(f\"*-{GENE_SELECTION_STRATEGY}.pkl\")))\n",
393+
"input_files = [\n",
394+
" f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)\n",
395+
"]\n",
346396
"display(len(input_files))\n",
347397
"\n",
348-
"assert len(input_files) == conf.GTEX[\"N_TISSUES\"], len(input_files)\n",
349-
"display(input_files[:5])"
398+
"assert len(input_files) == len(TISSUES), len(TISSUES)\n",
399+
"display(input_files)"
350400
]
351401
},
352402
{
@@ -914,7 +964,7 @@
914964
}
915965
],
916966
"source": [
917-
"%timeit CORRELATION_METHOD(test_data)"
967+
"%timeit -r1 CORRELATION_METHOD(test_data)"
918968
]
919969
},
920970
{
@@ -973,8 +1023,14 @@
9731023
" data = pd.read_pickle(tissue_data_file)\n",
9741024
"\n",
9751025
" # compute correlations\n",
1026+
" start_time = time()\n",
1027+
"\n",
9761028
" data_corrs = CORRELATION_METHOD(data)\n",
9771029
"\n",
1030+
" end_time = time()\n",
1031+
" elapsed_time = end_time - start_time\n",
1032+
" display(elapsed_time)\n",
1033+
"\n",
9781034
" # save\n",
9791035
" output_filename = f\"{tissue_data_file.stem}-{method_name}.pkl\"\n",
9801036
" data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)"
@@ -1017,7 +1073,7 @@
10171073
"name": "python",
10181074
"nbconvert_exporter": "python",
10191075
"pygments_lexer": "ipython3",
1020-
"version": "3.9.9"
1076+
"version": "3.9.12"
10211077
},
10221078
"papermill": {
10231079
"default_parameters": {},

nbs/10_compute_correlations/05_gtex_v8/06_01-gtex-var_pc_log2-spearman.ipynb

+46-3
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,13 @@
7676
},
7777
"outputs": [],
7878
"source": [
79+
"from time import time\n",
80+
"\n",
7981
"import pandas as pd\n",
8082
"from tqdm import tqdm\n",
8183
"\n",
8284
"from ccc import conf\n",
85+
"from ccc.utils import simplify_string\n",
8386
"from ccc.corr import spearman"
8487
]
8588
},
@@ -125,6 +128,25 @@
125128
"GENE_SELECTION_STRATEGY = \"var_pc_log2\""
126129
]
127130
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": null,
134+
"id": "a9b9f8b2",
135+
"metadata": {
136+
"tags": []
137+
},
138+
"outputs": [],
139+
"source": [
140+
"# select the top 5 tissues (according to sample size, see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb)\n",
141+
"TISSUES = [\n",
142+
" # \"Muscle - Skeletal\",\n",
143+
" \"Whole Blood\",\n",
144+
" # \"Skin - Sun Exposed (Lower leg)\",\n",
145+
" # \"Adipose - Subcutaneous\",\n",
146+
" # \"Artery - Tibial\",\n",
147+
"]"
148+
]
149+
},
128150
{
129151
"cell_type": "code",
130152
"execution_count": 3,
@@ -341,12 +363,27 @@
341363
"output_type": "display_data"
342364
}
343365
],
366+
"source": [
367+
"tissue_in_file_names = [f\"_data_{simplify_string(t.lower())}-\" for t in TISSUES]"
368+
]
369+
},
370+
{
371+
"cell_type": "code",
372+
"execution_count": null,
373+
"id": "5fadd2c5",
374+
"metadata": {
375+
"tags": []
376+
},
377+
"outputs": [],
344378
"source": [
345379
"input_files = sorted(list(INPUT_DIR.glob(f\"*-{GENE_SELECTION_STRATEGY}.pkl\")))\n",
380+
"input_files = [\n",
381+
" f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)\n",
382+
"]\n",
346383
"display(len(input_files))\n",
347384
"\n",
348-
"assert len(input_files) == conf.GTEX[\"N_TISSUES\"], len(input_files)\n",
349-
"display(input_files[:5])"
385+
"assert len(input_files) == len(TISSUES), len(TISSUES)\n",
386+
"display(input_files)"
350387
]
351388
},
352389
{
@@ -914,7 +951,7 @@
914951
}
915952
],
916953
"source": [
917-
"%timeit CORRELATION_METHOD(test_data)"
954+
"%timeit -r1 CORRELATION_METHOD(test_data)"
918955
]
919956
},
920957
{
@@ -973,8 +1010,14 @@
9731010
" data = pd.read_pickle(tissue_data_file)\n",
9741011
"\n",
9751012
" # compute correlations\n",
1013+
" start_time = time()\n",
1014+
"\n",
9761015
" data_corrs = CORRELATION_METHOD(data)\n",
9771016
"\n",
1017+
" end_time = time()\n",
1018+
" elapsed_time = end_time - start_time\n",
1019+
" display(elapsed_time)\n",
1020+
"\n",
9781021
" # save\n",
9791022
" output_filename = f\"{tissue_data_file.stem}-{method_name}.pkl\"\n",
9801023
" data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)"

nbs/10_compute_correlations/05_gtex_v8/07_01-gtex-var_pc_log2-ccc.ipynb

+6-9
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
"from time import time\n",
8080
"\n",
8181
"import pandas as pd\n",
82+
"from tqdm import tqdm\n",
8283
"\n",
8384
"from ccc import conf\n",
8485
"from ccc.utils import simplify_string\n",
@@ -1034,8 +1035,10 @@
10341035
}
10351036
],
10361037
"source": [
1037-
"for tissue_data_file in input_files:\n",
1038-
" display(tissue_data_file.stem)\n",
1038+
"pbar = tqdm(input_files, ncols=100)\n",
1039+
"\n",
1040+
"for tissue_data_file in pbar:\n",
1041+
" pbar.set_description(tissue_data_file.stem)\n",
10391042
"\n",
10401043
" # read\n",
10411044
" data = pd.read_pickle(tissue_data_file)\n",
@@ -1074,13 +1077,7 @@
10741077
],
10751078
"metadata": {
10761079
"jupytext": {
1077-
"cell_metadata_filter": "all,-execution,-papermill,-trusted",
1078-
"text_representation": {
1079-
"extension": ".py",
1080-
"format_name": "percent",
1081-
"format_version": "1.3",
1082-
"jupytext_version": "1.11.5"
1083-
}
1080+
"cell_metadata_filter": "all,-execution,-papermill,-trusted"
10841081
},
10851082
"kernelspec": {
10861083
"display_name": "Python 3 (ipykernel)",

nbs/10_compute_correlations/05_gtex_v8/py/05_01-gtex-var_pc_log2-pearson.py

+28-3
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,13 @@
2626
# # Modules
2727

2828
# %% tags=[]
29+
from time import time
30+
2931
import pandas as pd
3032
from tqdm import tqdm
3133

3234
from ccc import conf
35+
from ccc.utils import simplify_string
3336
from ccc.corr import pearson
3437

3538
# %% [markdown] tags=[]
@@ -38,6 +41,16 @@
3841
# %% tags=[]
3942
GENE_SELECTION_STRATEGY = "var_pc_log2"
4043

44+
# %% tags=[]
45+
# top 5 tissues by sample size (see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb); only the uncommented entries are processed (currently just Whole Blood)
46+
TISSUES = [
47+
# "Muscle - Skeletal",
48+
"Whole Blood",
49+
# "Skin - Sun Exposed (Lower leg)",
50+
# "Adipose - Subcutaneous",
51+
# "Artery - Tibial",
52+
]
53+
4154
# %% tags=[]
4255
CORRELATION_METHOD = pearson
4356

@@ -64,12 +77,18 @@
6477
# %% [markdown] tags=[]
6578
# # Data loading
6679

80+
# %% tags=[]
81+
tissue_in_file_names = [f"_data_{simplify_string(t.lower())}-" for t in TISSUES]
82+
6783
# %% tags=[]
6884
input_files = sorted(list(INPUT_DIR.glob(f"*-{GENE_SELECTION_STRATEGY}.pkl")))
85+
input_files = [
86+
f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)
87+
]
6988
display(len(input_files))
7089

71-
assert len(input_files) == conf.GTEX["N_TISSUES"], len(input_files)
72-
display(input_files[:5])
90+
assert len(input_files) == len(TISSUES), len(TISSUES)
91+
display(input_files)
7392

7493
# %% [markdown] tags=[]
7594
# # Compute similarity
@@ -97,7 +116,7 @@
97116
display(_tmp)
98117

99118
# %% tags=[]
100-
# %timeit CORRELATION_METHOD(test_data)
119+
# %timeit -r1 CORRELATION_METHOD(test_data)
101120

102121
# %% [markdown] tags=[]
103122
# ## Run
@@ -112,8 +131,14 @@
112131
data = pd.read_pickle(tissue_data_file)
113132

114133
# compute correlations
134+
start_time = time()
135+
115136
data_corrs = CORRELATION_METHOD(data)
116137

138+
end_time = time()
139+
elapsed_time = end_time - start_time
140+
display(elapsed_time)
141+
117142
# save
118143
output_filename = f"{tissue_data_file.stem}-{method_name}.pkl"
119144
data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)

nbs/10_compute_correlations/05_gtex_v8/py/06_01-gtex-var_pc_log2-spearman.py

+28-3
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,13 @@
2626
# # Modules
2727

2828
# %% tags=[]
29+
from time import time
30+
2931
import pandas as pd
3032
from tqdm import tqdm
3133

3234
from ccc import conf
35+
from ccc.utils import simplify_string
3336
from ccc.corr import spearman
3437

3538
# %% [markdown] tags=[]
@@ -38,6 +41,16 @@
3841
# %% tags=[]
3942
GENE_SELECTION_STRATEGY = "var_pc_log2"
4043

44+
# %% tags=[]
45+
# top 5 tissues by sample size (see nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb); only the uncommented entries are processed (currently just Whole Blood)
46+
TISSUES = [
47+
# "Muscle - Skeletal",
48+
"Whole Blood",
49+
# "Skin - Sun Exposed (Lower leg)",
50+
# "Adipose - Subcutaneous",
51+
# "Artery - Tibial",
52+
]
53+
4154
# %% tags=[]
4255
CORRELATION_METHOD = spearman
4356

@@ -64,12 +77,18 @@
6477
# %% [markdown] tags=[]
6578
# # Data loading
6679

80+
# %% tags=[]
81+
tissue_in_file_names = [f"_data_{simplify_string(t.lower())}-" for t in TISSUES]
82+
6783
# %% tags=[]
6884
input_files = sorted(list(INPUT_DIR.glob(f"*-{GENE_SELECTION_STRATEGY}.pkl")))
85+
input_files = [
86+
f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)
87+
]
6988
display(len(input_files))
7089

71-
assert len(input_files) == conf.GTEX["N_TISSUES"], len(input_files)
72-
display(input_files[:5])
90+
assert len(input_files) == len(TISSUES), len(TISSUES)
91+
display(input_files)
7392

7493
# %% [markdown] tags=[]
7594
# # Compute similarity
@@ -97,7 +116,7 @@
97116
display(_tmp)
98117

99118
# %% tags=[]
100-
# %timeit CORRELATION_METHOD(test_data)
119+
# %timeit -r1 CORRELATION_METHOD(test_data)
101120

102121
# %% [markdown] tags=[]
103122
# ## Run
@@ -112,8 +131,14 @@
112131
data = pd.read_pickle(tissue_data_file)
113132

114133
# compute correlations
134+
start_time = time()
135+
115136
data_corrs = CORRELATION_METHOD(data)
116137

138+
end_time = time()
139+
elapsed_time = end_time - start_time
140+
display(elapsed_time)
141+
117142
# save
118143
output_filename = f"{tissue_data_file.stem}-{method_name}.pkl"
119144
data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)

0 commit comments

Comments
 (0)