From 6face6b91196d1c392d90d1ef7908f716625a831 Mon Sep 17 00:00:00 2001 From: Lucas Camillo Date: Thu, 8 Feb 2024 20:04:52 +0000 Subject: [PATCH] added tutorials to docs again --- docs/source/tutorials/tutorial_atacseq.ipynb | 772 ++++++++ .../tutorials/tutorial_bloodchemistry.ipynb | 633 +++++++ .../tutorial_dnam_illumina_human_array.ipynb | 1186 ++++++++++++ ...torial_dnam_illumina_mammalian_array.ipynb | 1616 ++++++++++++++++ .../source/tutorials/tutorial_dnam_rrbs.ipynb | 1656 +++++++++++++++++ .../tutorial_histonemarkchipseq.ipynb | 298 +++ docs/source/tutorials/tutorial_rnaseq.ipynb | 683 +++++++ docs/source/tutorials/tutorial_utils.ipynb | 522 ++++++ tutorials/tutorial_histonemarkchipseq.ipynb | 2 +- 9 files changed, 7367 insertions(+), 1 deletion(-) create mode 100644 docs/source/tutorials/tutorial_atacseq.ipynb create mode 100644 docs/source/tutorials/tutorial_bloodchemistry.ipynb create mode 100644 docs/source/tutorials/tutorial_dnam_illumina_human_array.ipynb create mode 100644 docs/source/tutorials/tutorial_dnam_illumina_mammalian_array.ipynb create mode 100644 docs/source/tutorials/tutorial_dnam_rrbs.ipynb create mode 100644 docs/source/tutorials/tutorial_histonemarkchipseq.ipynb create mode 100644 docs/source/tutorials/tutorial_rnaseq.ipynb create mode 100644 docs/source/tutorials/tutorial_utils.ipynb diff --git a/docs/source/tutorials/tutorial_atacseq.ipynb b/docs/source/tutorials/tutorial_atacseq.ipynb new file mode 100644 index 0000000..b99a0bb --- /dev/null +++ b/docs/source/tutorials/tutorial_atacseq.ipynb @@ -0,0 +1,772 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "82321cbb-e1b9-49f3-b826-32c4fafd96f4", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_atacseq.ipynb) [![Open In 
nbviewer](https://img.shields.io/badge/View%20in-nbviewer-orange)](https://nbviewer.jupyter.org/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_atacseq.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "cf837dcf-de19-46f7-9c81-c063a45b14b6", + "metadata": {}, + "source": [ + "# Bulk ATAC-Seq" + ] + }, + { + "cell_type": "markdown", + "id": "133e64f6-a0d2-4a70-84a0-33a4c2bea725", + "metadata": {}, + "source": [ + "This tutorial is a brief guide for the implementation of the two ATAC clocks developed by Morandini et al. Link to [paper](https://link.springer.com/article/10.1007/s11357-023-00986-0)." + ] + }, + { + "cell_type": "markdown", + "id": "a0469770-c9af-4d69-a055-bf3c312286db", + "metadata": {}, + "source": [ + "We just need two packages for this tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e64f6bb6-7c95-4b9e-b37a-4ae811dc088d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pyaging as pya " + ] + }, + { + "cell_type": "markdown", + "id": "5c367e47-ceca-4c73-ac14-b6a9a781c66a", + "metadata": {}, + "source": [ + "## Download and load example data" + ] + }, + { + "cell_type": "markdown", + "id": "52b6e9c2-b7b7-4655-ab9c-84afe3185b78", + "metadata": {}, + "source": [ + "If you have your own ATAC-Seq data, please follow the recommendations in the Ocampo paper. Specifically, one needs to count the number of reads for each of the peak regions from the paper (file [here](https://static-content.springer.com/esm/art%3A10.1007%2Fs11357-023-00986-0/MediaObjects/11357_2023_986_MOESM9_ESM.tsv)). This can be done through the code found on their [GitHub](https://github.com/SunScript0/ATAC-clock/blob/main/pipeline_atac/02_peakset_and_counts.sh) using featureCounts." + ] + }, + { + "cell_type": "markdown", + "id": "76f0a1a6-ad70-4a40-8fd0-63de208c7ad5", + "metadata": {}, + "source": [ + "For testing purposes, let's download an example of input for the ATAC clocks. 
For instructions on how to go from raw sequencing reads to the data table, please refer to the paper. " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "992db2fb-2b58-4f8b-92ac-f760df3758dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting download_example_data function\n", + "|-----------> Data found in pyaging_data/GSE193140.pkl\n", + "|-----> 🎉 Done! [0.4942s]\n" + ] + } + ], + "source": [ + "pya.data.download_example_data('GSE193140')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "078b2b96-7317-4f84-a8c4-16276fb76137", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle('pyaging_data/GSE193140.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cd1a8e9b-53a5-4cfc-ac6d-22d9ed09784a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chr1:817100-817691chr1:826742-828191chr1:841908-843021chr1:844055-844921chr1:857908-859108chr1:869571-870271chr1:898378-899076chr1:904303-905702chr1:906675-907111chr1:912617-913368...chrY:21073148-21074236chrY:21174455-21175401chrY:21177324-21177828chrY:21180682-21181317chrY:21239902-21241040chrY:21248553-21249961chrY:21256824-21257260chrY:21259823-21260874chrY:22086084-22086722chrY:22499696-22500344
CR_124182265215119843271455637...6210465319020502122
CR_12296268827254010971378616712...11133125270372918912
CR_12113727854246691297863835124...0000000000
CR_120169281929354613732093130110...798331514750183214
CR_1192053005184537102533113824136...1518171257257877
\n", + "

5 rows × 80400 columns

\n", + "
" + ], + "text/plain": [ + " chr1:817100-817691 chr1:826742-828191 chr1:841908-843021 \\\n", + "CR_124 182 2652 15 \n", + "CR_122 96 2688 27 \n", + "CR_121 137 2785 42 \n", + "CR_120 169 2819 29 \n", + "CR_119 205 3005 18 \n", + "\n", + " chr1:844055-844921 chr1:857908-859108 chr1:869571-870271 \\\n", + "CR_124 11 9 843 \n", + "CR_122 25 40 1097 \n", + "CR_121 46 69 1297 \n", + "CR_120 35 46 1373 \n", + "CR_119 45 37 1025 \n", + "\n", + " chr1:898378-899076 chr1:904303-905702 chr1:906675-907111 \\\n", + "CR_124 2 714 556 \n", + "CR_122 13 786 167 \n", + "CR_121 8 638 351 \n", + "CR_120 20 931 301 \n", + "CR_119 33 1138 241 \n", + "\n", + " chr1:912617-913368 ... chrY:21073148-21074236 \\\n", + "CR_124 37 ... 62 \n", + "CR_122 12 ... 11 \n", + "CR_121 24 ... 0 \n", + "CR_120 10 ... 7 \n", + "CR_119 36 ... 15 \n", + "\n", + " chrY:21174455-21175401 chrY:21177324-21177828 \\\n", + "CR_124 104 65 \n", + "CR_122 13 31 \n", + "CR_121 0 0 \n", + "CR_120 9 8 \n", + "CR_119 18 17 \n", + "\n", + " chrY:21180682-21181317 chrY:21239902-21241040 \\\n", + "CR_124 31 90 \n", + "CR_122 25 270 \n", + "CR_121 0 0 \n", + "CR_120 33 151 \n", + "CR_119 12 57 \n", + "\n", + " chrY:21248553-21249961 chrY:21256824-21257260 \\\n", + "CR_124 20 50 \n", + "CR_122 37 29 \n", + "CR_121 0 0 \n", + "CR_120 47 50 \n", + "CR_119 25 7 \n", + "\n", + " chrY:21259823-21260874 chrY:22086084-22086722 chrY:22499696-22500344 \n", + "CR_124 21 2 2 \n", + "CR_122 18 9 12 \n", + "CR_121 0 0 0 \n", + "CR_120 18 32 14 \n", + "CR_119 8 7 7 \n", + "\n", + "[5 rows x 80400 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "251495e7-082f-45ae-841c-a2dd86a3cb15", + "metadata": {}, + "source": [ + "## Convert data to AnnData object" + ] + }, + { + "cell_type": "markdown", + "id": "73ba54e0-4292-4d85-b208-e56e267d6797", + "metadata": {}, + "source": [ + "AnnData objects are highly 
flexible and are thus our preferred method of organizing data for age prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4dab5019-9f54-4e32-be19-abbb1c71a2d6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting df_to_adata function\n", + "|-----> ⚙️ Create anndata object started\n", + "|-----> ✅ Create anndata object finished [0.0289s]\n", + "|-----> ⚙️ Add metadata to anndata started\n", + "|-----------? No metadata provided. Leaving adata.obs empty\n", + "|-----> ⚠️ Add metadata to anndata finished [0.0004s]\n", + "|-----> ⚙️ Log data statistics started\n", + "|-----------> There are 157 observations\n", + "|-----------> There are 80400 features\n", + "|-----------> Total missing values: 0\n", + "|-----------> Percentage of missing values: 0.00%\n", + "|-----> ✅ Log data statistics finished [0.0049s]\n", + "|-----> ⚙️ Impute missing values started\n", + "|-----------> No missing values found. No imputation necessary\n", + "|-----> ✅ Impute missing values finished [0.0053s]\n", + "|-----> 🎉 Done! [0.0419s]\n" + ] + } + ], + "source": [ + "adata = pya.preprocess.df_to_adata(df)" + ] + }, + { + "cell_type": "markdown", + "id": "5042e04f-17c0-4eb2-8c5d-2c2fc5d6d2d6", + "metadata": {}, + "source": [ + "Note that the original DataFrame is stored in `X_original` under layers. 
This is what the `adata` object looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "503da312-2256-4e67-9747-107f5c4587ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 157 × 80400\n", + " var: 'percent_na'\n", + " layers: 'X_original'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "c072990d-0f54-49b3-bb7a-7bbd13301e2a", + "metadata": {}, + "source": [ + "## Predict age" + ] + }, + { + "cell_type": "markdown", + "id": "5fe08978-f1ba-49b3-a0d8-52df4b6efb4e", + "metadata": {}, + "source": [ + "We can either predict one clock at a time or all at the same time. For convenience, let's simply input both clocks of interest at once. The function is invariant to the capitalization of the clock name. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "96e008fe-9f8c-45fb-8dc6-6a39f1ecb7ac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting predict_age function\n", + "|-----> ⚙️ Set PyTorch device started\n", + "|-----------> Using device: cpu\n", + "|-----> ✅ Set PyTorch device finished [0.0006s]\n", + "|-----> 🕒 Processing clock: ocampoatac1\n", + "|-----------> ⚙️ Load clock started\n", + "|-----------------> Data found in pyaging_data/ocampoatac1.pt\n", + "|-----------> ✅ Load clock finished [0.5113s]\n", + "|-----------> ⚙️ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_ocampoatac1]\n", + "|-----------> ✅ Check features in adata finished [3.8480s]\n", + "|-----------> ⚙️ Predict ages with model started\n", + "|-----------------> The preprocessing method is tpm_norm_log1p\n", + "|-----------------> There is no 
postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> ✅ Predict ages with model finished [0.1635s]\n", + "|-----------> ⚙️ Add predicted ages and clock metadata to adata started\n", + "|-----------> ✅ Add predicted ages and clock metadata to adata finished [0.0007s]\n", + "|-----> 🕒 Processing clock: ocampoatac2\n", + "|-----------> ⚙️ Load clock started\n", + "|-----------------> Data found in pyaging_data/ocampoatac2.pt\n", + "|-----------> ✅ Load clock finished [0.4514s]\n", + "|-----------> ⚙️ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_ocampoatac2]\n", + "|-----------> ✅ Check features in adata finished [4.9598s]\n", + "|-----------> ⚙️ Predict ages with model started\n", + "|-----------------> The preprocessing method is tpm_norm_log1p\n", + "|-----------------> There is no postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> ✅ Predict ages with model finished [0.0690s]\n", + "|-----------> ⚙️ Add predicted ages and clock metadata to adata started\n", + "|-----------> ✅ Add predicted ages and clock metadata to adata finished [0.0007s]\n", + "|-----> 🎉 Done! [10.1175s]\n" + ] + } + ], + "source": [ + "pya.pred.predict_age(adata, ['OcampoATAC1', 'OcampoATAC2'])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b83a10e2-7984-4427-9e85-05329a16feb3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocampoatac1ocampoatac2
CR_12429.52712428.114206
CR_12239.00309740.061162
CR_12140.71600843.095199
CR_12032.38037233.033456
CR_11936.44071138.301516
\n", + "
" + ], + "text/plain": [ + " ocampoatac1 ocampoatac2\n", + "CR_124 29.527124 28.114206\n", + "CR_122 39.003097 40.061162\n", + "CR_121 40.716008 43.095199\n", + "CR_120 32.380372 33.033456\n", + "CR_119 36.440711 38.301516" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a2cfa1e5-a7f4-4157-8c66-4afcc2323ef7", + "metadata": {}, + "source": [ + "Having so much information printed can be overwhelming, particularly when running several clocks at once. In such cases, just set verbose to False." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "055761d9-7e22-49f3-a1db-31c3ed3749ba", + "metadata": {}, + "outputs": [], + "source": [ + "pya.data.download_example_data('GSE193140', verbose=False)\n", + "df = pd.read_pickle('pyaging_data/GSE193140.pkl')\n", + "adata = pya.preprocess.df_to_adata(df, verbose=False)\n", + "pya.pred.predict_age(adata, ['OcampoATAC1', 'OcampoATAC2'], verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fdd9d6c2-7f0a-4f96-a095-4a492ed73f8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocampoatac1ocampoatac2
CR_12429.52712428.114206
CR_12239.00309740.061162
CR_12140.71600843.095199
CR_12032.38037233.033456
CR_11936.44071138.301516
\n", + "
" + ], + "text/plain": [ + " ocampoatac1 ocampoatac2\n", + "CR_124 29.527124 28.114206\n", + "CR_122 39.003097 40.061162\n", + "CR_121 40.716008 43.095199\n", + "CR_120 32.380372 33.033456\n", + "CR_119 36.440711 38.301516" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "4bad3df8-f868-4cf5-be74-00ffd02c18f5", + "metadata": {}, + "source": [ + "After age prediction, the clocks are added to `adata.obs`. Moreover, the percent of missing values for each clock and other metadata are included in `adata.uns`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0d13fb55-8a12-4d28-83e9-ec7c9fbbe30c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs Γ— n_vars = 157 Γ— 80400\n", + " obs: 'ocampoatac1', 'ocampoatac2'\n", + " var: 'percent_na'\n", + " uns: 'ocampoatac1_percent_na', 'ocampoatac1_missing_features', 'ocampoatac1_metadata', 'ocampoatac2_percent_na', 'ocampoatac2_missing_features', 'ocampoatac2_metadata'\n", + " layers: 'X_original'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "a4e7ad8d-44ae-4ced-a626-f9e3b2d04114", + "metadata": {}, + "source": [ + "## Get citation" + ] + }, + { + "cell_type": "markdown", + "id": "d7d089b0-5433-47a9-b031-bc4504c6b55d", + "metadata": {}, + "source": [ + "The doi, citation, and some metadata are automatically added to the AnnData object under `adata.uns[CLOCKNAME_metadata]`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "6b368506-55d1-4b74-be61-817bcf575ade", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'ocampoatac1',\n", + " 'data_type': 'atac',\n", + " 'species': 'Homo sapiens',\n", + " 'year': 2023,\n", + " 'approved_by_author': '⌛',\n", + " 'citation': 'Morandini, Francesco, et al. \"ATAC-clock: An aging clock based on chromatin accessibility.\" GeroScience (2023): 1-18.',\n", + " 'doi': 'https://doi.org/10.1007/s11357-023-00986-0',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['ocampoatac1_metadata']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/tutorials/tutorial_bloodchemistry.ipynb b/docs/source/tutorials/tutorial_bloodchemistry.ipynb new file mode 100644 index 0000000..3c25b1c --- /dev/null +++ b/docs/source/tutorials/tutorial_bloodchemistry.ipynb @@ -0,0 +1,633 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2089cc5b-a025-4928-a331-ad33fd1b6a85", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_bloodchemistry.ipynb) [![Open In nbviewer](https://img.shields.io/badge/View%20in-nbviewer-orange)](https://nbviewer.jupyter.org/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_bloodchemistry.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "31cf37ce-09ee-49d7-a411-719bf65e186e", + "metadata": 
{}, + "source": [ + "# Blood chemistry" + ] + }, + { + "cell_type": "markdown", + "id": "3ea2b570-56af-4e4f-9606-d4c6d071554c", + "metadata": {}, + "source": [ + "This tutorial is a brief guide for the implementation of PhenoAge. Link to [paper](https://www.aging-us.com/article/101414/text)." + ] + }, + { + "cell_type": "markdown", + "id": "0a093c7d-dea7-4b34-91bf-08cde6c98011", + "metadata": {}, + "source": [ + "We just need two packages for this tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ad192191-e44f-4994-80ad-ab16cdb7c7e8", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import pyaging as pya" + ] + }, + { + "cell_type": "markdown", + "id": "d87488d5-731c-469e-ad6f-79c4c9662371", + "metadata": {}, + "source": [ + "## Download and load example data" + ] + }, + { + "cell_type": "markdown", + "id": "4c30471f-89e7-4e92-a176-aa3af14a5274", + "metadata": {}, + "source": [ + "Let's download some example human blood data." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a0692cf7-e979-4f27-bc14-e1013057c16d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting download_example_data function\n", + "|-----------> Data found in pyaging_data/blood_chemistry_example.pkl\n", + "|-----> πŸŽ‰ Done! [0.5248s]\n" + ] + } + ], + "source": [ + "pya.data.download_example_data('blood_chemistry_example')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "13aeb69a-4b0e-40f2-8094-194c9a6b42a1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle('pyaging_data/blood_chemistry_example.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0106112d-21ad-4991-af9f-74b92f46c55b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
albumincreatinineglucoselog_crplymphocyte_percentmean_cell_volumered_cell_distribution_widthalkaline_phosphatasewhite_blood_cell_countage
patient151.887.24.5-0.227.992.413.9123.50.00603770.2
patient253.157.36.1-0.227.880.912.081.50.00413576.5
patient337.4114.75.6-0.223.683.212.4124.40.00738266.4
patient445.988.15.4-0.238.692.511.4113.40.00653746.5
patient540.745.44.7-0.238.388.813.5107.80.00469542.3
\n", + "
" + ], + "text/plain": [ + " albumin creatinine glucose log_crp lymphocyte_percent \\\n", + "patient1 51.8 87.2 4.5 -0.2 27.9 \n", + "patient2 53.1 57.3 6.1 -0.2 27.8 \n", + "patient3 37.4 114.7 5.6 -0.2 23.6 \n", + "patient4 45.9 88.1 5.4 -0.2 38.6 \n", + "patient5 40.7 45.4 4.7 -0.2 38.3 \n", + "\n", + " mean_cell_volume red_cell_distribution_width alkaline_phosphatase \\\n", + "patient1 92.4 13.9 123.5 \n", + "patient2 80.9 12.0 81.5 \n", + "patient3 83.2 12.4 124.4 \n", + "patient4 92.5 11.4 113.4 \n", + "patient5 88.8 13.5 107.8 \n", + "\n", + " white_blood_cell_count age \n", + "patient1 0.006037 70.2 \n", + "patient2 0.004135 76.5 \n", + "patient3 0.007382 66.4 \n", + "patient4 0.006537 46.5 \n", + "patient5 0.004695 42.3 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "45cbc6e1-9cf7-46a8-ac92-18924a7a5cf8", + "metadata": {}, + "source": [ + "## Convert data to AnnData object" + ] + }, + { + "cell_type": "markdown", + "id": "ae486006-b533-411b-b449-ff6d2261345a", + "metadata": {}, + "source": [ + "AnnData objects are highly flexible and are thus our preferred method of organizing data for age prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "acf93ebe-0440-4b1f-9040-05260df459f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting df_to_adata function\n", + "|-----> βš™οΈ Create anndata object started\n", + "|-----> βœ… Create anndata object finished [0.0029s]\n", + "|-----> βš™οΈ Add metadata to anndata started\n", + "|-----------? No metadata provided. 
Leaving adata.obs empty\n", + "|-----> ⚠️ Add metadata to anndata finished [0.0006s]\n", + "|-----> ⚙️ Log data statistics started\n", + "|-----------> There are 30 observations\n", + "|-----------> There are 10 features\n", + "|-----------> Total missing values: 0\n", + "|-----------> Percentage of missing values: 0.00%\n", + "|-----> ✅ Log data statistics finished [0.0018s]\n", + "|-----> ⚙️ Impute missing values started\n", + "|-----------> No missing values found. No imputation necessary\n", + "|-----> ✅ Impute missing values finished [0.0012s]\n", + "|-----> 🎉 Done! [0.0098s]\n" + ] + } + ], + "source": [ + "adata = pya.preprocess.df_to_adata(df)" + ] + }, + { + "cell_type": "markdown", + "id": "54dcb802-6dd7-40cc-ab61-073f70778754", + "metadata": {}, + "source": [ + "Note that the original DataFrame is stored in `X_original` under layers. This is what the `adata` object looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3cfcf1f4-01d8-4da2-81e9-fee50e051ffe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 30 × 10\n", + " var: 'percent_na'\n", + " layers: 'X_original'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "2277ede6-ab9e-487b-a58d-c01cb21b6b68", + "metadata": {}, + "source": [ + "## Predict age" + ] + }, + { + "cell_type": "markdown", + "id": "889d2d5f-a596-41d0-b849-560b6bc856a1", + "metadata": {}, + "source": [ + "We can either predict one clock at a time or all at the same time. Given we only have one clock of interest for this tutorial, let's go with one. The function is invariant to the capitalization of the clock name. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2dbc7beb-79b8-4e99-b36f-36bcd693c864", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting predict_age function\n", + "|-----> βš™οΈ Set PyTorch device started\n", + "|-----------> Using device: cpu\n", + "|-----> βœ… Set PyTorch device finished [0.0011s]\n", + "|-----> πŸ•’ Processing clock: phenoage\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/phenoage.pt\n", + "|-----------> βœ… Load clock finished [0.4217s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_phenoage]\n", + "|-----------> βœ… Check features in adata finished [0.0050s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mortality_to_phenoage\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0037s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0020s]\n", + "|-----> πŸŽ‰ Done! [0.5048s]\n" + ] + } + ], + "source": [ + "pya.pred.predict_age(adata, 'PhenoAge')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "032382f5-7d98-465e-a3cb-51165eeb7025", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
phenoage
patient170.643137
patient264.834061
patient370.258559
patient442.979385
patient541.677749
\n", + "
" + ], + "text/plain": [ + " phenoage\n", + "patient1 70.643137\n", + "patient2 64.834061\n", + "patient3 70.258559\n", + "patient4 42.979385\n", + "patient5 41.677749" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2acc80b1-f936-40e4-900a-ef4deb304558", + "metadata": {}, + "source": [ + "Having so much information printed can be overwhelming, particularly when running several clocks at once. In such cases, just set verbose to False." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a587f129-a88b-46ec-a249-ac62737a0cb7", + "metadata": {}, + "outputs": [], + "source": [ + "pya.data.download_example_data('blood_chemistry_example', verbose=False)\n", + "df = pd.read_pickle('pyaging_data/blood_chemistry_example.pkl')\n", + "adata = pya.preprocess.df_to_adata(df, verbose=False)\n", + "pya.pred.predict_age(adata, ['PhenoAge'], verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "99fbe406-d076-4979-a2f4-70469755937f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
phenoage
patient170.643137
patient264.834061
patient370.258559
patient442.979385
patient541.677749
\n", + "
" + ], + "text/plain": [ + " phenoage\n", + "patient1 70.643137\n", + "patient2 64.834061\n", + "patient3 70.258559\n", + "patient4 42.979385\n", + "patient5 41.677749" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "72f0eb22-76f2-41b5-b20f-824548215122", + "metadata": {}, + "source": [ + "After age prediction, the clocks are added to `adata.obs`. Moreover, the percent of missing values for each clock and other metadata are included in `adata.uns`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a778028a-7ee6-419c-9be6-e7046a9d8f9a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs Γ— n_vars = 30 Γ— 10\n", + " obs: 'phenoage'\n", + " var: 'percent_na'\n", + " uns: 'phenoage_percent_na', 'phenoage_missing_features', 'phenoage_metadata'\n", + " layers: 'X_original'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "1a73e164-a610-4cb6-93f5-6f8ac7d8d56f", + "metadata": {}, + "source": [ + "## Get citation" + ] + }, + { + "cell_type": "markdown", + "id": "6c7a070c-c448-4ad7-ae0b-21857dafd00e", + "metadata": {}, + "source": [ + "The doi, citation, and some metadata are automatically added to the AnnData object under `adata.uns[CLOCKNAME_metadata]`." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9908d25a-9639-4684-9da6-353c7eb4a555", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'phenoage',\n", + " 'data_type': 'blood chemistry',\n", + " 'species': 'Homo sapiens',\n", + " 'year': 2018,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Levine, Morgan E., et al. 
\"An epigenetic biomarker of aging for lifespan and healthspan.\" Aging (albany NY) 10.4 (2018): 573.',\n", + " 'doi': 'https://doi.org/10.18632%2Faging.101414',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['phenoage_metadata']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/tutorials/tutorial_dnam_illumina_human_array.ipynb b/docs/source/tutorials/tutorial_dnam_illumina_human_array.ipynb new file mode 100644 index 0000000..9a9d827 --- /dev/null +++ b/docs/source/tutorials/tutorial_dnam_illumina_human_array.ipynb @@ -0,0 +1,1186 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4e690b3c-4dec-450e-a7f8-f63987e60cdb", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_dnam.ipynb) [![Open In nbviewer](https://img.shields.io/badge/View%20in-nbviewer-orange)](https://nbviewer.jupyter.org/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_dnam.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "62e1ac68-927d-4ca8-a2ab-bd99a7ee52ab", + "metadata": {}, + "source": [ + "# Illumina Human Methylation Arrays" + ] + }, + { + "cell_type": "markdown", + "id": "41699e8b-e682-4617-867c-fd0a624ae0ef", + "metadata": {}, + "source": [ + "This tutorial is a brief guide for the implementation of an array of bulk DNA-methylation epigenetic clocks that predict age in humans. 
In this notebook, we will demonstrate the breadth of epigenetic clock models available in `pyaging` by showing:\n", + "\n", + "- Horvath's 2013 ElasticNet-based clock ([paper](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-10-r115));\n", + " \n", + "- AltumAge, a highly accurate deep-learning based clock ([paper](https://www.nature.com/articles/s41514-022-00085-y));\n", + " \n", + "- PCGrimAge, a principal-component based version of the GrimAge clock ([paper](https://www.nature.com/articles/s43587-022-00248-2));\n", + "\n", + "- GrimAge2, the latest version of GrimAge ([paper](https://www.aging-us.com/article/204434/text));\n", + "\n", + "- DunedinPACE, a biomarker of the pace of aging ([paper](https://elifesciences.org/articles/73420))." + ] + }, + { + "cell_type": "markdown", + "id": "d8ddf439-0b19-4a1a-9491-3a0e1ee94447", + "metadata": {}, + "source": [ + "We just need two packages for this tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dc347ae0-41ae-46ac-ba50-08cacd4c9241", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pyaging as pya" + ] + }, + { + "cell_type": "markdown", + "id": "f3eebcc0-d3a2-42a4-b87f-1637d0db2588", + "metadata": {}, + "source": [ + "## Download and load example data" + ] + }, + { + "cell_type": "markdown", + "id": "096cb9c0-67d1-4a37-958e-13da15b15435", + "metadata": {}, + "source": [ + "Let's download the publicly available dataset GSE139307 with Illumina's 450k array. The CpG coverage of the 450k array should be good enough for most clocks." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4b2680c6-0c91-4d6a-bd12-6f9cc06932f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting download_example_data function\n", + "|-----------> Data found in pyaging_data/GSE139307.pkl\n", + "|-----> 🎉 Done! 
[0.0006s]\n" + ] + } + ], + "source": [ + "pya.data.download_example_data('GSE139307')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3192d1bd-f5d6-426c-8c65-29971e46c4b4", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle('pyaging_data/GSE139307.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ddf9e1d5-ddb1-42e2-9dfc-9cf4441b326d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datasettissue_typeagegendercg00000029cg00000108cg00000109cg00000165cg00000236cg00000289...ch.X.93511680Fch.X.938089Fch.X.94051109Rch.X.94260649Rch.X.967194Fch.X.97129969Rch.X.97133160Rch.X.97651759Fch.X.97737721Fch.X.98007042R
GSM4137709GSE139307sperm84.0M0.0848110.9206960.8568510.0845670.8386990.247273...0.0617510.0459420.0376310.0564550.2498720.0490220.0856910.0374350.0778200.106234
GSM4137710GSE139307sperm69.0M0.0996260.9190730.8900240.1155410.8525840.198103...0.0750770.0418490.0325730.0897900.2502450.0790950.0797560.0462290.0912560.120241
GSM4137711GSE139307sperm69.0M0.1172280.9202760.8943170.1171270.8392580.213410...0.0686790.0495150.0580970.0799190.2997580.0793050.0898150.0653640.0868640.156005
GSM4137712GSE139307sperm69.0M0.0770960.9102040.9084000.0738850.8616150.163276...0.0700910.0332890.0388360.1082130.2954280.0507310.0999430.0475970.0784800.107480
GSM4137713GSE139307sperm67.0M0.0635240.9116080.8846430.0798770.8646540.176169...0.0823680.0384110.0487870.0886310.3166940.0418730.0793030.0488230.0890100.117903
\n", + "

5 rows Γ— 485516 columns

\n", + "
" + ], + "text/plain": [ + " dataset tissue_type age gender cg00000029 cg00000108 \\\n", + "GSM4137709 GSE139307 sperm 84.0 M 0.084811 0.920696 \n", + "GSM4137710 GSE139307 sperm 69.0 M 0.099626 0.919073 \n", + "GSM4137711 GSE139307 sperm 69.0 M 0.117228 0.920276 \n", + "GSM4137712 GSE139307 sperm 69.0 M 0.077096 0.910204 \n", + "GSM4137713 GSE139307 sperm 67.0 M 0.063524 0.911608 \n", + "\n", + " cg00000109 cg00000165 cg00000236 cg00000289 ... \\\n", + "GSM4137709 0.856851 0.084567 0.838699 0.247273 ... \n", + "GSM4137710 0.890024 0.115541 0.852584 0.198103 ... \n", + "GSM4137711 0.894317 0.117127 0.839258 0.213410 ... \n", + "GSM4137712 0.908400 0.073885 0.861615 0.163276 ... \n", + "GSM4137713 0.884643 0.079877 0.864654 0.176169 ... \n", + "\n", + " ch.X.93511680F ch.X.938089F ch.X.94051109R ch.X.94260649R \\\n", + "GSM4137709 0.061751 0.045942 0.037631 0.056455 \n", + "GSM4137710 0.075077 0.041849 0.032573 0.089790 \n", + "GSM4137711 0.068679 0.049515 0.058097 0.079919 \n", + "GSM4137712 0.070091 0.033289 0.038836 0.108213 \n", + "GSM4137713 0.082368 0.038411 0.048787 0.088631 \n", + "\n", + " ch.X.967194F ch.X.97129969R ch.X.97133160R ch.X.97651759F \\\n", + "GSM4137709 0.249872 0.049022 0.085691 0.037435 \n", + "GSM4137710 0.250245 0.079095 0.079756 0.046229 \n", + "GSM4137711 0.299758 0.079305 0.089815 0.065364 \n", + "GSM4137712 0.295428 0.050731 0.099943 0.047597 \n", + "GSM4137713 0.316694 0.041873 0.079303 0.048823 \n", + "\n", + " ch.X.97737721F ch.X.98007042R \n", + "GSM4137709 0.077820 0.106234 \n", + "GSM4137710 0.091256 0.120241 \n", + "GSM4137711 0.086864 0.156005 \n", + "GSM4137712 0.078480 0.107480 \n", + "GSM4137713 0.089010 0.117903 \n", + "\n", + "[5 rows x 485516 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "cfbebcff-687e-4199-8b8a-6712577cebc0", + "metadata": {}, + "source": [ + "For PCGrimAge and GrimAge2, 
both age and sex are features. Therefore, to get the full prediction, let's convert the column `gender` into a column called `female`, with 1 being female and 0 being male." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "671906c4-ae7e-4808-859a-470e19757477", + "metadata": {}, + "outputs": [], + "source": [ + "# needs only numerical data (doesn't work with strings)\n", + "df['female'] = (df['gender'] == 'F').astype(int)" + ] + }, + { + "cell_type": "markdown", + "id": "125e979c-3149-4e00-9a78-820b9c89c0ef", + "metadata": {}, + "source": [ + "Moreover, it is important to note that some probes are duplicated in the EPICv2 array, following the format cg#########_BC11 and cg#########_TC11 for the opposite strands. Given that at this moment most clocks have not been trained with EPICv2 data directly, it is recommended to average these probes. This is particularly the case for DunedinPACE, from which some clock probes were duplicated in the update from EPICv1. To remedy this issue, simply use the following function to aggregate any duplicated probes that may be present." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cdb19129-49b3-4b81-b264-73ccbc7943c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting epicv2_probe_aggregation function\n", + "|-----> βš™οΈ Looking for duplicated probes started\n", + "|-----------> in progress: 100.0000%\n", + "|-----------> There are no duplicated probes. Returning original data\n", + "|-----> πŸŽ‰ Done! 
[7.6244s]\n" + ] + } + ], + "source": [ + "df = pya.pp.epicv2_probe_aggregation(df)" + ] + }, + { + "cell_type": "markdown", + "id": "798b8998-dc50-44ac-bc4e-c97c4f1ec183", + "metadata": {}, + "source": [ + "## Convert data to AnnData object" + ] + }, + { + "cell_type": "markdown", + "id": "f8269bff-fadf-4524-801d-6294655c005b", + "metadata": {}, + "source": [ + "AnnData objects are highly flexible and are thus our preferred method of organizing data for age prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "037192c8-ef2e-440a-88ae-06bddbab80a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting df_to_adata function\n", + "|-----> βš™οΈ Create anndata object started\n", + "|-----------? Dropping 1 columns with only NAs: ['cg01550828'], etc.\n", + "|-----> ⚠️ Create anndata object finished [0.3588s]\n", + "|-----> βš™οΈ Add metadata to anndata started\n", + "|-----------> Adding provided metadata to adata.obs\n", + "|-----> βœ… Add metadata to anndata finished [0.0007s]\n", + "|-----> βš™οΈ Log data statistics started\n", + "|-----------> There are 37 observations\n", + "|-----------> There are 485513 features\n", + "|-----------> Total missing values: 489\n", + "|-----------> Percentage of missing values: 0.00%\n", + "|-----> βœ… Log data statistics finished [0.0203s]\n", + "|-----> βš™οΈ Impute missing values started\n", + "|-----------> Imputing missing values using knn strategy\n", + "|-----> βœ… Impute missing values finished [5.1869s]\n", + "|-----> βš™οΈ Add imputer strategy to adata.uns started\n", + "|-----> βœ… Add imputer strategy to adata.uns finished [0.0002s]\n", + "|-----> πŸŽ‰ Done! 
[5.6547s]\n" + ] + } + ], + "source": [ + "adata = pya.pp.df_to_adata(df, metadata_cols=['gender', 'tissue_type', 'dataset'], imputer_strategy='knn')" + ] + }, + { + "cell_type": "markdown", + "id": "3f4e479d-0b80-4079-9cd5-79f7d5130ba4", + "metadata": {}, + "source": [ + "Note that the original DataFrame is stored in `X_original` under layers. This is what the `adata` object looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3259b596-5679-42c1-967b-5297f1612cf4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 37 × 485513\n", + " obs: 'gender', 'tissue_type', 'dataset'\n", + " var: 'percent_na'\n", + " uns: 'imputer_strategy'\n", + " layers: 'X_original', 'X_imputed'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "bff7c621-f7b7-4fc0-9c91-6f82a9211e8b", + "metadata": {}, + "source": [ + "## Predict age" + ] + }, + { + "cell_type": "markdown", + "id": "07f5a0ae-1901-4f49-af64-320974584231", + "metadata": {}, + "source": [ + "We can either predict one clock at a time or all at the same time. For convenience, let's simply input all five clocks of interest at once. The function is invariant to the capitalization of the clock name. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5383ef0f-ff8b-4e41-bbb6-7fd7ff6cc6be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting predict_age function\n", + "|-----> βš™οΈ Set PyTorch device started\n", + "|-----------> Using device: cpu\n", + "|-----> βœ… Set PyTorch device finished [0.0008s]\n", + "|-----> πŸ•’ Processing clock: horvath2013\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Downloading data to pyaging_data/horvath2013.pt\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Load clock finished [0.5065s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_horvath2013]\n", + "|-----------> βœ… Check features in adata finished [0.0667s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is anti_log_linear\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0108s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0007s]\n", + "|-----> πŸ•’ Processing clock: altumage\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Downloading data to pyaging_data/altumage.pt\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Load clock finished [6.2463s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_altumage]\n", + "|-----------> βœ… Check features in adata finished 
[1.8184s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> The preprocessing method is scale\n", + "|-----------------> There is no postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0105s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> πŸ•’ Processing clock: pcgrimage\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Downloading data to pyaging_data/pcgrimage.pt\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Load clock finished [173.4044s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_pcgrimage]\n", + "|-----------> βœ… Check features in adata finished [7.7016s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> There is no postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.1926s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0007s]\n", + "|-----> πŸ•’ Processing clock: grimage2\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Downloading data to pyaging_data/grimage2.pt\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Load clock finished [0.6879s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to 
adata.obsm[X_grimage2]\n", + "|-----------> βœ… Check features in adata finished [0.1193s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is cox_to_years\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0028s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> πŸ•’ Processing clock: dunedinpace\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Downloading data to pyaging_data/dunedinpace.pt\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Load clock finished [0.8928s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_dunedinpace]\n", + "|-----------> βœ… Check features in adata finished [2.4793s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> The preprocessing method is quantile_normalization_with_gold_standard\n", + "|-----------------> There is no postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0608s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> πŸŽ‰ Done! [197.5751s]\n" + ] + } + ], + "source": [ + "pya.pred.predict_age(adata, ['Horvath2013', 'AltumAge', 'PCGrimAge', 'GrimAge2', 'DunedinPACE'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9c0c6926-2944-4274-aefa-eb099b8e6737", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gendertissue_typedatasethorvath2013altumagepcgrimagegrimage2dunedinpace
GSM4137709MspermGSE13930733.62477637.00721395.50611477.5810571.326327
GSM4137710MspermGSE13930728.82934429.42689983.93424465.9263461.215611
GSM4137711MspermGSE13930728.31654522.79892882.70933463.3583411.271091
GSM4137712MspermGSE13930724.85063018.07917384.26946260.2188801.276866
GSM4137713MspermGSE13930725.94211120.07198584.35698561.2359191.262023
\n", + "
" + ], + "text/plain": [ + " gender tissue_type dataset horvath2013 altumage pcgrimage \\\n", + "GSM4137709 M sperm GSE139307 33.624776 37.007213 95.506114 \n", + "GSM4137710 M sperm GSE139307 28.829344 29.426899 83.934244 \n", + "GSM4137711 M sperm GSE139307 28.316545 22.798928 82.709334 \n", + "GSM4137712 M sperm GSE139307 24.850630 18.079173 84.269462 \n", + "GSM4137713 M sperm GSE139307 25.942111 20.071985 84.356985 \n", + "\n", + " grimage2 dunedinpace \n", + "GSM4137709 77.581057 1.326327 \n", + "GSM4137710 65.926346 1.215611 \n", + "GSM4137711 63.358341 1.271091 \n", + "GSM4137712 60.218880 1.276866 \n", + "GSM4137713 61.235919 1.262023 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "0274db5f-c66d-4b74-a4f2-d5e40ed2824e", + "metadata": {}, + "source": [ + "For curiosity, we can also check if there are any correlations amongst these clocks." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bbe696da-df04-4e28-b72a-b70bf6a9bffc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
horvath2013altumagepcgrimagegrimage2dunedinpace
horvath20131.0000000.6762420.2118810.4591930.354771
altumage0.6762421.0000000.1564560.4400440.164101
pcgrimage0.2118810.1564561.0000000.8594900.061491
grimage20.4591930.4400440.8594901.0000000.183725
dunedinpace0.3547710.1641010.0614910.1837251.000000
\n", + "
" + ], + "text/plain": [ + " horvath2013 altumage pcgrimage grimage2 dunedinpace\n", + "horvath2013 1.000000 0.676242 0.211881 0.459193 0.354771\n", + "altumage 0.676242 1.000000 0.156456 0.440044 0.164101\n", + "pcgrimage 0.211881 0.156456 1.000000 0.859490 0.061491\n", + "grimage2 0.459193 0.440044 0.859490 1.000000 0.183725\n", + "dunedinpace 0.354771 0.164101 0.061491 0.183725 1.000000" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.iloc[:, 3:].corr('pearson')" + ] + }, + { + "cell_type": "markdown", + "id": "069520e6-0ffe-43d5-a7c1-c3f726b0a1ac", + "metadata": {}, + "source": [ + "Having so much information printed can be overwhelming, particularly when running several clocks at once. In such cases, just set verbose to False." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7009e4a4-90c6-4c4f-9bbd-bcb3e3836eee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gendertissue_typedatasethorvath2013altumagepcgrimagegrimage2dunedinpace
GSM4137709MspermGSE13930733.62477637.00721395.50578077.5810571.326308
GSM4137710MspermGSE13930728.82934429.42689983.93424465.9263461.215614
GSM4137711MspermGSE13930728.31654522.80555182.70933463.3583411.271033
GSM4137712MspermGSE13930724.85063018.06010784.26946260.2188801.276866
GSM4137713MspermGSE13930725.94211120.07198584.35698561.2359191.262023
\n", + "
" + ], + "text/plain": [ + " gender tissue_type dataset horvath2013 altumage pcgrimage \\\n", + "GSM4137709 M sperm GSE139307 33.624776 37.007213 95.505780 \n", + "GSM4137710 M sperm GSE139307 28.829344 29.426899 83.934244 \n", + "GSM4137711 M sperm GSE139307 28.316545 22.805551 82.709334 \n", + "GSM4137712 M sperm GSE139307 24.850630 18.060107 84.269462 \n", + "GSM4137713 M sperm GSE139307 25.942111 20.071985 84.356985 \n", + "\n", + " grimage2 dunedinpace \n", + "GSM4137709 77.581057 1.326308 \n", + "GSM4137710 65.926346 1.215614 \n", + "GSM4137711 63.358341 1.271033 \n", + "GSM4137712 60.218880 1.276866 \n", + "GSM4137713 61.235919 1.262023 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pya.data.download_example_data('GSE139307', verbose=False)\n", + "df = pd.read_pickle('pyaging_data/GSE139307.pkl')\n", + "df['female'] = (df['gender'] == 'F').astype(int)\n", + "df = pya.pp.epicv2_probe_aggregation(df, verbose=False)\n", + "adata = pya.preprocess.df_to_adata(df, metadata_cols=['gender', 'tissue_type', 'dataset'], imputer_strategy='mean', verbose=False)\n", + "pya.pred.predict_age(adata, ['Horvath2013', 'AltumAge', 'PCGrimAge', 'GrimAge2', 'DunedinPACE'], verbose=False)\n", + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6f246126-0be1-47b2-86f4-c81dcc752da0", + "metadata": {}, + "source": [ + "After age prediction, the clocks are added to `adata.obs`. Moreover, the percent of missing values for each clock and other metadata are included in `adata.uns`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "14649c41-167e-4771-a4de-1ed2ae72eb51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs Γ— n_vars = 37 Γ— 485513\n", + " obs: 'gender', 'tissue_type', 'dataset', 'horvath2013', 'altumage', 'pcgrimage', 'grimage2', 'dunedinpace'\n", + " var: 'percent_na'\n", + " uns: 'imputer_strategy', 'horvath2013_percent_na', 'horvath2013_missing_features', 'horvath2013_metadata', 'altumage_percent_na', 'altumage_missing_features', 'altumage_metadata', 'pcgrimage_percent_na', 'pcgrimage_missing_features', 'pcgrimage_metadata', 'grimage2_percent_na', 'grimage2_missing_features', 'grimage2_metadata', 'dunedinpace_percent_na', 'dunedinpace_missing_features', 'dunedinpace_metadata'\n", + " layers: 'X_original', 'X_imputed'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "ba818dde-1561-4184-9c54-3aedc5c8de8e", + "metadata": {}, + "source": [ + "We can also look at which features seem to be missing from each clock (if there are any)." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "083b2f80-c191-4f84-abcc-d18aa9659e99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['dunedinpace_missing_features']" + ] + }, + { + "cell_type": "markdown", + "id": "123ea2ce-8db1-492d-9d13-c57447030ad8", + "metadata": {}, + "source": [ + "## Get citation" + ] + }, + { + "cell_type": "markdown", + "id": "99ad630f-a3b1-4cf7-a180-b8b56bd548e1", + "metadata": {}, + "source": [ + "The doi, citation, and some metadata are automatically added to the AnnData object under `adata.uns[CLOCKNAME_metadata]`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7d2d10dc-4ffe-4940-a7f1-2041b933f7b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'horvath2013',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Homo sapiens',\n", + " 'year': 2013,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Horvath, Steve. \"DNA methylation age of human tissues and cell types.\" Genome biology 14.10 (2013): 1-20.',\n", + " 'doi': 'https://doi.org/10.1186/gb-2013-14-10-r115',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['horvath2013_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e2a5311a-ed7d-4e1b-9fbb-b4ad676ce9da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'altumage',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Homo sapiens',\n", + " 'year': 2022,\n", + " 'approved_by_author': 'βœ…',\n", + " 'citation': 'de Lima Camillo, Lucas Paulo, Louis R. Lapierre, and Ritambhara Singh. \"A pan-tissue DNA-methylation epigenetic clock based on deep learning.\" npj Aging 8.1 (2022): 4.',\n", + " 'doi': 'https://doi.org/10.1038/s41514-022-00085-y',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['altumage_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0bbefc0c-acc8-47db-84dc-5ebe80d08500", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'pcgrimage',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Homo sapiens',\n", + " 'year': 2022,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Higgins-Chen, Albert T., et al. 
\"A computational solution for bolstering reliability of epigenetic clocks: Implications for clinical trials and longitudinal tracking.\" Nature aging 2.7 (2022): 644-661.',\n", + " 'doi': 'https://doi.org/10.1038/s43587-022-00248-2',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['pcgrimage_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9ddfb7c3-83ef-44a6-ace3-ffd5553a5770", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'grimage2',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Homo sapiens',\n", + " 'year': 2022,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Lu, Ake T., et al. \"DNA methylation GrimAge version 2.\" Aging (Albany NY) 14.23 (2022): 9484.',\n", + " 'doi': 'https://doi.org/10.18632/aging.204434',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['grimage2_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "54497c7f-e1cb-4dd4-815a-182af52155b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'dunedinpace',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Homo sapiens',\n", + " 'year': 2022,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Belsky, Daniel W., et al. 
\"DunedinPACE, a DNA methylation biomarker of the pace of aging.\" Elife 11 (2022): e73420.',\n", + " 'doi': 'https://doi.org/10.7554/eLife.73420',\n", + " 'notes': \"The automatic failure if fewer than 80% of the CpG probes are available is not implemented and left to the user's discretion.\",\n", + " 'version': None}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['dunedinpace_metadata']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/tutorials/tutorial_dnam_illumina_mammalian_array.ipynb b/docs/source/tutorials/tutorial_dnam_illumina_mammalian_array.ipynb new file mode 100644 index 0000000..777e70d --- /dev/null +++ b/docs/source/tutorials/tutorial_dnam_illumina_mammalian_array.ipynb @@ -0,0 +1,1616 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4e690b3c-4dec-450e-a7f8-f63987e60cdb", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_dnam.ipynb) [![Open In nbviewer](https://img.shields.io/badge/View%20in-nbviewer-orange)](https://nbviewer.jupyter.org/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_dnam.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "62e1ac68-927d-4ca8-a2ab-bd99a7ee52ab", + "metadata": {}, + "source": [ + "# Illumina Mammalian Methylation Arrays" + ] + }, + { + "cell_type": "markdown", + "id": "d8ddf439-0b19-4a1a-9491-3a0e1ee94447", + "metadata": {}, + "source": [ + "We just need two packages for this 
tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dc347ae0-41ae-46ac-ba50-08cacd4c9241", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pyaging as pya" + ] + }, + { + "cell_type": "markdown", + "id": "d5e6b7de-7f34-4ee5-935a-00a0e2fb8945", + "metadata": {}, + "source": [ + "## Download and load example data" + ] + }, + { + "cell_type": "markdown", + "id": "8c6ba751-380e-4b91-b4a2-c20e05711380", + "metadata": {}, + "source": [ + "Let's download the publicly available dataset GSE223748 with Illumina's Mammalian Methylation array. The CpG coverage of this array (~37k) spans highly conserved CpG sequences. Let's download a subset of that data." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7a34a4cc-1756-485d-bec6-305693eea35b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting download_example_data function\n", + "|-----------> Data found in pyaging_data/GSE223748_subset.pkl\n", + "|-----> 🎉 Done! [0.5310s]\n" + ] + } + ], + "source": [ + "pya.data.download_example_data('GSE223748')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6e929219-e691-4171-911e-46143ae94898", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle('pyaging_data/GSE223748_subset.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "03141fc7-f175-4ad3-86ed-22d51db5cadd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cg00000165cg00001209cg00001364cg00001582cg00002920cg00003994cg00004555cg00005112cg00005271cg00006213...rs7746156_II_F_C_37550rs798149_II_F_C_37528rs845016_II_F_C_37529rs877309_II_F_C_37552rs9292570_I_F_C_37499rs9363764_II_F_C_37541rs939290_II_F_C_37535rs951295_I_F_C_37507rs966367_II_F_C_37551rs9839873_II_F_C_37532
204509080002_R01C020.0948790.9161540.8903140.0535830.4903810.0348520.1597050.7639590.9732450.928975...0.4885920.4913610.4800240.5000000.4842520.4894480.5055850.5053350.4850030.510081
202897220142_R04C020.4970770.4412630.9153140.0473390.6510290.0377740.0826340.4158000.7028570.821715...0.5081020.5002990.5072610.4906840.4996730.4972560.5641060.4821510.4866670.505236
204529320092_R01C020.3211410.8341580.8811940.0561240.6883500.0302250.0867760.7775880.9745870.923934...0.5204040.5095680.5075490.5016590.4928230.4872430.5160180.4712440.4910660.491759
202794570004_R02C010.4952260.9241210.9158120.0508660.6883350.0323440.1133180.8720940.9691890.917076...0.4993140.5161320.4870090.4871460.4691190.4951250.5482380.5122830.5142570.492520
203531420070_R05C020.1839540.9343320.9241530.0550320.7174950.0371080.0986320.8596140.9734220.963446...0.5014320.5094120.4850550.4972720.4806370.4675020.4942460.5009240.5313340.503709
\n", + "

5 rows × 37554 columns

\n", + "
" + ], + "text/plain": [ + " cg00000165 cg00001209 cg00001364 cg00001582 \\\n", + "204509080002_R01C02 0.094879 0.916154 0.890314 0.053583 \n", + "202897220142_R04C02 0.497077 0.441263 0.915314 0.047339 \n", + "204529320092_R01C02 0.321141 0.834158 0.881194 0.056124 \n", + "202794570004_R02C01 0.495226 0.924121 0.915812 0.050866 \n", + "203531420070_R05C02 0.183954 0.934332 0.924153 0.055032 \n", + "\n", + " cg00002920 cg00003994 cg00004555 cg00005112 \\\n", + "204509080002_R01C02 0.490381 0.034852 0.159705 0.763959 \n", + "202897220142_R04C02 0.651029 0.037774 0.082634 0.415800 \n", + "204529320092_R01C02 0.688350 0.030225 0.086776 0.777588 \n", + "202794570004_R02C01 0.688335 0.032344 0.113318 0.872094 \n", + "203531420070_R05C02 0.717495 0.037108 0.098632 0.859614 \n", + "\n", + " cg00005271 cg00006213 ... rs7746156_II_F_C_37550 \\\n", + "204509080002_R01C02 0.973245 0.928975 ... 0.488592 \n", + "202897220142_R04C02 0.702857 0.821715 ... 0.508102 \n", + "204529320092_R01C02 0.974587 0.923934 ... 0.520404 \n", + "202794570004_R02C01 0.969189 0.917076 ... 0.499314 \n", + "203531420070_R05C02 0.973422 0.963446 ... 
0.501432 \n", + "\n", + " rs798149_II_F_C_37528 rs845016_II_F_C_37529 \\\n", + "204509080002_R01C02 0.491361 0.480024 \n", + "202897220142_R04C02 0.500299 0.507261 \n", + "204529320092_R01C02 0.509568 0.507549 \n", + "202794570004_R02C01 0.516132 0.487009 \n", + "203531420070_R05C02 0.509412 0.485055 \n", + "\n", + " rs877309_II_F_C_37552 rs9292570_I_F_C_37499 \\\n", + "204509080002_R01C02 0.500000 0.484252 \n", + "202897220142_R04C02 0.490684 0.499673 \n", + "204529320092_R01C02 0.501659 0.492823 \n", + "202794570004_R02C01 0.487146 0.469119 \n", + "203531420070_R05C02 0.497272 0.480637 \n", + "\n", + " rs9363764_II_F_C_37541 rs939290_II_F_C_37535 \\\n", + "204509080002_R01C02 0.489448 0.505585 \n", + "202897220142_R04C02 0.497256 0.564106 \n", + "204529320092_R01C02 0.487243 0.516018 \n", + "202794570004_R02C01 0.495125 0.548238 \n", + "203531420070_R05C02 0.467502 0.494246 \n", + "\n", + " rs951295_I_F_C_37507 rs966367_II_F_C_37551 \\\n", + "204509080002_R01C02 0.505335 0.485003 \n", + "202897220142_R04C02 0.482151 0.486667 \n", + "204529320092_R01C02 0.471244 0.491066 \n", + "202794570004_R02C01 0.512283 0.514257 \n", + "203531420070_R05C02 0.500924 0.531334 \n", + "\n", + " rs9839873_II_F_C_37532 \n", + "204509080002_R01C02 0.510081 \n", + "202897220142_R04C02 0.505236 \n", + "204529320092_R01C02 0.491759 \n", + "202794570004_R02C01 0.492520 \n", + "203531420070_R05C02 0.503709 \n", + "\n", + "[5 rows x 37554 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "be02f82e-69b3-4534-88e2-5ee8851f9319", + "metadata": {}, + "source": [ + "## Convert data to AnnData object" + ] + }, + { + "cell_type": "markdown", + "id": "3f7ddbab-c020-4a07-a95c-450f6f591e2f", + "metadata": {}, + "source": [ + "AnnData objects are highly flexible and are thus our preferred method of organizing data for age prediction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cb628052-a992-4e41-8d56-eb932f1579eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting df_to_adata function\n", + "|-----> βš™οΈ Create anndata object started\n", + "|-----> βœ… Create anndata object finished [0.0119s]\n", + "|-----> βš™οΈ Add metadata to anndata started\n", + "|-----------? No metadata provided. Leaving adata.obs empty\n", + "|-----> ⚠️ Add metadata to anndata finished [0.0019s]\n", + "|-----> βš™οΈ Log data statistics started\n", + "|-----------> There are 100 observations\n", + "|-----------> There are 37554 features\n", + "|-----------> Total missing values: 0\n", + "|-----------> Percentage of missing values: 0.00%\n", + "|-----> βœ… Log data statistics finished [0.0128s]\n", + "|-----> βš™οΈ Impute missing values started\n", + "|-----------> No missing values found. No imputation necessary\n", + "|-----> βœ… Impute missing values finished [0.0079s]\n", + "|-----> πŸŽ‰ Done! 
[0.0404s]\n" + ] + } + ], + "source": [ + "adata = pya.pp.df_to_adata(df, imputer_strategy='knn')" + ] + }, + { + "cell_type": "markdown", + "id": "36b4dc99-1069-45da-a988-9493de817d99", + "metadata": {}, + "source": [ + "This is what the `adata` object looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e26459d3-276d-4cda-b3bf-a4147c397667", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 100 × 37554\n", + " var: 'percent_na'\n", + " layers: 'X_original'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "cc9c675d-6196-4274-b209-1487656acc9f", + "metadata": {}, + "source": [ + "## Predict age" + ] + }, + { + "cell_type": "markdown", + "id": "30e82111-8c24-4a4d-8f28-22e0c4d35413", + "metadata": {}, + "source": [ + "### Mammalian predictors without species declaration" + ] + }, + { + "cell_type": "markdown", + "id": "90eb1122-ce2e-49ec-919b-c47bd85643dc", + "metadata": {}, + "source": [ + "We can either predict one clock at a time or several at once. Let's first start with the mammalian clocks that do not need the species Latin name for the conversion of the output into units of years."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4a3484b8-ea3c-49dc-a2b1-4fb831906e29", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting predict_age function\n", + "|-----> βš™οΈ Set PyTorch device started\n", + "|-----------> Using device: cpu\n", + "|-----> βœ… Set PyTorch device finished [0.0013s]\n", + "|-----> πŸ•’ Processing clock: mammalian1\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalian1.pt\n", + "|-----------> βœ… Load clock finished [0.5443s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalian1]\n", + "|-----------> βœ… Check features in adata finished [0.0318s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is anti_logp2\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0015s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> πŸ•’ Processing clock: mammalianlifespan\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalianlifespan.pt\n", + "|-----------> βœ… Load clock finished [0.4468s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalianlifespan]\n", + "|-----------> βœ… Check features in adata finished [0.0127s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There 
is no preprocessing necessary\n", + "|-----------------> There is no postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0006s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0007s]\n", + "|-----> πŸ•’ Processing clock: mammalianfemale\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalianfemale.pt\n", + "|-----------> βœ… Load clock finished [0.4320s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalianfemale]\n", + "|-----------> βœ… Check features in adata finished [0.0095s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is sigmoid\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0015s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0005s]\n", + "|-----> πŸŽ‰ Done! [1.6454s]\n" + ] + } + ], + "source": [ + "pya.pred.predict_age(adata, ['Mammalian1', 'MammalianLifespan', 'MammalianFemale'])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "98fbdf4c-57c2-4885-bc4a-96b4771a638b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mammalian1mammalianlifespanmammalianfemale
204509080002_R01C0226.37243793.8860670.994351
202897220142_R04C021.1765866.9991760.991473
204529320092_R01C0218.77643873.3351190.008419
202794570004_R02C010.8909735.3326150.941965
203531420070_R05C0210.37131568.4093310.009133
\n", + "
" + ], + "text/plain": [ + " mammalian1 mammalianlifespan mammalianfemale\n", + "204509080002_R01C02 26.372437 93.886067 0.994351\n", + "202897220142_R04C02 1.176586 6.999176 0.991473\n", + "204529320092_R01C02 18.776438 73.335119 0.008419\n", + "202794570004_R02C01 0.890973 5.332615 0.941965\n", + "203531420070_R05C02 10.371315 68.409331 0.009133" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "9f6620b4-3a88-4cfd-9f4b-0cbc8047adf9", + "metadata": {}, + "source": [ + "Having so much information printed can be overwhelming, particularly when running several clocks at once. In such cases, just set verbose to False." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "64035819-8dd0-4917-96ed-55c36ef34a66", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mammalian1mammalianlifespanmammalianfemale
204509080002_R01C0226.37243793.8860670.994351
202897220142_R04C021.1765866.9991760.991473
204529320092_R01C0218.77643873.3351190.008419
202794570004_R02C010.8909735.3326150.941965
203531420070_R05C0210.37131568.4093310.009133
\n", + "
" + ], + "text/plain": [ + " mammalian1 mammalianlifespan mammalianfemale\n", + "204509080002_R01C02 26.372437 93.886067 0.994351\n", + "202897220142_R04C02 1.176586 6.999176 0.991473\n", + "204529320092_R01C02 18.776438 73.335119 0.008419\n", + "202794570004_R02C01 0.890973 5.332615 0.941965\n", + "203531420070_R05C02 10.371315 68.409331 0.009133" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pya.data.download_example_data('GSE223748', verbose=False)\n", + "df = pd.read_pickle('pyaging_data/GSE223748_subset.pkl')\n", + "adata = pya.preprocess.df_to_adata(df, imputer_strategy='knn', verbose=False)\n", + "pya.pred.predict_age(adata, ['Mammalian1', 'MammalianLifespan', 'MammalianFemale'], verbose=False)\n", + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b5d3a683-2ff4-436b-9a90-34a138a98a14", + "metadata": {}, + "source": [ + "After age prediction, the clocks are added to `adata.obs`. Moreover, the percent of missing values for each clock and other metadata are included in `adata.uns`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0d069189-1750-42f6-89d9-73039dd07a00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs Γ— n_vars = 100 Γ— 37554\n", + " obs: 'mammalian1', 'mammalianlifespan', 'mammalianfemale'\n", + " var: 'percent_na'\n", + " uns: 'mammalian1_percent_na', 'mammalian1_missing_features', 'mammalian1_metadata', 'mammalianlifespan_percent_na', 'mammalianlifespan_missing_features', 'mammalianlifespan_metadata', 'mammalianfemale_percent_na', 'mammalianfemale_missing_features', 'mammalianfemale_metadata'\n", + " layers: 'X_original'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "bd975ebd-88aa-49f5-889e-43c69eb79afc", + "metadata": {}, + "source": [ + "### Mammalian predictors with species declaration" + ] + }, + { + "cell_type": "markdown", + "id": "1ec840ce-2286-45a3-989e-d7a36469e1cd", + "metadata": {}, + "source": [ + "Mammalian2 and mammalian3 types of clocks require species declaration for the reverse transformation of the output into units of years. For the mammalian2 clocks, there are 1756 species in the dictionary with the available variables for reverse transformation; for the mammalian3, there are 1707 species. By default, Homo sapiens is the chosen species.\n", + "\n", + "Let's first have a look at the species that can be used for these clocks by loading the models themselves." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "be99c22b-dc46-419b-b415-1c024b5e35d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> βš™οΈ Load clock started\n", + "|-----------> Data found in pyaging_data/mammalian2.pt\n", + "|-----> βœ… Load clock finished [0.4540s]\n", + "|-----> βš™οΈ Load clock started\n", + "|-----------> Data found in pyaging_data/mammalian3.pt\n", + "|-----> βœ… Load clock finished [0.4946s]\n" + ] + } + ], + "source": [ + "logger = pya.logger.Logger('test_logger')\n", + "device = 'cpu'\n", + "dir = 'pyaging_data'\n", + "indent_level = 1\n", + "\n", + "mammalian2_model = pya.pred.load_clock('Mammalian2', device, dir, logger, indent_level=indent_level)\n", + "mammalian3_model = pya.pred.load_clock('Mammalian3', device, dir, logger, indent_level=indent_level)" + ] + }, + { + "cell_type": "markdown", + "id": "2f6a5ec7-a9b2-44fc-af5e-d14ae910b935", + "metadata": {}, + "source": [ + "We need to filter the features for the ones that are not CpG sites." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a8e37387-a32f-4e2f-98b0-22000792c565", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 1756 species Latin name features in mammalian2.\n", + "There are 1707 species Latin name features in mammalian3.\n" + ] + } + ], + "source": [ + "mammalian2_species = [feature for feature in mammalian2_model.features if feature[0:2] != 'cg']\n", + "mammalian3_species = [feature for feature in mammalian3_model.features if feature[0:2] != 'cg']\n", + "print(f\"There are {len(mammalian2_species)} species Latin name features in mammalian2.\")\n", + "print(f\"There are {len(mammalian3_species)} species Latin name features in mammalian3.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7a86f159-ccf2-4dbc-93b2-ea76558a1f81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Anaxyrus americanus',\n", + " 'Anaxyrus boreas',\n", + " 'Anaxyrus canorus',\n", + " 'Anaxyrus cognatus',\n", + " 'Anaxyrus retiformis',\n", + " 'Anaxyrus terrestris',\n", + " 'Rhinella marina',\n", + " 'Dendrobates auratus',\n", + " 'Dendrobates leucomelas',\n", + " 'Hyla chrysoscelis']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mammalian2_species[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8e125e7a-7f8f-4360-9d69-be6815404128", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Anaxyrus americanus',\n", + " 'Anaxyrus boreas',\n", + " 'Anaxyrus canorus',\n", + " 'Anaxyrus cognatus',\n", + " 'Rhinella marina',\n", + " 'Dendrobates auratus',\n", + " 'Dendrobates leucomelas',\n", + " 'Phyllobates vittatus',\n", + " 'Hyla chrysoscelis',\n", + " 'Hyla versicolor']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mammalian3_species[0:10]" + ] + }, + { +
"cell_type": "markdown", + "id": "498ca26d-ab18-48d8-96a5-4c3915433a05", + "metadata": {}, + "source": [ + "To choose a species, simply add the Latin name as a feature with value 1. In this subset version of the GSE223748 dataset, the species names are not available. Therefore, let's use the naked mole rat (Heterocephalus glaber) as our species. \n", + "\n", + "Let's first check that it is available in the clocks." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "47908607-99b8-411e-a91c-e4829cb2781c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'Heterocephalus glaber' in mammalian2_species" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6bf84b35-2091-4297-892a-d7addb7badc6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'Heterocephalus glaber' in mammalian3_species" + ] + }, + { + "cell_type": "markdown", + "id": "4f1e4c93-397a-401e-948f-b42fbb62f222", + "metadata": {}, + "source": [ + "Then, let's add it as a feature to the pandas dataframe and create a new adata object." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "09259244-661c-475c-bc9d-2168788f6226", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting df_to_adata function\n", + "|-----> ⚙️ Create anndata object started\n", + "|-----> ✅ Create anndata object finished [0.0291s]\n", + "|-----> ⚙️ Add metadata to anndata started\n", + "|-----------? No metadata provided.
Leaving adata.obs empty\n", + "|-----> ⚠️ Add metadata to anndata finished [0.0006s]\n", + "|-----> βš™οΈ Log data statistics started\n", + "|-----------> There are 100 observations\n", + "|-----------> There are 37555 features\n", + "|-----------> Total missing values: 0\n", + "|-----------> Percentage of missing values: 0.00%\n", + "|-----> βœ… Log data statistics finished [0.0045s]\n", + "|-----> βš™οΈ Impute missing values started\n", + "|-----------> No missing values found. No imputation necessary\n", + "|-----> βœ… Impute missing values finished [0.0040s]\n", + "|-----> πŸŽ‰ Done! [0.0415s]\n" + ] + } + ], + "source": [ + "df['Heterocephalus glaber'] = 1\n", + "adata = pya.pp.df_to_adata(df, imputer_strategy='knn')" + ] + }, + { + "cell_type": "markdown", + "id": "bd588d20-a333-4775-9862-75c28cecff51", + "metadata": {}, + "source": [ + "Finally, let's make the predictions using the multi-tissue mammalian2 and mammalian3 clocks plus the blood-specific and skin-specific versions." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f3f2a5a2-e546-4cbb-97ad-df8714b138c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting predict_age function\n", + "|-----> βš™οΈ Set PyTorch device started\n", + "|-----------> Using device: cpu\n", + "|-----> βœ… Set PyTorch device finished [0.0013s]\n", + "|-----> πŸ•’ Processing clock: mammalian2\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalian2.pt\n", + "|-----------> βœ… Load clock finished [0.4827s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
1755 out of 2572 features (68.23%) are missing: ['Anaxyrus americanus', 'Anaxyrus boreas', 'Anaxyrus canorus'], etc.\n", + "|-----------------> Using reference feature values for mammalian2\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalian2]\n", + "|-----------> ⚠️ Check features in adata finished [0.0556s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mammalian2\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0042s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0007s]\n", + "|-----> πŸ•’ Processing clock: mammalianskin2\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalianskin2.pt\n", + "|-----------> βœ… Load clock finished [0.4283s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
1755 out of 2240 features (78.35%) are missing: ['Anaxyrus americanus', 'Anaxyrus boreas', 'Anaxyrus canorus'], etc.\n", + "|-----------------> Using reference feature values for mammalianskin2\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalianskin2]\n", + "|-----------> ⚠️ Check features in adata finished [0.0447s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mammalian2\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0022s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> πŸ•’ Processing clock: mammalianblood2\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalianblood2.pt\n", + "|-----------> βœ… Load clock finished [0.4847s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
1755 out of 2257 features (77.76%) are missing: ['Anaxyrus americanus', 'Anaxyrus boreas', 'Anaxyrus canorus'], etc.\n", + "|-----------------> Using reference feature values for mammalianblood2\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalianblood2]\n", + "|-----------> ⚠️ Check features in adata finished [0.0799s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mammalian2\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0013s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0005s]\n", + "|-----> πŸ•’ Processing clock: mammalian3\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalian3.pt\n", + "|-----------> βœ… Load clock finished [0.5006s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
1706 out of 2467 features (69.15%) are missing: ['Anaxyrus americanus', 'Anaxyrus boreas', 'Anaxyrus canorus'], etc.\n", + "|-----------------> Using reference feature values for mammalian3\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalian3]\n", + "|-----------> ⚠️ Check features in adata finished [0.1086s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mammalian3\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0059s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0005s]\n", + "|-----> πŸ•’ Processing clock: mammalianskin3\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalianskin3.pt\n", + "|-----------> βœ… Load clock finished [0.5248s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
1706 out of 2055 features (83.02%) are missing: ['Anaxyrus americanus', 'Anaxyrus boreas', 'Anaxyrus canorus'], etc.\n", + "|-----------------> Using reference feature values for mammalianskin3\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalianskin3]\n", + "|-----------> ⚠️ Check features in adata finished [0.0743s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mammalian3\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0024s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0007s]\n", + "|-----> πŸ•’ Processing clock: mammalianblood3\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalianblood3.pt\n", + "|-----------> βœ… Load clock finished [0.4440s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
1706 out of 2097 features (81.35%) are missing: ['Anaxyrus americanus', 'Anaxyrus boreas', 'Anaxyrus canorus'], etc.\n", + "|-----------------> Using reference feature values for mammalianblood3\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalianblood3]\n", + "|-----------> ⚠️ Check features in adata finished [0.0856s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mammalian3\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0024s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> πŸŽ‰ Done! [3.6574s]\n" + ] + } + ], + "source": [ + "pya.pred.predict_age(adata, ['Mammalian2', 'MammalianSkin2', 'MammalianBlood2', 'Mammalian3', 'MammalianSkin3', 'MammalianBlood3'])" + ] + }, + { + "cell_type": "markdown", + "id": "2b582309-e03a-4906-a35d-7bd556f9f5e7", + "metadata": {}, + "source": [ + "During age prediction, if the other species are not present in the input data, they will show up as missing features and the value will be automatically replaced with 0. Therefore, those missing features are not necessarily CpG sites. To double check, one can simply go to the adata.uns to check for missing features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "48d9c49e-ca16-43c8-8919-2e37ba092a56", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Anaxyrus americanus',\n", + " 'Anaxyrus boreas',\n", + " 'Anaxyrus canorus',\n", + " 'Anaxyrus cognatus',\n", + " 'Anaxyrus retiformis',\n", + " 'Anaxyrus terrestris',\n", + " 'Rhinella marina',\n", + " 'Dendrobates auratus',\n", + " 'Dendrobates leucomelas',\n", + " 'Hyla chrysoscelis']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['mammalian2_missing_features'][0:10]" + ] + }, + { + "cell_type": "markdown", + "id": "d6175d3a-c2a6-449f-b1c7-60e334b3cf7f", + "metadata": {}, + "source": [ + "Finally, let's look at the predictions." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "bde7b46e-98ea-479c-a08c-f91483e60371", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mammalian2mammalianskin2mammalianblood2mammalian3mammalianskin3mammalianblood3
204509080002_R01C0214.9463356.56278015.3218109.2114764.32820813.111774
202897220142_R04C0217.0371823.9665184.29304817.7375730.9004704.240264
204529320092_R01C0212.06534713.39264314.4833155.9504737.0482465.498531
202794570004_R02C0115.26356919.4513867.14874315.87087814.48329011.685419
203531420070_R05C026.6894906.8098017.6021414.1060292.0401046.293626
.....................
205128010037_R03C0222.69813822.94806225.32632114.43279914.01326721.264321
206116820044_R06C0211.1460129.92755216.8131755.0922384.3678786.303160
203203210055_R03C022.2204053.1146507.2040621.3237261.7197786.897330
203203210003_R06C0221.67213628.80066121.53650216.23194920.17706224.678961
204027420026_R03C024.9252576.6567135.4819184.3500994.8502693.033646
\n", + "

100 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " mammalian2 mammalianskin2 mammalianblood2 mammalian3 \\\n", + "204509080002_R01C02 14.946335 6.562780 15.321810 9.211476 \n", + "202897220142_R04C02 17.037182 3.966518 4.293048 17.737573 \n", + "204529320092_R01C02 12.065347 13.392643 14.483315 5.950473 \n", + "202794570004_R02C01 15.263569 19.451386 7.148743 15.870878 \n", + "203531420070_R05C02 6.689490 6.809801 7.602141 4.106029 \n", + "... ... ... ... ... \n", + "205128010037_R03C02 22.698138 22.948062 25.326321 14.432799 \n", + "206116820044_R06C02 11.146012 9.927552 16.813175 5.092238 \n", + "203203210055_R03C02 2.220405 3.114650 7.204062 1.323726 \n", + "203203210003_R06C02 21.672136 28.800661 21.536502 16.231949 \n", + "204027420026_R03C02 4.925257 6.656713 5.481918 4.350099 \n", + "\n", + " mammalianskin3 mammalianblood3 \n", + "204509080002_R01C02 4.328208 13.111774 \n", + "202897220142_R04C02 0.900470 4.240264 \n", + "204529320092_R01C02 7.048246 5.498531 \n", + "202794570004_R02C01 14.483290 11.685419 \n", + "203531420070_R05C02 2.040104 6.293626 \n", + "... ... ... \n", + "205128010037_R03C02 14.013267 21.264321 \n", + "206116820044_R06C02 4.367878 6.303160 \n", + "203203210055_R03C02 1.719778 6.897330 \n", + "203203210003_R06C02 20.177062 24.678961 \n", + "204027420026_R03C02 4.850269 3.033646 \n", + "\n", + "[100 rows x 6 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "333ff256-996c-4f41-8260-fffb5b248513", + "metadata": {}, + "source": [ + "Out of curiosity, let's check the correlation between the clocks." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "165d709e-73ce-4ccb-a0b2-25e485f16d92", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mammalian2mammalianskin2mammalianblood2mammalian3mammalianskin3mammalianblood3
mammalian21.0000000.6589340.6093920.9101190.6111790.717135
mammalianskin20.6589341.0000000.4182900.6074720.9003050.594863
mammalianblood20.6093920.4182901.0000000.4635040.3881970.754871
mammalian30.9101190.6074720.4635041.0000000.6750220.729099
mammalianskin30.6111790.9003050.3881970.6750221.0000000.646473
mammalianblood30.7171350.5948630.7548710.7290990.6464731.000000
\n", + "
" + ], + "text/plain": [ + " mammalian2 mammalianskin2 mammalianblood2 mammalian3 \\\n", + "mammalian2 1.000000 0.658934 0.609392 0.910119 \n", + "mammalianskin2 0.658934 1.000000 0.418290 0.607472 \n", + "mammalianblood2 0.609392 0.418290 1.000000 0.463504 \n", + "mammalian3 0.910119 0.607472 0.463504 1.000000 \n", + "mammalianskin3 0.611179 0.900305 0.388197 0.675022 \n", + "mammalianblood3 0.717135 0.594863 0.754871 0.729099 \n", + "\n", + " mammalianskin3 mammalianblood3 \n", + "mammalian2 0.611179 0.717135 \n", + "mammalianskin2 0.900305 0.594863 \n", + "mammalianblood2 0.388197 0.754871 \n", + "mammalian3 0.675022 0.729099 \n", + "mammalianskin3 1.000000 0.646473 \n", + "mammalianblood3 0.646473 1.000000 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.corr('pearson')" + ] + }, + { + "cell_type": "markdown", + "id": "7931df39-6108-408a-9214-829a8da9da9a", + "metadata": {}, + "source": [ + "Again, having so much information printed can be overwhelming, particularly when running several clocks at once. In such cases, just set verbose to False." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "6f00d62f-2f2c-4e90-bd2b-bd409b0131de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mammalian2mammalianskin2mammalianblood2mammalian3mammalianskin3mammalianblood3
204509080002_R01C0214.9463356.56278015.3218109.2114764.32820813.111774
202897220142_R04C0217.0371823.9665184.29304817.7375730.9004704.240264
204529320092_R01C0212.06534713.39264314.4833155.9504737.0482465.498531
202794570004_R02C0115.26356919.4513867.14874315.87087814.48329011.685419
203531420070_R05C026.6894906.8098017.6021414.1060292.0401046.293626
\n", + "
" + ], + "text/plain": [ + " mammalian2 mammalianskin2 mammalianblood2 mammalian3 \\\n", + "204509080002_R01C02 14.946335 6.562780 15.321810 9.211476 \n", + "202897220142_R04C02 17.037182 3.966518 4.293048 17.737573 \n", + "204529320092_R01C02 12.065347 13.392643 14.483315 5.950473 \n", + "202794570004_R02C01 15.263569 19.451386 7.148743 15.870878 \n", + "203531420070_R05C02 6.689490 6.809801 7.602141 4.106029 \n", + "\n", + " mammalianskin3 mammalianblood3 \n", + "204509080002_R01C02 4.328208 13.111774 \n", + "202897220142_R04C02 0.900470 4.240264 \n", + "204529320092_R01C02 7.048246 5.498531 \n", + "202794570004_R02C01 14.483290 11.685419 \n", + "203531420070_R05C02 2.040104 6.293626 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pya.data.download_example_data('GSE223748', verbose=False)\n", + "df = pd.read_pickle('pyaging_data/GSE223748_subset.pkl')\n", + "df['Heterocephalus glaber'] = 1\n", + "adata = pya.preprocess.df_to_adata(df, imputer_strategy='knn', verbose=False)\n", + "pya.pred.predict_age(adata, ['Mammalian2', 'MammalianSkin2', 'MammalianBlood2', 'Mammalian3', 'MammalianSkin3', 'MammalianBlood3'], verbose=False)\n", + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "61ad69dd-9e91-447a-80f9-95647a19a082", + "metadata": {}, + "source": [ + "## Get citation" + ] + }, + { + "cell_type": "markdown", + "id": "f2fd36d5-c059-4680-b300-f8e9344186cd", + "metadata": {}, + "source": [ + "The DOI, citation, and some metadata are automatically added to the AnnData object under `adata.uns[CLOCKNAME_metadata]`." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "300b40ad-68e7-49b3-a7f8-66c64436c80f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'mammalian2',\n", + " 'data_type': 'methylation',\n", + " 'species': 'multi',\n", + " 'year': 2023,\n", + " 'approved_by_author': '⌛',\n", + " 'citation': 'Lu, A. T., et al. 
\"Universal DNA methylation age across mammalian tissues.\" Nature aging 3.9 (2023): 1144-1166.',\n", + " 'doi': 'https://doi.org/10.1038/s43587-023-00462-6',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['mammalian2_metadata']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/tutorials/tutorial_dnam_rrbs.ipynb b/docs/source/tutorials/tutorial_dnam_rrbs.ipynb new file mode 100644 index 0000000..664c0ea --- /dev/null +++ b/docs/source/tutorials/tutorial_dnam_rrbs.ipynb @@ -0,0 +1,1656 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4e690b3c-4dec-450e-a7f8-f63987e60cdb", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_dnam_rrbs.ipynb) [![Open In nbviewer](https://img.shields.io/badge/View%20in-nbviewer-orange)](https://nbviewer.jupyter.org/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_dnam_rrbs.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "62e1ac68-927d-4ca8-a2ab-bd99a7ee52ab", + "metadata": {}, + "source": [ + "# RRBS DNA methylation" + ] + }, + { + "cell_type": "markdown", + "id": "9552602a-777c-42a5-900a-41c85096c3d8", + "metadata": {}, + "source": [ + "This tutorial focuses on predicting age from Mus musculus reduced-representation bisulfite sequencing (RRBS) data. There are a few clocks available that were trained on RRBS data. 
Moreover, it is possible to use Horvath's mammalian clocks by converting the genomic location to the probes in the Horvath methylation array." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dc347ae0-41ae-46ac-ba50-08cacd4c9241", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pyaging as pya\n", + "import os\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "5e620ab9-4837-4a7a-83f1-726be9c9f7bf", + "metadata": {}, + "source": [ + "## Download and load example data" + ] + }, + { + "cell_type": "markdown", + "id": "5ff79235-46fb-4c59-a629-1f479f9f13a3", + "metadata": {}, + "source": [ + "Let's download the publicly available dataset GSE130735 with RRBS samples from mouse. Given it is RRBS, there are millions of CpG sites." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "332d96c6-6b12-4cd1-b216-c32ce21673b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting download_example_data function\n", + "|-----------> Data found in pyaging_data/GSE130735_subset.pkl\n", + "|-----> 🎉 Done! [0.5425s]\n" + ] + } + ], + "source": [ + "pya.data.download_example_data('GSE130735')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2a6fcd90-8ded-40d5-a606-e32e21816ebf", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle('pyaging_data/GSE130735_subset.pkl')" + ] + }, + { + "cell_type": "markdown", + "id": "df8ea1a3-313f-42bc-aeef-ec5349975b80", + "metadata": {}, + "source": [ + "It is important to note that the features for RRBS clocks are the genomic coordinates in the format below." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c4d8245b-8d04-4ae0-945d-3aed4956a3bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chr1:3020814chr1:3020842chr1:3020877chr1:3020891chr1:3020945chr1:3020971chr1:3020987chr1:3021012chr1:3037802chr1:3037820...chrY:1825397chrY:4682362chrY:32122892chrY:85867071chrY:85867083chrY:85867117chrY:85867137chrY:85867139chrY:85867178chrY:88224179
GSM37526310.6090.250.4080.1890.0680.3730.5710.2520.3330.158...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GSM3752625NaNNaN0.9730.9840.9120.9150.9870.9740.9910.932...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GSM3752634NaNNaN0.5260.1310.0000.0380.4690.7690.7720.146...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GSM37526200.9310.920.9880.9490.8970.9210.9070.9581.0000.867...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GSM3752622NaNNaN0.2050.3820.0910.1320.1740.2270.1080.053...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 1778324 columns

\n", + "
" + ], + "text/plain": [ + " chr1:3020814 chr1:3020842 chr1:3020877 chr1:3020891 \\\n", + "GSM3752631 0.609 0.25 0.408 0.189 \n", + "GSM3752625 NaN NaN 0.973 0.984 \n", + "GSM3752634 NaN NaN 0.526 0.131 \n", + "GSM3752620 0.931 0.92 0.988 0.949 \n", + "GSM3752622 NaN NaN 0.205 0.382 \n", + "\n", + " chr1:3020945 chr1:3020971 chr1:3020987 chr1:3021012 \\\n", + "GSM3752631 0.068 0.373 0.571 0.252 \n", + "GSM3752625 0.912 0.915 0.987 0.974 \n", + "GSM3752634 0.000 0.038 0.469 0.769 \n", + "GSM3752620 0.897 0.921 0.907 0.958 \n", + "GSM3752622 0.091 0.132 0.174 0.227 \n", + "\n", + " chr1:3037802 chr1:3037820 ... chrY:1825397 chrY:4682362 \\\n", + "GSM3752631 0.333 0.158 ... NaN NaN \n", + "GSM3752625 0.991 0.932 ... NaN NaN \n", + "GSM3752634 0.772 0.146 ... NaN NaN \n", + "GSM3752620 1.000 0.867 ... NaN NaN \n", + "GSM3752622 0.108 0.053 ... NaN NaN \n", + "\n", + " chrY:32122892 chrY:85867071 chrY:85867083 chrY:85867117 \\\n", + "GSM3752631 NaN NaN NaN NaN \n", + "GSM3752625 NaN NaN NaN NaN \n", + "GSM3752634 NaN NaN NaN NaN \n", + "GSM3752620 NaN NaN NaN NaN \n", + "GSM3752622 NaN NaN NaN NaN \n", + "\n", + " chrY:85867137 chrY:85867139 chrY:85867178 chrY:88224179 \n", + "GSM3752631 NaN NaN NaN NaN \n", + "GSM3752625 NaN NaN NaN NaN \n", + "GSM3752634 NaN NaN NaN NaN \n", + "GSM3752620 NaN NaN NaN NaN \n", + "GSM3752622 NaN NaN NaN NaN \n", + "\n", + "[5 rows x 1778324 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ea7c44d7-73c3-4cd7-844d-bab34aa2dcee", + "metadata": {}, + "source": [ + "## Convert data to AnnData object" + ] + }, + { + "cell_type": "markdown", + "id": "04f2758f-fb8c-4a52-983a-29ec826dba6c", + "metadata": {}, + "source": [ + "AnnData objects are highly flexible and are thus our preferred method of organizing data for age prediction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "682f7e04-22a6-4561-b389-c8f336f19862", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting df_to_adata function\n", + "|-----> ⚙️ Create anndata object started\n", + "|-----> ✅ Create anndata object finished [0.9882s]\n", + "|-----> ⚙️ Add metadata to anndata started\n", + "|-----------? No metadata provided. Leaving adata.obs empty\n", + "|-----> ⚠️ Add metadata to anndata finished [0.0006s]\n", + "|-----> ⚙️ Log data statistics started\n", + "|-----------> There are 14 observations\n", + "|-----------> There are 1778324 features\n", + "|-----------> Total missing values: 6322346\n", + "|-----------> Percentage of missing values: 25.39%\n", + "|-----> ✅ Log data statistics finished [0.0205s]\n", + "|-----> ⚙️ Impute missing values started\n", + "|-----------> Imputing missing values using mean strategy\n", + "|-----> ✅ Impute missing values finished [0.4631s]\n", + "|-----> ⚙️ Add imputer strategy to adata.uns started\n", + "|-----> ✅ Add imputer strategy to adata.uns finished [0.0087s]\n", + "|-----> 🎉 Done! [1.4897s]\n" + ] + } + ], + "source": [ + "adata = pya.pp.df_to_adata(df, imputer_strategy='mean') # knn might be a bit slow" + ] + }, + { + "cell_type": "markdown", + "id": "7349164c-f28b-4222-bf41-6f80d8b79c3b", + "metadata": {}, + "source": [ + "This is what the `adata` object looks like:" + ] + }, + { + "cell_type": "markdown", + "id": "4b5ff1ef-e724-407a-b6d4-9907558f21ba", + "metadata": {}, + "source": [ + "## Predict age with RRBS clocks" + ] + }, + { + "cell_type": "markdown", + "id": "eb197ded-91dd-4319-8dbb-a635d09c8367", + "metadata": {}, + "source": [ + "We can either predict one clock at a time or all at the same time. For convenience, let's simply input all four available RRBS clocks at once. The function is invariant to the capitalization of the clock name."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cdd18ee5-9af1-404e-80f6-42a83685273e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting predict_age function\n", + "|-----> ⚙️ Set PyTorch device started\n", + "|-----------> Using device: cpu\n", + "|-----> ✅ Set PyTorch device finished [0.0033s]\n", + "|-----> 🕒 Processing clock: thompson\n", + "|-----------> ⚙️ Load clock started\n", + "|-----------------> Data found in pyaging_data/thompson.pt\n", + "|-----------> ✅ Load clock finished [0.5324s]\n", + "|-----------> ⚙️ Check features in adata started\n", + "|-----------------? 1 out of 582 features (0.17%) are missing: ['chr4:91376687'], etc.\n", + "|-----------------> Filling missing features entirely with 0\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_thompson]\n", + "|-----------> ⚠️ Check features in adata finished [0.0654s]\n", + "|-----------> ⚙️ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> There is no postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> ✅ Predict ages with model finished [0.0013s]\n", + "|-----------> ⚙️ Add predicted ages and clock metadata to adata started\n", + "|-----------> ✅ Add predicted ages and clock metadata to adata finished [0.0008s]\n", + "|-----> 🕒 Processing clock: meer\n", + "|-----------> ⚙️ Load clock started\n", + "|-----------------> Data found in pyaging_data/meer.pt\n", + "|-----------> ✅ Load clock finished [0.4402s]\n", + "|-----------> ⚙️ Check features in adata started\n", + "|-----------------? 
225 out of 435 features (51.72%) are missing: ['chr10:111559529', 'chr10:115250413', 'chr10:127620127'], etc.\n", + "|-----------------> Filling missing features entirely with 0\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_meer]\n", + "|-----------> ⚠️ Check features in adata finished [0.0412s]\n", + "|-----------> ⚙️ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> There is no postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> ✅ Predict ages with model finished [0.0010s]\n", + "|-----------> ⚙️ Add predicted ages and clock metadata to adata started\n", + "|-----------> ✅ Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> 🕒 Processing clock: petkovich\n", + "|-----------> ⚙️ Load clock started\n", + "|-----------------> Data found in pyaging_data/petkovich.pt\n", + "|-----------> ✅ Load clock finished [0.5167s]\n", + "|-----------> ⚙️ Check features in adata started\n", + "|-----------------? 
58 out of 90 features (64.44%) are missing: ['chr19:23893237', 'chr18:45589182', 'chr16:10502162'], etc.\n", + "|-----------------> Filling missing features entirely with 0\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_petkovich]\n", + "|-----------> ⚠️ Check features in adata finished [0.0161s]\n", + "|-----------> ⚙️ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is petkovich\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> ✅ Predict ages with model finished [0.0033s]\n", + "|-----------> ⚙️ Add predicted ages and clock metadata to adata started\n", + "|-----------> ✅ Add predicted ages and clock metadata to adata finished [0.0020s]\n", + "|-----> 🕒 Processing clock: stubbs\n", + "|-----------> ⚙️ Load clock started\n", + "|-----------------> Data found in pyaging_data/stubbs.pt\n", + "|-----------> ✅ Load clock finished [0.4679s]\n", + "|-----------> ⚙️ Check features in adata started\n", + "|-----------------? 8889 out of 17992 features (49.41%) are missing: ['chr1:10038066', 'chr1:106173313', 'chr1:106759301'], etc.\n", + "|-----------------> Using reference feature values for stubbs\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_stubbs]\n", + "|-----------> ⚠️ Check features in adata finished [0.8672s]\n", + "|-----------> ⚙️ Predict ages with model started\n", + "|-----------------> The preprocessing method is quantile_normalization_and_scale_with_gold_standard\n", + "|-----------------> The postprocessing method is stubbs\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> ✅ Predict ages with model finished [0.0263s]\n", + "|-----------> ⚙️ Add predicted ages and clock metadata to adata started\n", + "|-----------> ✅ Add predicted ages and clock metadata to adata finished [0.0014s]\n", + "|-----> 🎉 Done! 
[3.2757s]\n" + ] + } + ], + "source": [ + "pya.pred.predict_age(adata, ['Thompson', 'Meer', 'Petkovich', 'Stubbs'])" + ] + }, + { + "cell_type": "markdown", + "id": "3978afec-40a0-4e1f-8ff6-1d048da8a894", + "metadata": {}, + "source": [ + "All of the age predictions are in units of months." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b04572f9-23dd-4eb1-8e84-16a9b25c2d6a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
thompsonmeerpetkovichstubbs
GSM375263119.6341137.3151838.0751770.957770
GSM3752625-1.4104610.0282212.953822-0.074265
GSM375263461.05878321.3221789.6404891.389193
GSM3752620-2.6638151.6119473.019351-0.092710
GSM375262220.5941147.5921457.1047660.667168
\n", + "
" + ], + "text/plain": [ + " thompson meer petkovich stubbs\n", + "GSM3752631 19.634113 7.315183 8.075177 0.957770\n", + "GSM3752625 -1.410461 0.028221 2.953822 -0.074265\n", + "GSM3752634 61.058783 21.322178 9.640489 1.389193\n", + "GSM3752620 -2.663815 1.611947 3.019351 -0.092710\n", + "GSM3752622 20.594114 7.592145 7.104766 0.667168" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4bb259c5-2cba-4dc1-b123-2387a5bb7749", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
thompsonmeerpetkovichstubbs
GSM375263119.6341137.3151838.0751770.957770
GSM3752625-1.4104610.0282212.953822-0.074265
GSM375263461.05878321.3221789.6404891.389193
GSM3752620-2.6638151.6119473.019351-0.092710
GSM375262220.5941147.5921457.1047660.667168
\n", + "
" + ], + "text/plain": [ + " thompson meer petkovich stubbs\n", + "GSM3752631 19.634113 7.315183 8.075177 0.957770\n", + "GSM3752625 -1.410461 0.028221 2.953822 -0.074265\n", + "GSM3752634 61.058783 21.322178 9.640489 1.389193\n", + "GSM3752620 -2.663815 1.611947 3.019351 -0.092710\n", + "GSM3752622 20.594114 7.592145 7.104766 0.667168" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "8519affc-ffc8-4904-ad7a-bd6a6d6458cf", + "metadata": {}, + "source": [ + "Having so much information printed can be overwhelming, particularly when running several clocks at once. In such cases, just set verbose to False." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "18b44cfa-36d5-49c9-badf-7ba9e189bbc0", + "metadata": {}, + "outputs": [], + "source": [ + "pya.data.download_example_data('GSE130735', verbose=False)\n", + "df = pd.read_pickle('pyaging_data/GSE130735_subset.pkl')\n", + "adata = pya.preprocess.df_to_adata(df, imputer_strategy='mean', verbose=False)\n", + "pya.pred.predict_age(adata, ['Thompson', 'Meer', 'Petkovich', 'Stubbs'], verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a2520978-b693-474f-88cf-91bcde1a5d95", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
thompsonmeerpetkovichstubbs
GSM375263119.6341137.3151838.0751770.957770
GSM3752625-1.4104610.0282212.953822-0.074265
GSM375263461.05878321.3221789.6404891.389193
GSM3752620-2.6638151.6119473.019351-0.092710
GSM375262220.5941147.5921457.1047660.667168
\n", + "
" + ], + "text/plain": [ + " thompson meer petkovich stubbs\n", + "GSM3752631 19.634113 7.315183 8.075177 0.957770\n", + "GSM3752625 -1.410461 0.028221 2.953822 -0.074265\n", + "GSM3752634 61.058783 21.322178 9.640489 1.389193\n", + "GSM3752620 -2.663815 1.611947 3.019351 -0.092710\n", + "GSM3752622 20.594114 7.592145 7.104766 0.667168" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "33119798-f1b3-4c4c-9f18-e4e4b7ca21e8", + "metadata": {}, + "source": [ + "After age prediction, the clocks are added to `adata.obs`. Moreover, the percent of missing values for each clock and other metadata are included in `adata.uns`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "92cfc16e-71ff-4767-9c75-04e52455eb6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 14 × 1778324\n", + " obs: 'thompson', 'meer', 'petkovich', 'stubbs'\n", + " var: 'percent_na'\n", + " uns: 'imputer_strategy', 'thompson_percent_na', 'thompson_missing_features', 'thompson_metadata', 'meer_percent_na', 'meer_missing_features', 'meer_metadata', 'petkovich_percent_na', 'petkovich_missing_features', 'petkovich_metadata', 'stubbs_percent_na', 'stubbs_missing_features', 'stubbs_metadata'\n", + " layers: 'X_original', 'X_imputed'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "569b1ef8-dd55-45e4-8792-e919dc207808", + "metadata": {}, + "source": [ + "## Predict age with mammalian clocks" + ] + }, + { + "cell_type": "markdown", + "id": "b9266699-881f-41c7-9a13-c38307527bfa", + "metadata": {}, + "source": [ + "We can predict age by converting the genomic locations directly into the probes from Horvath's methylation array. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a5f71a02-cbc0-4cd0-a123-e7226307ae84", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.system('git clone https://github.com/shorvath/MammalianMethylationConsortium.git')\n", + "\n", + "# Let's read the manifest from the mammalian consortium\n", + "annotation_df = pd.read_csv('MammalianMethylationConsortium/Annotations, Amin Haghani/Mammals/Mus_musculus.grcm38.100.HorvathMammalMethylChip40.v1.csv', index_col=0)\n", + "annotation_df = annotation_df[~annotation_df.seqnames.isna()]\n", + "mm_genomic_locations = 'chr' + annotation_df['seqnames'].astype(str) + ':' + annotation_df['CGstart'].astype(int).astype(str)\n", + "mm_genomic_locations = mm_genomic_locations.tolist()\n", + "mammalian_probes = annotation_df['CGid'].tolist()\n", + "mm_loc_to_probe = dict(zip(mm_genomic_locations, mammalian_probes))\n", + "\n", + "# Let's get the previous RRBS dataset and filter only for the genomic locations in the manifest file\n", + "df_columns_set = set(df.columns)\n", + "mm_loc_to_probe_set = set(mm_loc_to_probe.keys())\n", + "common_columns = df_columns_set.intersection(mm_loc_to_probe_set)\n", + "df_converted = df[list(common_columns)].copy()\n", + "\n", + "# Then, convert the genomic location to the probe name\n", + "df_converted.columns = [mm_loc_to_probe[col] for col in df_converted.columns]\n", + "\n", + "# Let's clean the GitHub\n", + "os.system('rm -r MammalianMethylationConsortium')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6989844c-736d-4637-acfc-8f8f7ae58108", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cg05347424cg26718996cg07727941cg16852837cg12870762cg26080798cg02899039cg12839061cg05267150cg13170453...cg02179016cg20836420cg18831685cg08992395cg13679010cg12982463cg17146242cg13649253cg07588415cg14814195
GSM3752631NaNNaN0.0000.00.0150.0000.005NaN0.0230.000...NaN0.0000.028NaN0.0000.0000.0180.0000.021NaN
GSM37526250.938NaN0.000NaN0.0000.0000.000NaN0.596NaN...NaN0.000NaNNaN0.0000.8950.2270.1560.025NaN
GSM37526340.125NaN0.000NaN0.6270.0170.033NaN0.745NaN...NaN0.495NaNNaN0.0140.2780.5190.7860.012NaN
GSM37526200.769NaN0.0910.00.0700.0060.012NaN0.6070.092...NaN0.0100.054NaN0.0000.9330.2770.1480.000NaN
GSM3752622NaNNaN0.000NaN0.0000.0000.000NaN0.052NaN...NaN0.000NaNNaN0.0000.0000.0640.0220.000NaN
\n", + "

5 rows Γ— 5149 columns

\n", + "
" + ], + "text/plain": [ + " cg05347424 cg26718996 cg07727941 cg16852837 cg12870762 \\\n", + "GSM3752631 NaN NaN 0.000 0.0 0.015 \n", + "GSM3752625 0.938 NaN 0.000 NaN 0.000 \n", + "GSM3752634 0.125 NaN 0.000 NaN 0.627 \n", + "GSM3752620 0.769 NaN 0.091 0.0 0.070 \n", + "GSM3752622 NaN NaN 0.000 NaN 0.000 \n", + "\n", + " cg26080798 cg02899039 cg12839061 cg05267150 cg13170453 ... \\\n", + "GSM3752631 0.000 0.005 NaN 0.023 0.000 ... \n", + "GSM3752625 0.000 0.000 NaN 0.596 NaN ... \n", + "GSM3752634 0.017 0.033 NaN 0.745 NaN ... \n", + "GSM3752620 0.006 0.012 NaN 0.607 0.092 ... \n", + "GSM3752622 0.000 0.000 NaN 0.052 NaN ... \n", + "\n", + " cg02179016 cg20836420 cg18831685 cg08992395 cg13679010 \\\n", + "GSM3752631 NaN 0.000 0.028 NaN 0.000 \n", + "GSM3752625 NaN 0.000 NaN NaN 0.000 \n", + "GSM3752634 NaN 0.495 NaN NaN 0.014 \n", + "GSM3752620 NaN 0.010 0.054 NaN 0.000 \n", + "GSM3752622 NaN 0.000 NaN NaN 0.000 \n", + "\n", + " cg12982463 cg17146242 cg13649253 cg07588415 cg14814195 \n", + "GSM3752631 0.000 0.018 0.000 0.021 NaN \n", + "GSM3752625 0.895 0.227 0.156 0.025 NaN \n", + "GSM3752634 0.278 0.519 0.786 0.012 NaN \n", + "GSM3752620 0.933 0.277 0.148 0.000 NaN \n", + "GSM3752622 0.000 0.064 0.022 0.000 NaN \n", + "\n", + "[5 rows x 5149 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_converted.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e46ca085-026e-4e02-b316-97b880125507", + "metadata": {}, + "source": [ + "Now we can finally put the dataframe into pyaging after defining the species as Mus musculus." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "10bbdf6e-a63a-4c18-bc97-e0872fb9895f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting df_to_adata function\n", + "|-----> βš™οΈ Create anndata object started\n", + "|-----> βœ… Create anndata object finished [0.0057s]\n", + "|-----> βš™οΈ Add metadata to anndata started\n", + "|-----------? No metadata provided. Leaving adata.obs empty\n", + "|-----> ⚠️ Add metadata to anndata finished [0.0006s]\n", + "|-----> βš™οΈ Log data statistics started\n", + "|-----------> There are 14 observations\n", + "|-----------> There are 5150 features\n", + "|-----------> Total missing values: 17862\n", + "|-----------> Percentage of missing values: 24.77%\n", + "|-----> βœ… Log data statistics finished [0.0013s]\n", + "|-----> βš™οΈ Impute missing values started\n", + "|-----------> Imputing missing values using mean strategy\n", + "|-----> βœ… Impute missing values finished [0.0060s]\n", + "|-----> βš™οΈ Add imputer strategy to adata.uns started\n", + "|-----> βœ… Add imputer strategy to adata.uns finished [0.0004s]\n", + "|-----> πŸŽ‰ Done! [0.0174s]\n" + ] + } + ], + "source": [ + "df_converted['Mus musculus'] = 1\n", + "adata_mammalian = pya.pp.df_to_adata(df_converted, imputer_strategy='mean')" + ] + }, + { + "cell_type": "markdown", + "id": "7edae9ce-a0bc-4b78-8575-23745714b42b", + "metadata": {}, + "source": [ + "Let's use these five mammalian predictors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "56dc5c27-e793-4343-85e2-9ce30a365d64", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting predict_age function\n", + "|-----> βš™οΈ Set PyTorch device started\n", + "|-----------> Using device: cpu\n", + "|-----> βœ… Set PyTorch device finished [0.0013s]\n", + "|-----> πŸ•’ Processing clock: mammalian1\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalian1.pt\n", + "|-----------> βœ… Load clock finished [0.4780s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 274 out of 335 features (81.79%) are missing: ['cg00249943', 'cg00250826', 'cg00292639'], etc.\n", + "|-----------------> Filling missing features entirely with 0\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalian1]\n", + "|-----------> ⚠️ Check features in adata finished [0.0173s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is anti_logp2\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0083s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0017s]\n", + "|-----> πŸ•’ Processing clock: mammalian2\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalian2.pt\n", + "|-----------> βœ… Load clock finished [0.4544s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
2406 out of 2572 features (93.55%) are missing: ['cg00020468', 'cg00096922', 'cg00098422'], etc.\n", + "|-----------------> Using reference feature values for mammalian2\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalian2]\n", + "|-----------> ⚠️ Check features in adata finished [0.0407s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mammalian2\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0196s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> πŸ•’ Processing clock: mammalian3\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalian3.pt\n", + "|-----------> βœ… Load clock finished [0.5081s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
2299 out of 2467 features (93.19%) are missing: ['cg00101675', 'cg06259996', 'cg15168457'], etc.\n", + "|-----------------> Using reference feature values for mammalian3\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalian3]\n", + "|-----------> ⚠️ Check features in adata finished [0.0222s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is mammalian3\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0095s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> πŸ•’ Processing clock: mammalianlifespan\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalianlifespan.pt\n", + "|-----------> βœ… Load clock finished [0.4420s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 
133 out of 152 features (87.50%) are missing: ['cg00039845', 'cg00300233', 'cg00810217'], etc.\n", + "|-----------------> Using reference feature values for mammalianlifespan\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalianlifespan]\n", + "|-----------> ⚠️ Check features in adata finished [0.0043s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> There is no postprocessing necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0018s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0009s]\n", + "|-----> πŸ•’ Processing clock: mammalianfemale\n", + "|-----------> βš™οΈ Load clock started\n", + "|-----------------> Data found in pyaging_data/mammalianfemale.pt\n", + "|-----------> βœ… Load clock finished [0.4532s]\n", + "|-----------> βš™οΈ Check features in adata started\n", + "|-----------------? 73 out of 101 features (72.28%) are missing: ['cg01145947', 'cg02053792', 'cg02407848'], etc.\n", + "|-----------------> Filling missing features entirely with 0\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_mammalianfemale]\n", + "|-----------> ⚠️ Check features in adata finished [0.0135s]\n", + "|-----------> βš™οΈ Predict ages with model started\n", + "|-----------------> There is no preprocessing necessary\n", + "|-----------------> The postprocessing method is sigmoid\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> βœ… Predict ages with model finished [0.0051s]\n", + "|-----------> βš™οΈ Add predicted ages and clock metadata to adata started\n", + "|-----------> βœ… Add predicted ages and clock metadata to adata finished [0.0024s]\n", + "|-----> πŸŽ‰ Done! 
[3.0755s]\n" + ] + } + ], + "source": [ + "pya.pred.predict_age(adata_mammalian, ['Mammalian1', 'Mammalian2', 'Mammalian3', \"MammalianLifespan\", \"MammalianFemale\"])" + ] + }, + { + "cell_type": "markdown", + "id": "de8ed8d2-213a-4ca9-a0d0-e141ba7d1789", + "metadata": {}, + "source": [ + "Note that RRBS clocks are in units of months whereas the mammalian clocks are in units of years." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "32c3c7fe-acbb-4fe8-b438-5c664a533c41", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mammalian1mammalian2mammalian3mammalianlifespanmammalianfemale
GSM37526312.537895-0.009800-0.0484141.2021340.732238
GSM37526253.3539350.064448-0.0482031.4800800.952105
GSM37526344.4906100.813899-0.0350591.4002780.978554
GSM37526203.6038020.122934-0.0469581.6447010.948646
GSM37526222.9512630.005617-0.0477251.3852820.741336
GSM37526375.7185150.895781-0.0376491.3997690.975114
GSM45582167.4562450.684752-0.0121951.4481190.785830
GSM37526435.8819430.880053-0.0376561.4084830.969979
GSM45582136.7200800.855574-0.0262111.4772360.821308
GSM37526406.4529340.766701-0.0317631.3718310.938162
GSM45582225.0492470.120607-0.0451091.3667290.789795
GSM45582196.0987100.857243-0.0298781.4221440.796568
GSM37526282.7469490.098284-0.0479891.4805990.794259
GSM37526172.7398680.078172-0.0482491.4840910.807197
\n", + "
" + ], + "text/plain": [ + " mammalian1 mammalian2 mammalian3 mammalianlifespan \\\n", + "GSM3752631 2.537895 -0.009800 -0.048414 1.202134 \n", + "GSM3752625 3.353935 0.064448 -0.048203 1.480080 \n", + "GSM3752634 4.490610 0.813899 -0.035059 1.400278 \n", + "GSM3752620 3.603802 0.122934 -0.046958 1.644701 \n", + "GSM3752622 2.951263 0.005617 -0.047725 1.385282 \n", + "GSM3752637 5.718515 0.895781 -0.037649 1.399769 \n", + "GSM4558216 7.456245 0.684752 -0.012195 1.448119 \n", + "GSM3752643 5.881943 0.880053 -0.037656 1.408483 \n", + "GSM4558213 6.720080 0.855574 -0.026211 1.477236 \n", + "GSM3752640 6.452934 0.766701 -0.031763 1.371831 \n", + "GSM4558222 5.049247 0.120607 -0.045109 1.366729 \n", + "GSM4558219 6.098710 0.857243 -0.029878 1.422144 \n", + "GSM3752628 2.746949 0.098284 -0.047989 1.480599 \n", + "GSM3752617 2.739868 0.078172 -0.048249 1.484091 \n", + "\n", + " mammalianfemale \n", + "GSM3752631 0.732238 \n", + "GSM3752625 0.952105 \n", + "GSM3752634 0.978554 \n", + "GSM3752620 0.948646 \n", + "GSM3752622 0.741336 \n", + "GSM3752637 0.975114 \n", + "GSM4558216 0.785830 \n", + "GSM3752643 0.969979 \n", + "GSM4558213 0.821308 \n", + "GSM3752640 0.938162 \n", + "GSM4558222 0.789795 \n", + "GSM4558219 0.796568 \n", + "GSM3752628 0.794259 \n", + "GSM3752617 0.807197 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata_mammalian.obs" + ] + }, + { + "cell_type": "markdown", + "id": "615f8fbf-f5e1-4af9-a2a0-5f4f781001fe", + "metadata": {}, + "source": [ + "## Get citation" + ] + }, + { + "cell_type": "markdown", + "id": "0fe55edd-9271-4b41-857d-ef3fceafc2a6", + "metadata": {}, + "source": [ + "The doi, citation, and some metadata are automatically added to the AnnData object under `adata.uns[CLOCKNAME_metadata]`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cdf4c609-7a24-4c3f-a891-647315b77d54", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'thompson',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Mus musculus',\n", + " 'year': 2018,\n", + " 'approved_by_author': 'βœ…',\n", + " 'citation': 'Thompson, Michael J., et al. \"A multi-tissue full lifespan epigenetic clock for mice.\" Aging (Albany NY) 10.10 (2018): 2832.',\n", + " 'doi': 'https://doi.org/10.18632/aging.101590',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['thompson_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "466b1bc5-6207-469e-b479-260bbf55f2a7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'meer',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Mus musculus',\n", + " 'year': 2018,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Meer, Margarita V., et al. \"A whole lifespan mouse multi-tissue DNA methylation clock.\" Elife 7 (2018): e40675.',\n", + " 'doi': 'https://doi.org/10.7554/eLife.40675',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['meer_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2808782d-04d3-4527-8328-b18a583cf15b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'petkovich',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Mus musculus',\n", + " 'year': 2017,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Petkovich, Daniel A., et al. 
\"Using DNA methylation profiling to evaluate biological age and longevity interventions.\" Cell metabolism 25.4 (2017): 954-960.',\n", + " 'doi': 'https://doi.org/10.1016/j.cmet.2017.03.016',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['petkovich_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ec210b8b-9ba1-45df-9b16-80a5f5ac86f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'stubbs',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Mus musculus',\n", + " 'year': 2017,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Stubbs, Thomas M., et al. \"Multi-tissue DNA methylation age predictor in mouse.\" Genome biology 18 (2017): 1-14.',\n", + " 'doi': 'https://doi.org/10.1186/s13059-017-1203-5',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['stubbs_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "78ed8018-bfdc-42b7-b814-d3a55184fc05", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'mammalian1',\n", + " 'data_type': 'methylation',\n", + " 'species': 'multi',\n", + " 'year': 2023,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Lu, A. T., et al. 
\"Universal DNA methylation age across mammalian tissues.\" Nature aging 3.9 (2023): 1144-1166.',\n", + " 'doi': 'https://doi.org/10.1038/s43587-023-00462-6',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata_mammalian.uns['mammalian1_metadata']" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f4b9c4c7-d23b-4850-abf2-5f9a092c7bc4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'mammalianlifespan',\n", + " 'data_type': 'methylation',\n", + " 'species': 'multi',\n", + " 'year': 2023,\n", + " 'approved_by_author': 'βŒ›',\n", + " 'citation': 'Li, Caesar Z., et al. \"Epigenetic predictors of species maximum lifespan and other life history traits in mammals.\" bioRxiv (2023): 2023-11.',\n", + " 'doi': 'https://doi.org/10.1101/2023.11.02.565286',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata_mammalian.uns['mammalianlifespan_metadata']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/tutorials/tutorial_histonemarkchipseq.ipynb b/docs/source/tutorials/tutorial_histonemarkchipseq.ipynb new file mode 100644 index 0000000..39d0a1c --- /dev/null +++ b/docs/source/tutorials/tutorial_histonemarkchipseq.ipynb @@ -0,0 +1,298 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a76ae282-3b11-4246-8292-a9276267832d", + "metadata": {}, + "source": [ + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_histonemarkchipseq.ipynb) [![Open In nbviewer](https://img.shields.io/badge/View%20in-nbviewer-orange)](https://nbviewer.jupyter.org/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_histonemarkchipseq.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "d444a24e-6a98-4db1-8688-7f3f80ed2876", + "metadata": {}, + "source": [ + "# Bulk histone mark ChIP-Seq" + ] + }, + { + "cell_type": "markdown", + "id": "186154f3-1c8d-4284-a5a4-01f28d4db533", + "metadata": {}, + "source": [ + "This tutorial is a brief guide for the implementation of the seven histone-mark-specific clocks and the pan-histone-mark clock that we developed. Link to [preprint](https://www.biorxiv.org/content/10.1101/2023.08.21.554165v3)." + ] + }, + { + "cell_type": "markdown", + "id": "270379c1-9159-4677-92fa-10b08aa9f703", + "metadata": {}, + "source": [ + "We just need two packages for this tutorial." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85c15bf3-6cf1-4f71-abf2-d0d7ee81b86b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting download_example_data function\n", + "|-----------> Downloading data to pyaging_data/ENCFF386QWG.bigWig\n", + "|-----------> in progress: 24.0057%" + ] + } + ], + "source": [ + "pya.data.download_example_data('ENCFF386QWG')" + ] + }, + { + "cell_type": "markdown", + "id": "3880246a-471e-4f75-bd2f-ed2623458a48", + "metadata": {}, + "source": [ + "To exemplify that multiple bigWigs can be turned into a df object at once, let's just repeat the file path." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f65f5cc7-4c42-45a5-a04e-83e0520eccff", + "metadata": {}, + "outputs": [], + "source": [ + "df = pya.pp.bigwig_to_df(['pyaging_data/ENCFF386QWG.bigWig', 'pyaging_data/ENCFF386QWG.bigWig'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a24e0a5-f97f-4f01-95a7-dd96246d9eb2", + "metadata": {}, + "outputs": [], + "source": [ + "df.index = ['sample1', 'sample2'] # just to avoid an annoying anndata warning that samples have same names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "769858ac-9d6d-43f8-9c53-0f4a88c5484c", + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e303dc0f-9e77-4524-9c04-90540e9ee75d", + "metadata": {}, + "source": [ + "## Convert data to AnnData object" + ] + }, + { + "cell_type": "markdown", + "id": "ae8e44bc-67fc-4508-9623-faea44301fa8", + "metadata": {}, + "source": [ + "AnnData objects are highly flexible and are thus our preferred method of organizing data for age prediction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c167be6-1bd3-407c-ae12-771739189c3c", + "metadata": {}, + "outputs": [], + "source": [ + "adata = pya.preprocess.df_to_adata(df)" + ] + }, + { + "cell_type": "markdown", + "id": "3f82813b-3db2-4570-9e4c-3dce08dc5108", + "metadata": {}, + "source": [ + "Note that the original DataFrame is stored in `X_original` under layers. This is what the `adata` object looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "641a61a6-46fc-4d47-b176-eb39524ce94f", + "metadata": {}, + "outputs": [], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "c72aa719-efd3-4094-90f5-bffcaea76a34", + "metadata": {}, + "source": [ + "## Predict age" + ] + }, + { + "cell_type": "markdown", + "id": "aff9395b-4954-4148-9cbb-6681e7217cf3", + "metadata": {}, + "source": [ + "We can either predict one clock at a time or all at the same time. For convenience, let's simply input a few clocks of interest at once. The function is invariant to the capitalization of the clock name. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c02455b4-06dd-44c2-b4b3-a2bb434eae7d", + "metadata": {}, + "outputs": [], + "source": [ + "pya.pred.predict_age(adata, ['CamilloH3K4me3', 'CamilloH3K9me3', 'CamilloPanHistone'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f64fb182-937b-4f67-b58e-5fffb0e2fad0", + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "bbaa2243-e380-4020-bf04-f7aa7da83cd4", + "metadata": {}, + "source": [ + "Having so much information printed can be overwhelming, particularly when running several clocks at once. In such cases, just set verbose to False." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8dd3457-8983-41a4-aaab-41563b91a866", + "metadata": {}, + "outputs": [], + "source": [ + "pya.data.download_example_data('ENCFF386QWG', verbose=False)\n", + "df = pya.pp.bigwig_to_df(['pyaging_data/ENCFF386QWG.bigWig', 'pyaging_data/ENCFF386QWG.bigWig'], verbose=False)\n", + "df.index = ['sample1', 'sample2']\n", + "adata = pya.preprocess.df_to_adata(df, verbose=False)\n", + "pya.pred.predict_age(adata, ['CamilloH3K4me3', 'CamilloH3K9me3', 'CamilloPanHistone'], verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8192ab67-a1cc-4728-8ca0-f81a56940fbf", + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "9832aa0b-99a8-4938-a2a2-5e9b484a3353", + "metadata": {}, + "source": [ + "After age prediction, the clocks are added to `adata.obs`. Moreover, the percent of missing values for each clock and other metadata are included in `adata.uns`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4b22bf1-116f-456f-82d2-58b300f863f1", + "metadata": {}, + "outputs": [], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "c08ff758-675c-4136-9fb8-c19f0e05fefd", + "metadata": {}, + "source": [ + "## Get citation" + ] + }, + { + "cell_type": "markdown", + "id": "8407c418-6251-4b08-9d29-166f9a4339d2", + "metadata": {}, + "source": [ + "The doi, citation, and some metadata are automatically added to the AnnData object under `adata.uns[CLOCKNAME_metadata]`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2946393e-a199-46ba-a9dd-80bc8fa88787", + "metadata": {}, + "outputs": [], + "source": [ + "adata.uns['camilloh3k4me3_metadata']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/tutorials/tutorial_rnaseq.ipynb b/docs/source/tutorials/tutorial_rnaseq.ipynb new file mode 100644 index 0000000..398eeec --- /dev/null +++ b/docs/source/tutorials/tutorial_rnaseq.ipynb @@ -0,0 +1,683 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2089cc5b-a025-4928-a331-ad33fd1b6a85", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_rnaseq.ipynb) [![Open In nbviewer](https://img.shields.io/badge/View%20in-nbviewer-orange)](https://nbviewer.jupyter.org/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_rnaseq.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "31cf37ce-09ee-49d7-a411-719bf65e186e", + "metadata": {}, + "source": [ + "# Bulk RNA-Seq" + ] + }, + { + "cell_type": "markdown", + "id": "3ea2b570-56af-4e4f-9606-d4c6d071554c", + "metadata": {}, + "source": [ + "This tutorial is a brief guide for the implementation of BiT Age, a highly accurate bulk transcriptomic clock for C. elegans. Link to [paper](https://onlinelibrary.wiley.com/doi/full/10.1111/acel.13320)." + ] + }, + { + "cell_type": "markdown", + "id": "0a093c7d-dea7-4b34-91bf-08cde6c98011", + "metadata": {}, + "source": [ + "We just need two packages for this tutorial." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ad192191-e44f-4994-80ad-ab16cdb7c7e8", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pyaging as pya" + ] + }, + { + "cell_type": "markdown", + "id": "d87488d5-731c-469e-ad6f-79c4c9662371", + "metadata": {}, + "source": [ + "## Download and load example data" + ] + }, + { + "cell_type": "markdown", + "id": "4c30471f-89e7-4e92-a176-aa3af14a5274", + "metadata": {}, + "source": [ + "Let's download the C. elegans RNA-seq dataset from the BiT Age paper." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "55bbd03e-3953-427e-ab7a-4d523e6bc985", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> πŸ—οΈ Starting download_example_data function\n", + "|-----------> Data found in pyaging_data/GSE65765_CPM.pkl\n", + "|-----> πŸŽ‰ Done! [0.5749s]\n" + ] + } + ], + "source": [ + "pya.data.download_example_data('GSE65765')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "13aeb69a-4b0e-40f2-8094-194c9a6b42a1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle('pyaging_data/GSE65765_CPM.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7af12fc3-1418-49df-ba7f-e94730db706e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
WBGene00197333WBGene00198386WBGene00015153WBGene00002061WBGene00255704WBGene00235314WBGene00001177WBGene00169236WBGene00219784WBGene00015152...WBGene00010964WBGene00014467WBGene00014468WBGene00014469WBGene00014470WBGene00010965WBGene00014471WBGene00010966WBGene00010967WBGene00014473
SRR17939930.00.03.780174169.2408151.9074270.27744459.3209860.00.0000001.283178...858.9491560.00.0000000.00.052021234.5268460.01734054.48305778.1178150.000000
SRR17939910.00.00.510354412.6285970.0618610.06186122.2390440.00.0154650.201048...1049.9828850.00.0154650.00.015465372.5117130.00000054.54597159.6185770.000000
SRR17939940.00.04.718708274.7336711.2346440.11839142.4007210.00.0000000.642691...664.2554120.00.1014780.00.000000253.2204210.03382619.48369886.4927350.016913
SRR17939920.00.02.389905351.6125580.5058920.06977820.4973580.00.0174451.308342...1298.7998490.00.0348890.00.000000472.2068030.00000089.50803976.4595080.000000
\n", + "

4 rows Γ— 46755 columns

\n", + "
" + ], + "text/plain": [ + " WBGene00197333 WBGene00198386 WBGene00015153 WBGene00002061 \\\n", + "SRR1793993 0.0 0.0 3.780174 169.240815 \n", + "SRR1793991 0.0 0.0 0.510354 412.628597 \n", + "SRR1793994 0.0 0.0 4.718708 274.733671 \n", + "SRR1793992 0.0 0.0 2.389905 351.612558 \n", + "\n", + " WBGene00255704 WBGene00235314 WBGene00001177 WBGene00169236 \\\n", + "SRR1793993 1.907427 0.277444 59.320986 0.0 \n", + "SRR1793991 0.061861 0.061861 22.239044 0.0 \n", + "SRR1793994 1.234644 0.118391 42.400721 0.0 \n", + "SRR1793992 0.505892 0.069778 20.497358 0.0 \n", + "\n", + " WBGene00219784 WBGene00015152 ... WBGene00010964 \\\n", + "SRR1793993 0.000000 1.283178 ... 858.949156 \n", + "SRR1793991 0.015465 0.201048 ... 1049.982885 \n", + "SRR1793994 0.000000 0.642691 ... 664.255412 \n", + "SRR1793992 0.017445 1.308342 ... 1298.799849 \n", + "\n", + " WBGene00014467 WBGene00014468 WBGene00014469 WBGene00014470 \\\n", + "SRR1793993 0.0 0.000000 0.0 0.052021 \n", + "SRR1793991 0.0 0.015465 0.0 0.015465 \n", + "SRR1793994 0.0 0.101478 0.0 0.000000 \n", + "SRR1793992 0.0 0.034889 0.0 0.000000 \n", + "\n", + " WBGene00010965 WBGene00014471 WBGene00010966 WBGene00010967 \\\n", + "SRR1793993 234.526846 0.017340 54.483057 78.117815 \n", + "SRR1793991 372.511713 0.000000 54.545971 59.618577 \n", + "SRR1793994 253.220421 0.033826 19.483698 86.492735 \n", + "SRR1793992 472.206803 0.000000 89.508039 76.459508 \n", + "\n", + " WBGene00014473 \n", + "SRR1793993 0.000000 \n", + "SRR1793991 0.000000 \n", + "SRR1793994 0.016913 \n", + "SRR1793992 0.000000 \n", + "\n", + "[4 rows x 46755 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "45cbc6e1-9cf7-46a8-ac92-18924a7a5cf8", + "metadata": {}, + "source": [ + "## Convert data to AnnData object" + ] + }, + { + "cell_type": "markdown", + "id": "ae486006-b533-411b-b449-ff6d2261345a", + "metadata": {}, + "source": [ 
+ "AnnData objects are highly flexible and are thus our preferred method of organizing data for age prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "74a99c21-67a7-4adb-8ac9-ea404a6c1e02", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting df_to_adata function\n", + "|-----> ⚙️ Create anndata object started\n", + "|-----> ✅ Create anndata object finished [0.0190s]\n", + "|-----> ⚙️ Add metadata to anndata started\n", + "|-----------? No metadata provided. Leaving adata.obs empty\n", + "|-----> ⚠️ Add metadata to anndata finished [0.0005s]\n", + "|-----> ⚙️ Log data statistics started\n", + "|-----------> There are 4 observations\n", + "|-----------> There are 46755 features\n", + "|-----------> Total missing values: 0\n", + "|-----------> Percentage of missing values: 0.00%\n", + "|-----> ✅ Log data statistics finished [0.0011s]\n", + "|-----> ⚙️ Impute missing values started\n", + "|-----------> No missing values found. No imputation necessary\n", + "|-----> ✅ Impute missing values finished [0.0013s]\n", + "|-----> 🎉 Done! [0.0239s]\n" + ] + } + ], + "source": [ + "adata = pya.preprocess.df_to_adata(df)" + ] + }, + { + "cell_type": "markdown", + "id": "94035d2e-2e6b-4927-bb2b-0ddcd1b3cd4e", + "metadata": {}, + "source": [ + "Note that the original DataFrame is stored in `X_original` under layers. 
This is what the `adata` object looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5d8b68ec-d3aa-4a10-b7e5-54811bddd68c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 4 × 46755\n", + " var: 'percent_na'\n", + " layers: 'X_original'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "2277ede6-ab9e-487b-a58d-c01cb21b6b68", + "metadata": {}, + "source": [ + "## Predict age" + ] + }, + { + "cell_type": "markdown", + "id": "889d2d5f-a596-41d0-b849-560b6bc856a1", + "metadata": {}, + "source": [ + "We can either predict one clock at a time or all at the same time. Given we only have one clock of interest for this tutorial, let's go with one. The function is invariant to the capitalization of the clock name. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ba48641d-ac0d-430c-9905-30a1349b7c50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting predict_age function\n", + "|-----> ⚙️ Set PyTorch device started\n", + "|-----------> Using device: cpu\n", + "|-----> ✅ Set PyTorch device finished [0.0006s]\n", + "|-----> 🕒 Processing clock: bitage\n", + "|-----------> ⚙️ Load clock started\n", + "|-----------------> Data found in pyaging_data/bitage.pt\n", + "|-----------> ✅ Load clock finished [0.5446s]\n", + "|-----------> ⚙️ Check features in adata started\n", + "|-----------------> All features are present in adata.var_names.\n", + "|-----------------> Added prepared input matrix to adata.obsm[X_bitage]\n", + "|-----------> ✅ Check features in adata finished [0.0424s]\n", + "|-----------> ⚙️ Predict ages with model started\n", + "|-----------------> The preprocessing method is binarize\n", + "|-----------------> There is no postprocessing 
necessary\n", + "|-----------------> in progress: 100.0000%\n", + "|-----------> ✅ Predict ages with model finished [0.0044s]\n", + "|-----------> ⚙️ Add predicted ages and clock metadata to adata started\n", + "|-----------> ✅ Add predicted ages and clock metadata to adata finished [0.0006s]\n", + "|-----> 🎉 Done! [0.6613s]\n" + ] + } + ], + "source": [ + "pya.pred.predict_age(adata, 'BiTAge')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "032382f5-7d98-465e-a3cb-51165eeb7025", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bitage
SRR1793993182.353658
SRR179399127.337245
SRR1793994241.629584
SRR179399232.178003
\n", + "
" + ], + "text/plain": [ + " bitage\n", + "SRR1793993 182.353658\n", + "SRR1793991 27.337245\n", + "SRR1793994 241.629584\n", + "SRR1793992 32.178003" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2acc80b1-f936-40e4-900a-ef4deb304558", + "metadata": {}, + "source": [ + "Having so much information printed can be overwhelming, particularly when running several clocks at once. In such cases, just set verbose to False." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a587f129-a88b-46ec-a249-ac62737a0cb7", + "metadata": {}, + "outputs": [], + "source": [ + "pya.data.download_example_data('GSE65765', verbose=False)\n", + "df = pd.read_pickle('pyaging_data/GSE65765_CPM.pkl')\n", + "adata = pya.preprocess.df_to_adata(df, verbose=False)\n", + "pya.pred.predict_age(adata, ['BiTAge'], verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "99fbe406-d076-4979-a2f4-70469755937f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bitage
SRR1793993182.353658
SRR179399127.337245
SRR1793994241.629584
SRR179399232.178003
\n", + "
" + ], + "text/plain": [ + " bitage\n", + "SRR1793993 182.353658\n", + "SRR1793991 27.337245\n", + "SRR1793994 241.629584\n", + "SRR1793992 32.178003" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.head()" + ] + }, + { + "cell_type": "markdown", + "id": "25aedb7e-5cff-42da-a0ea-cc0780395ea7", + "metadata": {}, + "source": [ + "After age prediction, the predicted ages for each clock are added to `adata.obs`. Moreover, the percentage of missing values for each clock and other metadata are included in `adata.uns`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "61dcb82f-e7f0-4064-8e67-b47b07b48a55", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 4 × 46755\n", + " obs: 'bitage'\n", + " var: 'percent_na'\n", + " uns: 'bitage_percent_na', 'bitage_missing_features', 'bitage_metadata'\n", + " layers: 'X_original'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "id": "1a73e164-a610-4cb6-93f5-6f8ac7d8d56f", + "metadata": {}, + "source": [ + "## Get citation" + ] + }, + { + "cell_type": "markdown", + "id": "6c7a070c-c448-4ad7-ae0b-21857dafd00e", + "metadata": {}, + "source": [ + "The DOI, citation, and some metadata are automatically added to the AnnData object under `adata.uns[CLOCKNAME_metadata]`." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9908d25a-9639-4684-9da6-353c7eb4a555", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'bitage',\n", + " 'data_type': 'transcriptomics',\n", + " 'species': 'C elegans',\n", + " 'year': 2021,\n", + " 'approved_by_author': '✅',\n", + " 'citation': 'Meyer, David H., and Björn Schumacher. 
\"BiT age: A transcriptome‐based aging clock near the theoretical limit of accuracy.\" Aging cell 20.3 (2021): e13320.',\n", + " 'doi': 'https://doi.org/10.1111/acel.13320',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.uns['bitage_metadata']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/tutorials/tutorial_utils.ipynb b/docs/source/tutorials/tutorial_utils.ipynb new file mode 100644 index 0000000..17a6066 --- /dev/null +++ b/docs/source/tutorials/tutorial_utils.ipynb @@ -0,0 +1,522 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cfeef651-0cae-4d14-b011-3b78c46fa2e4", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_utils.ipynb) [![Open In nbviewer](https://img.shields.io/badge/View%20in-nbviewer-orange)](https://nbviewer.jupyter.org/github/rsinghlab/pyaging/blob/main/tutorials/tutorial_utils.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "c5aac698-f7ed-4489-8c6c-f75ec3f3df73", + "metadata": {}, + "source": [ + "# Search, cite, get metadata and clock parameters" + ] + }, + { + "cell_type": "markdown", + "id": "9d132efc-15f4-4b4d-bdd7-c1aa21f7d13e", + "metadata": {}, + "source": [ + "This tutorial shows the use of some `pyaging` helper functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "445d6e41-c30f-4cbc-b5b3-97dc1020f09e", + "metadata": {}, + "outputs": [], + "source": [ + "import pyaging as pya" + ] + }, + { + "cell_type": "markdown", + "id": "72174a28-ddef-4ab1-ad7b-7c306920df23", + "metadata": {}, + "source": [ + "## Search" + ] + }, + { + "cell_type": "markdown", + "id": "aac87908-def9-40d9-a2cc-ffb34fae2dc5", + "metadata": {}, + "source": [ + "There are two main ways to search for a clock in `pyaging`. The first is through the DOI of the paper in which the clock was developed." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d5281558-f1b6-4a5c-ba52-708d31265374", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting find_clock_by_doi function\n", + "|-----> ⚙️ Load all clock metadata started\n", + "|-----------> Data found in pyaging_data/all_clock_metadata.pt\n", + "|-----> ✅ Load all clock metadata finished [0.4988s]\n", + "|-----> ⚙️ Searching for clock based on DOI started\n", + "|-----------> in progress: 100.0000%\n", + "|-----------> Clocks with DOI https://doi.org/10.1038/s43587-022-00248-2: pchorvath2013, pcphenoage, pcgrimage, pchannum, pcdnamtl, hrsinchphenoage, pcskinandblood\n", + "|-----> ✅ Searching for clock based on DOI finished [0.0485s]\n", + "|-----> 🎉 Done! [0.5502s]\n" + ] + } + ], + "source": [ + "pya.utils.find_clock_by_doi('https://doi.org/10.1038/s43587-022-00248-2')" + ] + }, + { + "cell_type": "markdown", + "id": "125e90ad-25f5-46e2-a2ad-0f87c6729ea9", + "metadata": {}, + "source": [ + "The second way is simply to list the names of all the clocks that are available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "48c37121-3b2d-40cf-80aa-c1f58fbad127", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting show_all_clocks function\n", + "|-----> ⚙️ Load all clock metadata started\n", + "|-----------> Data found in pyaging_data/all_clock_metadata.pt\n", + "|-----> ✅ Load all clock metadata finished [0.4589s]\n", + "|-----> ⚙️ Showing all available clock names started\n", + "|-----------> altumage\n", + "|-----------> bitage\n", + "|-----------> camilloh3k27ac\n", + "|-----------> camilloh3k27me3\n", + "|-----------> camilloh3k36me3\n", + "|-----------> camilloh3k4me1\n", + "|-----------> camilloh3k4me3\n", + "|-----------> camilloh3k9ac\n", + "|-----------> camilloh3k9me3\n", + "|-----------> camillopanhistone\n", + "|-----------> dnamphenoage\n", + "|-----------> dnamtl\n", + "|-----------> dunedinpace\n", + "|-----------> encen100\n", + "|-----------> encen40\n", + "|-----------> grimage\n", + "|-----------> grimage2\n", + "|-----------> han\n", + "|-----------> hannum\n", + "|-----------> horvath2013\n", + "|-----------> hrsinchphenoage\n", + "|-----------> knight\n", + "|-----------> leecontrol\n", + "|-----------> leerefinedrobust\n", + "|-----------> leerobust\n", + "|-----------> lin\n", + "|-----------> mammalian1\n", + "|-----------> mammalian2\n", + "|-----------> mammalian3\n", + "|-----------> mammalianblood2\n", + "|-----------> mammalianblood3\n", + "|-----------> mammalianfemale\n", + "|-----------> mammalianlifespan\n", + "|-----------> mammalianskin2\n", + "|-----------> mammalianskin3\n", + "|-----------> meer\n", + "|-----------> ocampoatac1\n", + "|-----------> ocampoatac2\n", + "|-----------> pcdnamtl\n", + "|-----------> pcgrimage\n", + "|-----------> pchannum\n", + "|-----------> pchorvath2013\n", + "|-----------> pcphenoage\n", + "|-----------> pcskinandblood\n", + "|-----------> pedbe\n", + "|-----------> 
petkovich\n", + "|-----------> phenoage\n", + "|-----------> replitali\n", + "|-----------> skinandblood\n", + "|-----------> stubbs\n", + "|-----------> thompson\n", + "|-----------> zhangblup\n", + "|-----------> zhangen\n", + "|-----------> zhangmortality\n", + "|-----> ✅ Showing all available clock names finished [0.0280s]\n", + "|-----> 🎉 Done! [0.4903s]\n" + ] + } + ], + "source": [ + "pya.utils.show_all_clocks()" + ] + }, + { + "cell_type": "markdown", + "id": "6e579e9f-073d-4a26-9bd6-271068ac8601", + "metadata": {}, + "source": [ + "## Cite" + ] + }, + { + "cell_type": "markdown", + "id": "fb15a3ba-4dd1-4526-ab45-732979fcd676", + "metadata": {}, + "source": [ + "`pyaging` also provides citations for all available clocks." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "61f0ac9d-b26c-493b-a576-5d7cde67775c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting cite_clock function\n", + "|-----> ⚙️ Load all clock metadata started\n", + "|-----------> Data found in pyaging_data/all_clock_metadata.pt\n", + "|-----> ✅ Load all clock metadata finished [0.5150s]\n", + "|-----> ⚙️ Searching for citation of clock altumage started\n", + "|-----------> Citation for altumage:\n", + "|-----------> de Lima Camillo, Lucas Paulo, Louis R. Lapierre, and Ritambhara Singh. \"A pan-tissue DNA-methylation epigenetic clock based on deep learning.\" npj Aging 8.1 (2022): 4.\n", + "|-----------> Please also consider citing pyaging :)\n", + "|-----------> de Lima Camillo, Lucas Paulo. \"pyaging: a Python-based compendium of GPU-optimized aging clocks.\" bioRxiv (2023): 2023-11.\n", + "|-----> ✅ Searching for citation of clock altumage finished [0.0024s]\n", + "|-----> 🎉 Done! 
[0.5205s]\n" + ] + } + ], + "source": [ + "pya.utils.cite_clock('AltumAge')" + ] + }, + { + "cell_type": "markdown", + "id": "9838c1b6-47a6-44a4-8b3e-8c5e760c6172", + "metadata": {}, + "source": [ + "## Get metadata" + ] + }, + { + "cell_type": "markdown", + "id": "99fc2253-7f33-479f-b946-97d0f77c6d19", + "metadata": {}, + "source": [ + "To get all of the metadata for a clock, including citation and DOI, just run the following." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2b65ef03-485a-4631-b831-8e87d3ce0f64", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> 🏗️ Starting get_clock_metadata function\n", + "|-----> ⚙️ Load all clock metadata started\n", + "|-----------> Data found in pyaging_data/all_clock_metadata.pt\n", + "|-----> ✅ Load all clock metadata finished [0.5505s]\n", + "|-----> ⚙️ Showing altumage metadata started\n", + "|-----------> clock_name: altumage\n", + "|-----------> data_type: methylation\n", + "|-----------> species: Homo sapiens\n", + "|-----------> year: 2022\n", + "|-----------> approved_by_author: ✅\n", + "|-----------> citation: de Lima Camillo, Lucas Paulo, Louis R. Lapierre, and Ritambhara Singh. \"A pan-tissue DNA-methylation epigenetic clock based on deep learning.\" npj Aging 8.1 (2022): 4.\n", + "|-----------> doi: https://doi.org/10.1038/s41514-022-00085-y\n", + "|-----------> notes: None\n", + "|-----------> version: None\n", + "|-----------> reference_values: True\n", + "|-----------> preprocess: scale\n", + "|-----> ✅ Showing altumage metadata finished [0.0062s]\n", + "|-----> 🎉 Done! 
[0.5622s]\n" + ] + } + ], + "source": [ + "pya.utils.get_clock_metadata('AltumAge')" + ] + }, + { + "cell_type": "markdown", + "id": "b95a3ad8-efdd-4c73-8851-ec17071a6e78", + "metadata": {}, + "source": [ + "## Get clock parameters" + ] + }, + { + "cell_type": "markdown", + "id": "02a494ca-d06b-4b0b-90b8-f4013a1c100c", + "metadata": {}, + "source": [ + "To easily analyze the weights and features of a particular clock, please use:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7c853695-bc26-4f66-bd9d-39ec2c381f80", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|-----> ⚙️ Load clock started\n", + "|-----------> Data found in pyaging_data/altumage.pt\n", + "|-----> ✅ Load clock finished [0.5409s]\n" + ] + } + ], + "source": [ + "logger = pya.logger.Logger('test_logger')\n", + "device = 'cpu'\n", + "dir = 'pyaging_data'\n", + "indent_level = 1\n", + "\n", + "clock = pya.pred.load_clock('AltumAge', device, dir, logger, indent_level=indent_level)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a9b2e3bc-d8a2-4625-9d6a-77ba1dede403", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AltumAge(\n", + " (base_model): AltumAgeNeuralNetwork(\n", + " (linear1): Linear(in_features=20318, out_features=32, bias=True)\n", + " (linear2): Linear(in_features=32, out_features=32, bias=True)\n", + " (linear3): Linear(in_features=32, out_features=32, bias=True)\n", + " (linear4): Linear(in_features=32, out_features=32, bias=True)\n", + " (linear5): Linear(in_features=32, out_features=32, bias=True)\n", + " (linear6): Linear(in_features=32, out_features=1, bias=True)\n", + " (bn1): BatchNorm1d(20318, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)\n", + " (bn2): BatchNorm1d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)\n", + " (bn3): BatchNorm1d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)\n", + " (bn4): 
BatchNorm1d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)\n", + " (bn5): BatchNorm1d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)\n", + " (bn6): BatchNorm1d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)\n", + " )\n", + ")" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clock" + ] + }, + { + "cell_type": "markdown", + "id": "f25f68da-0c13-4403-85ab-88f8a8d70c29", + "metadata": {}, + "source": [ + "Let's check the weights of the first linear layer for AltumAge." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8660aec5-82a8-4b70-a71a-bd3571a81ded", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Parameter containing:\n", + "tensor([[ 1.2465e-05, -2.4719e-04, 5.4308e-02, ..., -2.5304e-02,\n", + " 5.2822e-02, 8.9800e-02],\n", + " [ 3.5401e-04, -3.0528e-03, 2.8799e-02, ..., 6.8214e-03,\n", + " 6.9691e-02, 1.2179e-01],\n", + " [ 1.6119e-04, -6.7272e-06, -4.6887e-02, ..., 1.3132e-02,\n", + " 9.2417e-02, -4.2074e-02],\n", + " ...,\n", + " [ 1.9902e-04, 9.0495e-04, -8.5197e-03, ..., -9.6892e-02,\n", + " 2.9396e-02, 5.9170e-02],\n", + " [-1.2038e-04, 3.7530e-04, 1.7924e-01, ..., -4.9997e-02,\n", + " -1.2819e-02, 2.8045e-02],\n", + " [ 1.1584e-04, 2.2752e-04, -3.0746e-02, ..., 1.7930e-02,\n", + " 8.3116e-03, -2.0979e-02]], dtype=torch.float64, requires_grad=True)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clock.base_model.linear1.weight" + ] + }, + { + "cell_type": "markdown", + "id": "a508d495-e2dd-4757-ad4d-732836f08c64", + "metadata": {}, + "source": [ + "A quick look at the features:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7e5ec51f-146a-45a6-ba7f-5a825b8315fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['cg00000292',\n", + " 'cg00002426',\n", + " 'cg00003994',\n", + " 
'cg00007981',\n", + " 'cg00008493',\n", + " 'cg00008713',\n", + " 'cg00009407',\n", + " 'cg00011459',\n", + " 'cg00012199',\n", + " 'cg00012386']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(clock.features[0:10])" + ] + }, + { + "cell_type": "markdown", + "id": "8a6b9591-5c1e-4665-88fd-ee633cd92798", + "metadata": {}, + "source": [ + "And the reference values used:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2f73647b-277f-4b93-b49d-a4907f49a892", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.7598633952352156,\n", + " 0.7863788078967272,\n", + " 0.06324422321924528,\n", + " 0.029943418029386736,\n", + " 0.9363471225552753,\n", + " 0.05054944899168823,\n", + " 0.0351571456459043,\n", + " 0.9114132733331861,\n", + " 0.037064057665286136,\n", + " 0.039170308280475935]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(clock.reference_values[0:10])" + ] + }, + { + "cell_type": "markdown", + "id": "6429c0d5-6737-4c76-b8c4-16b030325ad6", + "metadata": {}, + "source": [ + "We can also get the metadata directly from the clock object:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cac05fd5-5480-460e-801f-d3504a37f8d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'clock_name': 'altumage',\n", + " 'data_type': 'methylation',\n", + " 'species': 'Homo sapiens',\n", + " 'year': 2022,\n", + " 'approved_by_author': '✅',\n", + " 'citation': 'de Lima Camillo, Lucas Paulo, Louis R. Lapierre, and Ritambhara Singh. 
\"A pan-tissue DNA-methylation epigenetic clock based on deep learning.\" npj Aging 8.1 (2022): 4.',\n", + " 'doi': 'https://doi.org/10.1038/s41514-022-00085-y',\n", + " 'notes': None,\n", + " 'version': None}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clock.metadata" + ] + }, + { + "cell_type": "markdown", + "id": "c4987574-3efc-4b63-8dcc-9938c94bc275", + "metadata": {}, + "source": [ + "For a more in-depth look at how the clock was set up, including the model type and the source of the weights, please look at our [clocks notebook folder](https://github.com/rsinghlab/pyaging/tree/main/clocks/notebooks) on GitHub." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/tutorial_histonemarkchipseq.ipynb b/tutorials/tutorial_histonemarkchipseq.ipynb index 4ac8d48..39d0a1c 100644 --- a/tutorials/tutorial_histonemarkchipseq.ipynb +++ b/tutorials/tutorial_histonemarkchipseq.ipynb @@ -71,7 +71,7 @@ "text": [ "|-----> 🏗️ Starting download_example_data function\n", "|-----------> Downloading data to pyaging_data/ENCFF386QWG.bigWig\n", - "|-----------> in progress: 19.0045%" + "|-----------> in progress: 24.0057%" ] } ],