add map.py and use in basic_demo.ipynb

martinjankowiak · martinjankowiak · commit 6a00b4678770 · 2022-04-13T15:26:14.000-04:00
diff --git a/bvas/map.py b/bvas/map.py
@@ -0,0 +1,36 @@
+import torch
+from torch import triangular_solve as trisolve
+
+from bvas.util import safe_cholesky
+
+
+def map_inference(Y, Gamma, taus=[2 ** exponent for exponent in range(4, 16)]):
+    r"""
+    Use Maximum A Posteriori (MAP) inference and a diffusion-based likelihood to infer
+    selection effects from genomic surveillance data. See reference [1] for details.
+
+    References:
+    [1] "Inferring effects of mutations on SARS-CoV-2 transmission from genomic surveillance data,"
+        Brian Lee, Muhammad Saqib Sohail, Elizabeth Finney, Syed Faraz Ahmed, Ahmed Abdul Quadeer,
+        Matthew R. McKay, John P. Barton.
+
+    :param torch.Tensor Y: A torch.Tensor of shape (A,) that encodes integrated allele frequency
+        increments for each allele and where A is the number of alleles.
+    :param torch.Tensor Gamma: A torch.Tensor of shape (A, A) that encodes information about
+        second moments of allele frequencies.
+    :param list taus: A list of floats encoding regularizers `tau_reg` to use in MAP inference, i.e. we run
+        MAP once for each value of `tau_reg`. Note that this quantity is called `gamma` in reference [1].
+
+    :returns dict: Returns a dictionary of inferred selection coefficients beta, one for each value
+        in `taus`.
+    """
+    results = {}
+
+    for tau_reg in taus:
+        L_tau = safe_cholesky(Gamma + tau_reg * torch.eye(Gamma.size(-1)).type_as(Gamma))
+        Yt = trisolve(Y.unsqueeze(-1), L_tau, upper=False)[0]
+        beta = trisolve(Yt, L_tau.t(), upper=True)[0].squeeze(-1)
+        results['map_{}'.format(tau_reg)] = {'beta': beta.data.cpu().numpy(),
+                                             'tau_reg': tau_reg}
+
+    return results
diff --git a/data/covid_preprocessing.py b/data/covid_preprocessing.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 import torch
+
 from bvas.util import get_longest_ones_index
 
 
diff --git a/notebooks/basic_demo.ipynb b/notebooks/basic_demo.ipynb
@@ -2,25 +2,28 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "3ceb4719",
+   "id": "acc138b0",
    "metadata": {},
    "source": [
     "# Basic BVAS demo using simulated data"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "aceebab8",
+   "execution_count": 25,
+   "id": "105ca8b5",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from bvas import simulate_data, BVASSelector"
+    "from bvas import simulate_data, BVASSelector\n",
+    "from bvas.map import map_inference\n",
+    "import pandas as pd\n",
+    "import numpy as np"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "fef691b9",
+   "id": "d4ae7daa",
    "metadata": {},
    "source": [
     "### Simulate data"
@@ -29,7 +32,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "8f170790",
+   "id": "67c8b3f2",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -47,7 +50,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "f74e7b1b",
+   "id": "a8592b05",
    "metadata": {},
    "outputs": [
     {
@@ -73,7 +76,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "24a7ce7f",
+   "id": "929783f8",
    "metadata": {},
    "source": [
     "### Instantiate BVASSelector object"
@@ -82,7 +85,7 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "617cb379",
+   "id": "4a7a3d81",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -99,7 +102,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "884fedf0",
+   "id": "56b0072c",
    "metadata": {},
    "source": [
     "### Run BVAS MCMC-based inference"
@@ -108,13 +111,13 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "77bcb9bd",
+   "id": "9c285218",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "cc591be1cd164f68be7f385dc2537701",
+       "model_id": "3ddab09989224a008f51f31c69706a59",
        "version_major": 2,
        "version_minor": 0
       },
@@ -132,7 +135,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "86691da7",
+   "id": "0b3a07cc",
    "metadata": {},
    "source": [
     "### Inspect results\n",
@@ -149,7 +152,7 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "f1ae71fd",
+   "id": "a13e39fc",
    "metadata": {},
    "outputs": [
     {
@@ -182,7 +185,7 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "6d2a48cd",
+   "id": "81e46a30",
    "metadata": {},
    "outputs": [
     {
@@ -211,13 +214,189 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "e5b9424c",
+   "id": "7b64636a",
    "metadata": {},
    "outputs": [],
    "source": [
     "# the remaining coefficients are all zero\n",
     "assert data['true_betas'][10:].min().item() == data['true_betas'][10:].max().item() == 0.0"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7149875f",
+   "metadata": {},
+   "source": [
+    "# Compare to MAP inference\n",
+    "\n",
+    "Let's compare to Maximum A Posteriori (i.e. MAP) inference as in [Inferring effects of mutations on SARS-CoV-2 transmission from genomic surveillance data](https://www.medrxiv.org/content/10.1101/2021.12.31.21268591v2)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "975411eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "map_results = map_inference(data['Y'], data['Gamma'], taus=[2048.0])\n",
+    "inferred_beta = map_results['map_2048.0']['beta']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "id": "60cffa06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# package results as Pandas DataFrame\n",
+    "inferred_beta = pd.DataFrame(inferred_beta, index=mutations, columns=['Beta'])\n",
+    "inferred_beta['BetaAbs'] = np.fabs(inferred_beta)\n",
+    "inferred_beta = inferred_beta.sort_values(by='BetaAbs', ascending=False)\n",
+    "inferred_beta['Rank'] = 1 + np.arange(inferred_beta.shape[0])\n",
+    "inferred_beta = inferred_beta[['Beta', 'Rank']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "45a53f68",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Beta</th>\n",
+       "      <th>Rank</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Causal9</th>\n",
+       "      <td>-0.053871</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Causal5</th>\n",
+       "      <td>0.049838</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Causal10</th>\n",
+       "      <td>-0.048263</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Causal4</th>\n",
+       "      <td>0.045866</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Causal3</th>\n",
+       "      <td>0.027333</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Causal8</th>\n",
+       "      <td>-0.021542</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious80</th>\n",
+       "      <td>0.020984</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious44</th>\n",
+       "      <td>-0.017381</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious68</th>\n",
+       "      <td>-0.015019</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious61</th>\n",
+       "      <td>0.014249</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious38</th>\n",
+       "      <td>0.014112</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious85</th>\n",
+       "      <td>0.012077</td>\n",
+       "      <td>12</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious90</th>\n",
+       "      <td>0.012060</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious66</th>\n",
+       "      <td>0.011890</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spurious70</th>\n",
+       "      <td>0.011479</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                Beta  Rank\n",
+       "Causal9    -0.053871     1\n",
+       "Causal5     0.049838     2\n",
+       "Causal10   -0.048263     3\n",
+       "Causal4     0.045866     4\n",
+       "Causal3     0.027333     5\n",
+       "Causal8    -0.021542     6\n",
+       "Spurious80  0.020984     7\n",
+       "Spurious44 -0.017381     8\n",
+       "Spurious68 -0.015019     9\n",
+       "Spurious61  0.014249    10\n",
+       "Spurious38  0.014112    11\n",
+       "Spurious85  0.012077    12\n",
+       "Spurious90  0.012060    13\n",
+       "Spurious66  0.011890    14\n",
+       "Spurious70  0.011479    15"
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# MAP places 6/10 of the causal alleles at the top\n",
+    "inferred_beta.iloc[:15]"
+   ]
   }
  ],
  "metadata": {