Update preload-labels.ipynb

flekschas · flekschas · commit 1b72fe5b4667 · 2019-11-29T16:35:36.000-05:00
diff --git a/examples/preload-labels.ipynb b/examples/preload-labels.ipynb
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 65,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,10 +51,16 @@
     "base = \"../\"\n",
     "settings_filepath = \"config-user-study-encode-e11-5-face-hindbrain.json\"\n",
     "window_size = 3000\n",
+    "step_size = 1500\n",
     "resolution = 25\n",
+    "chr10_offset = 1395142003  # absolute offset of chr10\n",
+    "target_from = chr10_offset + 57039000\n",
+    "target_to = chr10_offset + 57042000\n",
+    "\n",
+    "assert target_to - target_from == window_size, 'target must span exactly one window'\n",
     "\n",
     "# Minimum value to consider a peak annotation a peak for differential accessible peak annotations\n",
-    "min_peak_val_diff = 0.5 \n",
+    "min_peak_val_diff = 0.75 \n",
     "# Minimum value to consider a peak annotation a peak for equally accessible peak annotations\n",
     "min_peak_val_same = 1\n",
     "\n",
@@ -163,61 +169,101 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def chunk_beds(bigbed):\n",
+    "    \"\"\"Stack up `bigbed` into 11 bins per sliding window along `settings['chroms'][0]`, cast to int.\"\"\"\n",
+    "    bins = 11\n",
+    "    chrom_sizes = bbi.chromsizes(bigbed)\n",
+    "    chrom_size = chrom_sizes[settings['chroms'][0]]  # only the first configured chromosome is used\n",
+    "    num_total_windows = np.ceil((chrom_size - window_size) / step_size).astype(int) + 1\n",
+    "\n",
+    "    start_pos = np.arange(0, step_size * num_total_windows, step_size)\n",
+    "    end_pos = np.arange(window_size, step_size * num_total_windows + window_size, step_size)\n",
+    "\n",
+    "    # The chromosome list is repeated once per window; assumes a single chromosome - TODO confirm\n",
+    "    return bbi.stackup(\n",
+    "        bigbed,\n",
+    "        settings['chroms'] * num_total_windows,\n",
+    "        start_pos,\n",
+    "        end_pos,\n",
+    "        bins=bins,\n",
+    "        missing=0,\n",
+    "        oob=0,\n",
+    "    ).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peaks_face = chunk_beds(narrow_peaks_face)\n",
+    "peaks_hindbrain = chunk_beds(narrow_peaks_hindbrain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Extracted 87129 windows from chr10 with a max value of 1.0.\n",
-      "Extracted 87129 windows from chr10 with a max value of 1.0.\n",
-      "Face peaks: 11769\n",
-      "Hindbrain peaks: 11257\n",
-      "Diff peaks: 644\n",
-      "Same peaks: 11191\n",
-      "Diff peaks with max val >= 0.5: 77\n",
-      "Same peaks with max val >= 1: 1572\n"
+      "Face peaks: 11008\n",
+      "Hindbrain peaks: 8347\n",
+      "Diff peaks: 1762\n",
+      "Same peaks: 5130\n"
      ]
     }
    ],
    "source": [
-    "from ae.utils import chunk_beds_binary\n",
+    "print('Face peaks: {}'.format(np.sum(np.max(peaks_face[:,2:9], axis=1))))\n",
+    "print('Hindbrain peaks: {}'.format(np.sum(np.max(peaks_hindbrain[:,2:9], axis=1))))\n",
     "\n",
-    "face_wins_has_peaks = chunk_beds_binary(\n",
-    "    broad_peaks_face,\n",
-    "    window_size,\n",
-    "    window_size // settings['step_freq'],\n",
-    "    settings['chroms'],\n",
-    "    verbose=True,\n",
-    ").flatten()\n",
-    "\n",
-    "hindbrain_wins_has_peaks = chunk_beds_binary(\n",
-    "    broad_peaks_hindbrain,\n",
-    "    window_size,\n",
-    "    window_size // settings['step_freq'],\n",
-    "    settings['chroms'],\n",
-    "    verbose=True,\n",
-    ").flatten()\n",
-    "\n",
-    "print('Face peaks: {}'.format(np.sum(face_wins_has_peaks)))\n",
-    "print('Hindbrain peaks: {}'.format(np.sum(hindbrain_wins_has_peaks)))\n",
-    "\n",
-    "wins_has_diff_peak = (face_wins_has_peaks + hindbrain_wins_has_peaks) == 1\n",
-    "print('Diff peaks: {}'.format(np.sum(wins_has_diff_peak)))\n",
-    "\n",
-    "wins_has_same_peaks = (face_wins_has_peaks + hindbrain_wins_has_peaks) == 2\n",
-    "print('Same peaks: {}'.format(np.sum(wins_has_same_peaks)))\n",
+    "face_mid, hindbrain_mid = peaks_face[:,2:9], peaks_hindbrain[:,2:9]\n",
+    "# Per window: the maxima of the two tracks over columns 2..8 add up to exactly 1 ...\n",
+    "maxima_sum_is_one = face_mid.max(axis=1) + hindbrain_mid.max(axis=1) == 1\n",
+    "# ... and the two tracks' sums over those columns differ by more than 2\n",
+    "sums_differ = np.abs(face_mid.sum(axis=1) - hindbrain_mid.sum(axis=1)) > 2\n",
+    "\n",
+    "diff_peaks = maxima_sum_is_one & sums_differ\n",
+    "print('Diff peaks: {}'.format(np.sum(diff_peaks)))\n",
     "\n",
-    "diff_peaks_win_ids = np.where(wins_has_diff_peak)[0]\n",
-    "same_peaks_win_ids = np.where(wins_has_same_peaks)[0]\n",
+    "same_peaks = (\n",
+    "    np.max(peaks_face[:,2:9], axis=1) + np.max(peaks_hindbrain[:,2:9], axis=1)\n",
+    ") == 2\n",
+    "print('Same peaks: {}'.format(np.sum(same_peaks)))\n",
     "\n",
-    "diff_peaks_with_max = wins_has_diff_peak & ((max_signal_face >= min_peak_val_diff) | (max_signal_hindbrain >= min_peak_val_diff))\n",
+    "diff_peaks_win_ids = np.where(diff_peaks)[0]\n",
+    "same_peaks_win_ids = np.where(same_peaks)[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Diff peaks with max val >= 0.75: 55\n",
+      "Same peaks with max val >= 1: 1201\n"
+     ]
+    }
+   ],
+   "source": [
+    "diff_peaks_with_max = diff_peaks & ((max_signal_face >= min_peak_val_diff) | (max_signal_hindbrain >= min_peak_val_diff))\n",
     "diff_peaks_with_max_ids = np.where(diff_peaks_with_max)[0]\n",
     "\n",
     "print('Diff peaks with max val >= {}: {}'.format(min_peak_val_diff, np.sum(diff_peaks_with_max)))\n",
     "\n",
-    "same_peaks_with_max = wins_has_same_peaks & ((max_signal_face >= min_peak_val_same) | (max_signal_hindbrain >= min_peak_val_same))\n",
+    "same_peaks_with_max = same_peaks & ((max_signal_face >= min_peak_val_same) | (max_signal_hindbrain >= min_peak_val_same))\n",
     "same_peaks_with_max_ids = np.where(same_peaks_with_max)[0]\n",
     "\n",
     "print('Same peaks with max val >= {}: {}'.format(min_peak_val_same, np.sum(same_peaks_with_max)))"
@@ -234,7 +280,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "db6f614cd1774f999e54ba1ad336dc01",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Checkbox(value=False, description='Clear DB (Make sure you know what you do!)')"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from ipywidgets.widgets import Checkbox\n",
+    "\n",
+    "clear_db = Checkbox(value=False, description='Clear DB (Make sure you know what you do!)')\n",
+    "clear_db"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -243,55 +316,83 @@
     "\n",
     "db_path = os.path.join(base, settings[\"db_path\"])\n",
     "\n",
-    "if CLEAR_DB:\n",
+    "if os.path.exists(db_path) and not clear_db.value:\n",
+    "    print('Database already exist. Check above to delete!')\n",
+    "else:\n",
     "    os.remove(db_path)\n",
     "    DB(db_path=db_path, clear=True)\n",
-    "else:\n",
-    "    try:\n",
-    "        with sqlite3.connect(db_path) as db:\n",
-    "            c = db.cursor()\n",
-    "            c.execute(f\"SELECT * FROM classification\")\n",
-    "            c.fetchone()\n",
-    "    except sqlite3.OperationalError:\n",
-    "        DB(db_path=db_path, clear=CLEAR_DB)\n",
-    "\n",
-    "with sqlite3.connect(db_path) as db:\n",
-    "    for search_id in range(NUM_SEARCHES_TO_BE_PRELOADED): \n",
-    "        db.execute(f\"DELETE FROM classification WHERE search_id = {int(search_id)};\")\n",
-    "        db.commit()\n",
-    "\n",
-    "        for window_idx in np.random.choice(\n",
-    "            diff_peaks_with_max_ids,\n",
-    "            np.min((diff_peaks_with_max_ids.size, MAX_PRELOADED_LABELS)),\n",
-    "            replace=False\n",
-    "        ):\n",
+    "\n",
+    "    with sqlite3.connect(db_path) as db:\n",
+    "        for search_id in range(1, NUM_SEARCHES_TO_BE_PRELOADED + 1):\n",
     "            db.execute(\n",
     "                \"\"\"\n",
     "                    INSERT INTO\n",
-    "                        classification(search_id, window_id, is_positive)\n",
+    "                        search(id, target_from, target_to, config)\n",
     "                    VALUES\n",
-    "                        (?, ?, ?);\n",
+    "                        (?, ?, ?, ?);\n",
     "                \"\"\",\n",
-    "                (int(search_id), int(window_idx), 1),\n",
+    "                (int(search_id), int(target_from), int(target_to), json.dumps(settings)),\n",
     "            )\n",
     "\n",
+    "            for window_idx in np.random.choice(\n",
+    "                diff_peaks_with_max_ids,\n",
+    "                np.min((diff_peaks_with_max_ids.size, MAX_PRELOADED_LABELS)),\n",
+    "                replace=False\n",
+    "            ):\n",
+    "                db.execute(\n",
+    "                    \"\"\"\n",
+    "                        INSERT INTO\n",
+    "                            classification(search_id, window_id, is_positive)\n",
+    "                        VALUES\n",
+    "                            (?, ?, ?);\n",
+    "                    \"\"\",\n",
+    "                    (int(search_id), int(window_idx), 1),\n",
+    "                )\n",
     "\n",
-    "        for window_idx in np.random.choice(\n",
-    "            same_peaks_with_max_ids,\n",
-    "            np.min((same_peaks_with_max_ids.size, MAX_PRELOADED_LABELS)),\n",
-    "            replace=False\n",
-    "        ):\n",
-    "            db.execute(\n",
-    "                \"\"\"\n",
-    "                    INSERT INTO\n",
-    "                        classification(search_id, window_id, is_positive)\n",
-    "                    VALUES\n",
-    "                        (?, ?, ?);\n",
-    "                \"\"\",\n",
-    "                (int(search_id), int(window_idx), -1),\n",
-    "            )\n",
     "\n",
-    "        db.commit()"
+    "            for window_idx in np.random.choice(\n",
+    "                same_peaks_with_max_ids,\n",
+    "                np.min((same_peaks_with_max_ids.size, MAX_PRELOADED_LABELS)),\n",
+    "                replace=False\n",
+    "            ):\n",
+    "                db.execute(\n",
+    "                    \"\"\"\n",
+    "                        INSERT INTO\n",
+    "                            classification(search_id, window_id, is_positive)\n",
+    "                        VALUES\n",
+    "                            (?, ?, ?);\n",
+    "                    \"\"\",\n",
+    "                    (int(search_id), int(window_idx), -1),\n",
+    "                )\n",
+    "\n",
+    "            db.commit()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Make sure to start the server first!**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import time\n",
+    "\n",
+    "# Searches are visited from the highest id down to 1. Every request is followed by a\n",
+    "# 5 second pause (presumably to let the server finish its work - TODO confirm).\n",
+    "for search_id in range(NUM_SEARCHES_TO_BE_PRELOADED, 0, -1):\n",
+    "    for endpoint in (f'classifier/?s={search_id}', f'progress/?s={search_id}&u=1'):\n",
+    "        r = requests.post(url=f'http://localhost:5000/api/v1/{endpoint}')\n",
+    "        if not r.ok:\n",
+    "            # Surface server-side failures instead of silently discarding the response\n",
+    "            print(f'POST {r.url} failed with status {r.status_code}')\n",
+    "        time.sleep(5)"
    ]
   },
   {