|
36 | 36 | },
|
37 | 37 | {
|
38 | 38 | "cell_type": "code",
|
39 |
| - "execution_count": 2, |
| 39 | + "execution_count": 65, |
40 | 40 | "metadata": {},
|
41 | 41 | "outputs": [],
|
42 | 42 | "source": [
|
|
51 | 51 | "base = \"../\"\n",
|
52 | 52 | "settings_filepath = \"config-user-study-encode-e11-5-face-hindbrain.json\"\n",
|
53 | 53 | "window_size = 3000\n",
|
| 54 | + "step_size = 1500\n", |
54 | 55 | "resolution = 25\n",
|
| 56 | + "# 1395142003 is the absolute offset of chr10\n", |
| 57 | + "target_from = 1395142003 + 57039000\n", |
| 58 | + "target_to = 1395142003 + 57042000\n", |
| 59 | + "\n", |
| 60 | + "assert target_to - target_from == window_size\n", |
55 | 61 | "\n",
|
56 | 62 | "# Minimum value to consider a peak annotation a peak for differentially accessible peak annotations\n",
|
57 |
| - "min_peak_val_diff = 0.5 \n", |
| 63 | + "min_peak_val_diff = 0.75 \n", |
58 | 64 | "# Minimum value to consider a peak annotation a peak for equally accessible peak annotations\n",
|
59 | 65 | "min_peak_val_same = 1\n",
|
60 | 66 | "\n",
|
|
163 | 169 | },
|
164 | 170 | {
|
165 | 171 | "cell_type": "code",
|
166 |
| - "execution_count": 6, |
| 172 | + "execution_count": 58, |
| 173 | + "metadata": {}, |
| 174 | + "outputs": [], |
| 175 | + "source": [ |
def chunk_beds(bigbed, bins=11):
    """Summarize a bigBed file over sliding windows of one chromosome.

    The first chromosome listed in ``settings['chroms']`` is tiled with
    windows that are ``window_size`` wide and start every ``step_size``
    positions; each window is summarized into ``bins`` values by
    ``bbi.stackup``. ``settings``, ``window_size`` and ``step_size`` are read
    from the notebook's global namespace.

    Parameters
    ----------
    bigbed
        Passed unchanged to ``bbi.chromsizes`` and ``bbi.stackup``
        (presumably the path of a bigBed file -- confirm against the caller).
    bins : int, optional
        Number of bins per window. Defaults to 11, the value that used to be
        hard-coded; downstream cells slice columns ``2:9`` and therefore rely
        on this default.

    Returns
    -------
    numpy.ndarray
        The result of ``bbi.stackup`` cast to ``int`` (expected to hold one
        row per window and ``bins`` columns -- see the pybbi documentation).
    """
    # Only the first configured chromosome is processed, so the chromosome
    # list handed to `stackup` must repeat exactly that one name. Repeating
    # the whole `settings['chroms']` list would yield too many entries as soon
    # as more than one chromosome is configured.
    chrom = settings['chroms'][0]
    chrom_size = bbi.chromsizes(bigbed)[chrom]

    # The `+ 1` adds the final window that reaches (or overhangs) the
    # chromosome end; out-of-bounds positions are filled via `oob=0` below.
    num_total_windows = np.ceil((chrom_size - window_size) / step_size).astype(int) + 1

    start_pos = np.arange(0, step_size * num_total_windows, step_size)
    end_pos = start_pos + window_size

    return bbi.stackup(
        bigbed,
        [chrom] * num_total_windows,
        start_pos,
        end_pos,
        bins=bins,
        missing=0,
        oob=0,
    ).astype(int)
| 196 | + ] |
| 197 | + }, |
| 198 | + { |
| 199 | + "cell_type": "code", |
| 200 | + "execution_count": 61, |
| 201 | + "metadata": {}, |
| 202 | + "outputs": [], |
| 203 | + "source": [ |
# Binned peak annotations per sliding window, one array per tissue.
peaks_face, peaks_hindbrain = (
    chunk_beds(narrow_peaks_face),
    chunk_beds(narrow_peaks_hindbrain),
)
| 206 | + ] |
| 207 | + }, |
| 208 | + { |
| 209 | + "cell_type": "code", |
| 210 | + "execution_count": 62, |
167 | 211 | "metadata": {},
|
168 | 212 | "outputs": [
|
169 | 213 | {
|
170 | 214 | "name": "stdout",
|
171 | 215 | "output_type": "stream",
|
172 | 216 | "text": [
|
173 |
| - "Extracted 87129 windows from chr10 with a max value of 1.0.\n", |
174 |
| - "Extracted 87129 windows from chr10 with a max value of 1.0.\n", |
175 |
| - "Face peaks: 11769\n", |
176 |
| - "Hindbrain peaks: 11257\n", |
177 |
| - "Diff peaks: 644\n", |
178 |
| - "Same peaks: 11191\n", |
179 |
| - "Diff peaks with max val >= 0.5: 77\n", |
180 |
| - "Same peaks with max val >= 1: 1572\n" |
| 217 | + "Face peaks: 11008\n", |
| 218 | + "Hindbrain peaks: 8347\n", |
| 219 | + "Diff peaks: 1762\n", |
| 220 | + "Same peaks: 5130\n" |
181 | 221 | ]
|
182 | 222 | }
|
183 | 223 | ],
|
184 | 224 | "source": [
|
185 |
| - "from ae.utils import chunk_beds_binary\n", |
# Only columns 2..8 of the binned peak annotations are inspected, i.e. the
# first and last two bins of every window are ignored.
# NOTE(review): presumably this skips peaks that merely touch the window
# border -- confirm.
center_bins = slice(2, 9)
face_center = peaks_face[:, center_bins]
hindbrain_center = peaks_hindbrain[:, center_bins]

# Per-window maximum over the inspected bins.
face_max = np.max(face_center, axis=1)
hindbrain_max = np.max(hindbrain_center, axis=1)

print('Face peaks: {}'.format(np.sum(face_max)))
print('Hindbrain peaks: {}'.format(np.sum(hindbrain_max)))

max_total = face_max + hindbrain_max

# Absolute difference between the two tissues' per-window bin sums.
bin_sum_gap = np.abs(np.sum(face_center, axis=1) - np.sum(hindbrain_center, axis=1))

# Differential windows: the maxima add up to exactly 1 and the bin sums
# differ by more than 2.
diff_peaks = (max_total == 1) & (bin_sum_gap > 2)
print('Diff peaks: {}'.format(np.sum(diff_peaks)))

# Shared windows: the maxima add up to exactly 2.
same_peaks = max_total == 2
print('Same peaks: {}'.format(np.sum(same_peaks)))

diff_peaks_win_ids = np.where(diff_peaks)[0]
same_peaks_win_ids = np.where(same_peaks)[0]
| 244 | + ] |
| 245 | + }, |
| 246 | + { |
| 247 | + "cell_type": "code", |
| 248 | + "execution_count": 66, |
| 249 | + "metadata": {}, |
| 250 | + "outputs": [ |
| 251 | + { |
| 252 | + "name": "stdout", |
| 253 | + "output_type": "stream", |
| 254 | + "text": [ |
| 255 | + "Diff peaks with max val >= 0.75: 55\n", |
| 256 | + "Same peaks with max val >= 1: 1201\n" |
| 257 | + ] |
| 258 | + } |
| 259 | + ], |
| 260 | + "source": [ |
# Keep only differential windows in which at least one tissue's maximum
# signal reaches `min_peak_val_diff`.
reaches_diff_min = (max_signal_face >= min_peak_val_diff) | (
    max_signal_hindbrain >= min_peak_val_diff
)
diff_peaks_with_max = diff_peaks & reaches_diff_min
diff_peaks_with_max_ids = np.where(diff_peaks_with_max)[0]

print('Diff peaks with max val >= {}: {}'.format(min_peak_val_diff, np.sum(diff_peaks_with_max)))

# Same filter for the shared windows, using `min_peak_val_same`.
reaches_same_min = (max_signal_face >= min_peak_val_same) | (
    max_signal_hindbrain >= min_peak_val_same
)
same_peaks_with_max = same_peaks & reaches_same_min
same_peaks_with_max_ids = np.where(same_peaks_with_max)[0]

print('Same peaks with max val >= {}: {}'.format(min_peak_val_same, np.sum(same_peaks_with_max)))
|
|
234 | 280 | },
|
235 | 281 | {
|
236 | 282 | "cell_type": "code",
|
237 |
| - "execution_count": 7, |
| 283 | + "execution_count": 15, |
| 284 | + "metadata": {}, |
| 285 | + "outputs": [ |
| 286 | + { |
| 287 | + "data": { |
| 288 | + "application/vnd.jupyter.widget-view+json": { |
| 289 | + "model_id": "db6f614cd1774f999e54ba1ad336dc01", |
| 290 | + "version_major": 2, |
| 291 | + "version_minor": 0 |
| 292 | + }, |
| 293 | + "text/plain": [ |
| 294 | + "Checkbox(value=False, description='Clear DB (Make sure you know what you do!)')" |
| 295 | + ] |
| 296 | + }, |
| 297 | + "metadata": {}, |
| 298 | + "output_type": "display_data" |
| 299 | + } |
| 300 | + ], |
| 301 | + "source": [ |
from ipywidgets.widgets import Checkbox

# Opt-in switch read by the database cell below: an existing database is
# only replaced while this box is ticked.
clear_db = Checkbox(
    description='Clear DB (Make sure you know what you do!)',
    value=False,
)
clear_db
| 306 | + ] |
| 307 | + }, |
| 308 | + { |
| 309 | + "cell_type": "code", |
| 310 | + "execution_count": 74, |
238 | 311 | "metadata": {},
|
239 | 312 | "outputs": [],
|
240 | 313 | "source": [
|
|
243 | 316 | "\n",
|
244 | 317 | "db_path = os.path.join(base, settings[\"db_path\"])\n",
|
245 | 318 | "\n",
|
246 |
| - "if CLEAR_DB:\n", |
def _preload_labels(db, search_id, window_ids, is_positive):
    """Insert randomly drawn windows as labels of one search.

    At most ``MAX_PRELOADED_LABELS`` entries of ``window_ids`` are drawn
    without replacement and written to the ``classification`` table with the
    given ``is_positive`` value. Nothing is inserted when ``window_ids`` is
    empty.
    """
    num_labels = np.min((window_ids.size, MAX_PRELOADED_LABELS))
    if num_labels == 0:
        return

    chosen_ids = np.random.choice(window_ids, num_labels, replace=False)
    db.executemany(
        """
        INSERT INTO
            classification(search_id, window_id, is_positive)
        VALUES
            (?, ?, ?);
        """,
        [(int(search_id), int(window_idx), is_positive) for window_idx in chosen_ids],
    )


db_exists = os.path.exists(db_path)

if db_exists and not clear_db.value:
    print('Database already exist. Check above to delete!')
else:
    # Only delete a file that is actually there: on the very first run there
    # is no database yet and an unconditional `os.remove` would raise
    # FileNotFoundError.
    if db_exists:
        os.remove(db_path)
    DB(db_path=db_path, clear=True)

    with sqlite3.connect(db_path) as db:
        for search_id in range(1, NUM_SEARCHES_TO_BE_PRELOADED + 1):
            # Register the search itself; every preloaded search uses the same
            # target region and the full settings as its config.
            db.execute(
                """
                INSERT INTO
                    search(id, target_from, target_to, config)
                VALUES
                    (?, ?, ?, ?);
                """,
                (int(search_id), int(target_from), int(target_to), json.dumps(settings)),
            )

            # Differential windows are stored with is_positive = 1, shared
            # windows with is_positive = -1.
            _preload_labels(db, search_id, diff_peaks_with_max_ids, 1)
            _preload_labels(db, search_id, same_peaks_with_max_ids, -1)

        db.commit()
| 369 | + ] |
| 370 | + }, |
| 371 | + { |
| 372 | + "cell_type": "markdown", |
| 373 | + "metadata": {}, |
| 374 | + "source": [ |
| 375 | + "**Make sure to start the server first!**" |
| 376 | + ] |
| 377 | + }, |
| 378 | + { |
| 379 | + "cell_type": "code", |
| 380 | + "execution_count": 75, |
| 381 | + "metadata": {}, |
| 382 | + "outputs": [], |
| 383 | + "source": [ |
import time

import requests

api_url = 'http://localhost:5000/api/v1'

# Walk the preloaded searches from the highest id down to 1 and hit the
# classifier and progress endpoints of the locally running server.
# NOTE(review): presumably the first request triggers classifier training and
# the second one the progress computation -- confirm against the server API.
for search_id in range(NUM_SEARCHES_TO_BE_PRELOADED, 0, -1):
    r = requests.post(url=f'{api_url}/classifier/', params={'s': search_id})
    # Fail loudly on an HTTP error instead of silently moving on.
    r.raise_for_status()
    # NOTE(review): the pause presumably gives the server time to finish
    # before the next request -- confirm whether 5 seconds is sufficient.
    time.sleep(5)

    r = requests.post(url=f'{api_url}/progress/', params={'s': search_id, 'u': 1})
    r.raise_for_status()
    time.sleep(5)
295 | 396 | ]
|
296 | 397 | },
|
297 | 398 | {
|
|
0 commit comments