
Commit 1b72fe5

Update preload-labels.ipynb

1 parent cd3a2da commit 1b72fe5


1 file changed: examples/preload-labels.ipynb (+180 -79 lines)
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 65,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,10 +51,16 @@
     "base = \"../\"\n",
     "settings_filepath = \"config-user-study-encode-e11-5-face-hindbrain.json\"\n",
     "window_size = 3000\n",
+    "step_size = 1500\n",
     "resolution = 25\n",
+    "# 1395142003 is the absolute offset of chr10\n",
+    "target_from = 1395142003 + 57039000\n",
+    "target_to = 1395142003 + 57042000\n",
+    "\n",
+    "assert target_to - target_from == window_size\n",
     "\n",
     "# Minimum value to consider a peak annotation a peak for differential accessible peak annotations\n",
-    "min_peak_val_diff = 0.5 \n",
+    "min_peak_val_diff = 0.75 \n",
     "# Minimum value to consider a peak annotation a peak for equally accessible peak annotations\n",
     "min_peak_val_same = 1\n",
     "\n",
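A side note on the new parameters: step_size is half of window_size, so consecutive windows overlap by 50%, and the target region spans exactly one window. A minimal sketch of the coordinate arithmetic (constants copied from the hunk above; reading 1395142003 as the cumulative length of all chromosomes preceding chr10 is an assumption, not something this commit states):

# Sketch only; constants are taken from the hunk above.
chr10_offset = 1395142003  # assumed: absolute genome coordinate at which chr10 starts
window_size = 3000
step_size = 1500           # window_size / 2, i.e. 50% overlapping windows

target_from = chr10_offset + 57039000
target_to = chr10_offset + 57042000
assert target_to - target_from == window_size  # the target spans exactly one window

# Converting the absolute target back to chr10-relative coordinates:
rel_from = target_from - chr10_offset  # 57039000
rel_to = target_to - chr10_offset      # 57042000
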
@@ -163,61 +169,101 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def chunk_beds(bigbed):\n",
+    "    bins = 11\n",
+    "\n",
+    "    chrom_sizes = bbi.chromsizes(bigbed)\n",
+    "    chrom_size = chrom_sizes[settings['chroms'][0]]\n",
+    "    num_total_windows = np.ceil((chrom_size - window_size) / step_size).astype(int) + 1\n",
+    "\n",
+    "    num_windows = np.ceil((chrom_size - window_size) / step_size).astype(int) + 1\n",
+    "    start_pos = np.arange(0, step_size * num_total_windows, step_size)\n",
+    "    end_pos = np.arange(window_size, step_size * num_total_windows + window_size, step_size)\n",
+    "\n",
+    "    return bbi.stackup(\n",
+    "        bigbed,\n",
+    "        settings['chroms'] * num_total_windows,\n",
+    "        start_pos,\n",
+    "        end_pos,\n",
+    "        bins=bins,\n",
+    "        missing=0,\n",
+    "        oob=0,\n",
+    "    ).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "peaks_face = chunk_beds(narrow_peaks_face)\n",
+    "peaks_hindbrain = chunk_beds(narrow_peaks_hindbrain)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Extracted 87129 windows from chr10 with a max value of 1.0.\n",
-      "Extracted 87129 windows from chr10 with a max value of 1.0.\n",
-      "Face peaks: 11769\n",
-      "Hindbrain peaks: 11257\n",
-      "Diff peaks: 644\n",
-      "Same peaks: 11191\n",
-      "Diff peaks with max val >= 0.5: 77\n",
-      "Same peaks with max val >= 1: 1572\n"
+      "Face peaks: 11008\n",
+      "Hindbrain peaks: 8347\n",
+      "Diff peaks: 1762\n",
+      "Same peaks: 5130\n"
      ]
     }
    ],
    "source": [
-    "from ae.utils import chunk_beds_binary\n",
+    "print('Face peaks: {}'.format(np.sum(np.max(peaks_face[:,2:9], axis=1))))\n",
+    "print('Hindbrain peaks: {}'.format(np.sum(np.max(peaks_hindbrain[:,2:9], axis=1))))\n",
     "\n",
-    "face_wins_has_peaks = chunk_beds_binary(\n",
-    "    broad_peaks_face,\n",
-    "    window_size,\n",
-    "    window_size // settings['step_freq'],\n",
-    "    settings['chroms'],\n",
-    "    verbose=True,\n",
-    ").flatten()\n",
-    "\n",
-    "hindbrain_wins_has_peaks = chunk_beds_binary(\n",
-    "    broad_peaks_hindbrain,\n",
-    "    window_size,\n",
-    "    window_size // settings['step_freq'],\n",
-    "    settings['chroms'],\n",
-    "    verbose=True,\n",
-    ").flatten()\n",
-    "\n",
-    "print('Face peaks: {}'.format(np.sum(face_wins_has_peaks)))\n",
-    "print('Hindbrain peaks: {}'.format(np.sum(hindbrain_wins_has_peaks)))\n",
-    "\n",
-    "wins_has_diff_peak = (face_wins_has_peaks + hindbrain_wins_has_peaks) == 1\n",
-    "print('Diff peaks: {}'.format(np.sum(wins_has_diff_peak)))\n",
-    "\n",
-    "wins_has_same_peaks = (face_wins_has_peaks + hindbrain_wins_has_peaks) == 2\n",
-    "print('Same peaks: {}'.format(np.sum(wins_has_same_peaks)))\n",
+    "diff_peaks = (\n",
+    "    (\n",
+    "        np.max(peaks_face[:,2:9], axis=1) + np.max(peaks_hindbrain[:,2:9], axis=1) == 1\n",
+    "    ) & (\n",
+    "        np.abs(np.sum(peaks_face[:,2:9], axis=1) - np.sum(peaks_hindbrain[:,2:9], axis=1)) > 2\n",
+    "    )\n",
+    ")\n",
+    "print('Diff peaks: {}'.format(np.sum(diff_peaks)))\n",
     "\n",
-    "diff_peaks_win_ids = np.where(wins_has_diff_peak)[0]\n",
-    "same_peaks_win_ids = np.where(wins_has_same_peaks)[0]\n",
+    "same_peaks = (\n",
+    "    np.max(peaks_face[:,2:9], axis=1) + np.max(peaks_hindbrain[:,2:9], axis=1)\n",
+    ") == 2\n",
+    "print('Same peaks: {}'.format(np.sum(same_peaks)))\n",
     "\n",
-    "diff_peaks_with_max = wins_has_diff_peak & ((max_signal_face >= min_peak_val_diff) | (max_signal_hindbrain >= min_peak_val_diff))\n",
+    "diff_peaks_win_ids = np.where(diff_peaks)[0]\n",
+    "same_peaks_win_ids = np.where(same_peaks)[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Diff peaks with max val >= 0.75: 55\n",
+      "Same peaks with max val >= 1: 1201\n"
+     ]
+    }
+   ],
+   "source": [
+    "diff_peaks_with_max = diff_peaks & ((max_signal_face >= min_peak_val_diff) | (max_signal_hindbrain >= min_peak_val_diff))\n",
     "diff_peaks_with_max_ids = np.where(diff_peaks_with_max)[0]\n",
     "\n",
     "print('Diff peaks with max val >= {}: {}'.format(min_peak_val_diff, np.sum(diff_peaks_with_max)))\n",
     "\n",
-    "same_peaks_with_max = wins_has_same_peaks & ((max_signal_face >= min_peak_val_same) | (max_signal_hindbrain >= min_peak_val_same))\n",
+    "same_peaks_with_max = same_peaks & ((max_signal_face >= min_peak_val_same) | (max_signal_hindbrain >= min_peak_val_same))\n",
     "same_peaks_with_max_ids = np.where(same_peaks_with_max)[0]\n",
     "\n",
     "print('Same peaks with max val >= {}: {}'.format(min_peak_val_same, np.sum(same_peaks_with_max)))"
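To make the rewritten window classification concrete, here is a self-contained sketch with toy arrays standing in for the chunk_beds() results (the toy data is invented; the [:,2:9] slice, i.e. the central 7 of the 11 bins, mirrors the code above and presumably ignores peaks that only graze a window's edges):

import numpy as np

# Toy stand-ins for the (num_windows x 11 bins) binary arrays from chunk_beds().
peaks_face = np.array([
    [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0],  # strong face-only peak
    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # weak face-only peak
    [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],  # peak in both tissues
])
peaks_hindbrain = np.array([
    [0] * 11,
    [0] * 11,
    [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
])

face_hit = np.max(peaks_face[:, 2:9], axis=1)       # [1 1 1]
hind_hit = np.max(peaks_hindbrain[:, 2:9], axis=1)  # [0 0 1]

# "Diff" requires a peak in exactly one tissue AND a bin-count gap > 2,
# which filters out borderline one-bin differences like window 1.
diff_peaks = (face_hit + hind_hit == 1) & (
    np.abs(
        np.sum(peaks_face[:, 2:9], axis=1) - np.sum(peaks_hindbrain[:, 2:9], axis=1)
    ) > 2
)
# "Same" simply requires a peak in both tissues.
same_peaks = (face_hit + hind_hit) == 2

print(diff_peaks)  # [ True False False]
print(same_peaks)  # [False False  True]
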
@@ -234,7 +280,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "db6f614cd1774f999e54ba1ad336dc01",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Checkbox(value=False, description='Clear DB (Make sure you know what you do!)')"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from ipywidgets.widgets import Checkbox\n",
+    "\n",
+    "clear_db = Checkbox(value=False, description='Clear DB (Make sure you know what you do!)')\n",
+    "clear_db"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
    "metadata": {},
    "outputs": [],
    "source": [
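The new Checkbox guards the destructive path in the next hunk: the widget is rendered in one cell and its live .value is read in the next, so ticking the box between cell runs is what arms the delete-and-recreate branch. One caveat: the else branch below calls os.remove(db_path) unconditionally, which raises FileNotFoundError on a first run where the database file does not exist yet. A guarded variant (hypothetical helper, not part of this commit) could look like:

import os

def remove_if_exists(path):
    # Avoid FileNotFoundError on a first run where the DB was never created.
    if os.path.exists(path):
        os.remove(path)
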
@@ -243,55 +316,83 @@
     "\n",
     "db_path = os.path.join(base, settings[\"db_path\"])\n",
     "\n",
-    "if CLEAR_DB:\n",
+    "if os.path.exists(db_path) and not clear_db.value:\n",
+    "    print('Database already exist. Check above to delete!')\n",
+    "else:\n",
     "    os.remove(db_path)\n",
     "    DB(db_path=db_path, clear=True)\n",
-    "else:\n",
-    "    try:\n",
-    "        with sqlite3.connect(db_path) as db:\n",
-    "            c = db.cursor()\n",
-    "            c.execute(f\"SELECT * FROM classification\")\n",
-    "            c.fetchone()\n",
-    "    except sqlite3.OperationalError:\n",
-    "        DB(db_path=db_path, clear=CLEAR_DB)\n",
-    "\n",
-    "with sqlite3.connect(db_path) as db:\n",
-    "    for search_id in range(NUM_SEARCHES_TO_BE_PRELOADED): \n",
-    "        db.execute(f\"DELETE FROM classification WHERE search_id = {int(search_id)};\")\n",
-    "        db.commit()\n",
-    "\n",
-    "        for window_idx in np.random.choice(\n",
-    "            diff_peaks_with_max_ids,\n",
-    "            np.min((diff_peaks_with_max_ids.size, MAX_PRELOADED_LABELS)),\n",
-    "            replace=False\n",
-    "        ):\n",
+    "\n",
+    "    with sqlite3.connect(db_path) as db:\n",
+    "        for search_id in range(1, NUM_SEARCHES_TO_BE_PRELOADED + 1):\n",
     "            db.execute(\n",
     "                \"\"\"\n",
     "                INSERT INTO\n",
-    "                    classification(search_id, window_id, is_positive)\n",
+    "                    search(id, target_from, target_to, config)\n",
     "                VALUES\n",
-    "                    (?, ?, ?);\n",
+    "                    (?, ?, ?, ?);\n",
     "                \"\"\",\n",
-    "                (int(search_id), int(window_idx), 1),\n",
+    "                (int(search_id), int(target_from), int(target_to), json.dumps(settings)),\n",
     "            )\n",
     "\n",
+    "            for window_idx in np.random.choice(\n",
+    "                diff_peaks_with_max_ids,\n",
+    "                np.min((diff_peaks_with_max_ids.size, MAX_PRELOADED_LABELS)),\n",
+    "                replace=False\n",
+    "            ):\n",
+    "                db.execute(\n",
+    "                    \"\"\"\n",
+    "                    INSERT INTO\n",
+    "                        classification(search_id, window_id, is_positive)\n",
+    "                    VALUES\n",
+    "                        (?, ?, ?);\n",
+    "                    \"\"\",\n",
+    "                    (int(search_id), int(window_idx), 1),\n",
+    "                )\n",
     "\n",
-    "        for window_idx in np.random.choice(\n",
-    "            same_peaks_with_max_ids,\n",
-    "            np.min((same_peaks_with_max_ids.size, MAX_PRELOADED_LABELS)),\n",
-    "            replace=False\n",
-    "        ):\n",
-    "            db.execute(\n",
-    "                \"\"\"\n",
-    "                INSERT INTO\n",
-    "                    classification(search_id, window_id, is_positive)\n",
-    "                VALUES\n",
-    "                    (?, ?, ?);\n",
-    "                \"\"\",\n",
-    "                (int(search_id), int(window_idx), -1),\n",
-    "            )\n",
     "\n",
-    "    db.commit()"
+    "            for window_idx in np.random.choice(\n",
+    "                same_peaks_with_max_ids,\n",
+    "                np.min((same_peaks_with_max_ids.size, MAX_PRELOADED_LABELS)),\n",
+    "                replace=False\n",
+    "            ):\n",
+    "                db.execute(\n",
+    "                    \"\"\"\n",
+    "                    INSERT INTO\n",
+    "                        classification(search_id, window_id, is_positive)\n",
+    "                    VALUES\n",
+    "                        (?, ?, ?);\n",
+    "                    \"\"\",\n",
+    "                    (int(search_id), int(window_idx), -1),\n",
+    "                )\n",
+    "\n",
+    "        db.commit()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Make sure to start the server first!**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import time\n",
+    "\n",
+    "for search_id in range(NUM_SEARCHES_TO_BE_PRELOADED, 0, -1):\n",
+    "    r = requests.post(\n",
+    "        url = f'http://localhost:5000/api/v1/classifier/?s={search_id}'\n",
+    "    )\n",
+    "    time.sleep(5)\n",
+    "    r = requests.post(\n",
+    "        url = f'http://localhost:5000/api/v1/progress/?s={search_id}&u=1'\n",
+    "    )\n",
+    "    time.sleep(5)"
    ]
   },
   {
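The new preloading loop fires the classifier and progress endpoints for each search, pausing between requests. A slightly more defensive sketch of the same pattern (the endpoints and the fixed 5-second pacing come from the commit; the status check and the NUM_SEARCHES_TO_BE_PRELOADED placeholder are additions for illustration):

import time

import requests

NUM_SEARCHES_TO_BE_PRELOADED = 3  # placeholder; defined earlier in the notebook

for search_id in range(NUM_SEARCHES_TO_BE_PRELOADED, 0, -1):
    for endpoint in (
        f'http://localhost:5000/api/v1/classifier/?s={search_id}',
        f'http://localhost:5000/api/v1/progress/?s={search_id}&u=1',
    ):
        r = requests.post(url=endpoint)
        if r.status_code >= 400:
            print(f'Request to {endpoint} failed with {r.status_code}')
        time.sleep(5)  # give the server time to process each request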
