-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstats.py
78 lines (63 loc) · 2.06 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Thanks to Andreas Klosterman for the dask suggestion
import sys
import md5
import hashlib
import re
import argparse
from collections import defaultdict
import sqlite3
import logging
# Apply the logging module's default configuration to the root logger at
# import time, so records emitted through `logger` have a handler to reach.
logging.basicConfig()
# Module-level logger, registered under the name 'poretools'.
logger = logging.getLogger('poretools')
def _template_read_length(template):
    """Return the length of the sequence line of *template*, or None.

    The sequence is taken to be the second line of the record, with
    surrounding whitespace stripped (the record is presumably a four-line
    FASTQ entry -- confirm against whatever populates ``basecall.template``).
    ``None`` is returned when the record has fewer than two lines.
    """
    lines = template.splitlines()
    if len(lines) < 2:
        return None
    return len(lines[1].strip())


def run(parser, args):
    """Print per-ASIC read and base totals for the database at ``args.db``.

    Nothing is reported unless ``args.group_by_asic`` is true.  When it is,
    every ``basecall`` row produced by the 'ONT Sequencing Workflow'
    basecaller is visited; rows with an empty template are ignored and each
    ``trackedfiles.uuid`` is counted at most once.  One line is printed per
    ASIC id in the form ``<asic_id>\\t<reads>\\t<bases>``.

    Args:
        parser: Unused; kept for interface compatibility with callers.
        args: Object exposing ``db`` (path handed to ``sqlite3.connect``)
            and ``group_by_asic`` (truthy to enable the report).

    Returns:
        None.  Results and the progress counter are written to stdout.
    """
    conn = sqlite3.connect(args.db, check_same_thread=False, timeout=30)
    try:
        if not args.group_by_asic:
            return

        # Named `cursor` rather than `c`: the previous code unpacked the
        # template into `a, b, c, d, e`, overwriting the cursor mid-loop.
        cursor = conn.cursor()
        # Per the SQLite PRAGMA documentation: temp_store = 2 keeps
        # temporary tables and indices in memory; a positive cache_size is
        # a page count.
        cursor.execute("pragma temp_store = 2")
        cursor.execute("PRAGMA cache_size = 10000000")

        statement = """select asic_id, trackedfiles.uuid, template
            from basecall
            join basecaller using (basecaller_id)
            join trackedfiles using (filepath)
            join experiment using (experiment_id)
            join flowcell using (asic_id)
            where basecaller.name = 'ONT Sequencing Workflow'"""

        log = logging.getLogger('poretools')
        seen_uuids = set()
        statscache = defaultdict(lambda: defaultdict(int))

        for n, (asic_id, uuid, template) in enumerate(cursor.execute(statement), 1):
            # NOTE: the progress counter shares stdout with the results,
            # exactly as before.
            if n % 1000 == 0:
                print(n)
            if not template or uuid in seen_uuids:
                continue
            seen_uuids.add(uuid)

            length = _template_read_length(template)
            if length is None:
                # Previously any record that did not split into exactly
                # five pieces aborted the whole run with ValueError.
                log.warning("skipping malformed template for uuid %s", uuid)
                continue

            stats = statscache[asic_id]
            stats['reads'] += 1
            stats['bases'] += length

        for asic, stats in statscache.items():
            print("%s\t%s\t%s" % (asic, stats['reads'], stats['bases']))
    finally:
        # Release the database handle even if the query or parsing fails.
        conn.close()