-
Notifications
You must be signed in to change notification settings - Fork 3
/
countReadlets-all.py
57 lines (40 loc) · 1.76 KB
/
countReadlets-all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/python
### readlet counting script
### using numpy arrays instead of dictionaries - see if they are faster, more efficient, and allow us to filter all at once.
### and also to merge (& maybe filter) all at once. YEAH BABY.
# add chromosome lengths
chrlen = 249250621
fnames = ["orbFrontalF1-small.bam", "toyfile.bam"]
def countReadlets(fnames,outfname,chromosome):
import numpy as np
import pysam
dictlist = []
for samp in fnames:
samfile = pysam.Samfile(samp,"rb") # parse the .bam (sequence alignment) file
id_start_end = []
for read in samfile.fetch(chromosome):
id_start_end.append([read.qname, read.pos+1, read.aend])
print "starting sample!"
print len(id_start_end)
datarow = np.zeros((chrlen, len(fnames)))
for idx,rlet in enumerate(id_start_end):
print "starting readlet" + " " + str(idx)
for i in range(rlet[1], rlet[2]):
datarow[i] += 1
print "finished one sample!"
if fnames.index(samp)==0:
datatable = datarow
else:
datatable = np.vstack((datatable, datarow))
return datatable
#starr-seq
### get arguments from command line
### use: python countReadlets.py --file myfile.bam --output outfile.txt --kmer 100 --chrom 22
#import argparse
#parser = argparse.ArgumentParser(description="get arguments from command line")
#parser.add_argument("--files", nargs = "+", help="bam files to count & filter")
#parser.add_argument("--output", help="output file to store results")
#parser.add_argument("--chrom", help="chromosome to parse")
#args = parser.parse_args()
#countReadlets(args.files,args.output,args.chrom)
# also try sparsearray data structure