forked from swiftbiosciences/16S-SNAPP-py3
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRefseq.py
executable file
·99 lines (82 loc) · 3.72 KB
/
Refseq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
## Swift Biosciences 16S snapp workflow
## Author Benli Chai & Sukhinder Sandhu 20200502
#Class of templates to be used for consensus sequences
## Ported to Python 3 on 20210106
from operator import itemgetter
from itertools import groupby
import numpy as np
class Refseq:
def __init__(self, ID):
self.ID = ID
self.seq = '' #full length reference sequence
self.reads = {}
self.positions = []
self.consensus = ''
def addRead(self, read_info):#add read info: mapped positions
ID = read_info['ID']
start, end = read_info['pos']
positions = range(start, end) #mapped positions
self.reads[ID] = positions #mapped positions
for position in positions:
if not position in self.positions:
self.positions.append(position)
self.positions.sort()
def addPEread(self, ID, read_info_R1, read_info_R2):#add PE read info: two sets of positions
s1, e1 = read_info_R1
s2, e2 = read_info_R2
positions = set(range(s1, e1)).union(set(range(s2, e2)))
positions = list(positions)
positions.sort()
self.reads[ID] = positions #mapped positions
for position in positions:
if not position in self.positions:
self.positions.append(position)
self.positions.sort()
def getReadIDs(self):
return set(self.reads.keys())
## Execute the following operations when read mapping is complete and read count normalization is done
def addSeq(self, seq): #the full length sequence of reference
self.seq = seq.upper()
def addRegs(self): #this Python 3 version is slightly different from Python 2 version but has the same return, i.e. consecutively covered regions once the read mapping is complete
self.baseRegs = []
for k, g in groupby(enumerate(self.positions), lambda x:x[0]-x[1]):
group = (map(itemgetter(1), g))
group = list(map(int, group))
self.baseRegs.append(group)
def addReadCounts(self, DF): #add minimized read counts attributable to this reference of this sample
readIDs = self.reads.keys()
self.count = DF.loc[readIDs, self.ID].to_dict() #count of reads by readID
self.baseFreq = {pos:0 for pos in self.positions} #all mapped positions
for readID, positions in self.reads.items():
for pos in positions:
self.baseFreq[pos] += self.count[readID]
def addAssign(self, tax):#replace tax assignment with better resolved reads and eventually the emsemble's assignment
self.tax = tax
def getCountSum(self):#return the count sum of reads mapped to this reference
return sum(self.count.values())
def getMeanBaseCov(self):#return the average time a mapped base is covered by reads
return np.array(self.baseFreq.values()).mean()
def getAlignLen(self):
length = 0
for reg in self.baseRegs:
length += len(reg)
return length
def getAlignPct(self):#fraction of all aligned positions
return round(len(self.positions)/float(len(self.seq)), 2)
def getProxy(self):#return the proxy sequence formed by concatenating all mapped regions
proxy = [self.seq[reg[0]:reg[-1]] for reg in self.baseRegs]
return "NNNNNNN".join(proxy)
if __name__ == '__main__':
myref = Refseq(ID='r1', seq='GCAACTGGACTGGAA')
myref.addReads(['asv1', 'asv2', 'asv3'])
myref.addReads(['asv8'])
myref.addBases(set([4, 5, 9, 3, 10]))
myref.addBases(set([4, 5, 40, 59, 10]))
print (myref.ID)
print (myref.seq)
print (myref.readIDs)
print (myref.getCount())
print (myref.bases)
print (myref.getBaseCov())
print (myref.getRegs())