-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPOCP.py
171 lines (141 loc) · 5.04 KB
/
POCP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#! /usr/bin/env python
import sys
import re
import argparse
from Bio import SeqIO
from Bio.Seq import Seq
"""
Author: Conor Meehan 23/01/2019
Script to calculate the percentage of conserved proteins (POCP) between two isolates
This is calculated as follows (based on Qin et al, Journal of Bacteriology 2014):
Proteins from the query genome were considered conserved when they had a
BLAST match with an E value of less than 1e-5,
a sequence identity of more than 40%,
and an alignable region of the query protein sequence of more than 50%.
The percentage of conserved proteins (POCP) between two genomes was calculated as
[(C1+C2)/(T1+T2)]*100%,
where C1 and C2 represent the conserved number of proteins in the two genomes being compared, respectively,
and T1 and T2 represent the total number of proteins in the two genomes being compared, respectively.
Input:
A BLASTp output in tabular format (outfmt 6) for isolate 1 as query and isolate 2 as subject
A BLASTp output in tabular format (outfmt 6) for isolate 2 as query and isolate 1 as subject
A protein file (e.g. faa file) for the isolate 1
A protein file (e.g. faa file) for the isolate 2
Output:
Save file with the PCOP inside (no rounding)
File is named as isolate1_isolate2_PCOP.txt
Names are taken from the protein faa files with the folder names (everything before /) and the faa stripped
NOTE: requires biopython
NOTE: Names in faa files are cut at the first space, as they would be in the BLAST output
Usage:
python POCP.py --blast1 genome1QueryBLASTpFile --blast2 genome2QueryBLASTpFile --protein1 faaFileSpecies1 --protein2 faaFileSpecies2
"""
#parse the inputs
parser = argparse.ArgumentParser(description= 'Script to calculate the percentage of conserved proteins (POCP) between two genomes. See script header for details.')
parser.add_argument('--blast1', required=True, help='A BLASTp output in tabular format (outfmt 6) for isolate 1 as query and isolate 2 as subject')
parser.add_argument('--blast2', required=True, help='A BLASTp output in tabular format (outfmt 6) for isolate 2 as query and isolate 1 as subject')
parser.add_argument('--protein1', required=True, help='A protein file (e.g. faa file) for the isolate 1')
parser.add_argument('--protein2', required=True, help='A protein file (e.g. faa file) for the isolate 2')
args = parser.parse_args()
#create the save file
save1=re.sub(".faa","", args.protein1)
save1=re.sub(".*/","",save1)
save2=re.sub(".faa","", args.protein2)
save2=re.sub(".*/","",save2)
try:
save=open(save1+"_"+save2+"_PCOP.txt",'w')
except IOError:
print('no room for save file')
sys.exit()
#open the BLAST files
try:
blast1F=open(args.blast1, 'r')
except IOError:
print("\n BLAST 1 not found in folder.")
sys.exit()
try:
blast2F=open(args.blast2, 'r')
except IOError:
print("\n BLAST 2 not found in folder.")
sys.exit()
#read in the protein file of isolate 1 and for each protein, get the length and name (SeqIO auto cuts it at first space) and save to a dictionary
prots1all = SeqIO.to_dict(SeqIO.parse(args.protein1, "fasta"))
prots1={}
for prot in prots1all:
prots1[prot]=len(prots1all[prot].seq)
#repeat for the protein file of isolate 2
prots2all = SeqIO.to_dict(SeqIO.parse(args.protein2, "fasta"))
prots2={}
for prot in prots2all:
prots2[prot]=len(prots2all[prot].seq)
#get the totals
T1=len(prots1)
T2=len(prots2)
#go through the first BLAST file and for each protein that has a match meeting the minimum requirements, add 1 to the counter and the protein name to the list (to avoid counting the same protein twice)
C1=0.0
matches1=[]
while 1:
line=blast1F.readline()
if not line:
break
line=line.rstrip()
sections=line.split()
name=sections[0]
id=float(sections[2])
lenMatch=float(sections[3])
evalue=float(sections[10])
#if already matched, skip
if name in matches1:
continue
#check evalue is less than 1e-05
if evalue>1e-05:
continue
#check id is greater than 40%
if id<40:
continue
#get the length of the match as a percetnage of the total query protein length
queryProtLen=prots1[name]
matchPC=lenMatch/queryProtLen
if matchPC<0.5:
continue
#if reached this stage, then have passed all checks. Increase the counter by 1 and add the name to the matched list
matches1.append(name)
C1+=1
blast1F.close()
#repeat for isolate 2 as query
C2=0.0
matches2=[]
while 1:
line=blast2F.readline()
if not line:
break
line=line.rstrip()
sections=line.split()
name=sections[0]
id=float(sections[2])
lenMatch=float(sections[3])
evalue=float(sections[10])
#if already matched, skip
if name in matches2:
continue
#check evalue is less than 1e-05
if evalue>1e-05:
continue
#check id is greater than 40%
if id<40:
continue
#get the length of the match as a percetnage of the total query protein length
queryProtLen=prots2[name]
matchPC=lenMatch/queryProtLen
if matchPC<0.5:
continue
#if reached this stage, then have passed all checks. Increase the counter by 1 and add the name to the matched list
matches2.append(name)
C2+=1
blast2F.close()
#calculate the PCOP
PCOP=((C1+C2)/(T1+T2))*100
#save to file
save.write(str(PCOP)+"\n")
save.close()
sys.exit()