-
Notifications
You must be signed in to change notification settings - Fork 9
/
runNetMHCpan.py
187 lines (164 loc) · 8.21 KB
/
runNetMHCpan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# ----------------------------------------------------------------------------------------------- #
# Claire Margolis
# runNetMHCpan.py
#
# Summary: Takes in one or more FASTA files containing all of the peptides upon which netMHCpan
# or netMHCIIpan is to be run. Runs whichever version of netMHCpan is requested and returns
# the results in an appropriately-named output file.
#
# Input format: python runNetMHCpan.py len9peptides.txt,len10peptides.txt HLAalleles.txt 1 outpath
# Options for specifying which netMHCpan version:
# 1 = netMHCIpan
# 2 = netMHCIIpan
# *RELEVANT*: HLA allele input file can be in one of two formats:
# 1. Polysolver winners_hla.txt output file
# example line from file: HLA-A hla_a_02_01_01_01 hla_a_32_01_01
# 2. Already processed, one allele per line in netMHC compatible format
# example line from file: HLA-A02:01
#
# Output: netMHCpan output .xls file(s)
#
# ----------------------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------------------- #
# Import necessary packages
#!/usr/bin/python
import sys
import numpy as np
import subprocess
import os
# ----------------------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------------------- #
# Function: runNetMHCIpan
# Inputs: FASTA file of peptide sequences, patient HLA alleles (these are automatically given
# by Polysolver and come in a .txt file that needs to be pre-processed into the correct format for
# netMHCpan), peptide length outpath
# Returns: None (netMHCpan will automatically write output to a .xls file)
# Summary: Pre-processes patient HLA alleles, runs netMHCIpan.
def runNetMHCIpan(pepfile, hlafile, length, outpath):
# Determine whether we're dealing with a snv or indel file (for naming the outfile)
varianttype = ''
if pepfile.split('_FASTA_')[1].split('.')[0] == 'snv':
varianttype = 'SNV'
if pepfile.split('_FASTA_')[1].split('.')[0] == 'indel':
varianttype = 'InDel'
# Read in HLA alleles file and process
with open(hlafile) as f:
hlalines = f.read().splitlines()
hlaalleles = []
# Determine which input format the hla allele file is in
if len(hlalines[0].split('\t')) <= 1: # In already pre-processed format
hlaalleles = hlalines
else: # Polysolver output file
for line in hlalines:
split = line.split('\t')
# Reformat each allele (2 for each type of HLA A, B, and C)
for i in range(1, 3):
currallele = 'HLA-'
allele = split[i]
components = allele.split('_')
currallele += components[1].upper() + components[2] + ':' + components[3]
hlaalleles.append(currallele)
hlaalleles = list(set(hlaalleles)) # Remove duplicate alleles if there are any
hlastring = ','.join(hlaalleles)
# Run netMHCI pan
command = 'export NHOME=/netMHCpan-4.1; export NETMHCpan=/netMHCpan-4.1/Linux_x86_64; /netMHCpan-4.1/Linux_x86_64/bin/netMHCpan -a '+hlastring+' -f '+pepfile+' -inptype 0 -l '+str(length)+' -s -xls -xlsfile '+outpath+'/NETMHCpan_out_'+str(length)+varianttype+'.xls -allname /netMHCpan-4.1/Linux_x86_64/data/allelenames -hlapseudo /netMHCpan-4.1/Linux_x86_64/data/MHC_pseudo.dat -t 500 -version /xchip/cga_home/margolis/Packages/netMHCPan/netMHCpan-3.0/data/version -tdir /netMHCpan-4.1/scratch/XXXXXX -rdir /netMHCpan-4.1/Linux_x86_64/ > '+outpath+'/netMHCpanoutlen_'+str(length)+varianttype+'.txt'
subprocess.call(command, shell=True)
# Catch case where peptide file was empty (create dummy file)
dummyfile = outpath+'/NETMHCpan_out_'+str(length)+varianttype+'.xls'
open(dummyfile, 'a').close()
return
# ----------------------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------------------- #
# Function: runNetMHCIIpan
# Inputs: FASTA file of peptide sequences, patient HLA alleles (these are automatically given
# by Polysolver and come in a .txt file that needs to be pre-processed into the correct format for
# netMHCIIpan), peptide length outpath
# Returns: None (netMHCIIpan will automatically write output to a .xls file)
# Summary: Pre-processes patient HLA alleles, runs netMHCIIpan
def runNetMHCIIpan(pepfile, hlafile, length, outpath):
# Determine whether we're dealing with a snv or indel file (for naming the outfile)
varianttype = ''
if pepfile.split('_FASTA_')[1].split('.')[0] == 'snv':
varianttype = 'SNV'
if pepfile.split('_FASTA_')[1].split('.')[0] == 'indel':
varianttype = 'InDel'
# Read in HLA alleles file and process
with open(hlafile) as f:
hlalines = f.read().splitlines()
hlaalleles = []
# Determine which input format the hla allele file is in
if len(hlalines[0].split('\t')) <= 1: # In already pre-processed format
hlaalleles = hlalines
else: # PHLAT output file
# DQA1
DQA1a = hlalines[4].split('\t')[1].split('*')[1][0:5]
DQA1a = DQA1a.split(':')[0]+DQA1a.split(':')[1]
DQA1b = hlalines[4].split('\t')[2].split('*')[1][0:5]
DQA1b = DQA1b.split(':')[0]+DQA1b.split(':')[1]
# DQB1
DQB1a = hlalines[5].split('\t')[1].split('*')[1][0:5]
DQB1a = DQB1a.split(':')[0]+DQB1a.split(':')[1]
DQB1b = hlalines[5].split('\t')[2].split('*')[1][0:5]
DQB1b = DQB1b.split(':')[0]+DQB1b.split(':')[1]
# Concatenate four DQ isoforms to be in correct format
DQA1B1a = 'HLA-DQA1'+DQA1a+'-DQB1'+DQB1a
DQA1aB1b = 'HLA-DQA1'+DQA1a+'-DQB1'+DQB1b
DQA1bB1a = 'HLA-DQA1'+DQA1b+'-DQB1'+DQB1a
DQA1B1b = 'HLA-DQA1'+DQA1b+'-DQB1'+DQB1b
# DRB1
DRB1a = hlalines[6].split('\t')[1].split('*')[1][0:5]
DRB1a = DRB1a.split(':')[0]+DRB1a.split(':')[1]
DRB1b = hlalines[6].split('\t')[2].split('*')[1][0:5]
DRB1b = DRB1b.split(':')[0]+DRB1b.split(':')[1]
# Format DRB1 alleles
DRB1a = 'DRB1_'+DRB1a
DRB1b = 'DRB1_'+DRB1b
# Add alleles to list
hlaalleles.append(DQA1B1a)
hlaalleles.append(DQA1aB1b)
hlaalleles.append(DQA1bB1a)
hlaalleles.append(DQA1B1b)
hlaalleles.append(DRB1a)
hlaalleles.append(DRB1b)
hlaalleles = list(set(hlaalleles)) # Remove duplicate alleles if there are any
hlastring = ','.join(hlaalleles)
# Run netMHCIIpan if file is not empty
if os.path.getsize(pepfile) > 1:
command = 'export NHOME=/netMHCIIpan-4.0; export NETMHCpan=/netMHCIIpan-4.0/Linux_x86_64; /netMHCIIpan-4.0/netMHCIIpan -a '+hlastring+' -f '+pepfile+' -inptype 0 -length '+str(length)+' -fast -filter 1 -affF 500 -rankF 2.0 -s -xls -xlsfile '+outpath+'/NETMHCIIpan_out_'+str(length)+varianttype+'.xls rdir /netMHCIIpan-4.0/Linux_x86_64/ > '+outpath+'/netMHCIIpanoutlen_'+str(length)+varianttype+'.txt'
subprocess.call(command, shell=True)
# Catch case where peptide file was empty (create dummy file)
dummyfile = outpath+'/NETMHCIIpan_out_'+str(length)+varianttype+'.xls'
open(dummyfile, 'a').close()
return
# ----------------------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------------------- #
# Main function
def main():
# Check to make sure we have the right number of inputs
if len(sys.argv) != 6:
print 'Error: incorrect number of inputs.'
print 'Please input FASTA file(s), a HLAalleles.txt file, the peptide length(s), a netMHCpan version, and an outpath.'
sys.exit()
# Parse inputs
fastas = sys.argv[1]
alleles = sys.argv[2]
peplengths = sys.argv[3]
versionchoice = sys.argv[4]
outpath = sys.argv[5]
# Split FASTA files and peptide lengths
fastalist = fastas.split(',')
lengthslist = peplengths.split(',')
if len(fastalist) != len(lengthslist):
print 'Error: Please make sure your peptide lengths correspond to the fasta files and are in the same order.'
sys.exit()
# Run whichever netMHC version is desired
if versionchoice == '1':
for i in range(0, len(fastalist)):
runNetMHCIpan(fastalist[i], alleles, lengthslist[i], outpath)
else:
for i in range(0, len(fastalist)):
runNetMHCIIpan(fastalist[i], alleles, lengthslist[i], outpath)
return
if __name__ == '__main__':
main()
# ----------------------------------------------------------------------------------------------- #