Skip to content

Commit 845577d

Browse files
committed
Filter fusion artifacts (resolves #29)
resolves #29 related to BD2KGenomics/protect#237 Fusion calling often has FPS related to homology, especially among mitochondrial and (lnc,etc) RNA genes. Furthermore, transcriptional readthroughs are usually oncologically irrelevant. This PR adds filtering flags for mitochondrial, Immunoglobulin, RP11- (lncRNA) and transcriptional readthrough filtering.
1 parent 03eca0f commit 845577d

File tree

3 files changed

+165
-47
lines changed

3 files changed

+165
-47
lines changed

fusion.py

+100-13
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@
2020
genetic_code = {''.join([b1, b2, b3]): aa
2121
for aa, b1, b2, b3 in zip(amino, base1, base2, base3)}
2222

23+
BEDPE = collections.namedtuple('BEDPE',
24+
'chrom1, start1, end1, '
25+
'chrom2, start2, end2, '
26+
'name, '
27+
'score, '
28+
'strand1, strand2, '
29+
'junctionSeq1, junctionSeq2, '
30+
'hugo1, hugo2')
31+
2332

2433
def get_transcriptome_data(infile):
2534
"""
@@ -50,17 +59,20 @@ def get_transcriptome_data(infile):
5059
return transcript_cds, gene_transcripts
5160

5261

53-
def get_exons(genome_file, annotation_file):
62+
def get_exons(genome_file, annotation_file, fusions):
5463
"""
5564
Generates list of GTFRecord objects for each transcript
5665
5766
:param file genome_file: Reference genome FASTA file
5867
:param file annotation_file: Genome annotation file (GTF)
68+
:param list(BEDPE) fusions: The fusions under consideration for this sample.
5969
:return: GTFRecord exons
6070
:rtype: dict
6171
"""
72+
annotation_file.seek(0)
6273
chroms ={}
6374
exons = collections.defaultdict(list)
75+
genes_of_interest = set(sum([(record.hugo1, record.hugo2) for record in fusions], ()))
6476
for header, comment, seq in read_fasta(genome_file, 'ACGTN'):
6577
chroms[header] = seq
6678

@@ -69,12 +81,31 @@ def get_exons(genome_file, annotation_file):
6981
continue
7082
else:
7183
gtf = GTFRecord(line)
72-
if gtf.feature == 'exon':
84+
if gtf.feature == 'exon' and gtf.gene_name in genes_of_interest:
7385
gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end]
7486
exons[gtf.transcript_id].append(gtf)
7587
return exons
7688

7789

90+
def read_genes_from_gtf(gtf_file):
91+
"""
92+
Read the gene annotations into a dict
93+
94+
:param file gtf_file: A file handle ot the annotation file.
95+
:returns: A dict of a gtf record for each gene
96+
:rtype: dict(string, GTFRecord)
97+
"""
98+
gene_annotations = {}
99+
for line in gtf_file:
100+
if line.startswith('#'):
101+
continue
102+
else:
103+
gtf = GTFRecord(line)
104+
if gtf.feature == 'gene':
105+
gene_annotations[gtf.gene_name] = gtf
106+
return gene_annotations
107+
108+
78109
def translate(seq):
79110
"""
80111
Translates DNA sequence into protein sequence using globally defined genetic code
@@ -93,13 +124,53 @@ def translate(seq):
93124
return ''.join(protein)
94125

95126

96-
def read_fusions(fusion_file):
127+
def rna_gene_in_bedpe(record):
128+
"""
129+
Determine if one of the two candidates in a BEDPE line is an rna gene.
130+
131+
:param BEDPE record: A BEDPE line from the input file
132+
:returns: True if one of the candidates is an RNA gene and False if not
133+
:rtype: bool
134+
"""
135+
# These are heuristics
136+
return 'RP11-' in record.hugo1 or 'RP11-' in record.hugo2
137+
138+
139+
def readthrough_in_bedpe(record, annotation, rt_threshold):
140+
"""
141+
Determine if the two genes in the record are within `rt_threshold` bp of each other on the same
142+
chromosome.
143+
144+
:param BEDPE record: A BEDPE line from the input file
145+
:param dict(str, GTFRecord) annotation: see `read_fusions:gene_annotations`
146+
:param rt_threshold: The genomic distance on the same chromosome below which we will call a
147+
candidate fusion a readthrough.
148+
:returns: True if the pair is considered a readthrough and False if not
149+
:rtype: bool
150+
"""
151+
return (record.chrom1 == record.chrom2 and
152+
((annotation[record.hugo1].start <= annotation[record.hugo2].start <=
153+
annotation[record.hugo1].end + rt_threshold) or
154+
(annotation[record.hugo2].start <= annotation[record.hugo1].start <=
155+
annotation[record.hugo2].end + rt_threshold)))
156+
157+
158+
def read_fusions(fusion_file, gene_annotations, filter_mt, filter_ig, filter_rg, filter_rt,
159+
rt_threshold, out_bedpe):
97160
"""
98161
Reads in gene fusion predictions in modified BEDPE format.
99162
In addition to the basic BEDPE features, this function requires the fusion
100163
junction sequences and HUGO names for the donor and acceptor genes.
101164
165+
:param filter_ig:
102166
:param file fusion_file: Fusion calls in BEDPE format
167+
:param dict(str, GTFRecord) gene_annotations: The gene annotations form the gtf
168+
:param bool filter_mt: Filter mitochondrial events?
169+
:param bool filter_ig: Filter immunoglobulin pairs?
170+
:param bool filter_rg: Filter RNA-Gene events?
171+
:param bool filter_rt: Filter transcriptional read-throughs?
172+
:param int rt_threshold: Distance threshold to call a readthrough
173+
:param file out_bedpe: A file handle to an output BEDPE file
103174
:returns: list of BEDPE namedtuples
104175
:rtype: list
105176
@@ -119,31 +190,47 @@ def read_fusions(fusion_file):
119190
hugo1: HUGO name for first feature
120191
hugo2: HUGO name for second feature
121192
"""
122-
BEDPE = collections.namedtuple('BEDPE',
123-
'chrom1, start1, end1, '
124-
'chrom2, start2, end2, '
125-
'name, score, '
126-
'strand1, strand2, '
127-
'junctionSeq1, junctionSeq2, '
128-
'hugo1, hugo2')
129193

130194
calls = []
195+
131196
for line in csv.reader(fusion_file, delimiter='\t'):
132197
if line[0].startswith('#'):
198+
print('\t'.join(line), file=out_bedpe)
133199
continue
134200
try:
135-
calls.append(BEDPE(*line))
136-
201+
record = BEDPE(*line)
137202
except TypeError:
138203
raise ValueError("ERROR: fusion file is malformed.\n{}".format(read_fusions.__doc__))
139204

205+
if filter_mt and 'M' in record.chrom1 or 'M' in record.chrom2:
206+
logging.warning("Rejecting %s-%s for containing a Mitochondrial gene.", record.hugo1,
207+
record.hugo2)
208+
continue
209+
elif filter_ig and record.hugo1.startswith('IG') and record.hugo2.startswith('IG'):
210+
logging.warning("Rejecting %s-%s an an Immunoglobulin gene pair.", record.hugo1,
211+
record.hugo2)
212+
continue
213+
elif filter_rg and rna_gene_in_bedpe(record):
214+
logging.warning("Rejecting %s-%s for containing an RNA gene.", record.hugo1,
215+
record.hugo2)
216+
continue
217+
elif filter_rt and readthrough_in_bedpe(record, gene_annotations, rt_threshold):
218+
logging.warning("Rejecting %s-%s as a potential readthrough.", record.hugo1,
219+
record.hugo2)
220+
continue
221+
else:
222+
logging.info("Accepting %s-%s for further study.", record.hugo1, record.hugo2)
223+
print('\t'.join(line), file=out_bedpe)
224+
calls.append(record)
225+
140226
return calls
141227

142228
# Namedtuple for storing alignment metrics
143-
# Neeeds to be global for pickling
229+
# Needs to be global for pickling
144230
AlignStats = collections.namedtuple('AlignStats',
145231
'qstart, qstop, rstart, rstop, insertions, deletions')
146232

233+
147234
def align_filter(ref, query, mode, mismatches_per_kb=1):
148235
"""
149236
Aligns query to reference CDS sequence using the Smith-Waterman algorithm.

test/test_transgene.py

+4
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@ def _test_transgene(self, use_RNA=False, use_DNA=False, fusions=False):
107107
'annotation.gtf')) if fusions else None
108108
params.genome_file = open(self._get_input_path('test_input/chr21.fa')) if fusions else None
109109

110+
params.filter_mt = params.filter_rg = params.filter_rt = False
111+
params.rt_threshold = 100000
112+
params.rg_file = None
113+
110114
params.cores = 3
111115
transgene.main(params)
112116
output = {'9mer': {'fasta': 'unit_test_tumor_9_mer_snpeffed.faa',

transgene.py

+61-34
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@
2626
import sys
2727

2828
from common import read_fasta, chrom_sort
29-
from fusion import get_transcriptome_data, read_fusions, insert_fusions, get_exons
29+
from fusion import (get_exons,
30+
get_transcriptome_data,
31+
insert_fusions,
32+
read_fusions,
33+
read_genes_from_gtf)
3034

3135

3236
# A named tuple describing the result of running reject_mutation on a mutation.
@@ -1036,15 +1040,17 @@ def main(params):
10361040
transcriptome, gene_transcript_ids = None, None
10371041

10381042
# Load data from fusion file
1043+
fusions = exons = None
10391044
if params.fusion_file:
1040-
fusions = read_fusions(params.fusion_file)
1041-
else:
1042-
fusions = None
1043-
1044-
if params.genome_file and params.annotation_file:
1045-
exons = get_exons(params.genome_file, params.annotation_file)
1046-
else:
1047-
exons = None
1045+
gene_annotations = read_genes_from_gtf(params.annotation_file)
1046+
out_bedpe = open('_'.join([params.prefix, 'transgened.bedpe']), 'w')
1047+
try:
1048+
fusions = read_fusions(params.fusion_file, gene_annotations, params.filter_mt,
1049+
params.filter_ig, params.filter_rg, params.filter_rt,
1050+
params.rt_threshold, out_bedpe)
1051+
finally:
1052+
out_bedpe.close()
1053+
exons = get_exons(params.genome_file, params.annotation_file, fusions)
10481054

10491055
for peplen in params.pep_lens.split(','):
10501056
logging.info('Processing %s-mers', peplen)
@@ -1057,7 +1063,8 @@ def main(params):
10571063
if proteome_data and snvs:
10581064
insert_snvs(proteome_data, snvs, tumfile, normfile, int(peplen), params.rna_file)
10591065
if transcriptome and gene_transcript_ids and fusions:
1060-
insert_fusions(transcriptome, fusions, gene_transcript_ids, int(peplen), tumfile, exons=exons)
1066+
insert_fusions(transcriptome, fusions, gene_transcript_ids, int(peplen), tumfile,
1067+
exons=exons)
10611068

10621069
if params.no_json_dumps:
10631070
shutil.move(tumfile_path, '_'.join([params.prefix, 'tumor', peplen,
@@ -1080,36 +1087,36 @@ def run_transgene():
10801087
This will try to run transgene from system arguments
10811088
"""
10821089
parser = argparse.ArgumentParser(description=main.__doc__)
1083-
parser.add_argument('--peptides', dest='peptide_file',
1084-
type=argparse.FileType('r'),
1090+
# SNV related options
1091+
parser.add_argument('--peptides', dest='peptide_file', type=argparse.FileType('r'),
10851092
help='Path to GENCODE translation FASTA file')
1086-
parser.add_argument('--transcripts', dest='transcript_file',
1087-
type=argparse.FileType('r'),
1088-
help='Path to GENCODE transcript FASTA file')
1089-
parser.add_argument('--snpeff', dest='snpeff_file',
1090-
type=argparse.FileType('r'),
1093+
parser.add_argument('--snpeff', dest='snpeff_file', type=argparse.FileType('r'),
10911094
help='Path to snpeff file')
1092-
parser.add_argument('--cores', dest='cores', type=int,
1093-
help='Number of cores to use for the filtering step.', required=False,
1094-
default=1)
1095-
parser.add_argument('--fusions', dest='fusion_file',
1096-
help='Path to gene fusion file',
1095+
1096+
# Fusion related options
1097+
parser.add_argument('--fusions', dest='fusion_file', help='Path to gene fusion file',
10971098
type=argparse.FileType('r'))
1099+
parser.add_argument('--transcripts', dest='transcript_file', type=argparse.FileType('r'),
1100+
help='Path to GENCODE transcript FASTA file. Required if calling fusions.')
10981101
parser.add_argument('--genome', dest='genome_file',
1099-
help='Path to reference genome file',
1102+
help='Path to reference genome file, Required if calling fusions.',
11001103
type=argparse.FileType('r'))
11011104
parser.add_argument('--annotation', dest='annotation_file',
1102-
help='Path to gencode annotation file',
1105+
help='Path to gencode annotation file. Required if calling fusions.',
11031106
type=argparse.FileType('r'))
1104-
parser.add_argument('--prefix', dest='prefix', type=str,
1105-
help='Prefix for output file names', required=True)
1106-
parser.add_argument('--pep_lens', dest='pep_lens', type=str,
1107-
help='Desired peptide lengths to process. '
1108-
'The argument should be in the form of comma separated values. '
1109-
'E.g. 9,15', required=False, default='9,10,15')
1110-
parser.add_argument('--no_json_dumps', action='store_true',
1111-
help='Do not educe peptide fasta record names in the output by dumping the '
1112-
'mapping info into a .map json file.', required=False, default=False)
1107+
parser.add_argument('--filter_mt_fusions', dest='filter_mt', action='store_true',
1108+
help='Filter fusions involving Mitochondrial genes.', required=False)
1109+
parser.add_argument('--filter_ig_pairs', dest='filter_ig', action='store_true',
1110+
help='Filter fusions involving two immunoglobulin genes (IGXXX).',
1111+
required=False)
1112+
parser.add_argument('--filter_rna_gene_fusions', dest='filter_rg', action='store_true',
1113+
help='Filter fusions involving RNA genes (RP11-XXXX).', required=False)
1114+
parser.add_argument('--filter_readthroughs', dest='filter_rt', action='store_true',
1115+
help='Filter transcriptional read-troughs.', required=False)
1116+
parser.add_argument('--readthrough_threshold', dest='rt_threshold', type=int,
1117+
help='Genomic distance between candidates on the same strand below which a '
1118+
'fusion will be considered a read-through.', default=500000, required=False)
1119+
11131120
# RNA-Aware options
11141121
parser.add_argument('--rna_file', dest='rna_file', help='The path to an RNA-seq bam file. If '
11151122
'provided, the vcf will be filtered for coding mutations only. The file '
@@ -1122,6 +1129,7 @@ def run_transgene():
11221129
parser.add_argument('--min_rna_alt_freq', dest='rna_min_alt_freq', help='The ALT allele '
11231130
'frequency (as a fraction) in the RNA-Seq below which we will reject the '
11241131
'mutation.', type=float, required=False, default=0.1)
1132+
11251133
# OxoG filtering options
11261134
parser.add_argument('--filterOxoG', dest='filter_oxog', action='store_true', help='Filter the '
11271135
'calls for OxoG artifacts. This feature requires a tumor dna bam as input.',
@@ -1133,19 +1141,38 @@ def run_transgene():
11331141
'allele frequency (as a fraction) in the DNA-Seq below which we will flag'
11341142
'the mutation as being an OxoG variant.',
11351143
type=float, required=False, default=0.1)
1144+
1145+
# Logging
11361146
parser.add_argument('--log_level', dest='log_level', help='The level of logging above which '
11371147
'messages should be printed.', required=False, choices={'DEBUG', 'INFO',
11381148
'WARNING', 'ERROR'},
11391149
default='INFO')
11401150
parser.add_argument('--log_file', dest='logfile', help='A path to a logfile.', type=str,
11411151
required=False, default=None)
1152+
1153+
# Misc
1154+
parser.add_argument('--prefix', dest='prefix', type=str, help='Prefix for output file names.',
1155+
required=True)
1156+
parser.add_argument('--pep_lens', dest='pep_lens', type=str, help='Desired peptide lengths to '
1157+
'process. The argument should be in the form of comma separated values. '
1158+
'E.g. 9,15', required=False, default='9,10,15')
1159+
parser.add_argument('--no_json_dumps', action='store_true',
1160+
help='Do not educe peptide fasta record names in the output by dumping the '
1161+
'mapping info into a .map json file.', required=False, default=False)
1162+
parser.add_argument('--cores', dest='cores', type=int,
1163+
help='Number of cores to use for the filtering step.', required=False,
1164+
default=1)
11421165
params = parser.parse_args()
11431166

11441167
if params.snpeff_file and not params.peptide_file:
11451168
raise ValueError('VCF file requires GENCODE translation FASTA file')
11461169

11471170
if params.fusion_file and not params.transcript_file:
1148-
raise ValueError('Fusion file requires GENCODE transcripts FASTA file')
1171+
raise ValueError('Fusion calling requires GENCODE transcripts FASTA file')
1172+
if params.fusion_file and not params.annotation_file:
1173+
raise ValueError('Fusion calling requires GENCODE gtf annotation file')
1174+
if params.fusion_file and not params.genome_file:
1175+
raise ValueError('Fusion calling requires genomic fasta')
11491176

11501177
if params.filter_oxog:
11511178
if not params.dna_file:

0 commit comments

Comments
 (0)