Skip to content

Commit c37c8c5

Browse files
committed
Filter fusion artifacts (resolves #29)
resolves #29 related to BD2KGenomics/protect#237 Fusion calling often has FPS related to homology, especially among mitochondrial and (lnc,etc) RNA genes. Furthermore, transcriptional readthroughs are usually oncologically irrelevant. This PR adds filtering flags for mitochondrial, Immunoglobulin, RP11- (lncRNA) and transcriptional readthrough filtering.
1 parent 03eca0f commit c37c8c5

File tree

3 files changed

+158
-47
lines changed

3 files changed

+158
-47
lines changed

fusion.py

+97-13
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@
2020
genetic_code = {''.join([b1, b2, b3]): aa
2121
for aa, b1, b2, b3 in zip(amino, base1, base2, base3)}
2222

23+
BEDPE = collections.namedtuple('BEDPE',
24+
'chrom1, start1, end1, '
25+
'chrom2, start2, end2, '
26+
'name, '
27+
'score, '
28+
'strand1, strand2, '
29+
'junctionSeq1, junctionSeq2, '
30+
'hugo1, hugo2')
31+
2332

2433
def get_transcriptome_data(infile):
2534
"""
@@ -50,17 +59,20 @@ def get_transcriptome_data(infile):
5059
return transcript_cds, gene_transcripts
5160

5261

53-
def get_exons(genome_file, annotation_file):
62+
def get_exons(genome_file, annotation_file, fusions):
5463
"""
5564
Generates list of GTFRecord objects for each transcript
5665
5766
:param file genome_file: Reference genome FASTA file
5867
:param file annotation_file: Genome annotation file (GTF)
68+
:param list(BEDPE) fusions: The fusions under consideration for this sample.
5969
:return: GTFRecord exons
6070
:rtype: dict
6171
"""
72+
annotation_file.seek(0)
6273
chroms ={}
6374
exons = collections.defaultdict(list)
75+
genes_of_interest = set(sum([(record.hugo1, record.hugo2) for record in fusions], ()))
6476
for header, comment, seq in read_fasta(genome_file, 'ACGTN'):
6577
chroms[header] = seq
6678

@@ -69,12 +81,31 @@ def get_exons(genome_file, annotation_file):
6981
continue
7082
else:
7183
gtf = GTFRecord(line)
72-
if gtf.feature == 'exon':
84+
if gtf.feature == 'exon' and gtf.gene_name in genes_of_interest:
7385
gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end]
7486
exons[gtf.transcript_id].append(gtf)
7587
return exons
7688

7789

90+
def read_genes_from_gtf(gtf_file):
91+
"""
92+
Read the gene annotations into a dict
93+
94+
:param file gtf_file: A file handle ot the annotation file.
95+
:returns: A dict of a gtf record for each gene
96+
:rtype: dict(string, GTFRecord)
97+
"""
98+
gene_annotations = {}
99+
for line in gtf_file:
100+
if line.startswith('#'):
101+
continue
102+
else:
103+
gtf = GTFRecord(line)
104+
if gtf.feature == 'gene':
105+
gene_annotations[gtf.gene_name] = gtf
106+
return gene_annotations
107+
108+
78109
def translate(seq):
79110
"""
80111
Translates DNA sequence into protein sequence using globally defined genetic code
@@ -93,13 +124,52 @@ def translate(seq):
93124
return ''.join(protein)
94125

95126

96-
def read_fusions(fusion_file):
127+
def rna_gene_in_bedpe(record):
128+
"""
129+
Determine if one of the two candidates in a BEDPE line is an rna gene.
130+
131+
:param BEDPE record: A BEDPE line from the input file
132+
:returns: True if one of the candidates is an RNA gene and False if not
133+
:rtype: bool
134+
"""
135+
# These are heuristics
136+
return 'RP11-' in record.hugo1 or 'RP11-' in record.hugo2
137+
138+
139+
def readthrough_in_bedpe(record, annotation, rt_threshold):
140+
"""
141+
Determine if the two genes in the record are within `rt_threshold` bp of each other on the same
142+
chromosome.
143+
144+
:param BEDPE record: A BEDPE line from the input file
145+
:param dict(str, GTFRecord) annotation: see `read_fusions:gene_annotations`
146+
:param rt_threshold: The genomic distance on the same chromosome below which we will call a
147+
candidate fusion a readthrough.
148+
:returns: True if the pair is considered a readthrough and False if not
149+
:rtype: bool
150+
"""
151+
return (record.chrom1 == record.chrom2 and
152+
((annotation[record.hugo1].start <= annotation[record.hugo2].start <=
153+
annotation[record.hugo1].end + rt_threshold) or
154+
(annotation[record.hugo2].start <= annotation[record.hugo1].start <=
155+
annotation[record.hugo2].end + rt_threshold)))
156+
157+
158+
def read_fusions(fusion_file, gene_annotations, filter_mt, filter_ig, filter_rg, filter_rt,
159+
rt_threshold):
97160
"""
98161
Reads in gene fusion predictions in modified BEDPE format.
99162
In addition to the basic BEDPE features, this function requires the fusion
100163
junction sequences and HUGO names for the donor and acceptor genes.
101164
165+
:param filter_ig:
102166
:param file fusion_file: Fusion calls in BEDPE format
167+
:param dict(str, GTFRecord) gene_annotations: The gene annotations form the gtf
168+
:param bool filter_mt: Filter mitochondrial events?
169+
:param bool filter_ig: Filter immunoglobulin pairs?
170+
:param bool filter_rg: Filter RNA-Gene events?
171+
:param bool filter_rt: Filter transcriptional read-throughs?
172+
:param int rt_threshold: Distance threshold to call a readthrough
103173
:returns: list of BEDPE namedtuples
104174
:rtype: list
105175
@@ -119,31 +189,45 @@ def read_fusions(fusion_file):
119189
hugo1: HUGO name for first feature
120190
hugo2: HUGO name for second feature
121191
"""
122-
BEDPE = collections.namedtuple('BEDPE',
123-
'chrom1, start1, end1, '
124-
'chrom2, start2, end2, '
125-
'name, score, '
126-
'strand1, strand2, '
127-
'junctionSeq1, junctionSeq2, '
128-
'hugo1, hugo2')
129192

130193
calls = []
194+
131195
for line in csv.reader(fusion_file, delimiter='\t'):
132196
if line[0].startswith('#'):
133197
continue
134198
try:
135-
calls.append(BEDPE(*line))
136-
199+
record = BEDPE(*line)
137200
except TypeError:
138201
raise ValueError("ERROR: fusion file is malformed.\n{}".format(read_fusions.__doc__))
139202

203+
if filter_mt and 'M' in record.chrom1 or 'M' in record.chrom2:
204+
logging.warning("Rejecting %s-%s for containing a Mitochondrial gene.", record.hugo1,
205+
record.hugo2)
206+
continue
207+
elif filter_ig and record.hugo1.startswith('IG') and record.hugo2.startswith('IG'):
208+
logging.warning("Rejecting %s-%s an an Immunoglobulin gene pair.", record.hugo1,
209+
record.hugo2)
210+
continue
211+
elif filter_rg and rna_gene_in_bedpe(record):
212+
logging.warning("Rejecting %s-%s for containing an RNA gene.", record.hugo1,
213+
record.hugo2)
214+
continue
215+
elif filter_rt and readthrough_in_bedpe(record, gene_annotations, rt_threshold):
216+
logging.warning("Rejecting %s-%s as a potential readthrough.", record.hugo1,
217+
record.hugo2)
218+
continue
219+
else:
220+
logging.info("Accepting %s-%s for further study.", record.hugo1, record.hugo2)
221+
calls.append(record)
222+
140223
return calls
141224

142225
# Namedtuple for storing alignment metrics
143-
# Neeeds to be global for pickling
226+
# Needs to be global for pickling
144227
AlignStats = collections.namedtuple('AlignStats',
145228
'qstart, qstop, rstart, rstop, insertions, deletions')
146229

230+
147231
def align_filter(ref, query, mode, mismatches_per_kb=1):
148232
"""
149233
Aligns query to reference CDS sequence using the Smith-Waterman algorithm.

test/test_transgene.py

+4
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@ def _test_transgene(self, use_RNA=False, use_DNA=False, fusions=False):
107107
'annotation.gtf')) if fusions else None
108108
params.genome_file = open(self._get_input_path('test_input/chr21.fa')) if fusions else None
109109

110+
params.filter_mt = params.filter_rg = params.filter_rt = False
111+
params.rt_threshold = 100000
112+
params.rg_file = None
113+
110114
params.cores = 3
111115
transgene.main(params)
112116
output = {'9mer': {'fasta': 'unit_test_tumor_9_mer_snpeffed.faa',

transgene.py

+57-34
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@
2626
import sys
2727

2828
from common import read_fasta, chrom_sort
29-
from fusion import get_transcriptome_data, read_fusions, insert_fusions, get_exons
29+
from fusion import (get_exons,
30+
get_transcriptome_data,
31+
insert_fusions,
32+
read_fusions,
33+
read_genes_from_gtf)
3034

3135

3236
# A named tuple describing the result of running reject_mutation on a mutation.
@@ -1036,15 +1040,13 @@ def main(params):
10361040
transcriptome, gene_transcript_ids = None, None
10371041

10381042
# Load data from fusion file
1043+
fusions = exons = None
10391044
if params.fusion_file:
1040-
fusions = read_fusions(params.fusion_file)
1041-
else:
1042-
fusions = None
1043-
1044-
if params.genome_file and params.annotation_file:
1045-
exons = get_exons(params.genome_file, params.annotation_file)
1046-
else:
1047-
exons = None
1045+
gene_annotations = read_genes_from_gtf(params.annotation_file)
1046+
fusions = read_fusions(params.fusion_file, gene_annotations, params.filter_mt,
1047+
params.filter_ig, params.filter_rg, params.filter_rt,
1048+
params.rt_threshold)
1049+
exons = get_exons(params.genome_file, params.annotation_file, fusions)
10481050

10491051
for peplen in params.pep_lens.split(','):
10501052
logging.info('Processing %s-mers', peplen)
@@ -1057,7 +1059,8 @@ def main(params):
10571059
if proteome_data and snvs:
10581060
insert_snvs(proteome_data, snvs, tumfile, normfile, int(peplen), params.rna_file)
10591061
if transcriptome and gene_transcript_ids and fusions:
1060-
insert_fusions(transcriptome, fusions, gene_transcript_ids, int(peplen), tumfile, exons=exons)
1062+
insert_fusions(transcriptome, fusions, gene_transcript_ids, int(peplen), tumfile,
1063+
exons=exons)
10611064

10621065
if params.no_json_dumps:
10631066
shutil.move(tumfile_path, '_'.join([params.prefix, 'tumor', peplen,
@@ -1080,36 +1083,36 @@ def run_transgene():
10801083
This will try to run transgene from system arguments
10811084
"""
10821085
parser = argparse.ArgumentParser(description=main.__doc__)
1083-
parser.add_argument('--peptides', dest='peptide_file',
1084-
type=argparse.FileType('r'),
1086+
# SNV related options
1087+
parser.add_argument('--peptides', dest='peptide_file', type=argparse.FileType('r'),
10851088
help='Path to GENCODE translation FASTA file')
1086-
parser.add_argument('--transcripts', dest='transcript_file',
1087-
type=argparse.FileType('r'),
1088-
help='Path to GENCODE transcript FASTA file')
1089-
parser.add_argument('--snpeff', dest='snpeff_file',
1090-
type=argparse.FileType('r'),
1089+
parser.add_argument('--snpeff', dest='snpeff_file', type=argparse.FileType('r'),
10911090
help='Path to snpeff file')
1092-
parser.add_argument('--cores', dest='cores', type=int,
1093-
help='Number of cores to use for the filtering step.', required=False,
1094-
default=1)
1095-
parser.add_argument('--fusions', dest='fusion_file',
1096-
help='Path to gene fusion file',
1091+
1092+
# Fusion related options
1093+
parser.add_argument('--fusions', dest='fusion_file', help='Path to gene fusion file',
10971094
type=argparse.FileType('r'))
1095+
parser.add_argument('--transcripts', dest='transcript_file', type=argparse.FileType('r'),
1096+
help='Path to GENCODE transcript FASTA file. Required if calling fusions.')
10981097
parser.add_argument('--genome', dest='genome_file',
1099-
help='Path to reference genome file',
1098+
help='Path to reference genome file, Required if calling fusions.',
11001099
type=argparse.FileType('r'))
11011100
parser.add_argument('--annotation', dest='annotation_file',
1102-
help='Path to gencode annotation file',
1101+
help='Path to gencode annotation file. Required if calling fusions.',
11031102
type=argparse.FileType('r'))
1104-
parser.add_argument('--prefix', dest='prefix', type=str,
1105-
help='Prefix for output file names', required=True)
1106-
parser.add_argument('--pep_lens', dest='pep_lens', type=str,
1107-
help='Desired peptide lengths to process. '
1108-
'The argument should be in the form of comma separated values. '
1109-
'E.g. 9,15', required=False, default='9,10,15')
1110-
parser.add_argument('--no_json_dumps', action='store_true',
1111-
help='Do not educe peptide fasta record names in the output by dumping the '
1112-
'mapping info into a .map json file.', required=False, default=False)
1103+
parser.add_argument('--filter_mt_fusions', dest='filter_mt', action='store_true',
1104+
help='Filter fusions involving Mitochondrial genes.', required=False)
1105+
parser.add_argument('--filter_ig_pairs', dest='filter_ig', action='store_true',
1106+
help='Filter fusions involving two immunoglobulin genes (IGXXX).',
1107+
required=False)
1108+
parser.add_argument('--filter_rna_gene_fusions', dest='filter_rg', action='store_true',
1109+
help='Filter fusions involving RNA genes (RP11-XXXX).', required=False)
1110+
parser.add_argument('--filter_readthroughs', dest='filter_rt', action='store_true',
1111+
help='Filter transcriptional read-troughs.', required=False)
1112+
parser.add_argument('--readthrough_threshold', dest='rt_threshold', type=int,
1113+
help='Genomic distance between candidates on the same strand below which a '
1114+
'fusion will be considered a read-through.', default=500000, required=False)
1115+
11131116
# RNA-Aware options
11141117
parser.add_argument('--rna_file', dest='rna_file', help='The path to an RNA-seq bam file. If '
11151118
'provided, the vcf will be filtered for coding mutations only. The file '
@@ -1122,6 +1125,7 @@ def run_transgene():
11221125
parser.add_argument('--min_rna_alt_freq', dest='rna_min_alt_freq', help='The ALT allele '
11231126
'frequency (as a fraction) in the RNA-Seq below which we will reject the '
11241127
'mutation.', type=float, required=False, default=0.1)
1128+
11251129
# OxoG filtering options
11261130
parser.add_argument('--filterOxoG', dest='filter_oxog', action='store_true', help='Filter the '
11271131
'calls for OxoG artifacts. This feature requires a tumor dna bam as input.',
@@ -1133,19 +1137,38 @@ def run_transgene():
11331137
'allele frequency (as a fraction) in the DNA-Seq below which we will flag'
11341138
'the mutation as being an OxoG variant.',
11351139
type=float, required=False, default=0.1)
1140+
1141+
# Logging
11361142
parser.add_argument('--log_level', dest='log_level', help='The level of logging above which '
11371143
'messages should be printed.', required=False, choices={'DEBUG', 'INFO',
11381144
'WARNING', 'ERROR'},
11391145
default='INFO')
11401146
parser.add_argument('--log_file', dest='logfile', help='A path to a logfile.', type=str,
11411147
required=False, default=None)
1148+
1149+
# Misc
1150+
parser.add_argument('--prefix', dest='prefix', type=str, help='Prefix for output file names.',
1151+
required=True)
1152+
parser.add_argument('--pep_lens', dest='pep_lens', type=str, help='Desired peptide lengths to '
1153+
'process. The argument should be in the form of comma separated values. '
1154+
'E.g. 9,15', required=False, default='9,10,15')
1155+
parser.add_argument('--no_json_dumps', action='store_true',
1156+
help='Do not educe peptide fasta record names in the output by dumping the '
1157+
'mapping info into a .map json file.', required=False, default=False)
1158+
parser.add_argument('--cores', dest='cores', type=int,
1159+
help='Number of cores to use for the filtering step.', required=False,
1160+
default=1)
11421161
params = parser.parse_args()
11431162

11441163
if params.snpeff_file and not params.peptide_file:
11451164
raise ValueError('VCF file requires GENCODE translation FASTA file')
11461165

11471166
if params.fusion_file and not params.transcript_file:
1148-
raise ValueError('Fusion file requires GENCODE transcripts FASTA file')
1167+
raise ValueError('Fusion calling requires GENCODE transcripts FASTA file')
1168+
if params.fusion_file and not params.annotation_file:
1169+
raise ValueError('Fusion calling requires GENCODE gtf annotation file')
1170+
if params.fusion_file and not params.genome_file:
1171+
raise ValueError('Fusion calling requires genomic fasta')
11491172

11501173
if params.filter_oxog:
11511174
if not params.dna_file:

0 commit comments

Comments
 (0)