From fd2f299f29c46e00787e1b6d2ee59b61dc0284da Mon Sep 17 00:00:00 2001 From: Arjun Arkal Rao Date: Fri, 20 Oct 2017 12:01:47 -0700 Subject: [PATCH] Filter fusion artifacts (resolves #29) resolves #29 related to BD2KGenomics/protect#237 Fusion calling often has FPS related to homology, especially among mitochondrial and (lnc,etc) RNA genes. Furthermore, transcriptional readthroughs are usually oncologically irrelevant. This PR adds filtering flags for mitochondrial, Immunoglobulin, RP11- (lncRNA) and transcriptional readthrough filtering. --- common.py | 84 +++++++++++++++++++- fusion.py | 118 +++++++++++++++-------------- test/test_input/test_fusions.bedpe | 3 + test/test_transgene.py | 3 + transgene.py | 94 ++++++++++++++--------- 5 files changed, 210 insertions(+), 92 deletions(-) diff --git a/common.py b/common.py index ccc9797..95254ed 100644 --- a/common.py +++ b/common.py @@ -180,4 +180,86 @@ def __eq__(self, other): # The number of reads at the position 'coverage', # The variant allele frequency - 'vaf')) \ No newline at end of file + 'vaf')) + +# Standard Genetic Code from NCBI +amino = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG' +base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG' +base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG' +base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG' +genetic_code = {''.join([b1, b2, b3]): aa + for aa, b1, b2, b3 in zip(amino, base1, base2, base3)} + + +BEDPE = collections.namedtuple('BEDPE', + 'chrom1, start1, end1, ' + 'chrom2, start2, end2, ' + 'name, ' + 'score, ' + 'strand1, strand2, ' + 'junctionSeq1, junctionSeq2, ' + 'hugo1, hugo2') + + +def get_exons(genome_file, annotation_file, genes_of_interest): + """ + Generates list of GTFRecord objects for each transcript + + :param file genome_file: Reference genome FASTA file + :param file annotation_file: Genome annotation file (GTF) + :param set(str) genes_of_interest: The genes in this sample that might need translation. + :return: GTFRecord exons + :rtype: dict + """ + annotation_file.seek(0) + chroms = {} + exons = collections.defaultdict(list) + for header, comment, seq in read_fasta(genome_file, 'ACGTN'): + chroms[header] = seq + + for line in annotation_file: + if line.startswith('#'): + continue + else: + gtf = GTFRecord(line) + if gtf.feature == 'exon' and gtf.gene_name in genes_of_interest: + gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end] + exons[gtf.transcript_id].append(gtf) + return exons + + +def read_genes_from_gtf(gtf_file): + """ + Read the gene annotations into a dict + + :param file gtf_file: A file handle ot the annotation file. + :returns: A dict of a gtf record for each gene + :rtype: dict(string, GTFRecord) + """ + gene_annotations = {} + for line in gtf_file: + if line.startswith('#'): + continue + else: + gtf = GTFRecord(line) + if gtf.feature == 'gene': + gene_annotations[gtf.gene_name] = gtf + return gene_annotations + + +def translate(seq): + """ + Translates DNA sequence into protein sequence using globally defined genetic code + + :param str seq: DNA sequence + :returns: Translated sequence + :rtype: str + + >>> translate('ATGTTTCGTT') + 'MFR' + """ + start = 0 + n = len(seq) + codons = (seq[i: i+3] for i in range(start, n - n % 3, 3)) + protein = [genetic_code[codon] for codon in codons] + return ''.join(protein) \ No newline at end of file diff --git a/fusion.py b/fusion.py index a771827..7b217bb 100644 --- a/fusion.py +++ b/fusion.py @@ -1,23 +1,14 @@ from __future__ import print_function + import collections import csv import logging import pickle import re -import string from cStringIO import StringIO import swalign - -from common import read_fasta, GTFRecord, trans - -# Standard Genetic Code from NCBI -amino = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG' -base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG' -base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG' -base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG' -genetic_code = {''.join([b1, b2, b3]): aa - for aa, b1, b2, b3 in zip(amino, base1, base2, base3)} +from common import read_fasta, trans, BEDPE, translate def get_transcriptome_data(infile): @@ -49,56 +40,53 @@ def get_transcriptome_data(infile): return transcript_cds, gene_transcripts -def get_exons(genome_file, annotation_file): +def rna_gene_in_bedpe(record): """ - Generates list of GTFRecord objects for each transcript + Determine if one of the two candidates in a BEDPE line is an rna gene. - :param file genome_file: Reference genome FASTA file - :param file annotation_file: Genome annotation file (GTF) - :return: GTFRecord exons - :rtype: dict + :param BEDPE record: A BEDPE line from the input file + :returns: True if one of the candidates is an RNA gene and False if not + :rtype: bool """ - chroms = {} - exons = collections.defaultdict(list) - for header, comment, seq in read_fasta(genome_file, 'ACGTN'): - chroms[header] = seq + # We will accept fusions that have an RP11- (lncRNA) 3' partner since they can still be + # translated. This is a heuristic. + return 'RP11-' in record.hugo1 - for line in annotation_file: - if line.startswith('#'): - continue - else: - gtf = GTFRecord(line) - if gtf.feature == 'exon': - gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end] - exons[gtf.transcript_id].append(gtf) - return exons - -def translate(seq): +def readthrough_in_bedpe(record, annotation, rt_threshold): """ - Translates DNA sequence into protein sequence using globally defined genetic code - - :param str seq: DNA sequence - :returns: Translated sequence - :rtype: str - - >>> translate('ATGTTTCGTT') - 'MFR' + Determine if the two genes in the record are within `rt_threshold` bp of each other on the same + chromosome. + + :param BEDPE record: A BEDPE line from the input file + :param dict(str, GTFRecord) annotation: see `read_fusions:gene_annotations` + :param rt_threshold: The genomic distance on the same chromosome below which we will call a + candidate fusion a readthrough. + :returns: True if the pair is considered a readthrough and False if not + :rtype: bool """ - start = 0 - n = len(seq) - codons = (seq[i: i+3] for i in range(start, n - n % 3, 3)) - protein = [genetic_code[codon] for codon in codons] - return ''.join(protein) + return (record.chrom1 == record.chrom2 and + ((annotation[record.hugo1].start <= annotation[record.hugo2].start <= + annotation[record.hugo1].end + rt_threshold) or + (annotation[record.hugo2].start <= annotation[record.hugo1].start <= + annotation[record.hugo2].end + rt_threshold))) -def read_fusions(fusion_file): +def read_fusions(fusion_file, gene_annotations, filter_mt, filter_ig, filter_rg, filter_rt, + rt_threshold, out_bedpe): """ Reads in gene fusion predictions in modified BEDPE format. In addition to the basic BEDPE features, this function requires the fusion junction sequences and HUGO names for the donor and acceptor genes. :param file fusion_file: Fusion calls in BEDPE format + :param dict(str, GTFRecord) gene_annotations: The gene annotations from the gtf + :param bool filter_mt: Filter mitochondrial events? + :param bool filter_ig: Filter immunoglobulin pairs? + :param bool filter_rg: Filter RNA-Gene events? + :param bool filter_rt: Filter transcriptional read-throughs? + :param int rt_threshold: Distance threshold to call a readthrough + :param file out_bedpe: A file handle to an output BEDPE file :returns: list of BEDPE namedtuples :rtype: list @@ -118,31 +106,49 @@ def read_fusions(fusion_file): hugo1: HUGO name for first feature hugo2: HUGO name for second feature """ - BEDPE = collections.namedtuple('BEDPE', - 'chrom1, start1, end1, ' - 'chrom2, start2, end2, ' - 'name, score, ' - 'strand1, strand2, ' - 'junctionSeq1, junctionSeq2, ' - 'hugo1, hugo2') calls = [] + for line in csv.reader(fusion_file, delimiter='\t'): if line[0].startswith('#'): + print('\t'.join(line), file=out_bedpe) continue try: - calls.append(BEDPE(*line)) - + record = BEDPE(*line) except TypeError: raise ValueError("ERROR: fusion file is malformed.\n{}".format(read_fusions.__doc__)) + if filter_mt and 'M' in record.chrom1 or 'M' in record.chrom2: + logging.warning("Rejecting %s-%s for containing a Mitochondrial gene.", record.hugo1, + record.hugo2) + continue + elif filter_ig and record.hugo1.startswith('IG') and record.hugo2.startswith('IG'): + # This will drop some Insulin-like growth factor (IGF) proteins but they have a lot of + # homology too so its ok. + logging.warning("Rejecting %s-%s an an Immunoglobulin gene pair.", record.hugo1, + record.hugo2) + continue + elif filter_rg and rna_gene_in_bedpe(record): + logging.warning("Rejecting %s-%s for containing a 5' RNA gene.", record.hugo1, + record.hugo2) + continue + elif filter_rt and readthrough_in_bedpe(record, gene_annotations, rt_threshold): + logging.warning("Rejecting %s-%s as a potential readthrough.", record.hugo1, + record.hugo2) + continue + else: + logging.info("Accepting %s-%s for further study.", record.hugo1, record.hugo2) + print('\t'.join(line), file=out_bedpe) + calls.append(record) + return calls # Namedtuple for storing alignment metrics -# Neeeds to be global for pickling +# Needs to be global for pickling AlignStats = collections.namedtuple('AlignStats', 'qstart, qstop, rstart, rstop, insertions, deletions') + def align_filter(ref, query, mode, mismatches_per_kb=1): """ Aligns query to reference CDS sequence using the Smith-Waterman algorithm. diff --git a/test/test_input/test_fusions.bedpe b/test/test_input/test_fusions.bedpe index a032e20..7306bb8 100644 --- a/test/test_input/test_fusions.bedpe +++ b/test/test_input/test_fusions.bedpe @@ -4,3 +4,6 @@ chr8 . 42968214 chr10 43116584 . ENSG00000168172-ENSG00000165731 1_1 + + TTGGAGG chr21 . 41494381 chr7 13935844 . ENSG00000184012-ENSG00000006468 1_1 - - CCGGAAAACCCCTATCCCGCACAGCCCACTGTGGTCCCCACTGTCTACGAGGTGCATCCGGCTCAGTACTACCCGTCCCCCGTGCCCCAGTACGCCCCGAGGGTCCTGACGCAGGCTTCCAACCCCGTCGTCTGCACGCAGCCCAAATCC CCATCCAGCACGCCAGTGTCCCCACTGCATCATGCATCTCCAAACTCAACTCATACACCGAAACCTGACCGGGCCTTCCCAGCTCACCTCCCTCCATCGCAGTCCATACCAGATAGCAGCTACCCCATGGACCACAGATTTCGCCGCCAGCTTTCTGAACCCTGTAACTCCTTTCCTCCTTTGCCGACGATGCCAAGGGAAGGACGTCCTATGTACCAACGCCAGATGTCTGAGCCAAACATCCCCTTCCCACCACAAGGCTTTAAGCAGGAGTACCACGACCCAGTGTATGAACACAACACCATGGTTGGCAGTGCGGCCAGCCAAAGCTTTCCCCCTCCTCTGATGATTAAACAGGAACCCAGAGATTTTGCATATGACTCAGAAGTGCCTAGCTGCCACTCCATTTATATGAGGCAAGAAGGCTTCCTGGCTCATCCCAGCAGAACAGAAGGCTGTATGTTTGAAAAGGGCCCCAGGCAGTTTTATGATGACACCTGTGTTGTCCCAGAAAAATTCGATGGAGACATCAAACAAGAGCCAGGAATGTATCGGGAAGGACCCACATACCAACGGCGAGGATCACTTCAGCTCTGGCAGTTTTTGGTAGCTCTTCTGGATGACCCTTCAAATTCTCATTTTATTGCCTGGACTGGTCGAGGCATGGAATTTAAACTGATTGAGCCTGAAGAGGTGGCCCGACGTTGGGGCATTCAGAAAAACAGGCCAGCTATGAACTATGATAAACTTAGCCGTTCACTCCGCTATTACTATGAGAAAGGAATTATGCAAAAGGTGGCTGGAGAGAGATATGTCTACAAGTTTGTGTGTGATCCAGAAGCCCTTTTCTCCATGGCCTTTCCAGATAATCAGCGTCCACTGCTGAAGACAGACATGGAACGTCACATCAACGAGGAGGACACAGTGCCTCTTTCTCACTTTGATGAGAGCATGGCCTACATGCCGGAAGGGGGCTGCTGCAACCCCCACCCCTACAACGAAGGCTACGTGTATTAACACAAGTGACAGTCAAGCAGGGCGTT TMPRSS2 ETV1 chr21 . 41494381 chr7 13935844 . ENSG00000184012-ENSG00000006468 1_1 - - CCGGAAAACCCCTATCCCGCACAGCCCACTGTGGTCCCCACTGTCTACGAGGTGCATCCGGCTCAGTACTACCCGTCCCCCGTGCCCCAGTACGCCCCGAGGGTCCTGACGCAGGCTTCCAACCCCGTCGTCTGCACGCAGCCCAAATCC CCATCCAGCACGCCAGTGTCCCCACTGCATCATGCATCTCCAAACTCAACTCATACACCGAAACCTGACCGGGCCTTCCCAGCTCACCTCCCTCCATCGCAGTCCATACCAGATAGCAGCTACCCCATGGACCACAGATTTCGCCGCCAGCTTTCTGAACCCTGTAACTCCTTTCCTCCTTTGCCGACGATGCCAAGGGAAGGACGTCCTATGTACCAACGCCAGATGTCTGAGCCAAACATCCCCTTCCCACCACAAGGCTTTAAGCAGGAGTACCACGACCCAGTGTATGAACACAACACCATGGTTGGCAGTGCGGCCA TMPRSS2 ETV1 chr21 . 42870046 chr21 39817544 . ENSG00000184012-ENSG00000157554 1_1 - - . . TMPRSS2 ERG +chr21 . 46493768 chr21 46511596 . ENSG00000197381-ENSG00000268805 1_1 - - THIS_IS_A_READTHROUGH . ADARB1 PRED57 +chr21 . 16904219 chr21 16914235 . ENSG00000230481-ENSG00000253691 1_1 - - THIS_IS_AN_IG . IGKV1OR22-5 IGKV2OR22-4 +chr21 . 10397644 chr21 39817544 . ENSG00000278106-ENSG00000157554 1_1 - - THIS_IS_AN_RP11 . RP11-589M2.1 ERG \ No newline at end of file diff --git a/test/test_transgene.py b/test/test_transgene.py index 5754a30..464d2f6 100644 --- a/test/test_transgene.py +++ b/test/test_transgene.py @@ -107,6 +107,9 @@ def _test_transgene(self, use_RNA=False, use_DNA=False, fusions=False): 'annotation.gtf')) if fusions else None params.genome_file = open(self._get_input_path('test_input/chr21.fa')) if fusions else None + params.filter_mt = params.filter_rg = params.filter_ig = params.filter_rt = True + params.rt_threshold = 100000 + params.cores = 3 transgene.main(params) output = {'9mer': {'fasta': 'unit_test_tumor_9_mer_snpeffed.faa', diff --git a/transgene.py b/transgene.py index 1c8c4ea..c7ab82a 100755 --- a/transgene.py +++ b/transgene.py @@ -25,8 +25,9 @@ from operator import itemgetter from tempfile import mkstemp -from common import read_fasta, chrom_sort, reject_decision, trans -from fusion import get_transcriptome_data, read_fusions, insert_fusions, get_exons +from common import chrom_sort, get_exons, read_fasta, read_genes_from_gtf, reject_decision, trans, \ + genetic_code +from fusion import get_transcriptome_data, insert_fusions, read_fusions from snv import reject_snv @@ -561,7 +562,6 @@ def merge_adjacent_snvs(snvs, muts): >>> merge_adjacent_snvs(snvs, ['A123B', 'C123D','E123F']) 'A123H' """ - from fusion import genetic_code if len(muts) == 3: return muts[0][:-1] + genetic_code[''.join([snvs[x]['NUC']['ALT'] for x in muts])] else: @@ -933,6 +933,7 @@ def main(params): # Read in snpeffed vcf files mutations = None + genes_to_translate = set() if params.snpeff_file: if params.rna_file or params.dna_file: out_vcf = open('_'.join([params.prefix, 'transgened.vcf']), 'w') @@ -959,15 +960,18 @@ def main(params): transcriptome, gene_transcript_ids = None, None # Load data from fusion file + fusions = exons = None if params.fusion_file: - fusions = read_fusions(params.fusion_file) - else: - fusions = None - - if params.genome_file and params.annotation_file: - exons = get_exons(params.genome_file, params.annotation_file) - else: - exons = None + gene_annotations = read_genes_from_gtf(params.annotation_file) + out_bedpe = open('_'.join([params.prefix, 'transgened.bedpe']), 'w') + try: + fusions = read_fusions(params.fusion_file, gene_annotations, params.filter_mt, + params.filter_ig, params.filter_rg, params.filter_rt, + params.rt_threshold, out_bedpe) + finally: + out_bedpe.close() + genes_to_translate.update(sum([(record.hugo1, record.hugo2) for record in fusions], ())) + exons = get_exons(params.genome_file, params.annotation_file, genes_to_translate) for peplen in params.pep_lens.split(','): logging.info('Processing %s-mers', peplen) @@ -1005,36 +1009,36 @@ def run_transgene(): This will try to run transgene from system arguments """ parser = argparse.ArgumentParser(description=main.__doc__) - parser.add_argument('--peptides', dest='peptide_file', - type=argparse.FileType('r'), + # SNV related options + parser.add_argument('--peptides', dest='peptide_file', type=argparse.FileType('r'), help='Path to GENCODE translation FASTA file') - parser.add_argument('--transcripts', dest='transcript_file', - type=argparse.FileType('r'), - help='Path to GENCODE transcript FASTA file') - parser.add_argument('--snpeff', dest='snpeff_file', - type=argparse.FileType('r'), + parser.add_argument('--snpeff', dest='snpeff_file', type=argparse.FileType('r'), help='Path to snpeff file') - parser.add_argument('--cores', dest='cores', type=int, - help='Number of cores to use for the filtering step.', required=False, - default=1) - parser.add_argument('--fusions', dest='fusion_file', - help='Path to gene fusion file', + + # Fusion related options + parser.add_argument('--fusions', dest='fusion_file', help='Path to gene fusion file', type=argparse.FileType('r')) + parser.add_argument('--transcripts', dest='transcript_file', type=argparse.FileType('r'), + help='Path to GENCODE transcript FASTA file. Required if calling fusions.') parser.add_argument('--genome', dest='genome_file', - help='Path to reference genome file', + help='Path to reference genome file, Required if calling fusions.', type=argparse.FileType('r')) parser.add_argument('--annotation', dest='annotation_file', - help='Path to gencode annotation file', + help='Path to gencode annotation file. Required if calling fusions.', type=argparse.FileType('r')) - parser.add_argument('--prefix', dest='prefix', type=str, - help='Prefix for output file names', required=True) - parser.add_argument('--pep_lens', dest='pep_lens', type=str, - help='Desired peptide lengths to process. ' - 'The argument should be in the form of comma separated values. ' - 'E.g. 9,15', required=False, default='9,10,15') - parser.add_argument('--no_json_dumps', action='store_true', - help='Do not educe peptide fasta record names in the output by dumping the ' - 'mapping info into a .map json file.', required=False, default=False) + parser.add_argument('--filter_mt_fusions', dest='filter_mt', action='store_true', + help='Filter fusions involving Mitochondrial genes.', required=False) + parser.add_argument('--filter_ig_pairs', dest='filter_ig', action='store_true', + help='Filter fusions involving two immunoglobulin genes (IGXXX).', + required=False) + parser.add_argument('--filter_rna_gene_fusions', dest='filter_rg', action='store_true', + help='Filter fusions involving RNA genes (RP11-XXXX).', required=False) + parser.add_argument('--filter_readthroughs', dest='filter_rt', action='store_true', + help='Filter transcriptional read-troughs.', required=False) + parser.add_argument('--readthrough_threshold', dest='rt_threshold', type=int, + help='Genomic distance between candidates on the same strand below which a ' + 'fusion will be considered a read-through.', default=500000, required=False) + # RNA-Aware options parser.add_argument('--rna_file', dest='rna_file', help='The path to an RNA-seq bam file. If ' 'provided, the vcf will be filtered for coding mutations only. The file ' @@ -1047,6 +1051,7 @@ def run_transgene(): parser.add_argument('--min_rna_alt_freq', dest='rna_min_alt_freq', help='The ALT allele ' 'frequency (as a fraction) in the RNA-Seq below which we will reject the ' 'mutation.', type=float, required=False, default=0.1) + # OxoG filtering options parser.add_argument('--filterOxoG', dest='filter_oxog', action='store_true', help='Filter the ' 'calls for OxoG artifacts. This feature requires a tumor dna bam as input.', @@ -1058,19 +1063,38 @@ def run_transgene(): 'allele frequency (as a fraction) in the DNA-Seq below which we will flag' 'the mutation as being an OxoG variant.', type=float, required=False, default=0.1) + + # Logging parser.add_argument('--log_level', dest='log_level', help='The level of logging above which ' 'messages should be printed.', required=False, choices={'DEBUG', 'INFO', 'WARNING', 'ERROR'}, default='INFO') parser.add_argument('--log_file', dest='logfile', help='A path to a logfile.', type=str, required=False, default=None) + + # Misc + parser.add_argument('--prefix', dest='prefix', type=str, help='Prefix for output file names.', + required=True) + parser.add_argument('--pep_lens', dest='pep_lens', type=str, help='Desired peptide lengths to ' + 'process. The argument should be in the form of comma separated values. ' + 'E.g. 9,15', required=False, default='9,10,15') + parser.add_argument('--no_json_dumps', action='store_true', + help='Do not educe peptide fasta record names in the output by dumping the ' + 'mapping info into a .map json file.', required=False, default=False) + parser.add_argument('--cores', dest='cores', type=int, + help='Number of cores to use for the filtering step.', required=False, + default=1) params = parser.parse_args() if params.snpeff_file and not params.peptide_file: raise ValueError('VCF file requires GENCODE translation FASTA file') if params.fusion_file and not params.transcript_file: - raise ValueError('Fusion file requires GENCODE transcripts FASTA file') + raise ValueError('Fusion calling requires GENCODE transcripts FASTA file') + if params.fusion_file and not params.annotation_file: + raise ValueError('Fusion calling requires GENCODE gtf annotation file') + if params.fusion_file and not params.genome_file: + raise ValueError('Fusion calling requires genomic fasta') if params.filter_oxog: if not params.dna_file: