Skip to content

Commit

Permalink
Filter fusion artifacts (resolves #29)
Browse files Browse the repository at this point in the history
resolves #29
related to BD2KGenomics/protect#237

Fusion calling often has FPS related to homology, especially among
mitochondrial  and (lnc,etc) RNA genes. Furthermore, transcriptional
readthroughs are usually oncologically irrelevant. This PR adds
filtering flags for mitochondrial, Immunoglobulin, RP11- (lncRNA) and
transcriptional readthrough filtering.
  • Loading branch information
arkal committed Oct 24, 2017
1 parent 03eca0f commit 933d5f0
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 47 deletions.
113 changes: 100 additions & 13 deletions fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@
genetic_code = {''.join([b1, b2, b3]): aa
for aa, b1, b2, b3 in zip(amino, base1, base2, base3)}

BEDPE = collections.namedtuple('BEDPE',
'chrom1, start1, end1, '
'chrom2, start2, end2, '
'name, '
'score, '
'strand1, strand2, '
'junctionSeq1, junctionSeq2, '
'hugo1, hugo2')


def get_transcriptome_data(infile):
"""
Expand Down Expand Up @@ -50,17 +59,20 @@ def get_transcriptome_data(infile):
return transcript_cds, gene_transcripts


def get_exons(genome_file, annotation_file):
def get_exons(genome_file, annotation_file, fusions):
"""
Generates list of GTFRecord objects for each transcript
:param file genome_file: Reference genome FASTA file
:param file annotation_file: Genome annotation file (GTF)
:param list(BEDPE) fusions: The fusions under consideration for this sample.
:return: GTFRecord exons
:rtype: dict
"""
annotation_file.seek(0)
chroms ={}
exons = collections.defaultdict(list)
genes_of_interest = set(sum([(record.hugo1, record.hugo2) for record in fusions], ()))
for header, comment, seq in read_fasta(genome_file, 'ACGTN'):
chroms[header] = seq

Expand All @@ -69,12 +81,31 @@ def get_exons(genome_file, annotation_file):
continue
else:
gtf = GTFRecord(line)
if gtf.feature == 'exon':
if gtf.feature == 'exon' and gtf.gene_name in genes_of_interest:
gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end]
exons[gtf.transcript_id].append(gtf)
return exons


def read_genes_from_gtf(gtf_file):
"""
Read the gene annotations into a dict
:param file gtf_file: A file handle ot the annotation file.
:returns: A dict of a gtf record for each gene
:rtype: dict(string, GTFRecord)
"""
gene_annotations = {}
for line in gtf_file:
if line.startswith('#'):
continue
else:
gtf = GTFRecord(line)
if gtf.feature == 'gene':
gene_annotations[gtf.gene_name] = gtf
return gene_annotations


def translate(seq):
"""
Translates DNA sequence into protein sequence using globally defined genetic code
Expand All @@ -93,13 +124,53 @@ def translate(seq):
return ''.join(protein)


def read_fusions(fusion_file):
def rna_gene_in_bedpe(record):
"""
Determine if one of the two candidates in a BEDPE line is an rna gene.
:param BEDPE record: A BEDPE line from the input file
:returns: True if one of the candidates is an RNA gene and False if not
:rtype: bool
"""
# These are heuristics
return 'RP11-' in record.hugo1 or 'RP11-' in record.hugo2


def readthrough_in_bedpe(record, annotation, rt_threshold):
"""
Determine if the two genes in the record are within `rt_threshold` bp of each other on the same
chromosome.
:param BEDPE record: A BEDPE line from the input file
:param dict(str, GTFRecord) annotation: see `read_fusions:gene_annotations`
:param rt_threshold: The genomic distance on the same chromosome below which we will call a
candidate fusion a readthrough.
:returns: True if the pair is considered a readthrough and False if not
:rtype: bool
"""
return (record.chrom1 == record.chrom2 and
((annotation[record.hugo1].start <= annotation[record.hugo2].start <=
annotation[record.hugo1].end + rt_threshold) or
(annotation[record.hugo2].start <= annotation[record.hugo1].start <=
annotation[record.hugo2].end + rt_threshold)))


def read_fusions(fusion_file, gene_annotations, filter_mt, filter_ig, filter_rg, filter_rt,
rt_threshold, out_bedpe):
"""
Reads in gene fusion predictions in modified BEDPE format.
In addition to the basic BEDPE features, this function requires the fusion
junction sequences and HUGO names for the donor and acceptor genes.
:param filter_ig:
:param file fusion_file: Fusion calls in BEDPE format
:param dict(str, GTFRecord) gene_annotations: The gene annotations form the gtf
:param bool filter_mt: Filter mitochondrial events?
:param bool filter_ig: Filter immunoglobulin pairs?
:param bool filter_rg: Filter RNA-Gene events?
:param bool filter_rt: Filter transcriptional read-throughs?
:param int rt_threshold: Distance threshold to call a readthrough
:param file out_bedpe: A file handle to an output BEDPE file
:returns: list of BEDPE namedtuples
:rtype: list
Expand All @@ -119,31 +190,47 @@ def read_fusions(fusion_file):
hugo1: HUGO name for first feature
hugo2: HUGO name for second feature
"""
BEDPE = collections.namedtuple('BEDPE',
'chrom1, start1, end1, '
'chrom2, start2, end2, '
'name, score, '
'strand1, strand2, '
'junctionSeq1, junctionSeq2, '
'hugo1, hugo2')

calls = []

for line in csv.reader(fusion_file, delimiter='\t'):
if line[0].startswith('#'):
print(line, sep='\t', file=out_bedpe)
continue
try:
calls.append(BEDPE(*line))

record = BEDPE(*line)
except TypeError:
raise ValueError("ERROR: fusion file is malformed.\n{}".format(read_fusions.__doc__))

if filter_mt and 'M' in record.chrom1 or 'M' in record.chrom2:
logging.warning("Rejecting %s-%s for containing a Mitochondrial gene.", record.hugo1,
record.hugo2)
continue
elif filter_ig and record.hugo1.startswith('IG') and record.hugo2.startswith('IG'):
logging.warning("Rejecting %s-%s an an Immunoglobulin gene pair.", record.hugo1,
record.hugo2)
continue
elif filter_rg and rna_gene_in_bedpe(record):
logging.warning("Rejecting %s-%s for containing an RNA gene.", record.hugo1,
record.hugo2)
continue
elif filter_rt and readthrough_in_bedpe(record, gene_annotations, rt_threshold):
logging.warning("Rejecting %s-%s as a potential readthrough.", record.hugo1,
record.hugo2)
continue
else:
logging.info("Accepting %s-%s for further study.", record.hugo1, record.hugo2)
print(line, sep='\t', file=out_bedpe)
calls.append(record)

return calls

# Namedtuple for storing alignment metrics
# Neeeds to be global for pickling
# Needs to be global for pickling
AlignStats = collections.namedtuple('AlignStats',
'qstart, qstop, rstart, rstop, insertions, deletions')


def align_filter(ref, query, mode, mismatches_per_kb=1):
"""
Aligns query to reference CDS sequence using the Smith-Waterman algorithm.
Expand Down
4 changes: 4 additions & 0 deletions test/test_transgene.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ def _test_transgene(self, use_RNA=False, use_DNA=False, fusions=False):
'annotation.gtf')) if fusions else None
params.genome_file = open(self._get_input_path('test_input/chr21.fa')) if fusions else None

params.filter_mt = params.filter_rg = params.filter_rt = False
params.rt_threshold = 100000
params.rg_file = None

params.cores = 3
transgene.main(params)
output = {'9mer': {'fasta': 'unit_test_tumor_9_mer_snpeffed.faa',
Expand Down
95 changes: 61 additions & 34 deletions transgene.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@
import sys

from common import read_fasta, chrom_sort
from fusion import get_transcriptome_data, read_fusions, insert_fusions, get_exons
from fusion import (get_exons,
get_transcriptome_data,
insert_fusions,
read_fusions,
read_genes_from_gtf)


# A named tuple describing the result of running reject_mutation on a mutation.
Expand Down Expand Up @@ -1036,15 +1040,17 @@ def main(params):
transcriptome, gene_transcript_ids = None, None

# Load data from fusion file
fusions = exons = None
if params.fusion_file:
fusions = read_fusions(params.fusion_file)
else:
fusions = None

if params.genome_file and params.annotation_file:
exons = get_exons(params.genome_file, params.annotation_file)
else:
exons = None
gene_annotations = read_genes_from_gtf(params.annotation_file)
out_bedpe = open('_'.join([params.prefix, 'transgened.bedpe']), 'w')
try:
fusions = read_fusions(params.fusion_file, gene_annotations, params.filter_mt,
params.filter_ig, params.filter_rg, params.filter_rt,
params.rt_threshold, out_bedpe)
finally:
out_bedpe.close()
exons = get_exons(params.genome_file, params.annotation_file, fusions)

for peplen in params.pep_lens.split(','):
logging.info('Processing %s-mers', peplen)
Expand All @@ -1057,7 +1063,8 @@ def main(params):
if proteome_data and snvs:
insert_snvs(proteome_data, snvs, tumfile, normfile, int(peplen), params.rna_file)
if transcriptome and gene_transcript_ids and fusions:
insert_fusions(transcriptome, fusions, gene_transcript_ids, int(peplen), tumfile, exons=exons)
insert_fusions(transcriptome, fusions, gene_transcript_ids, int(peplen), tumfile,
exons=exons)

if params.no_json_dumps:
shutil.move(tumfile_path, '_'.join([params.prefix, 'tumor', peplen,
Expand All @@ -1080,36 +1087,36 @@ def run_transgene():
This will try to run transgene from system arguments
"""
parser = argparse.ArgumentParser(description=main.__doc__)
parser.add_argument('--peptides', dest='peptide_file',
type=argparse.FileType('r'),
# SNV related options
parser.add_argument('--peptides', dest='peptide_file', type=argparse.FileType('r'),
help='Path to GENCODE translation FASTA file')
parser.add_argument('--transcripts', dest='transcript_file',
type=argparse.FileType('r'),
help='Path to GENCODE transcript FASTA file')
parser.add_argument('--snpeff', dest='snpeff_file',
type=argparse.FileType('r'),
parser.add_argument('--snpeff', dest='snpeff_file', type=argparse.FileType('r'),
help='Path to snpeff file')
parser.add_argument('--cores', dest='cores', type=int,
help='Number of cores to use for the filtering step.', required=False,
default=1)
parser.add_argument('--fusions', dest='fusion_file',
help='Path to gene fusion file',

# Fusion related options
parser.add_argument('--fusions', dest='fusion_file', help='Path to gene fusion file',
type=argparse.FileType('r'))
parser.add_argument('--transcripts', dest='transcript_file', type=argparse.FileType('r'),
help='Path to GENCODE transcript FASTA file. Required if calling fusions.')
parser.add_argument('--genome', dest='genome_file',
help='Path to reference genome file',
help='Path to reference genome file, Required if calling fusions.',
type=argparse.FileType('r'))
parser.add_argument('--annotation', dest='annotation_file',
help='Path to gencode annotation file',
help='Path to gencode annotation file. Required if calling fusions.',
type=argparse.FileType('r'))
parser.add_argument('--prefix', dest='prefix', type=str,
help='Prefix for output file names', required=True)
parser.add_argument('--pep_lens', dest='pep_lens', type=str,
help='Desired peptide lengths to process. '
'The argument should be in the form of comma separated values. '
'E.g. 9,15', required=False, default='9,10,15')
parser.add_argument('--no_json_dumps', action='store_true',
help='Do not educe peptide fasta record names in the output by dumping the '
'mapping info into a .map json file.', required=False, default=False)
parser.add_argument('--filter_mt_fusions', dest='filter_mt', action='store_true',
help='Filter fusions involving Mitochondrial genes.', required=False)
parser.add_argument('--filter_ig_pairs', dest='filter_ig', action='store_true',
help='Filter fusions involving two immunoglobulin genes (IGXXX).',
required=False)
parser.add_argument('--filter_rna_gene_fusions', dest='filter_rg', action='store_true',
help='Filter fusions involving RNA genes (RP11-XXXX).', required=False)
parser.add_argument('--filter_readthroughs', dest='filter_rt', action='store_true',
help='Filter transcriptional read-troughs.', required=False)
parser.add_argument('--readthrough_threshold', dest='rt_threshold', type=int,
help='Genomic distance between candidates on the same strand below which a '
'fusion will be considered a read-through.', default=500000, required=False)

# RNA-Aware options
parser.add_argument('--rna_file', dest='rna_file', help='The path to an RNA-seq bam file. If '
'provided, the vcf will be filtered for coding mutations only. The file '
Expand All @@ -1122,6 +1129,7 @@ def run_transgene():
parser.add_argument('--min_rna_alt_freq', dest='rna_min_alt_freq', help='The ALT allele '
'frequency (as a fraction) in the RNA-Seq below which we will reject the '
'mutation.', type=float, required=False, default=0.1)

# OxoG filtering options
parser.add_argument('--filterOxoG', dest='filter_oxog', action='store_true', help='Filter the '
'calls for OxoG artifacts. This feature requires a tumor dna bam as input.',
Expand All @@ -1133,19 +1141,38 @@ def run_transgene():
'allele frequency (as a fraction) in the DNA-Seq below which we will flag'
'the mutation as being an OxoG variant.',
type=float, required=False, default=0.1)

# Logging
parser.add_argument('--log_level', dest='log_level', help='The level of logging above which '
'messages should be printed.', required=False, choices={'DEBUG', 'INFO',
'WARNING', 'ERROR'},
default='INFO')
parser.add_argument('--log_file', dest='logfile', help='A path to a logfile.', type=str,
required=False, default=None)

# Misc
parser.add_argument('--prefix', dest='prefix', type=str, help='Prefix for output file names.',
required=True)
parser.add_argument('--pep_lens', dest='pep_lens', type=str, help='Desired peptide lengths to '
'process. The argument should be in the form of comma separated values. '
'E.g. 9,15', required=False, default='9,10,15')
parser.add_argument('--no_json_dumps', action='store_true',
help='Do not educe peptide fasta record names in the output by dumping the '
'mapping info into a .map json file.', required=False, default=False)
parser.add_argument('--cores', dest='cores', type=int,
help='Number of cores to use for the filtering step.', required=False,
default=1)
params = parser.parse_args()

if params.snpeff_file and not params.peptide_file:
raise ValueError('VCF file requires GENCODE translation FASTA file')

if params.fusion_file and not params.transcript_file:
raise ValueError('Fusion file requires GENCODE transcripts FASTA file')
raise ValueError('Fusion calling requires GENCODE transcripts FASTA file')
if params.fusion_file and not params.annotation_file:
raise ValueError('Fusion calling requires GENCODE gtf annotation file')
if params.fusion_file and not params.genome_file:
raise ValueError('Fusion calling requires genomic fasta')

if params.filter_oxog:
if not params.dna_file:
Expand Down

0 comments on commit 933d5f0

Please sign in to comment.