Skip to content

Commit

Permalink
Merge pull request #3 from d3b-center/domain_annotation
Browse files Browse the repository at this point in the history
Domain annotation
  • Loading branch information
kgaonkar6 authored May 12, 2020
2 parents 73d380c + 5e29d8a commit da156b2
Show file tree
Hide file tree
Showing 49 changed files with 2,970 additions and 3,248 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
Package: annoFuse
Type: Package
Title: Annotation And Filtering Analysis For Fusion Calls
Version: 0.1.8
Version: 0.2
Author: Krutika Gaonkark
Maintainer: Krutika Gaonkark <[email protected]>
Description: Using annoFuse, users can filter out fusions known to be artifactual and retained high-quality fusion calls using support of at least one junction read and remove false calls if there is disproportionate spanning fragment support of more than 10 reads compared to the junction read count. For prioritization, users can capture known as well as putative driver fusions reported in TCGA, or fusions containing gene partners that are known oncogenes, tumor suppressor genes, or COSMIC genes.
Finally, users can also determine recurrent fusions across the cohort and recurrently-fused genes within each histology. By providing a standardized filtering and annotation method from multiple callers (STAR-Fusion and Arriba) users are able to merge, filter and prioritize putative oncogenic fusions across the PBTA.
License: MIT
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.1.9000
Imports: reshape2,dplyr,tidyr,ggplot2,qdapRegex,ggpubr,ensembldb,tibble,ggthemes,EnsDb.Hsapiens.v86,grid,tidyverse,readr
Imports: reshape2,dplyr,tidyr,ggplot2,qdapRegex,ggpubr,ensembldb,tibble,ggthemes,EnsDb.Hsapiens.v86,grid,tidyverse,readr,grDevices,stats,utils,stringr
Suggests: knitr, rmarkdown
VignetteBuilder: knitr
NeedsCompilation: no
Expand Down
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2020
COPYRIGHT HOLDER: Krutika Gaonkar
21 changes: 21 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# MIT License

Copyright (c) 2020 Krutika Gaonkar

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
5 changes: 4 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@ import(tibble)
import(EnsDb.Hsapiens.v86)
import(ggthemes)
import(grid)

importFrom("stringr","str_detect")
importFrom("grDevices", "rainbow")
importFrom("stats", "median", "na.omit")
importFrom("utils", "head")
Empty file added R/.Rapp.history
Empty file.
46 changes: 23 additions & 23 deletions R/ZscoreAnnotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
expressionMatrixMatched <- expressionMatrix %>%
unique() %>%
# means for each row per each gene_id
dplyr::mutate(means = rowMeans(dplyr::select(.,-GeneSymbol,-gene_id,-EnsembleID))) %>%
dplyr::mutate(means = rowMeans(dplyr::select(.data$.,-.data$GeneSymbol,-.data$gene_id,-.data$EnsembleID))) %>%
# arrange descending mean
arrange(desc(means)) %>%
arrange(desc(.data$means)) %>%
# to keep only first occurence ie. max rowMean per GeneSymbol
distinct(GeneSymbol, .keep_all = TRUE) %>%
distinct(.data$GeneSymbol, .keep_all = TRUE) %>%
ungroup() %>%
dplyr::select(-means,-gene_id,-EnsembleID) %>%
dplyr::filter(GeneSymbol %in% normData$GeneSymbol) %>%
dplyr::select(-.data$means,-.data$gene_id,-.data$EnsembleID) %>%
dplyr::filter(.data$GeneSymbol %in% normData$GeneSymbol) %>%
tibble::column_to_rownames("GeneSymbol")
expressionMatrixMatched<-log2(expressionMatrixMatched+1)

Expand All @@ -42,11 +42,11 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter

#normData mean and sd
normData_means <- rowMeans(normData, na.rm = TRUE)
normData_sd <- apply(normData, 1, sd, na.rm = TRUE)
normData_sd <- apply(normData, 1, .data$sd, na.rm = TRUE)
# subtract mean
expressionMatrixzscored <- sweep(expressionMatrixMatched, 1, normData_means, FUN = "-")
# divide by SD
expressionMatrixzscored <- sweep(expressionMatrixzscored, 1,normData_sd, FUN = "/") %>% mutate(GeneSymbol=rownames(.)) %>% na.omit()
expressionMatrixzscored <- sweep(expressionMatrixzscored, 1,normData_sd, FUN = "/") %>% mutate(GeneSymbol=rownames(.data)) %>% na.omit(.data)


# To save GTEx/cohort scored matrix
Expand All @@ -63,13 +63,13 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
# fusion calls
fusion_sample_gene_df <- standardFusionCalls %>%
# We want to keep track of the gene symbols for each sample-fusion pair
dplyr::select(Sample, FusionName, Gene1A, Gene1B, Gene2A, Gene2B) %>%
dplyr::select(.data$Sample, .data$FusionName, .data$Gene1A, .data$Gene1B, .data$Gene2A, .data$Gene2B) %>%
# We want a single column that contains the gene symbols
tidyr::gather(Gene1A, Gene1B, Gene2A, Gene2B,
key = gene_position, value = GeneSymbol) %>%
tidyr::gather(.data$Gene1A, .data$Gene1B, .data$Gene2A, .data$Gene2B,
key = .data$gene_position, value = .data$GeneSymbol) %>%
# Remove columns without gene symbols
dplyr::filter(GeneSymbol != "") %>%
dplyr::arrange(Sample, FusionName) %>%
dplyr::filter(.data$GeneSymbol != "") %>%
dplyr::arrange(.data$Sample, .data$FusionName) %>%
# Retain only distinct rows
dplyr::distinct()

Expand All @@ -81,21 +81,21 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
# for each sample-fusion name pair
dplyr::left_join(expression_long_df, by = c("Sample", "GeneSymbol")) %>%
# for each sample-fusion name pair, are all genes under the expression threshold?
dplyr::group_by(FusionName, Sample) %>%
dplyr::select(FusionName, Sample,zscore_value,gene_position) %>%
dplyr::group_by(.data$FusionName, .data$Sample) %>%
dplyr::select(.data$FusionName, .data$Sample,.data$zscore_value,.data$gene_position) %>%
# cast to keep zscore and gene position
reshape2::dcast(FusionName+Sample ~gene_position,value.var = "zscore_value") %>%
# incase Gene2A/B dont exist like in STARfusion calls
add_column(!!!cols[setdiff(names(cols),names(.))]) %>%
add_column(!!!cols[setdiff(names(cols),names(.data))]) %>%
# get annotation from z score
dplyr::mutate(note_expression_Gene1A = ifelse((Gene1A>zscoreFilter| Gene1A< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene1B = ifelse((Gene1B>zscoreFilter| Gene1B< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene2A = ifelse((Gene2A>zscoreFilter| Gene2A< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene2B = ifelse((Gene2B>zscoreFilter| Gene2B< -zscoreFilter),"differentially expressed","no change")) %>%
dplyr::rename(zscore_Gene1A=Gene1A,
zscore_Gene1B=Gene1B,
zscore_Gene2A=Gene2A,
zscore_Gene2B=Gene2B) %>%
dplyr::mutate(note_expression_Gene1A = ifelse((.data$Gene1A>zscoreFilter| .data$Gene1A< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene1B = ifelse((.data$Gene1B>zscoreFilter| .data$Gene1B< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene2A = ifelse((.data$Gene2A>zscoreFilter| .data$Gene2A< -zscoreFilter),"differentially expressed","no change"),
note_expression_Gene2B = ifelse((.data$Gene2B>zscoreFilter| .data$Gene2B< -zscoreFilter),"differentially expressed","no change")) %>%
dplyr::rename(zscore_Gene1A=.data$Gene1A,
zscore_Gene1B=.data$Gene1B,
zscore_Gene2A=.data$Gene2A,
zscore_Gene2B=.data$Gene2B) %>%
# unique FusionName-Sample rows
# use this to filter the QC filtered fusion data frame
dplyr::inner_join(standardFusionCalls, by = c("FusionName", "Sample")) %>%
Expand Down
74 changes: 33 additions & 41 deletions R/aggregate_fusion_calls.R
Original file line number Diff line number Diff line change
@@ -1,64 +1,56 @@
#' Function to aggregate Caller and read support per fusions calls
#'
#' @param standardFusioncalls A dataframe from star fusion or arriba standardized to run through the filtering steps
#' @param removeother TRUE to remove Fusion_Type="other" and keep only in-frame and frameshift
#' @param removeother TRUE to remove Fusion_Type="other" and keep only in-frame and frameshift Default: FALSE
#' @param filterAnnots regex to remove from annots column eg. LOCAL_REARRANGEMENT|LOCAL_INVERSION
#' @return Standardized fusion calls with aggregated Caller and read support

aggregate_fusion_calls<-function(standardFusioncalls=standardFusioncalls,removeother=TRUE,filterAnnots=filterAnnots){
aggregate_fusion_calls<-function(standardFusioncalls=standardFusioncalls,removeother=FALSE,filterAnnots=filterAnnots){

if( removeother){
# aggregate caller per FusionName, Sample and Fusion_Type
fusion_caller.summary <- standardFusioncalls %>%
dplyr::filter(.data$Fusion_Type != "other") %>%
dplyr::select(.data$Sample,.data$FusionName,.data$Caller,.data$Fusion_Type) %>%
group_by(.data$FusionName, .data$Sample ,.data$Fusion_Type) %>%
unique() %>%
dplyr::mutate(CalledBy = toString(.data$Caller), caller.count = n())

if (!missing(filterAnnots)){
# remove fusion within local rearrangement if required for your project
standardFusioncalls <- standardFusioncalls %>%
# remove local rearrangement/adjacent genes use "LOCAL_REARRANGEMENT|LOCAL_INVERSION"
dplyr::filter(!grepl(filterAnnots,annots))
dplyr::filter(!grepl(filterAnnots,.data$annots))
}


if( removeother){
# aggregate caller
fusion_caller.summary <- standardFusioncalls %>%
dplyr::filter(Fusion_Type != "other") %>%
dplyr::select(Sample,FusionName,Caller) %>%
group_by(FusionName, Sample ) %>%
unique() %>%
dplyr::mutate(CalledBy = toString(Caller), caller.count = n())

# aggregate by read count
fusion_read.summary <- standardFusioncalls %>%
dplyr::filter(Fusion_Type != "other") %>%
dplyr::select(Sample,FusionName,JunctionReadCount,SpanningFragCount) %>%
group_by(FusionName, Sample ) %>%
unique() %>%
dplyr::mutate(JunctionReadCountSum=sum(JunctionReadCount),SpanningFragCountSum=sum(SpanningFragCount))

fusion_summary<-left_join(fusion_caller.summary,fusion_read.summary,by=c("Sample","FusionName"))


#to add aggregated caller from fusion_caller.summary
standardFusioncalls<-standardFusioncalls %>%
dplyr::filter(Fusion_Type != "other") %>%
left_join(fusion_summary,by=(c("Sample","FusionName","Caller","JunctionReadCount","SpanningFragCount")))
dplyr::filter(.data$Fusion_Type != "other") %>%
left_join(fusion_caller.summary,by=(c("Sample","FusionName","Fusion_Type","Caller"))) %>%
unique()

return(standardFusioncalls)
}
else {
# aggregate caller per FusionName, Sample and Fusion_Type
fusion_caller.summary <- standardFusioncalls %>%
dplyr::select(Sample,FusionName,Caller) %>%
group_by(FusionName, Sample ) %>%
unique() %>%
dplyr::mutate(CalledBy = toString(Caller), caller.count = n())

# aggregate by read count
fusion_read.summary <- standardFusioncalls %>%
dplyr::select(Sample,FusionName,JunctionReadCount,SpanningFragCount) %>%
group_by(FusionName, Sample ) %>%
dplyr::select(.data$Sample,.data$FusionName,.data$Caller,.data$Fusion_Type) %>%
group_by(.data$FusionName, .data$Sample ,.data$Fusion_Type) %>%
unique() %>%
dplyr::mutate(JunctionReadCountSum=sum(JunctionReadCount),SpanningFragCountSum=sum(SpanningFragCount))

fusion_summary<-left_join(fusion_caller.summary,fusion_read.summary,by=c("Sample","FusionName"))

dplyr::mutate(CalledBy = toString(.data$Caller), caller.count = n())

if (!missing(filterAnnots)){
# remove fusion within local rearrangement if required for your project
standardFusioncalls <- standardFusioncalls %>%
# remove local rearrangement/adjacent genes use "LOCAL_REARRANGEMENT|LOCAL_INVERSION"
dplyr::filter(!grepl(filterAnnots,.data$annots))
}

#to add aggregated caller from fusion_caller.summary
standardFusioncalls<-standardFusioncalls %>%
left_join(fusion_summary,by=(c("Sample","FusionName","Caller","JunctionReadCount","SpanningFragCount")))

left_join(fusion_caller.summary,by=(c("Sample","FusionName","Fusion_Type","Caller"))) %>%
unique()


return(standardFusioncalls)

Expand Down
130 changes: 130 additions & 0 deletions R/annoFuseSingleSample.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#' Single Sample use for annoFuse
#'
#'
#' Performs artifact filter to remove readthroughs,red flags and performs expression filtering with user provided expression Matrix and expression threshold

#' @param fusionfileArriba A dataframe from arriba fusion caller
#' @param fusionfileStarFusion A dataframe from starfusion caller
#' @param expressionFile Expression matrix for samples used in cohort for fusion calls
#' @param expressionFilter FPKM/TPM threshold for not expressed
#' @param tumorID Sample name to be used
#' @param readingFrameFilter A regex to capture readingframe (eg. in-frame|frameshift|other)
#' @param readthroughFilter Boolean for filtering readthroughs
#' @param artifactFilter A red flag filter from Annotation ; in OpenPBTA annotation is from FusionAnnotator column "annots"
#' @param junctionReadCountFilter An integer threshold for JunctionReadCount
#' @param spanningFragCountFilter An integer threshold for (SpanningFragCount - JunctionReadCount)
#' @return Standardized fusion calls annotated with gene list and fusion list provided in reference folder

annoFuseSingleSample<-function(fusionfileArriba=fusionfileArriba,fusionfileStarFusion=fusionfileStarFusion,expressionFile=NULL,expressionFilter=1,tumorID="tumorID",artifactFilter="GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap|ConjoinG",readingFrameFilter="in-frame|frameshift|other",junctionReadCountFilter=1,spanningFragCountFilter=10,readthroughFilter=FALSE){

# read files
STARFusioninputfile<-read_tsv(fusionfileStarFusion)
Arribainputfile<-read_tsv(fusionfileArriba,col_types = readr::cols(breakpoint1 = readr::col_character(),breakpoint2 = readr::col_character()))

# read in gene and fusion reference tab
geneListReferenceDataTab<-read.delim(system.file("extdata","genelistreference.txt", package="annoFuse"),stringsAsFactors = FALSE)
geneListReferenceDataTab<-geneListReferenceDataTab %>% dplyr::group_by(Gene_Symbol) %>% dplyr::mutate(type = toString(type)) %>%
dplyr::distinct(Gene_Symbol, type,file) %>% as.data.frame()

# column 1 as FusionName 2 source file 3 type; collapse to summarize type
fusionReferenceDataTab<-read.delim(system.file("extdata","fusionreference.txt", package="annoFuse"),stringsAsFactors = FALSE)
fusionReferenceDataTab<-fusionReferenceDataTab %>%
dplyr::distinct(FusionName,type,file) %>% as.data.frame()

# if StarFusion and Arriba files empty execution stops
if(is_empty(STARFusioninputfile$FusionName) & is_empty(Arribainputfile$"gene1--gene2")){
stop("StarFusion and Arriba files empty")
}

# if StarFusion and Arriba files are not empty
if(!is_empty(STARFusioninputfile$FusionName) & !is_empty(Arribainputfile$"gene1--gene2")){
colnames(Arribainputfile)[27]<-"annots"

# To have a general column with unique IDs associated with each sample
STARFusioninputfile$Sample <- tumorID
Arribainputfile$Sample <- tumorID
Arribainputfile$Caller <- "Arriba"
STARFusioninputfile$Caller <- "StarFusion"

# standardized fusion calls
standardizedSTARFusion<-fusion_standardization(fusion_calls = STARFusioninputfile,caller = "STARFUSION")
standardizedArriba<-fusion_standardization(fusion_calls = Arribainputfile,caller = "ARRIBA")

#merge standardized fusion calls
standardFusioncalls<-rbind(standardizedSTARFusion,standardizedArriba) %>% as.data.frame()
}

# if StarFusion file is empty only run standardization for Arriba calls
if(is_empty(STARFusioninputfile$FusionName) & !is_empty(Arribainputfile$"gene1--gene2")){
warning(paste("No fusion calls in StarFusion : ",opt$fusionfileStarFusion))

colnames(Arribainputfile)[27]<-"annots"
# To have a general column with unique IDs associated with each sample
Arribainputfile$Sample <- tumorID
Arribainputfile$Caller <- "Arriba"

# standardized fusion calls
standardizedArriba<-fusion_standardization(fusion_calls = Arribainputfile,caller = "ARRIBA")

# standardized fusion calls
standardFusioncalls<-standardizedArriba %>% as.data.frame()
}

# if Arriba file is empty only run standardization for StarFusion calls
if(!is_empty(STARFusioninputfile$FusionName) & is_empty(Arribainputfile$"gene1--gene2")){
warning(paste("No fusion calls in StarFusion : ",opt$fusionfileStarFusion))

# To have a general column with unique IDs associated with each sample
STARFusioninputfile$Sample <- tumorID
STARFusioninputfile$Caller <- "StarFusion"

# standardized fusion calls
standardizedSTARFusion<-fusion_standardization(fusion_calls = STARFusioninputfile,caller = "STARFUSION")

# standardized fusion calls
standardFusioncalls<-standardizedSTARFusion %>% as.data.frame()
}


# General fusion QC for read support and red flags
fusionQCFiltered<-fusion_filtering_QC(standardFusioncalls=standardFusioncalls,readingFrameFilter=readingFrameFilter,artifactFilter=artifactFilter,junctionReadCountFilter=junctionReadCountFilter,spanningFragCountFilter=spanningFragCountFilter,readthroughFilter=readthroughFilter)

if (!is.null(expressionFile)){
expressionMatrix<-read_tsv(expressionFile)

# split gene id and symbol
expressionMatrix <- expressionMatrix %>%
dplyr::mutate(gene_id = str_replace(gene_id, "_PAR_Y_", "_"))


expressionMatrix <- cbind(expressionMatrix, colsplit(expressionMatrix$gene_id, pattern = '_', names = c("EnsembleID","GeneSymbol")))


# collapse to matrix of HUGO symbols x Sample identifiers
# take max expression per row and use the max value for duplicated gene symbols
expressionMatrix.collapsed <-expressionMatrix %>%
arrange(desc(FPKM)) %>% # arrange decreasing by FPKM
distinct(GeneSymbol, .keep_all = TRUE) %>% # keep the ones with greatest FPKM value. If ties occur, keep the first occurencce
unique() %>%
remove_rownames() %>%
dplyr::select(EnsembleID,GeneSymbol,FPKM,gene_id)

# rename columns
colnames(expressionMatrix.collapsed)[3]<-tumorID

expressionFiltered<-annoFuse::expressionFilterFusion(standardFusioncalls = fusionQCFiltered,expressionFilter = expressionFilter,expressionMatrix = expressionMatrix.collapsed)

# annotated QC and expression filtered fusion calls
filteredFusionAnnotated<-annotate_fusion_calls(standardFusioncalls=expressionFiltered,geneListReferenceDataTab=geneListReferenceDataTab,fusionReferenceDataTab=fusionReferenceDataTab)
} else{
# annotated QC filtered fusion calls
filteredFusionAnnotated<-annotate_fusion_calls(standardFusioncalls=fusionQCFiltered,geneListReferenceDataTab=geneListReferenceDataTab,fusionReferenceDataTab=fusionReferenceDataTab)

}


# return filtered and annotated dataframe
return(filteredFusionAnnotated)


}
Loading

0 comments on commit da156b2

Please sign in to comment.