Merge pull request #3 from d3b-center/domain_annotation

Domain annotation
d3b-center · May 12, 2020 · da156b2 · da156b2
2 parents 73d380c + 5e29d8a
commit da156b2
Show file tree

Hide file tree

Showing 49 changed files with 2,970 additions and 3,248 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,16 +1,16 @@
 Package: annoFuse
 Type: Package
 Title: Annotation And Filtering Analysis For Fusion Calls
-Version: 0.1.8
+Version: 0.2
 Author: Krutika Gaonkark
 Maintainer: Krutika Gaonkark <[email protected]>
 Description: Using annoFuse, users can filter out fusions known to be artifactual and retained high-quality fusion calls using support of at least one junction read and remove false calls if there is disproportionate spanning fragment support of more than 10 reads compared to the junction read count. For prioritization, users can capture known as well as putative driver fusions reported in TCGA, or fusions containing gene partners that are known oncogenes, tumor suppressor genes, or COSMIC genes. 
     Finally, users can also determine recurrent fusions across the cohort and recurrently-fused genes within each histology. By providing a standardized filtering and annotation method from multiple callers (STAR-Fusion and Arriba) users are able to merge, filter and prioritize putative oncogenic fusions across the PBTA. 
-License: MIT
+License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 6.1.1.9000
-Imports: reshape2,dplyr,tidyr,ggplot2,qdapRegex,ggpubr,ensembldb,tibble,ggthemes,EnsDb.Hsapiens.v86,grid,tidyverse,readr
+Imports: reshape2,dplyr,tidyr,ggplot2,qdapRegex,ggpubr,ensembldb,tibble,ggthemes,EnsDb.Hsapiens.v86,grid,tidyverse,readr,grDevices,stats,utils,stringr
 Suggests: knitr, rmarkdown
 VignetteBuilder: knitr
 NeedsCompilation: no

diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2020
+COPYRIGHT HOLDER: Krutika Gaonkar
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2020 Krutika Gaonkar
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
@@ -14,4 +14,7 @@ import(tibble)
 import(EnsDb.Hsapiens.v86)
 import(ggthemes)
 import(grid)
-
+importFrom("stringr","str_detect")
+importFrom("grDevices", "rainbow")
+importFrom("stats", "median", "na.omit")
+importFrom("utils", "head")
diff --git a/R/.Rapp.history b/R/.Rapp.history
diff --git a/R/ZscoreAnnotation.R b/R/ZscoreAnnotation.R
@@ -19,14 +19,14 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
   expressionMatrixMatched <- expressionMatrix %>%
     unique() %>%
     # means for each row per each gene_id
-    dplyr::mutate(means = rowMeans(dplyr::select(.,-GeneSymbol,-gene_id,-EnsembleID))) %>%
+    dplyr::mutate(means = rowMeans(dplyr::select(.data$.,-.data$GeneSymbol,-.data$gene_id,-.data$EnsembleID))) %>%
     # arrange descending mean
-    arrange(desc(means)) %>%
+    arrange(desc(.data$means)) %>%
     # to keep only first occurence ie. max rowMean per GeneSymbol
-    distinct(GeneSymbol, .keep_all = TRUE) %>%
+    distinct(.data$GeneSymbol, .keep_all = TRUE) %>%
     ungroup() %>%
-    dplyr::select(-means,-gene_id,-EnsembleID) %>%
-    dplyr::filter(GeneSymbol %in% normData$GeneSymbol) %>%
+    dplyr::select(-.data$means,-.data$gene_id,-.data$EnsembleID) %>%
+    dplyr::filter(.data$GeneSymbol %in% normData$GeneSymbol) %>%
     tibble::column_to_rownames("GeneSymbol")
   expressionMatrixMatched<-log2(expressionMatrixMatched+1)
 
@@ -42,11 +42,11 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
 
   #normData mean and sd
   normData_means <- rowMeans(normData, na.rm = TRUE)
-  normData_sd <- apply(normData, 1, sd, na.rm = TRUE)
+  normData_sd <- apply(normData, 1, .data$sd, na.rm = TRUE)
   # subtract mean
   expressionMatrixzscored <- sweep(expressionMatrixMatched, 1, normData_means, FUN = "-")
   # divide by SD
-  expressionMatrixzscored <- sweep(expressionMatrixzscored, 1,normData_sd, FUN = "/") %>% mutate(GeneSymbol=rownames(.)) %>% na.omit()
+  expressionMatrixzscored <- sweep(expressionMatrixzscored, 1,normData_sd, FUN = "/") %>% mutate(GeneSymbol=rownames(.data)) %>% na.omit(.data)
 
 
   # To save GTEx/cohort scored matrix
@@ -63,13 +63,13 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
   # fusion calls
   fusion_sample_gene_df <- standardFusionCalls %>%
     # We want to keep track of the gene symbols for each sample-fusion pair
-    dplyr::select(Sample, FusionName, Gene1A, Gene1B, Gene2A, Gene2B) %>%
+    dplyr::select(.data$Sample, .data$FusionName, .data$Gene1A, .data$Gene1B, .data$Gene2A, .data$Gene2B) %>%
     # We want a single column that contains the gene symbols
-    tidyr::gather(Gene1A, Gene1B, Gene2A, Gene2B,
-                  key = gene_position, value = GeneSymbol) %>%
+    tidyr::gather(.data$Gene1A, .data$Gene1B, .data$Gene2A, .data$Gene2B,
+                  key = .data$gene_position, value = .data$GeneSymbol) %>%
     # Remove columns without gene symbols
-    dplyr::filter(GeneSymbol != "") %>%
-    dplyr::arrange(Sample, FusionName) %>%
+    dplyr::filter(.data$GeneSymbol != "") %>%
+    dplyr::arrange(.data$Sample, .data$FusionName) %>%
     # Retain only distinct rows
     dplyr::distinct()
 
@@ -81,21 +81,21 @@ ZscoredAnnotation<-function(standardFusionCalls=standardFusionCalls,zscoreFilter
     # for each sample-fusion name pair
     dplyr::left_join(expression_long_df, by = c("Sample", "GeneSymbol")) %>%
     # for each sample-fusion name pair, are all genes under the expression threshold?
-    dplyr::group_by(FusionName, Sample) %>%
-    dplyr::select(FusionName, Sample,zscore_value,gene_position) %>%
+    dplyr::group_by(.data$FusionName, .data$Sample) %>%
+    dplyr::select(.data$FusionName, .data$Sample,.data$zscore_value,.data$gene_position) %>%
     # cast to keep zscore and gene position
     reshape2::dcast(FusionName+Sample ~gene_position,value.var = "zscore_value") %>%
     # incase Gene2A/B dont exist like in STARfusion calls
-    add_column(!!!cols[setdiff(names(cols),names(.))]) %>%
+    add_column(!!!cols[setdiff(names(cols),names(.data))]) %>%
     # get annotation from z score
-    dplyr::mutate(note_expression_Gene1A = ifelse((Gene1A>zscoreFilter| Gene1A< -zscoreFilter),"differentially expressed","no change"),
-                  note_expression_Gene1B = ifelse((Gene1B>zscoreFilter| Gene1B< -zscoreFilter),"differentially expressed","no change"),
-                  note_expression_Gene2A = ifelse((Gene2A>zscoreFilter| Gene2A< -zscoreFilter),"differentially expressed","no change"),
-                  note_expression_Gene2B = ifelse((Gene2B>zscoreFilter| Gene2B< -zscoreFilter),"differentially expressed","no change")) %>%
-    dplyr::rename(zscore_Gene1A=Gene1A,
-                  zscore_Gene1B=Gene1B,
-                  zscore_Gene2A=Gene2A,
-                  zscore_Gene2B=Gene2B) %>%
+    dplyr::mutate(note_expression_Gene1A = ifelse((.data$Gene1A>zscoreFilter| .data$Gene1A< -zscoreFilter),"differentially expressed","no change"),
+                  note_expression_Gene1B = ifelse((.data$Gene1B>zscoreFilter| .data$Gene1B< -zscoreFilter),"differentially expressed","no change"),
+                  note_expression_Gene2A = ifelse((.data$Gene2A>zscoreFilter| .data$Gene2A< -zscoreFilter),"differentially expressed","no change"),
+                  note_expression_Gene2B = ifelse((.data$Gene2B>zscoreFilter| .data$Gene2B< -zscoreFilter),"differentially expressed","no change")) %>%
+    dplyr::rename(zscore_Gene1A=.data$Gene1A,
+                  zscore_Gene1B=.data$Gene1B,
+                  zscore_Gene2A=.data$Gene2A,
+                  zscore_Gene2B=.data$Gene2B) %>%
     # unique FusionName-Sample rows
     # use this to filter the QC filtered fusion data frame
     dplyr::inner_join(standardFusionCalls, by = c("FusionName", "Sample")) %>%

diff --git a/R/aggregate_fusion_calls.R b/R/aggregate_fusion_calls.R
@@ -1,64 +1,56 @@
 #' Function to aggregate Caller and read support per fusions calls
 #'
 #' @param standardFusioncalls A dataframe from star fusion or arriba standardized to run through the filtering steps
-#' @param removeother TRUE to remove Fusion_Type="other" and keep only in-frame and frameshift
+#' @param removeother TRUE to remove Fusion_Type="other" and keep only in-frame and frameshift Default: FALSE
 #' @param filterAnnots regex to remove from annots column eg. LOCAL_REARRANGEMENT|LOCAL_INVERSION
 #' @return Standardized fusion calls with aggregated Caller and read support
 
-aggregate_fusion_calls<-function(standardFusioncalls=standardFusioncalls,removeother=TRUE,filterAnnots=filterAnnots){
+aggregate_fusion_calls<-function(standardFusioncalls=standardFusioncalls,removeother=FALSE,filterAnnots=filterAnnots){
 
+  if( removeother){
+  # aggregate caller per FusionName, Sample and Fusion_Type
+  fusion_caller.summary <- standardFusioncalls %>%
+    dplyr::filter(.data$Fusion_Type != "other") %>%
+    dplyr::select(.data$Sample,.data$FusionName,.data$Caller,.data$Fusion_Type) %>%
+    group_by(.data$FusionName, .data$Sample ,.data$Fusion_Type) %>%
+    unique() %>%
+    dplyr::mutate(CalledBy = toString(.data$Caller), caller.count = n())
+
   if (!missing(filterAnnots)){
     # remove fusion within local rearrangement if required for your project
     standardFusioncalls <- standardFusioncalls %>%
       # remove local rearrangement/adjacent genes use "LOCAL_REARRANGEMENT|LOCAL_INVERSION"
-      dplyr::filter(!grepl(filterAnnots,annots))
+      dplyr::filter(!grepl(filterAnnots,.data$annots))
   }
-
-
-  if( removeother){
-  # aggregate caller
-  fusion_caller.summary <- standardFusioncalls %>%
-    dplyr::filter(Fusion_Type != "other") %>%
-    dplyr::select(Sample,FusionName,Caller) %>%
-    group_by(FusionName, Sample ) %>%
-    unique() %>%
-    dplyr::mutate(CalledBy = toString(Caller), caller.count = n())
-
-  # aggregate by read count
-  fusion_read.summary <- standardFusioncalls %>%
-    dplyr::filter(Fusion_Type != "other") %>%
-    dplyr::select(Sample,FusionName,JunctionReadCount,SpanningFragCount) %>%
-    group_by(FusionName, Sample ) %>%
-    unique() %>%
-    dplyr::mutate(JunctionReadCountSum=sum(JunctionReadCount),SpanningFragCountSum=sum(SpanningFragCount))
-
-  fusion_summary<-left_join(fusion_caller.summary,fusion_read.summary,by=c("Sample","FusionName"))
-
+
+  #to add aggregated caller from fusion_caller.summary
   standardFusioncalls<-standardFusioncalls %>%
-    dplyr::filter(Fusion_Type != "other") %>%
-    left_join(fusion_summary,by=(c("Sample","FusionName","Caller","JunctionReadCount","SpanningFragCount")))
+    dplyr::filter(.data$Fusion_Type != "other") %>%
+    left_join(fusion_caller.summary,by=(c("Sample","FusionName","Fusion_Type","Caller"))) %>% 
+    unique()
 
   return(standardFusioncalls)
   }
   else {
+    # aggregate caller per FusionName, Sample and Fusion_Type
     fusion_caller.summary <- standardFusioncalls %>%
-      dplyr::select(Sample,FusionName,Caller) %>%
-      group_by(FusionName, Sample ) %>%
-      unique() %>%
-      dplyr::mutate(CalledBy = toString(Caller), caller.count = n())
-
-    # aggregate by read count
-    fusion_read.summary <- standardFusioncalls %>%
-      dplyr::select(Sample,FusionName,JunctionReadCount,SpanningFragCount) %>%
-      group_by(FusionName, Sample ) %>%
+      dplyr::select(.data$Sample,.data$FusionName,.data$Caller,.data$Fusion_Type) %>%
+      group_by(.data$FusionName, .data$Sample ,.data$Fusion_Type) %>%
       unique() %>%
-      dplyr::mutate(JunctionReadCountSum=sum(JunctionReadCount),SpanningFragCountSum=sum(SpanningFragCount))
-
-    fusion_summary<-left_join(fusion_caller.summary,fusion_read.summary,by=c("Sample","FusionName"))
-
+      dplyr::mutate(CalledBy = toString(.data$Caller), caller.count = n()) 
+
+    if (!missing(filterAnnots)){
+      # remove fusion within local rearrangement if required for your project
+      standardFusioncalls <- standardFusioncalls %>%
+        # remove local rearrangement/adjacent genes use "LOCAL_REARRANGEMENT|LOCAL_INVERSION"
+        dplyr::filter(!grepl(filterAnnots,.data$annots))
+    }
+
+    #to add aggregated caller from fusion_caller.summary
     standardFusioncalls<-standardFusioncalls %>%
-      left_join(fusion_summary,by=(c("Sample","FusionName","Caller","JunctionReadCount","SpanningFragCount")))
-
+      left_join(fusion_caller.summary,by=(c("Sample","FusionName","Fusion_Type","Caller"))) %>% 
+      unique()
+
 
     return(standardFusioncalls)
 

diff --git a/R/annoFuseSingleSample.R b/R/annoFuseSingleSample.R
@@ -0,0 +1,130 @@
+#' Single Sample use for annoFuse
+#'
+#' 
+#' Performs artifact filter to remove readthroughs,red flags and performs expression filtering with user provided expression Matrix and expression threshold
+
+#' @param fusionfileArriba A dataframe from arriba fusion caller
+#' @param fusionfileStarFusion A dataframe from starfusion caller
+#' @param expressionFile Expression matrix for samples used in cohort for fusion calls
+#' @param expressionFilter FPKM/TPM threshold for not expressed
+#' @param tumorID Sample name to be used 
+#' @param readingFrameFilter A regex to capture readingframe (eg. in-frame|frameshift|other)
+#' @param readthroughFilter Boolean for filtering readthroughs
+#' @param artifactFilter A red flag filter from Annotation ; in OpenPBTA annotation is from FusionAnnotator column "annots"
+#' @param junctionReadCountFilter An integer threshold for JunctionReadCount
+#' @param spanningFragCountFilter An integer threshold for (SpanningFragCount - JunctionReadCount)
+#' @return Standardized fusion calls annotated with gene list and fusion list provided in reference folder
+
+annoFuseSingleSample<-function(fusionfileArriba=fusionfileArriba,fusionfileStarFusion=fusionfileStarFusion,expressionFile=NULL,expressionFilter=1,tumorID="tumorID",artifactFilter="GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap|ConjoinG",readingFrameFilter="in-frame|frameshift|other",junctionReadCountFilter=1,spanningFragCountFilter=10,readthroughFilter=FALSE){
+
+  # read files
+  STARFusioninputfile<-read_tsv(fusionfileStarFusion)
+  Arribainputfile<-read_tsv(fusionfileArriba,col_types = readr::cols(breakpoint1 = readr::col_character(),breakpoint2 = readr::col_character()))
+
+  # read in gene and fusion reference tab
+  geneListReferenceDataTab<-read.delim(system.file("extdata","genelistreference.txt", package="annoFuse"),stringsAsFactors = FALSE)
+  geneListReferenceDataTab<-geneListReferenceDataTab %>% dplyr::group_by(Gene_Symbol) %>% dplyr::mutate(type = toString(type)) %>%
+    dplyr::distinct(Gene_Symbol, type,file) %>% as.data.frame()
+
+  # column 1 as FusionName 2 source file 3 type; collapse to summarize type
+  fusionReferenceDataTab<-read.delim(system.file("extdata","fusionreference.txt", package="annoFuse"),stringsAsFactors = FALSE)
+  fusionReferenceDataTab<-fusionReferenceDataTab %>%
+    dplyr::distinct(FusionName,type,file) %>% as.data.frame()
+
+  # if StarFusion and Arriba files empty execution stops
+  if(is_empty(STARFusioninputfile$FusionName) & is_empty(Arribainputfile$"gene1--gene2")){
+    stop("StarFusion and Arriba files empty")
+  }
+
+  # if StarFusion and Arriba files are not empty
+  if(!is_empty(STARFusioninputfile$FusionName) & !is_empty(Arribainputfile$"gene1--gene2")){
+    colnames(Arribainputfile)[27]<-"annots"
+
+    # To have a general column with unique IDs associated with each sample
+    STARFusioninputfile$Sample <- tumorID
+    Arribainputfile$Sample <- tumorID
+    Arribainputfile$Caller <- "Arriba"
+    STARFusioninputfile$Caller <- "StarFusion"
+
+    # standardized fusion calls
+    standardizedSTARFusion<-fusion_standardization(fusion_calls = STARFusioninputfile,caller = "STARFUSION")
+    standardizedArriba<-fusion_standardization(fusion_calls = Arribainputfile,caller = "ARRIBA")
+
+    #merge standardized fusion calls
+    standardFusioncalls<-rbind(standardizedSTARFusion,standardizedArriba) %>% as.data.frame()
+  }
+
+  # if StarFusion file is empty only run standardization for Arriba calls
+  if(is_empty(STARFusioninputfile$FusionName) & !is_empty(Arribainputfile$"gene1--gene2")){
+    warning(paste("No fusion calls in StarFusion : ",opt$fusionfileStarFusion))
+
+    colnames(Arribainputfile)[27]<-"annots"
+    # To have a general column with unique IDs associated with each sample
+    Arribainputfile$Sample <- tumorID
+    Arribainputfile$Caller <- "Arriba"
+
+    # standardized fusion calls
+    standardizedArriba<-fusion_standardization(fusion_calls = Arribainputfile,caller = "ARRIBA")
+
+    # standardized fusion calls
+    standardFusioncalls<-standardizedArriba %>% as.data.frame()
+  }
+
+  # if Arriba file is empty only run standardization for StarFusion calls
+  if(!is_empty(STARFusioninputfile$FusionName) & is_empty(Arribainputfile$"gene1--gene2")){
+    warning(paste("No fusion calls in StarFusion : ",opt$fusionfileStarFusion))
+
+    # To have a general column with unique IDs associated with each sample
+    STARFusioninputfile$Sample <- tumorID
+    STARFusioninputfile$Caller <- "StarFusion"
+
+    # standardized fusion calls
+    standardizedSTARFusion<-fusion_standardization(fusion_calls = STARFusioninputfile,caller = "STARFUSION")
+
+    # standardized fusion calls
+    standardFusioncalls<-standardizedSTARFusion %>% as.data.frame()
+  }
+
+
+  # General fusion QC for read support and red flags
+  fusionQCFiltered<-fusion_filtering_QC(standardFusioncalls=standardFusioncalls,readingFrameFilter=readingFrameFilter,artifactFilter=artifactFilter,junctionReadCountFilter=junctionReadCountFilter,spanningFragCountFilter=spanningFragCountFilter,readthroughFilter=readthroughFilter)
+
+  if (!is.null(expressionFile)){
+  expressionMatrix<-read_tsv(expressionFile)
+
+  # split gene id and symbol
+  expressionMatrix <- expressionMatrix %>% 
+    dplyr::mutate(gene_id = str_replace(gene_id, "_PAR_Y_", "_")) 
+
+
+  expressionMatrix <- cbind(expressionMatrix, colsplit(expressionMatrix$gene_id, pattern = '_', names = c("EnsembleID","GeneSymbol")))
+
+
+  # collapse to matrix of HUGO symbols x Sample identifiers
+  # take max expression per row and use the max value for duplicated gene symbols
+  expressionMatrix.collapsed <-expressionMatrix  %>% 
+    arrange(desc(FPKM)) %>% # arrange decreasing by FPKM
+    distinct(GeneSymbol, .keep_all = TRUE) %>% # keep the ones with greatest FPKM value. If ties occur, keep the first occurencce
+    unique() %>%
+    remove_rownames()  %>%
+    dplyr::select(EnsembleID,GeneSymbol,FPKM,gene_id)
+
+  # rename columns
+  colnames(expressionMatrix.collapsed)[3]<-tumorID
+
+  expressionFiltered<-annoFuse::expressionFilterFusion(standardFusioncalls = fusionQCFiltered,expressionFilter = expressionFilter,expressionMatrix = expressionMatrix.collapsed)
+
+  # annotated QC and expression filtered fusion calls
+  filteredFusionAnnotated<-annotate_fusion_calls(standardFusioncalls=expressionFiltered,geneListReferenceDataTab=geneListReferenceDataTab,fusionReferenceDataTab=fusionReferenceDataTab)
+  } else{
+    # annotated QC filtered fusion calls
+    filteredFusionAnnotated<-annotate_fusion_calls(standardFusioncalls=fusionQCFiltered,geneListReferenceDataTab=geneListReferenceDataTab,fusionReferenceDataTab=fusionReferenceDataTab)
+
+  }
+
+
+  # return filtered and annotated dataframe
+  return(filteredFusionAnnotated)
+
+
+}