|
#' readMaf
#' @description Read a tab-delimited MAF file (plain text or *.gz compressed)
#' together with a sample information (clinical) file and, optionally, a CCF
#' file, and build one Maf object per patient.
#'
#' @param mafFile A tab-delimited MAF file (plain text or *.gz compressed).
#' Required.
#' @param clinicalFile A tab-delimited clinical data file that includes
#' Tumor_Sample_Barcode, Tumor_ID and Patient_ID. Tumor_Sample_Label is
#' optional. Required. Columns shared with mafFile (other than
#' Tumor_Sample_Barcode) are taken from this file.
#' @param ccfFile A CCF file of somatic mutations. Default NULL.
#' @param adjusted.VAF Whether adjusted VAF is included in mafFile. When TRUE,
#' the VAF column is copied to VAF_adj. Default FALSE.
#' @param nonSyn.vc List of Variant classifications which are considered as
#' non-silent. Default NULL, use Variant Classifications with
#' "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
#' "Translation_Start_Site", "Nonsense_Mutation", "Nonstop_Mutation",
#' "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation".
#' @param use.indel.ccf Whether include indels in ccfFile. Default FALSE.
#' @param ccf.conf.level The confidence level of CCF to identify clonal or
#' subclonal. Only works when "CCF_std" or "CCF_CI_high" is provided in
#' ccfFile. Default 0.95.
#' @param remove.empty.VAF Whether removing the mutations with VAF=0. When
#' making the comparison of pair-wise CCF, retain mutations with VAF=0.
#' Default TRUE.
#' @param refBuild Human reference genome version. Default 'hg19'.
#' Optional: 'hg18' or 'hg38'.
#'
#' @examples
#' maf.File <- system.file("extdata/", "CRC_HZ.maf", package = "MesKit")
#' clin.File <- system.file("extdata/", "CRC_HZ.clin.txt", package = "MesKit")
#' ccf.File <- system.file("extdata/", "CRC_HZ.ccf.tsv", package = "MesKit")
#' maf <- readMaf(mafFile=maf.File,clinicalFile = clin.File, refBuild="hg19")
#' maf <- readMaf(mafFile=maf.File, clinicalFile = clin.File, ccfFile=ccf.File, refBuild="hg19")
#' @return A Maf object when the data contain a single patient, otherwise a
#' MafList holding one Maf per patient.
#' @import methods
#' @importFrom data.table fread setkey
#' @importFrom stats qnorm
#' @export readMaf
readMaf <- function(
    mafFile,
    clinicalFile,
    ccfFile = NULL,
    adjusted.VAF = FALSE,
    nonSyn.vc = NULL,
    use.indel.ccf = FALSE,
    ccf.conf.level = 0.95,
    remove.empty.VAF = TRUE,
    refBuild = "hg19"
) {

    refBuild <- match.arg(
        refBuild,
        choices = c("hg18", "hg19", "hg38"),
        several.ok = FALSE
    )

    ## default set of non-silent mutation types
    if (is.null(nonSyn.vc)) {
        nonSyn.vc <- c(
            "Frame_Shift_Del",
            "Frame_Shift_Ins",
            "Splice_Site",
            "Translation_Start_Site",
            "Nonsense_Mutation",
            "Nonstop_Mutation",
            "In_Frame_Del",
            "In_Frame_Ins",
            "Missense_Mutation"
        )
    }

    ## skip = "Hugo_Symbol" makes fread start at the header line, so any
    ## leading lines before the column header are ignored
    maf_data <- data.table::fread(
        file = mafFile,
        quote = "",
        header = TRUE,
        data.table = TRUE,
        fill = TRUE,
        sep = "\t",
        skip = "Hugo_Symbol",
        stringsAsFactors = FALSE
    )

    clin_data <- data.table::fread(
        file = clinicalFile,
        quote = "",
        header = TRUE,
        data.table = TRUE,
        fill = TRUE,
        sep = "\t",
        stringsAsFactors = FALSE
    )

    ## both tables are joined on Tumor_Sample_Barcode; fail early with a
    ## readable message instead of an error from inside the join
    if (!"Tumor_Sample_Barcode" %in% colnames(maf_data)) {
        stop("mafFile must contain a 'Tumor_Sample_Barcode' column.")
    }
    if (!"Tumor_Sample_Barcode" %in% colnames(clin_data)) {
        stop("clinicalFile must contain a 'Tumor_Sample_Barcode' column.")
    }

    ## merge maf data and clinical data: columns present in both files
    ## (except the join key) are dropped from the MAF so that the clinical
    ## file's values win and the join does not create .x/.y duplicates
    shared_cols <- setdiff(
        intersect(colnames(maf_data), colnames(clin_data)),
        "Tumor_Sample_Barcode"
    )
    maf_data <- dplyr::select(maf_data, -dplyr::all_of(shared_cols))
    maf_data <- dplyr::left_join(
        maf_data,
        clin_data,
        by = c("Tumor_Sample_Barcode")
    )

    ## check maf data
    maf_data <- validMaf(maf_data, remove.empty.VAF)

    ## calculate Total_allele_depth (temporary column, removed again below)
    maf_data <- maf_data %>%
        dplyr::mutate(
            Total_allele_depth = .data$Ref_allele_depth + .data$Alt_allele_depth
        ) %>%
        as.data.frame()

    if (adjusted.VAF) {
        maf_data$VAF_adj <- maf_data$VAF
    }

    ## read ccf files
    if (!is.null(ccfFile)) {
        ccf_data <- suppressWarnings(data.table::fread(
            ccfFile,
            quote = "",
            header = TRUE,
            fill = TRUE,
            sep = "\t",
            stringsAsFactors = FALSE
        ))
        ## check ccf_data
        ccf_data <- validCCF(ccf_data, maf_data, use.indel.ccf = use.indel.ccf)
        ## merge ccf_data to maf_data
        ## NOTE(review): `sample.info` is not defined at this point in the
        ## function. Because R evaluates arguments lazily, this call only
        ## works if readCCF never evaluates that argument (or a variable of
        ## that name exists in an enclosing environment) -- confirm against
        ## readCCF and drop the argument there if it is unused.
        maf_data <- readCCF(
            maf_data,
            ccf_data,
            ccf.conf.level,
            sample.info,
            adjusted.VAF,
            use.indel.ccf = use.indel.ccf
        )
    }

    ## calculate average adjusted VAF: depth-weighted mean of VAF_adj over all
    ## rows sharing patient, tumor and variant, rounded to 3 digits
    if ("VAF_adj" %in% colnames(maf_data)) {
        maf_data <- maf_data %>%
            dplyr::group_by(
                .data$Patient_ID, .data$Tumor_ID, .data$Chromosome,
                .data$Start_Position, .data$Reference_Allele,
                .data$Tumor_Seq_Allele2
            ) %>%
            dplyr::mutate(
                Tumor_Average_VAF = round(
                    sum(.data$VAF_adj * .data$Total_allele_depth) /
                        sum(.data$Total_allele_depth),
                    3
                )
            )
    }

    maf_data <- maf_data %>%
        dplyr::ungroup() %>%
        dplyr::select(-"Total_allele_depth") %>%
        as.data.frame()

    ## one data frame per patient; split() names the pieces with the
    ## character form of Patient_ID and drops rows whose Patient_ID is NA
    data_list <- split(maf_data, maf_data$Patient_ID)
    if (length(data_list) == 0) {
        stop("No rows with a non-missing Patient_ID are available to build a Maf object.")
    }

    ## The result list is keyed by the character names from split(). Using the
    ## raw Patient_ID as `[[` index would be positional for numeric IDs and
    ## pad the list with NULL entries.
    patient_ids <- names(data_list)
    maf_patient_list <- vector("list", length(data_list))
    names(maf_patient_list) <- patient_ids

    for (i in seq_along(data_list)) {
        patient <- patient_ids[i]
        patient_data <- data_list[[i]]

        ## one row per tumor sample of this patient
        sample.info <- patient_data %>%
            dplyr::select("Tumor_Sample_Barcode", "Tumor_ID") %>%
            dplyr::distinct(.data$Tumor_Sample_Barcode, .keep_all = TRUE)

        n <- nrow(sample.info)
        if (n < 2) {
            stop(paste0(
                patient, " has only ", n, " tumor samples. ",
                "A minimum of two tumor samples are required for each patient."
            ))
        }

        ## set Maf
        maf_patient_list[[i]] <- Maf(
            data = data.table::setDT(patient_data),
            sample.info = as.data.frame(sample.info),
            nonSyn.vc = nonSyn.vc,
            ref.build = refBuild
        )
    }

    ## a single patient is returned as a bare Maf, several as a MafList
    if (length(maf_patient_list) > 1) {
        return(MafList(maf_patient_list))
    }
    maf_patient_list[[1]]
}
| 169 | + |
| 170 | + |
| 171 | + |
0 commit comments