Skip to content

Commit

Permalink
Tested centrePeaks
Browse files Browse the repository at this point in the history
  • Loading branch information
smped committed Jul 28, 2024
1 parent 7cb5a8e commit 5c97ae7
Show file tree
Hide file tree
Showing 7 changed files with 363 additions and 2 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: extraChIPs
Version: 1.9.4
Version: 1.9.6
Title: Additional functions for working with ChIP-Seq data
Authors@R: person("Stevie", "Pederson",
email = "[email protected]",
Expand Down Expand Up @@ -34,6 +34,7 @@ Imports:
edgeR (>= 4.0),
forcats,
GenomeInfoDb,
GenomicAlignments,
GenomicInteractions,
ggforce,
ggrepel,
Expand Down
18 changes: 18 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ S3method(as_tibble,SummarizedExperiment)
S3method(as_tibble,TopTags)
export(addDiffStatus)
export(bestOverlap)
export(centrePeaks)
export(chopMC)
export(colToRanges)
export(collapseGenes)
Expand Down Expand Up @@ -46,6 +47,7 @@ export(unionMC)
export(voomWeightsFromCPM)
exportMethods(addDiffStatus)
exportMethods(bestOverlap)
exportMethods(centrePeaks)
exportMethods(colToRanges)
exportMethods(fitAssayDiff)
exportMethods(getProfileData)
Expand Down Expand Up @@ -73,9 +75,14 @@ import(ggside)
import(methods)
importClassesFrom(GenomicRanges,GRanges)
importClassesFrom(IRanges,CompressedList)
importClassesFrom(IRanges,RleList)
importClassesFrom(IRanges,RleViewsList)
importClassesFrom(Rsamtools,BamFileList)
importClassesFrom(S4Vectors,DataFrame)
importClassesFrom(S4Vectors,HitsList)
importClassesFrom(SummarizedExperiment,RangedSummarizedExperiment)
importClassesFrom(rtracklayer,BigWigFile)
importClassesFrom(rtracklayer,BigWigFileList)
importFrom(BiocIO,path)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bpisup)
Expand All @@ -95,23 +102,31 @@ importFrom(GenomeInfoDb,keepSeqlevels)
importFrom(GenomeInfoDb,seqinfo)
importFrom(GenomeInfoDb,seqlevels)
importFrom(GenomeInfoDb,seqnames)
importFrom(GenomicAlignments,coverage)
importFrom(GenomicInteractions,GenomicInteractions)
importFrom(GenomicInteractions,InteractionTrack)
importFrom(GenomicInteractions,anchorOne)
importFrom(GenomicInteractions,anchorTwo)
importFrom(GenomicInteractions,calculateDistances)
importFrom(IRanges,RleList)
importFrom(IRanges,RleViewsList)
importFrom(IRanges,SplitDataFrameList)
importFrom(IRanges,Views)
importFrom(IRanges,commonColnames)
importFrom(IRanges,overlapsAny)
importFrom(IRanges,subsetByOverlaps)
importFrom(IRanges,viewMaxs)
importFrom(IRanges,viewWhichMaxs)
importFrom(InteractionSet,anchors)
importFrom(RColorBrewer,brewer.pal)
importFrom(Rsamtools,BamFileList)
importFrom(Rsamtools,ScanBamParam)
importFrom(Rsamtools,countBam)
importFrom(Rsamtools,idxstatsBam)
importFrom(S4Vectors,"mcols<-")
importFrom(S4Vectors,"metadata<-")
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,Rle)
importFrom(S4Vectors,SimpleList)
importFrom(S4Vectors,endoapply)
importFrom(S4Vectors,mcols)
Expand Down Expand Up @@ -172,7 +187,10 @@ importFrom(glue,glue)
importFrom(glue,glue_collapse)
importFrom(grDevices,hcl.colors)
importFrom(grid,grid.newpage)
importFrom(matrixStats,rowMeans2)
importFrom(matrixStats,rowMedians)
importFrom(matrixStats,rowSds)
importFrom(matrixStats,weightedMean)
importFrom(methods,as)
importFrom(methods,is)
importFrom(methods,new)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,7 @@
# Changes in 1.7.7

- Added `merge_within` to `makeConsensus()` for better handling when `method = "coverage"`

# Changes in 1.9.6

- Added the function `centrePeaks()` to recentre peaks using any files with coverage
231 changes: 231 additions & 0 deletions R/centrePeaks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
#' Re-estimate peak centres from coverage
#'
#' Use coverage to estimate peak centres
#'
#' @details
#' Use coverage to estimate the centre of a set of peaks or GenomicRanges.
#'
#' If using the mean or median, the point of maximum coverage for each sample
#' will be found within each peak and these positions will be averaged to return
#' a position representing an estimated peak centre.
#'
#' If using weighted.cov, positions are weighted by the combined coverage across
#' all samples to return the weighted mean position. In this case coverage will
#' be scaled by total alignments within each bam file before summing across files
#'
#' @return A GRanges object with all widths set to one
#'
#' @param x A set of GRanges representing peaks
#' @param y A suitable set of files with methods defined
#' @param f The function to use when estimating a combined peak centre
#' @param BPPARAM An object of class BPPARAM
#' @param ... Used to pass arguments between methods
#'
#' @examples
#' ## Define some peaks
#' f <- system.file("extdata/peaks.bed.gz", package = "extraChIPs")
#' peaks <- importPeaks(f, type = "bed")[[1]]
#' peaks
#'
#' ## Use a bam file to re-centre the regions using highest coverage
#' bf <- system.file("extdata/bam/ex1.bam", package = "extraChIPs")
#' centres <- centrePeaks(peaks, bf, BPPARAM = SerialParam())
#' centres
#'
#' @name centrePeaks
#' @rdname centrePeaks-methods
#' @export
#'
setGeneric("centrePeaks", function(x, y, ...) standardGeneric("centrePeaks"))
#' @importFrom GenomicAlignments coverage
#' @importFrom Rsamtools ScanBamParam idxstatsBam
#' @rdname centrePeaks-methods
#' @export
setMethod(
"centrePeaks",
signature = signature(x = "GRanges", y = "BamFileList"),
function(
x, y, f = c("weighted.cov", "mean", "median"), BPPARAM = bpparam(), ...
) {
## Set the function to use when calculating positions
f <- match.arg(f)

## Check BiocParallel is ready to go
if (!bpisup(BPPARAM)) {
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM))
}

## Get the coverage & scale by total reads
message(
sprintf("Getting coverage across all %i peaks...", length(x)),
appendLF = FALSE
)
t1 <- Sys.time()
sbp <- ScanBamParam(which = x)
cov <- bplapply(y, coverage, param = sbp, BPPARAM = BPPARAM)
tot_aln <- vapply(y, \(i) sum(idxstatsBam(i)$mapped), numeric(1))
t2 <- Sys.time()
message(sprintf("done (%.2fs)", t2 - t1))

.centreFromCov(cov, x, f, tot_aln, BPPARAM)

}
)
#' @importFrom Rsamtools BamFileList
#' @rdname centrePeaks-methods
#' @export
setMethod(
"centrePeaks",
signature = signature(x = "GRanges", y = "BamFile"),
function(x, y, ...) {
bfl <- BamFileList(y)
names(bfl) <- basename(path(y))
centrePeaks(x, bfl, ...)
}
)
#' @importClassesFrom IRanges RleList
#' @importClassesFrom rtracklayer BigWigFileList
#' @importFrom rtracklayer import.bw
#' @importFrom S4Vectors splitAsList Rle
#' @rdname centrePeaks-methods
#' @export
setMethod(
"centrePeaks",
signature = signature(x = "GRanges", y = "BigWigFileList"),
function(
x, y, f = c("weighted.cov", "mean", "median"), BPPARAM = bpparam(), ...
) {
## Set the function to use when calculating positions
f <- match.arg(f)

## Check BiocParallel is ready to go
if (!bpisup(BPPARAM)) {
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM))
}

## Get the coverage without any scaling
message(
sprintf("Getting coverage across all %i peaks...", length(x)),
appendLF = FALSE
)
t1 <- Sys.time()
grl <- bplapply(y, import.bw, which = x, BPPARAM = BPPARAM)
## Add zero scores to all missing positions
grl_exp <- lapply(
grl,
\(gr) {
gnm <- setdiff(GRanges(seqinfo(gr)), gr)
gnm$score <- 0
sort(c(gr, gnm))
}
)
cov <- lapply(
grl_exp,
\(gr) {
grl <- splitAsList(gr, seqnames(gr))
rl <- lapply(grl, \(x) Rle(values = x$score, lengths = width(x)))
RleList(rl, compress = FALSE)
}
)
t2 <- Sys.time()
message(sprintf("done (%.2fs)", t2 - t1))

.centreFromCov(cov, x, f, 1, BPPARAM)

}
)
#' @importClassesFrom rtracklayer BigWigFile
#' @importFrom rtracklayer BigWigFileList
#' @rdname centrePeaks-methods
#' @export
setMethod(
"centrePeaks",
signature = signature(x = "GRanges", y = "BigWigFile"),
function(x, y, ...) {
bwfl <- BigWigFileList(y@resource)
centrePeaks(x, bwfl, ...)
}
)
#' @importClassesFrom rtracklayer BigWigFile
#' @importClassesFrom Rsamtools BamFileList
#' @importFrom rtracklayer BigWigFileList
#' @importFrom Rsamtools BamFileList
#' @rdname centrePeaks-methods
#' @export
setMethod(
"centrePeaks",
signature = signature(x = "GRanges", y = "character"),
function(x, y, ...) {
suff <- tolower(unique(gsub(".+\\.([A-Za-z]+$)", "\\1", y)))
if (length(suff) > 1) stop("paths must be to files of the same type")
f <- NULL
if (suff == "bam") f <- BamFileList(y)
if (suff %in% c("bw", "bigwig")) f <- BigWigFileList(y)
if (is.null(f)) stop("Couldn't detect file type from suffix")
centrePeaks(x, f, ...)
}
)

#' @importClassesFrom IRanges RleViewsList RleList
#' @importFrom S4Vectors splitAsList mcols
#' @importFrom IRanges RleList viewWhichMaxs viewMaxs RleViewsList Views
#' @importFrom matrixStats rowMedians rowMeans2 weightedMean
#' @keywords internal
.centreFromCov <- function(cov, peaks, f, totals = 1, BPPARAM) {
## Split everything by chromosome
sq <- seqinfo(peaks)
grl <- splitAsList(peaks, seqnames(peaks))
grl <- grl[vapply(grl, length, integer(1)) > 0]
n <- length(cov)
totals <- rep_len(totals, n)

message("Finding centres by chromosome...", appendLF = FALSE)
t1 <- Sys.time()
new <- bplapply(
grl,
\(gr) {
## Just work with the coverage from each chromosome
chr <- as.character(unique(seqnames(gr)))
if (!length(chr)) return(GRanges(seqinfo = sq))
rl <- RleList(lapply(cov, \(i) i[[chr]]))
if (f == "weighted.cov") {
scaled <- lapply(seq_along(rl), \(i) rl[[i]] / totals[[i]])
rvl <- RleViewsList(lapply(scaled, Views, start(gr), end(gr)))
## Calculate combined weights by adding scaled coverage
## across positions
w <- lapply(
seq_along(gr),
\(j) rowSums(
do.call(
"cbind", lapply(rvl, \(i) as.numeric(i[[j]]))
)
)
)
## Define positions within each peak
pos <- mapply(
\(i, j) seq(i, length.out = j),
i = start(gr), j = width(gr),
SIMPLIFY = FALSE
)
## Return centres as the mean position weighted by coverage
centres <- mapply(weightedMean, x = pos, w = w)
} else{
rvl <- RleViewsList(lapply(rl, Views, start(gr), end(gr)))
pos <- viewWhichMaxs(rvl)
posm <- do.call("cbind", as.list(pos))
if (f == "mean") centres <- rowMeans2(posm)
if (f == "median") centres <- rowMedians(posm)
}
GRanges(paste0(chr, ":", as.integer(centres)), seqinfo = sq)
}, BPPARAM = BPPARAM
)
t2 <- Sys.time()
message(sprintf("done (%.2fs)", t2 - t1))
new <- unlist(GRangesList(new))
mcols(new) <- mcols(peaks)
new
}


2 changes: 1 addition & 1 deletion _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ reference:
- title: Functions For Differential Signal Analysis
contents:
- dualFilter
- importPeaks
- ends_with("Peaks")
- fitAssayDiff
- getProfileData
- grlToSE
Expand Down
Loading

0 comments on commit 5c97ae7

Please sign in to comment.