Skip to content

Commit

Permalink
Merge pull request #150 from Tim-Yu/devel
Browse files Browse the repository at this point in the history
support for parallel by preloading data
  • Loading branch information
jorainer authored Nov 20, 2023
2 parents 9243d0e + c5f51b0 commit 22c6704
Show file tree
Hide file tree
Showing 18 changed files with 1,297 additions and 70 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/check-bioc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ jobs:
fail-fast: false
matrix:
config:
- { os: ubuntu-latest, r: 'devel', bioc: '3.17', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: macOS-latest, r: 'devel', bioc: '3.17'}
- { os: windows-latest, r: 'devel', bioc: '3.17'}
- { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: macOS-latest, r: 'devel', bioc: '3.19'}
- { os: windows-latest, r: 'devel', bioc: '3.19'}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
Expand Down
9 changes: 6 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@ Authors@R: c(person(given = "Johannes", family = "Rainer",
person(given = "Laurent", family = "Gatto",
email = "[email protected]",
role = "ctb",
comment = c(ORCID = "0000-0002-1520-2268")))
comment = c(ORCID = "0000-0002-1520-2268")),
person(given = "Boyu", family = "Yu",
email = "[email protected]",
role = "ctb"))
Author: Johannes Rainer <[email protected]> with contributions
from Tim Triche, Sebastian Gibb, Laurent Gatto and
Christian Weichenberger.
from Tim Triche, Sebastian Gibb, Laurent Gatto
Christian Weichenberger and Boyu Yu.
Maintainer: Johannes Rainer <[email protected]>
URL: https://github.com/jorainer/ensembldb
BugReports: https://github.com/jorainer/ensembldb/issues
Expand Down
58 changes: 52 additions & 6 deletions R/Methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -925,15 +925,61 @@ setMethod("lengthOf", "EnsDb", function(x, of="gene",
## For EnsDb: calls the .transcriptLengths function.
.transcriptLengths <- function(x, with.cds_len = FALSE, with.utr5_len = FALSE,
with.utr3_len = FALSE,
filter = AnnotationFilterList()) {
filter <- .processFilterParam(filter, x)
allTxs <- transcripts(x, filter = filter)
filter = AnnotationFilterList(),
exons = NA, transcripts = NA) {
## The preloaded data option is currently only for the coordinates mapping
## functions, therefore the filter is "tx_id" only.
preload_ranges_missing <- which(c(
identical(exons,NA),
identical(transcripts,NA)
))
if(identical(integer(0), preload_ranges_missing)){
if (!is(exons, "CompressedGRangesList"))
stop("Argument 'exons' has to be a 'CompressedGRangesList' object")
if (!is(transcripts, "GRanges"))
stop("Argument 'transcripts' has to be an 'GRanges' object")
if (identical(integer(0),grep('T[0-9]', names(exons)[[1]])))
stop("Argument 'exons' has to be by 'tx'.")
## Check if x is a formula and eventually translate it.
if (is(filter, "formula"))
filter <- AnnotationFilter(filter)
tryCatch({
filter_type <- filter@field
filter_tx <- filter@value
}, error = function(e){
message("No filter detected, all transcripts will be returned")
filter_tx <<- names(transcripts)
})
if (exists('filter_type')){
if (filter_type != "tx_id")
stop("Filter must be 'tx_id'.")
}
tryCatch({
allTxs <- transcripts[names(transcripts) %in% unique(filter_tx)]
}, error = function(e){
allTxs <<- GRanges()
})
} else if (length(preload_ranges_missing) == 2){
filter <- .processFilterParam(filter, x)
allTxs <- transcripts(x, filter = filter)
} else {
stop(paste(
"Argument",
c("'exons'", "'transcripts'")[preload_ranges_missing],
'missing.'
, sep = " "
))
}
if (length(allTxs) == 0)
return(data.frame(tx_id = character(), gene_id = character(),
nexon = integer(), tx_len = integer()))
exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id))
## Match ordering
exns <- exns[match(allTxs$tx_id, names(exns))]
if(identical(integer(0), preload_ranges_missing)){
exns <- exons[match(allTxs$tx_id, names(exons))]
} else {
exns <- exonsBy(x, filter = TxIdFilter(allTxs$tx_id))
## Match ordering
exns <- exns[match(allTxs$tx_id, names(exns))]
}
## Calculate length of transcripts.
txLengths <- sum(width(exns))
## Calculate no. of exons.
Expand Down
103 changes: 89 additions & 14 deletions R/genomeToX.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
#' @param x `GRanges` object with the genomic coordinates that should be
#' mapped.
#'
#' @param db `EnsDb` object.
#' @param db `EnsDb` object or pre-loaded exons 'CompressedGRangesList' object
#' using exonsBy().
#'
#' @return
#'
Expand Down Expand Up @@ -75,11 +76,48 @@
#' ## The result of the mapping is an IRangesList each element providing the
#' ## within-transcript coordinates for each input region
#' genomeToTranscript(gnm, edbx)
#'
#' ## If you are tring to calculate within-transcript coordinates of a huge
#' ## list of genomic region, you shall use pre-loaded exons GRangesList to
#' ## replace the SQLite db edbx
#'
#' ## Below is just a lazy demo of querying multiple genomic region
#' library(parallel)
#'
#' gnm <- rep(GRanges("X:107715899-107715901"),10)
#'
#' exons <- exonsBy(EnsDb.Hsapiens.v86)
#'
#' ## You can pre-define the exons region to further accelerate the code.
#'
#' exons <- exonsBy(
#' EnsDb.Hsapiens.v86, by = "tx",
#' filter = AnnotationFilterList(
#' SeqNameFilter(as.character(unique(seqnames(gnm)))),
#' GeneStartFilter(max(end(gnm)), condition = "<="),
#' GeneEndFilter(min(start(gnm)), condition = ">=")
#' )
#' )
#'
#' ## only run in Linux ##
#' # res_temp <- mclapply(1:10, function(ind){
#' # genomeToTranscript(gnm[ind], exons)
#' # }, mc.preschedule = TRUE, mc.cores = detectCores() - 1)
#'
#' # res <- do.call(c,res_temp)
#'
#' cl <- makeCluster(detectCores() - 1)
#' clusterExport(cl,c('genomeToTranscript','gnm','exons'))
#'
#' res <- parLapply(cl,1:10,function(ind){
#' genomeToTranscript(gnm[ind], exons)
#' })
#' stopCluster(cl)
genomeToTranscript <- function(x, db) {
if (missing(x) || !is(x, "GRanges"))
stop("Argument 'x' is required and has to be a 'GRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
if (missing(db) || !(is(db, "EnsDb") || is(db,"CompressedGRangesList")))
stop("Argument 'db' is required and has to be an 'EnsDb' object or a 'CompressedGRangesList' object") # load the exons priorly
res <- .genome_to_tx(x, db)
if (is(res, "IRanges"))
res <- IRangesList(res)
Expand Down Expand Up @@ -115,6 +153,8 @@ genomeToTranscript <- function(x, db) {
#' within-protein coordinates.
#'
#' @param db `EnsDb` object.
#'
#' @inheritParams transcriptToProtein
#'
#' @return
#'
Expand Down Expand Up @@ -174,17 +214,44 @@ genomeToTranscript <- function(x, db) {
#'
#' ## Mapping of intronic positions fail
#' res[[4]]
genomeToProtein <- function(x, db) {
#'
#' ## Meanwhile, this function can be called in parallel processes if you preload
#' ## the protein, exons and transcripts database.
#'
#' proteins <- proteins(edbx)
#' exons <- exonsBy(edbx)
#' transcripts <- transcripts(edbx)
#'
#' genomeToProtein(gnm, edbx, proteins = proteins, exons = exons, transcripts = transcripts)
genomeToProtein <- function(x, db, proteins = NA, exons = NA, transcripts = NA) {
if (missing(x) || !is(x, "GRanges"))
stop("Argument 'x' is required and has to be a 'GRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
txs <- genomeToTranscript(x, db)
preload_ranges_missing <- which(c(
identical(proteins,NA),
identical(exons,NA),
identical(transcripts,NA)
))
if(identical(integer(0), preload_ranges_missing))
txs <- genomeToTranscript(x, exons)
else if (length(preload_ranges_missing) == 3) {
txs <- genomeToTranscript(x, db)
} else {
stop(paste(
"Argument",
c("'proteins'", "'exons'", "'transcripts'")[preload_ranges_missing],
'missing.'
, sep = " "
))
}
int_ids <- rep(1:length(txs), lengths(txs))
txs <- unlist(txs, use.names = FALSE)
if (is.null(names(txs)))
names(txs) <- ""
prts <- transcriptToProtein(txs, db)
if (identical(integer(0), preload_ranges_missing))
prts <- transcriptToProtein(txs, db, proteins = proteins, exons = exons, transcripts = transcripts)
else prts <- transcriptToProtein(txs, db)
mcols(prts) <- cbind(mcols(prts)[, c("tx_id", "cds_ok")],
mcols(txs)[, colnames(mcols(txs)) != "tx_id"])
prts <- split(prts, int_ids)
Expand Down Expand Up @@ -215,8 +282,10 @@ genomeToProtein <- function(x, db) {
#'
#' @noRd
.genome_to_tx_ranges <- function(genome, exns) {
genome_gr <- as(genome, "GRangesList") # preserve the metadata
genome_gr@unlistData@elementMetadata <- genome@elementMetadata
IRangesList(unlist(
lapply(as(genome, "GRangesList"), FUN = function(rgn, exns) {
lapply(genome_gr, FUN = function(rgn, exns) {
if (length(exns)) {
ints <- pintersect(exns, rgn, drop.nohit.ranges = TRUE,
ignore.strand = FALSE)
Expand Down Expand Up @@ -248,6 +317,7 @@ genomeToProtein <- function(x, db) {
seq_name = as.character(seqnames(rgn)),
seq_strand = as.character(strand(rgn))
)
if(!is.null(mcols(rgn))) dfrm <- DataFrame(dfrm, mcols(rgn))
mcols(irng) <- dfrm
irng
}, MoreArgs = list(rgn = rgn), USE.NAMES = FALSE)
Expand All @@ -261,6 +331,7 @@ genomeToProtein <- function(x, db) {
seq_start = start(rgn), seq_end = end(rgn),
seq_name = as.character(seqnames(rgn)),
seq_strand = as.character(strand(rgn)))
if(!is.null(mcols(rgn))) metad <- DataFrame(metad, mcols(rgn))
empty_rng <- IRanges(start = -1, width = 1)
mcols(empty_rng) <- metad
empty_rng
Expand All @@ -275,7 +346,7 @@ genomeToProtein <- function(x, db) {
#' checks if the region is completely included in an exons and returns
#' the position within the transcript for that exon.
#'
#' @param db `EnsDb`.
#' @param db `EnsDb` or 'CompressedGRangesList'
#'
#' @param genome `GRanges`
#'
Expand Down Expand Up @@ -315,12 +386,16 @@ genomeToProtein <- function(x, db) {
#' ## Example with two genes, on two strands!
.genome_to_tx <- function(genome, db) {
## Get exonsBy for all input ranges.
exns <- exonsBy(db, by = "tx",
filter = AnnotationFilterList(
SeqNameFilter(as.character(unique(seqnames(genome)))),
GeneStartFilter(max(end(genome)), condition = "<="),
GeneEndFilter(min(start(genome)), condition = ">=")
))
if(is(db, "EnsDb")){
exns <- exonsBy(
db, by = "tx",
filter = AnnotationFilterList(
SeqNameFilter(as.character(unique(seqnames(genome)))),
GeneStartFilter(max(end(genome)), condition = "<="),
GeneEndFilter(min(start(genome)), condition = ">=")
)
)
} else exns <- db
res <- .genome_to_tx_ranges(genome, exns)
if (!is.null(names(genome)))
names(res) <- names(genome)
Expand Down
Loading

0 comments on commit 22c6704

Please sign in to comment.