Skip to content

Commit

Permalink
refactor: improve input argument checking in proteinToGenome
Browse files Browse the repository at this point in the history
- Improve `proteinToGenome()` documentation.
- Check input parameters of `proteinToGenome()` to ensure protein identifiers
  were provided (issue #157).
  • Loading branch information
jorainer committed Aug 21, 2024
1 parent b5cbbd5 commit 399de63
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 41 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/check-bioc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ jobs:
fail-fast: false
matrix:
config:
- { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: macOS-latest, r: 'devel', bioc: '3.19'}
- { os: windows-latest, r: 'devel', bioc: '3.19'}
- { os: ubuntu-latest, r: '4.4', bioc: '3.19', cont: "bioconductor/bioconductor_docker:RELEASE_3_19", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: macOS-latest, r: '4.4', bioc: '3.19'}
- { os: windows-latest, r: '4.4', bioc: '3.19'}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: ensembldb
Type: Package
Title: Utilities to create and use Ensembl-based annotation databases
Version: 2.28.0
Version: 2.28.1
Authors@R: c(person(given = "Johannes", family = "Rainer",
email = "[email protected]",
role = c("aut", "cre"),
Expand Down Expand Up @@ -97,4 +97,4 @@ Collate:
'zzz.R'
biocViews: Genetics, AnnotationData, Sequencing, Coverage
License: LGPL
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
78 changes: 49 additions & 29 deletions R/proteinToX.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,19 @@
#' @name proteinToTranscript
#'
#' @aliases proteinToTranscript proteinToTranscript,EnsDb-method
#'
#'
#' @description
#'
#' `proteinToTranscript` maps protein-relative coordinates to positions within
#' the encoding transcript. Note that the returned positions are relative to
#' the complete transcript length, which includes the 5' UTR.
#'
#' The regions within the protein sequence need to be provided as a named
#' `IRanges` object with the names being protein identifiers and the start and
#' end coordinates (within these proteins) defined by the `IRanges` object. As
#' an alternative to the `IRanges`' names, protein identifiers can also be
#' provided through a metadata column (see details below).
#'
#' Similar to the [proteinToGenome()] function, `proteinToTranscript` compares
#' for each protein whether the length of its sequence matches the length of
#' the encoding CDS and throws a warning if that is not the case. Incomplete
Expand All @@ -38,9 +44,9 @@
#' use these as input for the mapping function.
#'
#' @inheritParams proteinToGenome
#'
#'
#' @param ... Further arguments to be passed on.
#'
#'
#' @return
#'
#' `IRangesList`, each element being the mapping results for one of the input
Expand Down Expand Up @@ -158,7 +164,7 @@ setMethod("proteinToTranscript", "EnsDb",
else mc$protein_id <- names(prt)
ir <- IRanges(start = -1, width = 1)
mcols(ir) <- mc
if(!is.null(mcols(prt)))
if(!is.null(mcols(prt)))
mcols(ir) <- cbind(mcols(ir),mcols(prt))
ir
} else {
Expand All @@ -178,7 +184,7 @@ setMethod("proteinToTranscript", "EnsDb",
if (idType == "uniprot_id")
mc$uniprot_id <- names(prt)
mcols(res) <- mc
if(!is.null(mcols(prt)))
if(!is.null(mcols(prt)))
mcols(res) <- cbind(mcols(res),mcols(prt))
res
}
Expand All @@ -190,8 +196,8 @@ setMethod("proteinToTranscript", "EnsDb",
#' @aliases proteinToTranscript proteinToTranscript,Preloaded-method
#'
#' @inheritParams proteinToGenome
#'
#' @param fiveUTR A `CompressedGRangesList` object generated by
#'
#' @param fiveUTR A `CompressedGRangesList` object generated by
#' [fiveUTRsByTranscript()].
#'
#' @family coordinate mapping functions
Expand All @@ -204,13 +210,13 @@ setMethod("proteinToTranscript", "EnsDb",
#'
#' ## Meanwhile, this function can be called in parallel processes if you preload
#' ## the CDS data with desired data columns and fiveUTR data
#'
#'
#' cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','uniprot_id','protein_sequence'))
#' # cds <- cdsBy(edbx,columns = c(listColumns(edbx,'tx'),'protein_id','protein_sequence'))
#' # cds <- cdsBy(edbx,columns = c('tx_id','protein_id','protein_sequence'))
#'
#' fiveUTR <- fiveUTRsByTranscript(edbx)
#'
#'
#' ## Define an IRange with protein-relative coordinates within a protein for
#' ## the gene SYP
#' syp <- IRanges(start = 4, end = 17)
Expand Down Expand Up @@ -278,7 +284,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList",
else mc$protein_id <- names(prt)
ir <- IRanges(start = -1, width = 1)
mcols(ir) <- mc
if(!is.null(mcols(prt)))
if(!is.null(mcols(prt)))
mcols(ir) <- cbind(mcols(ir),mcols(prt))
ir
} else {
Expand All @@ -298,7 +304,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList",
if (idType == "uniprot_id")
mc$uniprot_id <- names(prt)
mcols(res) <- mc
if(!is.null(mcols(prt)))
if(!is.null(mcols(prt)))
mcols(res) <- cbind(mcols(res),mcols(prt))
res
}
Expand All @@ -319,9 +325,16 @@ setMethod("proteinToTranscript", "CompressedGRangesList",
#' (and eventually Uniprot to Ensembl protein identifier mappings) from the
#' submitted `EnsDb` object (and thus based on annotations from Ensembl).
#'
#' Not all coding regions for protein coding transcripts are complete, and the
#' function thus checks also if the length of the coding region matches the
#' length of the protein sequence and throws a warning if that is not the case.
#' The regions within the protein sequence need to be provided as a named
#' `IRanges` object with the names being protein identifiers and the start and
#' end coordinates (within these proteins) defined by the `IRanges` object. As
#' an alternative to the `IRanges`' names, protein identifiers can also be
#' provided through a metadata column (see details below).
#'
#' Note that not all coding regions for protein coding transcripts are
#' complete, and the function thus checks also if the length of the coding
#' region matches the length of the protein sequence and throws a warning if
#' that is not the case.
#'
#' The genomic coordinates for the within-protein coordinates, the Ensembl
#' protein ID, the ID of the encoding transcript and the within protein start
Expand Down Expand Up @@ -364,7 +377,7 @@ setMethod("proteinToTranscript", "CompressedGRangesList",
#' encoding transcripts.
#'
#' @param id `character(1)` specifying where the protein identifier can be
#' found. Has to be either `"name"` or one of `colnames(mcols(prng))`.
#' found. Has to be either `"name"` or one of `colnames(mcols(x))`.
#'
#' @param idType `character(1)` defining what type of IDs are provided. Has to
#' be one of `"protein_id"` (default), `"uniprot_id"` or `"tx_id"`.
Expand Down Expand Up @@ -489,7 +502,7 @@ setMethod("proteinToGenome", "EnsDb",
mcols(maps) <- cbind(mcols(maps),
protein_start = start(prt),
protein_end = end(prt))
if(!is.null(mcols(prt)))
if(!is.null(mcols(prt)))
mcols(maps) <- cbind(mcols(maps),mcols(prt))
maps[order(maps$exon_rank)]
}
Expand All @@ -509,7 +522,7 @@ setMethod("proteinToGenome", "EnsDb",
#'
#' @param db For the method for `EnsDb` objects: An `EnsDb` object to be used to
#' retrieve genomic coordinates of encoding transcripts.<br><br>
#' For the method for `CompressedGRangesList` objects: A `CompressedGRangesList` object
#' For the method for `CompressedGRangesList` objects: A `CompressedGRangesList` object
#' generated by [cdsBy()] where by = 'tx' and columns = c('tx_id'
#' ,'protein_id','uniprot_id','protein_sequence').
#'
Expand Down Expand Up @@ -582,7 +595,7 @@ setMethod("proteinToGenome", "CompressedGRangesList",
mcols(maps) <- cbind(mcols(maps),
protein_start = start(prt),
protein_end = end(prt))
if(!is.null(mcols(prt)))
if(!is.null(mcols(prt)))
mcols(maps) <- cbind(mcols(maps),mcols(prt))
maps[order(maps$exon_rank)]
}
Expand Down Expand Up @@ -719,17 +732,17 @@ setMethod("proteinToGenome", "CompressedGRangesList",
cds
}

#' @description Fetch the CDS for all transcripts encoding the specified protein using
#' @description Fetch the CDS for all transcripts encoding the specified protein using
#' preloaded data.
#'
#' @note
#'
#' Use one query to fetch CDS for all (unique) input IDs using preloaded data. If input
#' Use one query to fetch CDS for all (unique) input IDs using preloaded data. If input
#' IDs are Uniprot identifiers we have to perform additional checks and data
#' re-organizations because one transcript (and thus CDS) can be associated
#' with multiple Uniprot identifiers.
#'
#' @param x `CompressedGRangesList` object generated by [cdsBy()] where
#'
#' @param x `CompressedGRangesList` object generated by [cdsBy()] where
#' by = 'tx' and columns = c(listColumns(edb,'tx'),'protein_id','uniprot_id','protein_sequence').
#'
#' @param id `character` with the protein ID(s).
Expand All @@ -749,16 +762,16 @@ setMethod("proteinToGenome", "CompressedGRangesList",
setMethod(".cds_for_id2", "CompressedGRangesList",
function(x, id, idType = "protein_id") {
## No need for get the transcripts first, just use the metadata of CDS
## if you find this modification well, you may want to change the
## if you find this modification well, you may want to change the
## generic .cds_for_id2 function above
if (!all(c("tx_id", "protein_id",idType,"protein_sequence") %in% colnames(mcols(x[[1]]))))
stop(
paste(
c(
"CDS must have",
unique(c("'tx_id'","'protein_sequence'", "'protein_id'",paste("'",idType,"'",sep=''))),
"CDS must have",
unique(c("'tx_id'","'protein_sequence'", "'protein_id'",paste("'",idType,"'",sep=''))),
"in the metadata."
),
),
collapse = ' ')
)
map <- x@unlistData[as.data.frame(x@partitioning)[,1]]@elementMetadata[,unique(c('tx_id', 'protein_id', idType))]
Expand Down Expand Up @@ -820,10 +833,17 @@ setMethod(".cds_for_id2", "CompressedGRangesList",
#' @noRd
.cds_for_id_range <- function(x, range, id = "name", idType = "protein_id") {
idType <- match.arg(idType, c("protein_id", "uniprot_id", "tx_id"))
id <- match.arg(id, c("name", colnames(mcols(range))))
if (!(id %in% c("name", colnames(mcols(range)))))
stop("'id' should be either \"name\" (if identifiers are provided as ",
"the IRanges' names) or one of the column names of the IRanges ",
"metadata (mcols()).", call. = FALSE)
if (id == "name")
ids <- names(range)
else ids <- mcols(range)[, id]
if (length(ids) != length(range) | all(is.na(ids)))
stop("No protein/transcript identifiers found. Please provide the ",
"IDs through the IRanges' names or one of its metadata columns.",
call. = FALSE)
cds <- .cds_for_id2(x, ids, idType = idType)
## check returned CDS if their width matches the provided protein ranges
mapply(cds, end(range) * 3,
Expand Down Expand Up @@ -851,7 +871,7 @@ setMethod(".cds_for_id2", "CompressedGRangesList",
#' @param x `EnsDb` object.
#'
#' @param cds `GRangesList`
#'
#'
#' @param ... Further arguments to be passed on..
#'
#' @return `list` of `GRangesList`s with one list element per input protein,
Expand Down Expand Up @@ -935,7 +955,7 @@ setMethod(".cds_matching_protein", "EnsDb",
#' @author Johannes Rainer, Boyu Yu
#'
#' @md
#'
#'
#' @noRd
setMethod(".cds_matching_protein", "list",
function(x) {
Expand Down
6 changes: 6 additions & 0 deletions inst/NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
Changes in version 2.28.1

- Improve documentation of `proteinToGenome()` better describing the input
parameters. Check input of `proteinToGenome()` to throw an error if no
identifiers are provided.

Changes in version 2.27.1

- Functions for coordinate mapping gain support for pre-loaded genomic ranges
Expand Down
15 changes: 11 additions & 4 deletions man/proteinToGenome.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion man/proteinToTranscript.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 13 additions & 2 deletions tests/testthat/test_proteinToGenome.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
test_that("proteinToGenome fails on IRanges without IDs", {
rng <- IRanges(start = c(123, 322), end = c(234, 444))
expect_error(proteinToGenome(rng, edb), "No protein/transcript")
names(rng) <- c(NA_character_, NA_character_)
expect_error(proteinToGenome(rng, edb), "No protein/transcript")
expect_error(proteinToGenome(rng, edb, id = "ann"), "should be either")
names(rng) <- NULL
mcols(rng)$ann <- c(NA_character_, NA_character_)
expect_error(proteinToGenome(rng, edb), "No protein/transcript")
})

test_that(".proteinCoordsToTx works", {
prts <- IRanges(start = c(1, 2), end = c(1, 2))
res <- .proteinCoordsToTx(prts)
Expand Down Expand Up @@ -277,7 +288,7 @@ test_that("proteinToTranscript works", {

## Preloaded data test
expect_warning(tx_rel_preload <- proteinToTranscript(prngs, cds,
idType = "uniprot_id",
idType = "uniprot_id",
fiveUTR = fiveUTR))
expect_equal(tx_rel_preload,tx_rel)

Expand All @@ -291,7 +302,7 @@ test_that("proteinToTranscript works", {
expect_equal(unlist(unname(start(res))), c(-1, -1))

## Preloaded data test
expect_warning(res_preload <- proteinToTranscript(prngs, cds,
expect_warning(res_preload <- proteinToTranscript(prngs, cds,
fiveUTR = fiveUTR))
expect_equal(res_preload,res)
})
Expand Down

0 comments on commit 399de63

Please sign in to comment.