chr1swallace
diff --git a/Diff for: ‎.travis.yml
-8 b/Diff for: ‎.travis.yml
-8
diff --git a/Diff for: ‎DESCRIPTION
+5-6 b/Diff for: ‎DESCRIPTION
+5-6
diff --git a/Diff for: ‎NEWS.md
+6-3 b/Diff for: ‎NEWS.md
+6-3
diff --git a/Diff for: ‎R/check.R
+114-106 b/Diff for: ‎R/check.R
+114-106
@@ -14,11 +14,10 @@ Suggests:
     testthat,
     mvtnorm,
     magrittr,
-    rmarkdown,
-    snpStats
+    rmarkdown
 Title: Colocalisation Tests of Two Genetic Traits
-Version: 5.2.1
-Date: 2022-06-24
+Version: 5.2.2
+Date: 2023-05-12
 Authors@R: c(person("Chris", "Wallace", role=c("aut","cre"),
     email = "[email protected]"),
     person("Claudia","Giambartolomei", role="aut",
@@ -29,12 +28,12 @@ Maintainer: Chris Wallace <[email protected]>
 Description: Performs the colocalisation tests described in
     Giambartolomei et al (2013) <doi:10.1371/journal.pgen.1004383>,
     Wallace (2020) <doi:10.1371/journal.pgen.1008720>,
-    Wallace (2021) <doi:10.1101/2021.02.23.432421>.
+    Wallace (2021) <doi:10.1371/journal.pgen.1009440>.
 License: GPL
 LazyLoad: yes
 VignetteBuilder: knitr
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
 Encoding: UTF-8
 URL: https://github.com/chr1swallace/coloc
 BugReports: https://github.com/chr1swallace/coloc/issues
 
@@ -1,3 +1,6 @@
+# coloc 5.2.2
+* better checking of dataset inputs, and better documentation of what that data should look like
+
 # coloc 5.2.0
 * update to new susieR >= 0.12.06 which requires sample size
 * general tidy up of susie-related code
@@ -70,17 +73,17 @@
 # coloc 2.3-2 
 - tidied code relating mainly to proportional
 - colocalisation testing methods, making more methods conform to S4.
-- pcs.prepare now imputates missing genotypes by default
+- pcs.prepare now imputes missing genotypes by default
 2013-05-22 
 
 # coloc 2.3 
 ## coloc BUGFIX
-- Introduced a function to estimate trait variance from supplied coefficients and standard errors.  This is used within the approach implemented in coloc.abf(), and replaces the earlier version which implicity assumed that var(Y)=1 for quantitative traits, which could lead to incorrect inference when var(Y) was far from 1.
+- Introduced a function to estimate trait variance from supplied coefficients and standard errors.  This is used within the approach implemented in coloc.abf(), and replaces the earlier version which implicitly assumed that var(Y)=1 for quantitative traits, which could lead to incorrect inference when var(Y) was far from 1.
 
 2013-09-25  
 
 # coloc 2.2
-- Merged coloc.abf and coloc.abf.imputed(), so that datasets for wheich beta, var(beta) are available can be matched to datasets with only p values and maf.2 This means the arguments to coloc.abf() have been changed!  Please
+- Merged coloc.abf and coloc.abf.imputed(), so that datasets for which beta, var(beta) are available can be matched to datasets with only p values and maf. This means the arguments to coloc.abf() have been changed!  Please
 check ?coloc.abf for the new function.
 
 2013-19-06  
 
@@ -1,51 +1,53 @@
 ##' Check coloc dataset inputs for errors
 ##'
+##' A coloc dataset is a list, containing a mixture of vectors
+##' capturing quantities that vary between snps (these vectors must
+##' all have equal length) and scalars capturing quantities that
+##' describe the dataset.
+##'
 ##' Coloc is flexible, requiring perhaps only p values, or z scores, or effect
 ##' estimates and standard errors, but with this flexibility also come
 ##' difficulties describing exactly the combinations of items required.
-
-##' \describe{
 ##'
+##' Required vectors are some subset of
+##' 
+##' \describe{
+##'   \item{beta}{regression coefficient for each SNP from dataset 1}
+##'   \item{varbeta}{variance of beta}
 ##'   \item{pvalues}{P-values for each SNP in dataset 1}
-##'
-##'   \item{N}{Number of samples in dataset 1}
-##'
 ##'   \item{MAF}{minor allele frequency of the variants}
+##'   \item{snp}{a character vector of snp ids, optional. It will be used to merge dataset1 and dataset2 and will be retained in the results.}
+##' }
 ##'
-##' \item{beta}{regression coefficient for each SNP from dataset 1}
-##'
-##' \item{varbeta}{variance of beta}
-##'
-##' \item{type}{the type of data in dataset 1 - either "quant" or "cc" to denote quantitative or case-control}
-##'
-##' \item{s}{for a case control dataset, the proportion of samples in dataset 1 that are cases}
-##'
-##'  \item{sdY}{for a quantitative trait, the population standard deviation of the trait.  if not given, it can be estimated from the vectors of varbeta and MAF}
-##'
-##' \item{snp}{a character vector of snp ids, optional. If present, it will be used to merge dataset1 and dataset2.  Otherwise, the function assumes dataset1 and dataset2 contain results for the same SNPs in the same order.}
+##' Preferably, give \code{beta} and \code{varbeta}.  But if these are not available, sufficient statistics can be approximated from \code{pvalues} and \code{MAF}.
+##' 
+##' Required scalars are some subset of
 ##'
+##' \describe{
+##'   \item{N}{Number of samples in dataset 1}
+##'   \item{type}{the type of data in dataset 1 - either "quant" or "cc" to denote quantitative or case-control}
+##'   \item{s}{for a case control dataset, the proportion of samples in dataset 1 that are cases}
+##'   \item{sdY}{for a quantitative trait, the population standard deviation of the trait.  If not given, it can be estimated from the vectors of varbeta and MAF}
 ##' }
 ##'
-##' Some of these items may be missing, but you must always give {\code{type}}.
-##'
-##' Then scalars describing the samples used:
+##' You must always give {\code{type}}.  Then,
 ##' \describe{
-##' \item{always needed}{\code{N}}
 ##' \item{if \code{type}=="cc"}{\code{s}}
 ##' \item{if \code{type}=="quant" and \code{sdY} known}{\code{sdY}}
+##' \item{if beta, varbeta not known}{\code{N}}
 ##' }
 ##' If \code{sdY} is unknown, it will be approximated, and this will require
 ##' \describe{
 ##' \item{summary data to estimate \code{sdY}}{\code{beta}, \code{varbeta}, \code{N}, \code{MAF}}
 ##' }
 ##'
-##' Then, if not already covered above, the summary statistics describing the results
+##' Optional vectors are
+##'
 ##' \describe{
-##' \item{preferably}{\code{beta}, \code{varbeta}}
-##' \item{alternatively}{\code{pvalues}, \code{MAF}}
+##'   \item{position}{a vector of snp positions, required for \code{plot_dataset}}
 ##' }
 ##'
-##' \code{check_dataset} call stop() unless a series of expectations on dataset
+##' \code{check_dataset} calls stop() unless a series of expectations on dataset
 ##' input format are met
 ##'
 ##' This is a helper function for use by other coloc functions, but
@@ -60,90 +62,96 @@
 ##' @return NULL if no errors found
 ##' @export
 ##' @author Chris Wallace
-check_dataset <- function(d,suffix="",req=c("snp"),warn.minp=1e-6) {
-  if(!is.list(d) )
-    stop("dataset ",suffix,": is not a list")
-  nd <- names(d)
-
-  ## no missing values - make people clean their own data rather than make assumptions here for datasets I don't know
-  ## req <- unique(c("snp",req)) # always need snp to match now
-  n <- 0
-  if(length(setdiff(req,nd)))
-    stop("dataset ",suffix,": missing required element(s) ",paste(setdiff(req,nd),collapse=", "))
-
-  for(v in nd) {
-    if(any(is.na(d[[v]])))
-      stop("dataset ",suffix,": ",v," contains missing values")
-  }
-
-  ## snps should be unique
-  if("snp" %in% nd && any(duplicated(d$snp)))
-    stop("dataset ",suffix,": duplicated snps found")
-  if("snp" %in% nd && is.factor(d$snp))
-    stop("dataset ",suffix,": snp should be a character vector but is a factor")
-
-  ## MAF should be > 0, < 1
-  if("MAF" %in% nd && (!is.numeric(d$MAF) || any(is.na(d$MAF)) ||
-                       any(d$MAF<=0) || any(d$MAF>=1)))
-    stop("dataset ",suffix,": MAF should be a numeric, strictly >0 & <1")
-
-  ## lengths of these should match
-  l <- -1 # impossible length
-  shouldmatch <- c("pvalues","MAF","beta","varbeta","snp","position")
-  for(v in shouldmatch)
-    if(v %in% nd)
-      if(l<0) { ## update
-        l <- length(d[[v]])
-      } else { ## check
-        if(length(d[[v]])!=l) {
-          stop("dataset ",suffix,": lengths of inputs don't match: ")
-          print(intersect(nd, shouldmatch))
+check_dataset <- function(d,
+                          suffix="",
+                          req=c("type","snp"),
+                          warn.minp=1e-6) {
+    if(!is.list(d) )
+        stop("dataset ",suffix,": is not a list")
+    recognised_items=c("beta","varbeta","pvalues","MAF","snp","position","N","type","s","sdY","LD")
+    nd <- intersect(names(d), recognised_items)
+
+    ## no missing values - make people clean their own data rather
+    ## than make assumptions here for datasets I don't know
+    n <- 0
+    if(length(setdiff(req,nd)))
+        stop("dataset ",suffix,": missing required element(s) ",paste(setdiff(req,nd),collapse=", "))
+
+    for(v in nd) {
+        if(any(is.na(d[[v]])))
+            stop("dataset ",suffix,": ",v," contains missing values")
+    }
+    
+    if (!(d$type %in% c("quant", "cc"))) 
+        stop("dataset ", suffix, ": ", "type must be quant or cc")
+    
+    ## snps should be unique
+    if("snp" %in% nd && any(duplicated(d$snp)))
+        stop("dataset ",suffix,": duplicated snps found")
+    if("snp" %in% nd && is.factor(d$snp))
+        stop("dataset ",suffix,": snp should be a character vector but is a factor")
+
+    ## MAF should be > 0, < 1
+    if("MAF" %in% nd && (!is.numeric(d$MAF) || any(is.na(d$MAF)) ||
+                         any(d$MAF<=0) || any(d$MAF>=1)))
+        stop("dataset ",suffix,": MAF should be a numeric, strictly >0 & <1")
+
+    ## lengths of vector arguments should match
+    l <- -1 # impossible length
+    shouldmatch <- intersect(nd, c("pvalues","MAF","beta","varbeta","snp","position"))
+    for(v in shouldmatch)
+        if(l<0) { ## update
+            l <- length(d[[v]])
+        } else { ## check
+            if(length(d[[v]])!=l) {
+                stop("dataset ",suffix,": lengths of inputs don't match: ")
+                print(shouldmatch)
+            }
         }
-      }
-
-  ## sample size
-  if (("N" %in% req) && (!('N' %in% nd) || is.null(d$N) || is.na(d$N)))
-    stop("dataset ",suffix,": sample size N not set")
-
-  ## type of data
-  if (! ('type' %in% nd))
-    stop("dataset ",suffix,": variable type not set")
-  if(!(d$type %in% c("quant","cc")))
-    stop("dataset ",suffix,": ","type must be quant or cc")
-
-  ## no beta/varbeta
-  if(("s" %in% nd) && (!is.numeric(d$s) || d$s<=0 || d$s>=1))
-    stop("dataset ",suffix,": ","s must be between 0 and 1")
-  if(!("beta" %in% nd) || !("varbeta" %in% nd)) { # need to estimate var (Y)
-    if(!("pvalues" %in% nd) || !( "MAF" %in% nd))
-      stop("dataset ",suffix,": ","require p values and MAF if beta, varbeta are unavailable")
-    if(d$type=="cc" && !("s" %in% nd))
-      stop("dataset ",suffix,": ","require, s, proportion of samples who are cases, if beta, varbeta are unavailable")
-    p=d$pvalues
-  } else {
-    p=pnorm( -abs( d$beta/sqrt(d$varbeta) ) ) * 2
-  }
-
-  ## minp
-  if(min(p) > warn.minp)
-    warning("minimum p value is: ",format.pval(min(p)),"\nIf this is what you expected, this is not a problem.\nIf this is not as small as you expected, please check the 02_data vignette.")
-
-  ## sdY
-  if(d$type=="quant" && !("sdY" %in% nd))
-    if(!("MAF" %in% nd && "N" %in% nd ))
-      stop("dataset ",suffix,": ","must give sdY for type quant, or, if sdY unknown, MAF and N so it can be estimated")
-
-  if("LD" %in% nd) {
-    if(nrow(d$LD)!=ncol(d$LD))
-      stop("LD not square")
-    if(!identical(colnames(d$LD),rownames(d$LD)))
-      stop("LD rownames != colnames")
-    if(length(setdiff(d$snp,colnames(d$LD))))
-      stop("colnames in LD do not contain all SNPs")
-  }
 
-  ## if we reach here, no badness detected
-  NULL
+    ## type of data
+    if (! ('type' %in% nd))
+        stop("dataset ",suffix,": variable type not set")
+    if(!(d$type %in% c("quant","cc")))
+        stop("dataset ",suffix,": ","type must be quant or cc")
+
+    ## no beta/varbeta
+    if(("s" %in% nd) && (!is.numeric(d$s) || d$s<=0 || d$s>=1))
+        stop("dataset ",suffix,": ","s must be between 0 and 1")
+    if(!("beta" %in% nd) || !("varbeta" %in% nd)) { # need to estimate var (Y)
+        if(!("pvalues" %in% nd) || !( "MAF" %in% nd))
+            stop("dataset ",suffix,": ","require p values and MAF if beta, varbeta are unavailable")
+        if(any(d$pvalues<=0))
+            stop("pvalues should not be negative or exactly 0")
+        if(d$type=="cc" && !("s" %in% nd))
+            stop("dataset ",suffix,": ","require, s, proportion of samples who are cases, if beta, varbeta are unavailable")
+        if (!('N' %in% nd) || is.null(d$N) || any(d$N<=0) )
+            stop("dataset ",suffix,": sample size N <=0 or not set")
+        p=d$pvalues
+    } else {
+        p=pnorm( -abs( d$beta/sqrt(d$varbeta) ) ) * 2
+    }
+
+    ## minp
+    if(min(p) > warn.minp)
+        warning("minimum p value is: ",format.pval(min(p)),"\nIf this is what you expected, this is not a problem.\nIf this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.")
+
+    ## sdY
+    if(d$type=="quant" && !("sdY" %in% nd))
+        if(!("MAF" %in% nd && "N" %in% nd ))
+            stop("dataset ",suffix,": ","must give sdY for type quant, or, if sdY unknown, MAF and N so it can be estimated")
+
+    if("LD" %in% nd) {
+        if(nrow(d$LD)!=ncol(d$LD))
+            stop("LD not square")
+        if(!identical(colnames(d$LD),rownames(d$LD)))
+            stop("LD rownames != colnames")
+        if(length(setdiff(d$snp,colnames(d$LD))))
+            stop("colnames in LD do not contain all SNPs")
+    }
+
+    ## if we reach here, no badness detected
+    NULL
 }
 
 #'@rdname check_dataset