|
1 | 1 | ##' Check coloc dataset inputs for errors
|
2 | 2 | ##'
|
| 3 | +##' A coloc dataset is a list, containing a mixture of vectors |
| 4 | +##' capturing quantities that vary between snps (these vectors must |
| 5 | +##' all have equal length) and scalars capturing quantities that |
| 6 | +##' describe the dataset. |
| 7 | +##' |
3 | 8 | ##' Coloc is flexible, requiring perhaps only p values, or z scores, or effect
|
4 | 9 | ##' estimates and standard errors, but with this flexibility, also comes
|
5 | 10 | ##' difficulties describing exactly the combinations of items required.
|
6 |
| - |
7 |
| -##' \describe{ |
8 | 11 | ##'
|
| 12 | +##' Required vectors are some subset of |
| 13 | +##' |
| 14 | +##' \describe{ |
| 15 | +##' \item{beta}{regression coefficient for each SNP from dataset 1} |
| 16 | +##' \item{varbeta}{variance of beta} |
9 | 17 | ##' \item{pvalues}{P-values for each SNP in dataset 1}
|
10 |
| -##' |
11 |
| -##' \item{N}{Number of samples in dataset 1} |
12 |
| -##' |
13 | 18 | ##' \item{MAF}{minor allele frequency of the variants}
|
| 19 | +##' \item{snp}{a character vector of snp ids, optional. It will be used to merge dataset1 and dataset2 and will be retained in the results.} |
| 20 | +##' } |
14 | 21 | ##'
|
15 |
| -##' \item{beta}{regression coefficient for each SNP from dataset 1} |
16 |
| -##' |
17 |
| -##' \item{varbeta}{variance of beta} |
18 |
| -##' |
19 |
| -##' \item{type}{the type of data in dataset 1 - either "quant" or "cc" to denote quantitative or case-control} |
20 |
| -##' |
21 |
| -##' \item{s}{for a case control dataset, the proportion of samples in dataset 1 that are cases} |
22 |
| -##' |
23 |
| -##' \item{sdY}{for a quantitative trait, the population standard deviation of the trait. if not given, it can be estimated from the vectors of varbeta and MAF} |
24 |
| -##' |
25 |
| -##' \item{snp}{a character vector of snp ids, optional. If present, it will be used to merge dataset1 and dataset2. Otherwise, the function assumes dataset1 and dataset2 contain results for the same SNPs in the same order.} |
| 22 | +##' Preferably, give \code{beta} and \code{varbeta}. But if these are not available, sufficient statistics can be approximated from \code{pvalues} and \code{MAF}. |
| 23 | +##' |
| 24 | +##' Required scalars are some subset of |
26 | 25 | ##'
|
| 26 | +##' \describe{ |
| 27 | +##' \item{N}{Number of samples in dataset 1} |
| 28 | +##' \item{type}{the type of data in dataset 1 - either "quant" or "cc" to denote quantitative or case-control} |
| 29 | +##' \item{s}{for a case control dataset, the proportion of samples in dataset 1 that are cases} |
| 30 | +##' \item{sdY}{for a quantitative trait, the population standard deviation of the trait. if not given, it can be estimated from the vectors of varbeta and MAF} |
27 | 31 | ##' }
|
28 | 32 | ##'
|
29 |
| -##' Some of these items may be missing, but you must always give {\code{type}}. |
30 |
| -##' |
31 |
| -##' Then scalars describing the samples used: |
| 33 | +##' You must always give {\code{type}}. Then, |
32 | 34 | ##' \describe{
|
33 |
| -##' \item{always needed}{\code{N}} |
34 | 35 | ##' \item{if \code{type}=="cc"}{\code{s}}
|
35 | 36 | ##' \item{if \code{type}=="quant" and \code{sdY} known}{\code{sdY}}
|
| 37 | +##' \item{if beta, varbeta not known}{\code{N}} |
36 | 38 | ##' }
|
37 | 39 | ##' If \code{sdY} is unknown, it will be approximated, and this will require
|
38 | 40 | ##' \describe{
|
39 | 41 | ##' \item{summary data to estimate \code{sdY}}{\code{beta}, \code{varbeta}, \code{N}, \code{MAF}}
|
40 | 42 | ##' }
|
41 | 43 | ##'
|
42 |
| -##' Then, if not already covered above, the summary statistics describing the results |
| 44 | +##' Optional vectors are |
| 45 | +##' |
43 | 46 | ##' \describe{
|
44 |
| -##' \item{preferably}{\code{beta}, \code{varbeta}} |
45 |
| -##' \item{alternatively}{\code{pvalues}, \code{MAF}} |
| 47 | +##' \item{position}{a vector of snp positions, required for \code{plot_dataset}} |
46 | 48 | ##' }
|
47 | 49 | ##'
|
48 |
| -##' \code{check_dataset} call stop() unless a series of expectations on dataset |
| 50 | +##' \code{check_dataset} calls stop() unless a series of expectations on dataset |
49 | 51 | ##' input format are met
|
50 | 52 | ##'
|
51 | 53 | ##' This is a helper function for use by other coloc functions, but
|
|
60 | 62 | ##' @return NULL if no errors found
|
61 | 63 | ##' @export
|
62 | 64 | ##' @author Chris Wallace
|
63 |
| -check_dataset <- function(d,suffix="",req=c("snp"),warn.minp=1e-6) { |
64 |
| - if(!is.list(d) ) |
65 |
| - stop("dataset ",suffix,": is not a list") |
66 |
| - nd <- names(d) |
67 |
| - |
68 |
| - ## no missing values - make people clean their own data rather than make assumptions here for datasets I don't know |
69 |
| - ## req <- unique(c("snp",req)) # always need snp to match now |
70 |
| - n <- 0 |
71 |
| - if(length(setdiff(req,nd))) |
72 |
| - stop("dataset ",suffix,": missing required element(s) ",paste(setdiff(req,nd),collapse=", ")) |
73 |
| - |
74 |
| - for(v in nd) { |
75 |
| - if(any(is.na(d[[v]]))) |
76 |
| - stop("dataset ",suffix,": ",v," contains missing values") |
77 |
| - } |
78 |
| - |
79 |
| - ## snps should be unique |
80 |
| - if("snp" %in% nd && any(duplicated(d$snp))) |
81 |
| - stop("dataset ",suffix,": duplicated snps found") |
82 |
| - if("snp" %in% nd && is.factor(d$snp)) |
83 |
| - stop("dataset ",suffix,": snp should be a character vector but is a factor") |
84 |
| - |
85 |
| - ## MAF should be > 0, < 1 |
86 |
| - if("MAF" %in% nd && (!is.numeric(d$MAF) || any(is.na(d$MAF)) || |
87 |
| - any(d$MAF<=0) || any(d$MAF>=1))) |
88 |
| - stop("dataset ",suffix,": MAF should be a numeric, strictly >0 & <1") |
89 |
| - |
90 |
| - ## lengths of these should match |
91 |
| - l <- -1 # impossible length |
92 |
| - shouldmatch <- c("pvalues","MAF","beta","varbeta","snp","position") |
93 |
| - for(v in shouldmatch) |
94 |
| - if(v %in% nd) |
95 |
| - if(l<0) { ## update |
96 |
| - l <- length(d[[v]]) |
97 |
| - } else { ## check |
98 |
| - if(length(d[[v]])!=l) { |
99 |
| - stop("dataset ",suffix,": lengths of inputs don't match: ") |
100 |
| - print(intersect(nd, shouldmatch)) |
| 65 | +check_dataset <- function(d, |
| 66 | + suffix="", |
| 67 | + req=c("type","snp"), |
| 68 | + warn.minp=1e-6) { |
| 69 | + if(!is.list(d) ) |
| 70 | + stop("dataset ",suffix,": is not a list") |
| 71 | + recognised_items=c("beta","varbeta","pvalues","MAF","snp","position","N","type","s","sdY","LD") |
| 72 | + nd <- intersect(names(d), recognised_items) |
| 73 | + |
| 74 | + ## no missing values - make people clean their own data rather |
| 75 | + ## than make assumptions here for datasets I don't know |
| 76 | + n <- 0 |
| 77 | + if(length(setdiff(req,nd))) |
| 78 | + stop("dataset ",suffix,": missing required element(s) ",paste(setdiff(req,nd),collapse=", ")) |
| 79 | + |
| 80 | + for(v in nd) { |
| 81 | + if(any(is.na(d[[v]]))) |
| 82 | + stop("dataset ",suffix,": ",v," contains missing values") |
| 83 | + } |
| 84 | + |
| 85 | + if (!(d$type %in% c("quant", "cc"))) |
| 86 | + stop("dataset ", suffix, ": ", "type must be quant or cc") |
| 87 | + |
| 88 | + ## snps should be unique |
| 89 | + if("snp" %in% nd && any(duplicated(d$snp))) |
| 90 | + stop("dataset ",suffix,": duplicated snps found") |
| 91 | + if("snp" %in% nd && is.factor(d$snp)) |
| 92 | + stop("dataset ",suffix,": snp should be a character vector but is a factor") |
| 93 | + |
| 94 | + ## MAF should be > 0, < 1 |
| 95 | + if("MAF" %in% nd && (!is.numeric(d$MAF) || any(is.na(d$MAF)) || |
| 96 | + any(d$MAF<=0) || any(d$MAF>=1))) |
| 97 | + stop("dataset ",suffix,": MAF should be a numeric, strictly >0 & <1") |
| 98 | + |
| 99 | + ## lengths of vector arguments should match |
| 100 | + l <- -1 # impossible length |
| 101 | + shouldmatch <- intersect(nd, c("pvalues","MAF","beta","varbeta","snp","position")) |
| 102 | + for(v in shouldmatch) |
| 103 | + if(l<0) { ## update |
| 104 | + l <- length(d[[v]]) |
| 105 | + } else { ## check |
| 106 | + if(length(d[[v]])!=l) { |
| 107 | + stop("dataset ",suffix,": lengths of inputs don't match: ") |
| 108 | + print(shouldmatch) |
| 109 | + } |
101 | 110 | }
|
102 |
| - } |
103 |
| - |
104 |
| - ## sample size |
105 |
| - if (("N" %in% req) && (!('N' %in% nd) || is.null(d$N) || is.na(d$N))) |
106 |
| - stop("dataset ",suffix,": sample size N not set") |
107 |
| - |
108 |
| - ## type of data |
109 |
| - if (! ('type' %in% nd)) |
110 |
| - stop("dataset ",suffix,": variable type not set") |
111 |
| - if(!(d$type %in% c("quant","cc"))) |
112 |
| - stop("dataset ",suffix,": ","type must be quant or cc") |
113 |
| - |
114 |
| - ## no beta/varbeta |
115 |
| - if(("s" %in% nd) && (!is.numeric(d$s) || d$s<=0 || d$s>=1)) |
116 |
| - stop("dataset ",suffix,": ","s must be between 0 and 1") |
117 |
| - if(!("beta" %in% nd) || !("varbeta" %in% nd)) { # need to estimate var (Y) |
118 |
| - if(!("pvalues" %in% nd) || !( "MAF" %in% nd)) |
119 |
| - stop("dataset ",suffix,": ","require p values and MAF if beta, varbeta are unavailable") |
120 |
| - if(d$type=="cc" && !("s" %in% nd)) |
121 |
| - stop("dataset ",suffix,": ","require, s, proportion of samples who are cases, if beta, varbeta are unavailable") |
122 |
| - p=d$pvalues |
123 |
| - } else { |
124 |
| - p=pnorm( -abs( d$beta/sqrt(d$varbeta) ) ) * 2 |
125 |
| - } |
126 |
| - |
127 |
| - ## minp |
128 |
| - if(min(p) > warn.minp) |
129 |
| - warning("minimum p value is: ",format.pval(min(p)),"\nIf this is what you expected, this is not a problem.\nIf this is not as small as you expected, please check the 02_data vignette.") |
130 |
| - |
131 |
| - ## sdY |
132 |
| - if(d$type=="quant" && !("sdY" %in% nd)) |
133 |
| - if(!("MAF" %in% nd && "N" %in% nd )) |
134 |
| - stop("dataset ",suffix,": ","must give sdY for type quant, or, if sdY unknown, MAF and N so it can be estimated") |
135 |
| - |
136 |
| - if("LD" %in% nd) { |
137 |
| - if(nrow(d$LD)!=ncol(d$LD)) |
138 |
| - stop("LD not square") |
139 |
| - if(!identical(colnames(d$LD),rownames(d$LD))) |
140 |
| - stop("LD rownames != colnames") |
141 |
| - if(length(setdiff(d$snp,colnames(d$LD)))) |
142 |
| - stop("colnames in LD do not contain all SNPs") |
143 |
| - } |
144 | 111 |
|
145 |
| - ## if we reach here, no badness detected |
146 |
| - NULL |
| 112 | + ## type of data |
| 113 | + if (! ('type' %in% nd)) |
| 114 | + stop("dataset ",suffix,": variable type not set") |
| 115 | + if(!(d$type %in% c("quant","cc"))) |
| 116 | + stop("dataset ",suffix,": ","type must be quant or cc") |
| 117 | + |
| 118 | + ## no beta/varbeta |
| 119 | + if(("s" %in% nd) && (!is.numeric(d$s) || d$s<=0 || d$s>=1)) |
| 120 | + stop("dataset ",suffix,": ","s must be between 0 and 1") |
| 121 | + if(!("beta" %in% nd) || !("varbeta" %in% nd)) { # need to estimate var (Y) |
| 122 | + if(!("pvalues" %in% nd) || !( "MAF" %in% nd)) |
| 123 | + stop("dataset ",suffix,": ","require p values and MAF if beta, varbeta are unavailable") |
| 124 | + if(any(d$pvalues<=0)) |
| 125 | + stop("pvalues should not be negative or exactly 0") |
| 126 | + if(d$type=="cc" && !("s" %in% nd)) |
| 127 | + stop("dataset ",suffix,": ","require, s, proportion of samples who are cases, if beta, varbeta are unavailable") |
| 128 | + if (!('N' %in% nd) || is.null(d$N) || any(d$N<=0) ) |
| 129 | + stop("dataset ",suffix,": sample size N <=0 or not set") |
| 130 | + p=d$pvalues |
| 131 | + } else { |
| 132 | + p=pnorm( -abs( d$beta/sqrt(d$varbeta) ) ) * 2 |
| 133 | + } |
| 134 | + |
| 135 | + ## minp |
| 136 | + if(min(p) > warn.minp) |
| 137 | + warning("minimum p value is: ",format.pval(min(p)),"\nIf this is what you expected, this is not a problem.\nIf this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.") |
| 138 | + |
| 139 | + ## sdY |
| 140 | + if(d$type=="quant" && !("sdY" %in% nd)) |
| 141 | + if(!("MAF" %in% nd && "N" %in% nd )) |
| 142 | + stop("dataset ",suffix,": ","must give sdY for type quant, or, if sdY unknown, MAF and N so it can be estimated") |
| 143 | + |
| 144 | + if("LD" %in% nd) { |
| 145 | + if(nrow(d$LD)!=ncol(d$LD)) |
| 146 | + stop("LD not square") |
| 147 | + if(!identical(colnames(d$LD),rownames(d$LD))) |
| 148 | + stop("LD rownames != colnames") |
| 149 | + if(length(setdiff(d$snp,colnames(d$LD)))) |
| 150 | + stop("colnames in LD do not contain all SNPs") |
| 151 | + } |
| 152 | + |
| 153 | + ## if we reach here, no badness detected |
| 154 | + NULL |
147 | 155 | }
|
148 | 156 |
|
149 | 157 | #'@rdname check_dataset
|
|
0 commit comments