-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsyn.strata.r
191 lines (170 loc) · 8.75 KB
/
syn.strata.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Strata can be provided as a vector (=flag) or as variable names
# If all stratifying values have the same values within strata they will be
# automatically removed; if not they will be taken into account
# ?????
#! probably NOT possible and syn.strata needs to be kept as a separate FUN
# creating generic syn function
# changing syn to syn.default
# and if strata != NULL calling syn.strata
# ?????
# Other issues:
# - multiple printout (per strata, per m)
# - compare() by strata?
# - k as a sum?
syn.strata <- function(data, strata = NULL,
minstratumsize = 10 + 10 * length(visit.sequence),
tab.strataobs = TRUE, tab.stratasyn = FALSE,
method = "cart", visit.sequence = (1:ncol(data)),
predictor.matrix = NULL,
m = 1, k = nrow(data), proper = FALSE,
minnumlevels = 1, maxfaclevels = 60,
rules = NULL, rvalues = NULL,
cont.na = NULL, semicont = NULL,
smoothing = NULL, event = NULL, denom = NULL,
drop.not.used = FALSE, drop.pred.only = FALSE,
default.method = c("normrank","logreg","polyreg","polr"),
numtocat = NULL, catgroups = rep(5,length(numtocat)),
models = FALSE,
print.flag = TRUE,
seed = "sample",
...){
m0 <- max(1, m)
if (!is.na(seed) & seed == "sample") {
seed <- sample.int(1e9,1)
}
if (!is.na(seed)) set.seed(seed)
# CHECKS
#--------
if (is.null(strata)) stop("Argument strata is missing.", call. = FALSE)
# If strata given as variable names (check if they exist)
# -> change into one factor with strata names
if (is.character(strata) & any(!duplicated(strata))) {
varindex <- match(strata, colnames(data))
if (any(is.na(varindex))) stop("Unrecognized variable(s) in strata: ",
paste(strata[is.na(varindex)],collapse=", "), call. = FALSE)
else {
dstrataNA <- lapply(data[, strata, drop = FALSE], addNA, ifany = TRUE) # change NA to a factor level
strata.lab <- interaction(dstrataNA, drop = TRUE, sep = "_")
#strata.varnames <- paste0(strata, collapse="_")
}
} else {
# check length of strata vector if given as vector; check for missing
if (length(strata) != nrow(data)) stop(paste("The length of strata index (",
length(strata), ") does not match the number of rows in the data (",
nrow(data),").",sep=""), call. = FALSE)
if (any(is.na(strata))) stop("Strata indicator cannot have missing values.",
call. = FALSE)
strata.lab <- factor(strata)
}
#--------
# make sure stratification variables are included in visit.sequence
# important when drop.not.used==T
if (is.character(strata) & any(!duplicated(strata))){ #GR-20/10/2016 drop.not.used == TRUE removed from the condition
strata <- match(strata, colnames(data))
if (is.character(visit.sequence)) visit.sequence <- match(visit.sequence, colnames(data))
if (any(is.na(visit.sequence))) stop("Unrecognized variable(s) in visit.sequence.", call. = FALSE)
visit.sequence <- c(visit.sequence, strata[!(strata %in% visit.sequence)])
}
stratalev.lab <- levels(strata.lab)
strata.n.obs <- table(strata.lab)
nstrata <- dim(strata.n.obs)
if (tab.strataobs == TRUE) {
cat("Number of observations in strata (original data):")
print(table(strata.lab, deparse.level = 0))
}
# check min number of observations in strata
stratasize.stop <- minstratumsize
stratasize.warn <- 100 + 10*length(visit.sequence)
smallstrata <- sum(strata.n.obs < stratasize.stop)
if (smallstrata > 5) stop("In the original data multiple strata have fewer than the recommended\nnumber of observations. We advise that each should have at least ",
stratasize.stop, " observations\n('minstratumsize' which by default is 10 + 10 * no. of variables used in prediction).\n",
"You can override this by setting the parameter 'minstratumsize' to a lower value.\n",
sep = "", call. = FALSE)
if (smallstrata > 0) stop("In the original data some strata (",
paste(stratalev.lab[strata.n.obs < stratasize.stop], collapse = ", "),
") have fewer than the recommended\nnumber of observations. We advise that each should have at least ",
stratasize.stop, " observations\n('minstratumsize' which by default is 10 + 10 * no. of variables used in prediction).\n",
"You can override this by setting the parameter 'minstratumsize' to a lower value.\n",
sep = "", call. = FALSE)
if (any(strata.n.obs < stratasize.warn) & print.flag == TRUE) {
cat("CAUTION: In the original data some strata (",
paste(stratalev.lab[strata.n.obs < stratasize.warn], collapse=", "),
") have limited numbers of observations.\nWe advise that there should be at least ",
stratasize.warn,
" observations (100 + 10 * no. of variables\nused in prediction).\n", sep = "")
}
synds.names <- c("call", "m", "syn", "method", "visit.sequence",
"predictor.matrix", "smoothing", "event", "denom", "proper", "n", "k",
"rules", "rvalues", "cont.na", "semicont", "drop.not.used", "drop.pred.only",
"models", "seed", "var.lab", "val.lab", "obs.vars", "strata.syn", "strata.lab","numtocat","catgroups")
synds <- list(setNames(vector("list",length(synds.names)),synds.names))
synds <- rep(synds, m0)
sel.names <- match(c("call", "m", "predictor.matrix", "proper", "strata.syn",
"strata.lab", "seed", "numtocat", "catgroups", "models"), synds.names)
same.by.m <- c("call", "m", "method", "visit.sequence", "predictor.matrix",
"smoothing", "event", "denom", "proper", "n", "rules", "rvalues", "cont.na",
"semicont", "drop.not.used", "drop.pred.only", "seed", "var.lab",
"val.lab", "obs.vars", "strata.lab","numtocat","catgroups")
same.by.m.idx <- match(same.by.m, synds.names)
syn.args <- as.list(match.call()[-1])
strata.n.syn <- vector("list", m0)
for (j in 1:m0){
synds[[j]]$strata.syn <- sort(factor(sample(stratalev.lab, k, replace = TRUE,
prob = strata.n.obs), levels = stratalev.lab, exclude = NULL))
synds[[j]]$strata.lab <- stratalev.lab
strata.n.syn[[j]] <- table(synds[[j]]$strata.syn, deparse.level = 0)
if (tab.stratasyn == TRUE) {
cat("\nNumber of observations in strata (synthetic data, m = ", j, "):", sep="")
print(strata.n.syn[[j]])
}
}
#Different way of printing (all syn in one table)
#cat("\nNumber of observations in strata (synthetic data):\n")
#starta.n.syn.df <- do.call("rbind", strata.n.syn)
#rownames(starta.n.syn.df) <- paste0("m = ", 1:m)
#print(starta.n.syn.df)
for (j in 1:m0){
synds.ind <- vector("list", nstrata) # results by stratum
# exclude args that are not in syn()
syn.args$strata <- syn.args$tab.stratasyn <- syn.args$tab.strataobs <-
syn.args$minstratumsize <- NULL
syn.args$m <- 1
syn.args$visit.sequence <- visit.sequence
# vs <- visit.sequence
# syn.args$visit.sequence <- c(vs[(vs %in% strata)],vs[!(vs %in% strata)]) #!GR 9/17 move strata to start of visit sequence
for (i in 1:nstrata) {
if (print.flag) cat("\nm = ",j,", strata = ", stratalev.lab[i],
"\n-----------------------------------------------------\n", sep="")
if (!is.na(stratalev.lab[i])) {
syn.args$data <- data[strata.lab == stratalev.lab[i],]
} else {
syn.args$data <- data[is.na(as.character(strata.lab)),]
}
syn.args$k <- strata.n.syn[[j]][i]; names(syn.args$k) <- "strata"
syn.args$seed <- NA
if (syn.args$k == 0 | m==0) syn.args$m <- 0 else syn.args$m <- 1
synds.ind[[i]] <- do.call("syn", syn.args)
}
synds[[j]]$call <- match.call()
synds[[j]]$m <- m
synds[[j]]$proper <- proper
synds[[j]]$predictor.matrix <- eval(parse(text = paste0("list(",
paste0("synds.ind[[", 1:nstrata, "]]$predictor.matrix", collapse = ", "),")")))
synds[[j]]$models <- eval(parse(text = paste0("list(",
paste0("synds.ind[[", 1:nstrata, "]]$models", collapse = ", "),")")))
synds[[j]]$seed <- seed
synds[[j]][-sel.names] <- eval(parse(text = paste0("Map(rbind, ",
paste0("synds.ind[[", 1:nstrata, "]][-sel.names]", collapse = ", "),")")))
if (k > 0 & m > 0) rownames(synds[[j]]$syn) <- 1:k
}
if (m==1 | m==0) {
synds <- synds[[1]]
} else {
synds <- eval(parse(text = paste0("Map(list, ", paste0("synds[[", 1:m, "]]",
collapse = ", "),")")))
synds[same.by.m.idx] <- lapply(same.by.m.idx, function(x) synds[[x]][[1]])
}
if (m==0) cat("\nCAUTION: method, visit.sequence and predictor.matrix are lists that may vary by stratum and\nshould not be used to initialise syn. For initialising rerun with m = 0 without stratification.\n\n")
class(synds) <- "synds"
return(synds)
}