-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.R
529 lines (498 loc) · 24.8 KB
/
functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
#' This function takes a list of package named, loads them if they are
#' available, otherwise attempts to install each one and then again
#' attempts to load it.
instrequire <- function(pkglist
,repos=getOption('repos','https://cran.rstudio.com/')){
pkgs_installed <- sapply(pkglist,require,character.only=T);
if(length(pkgs_needed <- names(pkgs_installed[!pkgs_installed]))>0){
install.packages(pkgs_needed,repos=repos,dependencies = T);
pkgs_final <- sapply(pkgs_needed,require,character.only=T);
if(!all(pkgs_final)){
stop(c('the following required packages could not be installed:\n'
,paste0(names(pkgs_final[!pkgs_final]),collapse = ', ')));
}
};
}
survAUC<-function(Surv.rsp,Surv.rsp.new,lp,lpnew,times,time,lp0=0
,nonstdSurv=c('OXS','Nagelk','XO')
,...,FUNS){
if(is.null(info<-getOption('survAUCinfo'))){
# create a lookup table for these functions if one doesn't exist
info<-sapply(ls(package:survAUC),get) %>% lapply(formals) %>%
lapply(sapply,class) %>% lapply(bind_rows) %>% bind_rows(.id='fun');
options(survAUCinfo=info);
}
if(!missing(FUNS)) info <- subset(info,fun %in% FUNS);
args <- list();
for(ii in names(match.call())[-1]) args[[ii]] <- get(ii);
# create a value for 'time' if it's missing but there are 'times'
if(!'time' %in% names(args) && 'times' %in% names(args)) args$time <- max(eval(args$times));
# make sure lp0 is the right length (seriously people? You're going to make me
# do this in R of all languages?). BTW, Surv.rsp.new not an error-- for all
# functions requiring lp0, there is also that departure from standard names
# and for those three Surv.rsp.new becomes Surv.rsp
if(missing(lp0)&&!missing(Surv.rsp.new)) args$lp0 <- rep_len(0,nrow(args$Surv.rsp.new));
# which arguments are we missing on our current invokation?
if(!'lp'%in%names(args)) browser();
# find the arguments that each function can accept
allowedargs<-setNames(apply(info[,-1],1,function(xx) names(info)[-1][!is.na(xx)])
,unlist(info[,1]));
# find the names of the non-optional arguments for each function
nonoptargs<- sapply(info$fun,function(ii) subset(info,fun==ii),simplify=F) %>%
lapply(function(jj) na.omit(names(jj)[jj=='name']));
# based on the above, here are the functions we can run with the data we have
# for the ones that are FALSE we will return NAs
canrun <- lapply(nonoptargs,setdiff,names(args)) %>% sapply(length) == 0;
# and filter it down to just the ones that are valid arguments for functions
# that can be run
args2use <- sapply(names(canrun)
,function(xx) if(canrun[xx])
args[intersect(allowedargs[[xx]],names(args))] else NA
,simplify=F);
# correct the inconsistent argument names in the functions names by the
# nonstdSurv variable
for(ii in intersect(nonstdSurv,names(canrun[canrun]))) {
args2use[[ii]][c('Surv.rsp','lp')] <- args[c('Surv.rsp.new','lpnew')];
}
# return output!
invisible(sapply(names(args2use),function(xx) if(is.na(args2use[[xx]])) NA else {
try(do.call(xx,args2use[[xx]]))}));
}
#' into the specified existing or new level. That level is listed last
#' in the resulting factor.
#' @param xx A \code{vector} (required).
#' @param topn \code{numeric} for the top how many levels to keep (optional, default =4)
#' @param binto \code{character} which new or existing level to dump the other values in
cl_bintail <- function(xx,topn=4,binto='other'){
if(!is.factor(xx)) xx <- factor(xx);
counts <- sort(table(xx),decreasing = T);
if(is.numeric(binto)) binto <- names(counts)[binto];
keep <- setdiff(names(counts)[1:min(length(counts),topn)],binto);
droplevels(
factor(
ifelse(
xx %in% keep, as.character(xx), binto
),levels=c(keep,binto)));
}
#' A sketch for a possible future function that converts stargazer tables into
#' a universal markdown pipe format
starkable <- function(xx,firstrowisdata=T,row.names=F,taildrop=1
,hrow=2
,sgoutput=F,kboutput=T,searchrep=matrix(c('V1',''),ncol=2)
,...){
output <- if(!sgoutput) capture.output else identity;
formals(htmltab)$header=hrow;
output(out <- stargazer(xx,type = 'html') %>%
htmltab %>% `[`(firstrowisdata,,drop=F) %>%
kable(format = 'markdown',row.names=row.names,...) %>%
head(length(.)-taildrop) %>% paste0('\n') %>%
submulti(searchrep = searchrep));
if(kboutput) cat(out,'\n');
invisible(out);
}
#' Take a character vector and perform multiple search-replace
#' operations on it.
#' @param xx A \code{vector} of type \code{character} (required)
#' @param searchrep A \code{matrix} with two columns of type \code{character} (required). The left column is the pattern and the right, the replacement.
#' @param method One of 'partial','full', or 'exact'. Controls whether to replace only the matching regexp, replace the entire value that contains a matching regexp, or replace the entire value if it's an exact match.
submulti <- function(xx,searchrep,method=c('partial','full','exact')){
# if no method is specified by the user, this makes it take the first value
# if a method is only partially written out, this completes it, and if the
# method doesn't match any of the valid possibilities this gives an informativ
# error message
method<-match.arg(method);
# rr is a sequence of integers spanning the rows of the searchrep matrix
rr <- 1:nrow(searchrep);
# oo will hold the result that this function will return
oo <- xx;
switch(method
,partial = {for(ii in rr)
oo <- gsub(searchrep[ii,1],searchrep[ii,2],oo)}
,full = {for(ii in rr)
oo[grepl(searchrep[ii,1],oo)]<-searchrep[ii,2]}
,exact = {for(ii in rr)
oo[grepl(searchrep[ii,1],oo,fixed=T)]<-searchrep[ii,2]}
#oo <- gsub(searchrep[ii,1],searchrep[ii,2],oo,fixed = T)}
);
oo;
}
#' Take a data.frame and create a PL/SQL table definition for it, ready to paste into DBVis and such
#' @param xx A \code{data.frame} (required)
sql_create_table <- function(xx,tname,sqltemplate='CREATE TABLE %s (%s);'){
mysearchrep <- cbind(c('integer','character','numeric','Date','logical','factor')
,c('NUMBER(32)','VARCHAR2(nn)','NUMBER(32)','DATE','NUMBER(1)','VARCHAR2(nn)'));
if(missing(tname)) tname <- as.character(substitute(xx));
thenames <- sapply(xx, class);
thenames <- submulti(thenames, mysearchrep, 'exact');
strlengthres<-apply(nsqip,2,function(xx){max(str_length(xx))});
thenames <- apply(cbind(round(strlengthres*1.5), thenames),1,function(xx){gsub('nn',xx[1],xx[2])});
thenames <- paste(' ',names(thenames), as.character(thenames), sep=' ', collapse= ',\n');
thenames <- gsub('.', '_', thenames, fixed=TRUE);
thenames <- gsub('_{1,}','_',thenames);
thenames <- gsub('_ ',' ',thenames);
thenames <- sprintf(sqltemplate, tname, thenames);
cat(thenames);
invisible(thenames);
# TODO: collect the column names and classes as we did in console
# TODO: use the above and submulti() to convert R classes to SQL data types
# TODO: paste() and/or sprintf() to create the string that will be the output
}
#' Take a data.frame or character vector and a vector of grep targets and return
#' the values that match (for data.frame, column names that match). If no
#' patterns given just returns the names
#' @param xx A \code{data.frame} or character vector (required)
#' @param patterns A character vector of regexp targets to be OR-ed
grepor <- function(xx,patterns='.') {
if(is.list(xx)) xx <-names(xx);
grep(paste0(patterns,collapse='|'),xx,val=T);
}
#' Take an object name \code{obj}, check to see if it exists in environment \code{env}
#' and if it does not, run \code{expression} \code{EXPR} and assign its result to \code{obj}
#'
#' @param obj A \code{character} string (required) naming the variable in env
#' to which the results of running \code{EXPR} will be assigned.
#' @param EXPR An \code{expression} to execute and assign to the object named
#' by \code{obj} if it doesn't already exist.
#' @param env An \code{environment} to check for the existence of \code{obj}
#' and where to create it if it doesn't exist.
#'
#' @return Does not return anything.
#'
#' @examples `checkrun('dat3',{group_by(dat1,idn_mrn) %>% summarise_all(first)});`
checkrun <- function(obj,EXPR,env=as.environment(-1)){
env<-env;
if(length(ls(env,all=T,pattern=obj))==0){
browser();
}
}
#' Delete all the junk in your environment, for testing
clearenv <- function(env=.GlobalEnv) rm(list=setdiff(ls(all=T,envir=env),'clearenv'),envir=env);
#' Plots in the style we've been doing (continuous y, discrete x and optionally z)
#'
#' Instead of creating new tables for 'All', just set xx=T
#'
#' To suppress plotting of a legend (but still use fill) set fill.name=NA
#' Likewise, to suppress printing y-axis labels make yy.name=NA
#' To merely omit printing a name, set the name in question to ''
autoboxplot <- function(pdata, xx, yy, zz, subset=T
, type=c('box','violin')
, title=sprintf('%s vs %s\n by %s',xx,yy,zz)
, xx.name=if(xx==TRUE) 'All' else xx, xx.breaks=if(xx==TRUE) xx else unique(pdata[[xx]])
, xx.labels=if(xx==TRUE) '' else xx.breaks
, yy.name=yy, yy.labels
, fill.name, fill.breaks, fill.labels
, counts=T
,...){
subset <- substitute(subset);
plot_type <- switch(match.arg(type)
,box=geom_boxplot(coef=100)
,violin=geom_violin());
pdata <- subset(pdata,subset=eval(subset));
if(!missing(zz)){
if(missing(fill.name)) fill.name <- zz;
if(missing(fill.breaks)) fill.breaks <- unique(pdata[[zz]]);
if(missing(fill.labels)) fill.labels <- fill.breaks;
fill <- if(is.atomic(fill.name)&&is.na(fill.name)) scale_fill_discrete(guide=F) else {
scale_fill_discrete(name=fill.name,breaks=fill.breaks,labels=fill.labels);
}
}
if(missing(yy.labels)){
yy.labels <- if(is.na(yy.name)) scale_y_continuous(name='',labels=NULL) else {
scale_y_continuous(name=yy.name,labels = comma);
};
};
out <- ggplot(data = pdata, aes_string(x = xx, y = yy)) + plot_type + yy.labels +
scale_x_discrete(name=xx.name,breaks=xx.breaks,labels=xx.labels) +
labs(title=title)
if(!missing(zz)) out <- out + aes_string(fill=zz) + fill;
if(counts){
ccrds<-ggplot_build(out)$layout$panel_ranges[[1]];
ann.label <- if(xx==TRUE && missing(zz)) nrow(pdata) else if(xx==TRUE){
paste0(table(pdata[,zz]),collapse=' \t ') } else if(missing(zz)){
paste0(table(pdata[,xx]),collapse=' \t ') } else {
apply(table(pdata[,c(xx,zz)]),1,function(ii) paste0(ii[!ii%in%list(0,'')],collapse=' \t '));
}
#ann.label <- gsub('\\b0\\b','',ann.label);
out <- out + annotate('text',x=ccrds$x.major_source,y=ccrds$y.range[1]
,label=ann.label[ccrds$x.major_source]);
}
attr(out,'call') <- match.call();
out;
}
getCall.data.frame <- getCall.gg <- function(xx) attr(xx,'call');
#' From ... http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
if (is.null(layout)) {
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))}
if (numPlots==1) print(plots[[1]]) else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
mktabsie <- function(data,subsets=list(Full=T),pw
,vars
,filepath='.'
,filename='survSave.rdata'
,serverTitle='TABSIE'
,serverStatement=bquote(h4("Welcome to TABSIE"))){
serverData <- sapply(subsets,function(ii) subset(data[,vars],eval(ii)),simplify=F);
serverDataDic <- names(serverData);
serverHash <- digest::digest(pw,algo='sha512',ascii=TRUE);
save(serverStatement,serverData,serverDataDic,serverTitle,serverHash
,file=paste0(filepath,'/',filename));
}
#' Returns a vector of column names that contain data elements of a particular type
#' as specified by the user: "integer","POSIXct" "POSIXt", "numeric", "character",
#' "factor" and "logical".
vartype <- function(dat, ctype) {
xx <- unlist(sapply(dat, class));
idx <- which(xx %in% ctype);
res <- names(xx)[idx];
return(res)
}
#' ## Functions for RMarkdown and ggplot2
#'
#' Return a commit hash (for inclusion in reports for example) after first making
#' sure all changes are committed and pushed
#' TODO: instead of auto-committing, error if uncommited changes, needs to be
#' a deliberate process, otherwise we have tons of meaningless auto-commit
#' messages that will make future maintenance harder
#gitstamp <- function(production=getOption('gitstamp.production',T),branch=F
# ,whichfile='') {
# br<- if(branch) system("git rev-parse --abbrev-ref HEAD",intern=T) else NULL;
# if(production){
# if(length(gitdiff<-system("git update-index --refresh && git diff-index HEAD --",intern = T))!=0){
# stop(sprintf(
# "\ngit message: %s\n\nYou have uncommitted changes. Please do 'git commit' and then try again."
# ,gitdiff));};
# hash<-system(sprintf("git push && git log --pretty=format:'%%h' -n 1 -- %s"
# ,whichfile),intern=T);
# if(length(hash)==0) stop('
#The specified file is not part of a git repository. You need to do (for example):
#
#git add NAMEOFTHISSCRIPT.R && git commit -a -m "New script"
#
#...and then try again.');
# return(c(br,hash));
# } else return(c(br,'TEST_OUTPUT_DO_NOT_USE'));
#}
#' This function can be called from `stat_summary()` as the
#' `fun.data=` argument. It will cause group counts to be
#' over-printed on a `geom_boxplot()` (probably other similar
#' plots too) if `stat_summary()` is added to it.
n_fun <- function(xx) data.frame(y=mean(quantile(xx,c(.5,.75))),label=as.character(length(xx)));
#' take a list of subset criteria and return a list of data.frames
ssply<-function(dat,...) sapply(sys.call()[-(1:2)],function(ii) subset(dat,eval(ii)),simplify=F);
#' save a named list of tables, including names
savetablelist <- function(lst,fileprefix,filesuffix=paste0(format(Sys.Date(),'%m-%d-%Y-%H_%M_%S'),'.tsv')
,filepath='./',outfile=paste0(filepath,fileprefix,filesuffix)
,sep='\t',row.names=F
,singletabname=as.character(substitute(lst))
,replaceifexists=T,...) {
if(is.data.frame(lst)) lst<-setNames(list(lst),singletabname);
if(replaceifexists&&file.exists(outfile)) file.remove(outfile);
for(ii in names(lst)){
write(ii,file=outfile,append=T);
write.table(lst[[ii]],outfile,sep=sep,row.names=row.names,append=T);
}
}
#' Usage: `xx<-mapnames(xx,lookup)` where lookup is a named character vector
#' the names of the elements in the character vector are what you are renaming
#' things TO and the values are what needs to be matched, i.e. what renaming things
#' FROM. If you set namesonly=T then it just returns the names, not the original
#' object.
mapnames<-function(xx,lookup,namesonly=F,...){
xnames <- names(xx);
idxmatch <- na.omit(match(xnames,lookup));
newnames <- names(lookup)[idxmatch];
if(namesonly) return(newnames);
names(xx)[xnames %in% lookup] <- newnames;
xx;
}
#' Example of using R methods dispatch
#'
#' The actual usage is: `truthy(foo)` and `truthy()` itself figures
#' out which method to actually dispatch.
truthy <- function(xx,...) UseMethod('truthy');
truthy.logical <- function(xx,...) xx;
truthy.factor <- function(xx,...) truthy.default(as.character(xx),...);
truthy.numeric <- function(xx,...) xx>0;
truthy.default <- function(xx,truewords=c('TRUE','true','Yes','T','Y','yes','y')
,...) xx %in% truewords;
truthy.data.frame <- function(xx,...) as.data.frame(lapply(xx,truthy,...));
#' Possibly obsolete?
countfrac <- function(xx,outcomes
# set to TOTAL in order to do _only_ the total
,groupcols='rai_range',sortby=groupcols
# set to 'none' if don't want to sort
,dir=c('desc','asc','none')
,summfns=c(n=sum,frac=mean)
# set to '' to disable
,totalrow='Total'){
# callback, so this function can call itself even if it's renamed
# or nameless, for purposes of rerunning to get the totals row
if(totalrow!='') thisfn <- sys.function();
oo <- xx;
# is the function being invoked for the purpose of calculating a total row?
if(doingtotalrow <- tolower(trimws(groupcols[1]))=='total'){
oo[[groupcols]] <- totalrow;
};
oo <- group_by_(oo,groupcols);
# we coerce everything to logical, whatever it might have been
# but leave alone columns other than those specified in the 'outcomes' variable
oo[,outcomes] <- mutate_all(oo[,outcomes],truthy);
# Two different summary tables, the second one applies the same set of
# functions to everything, and the first one is just a count that doesn't
# rely on any one specific column. So we do them separately and then cbind()
oo <- cbind(summarise(oo,bin_n=n()),summarise_all(oo[,c(groupcols,outcomes)],summfns)[,-1]) %>%
# creating cumul_count, as in the current code
mutate(cumul_count=rev(cumsum(rev(bin_n))));
# sort, if desired, just as in the current code
if(!doingtotalrow) oo <- switch(match.arg(dir)
,none=oo
,desc=arrange_(oo,sprintf('desc(%s)',groupcols))
,asc=arrange_(oo,groupcols));
# now we insure that the column order is the same as the order of the groupcols
# argument-- first the grouping column name, then the 'bin_n' (used to be called
# rai_n, but this isn't specific to RAI, could group by anithing, so renamed)
# cumul_count...
oo <- oo[,c(groupcols,'bin_n','cumul_count'
# ...and then this: it pastes the suffixes as obtained from the names
# of the summfns argument but orders them in the same way as they were
# given, rather than first the first summary function and then the second
,c(t(outer(outcomes,names(summfns),paste,sep='_'))))];
if(!doingtotalrow&&totalrow!='') {
tot <- thisfn(xx,outcomes,groupcols=totalrow,summfns=summfns,totalrow='');
names(tot)<-names(oo);
tot[[groupcols]] <- totalrow;
oo <- rbind(oo,tot);
}
attr(oo,'call') <- match.call();
oo;
}
#' Stack a vector to form a matrix with repeating rows, with optional
#' column names and transformation
#'
#' @param vv A vector which will become the row of a matrix
#' @param nr Number of (identical) rows this matrix will contain
#' @param trans An optional function that can take a matrix as its
#' sole argument. Useful functions include `as.data.frame()`
#' `as_tibble()` and `as.table()`
#' @return A matrix, unless the function specified in the `trans` argument
#' causes the output to be something else.
#' @export
#'
#' @examples
#' vec2m(1:10,5);
#' vec2m(1:10,5,tr=data.frame);
#' vec2m(setNames(1:12,month.name),3);
vec2m <- function(vv,nr=1,trans=identity) {
return(trans(matrix(as.matrix(c(vv)),nrow=nr,ncol=length(vv),byrow=T
,dimnames=list(NULL,names(vv)))));
}
#' Return a tableone object formatted just the way we like it
#' @param xx A \code{data.frame} (required).
#' @param vars Vector of variable names to pass to \code{CreatTableOne} (optional)
#' @param ... Named expressions from which to create the strata variable
#' for \code{CreatTableOne} (only tested for one variable)
stratatable <- function(xx,vars=NULL,...){
nmx <- names(xx);
xx <- transform(xx,...);
mystrata <- setdiff(names(xx),nmx);
res <- tableone::CreateTableOne(data=xx, vars= vars, strata=mystrata) %>% print;
res[,'p'] %>% gsub("<","", x=.) %>% trimws() %>%
as.numeric() %>% p.adjust() %>% cbind(res[,1:2],padj=.) -> res;
return(res);
}
#' Returns a list of column names from the data dictionary for which the column
#' named in the first argument is true. The first arg can be either a string or
#' a name. The second must be a data.frame
#'
#' @param var Either a string or a name, of a column in `dictionary`
#' @param dat An optional data.frame, to constrain which rows of the
#' 'dictionary' object get used
#' @param matchcol Optional column that maps the rows of 'dictionary' to the rows
#' of a 'data.frame' of interest
#' @param retcol Which column to return-- by default the same as used for 'matchcol'
#' @param dictionary A 'data.frame' that is used as a data dictionary. It must at
#' minimum contain a column of column-names for the dataset for
#' which it is a data dictionary ('matchcol') and one or more
#' columns each representing a _group_ of columns in the dataset,
#' such that a TRUE or T value means the column whose name is
#' the value of 'matchcol' is the name of a column in the data
#' that belongs to the group defined by the grouping column.
#' These grouping columns are what the argument 'var' is
#' supposed to refer to. We will use the convention that grouping
#' column names begin with 'c_' but this convention is not
#' (currently) enforced programmatically.
v <- function(var,dat
,matchcol='dataset_column_names'
,retcol=matchcol
,dictionary=dct0) {
# convenience function: if forgot what column names are available, call with
# no arguments and they will be listed
if(missing(var)) return(names(dictionary));
# support both standard or non-standard evaluation
var<-as.character(substitute(var));
# if a 'dat' argument is given, restrict the output so that only results having
# having values found in the colnames of 'dat' are returned.
if(!missing(dat)) dictionary <- dictionary[dictionary[[matchcol]]%in%colnames(dat),];
# TODO: Think about what to do when nothing matches... not necessarily an error
# condition, might just be something to warn about and move on.
out<-dictionary[dictionary[[var]],retcol][[1]];
# if something other than matchcol is returned, give it a name to make it
# easier to align with column names in the data
if(retcol != matchcol){
out<-setNames(out,dictionary[dictionary[[var]],matchcol][[1]]);
}
# 'na.omit()' needed because we allows the 'dictionary' object to have NAs instead
# of FALSEs. 'c()' needed to strip na.omit metadata, so the output is a plain
# old vector
return(c(na.omit(out)));
}
#'.This function will create a variable summary table that will provide
#' the name of the variable, tells whether the variable is discrete or
#' continuous, Tukey's 5 number summary for continuousvariables, and
#' the number of observations for each category for discrete variables.
#' All the user need for this function is the data frame (df) and the
#' function will do the rest.
#'
#' However, this is commented out because it's here just as a reminder
#' for when this functionality is again needed and this function gets
#' rewritten properly.
# variable_summary <- function(df){
# #collecting the column names from the data frame:
# all_names <- names(df)
# #passing the column names to an 'sapply' function:
# theresult <- sapply(all_names, summary_vector, df)
# #reformatting the table:
# theresult <- as.data.frame(t(theresult))
# colnames(theresult) <-c('VariableName', 'VariableType', 'TotalCounts', 'NumberOfNAs', 'VariableDescription')
# return(theresult)
# }