-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_analysis.R
140 lines (136 loc) · 5.45 KB
/
example_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#' ---
#' title: "Example Analysis"
#' css: "production.css"
#' output:
#' html_document:
#' toc: true
#' toc_float: true
#' ---
#'
#' ### Settings
#'
#' In the below two lines are the minimum script-level settings you need.
#' The `.projpackages` object has the names of the packages you need installed
#' if necessary and then loaded for this scriport. The `.deps` object contains
#' other scriports on which this one _directly_ depends (you don't need to worry
#' about the indirect ones-- each scriport manages its own dependencies for
#' packages and scriports). The recommended value to start with is the one shown
#' here. You can add more if the need arises later. For more information, please
#' see the [overview](overview.html) scriport.
.projpackages <- c('GGally','tableone','pander','dplyr','ggplot2');
.deps <- c( 'dictionary.R' );
#+ load_deps, echo=FALSE, message=FALSE, warning=FALSE,results='hide'
# do not edit the next two lines
.junk<-capture.output(source('./scripts/global.R',chdir=TRUE,echo=FALSE));
#' Set some formatting options for this document
panderOptions('table.alignment.default','right');
panderOptions('table.alignment.rownames','right');
panderOptions('table.split.table',Inf);
panderOptions('p.wrap','');
panderOptions('p.copula',', and ');
.currentscript <- current_scriptname('example_analysis.R');
if(!exists('dat01')) dat01 <- get(names(inputdata)[1]);
#'
#'
#' ### Choosing predictor and response variables
#'
#' The values below are variables arbitrarily selected for demonstration
#' purposes. To do your actual analysis, replace `binary_outcome` and
#' `numeric_outcome` below with vectors containing one or more column names you
#' want to serve as the outcome variables. If you have no binary outcomes, set
#' the value to an empty vector, `c()`. Likewise if you have no numeric
#' outcomes.
binary_outcome <- choose_outcomes(dat01,'c_safetf');
binary_outcome;
numeric_outcome <- choose_outcomes(dat01,'c_safenumeric');
numeric_outcome;
#'
#' Insuring that the binary outcomes get treated as discrete values
#'
for(ii in binary_outcome) dat01[[ii]] <- factor(dat01[[ii]]);
#' Again, replace `predictorvars` below with vectors containing one or more
#' column names you actually want to serve as the predictor variables
predictorvars <- choose_predictors(dat01,'c_safe'
,exclude=c(binary_outcome,numeric_outcome));
predictorvars;
#' If you are satisfied with your choice of `binary_outcome`, `numeric_outcome`,
#' and `predictorvars` you don't need to change the following line. It simply
#' combines the above.
mainvars <- c(predictorvars,binary_outcome,numeric_outcome);
mainvars;
#'
#' ### Data dictionary
#'
#' Here are some useful characteristics of the variables in `dat01`
#'
pander(attr(dat01,'tblinfo') %>% select(-c('nn','md5')));
#'
#' ### Plot the data
#'
#' #### Explore pairwise relationships
#'
#' A plot of all pairwise relationships between the variables of interest.
#+ ggpairs_plot, message=FALSE, warning=FALSE, cache=TRUE
# select just the columns in 'mainvars' (otherwise the plot will take forever)
# and turn the ones tagged as 'ordinal' into factors
dat01[,mainvars] %>% mutate_at(.,v(c_ordinal,.),factor) %>%
# now make a scatterplot matrix, using the first 'binary_outcome' to assign
# color
ggpairs(.,aes_string(color=binary_outcome[1]));
#'
#' Note: if your data does not have a binary outcome, an artificial value is
#' randomly generated for demonstration purposes.
#'
#' #### Exploring group differences.
#'
#' The `r sprintf("\x60%s\x60",binary_outcome[1])`
#' variable is again used to stratify the groups to create a cohort table that
#' can, among other things, help identify potential confounders and unexpected
#' explanatory variables.
#'
pander(print(CreateTableOne(vars = mainvars, strata = binary_outcome[1]
,data=dat01, includeNA=TRUE, test=FALSE)
, printToggle=FALSE)
,caption='Cohort Characterization');
#'
#' ### Specify the statistical models
#'
#' #### Numeric outcome
#'
#' Formulas for the numeric outcome models.
#+ numfrm, results='asis'
num_formulas <- c();
for(yy in numeric_outcome) for(xx in predictorvars){
num_formulas <- c(num_formulas,sprintf('%s ~ %s',yy,xx))};
pander(cbind(num_formulas),justify='l');
#' Linear regression models fitted to the above formulas.
num_models <- sapply(num_formulas,function(xx) lm(xx,dat01) %>% update(.~.)
,simplify=FALSE);
#' The results of the univariate model fits
#+ pandernum,results='asis'
for(xx in num_models) {cat('***','\n'); cat(pander(xx)); cat('***','\n');};
#'
#' #### Binary outcome
#'
#' Formulas for the binary outcome models
#+ binfrm, results='asis'
bin_formulas <- c();
for(yy in binary_outcome) for(xx in predictorvars){
bin_formulas <- c(bin_formulas,sprintf('%s ~ %s',yy,xx))};
pander(cbind(bin_formulas),justify='l');
#' Logistic regression models fitted to the above formulas.
bin_models <- sapply(bin_formulas,function(xx){
glm(xx,dat01,family='binomial') %>% update(.~.)},simplify=FALSE);
#' The results of the univariate model fits
#+ panderbin,results='asis'
for(xx in bin_models) {cat('***','\n'); cat(pander(xx)); cat('***','\n');};
#'
#'
#' ### Save results
#'
#' Now the results are saved and available for use by other scriports if you
#' place `r sprintf("\x60'%s'\x60",.currentscript)` among the values in their
#' `.deps` variables.
save(file=paste0(.currentscript,'.rdata'),list=setdiff(ls(),.origfiles));
#+ echo=FALSE, results='hide'
c()