-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_cutting_analysis.R
executable file
·271 lines (217 loc) · 16.7 KB
/
1_cutting_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# Author: Jeremy Boyd ([email protected])
# Date: June 26, 2015
# Summary: Analysis of HDP 181 Experiment 1B data: effects of driver SES
# on cutting behavior at all-way stops. NOTE: Datapoint me10 was missing its
# time, so I estimated time as being in between datapoints me09 and me11.
# Load libraries
library(ggplot2)
library(plotrix)
library(timeDate)
library(lme4)
library(plyr)
library(gridExtra)
library(dplyr)
library(extrafont)
# Read in data.
cuts <- read.delim("Experiment 1B data 140122.txt", header = TRUE)
################################################################
# CLEAN UP DATA
################################################################
# Make cutoff codes uniform
cuts$cutoff = factor(ifelse((cuts$cutoff == 'no' | cuts$cutoff == 'No' | cuts$cutoff == 'N' | cuts$cutoff == 'n'), 'no', ifelse((cuts$cutoff == 'y' | cuts$cutoff == 'Y' | cuts$cutoff == 'yes' | cuts$cutoff == 'Yes'), 'yes', 'other')))
# Make driverSex codes uniform
cuts$driverSex = factor(ifelse((cuts$driverSex == 'f' | cuts$driverSex == 'F' | cuts$driverSex == 'w'), 'f', ifelse((cuts$driverSex == 'm' | cuts $driverSex == 'M'), 'm', 'other')))
# Exclude datapoints with 'other' in the driverSex, driverAge, traffic, and cutoff columns. This amounts to 19 datapoints, or 2.44% of the data (778 originally - 19 = 759.
cuts = cuts[cuts$driverSex != 'other',]
cuts = cuts[cuts$driverAge != 'other',]
cuts = cuts[cuts$traffic != 'other',]
cuts = cuts[cuts$cutoff != 'other',]
# Drop unused 'other' levels from variables.
cuts$driverSex = cuts$driverSex[, drop = TRUE]
cuts$driverAge = cuts$driverAge[, drop = TRUE]
cuts$traffic = cuts$traffic[, drop = TRUE]
cuts$cutoff = cuts$cutoff[, drop = TRUE]
cuts$vehicleID = cuts$vehicleID[, drop = TRUE]
# Make driverAge and traffic numeric
cuts$driverAge = as.numeric(cuts$driverAge)
cuts$traffic = as.numeric(cuts$traffic)
# Make zip code categorical
cuts$zip = factor(cuts$zip)
# Recode time as hours past midnight
cuts$time2 = difftime(timeDate(as.character(cuts$time), format = '%H:%M:%S' ), timeDate('00:00:00', format = '%H:%M:%S'))
# Recode cutoff as numeric: 1 for yes; 0 for no.
cuts$cutoff2 = as.numeric(ifelse(cuts$cutoff == 'yes', 1, 0))
################################################################
# SUMMARIZE USING FIGURES
################################################################
# Interaction of vehicleStatus and traffic, with traffic coded as low (1-2) vs. high (3-6). This gives us 421 datapoints in the low category (55%), and 338 in the high category (45%).
cuts$traffic2 = factor(ifelse(cuts$traffic <= 2, 'low', 'high'))
cuts.sum4 = ddply(cuts, c('vehicleStatus', 'traffic2'), summarise, mean = mean(cutoff2) * 100, se = std.error(cutoff2) * 100)
pd <- position_dodge(0.16)
ggplot(cuts.sum4, aes(x = factor(vehicleStatus), y = mean, color = traffic2, group = traffic2)) + geom_line(position = pd) + geom_errorbar(aes(ymax = mean + se, ymin = mean - se), color = 'black', width = .15, position = pd) + geom_point(position = pd) + scale_x_discrete(name = 'Vehicle Value') + scale_y_continuous(name = 'Cut Likelihood (%)', limits = c(0, 100), breaks = seq(0, 100, 20)) + scale_color_manual(name = 'Traffic', labels = c('Heavy', 'Light'), values = c('gray60', 'black')) + theme(legend.position = c(.86, .807), legend.background = element_rect(fill = "transparent"), legend.key = element_rect(fill = "transparent", color = "transparent"), legend.title.align = .5)
# Create png, pdf, and jpeg versions of the figure.
ggsave(filename = 'figure4.png', width = 3.5, height = 3, dpi = 400)
ggsave(filename = 'figure4.pdf', width = 3.5, height = 3)
ggsave(filename = 'figure4.jpeg', width = 3.5, height = 3, dpi = 1000, units = "in")
# Same figure as above, but use colors and classic theme.
ggplot(cuts.sum4, aes(x = factor(vehicleStatus), y = mean, color = traffic2, group = traffic2)) + geom_line(position = pd) + geom_errorbar(aes(ymax = mean + se, ymin = mean - se), color = 'black', width = .15, position = pd) + geom_point(position = pd) + scale_x_discrete(name = 'Vehicle Value') + scale_y_continuous(name = 'Cut Likelihood (%)', limits = c(0, 100), breaks = seq(0, 100, 20)) + scale_color_manual(name = 'Traffic', labels = c('Heavy', 'Light'), values = c('firebrick', 'deepskyblue')) + theme_classic() + theme(legend.position = c(.85, .80), legend.background = element_rect(fill = "transparent"), legend.key = element_rect(fill = "transparent", color = "transparent"), legend.title.align = .5)
ggsave(filename = "figure4_color.svg", width = 3.5, height = 3)
ggsave(filename = "figure4_color.pdf", width = 3.5, height = 3)
ggsave(filename = "figure4_color.png", width = 3.5, height = 2.6, dpi = 400)
# Make EPS version of figure.
loadfonts(device = "postscript")
postscript(file = "Fig4.eps", width = 3.5, height = 2.6, paper = "special", horizontal = FALSE, onefile = FALSE, fonts = "Arial")
ggplot(cuts.sum4, aes(x = factor(vehicleStatus), y = mean, color = traffic2, group = traffic2)) + geom_line(position = pd) + geom_errorbar(aes(ymax = mean + se, ymin = mean - se), color = 'black', width = .15, position = pd) + geom_point(position = pd) + scale_x_discrete(name = 'Vehicle Value') + scale_y_continuous(name = 'Cut Likelihood (%)', limits = c(0, 100), breaks = seq(0, 100, 20)) + scale_color_manual(name = 'Traffic', labels = c('Heavy', 'Light'), values = c('firebrick', 'deepskyblue')) + theme_classic() + theme(legend.position = c(.85, .80), legend.background = element_rect(fill = "transparent"), legend.key = element_rect(fill = "transparent", color = "transparent"), legend.title.align = .5)
dev.off()
# Embed fonts
embed_fonts("./Fig4.eps", outfile = "./Fig4-embed.eps", options = "-dEPSCrop")
# Figure of vehicleStatus predicting proportion of cuts.
cuts.sum1 = ddply(cuts, 'vehicleStatus', summarise, mean = mean(cutoff2), se = std.error(cutoff2))
cuts.sum1$group = 'group'
ggplot(cuts.sum1, aes(x = factor(vehicleStatus), y = mean, group = group)) + geom_line() + geom_point() + geom_errorbar(aes(ymax = mean + se, ymin = mean - se), width = .15) + scale_x_discrete(name = 'Vehicle Status') + scale_y_continuous(name = 'P(Cut)', limits = c(0, 1), breaks = seq(0, 1, .2))
# Make png and pdf of figure
ggsave(filename = 'vehicleStatus.png', width = 6, height = 4.5, dpi = 400)
ggsave(filename = 'vehicleStatus.pdf', width = 6, height = 4.5)
# Figure of traffic predicting proportion of cuts.
cuts.sum7 = ddply(cuts, 'traffic', summarise, mean = mean(cutoff2) * 100, se = std.error(cutoff2) * 100, n = length(cutoff2))
cuts.sum7$group = 'group'
ggplot(cuts.sum7, aes(x = factor(traffic), y = mean, group = group)) + geom_line() + geom_point() + geom_errorbar(aes(ymax = mean + se, ymin = mean - se), width = .15) + scale_x_discrete(name = 'Traffic') + scale_y_continuous(name = 'Cut Likelihood (%)', limits = c(0, 100)) + geom_text(aes(label = n, y = mean + se + .06))
# Make png and pdf of figure.
ggsave(filename = 'traffic.png', width = 6, height = 4.5, dpi = 400)
ggsave(filename = 'traffic.pdf', width = 6, height = 4.5)
# Piff's low and high cut numbers: 10-30
# so vehicleStatus 1=10, 2-15, 3=20, 4=25, 5=30
# Make mine like this to center in figure better: 1=30, 2=35, 3=40, 4=45, 5=50
# For no interaction figure, just add/subtract 10 for each value
# Make a prediction figure for no interaction
noInter = data.frame(vehicleStatus = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), traffic = c('high', 'low', 'high', 'low', 'high', 'low', 'high', 'low', 'high', 'low'), mean = c(40, 20, 46, 26, 52, 32, 58, 38, 64, 44))
pred1 = ggplot(noInter, aes(x = factor(vehicleStatus), y = mean, color = traffic, group = traffic)) + geom_line() + geom_point() + scale_x_discrete(name = 'Driver SES') + scale_y_continuous(name = 'Cut Likelihood', limits = c(0, 100), breaks = seq(0, 100, 20)) + scale_color_manual(name = 'Traffic', labels = c('Heavy', 'Light'), values = c('firebrick', 'deepskyblue')) + theme_classic() + theme(axis.text.y = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank()) + ggtitle("Ethics")
# Make interaction prediction figure
inter = data.frame(vehicleStatus = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), traffic = c('high', 'low', 'high', 'low', 'high', 'low', 'high', 'low', 'high', 'low'), mean = c(30, 30, 40, 32, 50, 34, 60, 36, 70, 38))
pred2 = ggplot(inter, aes(x = factor(vehicleStatus), y = mean, color = traffic, group = traffic)) + geom_line() + geom_point() + scale_x_discrete(name = 'Driver SES') + scale_y_continuous(name = 'Cut Likelihood', limits = c(0, 100), breaks = seq(0, 100, 20)) + scale_color_manual(name = 'Traffic', labels = c('Heavy', 'Light'), values = c('firebrick', 'deepskyblue')) + theme_classic() + theme(axis.text.y = element_blank(), axis.text.x = element_blank(), legend.position = 'none') + ggtitle("Attention")
# Combine into 1 x 2 plot
pdf('figure1.pdf', width = 7.25, height = 2.3)
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 200)))
vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)
print(pred2, vp = vplayout(1, 1:87))
print(pred1, vp = vplayout(1, 88:200))
dev.off()
# Same as above but jpeg format
jpeg('figure1.jpeg', width = 7.25, height = 2.3, units = "in", quality = 100, res = 1000)
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 200)))
vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)
print(pred2, vp = vplayout(1, 1:87))
print(pred1, vp = vplayout(1, 88:200))
dev.off()
# SVG format
svg('figure1.svg', width = 7.25, height = 2.3)
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 200)))
vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)
print(pred2, vp = vplayout(1, 1:87))
print(pred1, vp = vplayout(1, 88:200))
dev.off()
# EPS format
postscript(file = "Fig1.eps", width = 7.25, height = 2.3, paper = "special", horizontal = FALSE, onefile = FALSE, fonts = "Arial")
grid.newpage()
pushViewport(viewport(layout = grid.layout(1, 200)))
vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)
print(pred2, vp = vplayout(1, 1:87))
print(pred1, vp = vplayout(1, 88:200))
dev.off()
# Embed fonts
embed_fonts("./Fig1.eps", outfile = "./Fig1-embed.eps", options = "-dEPSCrop")
################################################################
# FIGURES/TABLES SUMMARIZING THE DATA
################################################################
# Amount of data at each level of status x traffic.
xtabs(~ vehicleStatus + traffic, cuts)
# Counts and proportions of data based on sex.
xtabs(~ driverSex, cuts)
xtabs(~ driverSex, cuts) / length(cuts$driverSex)
# Counts and proportions of data based on age
xtabs(~ driverAge, cuts)
xtabs(~ driverAge, cuts) / length(cuts$driverAge)
# Set font sizes for figures
fontSize = 3
# Frequency distribution of vehicle status. Shows a median vehicle status of 3 (n = 220).
figA = ggplot(cuts, aes(x = factor(vehicleStatus))) + geom_histogram(binwidth = 1, color = 'black', fill = 'gray50', width = 1) + geom_text(size = fontSize, stat = 'bin', binwidth = 1, aes(y = ..count.., label = sprintf("%.1f%%", ..count.. / 759 * 100)), vjust = -.5) + scale_x_discrete(name = 'Vehicle Value') + scale_y_continuous(name = 'Frequency', limits = c(0, 250), breaks = seq(0, 250, 50)) + ggtitle('A.') + theme_classic()
# Frequency distribution of traffic. Shows a median of 2 (n = 257) with decreasing amounts of data after that. Maximum traffic for these data was 6 (n = 3).
figB = ggplot(cuts, aes(x = factor(traffic))) + geom_histogram(binwidth = 1, color = 'black', fill = 'gray50', width = 1) + geom_text(size = fontSize, stat = 'bin', binwidth = 1, aes(y = ..count.., label = sprintf("%.1f%%", ..count.. / 759 * 100)), vjust = -.5) + scale_x_discrete(name = 'Traffic') + scale_y_continuous(name = 'Frequency', limits = c(0, 290), breaks = seq(0, 260, 50)) + ggtitle('B.') + theme_classic()
# Frequency distribution of driverSex. Shows 39% of drivers were female, 61% were male.
figC = ggplot(cuts, aes(x = driverSex)) + geom_histogram(binwidth = 1, color = 'black', fill = 'gray50', width = 1) + geom_text(size = fontSize, stat = 'bin', binwidth = 1, aes(y = ..count.., label = sprintf("%.1f%%", ..count.. / 759 * 100)), vjust = -.5) + scale_x_discrete(name = 'Driver Sex', labels = c('Female', 'Male')) + scale_y_continuous(name = 'Frequency', limits = c(0, 520), breaks = seq(0, 500, 100)) + ggtitle('C.') + theme_classic()
# Frequency distribution of driverAge. The majority of drivers (n = 407 were perceived to be between 16-35 years old.
figD = ggplot(cuts, aes(x = factor(driverAge))) + geom_histogram(binwidth = 1, color = 'black', fill = 'gray50', width = 1) + geom_text(size = fontSize, stat = 'bin', binwidth = 1, aes(y = ..count.., label = sprintf("%.1f%%", ..count.. / 759 * 100)), vjust = -.5) + scale_x_discrete(name = 'Driver Age (Years)', labels = c('16-35', '36-55', '56+')) + scale_y_continuous(name = 'Frequency', limits = c(0, 460), breaks = seq(0, 420, 100)) + ggtitle('D.') + theme_classic()
# Frequency distribution of time of day. Shows a median from 18:00-18:59 (6-6:59pm), with most data collected between 12:00 and 19:00.
cuts$time3 = as.numeric(cuts$time2)
figE = ggplot(cuts, aes(x = time3)) + geom_histogram(binwidth = 1, color = 'black', fill = 'gray50') + scale_x_continuous(name = 'Time of Day (24-Hour Clock)', limits = c(0, 23), breaks = seq(0, 23, 1)) + scale_y_continuous(name = 'Frequency', limits = c(0, 120), breaks = seq(0, 120, 20)) + ggtitle('E.') + theme_classic()
# Arrange all figures in same image file.
pdf('figure3.pdf', width = 7.25, height = 6)
grid.newpage()
pushViewport(viewport(layout = grid.layout(3, 2)))
print(figA, vp = vplayout(1, 1))
print(figB, vp = vplayout(1, 2))
print(figC, vp = vplayout(2, 1))
print(figD, vp = vplayout(2, 2))
print(figE, vp = vplayout(3, 1:2))
dev.off()
# JPG version of above image
jpeg('figure3.jpeg', width = 7.25, height = 6, units = "in", quality = 100, res = 1000)
grid.newpage()
pushViewport(viewport(layout = grid.layout(3, 2)))
print(figA, vp = vplayout(1, 1))
print(figB, vp = vplayout(1, 2))
print(figC, vp = vplayout(2, 1))
print(figD, vp = vplayout(2, 2))
print(figE, vp = vplayout(3, 1:2))
dev.off()
# SVG of above image
svg('figure3.svg', width = 7.25, height = 6)
grid.newpage()
pushViewport(viewport(layout = grid.layout(3, 2)))
print(figA, vp = vplayout(1, 1))
print(figB, vp = vplayout(1, 2))
print(figC, vp = vplayout(2, 1))
print(figD, vp = vplayout(2, 2))
print(figE, vp = vplayout(3, 1:2))
dev.off()
# EPS of above image
postscript(file = "Fig3.eps", width = 7.25, height = 6, paper = "special", horizontal = FALSE, onefile = FALSE, fonts = "Arial")
grid.newpage()
pushViewport(viewport(layout = grid.layout(3, 2)))
print(figA, vp = vplayout(1, 1))
print(figB, vp = vplayout(1, 2))
print(figC, vp = vplayout(2, 1))
print(figD, vp = vplayout(2, 2))
print(figE, vp = vplayout(3, 1:2))
dev.off()
# Embed fonts
embed_fonts("./Fig3.eps", outfile = "./Fig3-embed.eps", options = "-dEPSCrop")
################################################################
# MIXED MODEL WITH RANDOM EFFECTS FOR RATERS & INTERSECTIONS
################################################################
# Convert time2 to numeric
cuts$time2 <- as.numeric(cuts$time2)
# Make subset with only those variables that will be used in model.
cuts2 <- select(cuts, rater, interID, vehicleStatus, traffic, driverSex, driverAge, time2, cutoff)
# Write cuts2 to disk for later use in predictive models.
write.table(cuts2, file = "cleaned_up_for_prediction.txt", sep = "\t", row.names = FALSE)
# Convert continuous predictors to z-scores using scale(). This is done in an effort to get rid of warnings when I try to fit a glmer model below.
cuts2$vehicleStatus <- scale(cuts2$vehicleStatus)
cuts2$traffic <- scale(cuts2$traffic)
cuts2$driverAge <- scale(cuts2$driverAge)
cuts2$time2 <- scale(cuts2$time2)
# Adjust contrasts to balance driverSex. Male is the default, so the model coefficient will indicate whether males are more/less likely to cut than females.
xtabs(~ driverSex, cuts2) / length(cuts2$rater)
contrasts(cuts2$driverSex) = cbind("Male" = c(-.61, .39))
# Model with maximal random effects structure. Throws multiple convergence warnings.
cuts2.glmer1 <- glmer(cutoff == "yes" ~ vehicleStatus + traffic + vehicleStatus:traffic + driverSex + driverAge + time2 + (1 + (vehicleStatus + traffic + vehicleStatus:traffic + driverSex + driverAge + time2)|rater) + (1 + (vehicleStatus + traffic + vehicleStatus:traffic + driverSex + driverAge + time2)|interID), data = cuts2, family = "binomial")
# Restart from previous fit, but now bumping up the maximum number of iterations and using the bobyqa optimizer for both phases.
ss <- getME(cuts2.glmer1, c("theta","fixef"))
cuts2.glmer2 <- update(cuts2.glmer1, start = ss, control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e6)))
# Calculate 95% CIs for fixed effects parameters using the Wald method.
ci.wald <- confint(cuts2.glmer2, level = 0.95, method = "Wald", .progress = "txt")
# Print variance-covariance matrix. This is required for submission to some journals.
vcov(cuts2.glmer2)