-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_clean_data.r
375 lines (314 loc) · 13.7 KB
/
import_clean_data.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# In this project code scaffolding will only be given for functions that were not
# explained in the prerequisite courses. Look at the hints if you need help.
# Load the packages
library(readr, quietly = TRUE)
library(dplyr, quietly = TRUE)
# Import the match summary CSV from the datasets folder (no column types
# are specified at this stage)
wwc_raw <- read_csv(file = "datasets/2019_WWCFIFA_summary.csv")

# Inspect the raw import: column overview, per-column summaries, structure
glimpse(wwc_raw)
summary(wwc_raw)
str(wwc_raw)
# These packages need to be loaded in the first @tests cell
library(testthat)
library(IRkernel.testthat)

# Reference import that the student's `wwc_raw` is compared against
soln_wwc_raw <- read_csv(file = "datasets/2019_WWCFIFA_summary.csv")

run_tests({
  # readr and dplyr must both be attached
  test_that("the correct package is loaded", {
    attached_pkgs <- .packages()
    expect_true(
      "readr" %in% attached_pkgs,
      info = "Did you load the readr package?"
    )
    expect_true(
      "dplyr" %in% attached_pkgs,
      info = "Did you load the dplyr package?"
    )
  })

  # Column names are checked first, then the full contents
  test_that("the dataset is loaded correctly", {
    load_msg <- "It does not look like the dataset was loaded correctly. Check the hint if you are unsure of the problem"
    expect_identical(
      colnames(wwc_raw),
      colnames(soln_wwc_raw),
      info = load_msg
    )
    expect_equal(
      wwc_raw,
      soln_wwc_raw,
      info = load_msg
    )
  })
})
# Re-import the CSV with explicit types for three columns: Round and Venue
# are read as factors, Date is parsed from month/day/two-digit-year text
wwc_raw <- read_csv(
  file = "datasets/2019_WWCFIFA_summary.csv",
  col_types = cols(
    Round = col_factor(),
    Date = col_date(format = "%m/%d/%y"),
    Venue = col_factor()
  )
)

# Column overview and per-column summaries of the typed import
glimpse(wwc_raw)
summary(wwc_raw)

# Show the dataset itself
wwc_raw
# Reference import using the same column specification as the exercise
soln_wwc_raw <- read_csv(
  file = "datasets/2019_WWCFIFA_summary.csv",
  col_types = cols(
    Round = col_factor(),
    Date = col_date(format = "%m/%d/%y"),
    Venue = col_factor()
  )
)

run_tests({
  # Column names are checked first, then the full contents
  test_that("the dataset is loaded correctly", {
    load_msg <- "It does not look like the dataset was loaded correctly. Check the hint if you are unsure of the problem"
    expect_identical(
      colnames(wwc_raw),
      colnames(soln_wwc_raw),
      info = load_msg
    )
    expect_equal(
      wwc_raw,
      soln_wwc_raw,
      info = load_msg
    )
  })
})
# Attach tidyr
library(tidyr)

# Lower-case every column name, then keep only the rows whose round is
# not missing
wwc_1 <- wwc_raw %>%
  rename_all(tolower) %>%
  filter(!is.na(round))

# Inspect the structure and the first and last 10 rows
str(wwc_1)
head(wwc_1, n = 10L)
tail(wwc_1, n = 10L)
# Reference solution: lower-case the column names and drop the rows whose
# round is missing
soln_wwc_1 <- soln_wwc_raw %>%
  rename_all(tolower) %>%
  filter(!is.na(round))

run_tests({
  test_that("the tidyr package is loaded", {
    expect_true(
      "tidyr" %in% .packages(),
      info = "Did you load the tidyr package?"
    )
  })

  # The label names the object under test so a failure report is unambiguous
  test_that("wwc_1 is correct", {
    expect_equal(
      wwc_1,
      soln_wwc_1,
      info = "wwc_1 is not correct. Check the hint for possible solutions."
    )
  })
})
# Housekeeping: continue on a copy of wwc_1
wwc_2 <- wwc_1

# Find the rows with a missing date and fill them in. The computed index and
# the column name are used instead of fixed row/column positions, so the fix
# keeps working if rows or columns are reordered.
index_date <- which(is.na(wwc_2$date))
wwc_2$date[index_date] <- as.Date("2019-06-09")

# Find the rows with a missing venue and fill them in.
# NOTE(review): venue is read as a factor, so "Groupama Stadium" is presumably
# already one of its levels -- confirm against the data.
index_venue <- which(is.na(wwc_2$venue))
wwc_2$venue[index_venue] <- "Groupama Stadium"
# Housekeeping: the reference solution continues on a copy of soln_wwc_1
soln_wwc_2 <- soln_wwc_1

# Find and replace NA in column date
soln_index_date <- which(is.na(soln_wwc_2$date))
soln_wwc_2$date[soln_index_date] <- as.Date("2019-06-09")

# Find and replace NA in column venue
soln_index_venue <- which(is.na(soln_wwc_2$venue))
soln_wwc_2$venue[soln_index_venue] <- "Groupama Stadium"

run_tests({
  # Labels and messages use the object names the exercise actually defines
  # (index_date / index_venue)
  test_that("index_date and index_venue are correct", {
    expect_equal(
      index_date,
      soln_index_date,
      info = "index_date is not correct. Check the hint to make sure you used which() and is.na() correctly."
    )
    expect_equal(
      index_venue,
      soln_index_venue,
      info = "index_venue is not correct. Check the hint to make sure you used which() and is.na() correctly."
    )
  })

  test_that("wwc_2 is correct", {
    expect_equal(
      wwc_2,
      soln_wwc_2,
      info = "wwc_2 is not correct. Check the hint to make sure you filled in the NAs correctly."
    )
  })
})
# Split the "score" and "pks" columns on "-" into home/away parts (with type
# conversion), then replace missing penalty-kick values with 0
wwc_3 <- wwc_2 %>%
  separate(
    score,
    into = c("home_score", "away_score"),
    sep = "-",
    convert = TRUE
  ) %>%
  separate(
    pks,
    into = c("home_pks", "away_pks"),
    sep = "-",
    convert = TRUE
  ) %>%
  mutate(
    home_pks = replace_na(home_pks, 0),
    away_pks = replace_na(away_pks, 0)
  )

# Show the result and the distinct home/away scores
wwc_3
unique(wwc_3$home_score)
unique(wwc_3$away_score)
# Reference solution: split score and pks into home/away columns and replace
# missing penalty-kick values with 0
soln_wwc_3 <- soln_wwc_2 %>%
  separate(score, c("home_score", "away_score"), sep = "-", convert = TRUE) %>%
  separate(pks, c("home_pks", "away_pks"), sep = "-", convert = TRUE) %>%
  mutate(
    home_pks = replace_na(home_pks, 0),
    away_pks = replace_na(away_pks, 0)
  )

run_tests({
  # NOTE(review): this test expects home_score to be double while the next one
  # expects away_score to be integer, although both come from the same
  # separate(..., convert = TRUE) call -- confirm which type is intended.
  test_that("home_score was created and is type double", {
    expect_true(
      "home_score" %in% names(wwc_3),
      info = "It looks like home_score is not a column name. Please check the hint."
    )
    expect_type(wwc_3$home_score, "double")
    expect_equal(
      sum(wwc_3$home_score),
      sum(soln_wwc_3$home_score),
      info = "home_score does not sum to the correct value. Please check the hint."
    )
  })

  test_that("away_score was created and is type integer", {
    expect_true(
      "away_score" %in% names(wwc_3),
      info = "It looks like away_score is not a column name. Please check the hint."
    )
    # is.integer() always yields a single TRUE/FALSE, unlike comparing the
    # class vector with ==
    expect_true(
      is.integer(wwc_3$away_score),
      info = "away_score should be an integer.\nNo need to call replace_na() on away_score and home_score.\n Please check the hint."
    )
    expect_equal(
      sum(wwc_3$away_score),
      sum(soln_wwc_3$away_score),
      info = "away_score does not sum to the correct value. Please check the hint."
    )
  })

  test_that("home_pks was created and is type double", {
    expect_true(
      "home_pks" %in% names(wwc_3),
      info = "It looks like home_pks is not a column name. Please check the hint."
    )
    expect_type(wwc_3$home_pks, "double")
    expect_equal(
      sum(wwc_3$home_pks),
      sum(soln_wwc_3$home_pks),
      info = "home_pks does not sum to the correct value. Please check the hint."
    )
  })

  test_that("away_pks was created and is type double", {
    expect_true(
      "away_pks" %in% names(wwc_3),
      info = "It looks like away_pks is not a column name. Please check the hint."
    )
    expect_type(wwc_3$away_pks, "double")
    expect_equal(
      sum(wwc_3$away_pks),
      sum(soln_wwc_3$away_pks),
      info = "away_pks does not sum to the correct value. Please check the hint."
    )
  })
})
# Housekeeping: set the notebook plot size
options(repr.plot.width = 6, repr.plot.height = 4)

# Attach ggplot2
library(ggplot2)

# Boxplot of attendance per venue with the individual matches overlaid as
# small red jittered points; venue labels are rotated 90 degrees
ggplot(data = wwc_3, mapping = aes(x = venue, y = attendance)) +
  geom_boxplot() +
  geom_jitter(color = "red", size = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Keep a handle on the plot just built so the tests can inspect it
p <- last_plot()
# Reference plot built from the solution data
soln_p <- ggplot(data = soln_wwc_3, mapping = aes(x = venue, y = attendance)) +
  geom_boxplot() +
  geom_jitter(color = "red", size = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

run_tests({
  test_that("The ggplot2 package is loaded", {
    expect_true(
      "ggplot2" %in% .packages(),
      info = "Did you load the ggplot2 package?"
    )
  })

  # Compare the x/y mappings and the class of the first layer's geom
  test_that("the plot is correct", {
    expect_identical(
      deparse(p$mapping$x),
      deparse(soln_p$mapping$x),
      info = "The x aesthetic is incorrect. Did you map it to venue? Please check the hint for a possible solution."
    )
    expect_identical(
      deparse(p$mapping$y),
      deparse(soln_p$mapping$y),
      info = "The y aesthetic is incorrect. Did you map it to attendance? Please check the hint for a possible solution."
    )
    expect_identical(
      class(p$layers[[1]]$geom)[1],
      class(soln_p$layers[[1]]$geom)[1],
      info = "The boxplot geometry is missing. Please check the hint for a possible solution."
    )
  })
})
# Summarize the number of games, and min and max attendance for each venue
wwc_3 %>%
  group_by(venue) %>%
  summarize(
    n_games = n(),
    min_attend = min(attendance),
    max_attend = max(attendance)
  )

# Correct the outlier: the largest attendance value is replaced by one tenth
# of itself (presumably a data-entry error with an extra digit -- verify
# against the summary printed above)
wwc_4 <- wwc_3 %>%
  mutate(
    attendance = replace(
      attendance,
      which(attendance == max(attendance)),
      max(attendance) / 10
    )
  )

# Recompute the per-venue summary on the corrected data; the column names
# match the summary above
wwc_venue_summary <- wwc_4 %>%
  group_by(venue) %>%
  summarize(
    n_games = n(),
    min_attend = min(attendance),
    max_attend = max(attendance)
  )

# Print the updated summary table
wwc_venue_summary
# Reference solution: replace the erroneous attendance of 579000 with 57900
soln_wwc_4 <- soln_wwc_3 %>%
  mutate(attendance = replace(attendance, which(attendance == 579000), 57900))

# Reference per-venue summary: number of games, min and max attendance
soln_wwc_venue_summary <- soln_wwc_4 %>%
  group_by(venue) %>%
  summarize(
    nb_of_games = n(),
    min_attendance = min(attendance),
    max_attendance = max(attendance)
  )

run_tests({
  test_that("wwc_4 was correctly updated", {
    expect_equal(
      sum(wwc_4$attendance),
      sum(soln_wwc_4$attendance),
      info = "The attendance column of wwc_4 is not correct. Please check the hint."
    )
  })

  test_that("wwc_venue_summary is correct", {
    expect_equal(
      ncol(wwc_venue_summary),
      4,
      info = "wwc_venue_summary does not have the correct number of columns. Please check the hint."
    )
    # The student's column names are free-form, so the max-attendance column
    # is taken by position (venue, games, min, max) and compared with the
    # reference column by name. Both sides must refer to columns that exist,
    # otherwise max() is taken over NULL on each side and the check cannot
    # fail.
    expect_equal(
      max(wwc_venue_summary[[4]]),
      max(soln_wwc_venue_summary$max_attendance),
      info = "The maximum attendance in wwc_venue_summary is not correct. Please check the hint."
    )
  })
})
# Housekeeping for plot size
options(repr.plot.width = 6, repr.plot.height = 4)

# Prettier boxplot of attendance data by venue: venues are ordered with
# forcats::fct_reorder(venue, attendance), individual matches are overlaid as
# red jittered points, and the axes are flipped so venue names read
# horizontally
wwc_4 %>%
  ggplot(aes(x = forcats::fct_reorder(venue, attendance), y = attendance)) +
  geom_boxplot() +
  geom_jitter(color = "red", size = 0.5) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution of attendance by stadium",
    subtitle = "2019 FIFA Women's World Cup",
    x = "Stadium",
    y = "Attendance"
  )

# Keep a handle on the plot just built so the tests can inspect it
p <- last_plot()
# Reference version of the prettier boxplot, built from the solution data
soln_4_bp <- soln_wwc_4 %>%
  ggplot(aes(forcats::fct_reorder(venue, attendance), attendance)) +
  geom_boxplot() +
  geom_jitter(color = "red", size = 0.5) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution of attendance by stadium",
    subtitle = "2019 FIFA Women's World Cup",
    x = "Stadium",
    y = "Attendance"
  )

run_tests({
  # All comparisons are made against this task's own reference plot
  # (soln_4_bp): the y mapping, then the geom class of layers 1 and 2
  test_that("the plot is correct", {
    expect_identical(
      deparse(p$mapping$y),
      deparse(soln_4_bp$mapping$y),
      info = "The y aesthetic is incorrect. Did you map it to attendance? Please check the hint."
    )
    expect_identical(
      class(p$layers[[1]]$geom)[1],
      class(soln_4_bp$layers[[1]]$geom)[1],
      info = "The boxplot geometry is missing. Please check the hint for a possible solution."
    )
    expect_identical(
      class(p$layers[[2]]$geom)[1],
      class(soln_4_bp$layers[[2]]$geom)[1],
      info = "The point geometry is missing. Please check the hint for a possible solution."
    )
  })
})
# Housekeeping: set the notebook plot size
options(repr.plot.width = 6, repr.plot.height = 4)

# Attendance over time, one semi-transparent line per venue, with the legend
# placed at the bottom in three rows and no legend title
wwc_4 %>%
  ggplot(mapping = aes(x = date, y = attendance, color = venue)) +
  geom_line(alpha = 0.5, size = 1.75) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 8)
  ) +
  guides(col = guide_legend(nrow = 3)) +
  labs(
    title = "Stadium attendance during the tournament",
    subtitle = "2019 FIFA Women's World Cup",
    x = "Date",
    y = "Attendance",
    color = ""
  )

# Keep a handle on the plot just built so the tests can inspect it
p <- last_plot()
# Reference line plot built from the solution data
soln_4_lp <- soln_wwc_4 %>%
  ggplot(mapping = aes(x = date, y = attendance, color = venue)) +
  geom_line() +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 8)
  ) +
  guides(col = guide_legend(nrow = 3)) +
  labs(
    title = "Stadium attendance during the tournament",
    subtitle = "2019 FIFA Women's World Cup",
    x = "Date",
    y = "Attendance",
    color = ""
  )

run_tests({
  # Compare the x, y and colour mappings and the class of the first layer's
  # geom with the reference plot
  test_that("the plot is correct", {
    expect_identical(
      deparse(p$mapping$x),
      deparse(soln_4_lp$mapping$x),
      info = "The x aesthetic is incorrect. Did you map it to date? Please check the hint for a possible solution."
    )
    expect_identical(
      deparse(p$mapping$y),
      deparse(soln_4_lp$mapping$y),
      info = "The y aesthetic is incorrect. Did you map it to attendance? Please check the hint for a possible solution."
    )
    expect_identical(
      deparse(p$mapping$colour),
      deparse(soln_4_lp$mapping$colour),
      info = "The color aesthetic is incorrect. Did you map it to venue? Please check the hint for a possible solution."
    )
    expect_identical(
      class(p$layers[[1]]$geom)[1],
      class(soln_4_lp$layers[[1]]$geom)[1],
      info = "The line geometry is missing. Please check the hint for a possible solution."
    )
  })
})
# Which match had the highest attendance?
# A: wk = SMIF, England vs. USA
# B: wk = FIN, USA vs. Netherlands
# C: wk = SMIF, Netherlands vs. Sweden
ans_1 <- "B"

# In which stadium was the match with the highest attendance played?
# A: Groupama Stadium
# B: Parc des Princes
# C: Stade des Alpes
ans_2 <- "A"
run_tests({
  # Each multiple-choice answer is compared with its expected letter
  test_that("ans_1 is correct", {
    expect_true(
      ans_1 == "B",
      info = "The correct answer is B, the Final between the USA and the Netherlands had the greatest attendance."
    )
  })

  test_that("ans_2 is correct", {
    expect_true(
      ans_2 == "A",
      info = "The correct answer is A, Groupama Stadium."
    )
  })
})