-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathtidying_automation_homework_key.R
123 lines (90 loc) · 4.64 KB
/
tidying_automation_homework_key.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#PSYC 259 Homework 3 - Data Tidying and Automation
#For full credit, provide answers for at least 6/9 questions
#List names of students collaborating with:
### SETUP: RUN THIS BEFORE STARTING ----------
# Install tidyverse only when it is missing; an unconditional install.packages()
# call re-downloads and reinstalls the package every time the script is run
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

# Load packages
library(tidyverse)

# URLs of the three raw csv files (one file per film title)
paths <- c(
  "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Fellowship_Of_The_Ring.csv",
  "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Two_Towers.csv",
  "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Return_Of_The_King.csv"
)

# Read data
# Each dataset has the words spoken by male/female characters in the LOTR trilogy by race (elf, hobbit, or human)
ds1 <- read_csv(paths[1])
ds2 <- read_csv(paths[2])
ds3 <- read_csv(paths[3])

# Stack the three tibbles row-wise into a single dataset
ds_combined <- bind_rows(ds1, ds2, ds3)
### Question 1 ----------
#For this assignment, you created a fork from the Github repo and cloned your own copy
#As you work on the assignment, make commits and push the changes to your own repository.
#Make your repository public and paste the link here:
#ANSWER
#YOUR GITHUB LINK:
### Question 2 ----------
#Use a for loop with paths to read the data in to a new tibble "ds_loop" so that the data are combined into a single dataset
#(Yes, Vroom does this automatically but practice doing it with a loop)
#If you did this correctly, it should look the same as ds_combined created above
#ANSWER
# Read every file into a pre-sized list, then bind once after the loop.
# This avoids downloading paths[1] an extra time just to build an empty
# template, and avoids re-copying a growing tibble on every iteration.
ds_list <- vector("list", length(paths))
for (i in seq_along(paths)) {
  ds_list[[i]] <- read_csv(paths[[i]])
}
ds_loop <- bind_rows(ds_list)

# This key previously stored the result as "ds_loops" (the question asks for
# "ds_loop"); keep the old name available so code written against it still runs
ds_loops <- ds_loop
### Question 3 ----------
#Use map with paths to read in the data to a single tibble called ds_map
#If you did this correctly, it should look the same as ds_combined created above
#ANSWER
# Read each path into its own tibble with map(), then row-bind the resulting list
ds_map <- paths %>%
  map(read_csv) %>%
  bind_rows()
### Question 4 ----------
#The data are in a wider-than-ideal format.
#Use pivot_longer to reshape the data so that sex is a column with values male/female and words is a column
#Use ds_combined or one of the ones you created in Question 2 or 3, and save the output to ds_longer
#ANSWER
# Stack the Male and Female count columns: the old column name goes to "Sex"
# and the count goes to "Words"
ds_longer <- ds_combined %>%
  pivot_longer(
    cols = Male:Female,
    names_to = "Sex",
    values_to = "Words"
  )
### Question 5 ----------
#It's helpful to know how many words were spoken, but each book was a different length
#The tibble below contains the total number of words in each book (make sure to run those lines so that it appears in your environment)
#Merge it into ds_longer and then create a new column that expresses the words spoken as a percentage of the total
# Lookup table of total word counts, keyed by the Film title used in ds_longer
total_words <- tibble(
  Film = c("The Fellowship Of The Ring", "The Two Towers", "The Return Of The King"),
  Total = c(177277, 143436, 134462)
)

#ANSWER
# Drop Total/Percent if an earlier run of this step already added them; without
# this, re-running the join yields Total.x/Total.y and mutate() cannot find Total
ds_longer <- ds_longer %>%
  select(-any_of(c("Total", "Percent"))) %>%
  left_join(total_words, by = "Film") %>%
  mutate(Percent = Words / Total * 100)
### Question 6 ----------
#The function below creates a graph to compare the words spoken by race/sex for a single film
#The input for the function is a tibble that contains only a single film
#Write a for loop that iterates through the film names to apply the function to a subset of ds_longer (each film)
#Run the whole function definition below (it should show in your environment after running)
# Draws and prints a dodged bar chart of Words by Race, filled by Sex.
#   df: tibble with columns Film, Race, Sex and Words; intended to hold the
#       rows of a single film
# Returns the value of print(p), so callers that collect results still get one
# value per call.
words_graph <- function(df) {
  # ggtitle() should receive one string, not the whole Film column (one label
  # per row): collapse the distinct film names, which is a single name for the
  # intended one-film input
  plot_title <- paste(unique(df$Film), collapse = ", ")
  p <- ggplot(df, aes(x = Race, y = Words, fill = Sex)) +
    geom_col(position = "dodge") + # same bars as geom_bar(stat = "identity")
    ggtitle(plot_title) +
    theme_minimal()
  print(p)
}
#ANSWER
# One plot per film: subset ds_longer to each distinct title and plot that subset
films <- unique(ds_longer$Film) #Or write them all out
for (i in seq_along(films)) {
  film_rows <- filter(ds_longer, Film == films[[i]])
  words_graph(film_rows)
}
### Question 7 ----------
#Apply the words_graph function again, but this time
#use split and map to apply the function to each film separately
#ANSWER
# Split ds_longer into one tibble per film and plot each one.
# words_graph() already prints its plot, so walk() (purrr's map() variant for
# side effects) is used here: a bare top-level map() call would also auto-print
# the list it returns, rendering the output a second time.
walk(split(ds_longer, ds_longer$Film), words_graph)
### Question 8 ----------
#The PI wants a .csv file for each film with a row for male and a row for female
#and separate columns for the words spoken by each race and the percentage of words spoken by each race
#First, get the data formatted in the correct way
#From ds_longer, create a new tibble "ds_wider" that has columns for words for each race and percentage for each race
#ANSWER
# Spread Race across columns: each race gets its own Words_* and Percent_* column
ds_wider <- pivot_wider(
  ds_longer,
  names_from = Race,
  values_from = c(Words, Percent)
)
### Question 9 ---------
#Using your new "ds_wider" tibble, write the three data files using either a for loop or map
#The files should be written to "data_cleaned" and should be named by film title
#ANSWER - Map
# write_csv() does not create missing folders, so make sure the output
# directory exists before writing (no warning if it is already there)
dir.create("data_cleaned", showWarnings = FALSE)

films <- unique(ds_wider$Film)
# walk() is purrr's map() variant for side effects: it writes each file without
# returning (and auto-printing) a list of the written tibbles
walk(films, function(film_title) {
  film_rows <- filter(ds_wider, Film == film_title)
  write_csv(film_rows, file.path("data_cleaned", paste0(film_title, ".csv")))
})
#ANSWER - for loop
# write_csv() does not create missing folders, so make sure the output
# directory exists before writing (no warning if it is already there)
dir.create("data_cleaned", showWarnings = FALSE)

films <- unique(ds_wider$Film)
for (film in films) {
  # One file per film, named by the film title
  ds_wider %>%
    filter(Film == film) %>%
    write_csv(file.path("data_cleaned", paste0(film, ".csv")))
}