|
| 1 | +################################################################################ |
| 2 | +## |
| 3 | +## <PROJ> College Advising Map |
| 4 | +## <FILE> make_data.R |
| 5 | +## <AUTH> Benjamin Skinner |
| 6 | +## <INIT> September 2017 |
| 7 | +## |
| 8 | +################################################################################ |
| 9 | + |
| 10 | +## libraries |
| 11 | +libs <- c('tidyverse', 'geojsonio', 'sp', 'rgdal', 'stringr') |
| 12 | +lapply(libs, require, character.only = TRUE) |
| 13 | + |
| 14 | +## paths |
| 15 | +ddir <- '../assets/data' |
| 16 | +jdir <- '../assets/js' |
| 17 | +rdir <- file.path(ddir, '_raw') |
| 18 | + |
| 19 | +## crosswalk |
| 20 | +cw <- read_csv(file.path(ddir, 'stcrosswalk.csv')) |
| 21 | + |
| 22 | +################################################################################ |
| 23 | +## FUNCTIONS |
| 24 | +################################################################################ |
| 25 | + |
| 26 | +## slight modification of stringr::str_to_title |
| 27 | +str_to_title_mod <- function(x) { |
| 28 | + x <- str_to_title(x) |
| 29 | + ## Of --> of |
| 30 | + x <- gsub(' Of ', ' of ', x, fixed = TRUE) |
| 31 | + ## And --> and |
| 32 | + x <- gsub(' And ', ' and ', x, fixed = TRUE) |
| 33 | + ## The --> the |
| 34 | + x <- gsub(' The ', ' the ', x, fixed = TRUE) |
| 35 | + ## A --> a |
| 36 | + x <- gsub(' A ', ' a ', x, fixed = TRUE) |
| 37 | + return(x) |
| 38 | +} |
| 39 | + |
| 40 | +## write dataframe to JS array with no unnecessary whitespace |
| 41 | +writeJSArray <- function(df, array_name, vars, outfile) { |
| 42 | + |
| 43 | + df <- df[,names(df) %in% vars] |
| 44 | + ## maintain 1 index (JS is zero-index) for sanity |
| 45 | + lines <- paste0(array_name,'=[{}') |
| 46 | + |
| 47 | + for (i in 1:nrow(df)) { |
| 48 | + ## line <- paste0(array_name, '[', i, ']={') |
| 49 | + line <- paste0('{') |
| 50 | + for (v in 1:length(vars)) { |
| 51 | + if (!is.na(df[i, grep(vars[v], names(df))])) { |
| 52 | + if (v == 1) { |
| 53 | + } else { |
| 54 | + line <- paste0(line,',') |
| 55 | + } |
| 56 | + if (class(df[[vars[v]]]) == 'character') { |
| 57 | + line <- paste0(line, vars[v], ':"', |
| 58 | + df[i, grepl(vars[v], names(df))], '"') |
| 59 | + } else { |
| 60 | + line <- paste0(line, vars[v], ':', |
| 61 | + df[i, grepl(vars[v], names(df))]) |
| 62 | + } |
| 63 | + |
| 64 | + } |
| 65 | + } |
| 66 | + line <- paste0(line, '}') |
| 67 | + if (i == nrow(df)) { line <- paste0(line, '];') } |
| 68 | + lines <- c(lines,line) |
| 69 | + } |
| 70 | + writeLines(paste(lines, collapse = ','), outfile) |
| 71 | +} |
| 72 | + |
| 73 | +################################################################################ |
| 74 | +## CLEAN DATA |
| 75 | +################################################################################ |
| 76 | + |
| 77 | +## ------------------------------------- |
| 78 | +## COLLEGE |
| 79 | +## ------------------------------------- |
| 80 | + |
| 81 | +## read in school data |
| 82 | +college <- read_csv(file.path(rdir, 'HD2015.zip')) %>% |
| 83 | + setNames(tolower(names(.))) %>% |
| 84 | + select(instnm, fips, sector, lon = longitud, lat = latitude) %>% |
| 85 | + filter(sector %in% c(1,2,4,5), |
| 86 | + fips %in% cw$stfips) %>% |
| 87 | + mutate(lon = as.numeric(lon), |
| 88 | + lat = as.numeric(lat), |
| 89 | + cat = ifelse(sector == 1, 5, |
| 90 | + ifelse(sector == 2, 6, |
| 91 | + ifelse(sector == 4, 7, |
| 92 | + ifelse(sector == 5, 8, 0))))) %>% |
| 93 | + select(-sector) %>% |
| 94 | + filter(!is.na(lon), |
| 95 | + !is.na(lat)) |
| 96 | + |
| 97 | +## ------------------------------------- |
| 98 | +## HIGH SCHOOL |
| 99 | +## ------------------------------------- |
| 100 | + |
| 101 | +## read in school data |
| 102 | +hs <- read_csv(file.path(rdir, 'school_level_clean_2.csv')) %>% |
| 103 | + setNames(tolower(names(.))) %>% |
| 104 | + select(nces_id, |
| 105 | + nces_dist_id, |
| 106 | + instnm = school_name, |
| 107 | + fips = school_state_fips, |
| 108 | + lon = school_longitude, |
| 109 | + lat = school_latitude, |
| 110 | + enroltot = school_enrollment_total, |
| 111 | + frlpct = school_frl_pct, |
| 112 | + csr = school_student_counselor_ratio) %>% |
| 113 | + mutate(fips = as.integer(fips)) %>% |
| 114 | + filter(fips %in% cw$stfips) %>% |
| 115 | + mutate(instnm = str_to_title_mod(instnm), |
| 116 | + lon = as.numeric(lon), |
| 117 | + lat = as.numeric(lat)) %>% |
| 118 | + filter(!is.na(lon), |
| 119 | + !is.na(lat)) |
| 120 | + |
| 121 | +hs_imp <- hs %>% |
| 122 | + group_by(fips) %>% |
| 123 | + summarise(csr_mean = mean(csr, na.rm = TRUE)) %>% |
| 124 | + ungroup() |
| 125 | + |
| 126 | +## advising programs at school level |
| 127 | +advise_tmp <- read_csv(file.path(rdir, 'advising_program_school_clean.csv')) %>% |
| 128 | + setNames(tolower(names(.))) %>% |
| 129 | + mutate(trio_subprogram = ifelse(trio_subprogram == 'N/A', NA, |
| 130 | + trio_subprogram)) %>% |
| 131 | + ## need to make wide |
| 132 | + arrange(nces_id) %>% |
| 133 | + group_by(nces_id) %>% |
| 134 | + mutate(count = seq(n())) %>% |
| 135 | + rename(org_1 = organization_name, |
| 136 | + div_1 = division_name, |
| 137 | + tri_1 = trio_subprogram, |
| 138 | + web_1 = website) %>% |
| 139 | + mutate(org_2 = ifelse(count == 2, org_1, NA), |
| 140 | + div_2 = ifelse(count == 2, div_1, NA), |
| 141 | + tri_2 = ifelse(count == 2, tri_1, NA), |
| 142 | + web_2 = ifelse(count == 2, web_1, NA), |
| 143 | + org_3 = ifelse(count == 3, org_1, NA), |
| 144 | + div_3 = ifelse(count == 3, org_1, NA), |
| 145 | + tri_3 = ifelse(count == 3, org_1, NA), |
| 146 | + web_3 = ifelse(count == 3, org_1, NA), |
| 147 | + org_4 = ifelse(count == 4, org_1, NA), |
| 148 | + div_4 = ifelse(count == 4, org_1, NA), |
| 149 | + tri_4 = ifelse(count == 4, org_1, NA), |
| 150 | + web_4 = ifelse(count == 4, org_1, NA), |
| 151 | + org_5 = ifelse(count == 5, org_1, NA), |
| 152 | + div_5 = ifelse(count == 5, org_1, NA), |
| 153 | + tri_5 = ifelse(count == 5, org_1, NA), |
| 154 | + web_5 = ifelse(count == 5, org_1, NA), |
| 155 | + org_6 = ifelse(count == 6, org_1, NA), |
| 156 | + div_6 = ifelse(count == 6, org_1, NA), |
| 157 | + tri_6 = ifelse(count == 6, org_1, NA), |
| 158 | + web_6 = ifelse(count == 6, org_1, NA), |
| 159 | + org_7 = ifelse(count == 7, org_1, NA), |
| 160 | + div_7 = ifelse(count == 7, org_1, NA), |
| 161 | + tri_7 = ifelse(count == 7, org_1, NA), |
| 162 | + web_7 = ifelse(count == 7, org_1, NA), |
| 163 | + org_8 = ifelse(count == 8, org_1, NA), |
| 164 | + div_8 = ifelse(count == 8, org_1, NA), |
| 165 | + tri_8 = ifelse(count == 8, org_1, NA), |
| 166 | + web_8 = ifelse(count == 8, org_1, NA)) |
| 167 | + |
| 168 | +advise <- advise_tmp %>% |
| 169 | + filter(count == 1) %>% |
| 170 | + select(nces_id, ends_with('_1')) %>% |
| 171 | + left_join(advise_tmp %>% |
| 172 | + filter(count == 2) %>% |
| 173 | + select(nces_id, ends_with('_2')), by = 'nces_id') %>% |
| 174 | + left_join(advise_tmp %>% |
| 175 | + filter(count == 3) %>% |
| 176 | + select(nces_id, ends_with('_3')), by = 'nces_id') %>% |
| 177 | + left_join(advise_tmp %>% |
| 178 | + filter(count == 4) %>% |
| 179 | + select(nces_id, ends_with('_4')), by = 'nces_id') %>% |
| 180 | + left_join(advise_tmp %>% |
| 181 | + filter(count == 5) %>% |
| 182 | + select(nces_id, ends_with('_5')), by = 'nces_id') %>% |
| 183 | + left_join(advise_tmp %>% |
| 184 | + filter(count == 6) %>% |
| 185 | + select(nces_id, ends_with('_6')), by = 'nces_id') %>% |
| 186 | + left_join(advise_tmp %>% |
| 187 | + filter(count == 7) %>% |
| 188 | + select(nces_id, ends_with('_7')), by = 'nces_id') %>% |
| 189 | + left_join(advise_tmp %>% |
| 190 | + filter(count == 8) %>% |
| 191 | + select(nces_id, ends_with('_8')), by = 'nces_id') %>% |
| 192 | + unite(advise_org, starts_with('org_'), sep = '|') %>% |
| 193 | + mutate(advise_org = gsub('NA', '', advise_org)) %>% |
| 194 | + unite(advise_div, starts_with('div_'), sep = '|') %>% |
| 195 | + mutate(advise_div = gsub('NA', '', advise_div)) %>% |
| 196 | + unite(advise_tri, starts_with('tri_'), sep = '|') %>% |
| 197 | + mutate(advise_tri = gsub('NA', '', advise_tri)) %>% |
| 198 | + unite(advise_web, starts_with('web_'), sep = '|') %>% |
| 199 | + mutate(advise_web = gsub('NA', '', advise_web)) |
| 200 | + |
| 201 | +## merge into high school data |
| 202 | +hs <- hs %>% |
| 203 | + ## left_join(dist) %>% |
| 204 | + left_join(advise) %>% |
| 205 | + left_join(hs_imp) %>% |
| 206 | + select(-starts_with('nces_')) %>% |
| 207 | + mutate(csr_flag = as.integer(is.na(csr)), |
| 208 | + csr = ifelse(is.na(csr), csr_mean, csr), |
| 209 | + cat = ifelse(!is.na(advise_org) & !is.na(csr), 1, |
| 210 | + ifelse(is.na(advise_org) & !is.na(csr), 2, |
| 211 | + ifelse(!is.na(advise_org) & is.na(csr), 3, |
| 212 | + ifelse(is.na(advise_org) & is.na(csr), 4, 0))))) |
| 213 | + |
| 214 | +## ------------------------------------- |
| 215 | +## COMMUNITY |
| 216 | +## ------------------------------------- |
| 217 | + |
| 218 | +## advising programs at school level |
| 219 | +community <- read_csv(file.path(rdir, 'advising_program_community_clean.csv')) %>% |
| 220 | + setNames(tolower(names(.))) %>% |
| 221 | + rename(advise_org = organization_name, |
| 222 | + advise_div = division_name, |
| 223 | + advise_web = website) |
| 224 | + |
| 225 | +## get zipcode geo |
| 226 | +zipgeo <- read_tsv(file.path(rdir, '2016_Gaz_zcta_national.zip')) %>% |
| 227 | + setNames(tolower(names(.))) %>% |
| 228 | + mutate(zip = geoid, |
| 229 | + lon = intptlong, |
| 230 | + lat = intptlat) %>% |
| 231 | + select(zip, lon, lat) |
| 232 | + |
| 233 | +## merge to community |
| 234 | +community <- community %>% |
| 235 | + left_join(zipgeo) %>% |
| 236 | + mutate(cat = 9) %>% |
| 237 | + ## need to fix! |
| 238 | + drop_na(lon) |
| 239 | + |
| 240 | +################################################################################ |
| 241 | +## COMBINE & WRITE |
| 242 | +################################################################################ |
| 243 | + |
| 244 | +## bind |
| 245 | +df <- bind_rows(college, hs, community) %>% |
| 246 | + mutate(z = row_number(), # redundant id # |
| 247 | + cat = as.integer(cat)) %>% |
| 248 | + ## rename for very small names |
| 249 | + rename(a = cat, # a := category |
| 250 | + b = instnm, # b := name |
| 251 | + c = fips, # c := fips |
| 252 | + d = enroltot, # d := enrollment (hs) |
| 253 | + e = frlpct, # e := frpl pct (hs) |
| 254 | + f = csr, # f := stu/cou ratio (hs) |
| 255 | + g = csr_flag, # g := missing csr |
| 256 | + h = zip, # h := zip code |
| 257 | + i = advise_org, # i := organization name |
| 258 | + j = advise_div, # h := division name |
| 259 | + k = advise_tri, # i := trio subprogram |
| 260 | + l = advise_web) # l := website |
| 261 | + |
| 262 | +## split by schools/community and college |
| 263 | +df_coll <- df %>% filter(a %in% c(5:8)) |
| 264 | +df_icon <- df %>% filter(a %in% c(0:4,9)) |
| 265 | + |
| 266 | +## set up as SP data frame |
| 267 | +lonlat_coll <- df_coll %>% select(lon, lat) %>% as.matrix() |
| 268 | +dfsp_coll <- SpatialPointsDataFrame(lonlat_coll, df_coll %>% select(z), |
| 269 | + proj4string = CRS('+init=epsg:3857')) |
| 270 | +lonlat_icon <- df_icon %>% select(lon, lat) %>% as.matrix() |
| 271 | +dfsp_icon <- SpatialPointsDataFrame(lonlat_icon, df_icon %>% select(z), |
| 272 | + proj4string = CRS('+init=epsg:3857')) |
| 273 | + |
| 274 | +## write as geojson |
| 275 | +geojson_write(input = dfsp_coll, file = file.path(ddir, 'college.geojson')) |
| 276 | +geojson_write(input = dfsp_icon, file = file.path(ddir, 'icon.geojson')) |
| 277 | + |
| 278 | +## write all data as minified JS |
| 279 | +writeJSArray(df, 's', letters[1:12], file.path(jdir, 'all_icon_array.js')) |
| 280 | + |
| 281 | +## ============================================================================= |
| 282 | +## END FILE |
| 283 | +################################################################################ |
0 commit comments