initial commit of A10 data prep file.

NEFSC · Oct 30, 2024 · db3c17d · db3c17d
1 parent 1a88c35
commit db3c17d
Showing 1 changed file with 159 additions and 0 deletions.
diff --git a/Amendment10/writing/Amendment10_data_prep.Rmd b/Amendment10/writing/Amendment10_data_prep.Rmd
@@ -0,0 +1,159 @@
+---
+title: "Am10_dataprep"
+author: "Min-Yang Lee"
+date: "2024-10-28"
+output: html_document
+---
+
+
+1.  You must be able to see this folder ``nefscfile\BLAST\READ-SSB-Lee-MRIP-BLAST\data_folder\raw``. It contains the raw MRIP data that has been imported into Rds format. These were constructed with ``https://github.com/NEFSC/READ-SSB-MRIP-BLAST/R_code/data_extracting_processing/extraction/pull_in_MRIP.R``.  
+
+
+
+```{r global_options, include=FALSE}
+library("here")
+library("data.table")
+library("tidyverse")
+library("knitr")
+library("lubridate")
+library("scales")
+library("fredr")
+library("kableExtra")
+library("censusapi")
+library("tigris")
+library("mapview")
+library("haven")
+library("survey")
+library("srvyr")
+
+#turn off scientific notation
+options(scipen=999)
+
+#Handle single PSUs
+options(survey.lonely.psu = "certainty")
+options(tigris_use_cache = TRUE)
+
+here::i_am("writing/Amendment10_data_prep.Rmd")
+
+#############################################################################
+#knitr options
+
+knitr::opts_chunk$set(echo=FALSE, warning = FALSE, error = FALSE, message = FALSE, comment = FALSE, cache = FALSE, progress = TRUE, verbose = FALSE, 
+											dpi = 600)
+options(tinytex.verbose = TRUE)
+# options(knitr.table.format = "latex")
+#############################################################################
+
+
+# RFA data
+RFA_filepath<-file.path("//nefscdata","RFA_EO12866_Guidelines" ,"Ownership Data", "current data and metadata","affiliates_2024_06_01.Rdata")
+BLAST_raw_filepath<-file.path("//nefscfile","BLAST" ,"READ-SSB-Lee-MRIP-BLAST", "data_folder","raw")
+local_BLAST_folder<-file.path("V:","READ-SSB-Lee-MRIP-BLAST")
+network_BLAST_folder<-file.path("//nefscfile","BLAST" ,"READ-SSB-Lee-MRIP-BLAST")
+
+
+
+
+
+#############################################################################
+# The census data that I'm using reports out 2022 Inflation adjusted dollars. Therefore, it makes sense to adjust everything to base year 2022.
+
+deflate_by <- "year"
+base_year = 2022 # base year for GDP deflator - Maximum = 2020, Minimum = 1947  - max(GDPDEF_quarterly$Year)
+
+
+vintage_string<-Sys.Date()
+vintage_string<-gsub("-","_",vintage_string)
+
+```
+
+
+```{r get_census, eval=TRUE}
+# Pull in and tidy up some Census data on household income
+income <- getCensus(
+  name = "acs/acs5/subject",
+  vintage = 2022,
+  vars = c("S1901_C01_001E",  "S1901_C01_010E","S1901_C01_011E","S1901_C01_012E","S1901_C01_013E","S0101_C01_001E", "S0101_C02_028E","S0601_C01_001E","S0601_C01_022E"),
+  region = "zip code tabulation area:*", 
+  show_call=FALSE)
+
+income<-income %>%
+  rename ( households=S1901_C01_001E,
+           household_inc_150_200pct =S1901_C01_010E,
+           household_inc_200pct=S1901_C01_011E,
+           household_median_inc=S1901_C01_012E, 
+           household_mean_inc=S1901_C01_013E,
+           total_population=S0101_C01_001E,
+           population_pct_over60=S0101_C02_028E,
+           total_population_s0601=S0601_C01_001E,
+           pct_white_alone =S0601_C01_022E
+           ) %>%
+  mutate(obscured_median=case_when(household_median_inc<0 ~ 1, .default=0),
+         obscured_mean=case_when(household_mean_inc<0 ~ 1, .default=0)
+         )
+
+
+# Read in the state-zcta correspondence file.
+zcta_state<-read_delim("https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_county20_natl.txt", delim="|", guess_max=2000)
+
+# Contract down to the fraction in each area.
+zcta_state <- zcta_state %>%
+  dplyr::filter(is.na(OID_ZCTA5_20)==FALSE) %>%
+  group_by(OID_ZCTA5_20) %>%
+  mutate(total_area=sum(AREALAND_PART)) %>%
+  mutate(fraction=AREALAND_PART/total_area) %>%
+  select(c(GEOID_ZCTA5_20, OID_ZCTA5_20,OID_COUNTY_20, GEOID_COUNTY_20, fraction))
+
+
+
+# Pull in and tidy up some Census data on household income at the state level
+county_income <- getCensus(
+  name = "acs/acs5/subject",
+  vintage = 2022,
+  vars = c("S1901_C01_012E","S1901_C01_013E"),
+  region = "county:*", 
+  show_call=FALSE)
+
+county_income<-county_income %>%
+  rename (county_household_median_inc=S1901_C01_012E, 
+           county_household_mean_inc=S1901_C01_013E
+  ) %>%
+  mutate(st_co=paste0(state,county))
+
+
+# use county income to create an in imputed income based on county income
+income_fill<-zcta_state %>%
+  left_join(county_income, by=join_by(GEOID_COUNTY_20==st_co), relationship="many-to-one") %>%
+  mutate(wmedian=fraction*county_household_median_inc,
+         wmean=fraction*county_household_mean_inc) %>%
+  group_by(GEOID_ZCTA5_20, OID_ZCTA5_20) %>%
+  summarise(imputed_household_median_income=sum(wmedian),
+           imputed_household_mean_income=sum(wmean)) %>%
+  ungroup()
+
+income<-income %>%
+  left_join(income_fill, by=join_by(zip_code_tabulation_area==GEOID_ZCTA5_20), relationship="one-to-one")%>%
+  mutate(household_median_inc=case_when(obscured_median==1 ~ imputed_household_median_income, .default=household_median_inc),
+         household_mean_inc=case_when(obscured_mean<0 ~ imputed_household_mean_income, .default=household_mean_inc)
+         ) %>%
+  select(-c(imputed_household_mean_income,imputed_household_median_income))
+
+summary(income)
+
+
+saveRDS(income,file=here("data_folder", "main", paste0("income_",vintage_string,".Rds")))
+
+saveRDS(zcta_state,file=here("data_folder", "main", paste0("zcta_state",vintage_string,".Rds")))
+
+```
+
+
+
+```{r get_MRIP, eval=TRUE}
+
+
+
+
+
+```
+