#' ---
#' title: "Preparation of data for analysis of successful community discharge from rehab by RSAs"
#' author: 'Timothy Reistetter ^1^, Alex Bokov ^1^, Susanne Schmidt ^1^, Mei-Lin Min ^1^'
#' abstract: |
#' | Merging and cleanup of CMS and Census data, with some preliminary
#' unsupervised variable selection
#' documentclass: article
#' description: 'Manuscript'
#' clean: false
#' self_contained: true
#' number_sections: false
#' keep_md: true
#' fig_caption: true
#' css: 'production.css'
#' output:
#' html_document:
#' toc: true
#' toc_float: true
#' ---
#+ init, echo=FALSE, message=FALSE, warning=FALSE,results='hide'
# Init ----
debug <- 0;
knitr::opts_chunk$set(echo=debug>0, warning=debug>0, message=debug>0);
# Global project settings ----
inputdata <- c();
source('config.R',local=T,echo=debug>0);
# inputdata <- c(dat0='data/SIM_SDOH_ZCTA.xlsx' # census data by ZCTA
# ,cx0='data/SIM_ALLCMS.csv' # RSA-ZCTA crosswalk
# ,rsa0='data/SIM_RSAv4 SCD RSRs.csv' # outcomes (RSR)
# ,dct0='data/data_dictionary.csv' # data dictionary for the
# # dat1 dataset that _this_
# # script produces
# ,dat1='SDOH_RSR_201X_prelim.csv' # the dat1 dataset
# ,dat2='SDOH_RSR_201X_scaled_prelim.csv' # the scaled version of
# # dat1
# );
# Load libraries ----
library(rio); library(dplyr); library(tidbits); # data handling
library(RCurl);
library(metamedian); # median-of-medians pooling
# Local project settings ----
# overwrite previously set global values if needed
if(file.exists('local.config.R')){
source('local.config.R',local=TRUE,echo = debug>0);
if(exists('.local.inputdata')){
inputdata <- replace(inputdata,names(.local.inputdata),.local.inputdata)};
};
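# A minimal sketch (hypothetical file names, commented out) of what a
# local.config.R might contain in order to point this script at
# site-specific copies of the input files:
# .local.inputdata <- c(dat0='data/my_site_SDOH_ZCTA.xlsx'
#                       ,rsa0='data/my_site_RSRs.csv');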
# Local functions ----
# Percents. Use with den=ACS_TOTAL_POP_WT for c_pct columns. For c_per1k, use
# the same function but with per=1000. For c_byarea columns, use
# den=CEN_AREALAND_SQM and per=1
.aggpct <-function(num,den,per=100) per*sum(num*pmax(den,1,na.rm=T)/per,na.rm=T)/sum(den,na.rm=T);
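# Illustrative check of .aggpct on toy data (commented out, not part of the
# pipeline): 10% of a population of 100 pooled with 30% of a population of
# 300 gives 25%.
# .aggpct(num=c(10,30),den=c(100,300)); # == 25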
# Medians. For c_median columns, use den=ACS_TOTAL_POP_WT but for c_medianhh
# columns use den=ACS_TOTAL_HOUSEHOLD
.aggmed <-function(num,den,...) if(all(is.na(num))) return(NA) else {
pool.med(num,pmax(den,1,na.rm=T))$pooled.est};
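# Illustrative sketch of .aggmed on toy data (commented out; relies on
# metamedian::pool.med() as loaded above): three ZCTA-level medians pooled
# with population weights, so the heavily weighted middle value dominates.
# .aggmed(num=c(40000,52000,61000),den=c(500,2000,800));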
# Means
# For c_meanhh, den=ACS_TOTAL_HOUSEHOLD
.aggmean <-function(num,den,...) weighted.mean(num,coalesce(den,0),na.rm=T);
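# Illustrative check of .aggmean on toy data (commented out): values 2 and 4
# with household weights 1 and 3 give (2*1+4*3)/4 = 3.5.
# .aggmean(num=c(2,4),den=c(1,3)); # == 3.5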
# Per capita. den-weighted; drops into browser() for debugging if the sum fails
.aggpcap <-function(num,den,...) {out <- try(sum(num*den,na.rm=T)/sum(den,na.rm=T));
if(is(out,'try-error')) browser(); out;}
# Largest by area (for discrete values)
.aggmaxa <-function(num,den,...) rep.int(num,den) %>% table %>% sort %>% rev %>%
head(1) %>% names;
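# Illustrative check of .aggmaxa on toy data (commented out): the value
# covering the larger area wins. Note that rep.int() truncates fractional
# den values (land area in square miles) to whole numbers.
# .aggmaxa(num=c('urban','rural'),den=c(2,5)); # == 'rural'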
# Import data ----
# Downloading and importing AHRQ ZCTA data, which presumably we will eventually use
if(!file.exists('SDOH_ZCTA.rdata')){
sdoh0 <- sapply(sprintf('https://www.ahrq.gov/sites/default/files/wysiwyg/sdohchallenge/data/SDOH_ZCTA_20%d.xlsx',11:18)
,import,simplify=F);
save(sdoh0,file = 'SDOH_ZCTA.rdata');
} else sdoh0 <- import('SDOH_ZCTA.rdata');
if(!file.exists('SDOH_ZIP.rdata')){
sdohzip <- sapply(sprintf('https://www.ahrq.gov/sites/default/files/wysiwyg/sdoh/SDOH_20%d_ZIPCODE_1_0.xlsx',11:20)
,import,which='Data',simplify=F);
save(sdohzip,file = 'SDOH_ZIP.rdata');
} else sdohzip <- import('SDOH_ZIP.rdata');
sdohdctzip <- sapply(sprintf('https://www.ahrq.gov/sites/default/files/wysiwyg/sdoh/SDOH_20%d_Codebook_1_0.xlsx',11:20)
,import,which='ZIPCODE',simplify=F);
# Downloading and importing AHRQ SDOH codebook, which presumably we will eventually use
if(!file.exists('AHRQ_SDOH_codebook.xlsx')){
writeBin(getBinaryURL('https://www.ahrq.gov/sites/default/files/wysiwyg/sdohchallenge/data/sdoh_codebook_final.xlsx')
,'AHRQ_SDOH_codebook.xlsx')};
sdohdct0 <- sapply(paste0('ZCTA_20',13:18),function(xx){
import('AHRQ_SDOH_codebook.xlsx',which=xx)},simplify=F);
# Downloading and importing the Social Deprivation Index
if(!file.exists('ACS2015_zctaallvars.xlsx')){
writeBin(getBinaryURL('https://www.graham-center.org/content/dam/rgc/documents/maps-data-tools/sdi/ACS2015_zctaallvars.xlsx'
,ssl.verifypeer=F,ssl.verifyhost=F)
,'ACS2015_zctaallvars.xlsx')};
sdi <- import('ACS2015_zctaallvars.xlsx') %>%
mutate(ZCTA=sprintf("%05d", zcta)) %>% select(-'zcta') %>%
subset(!is.na(sdi_score));
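# Commented-out illustration of the zero-padding above, which formats the
# SDI ZCTA codes as five-character strings so they join cleanly against the
# ZCTA keys in dat0 and cx0:
# sprintf("%05d", 501); # == "00501"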
# Importing the local data files
dat0 <- import(inputdata['dat0']);
cx0 <- import(inputdata['cx0']);
rsa0 <- import(inputdata['rsa0']);
dct0 <- import(inputdata['dct0']);
# Merge data ----
#' # Merging and Preparing Data
#'
#' `dat1` is the combined dataset, aggregated by RSA (`CN`) and `YEAR`, with
#' most columns weighted by `ACS_TOTAL_POP_WT` during aggregation.
#' The `dat2` dataset is `dat1` with its numeric values scaled and the
#' following columns omitted: `YEAR`, `Quartile`, `Quintile`, `Decile`,
#' `subsample`, and the factor columns.
#' Finally, `dat3tr` is just the numeric columns from the `dat2` dataset.
#+ dat1,cache=F
.c_numericother <- setdiff(v(c_numeric,dat=dat0)
,c(v(c_pct),v(c_per1k),v(c_byarea),v(c_median)
,v(c_medianhh),v(c_total),v(c_percap),v(c_meanhh)
,v(c_factor))) %>% c('sdi_score');
set.seed(project_seed);
dat1 <- inner_join(dat0,cx0,by="ZCTA") %>%
left_join(sdi[,c('ZCTA','sdi_score')] ,by="ZCTA") %>%
group_by(CN,YEAR) %>% summarise(
# using the aggregation functions declared above on their respective sets of columns
across(.cols=v(c_pct,dat=(.)),.aggpct,den=ACS_TOTAL_POP_WT)
,across(.cols=v(c_per1k,dat=(.)),.aggpct,den=ACS_TOTAL_POP_WT,per=1000)
,across(.cols=v(c_byarea,dat=(.)),.aggpct,den=CEN_AREALAND_SQM,per=1)
,across(.cols=v(c_median,dat=(.)),.aggmed,den=ACS_TOTAL_POP_WT)
,across(.cols=v(c_medianhh,dat=(.)),.aggmed,den=ACS_TOTAL_HOUSEHOLD)
,across(.cols=v(c_meanhh,dat=(.)),.aggmean,den=ACS_TOTAL_HOUSEHOLD)
,across(.cols=v(c_percap,dat=(.)),.aggpcap,den=ACS_TOTAL_POP_WT)
,across(.cols=all_of(.c_numericother),.aggmean,den=ACS_TOTAL_POP_WT)
,across(.cols=v(c_factorarea,dat=(.)),.aggmaxa,den=CEN_AREALAND_SQM)
# The c_total group of columns needs to be aggregated last because their
# de-aggregated forms may be needed by the other aggregations. It may not
# make a difference, but we aggregate them last to be safe.
,across(.cols=v(c_total,dat=(.)),sum,na.rm=T)
) %>% subset(!is.na(CN)) %>%
left_join(rsa0,by=c(CN='RSA')) %>% ungroup %>%
mutate(across(v(c_factor,dat=(.)),factor)
,sdi_score=ifelse(is.na(sdi_score)
,median(sdi_score,na.rm=T),sdi_score)
,Quartile=cut(RSR,breaks=quantile(RSR,seq(0,1,len=5))
,include.lowest=T,labels=paste0('Q',1:4))
,Quintile=cut(RSR,breaks=quantile(RSR,seq(0,1,len=6))
,include.lowest=T,labels=paste0('Q',1:5))
,Decile=cut(RSR,breaks=quantile(RSR,seq(0,1,len=11))
,include.lowest=T,labels=paste0('Q',1:10))
# this column will be used to assign rows to the training and the testing sets
,subsample=sample(c('train','test'),n(),replace = TRUE)
);
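# Commented-out toy illustration of the cut/quantile pattern used for the
# Quartile/Quintile/Decile columns above: each observation is labeled with
# its within-sample quartile.
# cut(1:8,breaks=quantile(1:8,seq(0,1,len=5))
#     ,include.lowest=TRUE,labels=paste0('Q',1:4)); # Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4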
dct0 <- subset(dct0,column %in% colnames(dat1));
# data prep ----
# Scale the numeric variables
dat2 <- select(dat1,-c('YEAR','Quartile','Quintile','Decile',v(c_factor),'subsample'),CN);
dat2[,sapply(dat2,is.numeric)] <- scale(dat2[,sapply(dat2,is.numeric)]);
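# Quick sanity check (commented out): after scaling, every numeric column of
# dat2 should have mean ~0 and sd ~1.
# sapply(dat2[,sapply(dat2,is.numeric)]
#        ,function(xx) c(mean=mean(xx,na.rm=T),sd=sd(xx,na.rm=T)));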
export(dat1,inputdata['dat1']);
export(dat2,inputdata['dat2']);