-
Notifications
You must be signed in to change notification settings - Fork 2
/
passive_data_setup.R
124 lines (110 loc) · 6.41 KB
/
passive_data_setup.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
library(drake)
library(tidyverse)
library(readxl)
library(data.table)
library(toxEval)
library(openxlsx)
# Root folder of the shared project data, taken from the PASSIVE_PATH
# environment variable. Every raw input and synced output of the plan below
# is addressed relative to this folder.
path_to_data <- Sys.getenv("PASSIVE_PATH")
if (!nzchar(path_to_data)) {
  # Sys.getenv() returns "" for an unset variable; file.path("", ...) would
  # then silently point every input/output at the filesystem root.
  stop(
    "Environment variable PASSIVE_PATH is not set; ",
    "it must point at the folder that contains data/data_for_git_repo.",
    call. = FALSE
  )
}

options(drake_make_menu = FALSE)

# Local output folders (parents listed before children). Existing folders are
# left alone and the "already exists" warning is suppressed.
invisible(vapply(
  c(
    "data",
    "plots",
    file.path("data", "raw"),
    file.path("data", "clean")
  ),
  dir.create,
  logical(1),
  showWarnings = FALSE
))

# Go from raw files to R objects:
source(file = "R/analyze/data_reader.R")
source(file = "R/analyze/get_sites_ready.R")
source(file = "R/analyze/get_chem_info.R")
source(file = "R/analyze/create_tox_file.R")
source(file = "R/analyze/open_land_use.R")

# NOTE(review): "strings_in_dots" is a setting read by older drake releases;
# confirm it is still needed with the installed drake version.
pkgconfig::set_config("drake::strings_in_dots" = "literals")
# drake plan that turns the raw spreadsheets under
# <PASSIVE_PATH>/data/data_for_git_repo/raw into cleaned R objects and writes
# the results to data/clean and <PASSIVE_PATH>/data/data_for_git_repo/clean.
#
# `!!` evaluates each file.path() call when the plan is created, so the
# commands hold literal paths inside file_in()/file_out().
# Targets may be listed before the targets they use (e.g. `cas_df`): drake
# orders the build by dependency, not by position in the plan.
data_setup_plan <- drake_plan(
  # ---- Lookup tables --------------------------------------------------------
  cas_change = readxl::read_xlsx(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/cas_change.xlsx"))
  ),
  chem_info_old = read.csv(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/chem_classes.csv")),
    stringsAsFactors = FALSE
  ),
  exclude = get_exclude(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/exclude.csv"))
  ),
  # file.copy() defaults to overwrite = FALSE and then just returns FALSE when
  # the destination already exists, leaving a stale local copy behind.
  # Overwrite so a changed upstream crosswalk is always propagated.
  AOP_crosswalk_copy = file.copy(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/AOP_crosswalk.csv")),
    file_out("data/raw/AOP_crosswalk.csv"),
    overwrite = TRUE
  ),

  # ---- Per-sheet chemistry data (one target per workbook sheet and year) ----
  OC_2014 = generic_file_opener(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/general_2014.xlsx")),
    cas_df,
    n_max = 45,
    sheet = "OC-PCB-PBDE",
    site_sheet = "site info",
    year = 2014
  ),
  PAHs_2014 = generic_file_opener(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/general_2014.xlsx")),
    cas_df,
    n_max = 33,
    sheet = "PAHs",
    site_sheet = "site info",
    year = 2014
  ),
  pharm_2014 = generic_file_opener(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/pharm_update.xlsx")),
    cas_df,
    n_max = 41,
    sheet = "est water concentrations",
    site_sheet = "PrioritySiteInfo",
    year = 2014,
    skip = 7,
    skip_site = 2
  ),
  PAHs_2010 = generic_file_opener(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/general_2010.xlsx")),
    cas_df,
    n_max = 33,
    sheet = "PAHs",
    site_sheet = "site info",
    year = 2010,
    skip_site = 2
  ),
  OC_2010 = generic_file_opener(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/general_2010.xlsx")),
    cas_df,
    n_max = 40,
    sheet = "OC-PCB-PBDE",
    site_sheet = "site info",
    year = 2010,
    skip_site = 2
  ),
  WW_2010 = generic_file_opener(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/general_2010.xlsx")),
    cas_df,
    n_max = 53,
    sheet = "WW",
    site_sheet = "site info",
    year = 2010,
    skip_site = 2
  ),
  WW_2014 = generic_file_opener(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/ww_update.xlsx")),
    cas_df,
    n_max = 46,
    sheet = "est water concentrations",
    site_sheet = "PrioritySiteInfo",
    year = 2014,
    skip = 7,
    skip_site = 2
  ),
  pharm_2010 = generic_file_opener(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/general_2010.xlsx")),
    cas_df,
    n_max = 44,
    sheet = "pharms",
    site_sheet = "site info",
    year = 2010,
    skip_site = 2
  ),

  # Stack all sheets into one table.
  all_data = bind_rows(
    pharm_2010,
    WW_2010,
    OC_2010,
    PAHs_2010,
    pharm_2014,
    WW_2014,
    OC_2014,
    PAHs_2014
  ),

  # ---- CAS handling and chemical info ---------------------------------------
  cas_df = all_cas(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/cas.xlsx"))
  ),
  clean_cas_df = clean_cas(cas_df),
  clean_cas_fixed = fix_cas(clean_cas_df, cas_change),
  all_data_chnm = clean_names(all_data),
  all_data_fixed_cas = fix_cas(all_data_chnm, cas_change),
  chem_info = get_chem_info(all_data_fixed_cas, chem_info_old),
  chem_info_fixed_cas = fix_cas(chem_info, cas_change),

  # The chemical info table is saved twice: once in the local data/clean
  # folder and once under the shared PASSIVE_PATH tree.
  out_cas = saveRDS(
    object = chem_info_fixed_cas,
    file = file_out("data/clean/cas_df.rds")
  ),
  out_cas_sync = saveRDS(
    object = chem_info_fixed_cas,
    file = file_out(!!file.path(path_to_data, "data/data_for_git_repo/clean/cas_df.rds"))
  ),

  # ---- Site information -----------------------------------------------------
  sites_orig_2014 = readxl::read_excel(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/general_2014.xlsx")),
    sheet = "site info",
    skip = 3
  ),
  sites_OWC = data.table::fread(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/sites_from_OWC.txt")),
    data.table = FALSE,
    sep = "\t",
    select = c("SiteID", "site_grouping", "Short Name"),
    # Read SiteID as text rather than letting fread guess its type.
    colClasses = c("SiteID" = "character")
  ),
  sites_orig_2010 = readxl::read_excel(
    file_in(!!file.path(path_to_data, "data/data_for_git_repo/raw/general_2010.xlsx")),
    sheet = "site info",
    skip = 2
  ),
  df_lu = open_land_use(),
  sites = get_sites_ready(sites_orig_2014, sites_orig_2010, sites_OWC, df_lu),

  # ---- Final object and its Excel export ------------------------------------
  tox_list_init = create_tox_object(
    all_data_fixed_cas,
    chem_info_fixed_cas,
    sites,
    exclude
  ),
  saveOutput2 = openxlsx::write.xlsx(
    tox_list_init,
    file = file_out(!!file.path(path_to_data, "data/data_for_git_repo/clean/passive.xlsx"))
  )
)
# Build drake's configuration for the plan and draw its dependency graph;
# build times are left out of the graph.
config <- drake_config(data_setup_plan)
vis_drake_graph(
  config,
  build_times = "none"
)

# Run the plan. The condition trigger is a constant TRUE, so targets are
# rebuilt on every run instead of being skipped when drake considers them
# up to date.
make(
  data_setup_plan,
  trigger = trigger(condition = TRUE)
)