Skip to content

Commit

Permalink
Merge pull request #74 from ecohealthalliance/feature/augment
Browse files Browse the repository at this point in the history
Feature/augment
  • Loading branch information
emmamendelsohn authored Feb 2, 2024
2 parents 5578c0d + 7353eb5 commit 720aeed
Show file tree
Hide file tree
Showing 10 changed files with 773 additions and 632 deletions.
17 changes: 17 additions & 0 deletions .Renvignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
*
!packages.R
!packages.r
!*.R
!*.r
!R/*.R
!R/*.r
!_targets*.R
!_targets*.r
!vignettes/*.r
!vignettes/*.R
!vignettes/*.Rmd
!vignettes/*.rmd
!reports/*.r
!reports/*.R
!reports/*.Rmd
!reports/*.rmd
87 changes: 27 additions & 60 deletions .Rprofile
Original file line number Diff line number Diff line change
Expand Up @@ -8,82 +8,49 @@ load_env <- function(){
}
load_env()

local({
user_rprof <- Sys.getenv("R_PROFILE_USER", normalizePath("~/.Rprofile", mustWork = FALSE))
if(interactive() && file.exists(user_rprof)) {
source(user_rprof)
# If there is a bucket, cache targets remotely. Otherwise, do so locally.
if(!nzchar(Sys.getenv("TAR_PROJECT"))) {
if(nzchar(Sys.getenv("AWS_BUCKET_ID"))) {
Sys.setenv(TAR_PROJECT = "s3")
} else {
Sys.setenv(TAR_PROJECT = "main")
}
})
}
if(interactive()){
message(paste("targets project is", Sys.getenv("TAR_PROJECT")))
}

# Set package repositories and renv options for convenience
options(
  repos = c(
    CRAN = "https://cloud.r-project.org",
    # r-universe repository for MilesMcBain packages (e.g. capsule)
    MILESMCBAIN = "https://milesmcbain.r-universe.dev",
    ROPENSCI = "https://ropensci.r-universe.dev"
  ),
  ## Disabled: do not attempt to keep renv.lock updated automatically
  renv.config.auto.snapshot = FALSE,
  ## Use RStudio Package manager for pre-built package binaries for linux
  renv.config.rspm.enabled = TRUE,
  ## Disabled: do not use the existing local library to fetch copies of packages for renv
  renv.config.install.shortcuts = FALSE,
  ## Use the renv build cache to speed up install times
  renv.config.cache.enabled = TRUE
)

# Put the project library *outside* the project
#Sys.setenv(RENV_PATHS_LIBRARY_ROOT = file.path(normalizePath("~/.renv-project-libraries", mustWork = FALSE)))
# Raise the internet timeout to at least 300 seconds, keeping any larger
# value that has already been configured
options(
  timeout = max(getOption("timeout"), 300)
)

# Use capsule if specified by the user (env var USE_CAPSULE set to 1/TRUE/true);
# otherwise activate the project's renv library.
if (Sys.getenv("USE_CAPSULE") %in% c("1", "TRUE", "true")) {
  # renv is not activated in this branch. Only interactive sessions that have
  # a lockfile are pointed at the capsule workflow.
  if (interactive() && file.exists("renv.lock")) {
    message("renv library not loaded (found env var USE_CAPSULE=", Sys.getenv("USE_CAPSULE"), "). Use `capsule` functions (see https://github.com/MilesMcBain/capsule)")
    # require() returns FALSE instead of erroring when capsule is missing, so
    # capsule::whinge() must only be called inside the TRUE branch. Calling it
    # again after the if/else would error when the package is not installed.
    if (require(capsule, quietly = TRUE)) {
      capsule::whinge()
    } else {
      message('Install {capsule} with install.packages("capsule", repos = c(mm = "https://milesmcbain.r-universe.dev", getOption("repos")))')
    }
  }
} else {
  source("renv/activate.R")
  load_env() # reload project .env files, after renv/activate.R runs renv::load() which reads user's .renviron
}

# Use the local user's .Rprofile when interactive.
# Good for keeping local preferences, but not always reproducible.

if (nzchar( Sys.getenv("TAR_PROJECT"))) {
message(paste0("targets project is '", Sys.getenv("TAR_PROJECT"), "'"))
} else {
message("targets project is default")
}

options(
repos = c(RSPM = "https://packagemanager.rstudio.com/all/latest",
CRAN = "https://cran.rstudio.com/",
INLA = "https://inla.r-inla-download.org/R/testing"),

renv.config.auto.snapshot = TRUE, ## Attempt to keep renv.lock updated automatically
renv.config.rspm.enabled = TRUE, ## Use RStudio Package manager for pre-built package binaries
renv.config.install.shortcuts = TRUE, ## Use the existing local library to fetch copies of packages for renv
renv.config.cache.enabled = TRUE, ## Use the renv build cache to speed up install times
renv.config.cache.symlinks = TRUE, ## Keep full copies of packages locally than symlinks to make the project portable in/out of containers
renv.config.install.transactional = FALSE

)

# Set maximum allowed total size (in bytes) of global variables for future package. Used to prevent too large exports.
if (Sys.info()[["nodename"]] %in% c("aegypti-reservoir" , "prospero-reservoir")) {
options(
future.globals.maxSize = 4194304000
)
}

# Since RSPM does not provide Mac binaries, always install packages from CRAN
# on mac or windows, even if renv.lock specifies they came from RSPM
if (Sys.info()[["sysname"]] %in% c("Darwin", "Windows")) {
options(renv.config.repos.override = c(
CRAN = "https://cran.rstudio.com/",
INLA = "https://inla.r-inla-download.org/R/testing"))
} else if (Sys.info()[["sysname"]] == "Linux") {
options(renv.config.repos.override = c(
RSPM = "https://packagemanager.rstudio.com/all/latest",
INLA = "https://inla.r-inla-download.org/R/testing"))
}

# If project packages have conflicts, define the preferred function here so
# as to manage them across all sessions when building targets
if (requireNamespace("conflicted", quietly = TRUE)) {
  # local() keeps the lookup table and loop variable out of the global
  # environment of every session that sources this profile
  local({
    # function name -> package whose version should win
    winners <- c(
      filter = "dplyr",
      count = "dplyr",
      select = "dplyr",
      geom_rug = "ggplot2",
      set_names = "magrittr",
      View = "utils"
    )
    for (fn in names(winners)) {
      conflicted::conflict_prefer(fn, winners[[fn]], quiet = TRUE)
    }
  })
}

# Suppress summarize messages
options(dplyr.summarise.inform = FALSE)

46 changes: 46 additions & 0 deletions R/augment_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#' Combine weather, forecast, and NDVI anomalies into one augmented dataset
#'
#' Loads the three anomaly datasets into memory, checks that missing values
#' occur only where they are expected, left-joins the forecast and NDVI
#' anomalies onto the weather anomalies by `date`, `x`, and `y`, and writes
#' the joined data as a dataset partitioned by `date`.
#'
#' @param weather_anomalies Source of the weather anomalies dataset, passed to
#'   `arrow::open_dataset()`.
#' @param forecasts_anomalies Source of the forecast anomalies dataset, passed
#'   to `arrow::open_dataset()`.
#' @param ndvi_anomalies Source of the NDVI anomalies dataset, passed to
#'   `arrow::open_dataset()`.
#' @param augmented_data_directory Directory the joined dataset is written to.
#' @return `augmented_data_directory`, unchanged, after the dataset has been
#'   written there.
#' @author Emma Mendelsohn
#' @export
augment_data <- function(weather_anomalies, forecasts_anomalies,
                         ndvi_anomalies, augmented_data_directory) {

  # Per-column flag: does the column contain at least one NA?
  has_na <- function(df) purrr::map_lgl(df, \(column) anyNA(column))

  message("Load datasets into memory")
  weather <- arrow::open_dataset(weather_anomalies) |> dplyr::collect()
  forecasts <- arrow::open_dataset(forecasts_anomalies) |> dplyr::collect()
  ndvi <- arrow::open_dataset(ndvi_anomalies) |> dplyr::collect()

  message("NA checks")
  ## Weather and forecasts
  ### NAs are in scaled precip data, due to days with 0 precip
  weather_na <- has_na(weather)
  assertthat::assert_that(
    all(stringr::str_detect(names(weather_na)[weather_na], "scaled")),
    msg = "Weather anomalies contain NAs in columns other than the scaled ones"
  )

  forecasts_na <- has_na(forecasts)
  assertthat::assert_that(
    all(stringr::str_detect(names(forecasts_na)[forecasts_na], "scaled")),
    msg = "Forecast anomalies contain NAs in columns other than the scaled ones"
  )

  ## NDVI
  ### Prior to 2018: NAs are due to region missing from Eastern Africa in modis data
  ### After 2018: NAs are due to smaller pockets of missing data on a per-cycle basis
  ### okay to remove when developing RSA model (issue #72)
  ndvi_na <- has_na(ndvi)
  # The join keys themselves must be complete before incomplete rows are dropped
  assertthat::assert_that(
    !any(ndvi_na[c("date", "x", "y")]),
    msg = "NDVI anomalies contain NAs in the join keys (date, x, y)"
  )
  ndvi <- tidyr::drop_na(ndvi)

  message("Join into a single object")
  # Left joins keep every weather row; rows without a forecast or NDVI match
  # get NAs in the joined columns
  augmented_data <- weather |>
    dplyr::left_join(forecasts, by = dplyr::join_by(date, x, y)) |>
    dplyr::left_join(ndvi, by = dplyr::join_by(date, x, y))

  message("Save as parquets using hive partitioning by date")
  # Grouping by date makes write_dataset() partition the output on that column
  augmented_data |>
    dplyr::group_by(date) |>
    arrow::write_dataset(augmented_data_directory)

  augmented_data_directory
}
5 changes: 4 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,7 @@ all_targets <- function(env = parent.env(environment()), type = "tar_target", ad
#
# tar_load_s3 <- function(target_name, ...) {
#
# }
# }

#' Flag columns that contain missing values
#'
#' @param df A data frame (or list) whose columns are checked for `NA`s.
#' @return A logical vector with one element per column of `df`, `TRUE` where
#'   the column contains at least one `NA`.
col_na <- function(df) {
  purrr::map_lgl(df, function(column) any(is.na(column)))
}
27 changes: 23 additions & 4 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ source("_targets_settings.R")
# For development purposes only, it can be helpful to set the cues below to "never", which means targets will not check the target for changes after it has been built once

tar_cue_general = "thorough" # CAUTION changing this to never means targets can miss changes to the code. Use only for developing.
tar_cue_upload_aws = "never" # CAUTION changing this to never means targets can miss changes to the code. Use only for developing.
tar_cue_upload_aws = "thorough" # CAUTION changing this to never means targets can miss changes to the code. Use only for developing.

# Static Data Download ----------------------------------------------------
static_targets <- tar_plan(
Expand Down Expand Up @@ -313,8 +313,8 @@ data_targets <- tar_plan(
lag_intervals,
seed = 212) |>
filter(select_date) |> pull(date)
),

),
# recorded weather anomalies --------------------------------------------------
tar_target(weather_historical_means_directory,
create_data_directory(directory_path = "data/weather_historical_means")),
Expand Down Expand Up @@ -444,8 +444,27 @@ data_targets <- tar_plan(
key = ndvi_anomalies,
check = TRUE),
pattern = ndvi_anomalies,
cue = tar_cue("thorough")), # only run this if you need to upload new data
cue = tar_cue(tar_cue_upload_aws)), # only run this if you need to upload new data

# all anomalies --------------------------------------------------
tar_target(augmented_data_directory,
create_data_directory(directory_path = "data/augmented_data")),

tar_target(augmented_data,
augment_data(weather_anomalies,
forecasts_anomalies,
ndvi_anomalies,
augmented_data_directory),
format = "file",
repository = "local",
cue = tar_cue(tar_cue_general)),

tar_target(augmented_data_upload_aws_s3,
aws_s3_upload(path = augmented_data,
bucket = aws_bucket,
key = augmented_data,
check = TRUE),
cue = tar_cue(tar_cue_upload_aws)), # only run this if you need to upload new data

)

Expand Down
Loading

0 comments on commit 720aeed

Please sign in to comment.