diff --git a/R/process_weather_data.R b/R/process_weather_data.R index 0e8bbf9..9431a49 100644 --- a/R/process_weather_data.R +++ b/R/process_weather_data.R @@ -8,94 +8,46 @@ #' @return #' @author Emma Mendelsohn #' @export -process_weather_data <- function(nasa_weather_directory_dataset, nasa_weather_dataset) { +process_weather_data <- function(nasa_weather_directory_dataset, nasa_weather_dataset, model_dates) { - library(dbplyr) weather_dataset <- open_dataset(nasa_weather_directory_dataset) |> to_duckdb(table_name = "weather") weather_dataset <- weather_dataset |> - mutate(across(c("month", "day", "day_of_year"), ~as.integer(.))) |> - mutate(year_doy = paste(year, day_of_year, sep = "_")) + mutate(across(c(year, month, day, day_of_year), as.integer)) |> + mutate(year_day_of_year = paste(year, day_of_year, sep = "_")) - # calculate monthly averages by pixel - weather_means <- weather_dataset |> - group_by(x, y, month) |> - summarize(mean_relative_humidity = mean(relative_humidity), - mean_temperature = mean(temperature), - mean_precipitation = mean(precipitation)) |> - ungroup() - - # weather_means_check <- collect(weather_means) - - # get 30 day lags for selected dates - for(i in 1:nrow(model_dates_random_select)){ - date_select <- model_dates_random_select[i,]$date - previous_30_days <- seq(date_select - 30, date_select - 1, by = "day") - previous_30_days_tib <- tibble(year = year(previous_30_days), day_of_year = yday(previous_30_days)) + # generate the weather dataset - get the lagged anomolies for selected dates + # TODO: do this for each lag internal + outt <- map(model_dates$date[model_dates$select_date], function(date_selected){ - #TODO filter for year_doy for these 30 days + row_select <- which(model_dates$date == date_selected) + lag_dates <- model_dates |> slice((row_select - 30):(row_select - 1)) - lag <- weather_dataset |> - inner_join(previous_30_days_tib) + # lag: calculate mean by pixel for the preceeding 30 days + lagged_means <- weather_dataset |> + filter(year_day_of_year %in% !!lag_dates$year_day_of_year) |> group_by(x, y) |> - window_frame(-30, -1) |> - window_order(day_of_year) |> - mutate(anomaly_relative_humidity_roll30 = mean(anomaly_relative_humidity)) |> - ungroup() - } - - - - # calculate anomalies between lags and weighted monthly averages - - - - - - - - # # calculate monthly averages by pixel - # # calculate anomalies for each day relative to monthly average - # weather_anomalies <- weather_dataset |> - # group_by(x, y, month) |> - # mutate(mean_relative_humidity = mean(relative_humidity), - # mean_temperature = mean(temperature), - # mean_precipitation = mean(precipitation)) |> - # ungroup() |> - # mutate(anomaly_relative_humidity = relative_humidity - mean_relative_humidity, - # anomaly_temperature = temperature - mean_temperature, - # anomaly_precipitation = precipitation - mean_precipitation) - - # get rolling avg for each x,y - # throws memory error - PRAGMA temp_directory='/path/to/tmp.tmp' - weather_lags <- weather_anomalies |> - group_by(x, y) |> - window_frame(-30, -1) |> - window_order(day_of_year) |> - mutate(anomaly_relative_humidity_roll30 = mean(anomaly_relative_humidity)) |> - ungroup() - - # check this worked - test = weather_lags |> filter(x == min(x), y == max(y), day_of_year < 50) |> collect() - - # ^ get rolling avg for 1 month, 2 month, 3 month ahead for each var - - # filter out top 90 days of days (these are used to inform the lags only) - - # filter for model_dates_random_select - - - ### testing notes - # SQL rolling avg - # "AVG(anomaly_precipitation) OVER ( - # ORDER BY day_of_year - # ROWS BETWEEN 30 PRECEDING AND 1 PRECEDING)" - - # sanity checks - # mf <- memdb_frame(x = 1:100, y = 1:100) - # mf2 <- mf |> - # window_frame(-30, -1) |> - # window_order(y) |> - # mutate(x2 = mean(x)) + summarize(lag_relative_humidity = mean(relative_humidity), + lag_temperature = mean(temperature), + lag_precipitation = mean(precipitation)) |> + ungroup() |> + mutate(year_day_of_year = date_selected) + + # overall: calculate mean across the full dataset for the days of the year covered by the lag period + overall_means <- weather_dataset |> + filter(day_of_year %in% !!lag_dates$day_of_year ) |> + group_by(x, y) |> + summarize(overall_relative_humidity = mean(relative_humidity), + overall_temperature = mean(temperature), + overall_precipitation = mean(precipitation)) |> + ungroup() |> + mutate(year_day_of_year = date_selected) + + + #TODO + # join the lagged mean with the historical mean and calculate anomoly + # join with the data from the actual day, one row for each x, y, selected date + + }) } diff --git a/R/random_select_model_dates.R b/R/random_select_model_dates.R deleted file mode 100644 index e40108f..0000000 --- a/R/random_select_model_dates.R +++ /dev/null @@ -1,30 +0,0 @@ -#' .. content for \description{} (no empty lines) .. -#' -#' .. content for \details{} .. -#' -#' @title -#' @param start_year -#' @param end_year -#' @param n_per_month -#' @return -#' @author Emma Mendelsohn -#' @export -random_select_model_dates <- function(start_year, end_year, n_per_month, seed = 1) { - - set.seed(seed) - - # Create a vector of dates from January 2005 to December 2022 - dates <- seq(as.Date(paste0(start_year,"-01-01")), as.Date(paste0(end_year, "-12-31")), by = "day") - - tibble(date = dates) |> - mutate(year = format(dates, "%Y"), - month = format(date, "%m"), - day = format(dates, "%d")) |> - group_by(month, year) |> - sample_n(size = n_per_month, replace = FALSE) |> - ungroup() |> - mutate(across(c(year, month, day), as.integer)) |> - arrange(year, month, day) - - -} diff --git a/R/set_model_dates.R b/R/set_model_dates.R new file mode 100644 index 0000000..a19df7d --- /dev/null +++ b/R/set_model_dates.R @@ -0,0 +1,35 @@ +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param start_year +#' @param end_year +#' @param n_per_month +#' @return +#' @author Emma Mendelsohn +#' @export +set_model_dates <- function(start_year, end_year, n_per_month, lag_intervals, seed = 1) { + + set.seed(seed) + + # Create a vector of dates from January 2005 to December 2022 + dates <- seq(as.Date(paste0(start_year,"-01-01")), as.Date(paste0(end_year, "-12-31")), by = "day") + + model_dates <-tibble(date = dates) |> + mutate(year = format(dates, "%Y"), + month = format(dates, "%m"), + day = format(dates, "%d"), + day_of_year = format(dates, "%j")) |> + mutate(across(c(year, month, day, day_of_year), as.integer)) |> + mutate(year_day_of_year = paste(year, day_of_year, sep = "_")) |> + group_by(month, year) |> + mutate(select_date = row_number() %in% sample(n(), n_per_month)) |> + ungroup() |> + arrange(year, month, day) + + # make sure we always have enough days to calculate lags + model_dates$select_date[1:max(lag_intervals)] <- FALSE + + return(model_dates) +} diff --git a/_targets.R b/_targets.R index a758cf8..6a50339 100644 --- a/_targets.R +++ b/_targets.R @@ -286,12 +286,13 @@ dynamic_targets <- tar_plan( # Data Processing ----------------------------------------------------------- data_targets <- tar_plan( - tar_target(model_dates_random_select, random_select_model_dates(start_year = 2006, end_year = 2022, n_per_month = 2, seed = 212)), + tar_target(lag_intervals, c(30, 60, 90)), + tar_target(model_dates, set_model_dates(start_year = 2005, end_year = 2022, n_per_month = 2, lag_intervals, seed = 212)), # TODO take nasa_weather_directory_dataset and do full lag calcs in this function using duckdb, then collect into memory tar_target(weather_data, process_weather_data(nasa_weather_directory_dataset, nasa_weather_dataset, # enforce dependency - model_dates_random_select)), + model_dates)), # tar_target(ndvi_data, process_ndvi_data(sentinel_ndvi_directory_dataset, sentinel_ndvi_dataset, model_dates_random_select)) ) diff --git a/_targets/meta/meta b/_targets/meta/meta index 9afe758..1b57ea0 100644 --- a/_targets/meta/meta +++ b/_targets/meta/meta @@ -1,5 +1,5 @@ name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error -.Random.seed|object|03fa9cad30a33312||||||||||||||| +.Random.seed|object|adfbe9c01236ca30||||||||||||||| all_targets|function|2dda5afbd1f92385||||||||||||||| aws_bucket|object|d9cf2c5ff7cc1be4||||||||||||||| cache_aws_branched_target|function|6e2abfa4969de1bf||||||||||||||| @@ -18,7 +18,7 @@ create_modis_ndvi_dataset|function|bb9bcd506ae906bd||||||||||||||| create_nasa_weather_dataset|function|c12b134b7be25c25||||||||||||||| create_raster_template_plot|function|db738156a3247831||||||||||||||| create_sentinel_ndvi_dataset|function|201d4eaf8c87d0c3||||||||||||||| -data_targets|object|77e0b70e7dba2755||||||||||||||| +data_targets|object|20a3ace986c6d250||||||||||||||| define_bounding_boxes|function|e614caacc0592e73||||||||||||||| define_country_regions|function|54808365a1bb460e||||||||||||||| deploy_targets|object|1eb1bc8d77111ded||||||||||||||| @@ -179,8 +179,10 @@ get_sentinel_ndvi_api_parameters|function|ec6ebe60c7637311||||||||||||||| get_wahis_rvf_outbreaks_raw|function|a8564ce9127c9c1d||||||||||||||| get_weather_anomalies|function|1956aa290dc4fc5d||||||||||||||| get_weather_data|function|1956aa290dc4fc5d||||||||||||||| +lag_intervals|stem|f4c9e8a4d588925c|6b4f81cd41a7b83e|a3dad144c40657ed|1055089432|bucket=open-rvfcast-data*region=NULL*key=_targets/lag_intervals*endpoint=TlVMTA*version=|t19615.7794897048s||55|qs|aws|vector|||0|| make_model_data|function|df0e5631ac6d7b53||||||||||||||| -model_dates_random_select|stem|42d120b8aa048c60|5719b67ece4afca5|4ea98346634ca2b4|-368262049|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates_random_select*endpoint=TlVMTA*version=|t19614.8223630127s||6256|qs|aws|vector|||0.037|| +model_dates|stem|27f5b73d53fcb5e3|7646f39f9ba1810e|a8bb283b4980c84f|-204838086|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates*endpoint=TlVMTA*version=|t19615.7807563168s||80537|qs|aws|vector|||0.052|| +model_dates_random_select|stem|ae6d12f9d280efdc|5719b67ece4afca5|41d39307354de6b6|-368262049|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates_random_select*endpoint=TlVMTA*version=|t19615.75211742s||76313|qs|aws|vector|||0.04|| model_targets|object|1eb1bc8d77111ded||||||||||||||| modis_directory|stem|0404b408f5e5efef|c985137dd9b95cd4|ef46db3751d8e999|-671711443|bucket=project-dtra-ml-main*region=NULL*key=open-rvfcast/_targets/modis_directory*endpoint=TlVMTA*version=qdLdze87LwJZuMPusz2ovPXxe2rabGWb|t19493.6071375984s||55|qs|aws|vector|||0.001|| modis_ndvi_bundle|stem|error|a6c770fab6751fac|9d56e94e8363274f|-1456098296|bucket=open-rvfcast-data*region=NULL*key=_targets/modis_ndvi_bundle*endpoint=TlVMTA*version=|t19605.5645307177s||30|qs|aws|vector|||0.33||object task_id not found @@ -3940,8 +3942,8 @@ preprocess_ecmwf_forecasts|function|033bd8a3c45b4d46||||||||||||||| preprocess_nasa_weather|function|f5c92fafb420500d||||||||||||||| preprocess_wahis_rvf_outbreaks|function|1739270cf02b72d6||||||||||||||| process_ndvi_data|function|8a56ce9bd504bbec||||||||||||||| -process_weather_data|function|cfab097511ff525b||||||||||||||| -random_select_model_dates|function|9ff4c8b11c4c6a0f||||||||||||||| +process_weather_data|function|6dbc7677081a102e||||||||||||||| +random_select_model_dates|function|75d79de28b5c2e87||||||||||||||| read_transform_raster|function|f7518264efa394ed||||||||||||||| report_targets|object|1eb1bc8d77111ded||||||||||||||| rolling_box|function|1195d491d98c697b||||||||||||||| @@ -4964,6 +4966,7 @@ sentinel_ndvi_transformed_rasters_fefd5c9f|branch|891b7ee44128fe79|7490e5717bc38 sentinel_ndvi_transformed_upload_aws_s3|stem|138c6bf81e1f5512|14599b5def8fdb12|a880a8df380b19ae|1391835690|bucket=open-rvfcast-data*region=NULL*key=_targets/sentinel_ndvi_transformed_upload_aws_s3*endpoint=TlVMTA*version=|t19520.9852950252s||19579|qs|aws|vector|||159.826|| sentinel_ndvi_upload_aws_s3|stem|1a358ba66cfb3375|b9c6a884566d5476|826f421e76bc51b5|-1245746504|bucket=open-rvfcast-data*region=NULL*key=_targets/sentinel_ndvi_upload_aws_s3*endpoint=TlVMTA*version=|t19517.6998651133s||20237|qs|aws|vector|||397.502|| set_ecmwf_api_parameter|function|e3e4962883690ed5||||||||||||||| +set_model_dates|function|b0923ca118fec651||||||||||||||| set_nasa_api_parameter|function|ac1cc420c9c9242c||||||||||||||| static_targets|object|cbd54d1aeedd375e||||||||||||||| submit_modis_ndvi_bundle_request|function|70d5dcdcf3510fa0||||||||||||||| @@ -4974,7 +4977,7 @@ test_targets|object|1eb1bc8d77111ded||||||||||||||| transform_nasa_weather|function|e80c244fb32ef2bd||||||||||||||| transform_raster|function|47f20ba2b9ef9722||||||||||||||| transform_sentinel_ndvi|function|92a19330c7f2bff2||||||||||||||| -user_rprof|object|a66bd50c4c539996||||||||||||||| +user_rprof|object|2a027f145a5a1891||||||||||||||| wahis_rvf_outbreaks_preprocessed|stem|30ccd988b415d773|3ea98184b5887c93|275a59d310ff2a63|2127878318|bucket=open-rvfcast-data*region=NULL*key=_targets/wahis_rvf_outbreaks_preprocessed*endpoint=TlVMTA*version=|t19517.6952212142s||172965|qs|aws|vector|||0.043|| wahis_rvf_outbreaks_raw|stem|6fc7e6c7238977b3|b988ec4215d4213c|5ed4661ae3efb1aa|1933416983|bucket=open-rvfcast-data*region=NULL*key=_targets/wahis_rvf_outbreaks_raw*endpoint=TlVMTA*version=|t19517.6952047733s||173410|qs|aws|vector|||29.629|| wahis_rvf_query|function|9836433f6f1061fb|||||||||||||||