From 4821e081cf1394b6cd8c14d97c298f9fbbe0c656 Mon Sep 17 00:00:00 2001 From: Emma Mendelsohn Date: Tue, 26 Sep 2023 09:44:21 -0400 Subject: [PATCH] slimming down the weather data to increase speed --- R/process_weather_data.R | 33 +++++++++----------- R/set_model_dates.R | 1 - _targets.R | 6 ++-- _targets/meta/meta | 21 +++++++------ data/nasa_weather_anomalies_dataset/.gitkeep | 0 5 files changed, 30 insertions(+), 31 deletions(-) create mode 100644 data/nasa_weather_anomalies_dataset/.gitkeep diff --git a/R/process_weather_data.R b/R/process_weather_data.R index 3aaeb3a..389a9b9 100644 --- a/R/process_weather_data.R +++ b/R/process_weather_data.R @@ -33,9 +33,8 @@ process_weather_data <- function(nasa_weather_dataset, # enforce dependency # TODO this could go into create_nasa_weather_dataset() to avoid repeating it on each branch weather_dataset <- weather_dataset |> mutate(across(c(year, month, day, day_of_year), as.integer)) |> - mutate(year_day_of_year = paste(year, day_of_year, sep = "_")) |> mutate(date = lubridate::make_date(year, month, day)) |> - select(x, y, date, year, month, day, day_of_year, year_day_of_year, relative_humidity, temperature, precipitation) + select(x, y, date, day_of_year, relative_humidity, temperature, precipitation) # generate the weather dataset - get the lagged anomolies for selected dates # map over the lag intervals @@ -51,9 +50,9 @@ process_weather_data <- function(nasa_weather_dataset, # enforce dependency lagged_means <- weather_dataset |> filter(date %in% !!lag_dates$date) |> group_by(x, y) |> - summarize(!!paste0("lag_relative_humidity_", end) := mean(relative_humidity), - !!paste0("lag_temperature_", end) := mean(temperature), - !!paste0("lag_precipitation_", end) := mean(precipitation)) |> + summarize(lag_relative_humidity = mean(relative_humidity), + lag_temperature = mean(temperature), + lag_precipitation = mean(precipitation)) |> ungroup() # historical: calculate mean across the full dataset for the days of the year covered by the lag period @@ -63,26 +62,24 @@ process_weather_data <- function(nasa_weather_dataset, # enforce dependency historical_means <- weather_dataset |> filter(day_of_year %in% !!lag_dates$day_of_year ) |> group_by(x, y) |> - summarize(!!paste0("historical_relative_humidity_", end) := mean(relative_humidity), - !!paste0("historical_temperature_", end) := mean(temperature), - !!paste0("historical_precipitation_", end) := mean(precipitation)) |> + summarize(historical_relative_humidity = mean(relative_humidity), + historical_temperature = mean(temperature), + historical_precipitation = mean(precipitation)) |> ungroup() # anomaly full_join(lagged_means, historical_means, by = c("x", "y")) |> - mutate(!!paste0("anomaly_relative_humidity_", end) := !!sym(paste0("lag_relative_humidity_", end)) - !!sym(paste0("historical_relative_humidity_", end)), - !!paste0("anomaly_temperature_", end) := !!sym(paste0("lag_temperature_", end)) - !!sym(paste0("historical_temperature_", end)), - !!paste0("anomaly_precipitation_", end) := !!sym(paste0("lag_precipitation_", end)) - !!sym(paste0("historical_precipitation_", end))) + mutate(!!paste0("anomaly_relative_humidity_", end) := lag_relative_humidity - historical_relative_humidity, + !!paste0("anomaly_temperature_", end) := lag_temperature - historical_temperature, + !!paste0("anomaly_precipitation_", end) := lag_precipitation - historical_precipitation) |> + select(-starts_with("lag"), -starts_with("historical")) }) |> - reduce(left_join, by = c("x", "y")) - - # get selected day info and pull in all calculated data - date_selected_all_dat <- weather_dataset |> - filter(date == !!date_selected) |> - full_join(anomalies, by = c("x", "y")) + reduce(left_join, by = c("x", "y")) |> + mutate(date = date_selected) |> + relocate(date) # Save as parquet - write_dataset(date_selected_all_dat, here::here(nasa_weather_anomalies_directory_dataset, save_filename), compression = "gzip", compression_level = 5) + write_parquet(anomalies, here::here(nasa_weather_anomalies_directory_dataset, save_filename), compression = "gzip", compression_level = 5) return(file.path(nasa_weather_anomalies_directory_dataset, save_filename)) diff --git a/R/set_model_dates.R b/R/set_model_dates.R index a19df7d..d6286f5 100644 --- a/R/set_model_dates.R +++ b/R/set_model_dates.R @@ -22,7 +22,6 @@ set_model_dates <- function(start_year, end_year, n_per_month, lag_intervals, se day = format(dates, "%d"), day_of_year = format(dates, "%j")) |> mutate(across(c(year, month, day, day_of_year), as.integer)) |> - mutate(year_day_of_year = paste(year, day_of_year, sep = "_")) |> group_by(month, year) |> mutate(select_date = row_number() %in% sample(n(), n_per_month)) |> ungroup() |> diff --git a/_targets.R b/_targets.R index 528dcdf..0ffa37d 100644 --- a/_targets.R +++ b/_targets.R @@ -21,6 +21,8 @@ tar_option_set(resources = tar_resources( workspace_on_error = TRUE # allows interactive session for failed branches ) +future::plan(future::multisession, workers = 2) + # Static Data Download ---------------------------------------------------- static_targets <- tar_plan( @@ -293,7 +295,6 @@ data_targets <- tar_plan( tar_target(nasa_weather_anomalies_directory_dataset, create_data_directory(directory_path = "data/nasa_weather_anomalies_dataset")), - # TODO take nasa_weather_directory_dataset and do full lag calcs in this function using duckdb, then collect into memory tar_target(weather_data, process_weather_data(nasa_weather_dataset, # enforce dependency nasa_weather_directory_dataset, nasa_weather_anomalies_directory_dataset, @@ -301,10 +302,11 @@ data_targets <- tar_plan( model_dates_selected, lag_intervals, overwrite = FALSE), - pattern = head(model_dates_selected, 2), + pattern = head(model_dates_selected, 20), format = "file", repository = "local", cue = tar_cue("thorough")), + # at 10 min per date, this would take 4260 minutes = 71 hours = 3 days when run sequentially # tar_target(ndvi_data, process_ndvi_data(sentinel_ndvi_directory_dataset, sentinel_ndvi_dataset, model_dates_random_select)) diff --git a/_targets/meta/meta b/_targets/meta/meta index 07285f2..43f69c6 100644 --- a/_targets/meta/meta +++ b/_targets/meta/meta @@ -1,5 +1,5 @@ name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error -.Random.seed|object|6f06c0f055e30ee3||||||||||||||| +.Random.seed|object|ebc99db6b4faa65f||||||||||||||| all_targets|function|2dda5afbd1f92385||||||||||||||| aws_bucket|object|d9cf2c5ff7cc1be4||||||||||||||| cache_aws_branched_target|function|6e2abfa4969de1bf||||||||||||||| @@ -18,7 +18,7 @@ create_modis_ndvi_dataset|function|bb9bcd506ae906bd||||||||||||||| create_nasa_weather_dataset|function|c12b134b7be25c25||||||||||||||| create_raster_template_plot|function|db738156a3247831||||||||||||||| create_sentinel_ndvi_dataset|function|201d4eaf8c87d0c3||||||||||||||| -data_targets|object|aad6767c991d4096||||||||||||||| +data_targets|object|6158d9dd44d512c7||||||||||||||| define_bounding_boxes|function|e614caacc0592e73||||||||||||||| define_country_regions|function|54808365a1bb460e||||||||||||||| deploy_targets|object|1eb1bc8d77111ded||||||||||||||| @@ -181,9 +181,9 @@ get_weather_anomalies|function|1956aa290dc4fc5d||||||||||||||| get_weather_data|function|1956aa290dc4fc5d||||||||||||||| lag_intervals|stem|f4c9e8a4d588925c|6b4f81cd41a7b83e|a3dad144c40657ed|1055089432|bucket=open-rvfcast-data*region=NULL*key=_targets/lag_intervals*endpoint=TlVMTA*version=|t19615.7794897048s||55|qs|aws|vector|||0|| make_model_data|function|df0e5631ac6d7b53||||||||||||||| -model_dates|stem|27f5b73d53fcb5e3|7646f39f9ba1810e|a8bb283b4980c84f|-204838086|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates*endpoint=TlVMTA*version=|t19615.7807563168s||80537|qs|aws|vector|||0.052|| +model_dates|stem|f16700eb9a3ebc85|7646f39f9ba1810e|956c3fce31974cf0|-204838086|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates*endpoint=TlVMTA*version=|t19626.0713514342s||34752|qs|aws|vector|||0.045|| model_dates_random_select|stem|ae6d12f9d280efdc|5719b67ece4afca5|41d39307354de6b6|-368262049|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates_random_select*endpoint=TlVMTA*version=|t19615.75211742s||76313|qs|aws|vector|||0.04|| -model_dates_selected|stem|6c3acddd1dd8ab94|56502ed0c5bd7ce1|5ab6e5886391faa2|-1898424287|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates_selected*endpoint=TlVMTA*version=|t19625.7811853969s||2910|qs|aws|vector|||0.013|| +model_dates_selected|stem|6c3acddd1dd8ab94|56502ed0c5bd7ce1|2b876b87f1967bb9|-1898424287|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates_selected*endpoint=TlVMTA*version=|t19626.0713597157s||2910|qs|aws|vector|||0.003|| model_targets|object|1eb1bc8d77111ded||||||||||||||| modis_directory|stem|0404b408f5e5efef|c985137dd9b95cd4|ef46db3751d8e999|-671711443|bucket=project-dtra-ml-main*region=NULL*key=open-rvfcast/_targets/modis_directory*endpoint=TlVMTA*version=qdLdze87LwJZuMPusz2ovPXxe2rabGWb|t19493.6071375984s||55|qs|aws|vector|||0.001|| modis_ndvi_bundle|stem|error|a6c770fab6751fac|9d56e94e8363274f|-1456098296|bucket=open-rvfcast-data*region=NULL*key=_targets/modis_ndvi_bundle*endpoint=TlVMTA*version=|t19605.5645307177s||30|qs|aws|vector|||0.33||object task_id not found @@ -3945,7 +3945,7 @@ preprocess_ecmwf_forecasts|function|033bd8a3c45b4d46||||||||||||||| preprocess_nasa_weather|function|f5c92fafb420500d||||||||||||||| preprocess_wahis_rvf_outbreaks|function|1739270cf02b72d6||||||||||||||| process_ndvi_data|function|8a56ce9bd504bbec||||||||||||||| -process_weather_data|function|62e3a04160d9cd83||||||||||||||| +process_weather_data|function|e87a43122f67858e||||||||||||||| random_select_model_dates|function|75d79de28b5c2e87||||||||||||||| read_transform_raster|function|f7518264efa394ed||||||||||||||| report_targets|object|1eb1bc8d77111ded||||||||||||||| @@ -4969,7 +4969,7 @@ sentinel_ndvi_transformed_rasters_fefd5c9f|branch|891b7ee44128fe79|7490e5717bc38 sentinel_ndvi_transformed_upload_aws_s3|stem|138c6bf81e1f5512|14599b5def8fdb12|a880a8df380b19ae|1391835690|bucket=open-rvfcast-data*region=NULL*key=_targets/sentinel_ndvi_transformed_upload_aws_s3*endpoint=TlVMTA*version=|t19520.9852950252s||19579|qs|aws|vector|||159.826|| sentinel_ndvi_upload_aws_s3|stem|1a358ba66cfb3375|b9c6a884566d5476|826f421e76bc51b5|-1245746504|bucket=open-rvfcast-data*region=NULL*key=_targets/sentinel_ndvi_upload_aws_s3*endpoint=TlVMTA*version=|t19517.6998651133s||20237|qs|aws|vector|||397.502|| set_ecmwf_api_parameter|function|e3e4962883690ed5||||||||||||||| -set_model_dates|function|b0923ca118fec651||||||||||||||| +set_model_dates|function|e64f4ff47f178768||||||||||||||| set_nasa_api_parameter|function|ac1cc420c9c9242c||||||||||||||| static_targets|object|cbd54d1aeedd375e||||||||||||||| submit_modis_ndvi_bundle_request|function|70d5dcdcf3510fa0||||||||||||||| @@ -4980,10 +4980,11 @@ test_targets|object|1eb1bc8d77111ded||||||||||||||| transform_nasa_weather|function|e80c244fb32ef2bd||||||||||||||| transform_raster|function|47f20ba2b9ef9722||||||||||||||| transform_sentinel_ndvi|function|92a19330c7f2bff2||||||||||||||| -user_rprof|object|f5d6e573fd1bc8a3||||||||||||||| +user_rprof|object|b2d9e1567041392f||||||||||||||| wahis_rvf_outbreaks_preprocessed|stem|30ccd988b415d773|3ea98184b5887c93|275a59d310ff2a63|2127878318|bucket=open-rvfcast-data*region=NULL*key=_targets/wahis_rvf_outbreaks_preprocessed*endpoint=TlVMTA*version=|t19517.6952212142s||172965|qs|aws|vector|||0.043|| wahis_rvf_outbreaks_raw|stem|6fc7e6c7238977b3|b988ec4215d4213c|5ed4661ae3efb1aa|1933416983|bucket=open-rvfcast-data*region=NULL*key=_targets/wahis_rvf_outbreaks_raw*endpoint=TlVMTA*version=|t19517.6952047733s||173410|qs|aws|vector|||29.629|| wahis_rvf_query|function|9836433f6f1061fb||||||||||||||| -weather_data|pattern|5d7636ed3ef8bce6|494243a125cbc271||655573160||||29371159|file|local|vector||weather_data_94f732f8*weather_data_5e501efa|323.933|| -weather_data_5e501efa|branch|7da64b6f42c158f3|650ca452b036d8a5|c5b75a7d4b91bf19|-504004841|data/nasa_weather_anomalies_dataset/2005-04-28.gz.parquet|t19625.8078600598s|839d3c31da852455|14917125|file|local|vector|weather_data||0|| -weather_data_94f732f8|branch|ba91c0e4320203ae|650ca452b036d8a5|c9754ab7a773d644|-562021073|data/nasa_weather_anomalies_dataset/2005-04-14.gz.parquet|t19625.8058865201s|57a671ac1fe41409|14454034|file|local|vector|weather_data||0.024|| +weather_data|pattern|97f55e00d91c5a97|650ca452b036d8a5||655573160||||15183084|file|local|vector||weather_data_5e501efa|695.411|| +weather_data_34327510|branch|4930118303ab6913|650ca452b036d8a5|f716aafd8133e127|1793884029|data/nasa_weather_anomalies_dataset/2005-05-19.gz.parquet|t19625.9314582121s|c05c09546f589fcc|39359411|file|local|vector|weather_data||2135.612|| +weather_data_5e501efa|branch|b122541af7ee8c04|650ca452b036d8a5|43f7beb891b0543f|-504004841|data/nasa_weather_anomalies_dataset/2005-04-28.gz.parquet|t19626.0949577789s|94c8f72a458d361e|15183084|file|local|vector|weather_data||695.411|| +weather_data_94f732f8|branch|8b43e134f8eed04d|650ca452b036d8a5|44ee5b3ca3f1758a|-562021073|data/nasa_weather_anomalies_dataset/2005-04-14.gz.parquet|t19626.0853789808s|820c2f04d85e32a3|12630358|file|local|vector|weather_data||382.848|| diff --git a/data/nasa_weather_anomalies_dataset/.gitkeep b/data/nasa_weather_anomalies_dataset/.gitkeep new file mode 100644 index 0000000..e69de29