Skip to content

Commit

Permalink
slimming down the weather data to increase speed
Browse files Browse the repository at this point in the history
  • Loading branch information
emmamendelsohn committed Sep 26, 2023
1 parent 8f5f378 commit 4821e08
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 31 deletions.
33 changes: 15 additions & 18 deletions R/process_weather_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,8 @@ process_weather_data <- function(nasa_weather_dataset, # enforce dependency
# TODO this could go into create_nasa_weather_dataset() to avoid repeating it on each branch
weather_dataset <- weather_dataset |>
mutate(across(c(year, month, day, day_of_year), as.integer)) |>
mutate(year_day_of_year = paste(year, day_of_year, sep = "_")) |>
mutate(date = lubridate::make_date(year, month, day)) |>
select(x, y, date, year, month, day, day_of_year, year_day_of_year, relative_humidity, temperature, precipitation)
select(x, y, date, day_of_year, relative_humidity, temperature, precipitation)

# generate the weather dataset - get the lagged anomalies for selected dates
# map over the lag intervals
Expand All @@ -51,9 +50,9 @@ process_weather_data <- function(nasa_weather_dataset, # enforce dependency
lagged_means <- weather_dataset |>
filter(date %in% !!lag_dates$date) |>
group_by(x, y) |>
summarize(!!paste0("lag_relative_humidity_", end) := mean(relative_humidity),
!!paste0("lag_temperature_", end) := mean(temperature),
!!paste0("lag_precipitation_", end) := mean(precipitation)) |>
summarize(lag_relative_humidity = mean(relative_humidity),
lag_temperature = mean(temperature),
lag_precipitation = mean(precipitation)) |>
ungroup()

# historical: calculate mean across the full dataset for the days of the year covered by the lag period
Expand All @@ -63,26 +62,24 @@ process_weather_data <- function(nasa_weather_dataset, # enforce dependency
historical_means <- weather_dataset |>
filter(day_of_year %in% !!lag_dates$day_of_year ) |>
group_by(x, y) |>
summarize(!!paste0("historical_relative_humidity_", end) := mean(relative_humidity),
!!paste0("historical_temperature_", end) := mean(temperature),
!!paste0("historical_precipitation_", end) := mean(precipitation)) |>
summarize(historical_relative_humidity = mean(relative_humidity),
historical_temperature = mean(temperature),
historical_precipitation = mean(precipitation)) |>
ungroup()

# anomaly
full_join(lagged_means, historical_means, by = c("x", "y")) |>
mutate(!!paste0("anomaly_relative_humidity_", end) := !!sym(paste0("lag_relative_humidity_", end)) - !!sym(paste0("historical_relative_humidity_", end)),
!!paste0("anomaly_temperature_", end) := !!sym(paste0("lag_temperature_", end)) - !!sym(paste0("historical_temperature_", end)),
!!paste0("anomaly_precipitation_", end) := !!sym(paste0("lag_precipitation_", end)) - !!sym(paste0("historical_precipitation_", end)))
mutate(!!paste0("anomaly_relative_humidity_", end) := lag_relative_humidity - historical_relative_humidity,
!!paste0("anomaly_temperature_", end) := lag_temperature - historical_temperature,
!!paste0("anomaly_precipitation_", end) := lag_precipitation - historical_precipitation) |>
select(-starts_with("lag"), -starts_with("historical"))
}) |>
reduce(left_join, by = c("x", "y"))

# get selected day info and pull in all calculated data
date_selected_all_dat <- weather_dataset |>
filter(date == !!date_selected) |>
full_join(anomalies, by = c("x", "y"))
reduce(left_join, by = c("x", "y")) |>
mutate(date = date_selected) |>
relocate(date)

# Save as parquet
write_dataset(date_selected_all_dat, here::here(nasa_weather_anomalies_directory_dataset, save_filename), compression = "gzip", compression_level = 5)
write_parquet(anomalies, here::here(nasa_weather_anomalies_directory_dataset, save_filename), compression = "gzip", compression_level = 5)

return(file.path(nasa_weather_anomalies_directory_dataset, save_filename))

Expand Down
1 change: 0 additions & 1 deletion R/set_model_dates.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ set_model_dates <- function(start_year, end_year, n_per_month, lag_intervals, se
day = format(dates, "%d"),
day_of_year = format(dates, "%j")) |>
mutate(across(c(year, month, day, day_of_year), as.integer)) |>
mutate(year_day_of_year = paste(year, day_of_year, sep = "_")) |>
group_by(month, year) |>
mutate(select_date = row_number() %in% sample(n(), n_per_month)) |>
ungroup() |>
Expand Down
6 changes: 4 additions & 2 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ tar_option_set(resources = tar_resources(
workspace_on_error = TRUE # allows interactive session for failed branches
)

future::plan(future::multisession, workers = 2)

# Static Data Download ----------------------------------------------------
static_targets <- tar_plan(

Expand Down Expand Up @@ -293,18 +295,18 @@ data_targets <- tar_plan(
tar_target(nasa_weather_anomalies_directory_dataset,
create_data_directory(directory_path = "data/nasa_weather_anomalies_dataset")),

# TODO take nasa_weather_directory_dataset and do full lag calcs in this function using duckdb, then collect into memory
tar_target(weather_data, process_weather_data(nasa_weather_dataset, # enforce dependency
nasa_weather_directory_dataset,
nasa_weather_anomalies_directory_dataset,
model_dates,
model_dates_selected,
lag_intervals,
overwrite = FALSE),
pattern = head(model_dates_selected, 2),
pattern = head(model_dates_selected, 20),
format = "file",
repository = "local",
cue = tar_cue("thorough")),
# at 10 min per date, this would take 4260 minutes = 71 hours = 3 days when run sequentially

# tar_target(ndvi_data, process_ndvi_data(sentinel_ndvi_directory_dataset, sentinel_ndvi_dataset, model_dates_random_select))

Expand Down
21 changes: 11 additions & 10 deletions _targets/meta/meta
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error
.Random.seed|object|6f06c0f055e30ee3|||||||||||||||
.Random.seed|object|ebc99db6b4faa65f|||||||||||||||
all_targets|function|2dda5afbd1f92385|||||||||||||||
aws_bucket|object|d9cf2c5ff7cc1be4|||||||||||||||
cache_aws_branched_target|function|6e2abfa4969de1bf|||||||||||||||
Expand All @@ -18,7 +18,7 @@ create_modis_ndvi_dataset|function|bb9bcd506ae906bd|||||||||||||||
create_nasa_weather_dataset|function|c12b134b7be25c25|||||||||||||||
create_raster_template_plot|function|db738156a3247831|||||||||||||||
create_sentinel_ndvi_dataset|function|201d4eaf8c87d0c3|||||||||||||||
data_targets|object|aad6767c991d4096|||||||||||||||
data_targets|object|6158d9dd44d512c7|||||||||||||||
define_bounding_boxes|function|e614caacc0592e73|||||||||||||||
define_country_regions|function|54808365a1bb460e|||||||||||||||
deploy_targets|object|1eb1bc8d77111ded|||||||||||||||
Expand Down Expand Up @@ -181,9 +181,9 @@ get_weather_anomalies|function|1956aa290dc4fc5d|||||||||||||||
get_weather_data|function|1956aa290dc4fc5d|||||||||||||||
lag_intervals|stem|f4c9e8a4d588925c|6b4f81cd41a7b83e|a3dad144c40657ed|1055089432|bucket=open-rvfcast-data*region=NULL*key=_targets/lag_intervals*endpoint=TlVMTA*version=|t19615.7794897048s||55|qs|aws|vector|||0||
make_model_data|function|df0e5631ac6d7b53|||||||||||||||
model_dates|stem|27f5b73d53fcb5e3|7646f39f9ba1810e|a8bb283b4980c84f|-204838086|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates*endpoint=TlVMTA*version=|t19615.7807563168s||80537|qs|aws|vector|||0.052||
model_dates|stem|f16700eb9a3ebc85|7646f39f9ba1810e|956c3fce31974cf0|-204838086|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates*endpoint=TlVMTA*version=|t19626.0713514342s||34752|qs|aws|vector|||0.045||
model_dates_random_select|stem|ae6d12f9d280efdc|5719b67ece4afca5|41d39307354de6b6|-368262049|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates_random_select*endpoint=TlVMTA*version=|t19615.75211742s||76313|qs|aws|vector|||0.04||
model_dates_selected|stem|6c3acddd1dd8ab94|56502ed0c5bd7ce1|5ab6e5886391faa2|-1898424287|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates_selected*endpoint=TlVMTA*version=|t19625.7811853969s||2910|qs|aws|vector|||0.013||
model_dates_selected|stem|6c3acddd1dd8ab94|56502ed0c5bd7ce1|2b876b87f1967bb9|-1898424287|bucket=open-rvfcast-data*region=NULL*key=_targets/model_dates_selected*endpoint=TlVMTA*version=|t19626.0713597157s||2910|qs|aws|vector|||0.003||
model_targets|object|1eb1bc8d77111ded|||||||||||||||
modis_directory|stem|0404b408f5e5efef|c985137dd9b95cd4|ef46db3751d8e999|-671711443|bucket=project-dtra-ml-main*region=NULL*key=open-rvfcast/_targets/modis_directory*endpoint=TlVMTA*version=qdLdze87LwJZuMPusz2ovPXxe2rabGWb|t19493.6071375984s||55|qs|aws|vector|||0.001||
modis_ndvi_bundle|stem|error|a6c770fab6751fac|9d56e94e8363274f|-1456098296|bucket=open-rvfcast-data*region=NULL*key=_targets/modis_ndvi_bundle*endpoint=TlVMTA*version=|t19605.5645307177s||30|qs|aws|vector|||0.33||object task_id not found
Expand Down Expand Up @@ -3945,7 +3945,7 @@ preprocess_ecmwf_forecasts|function|033bd8a3c45b4d46|||||||||||||||
preprocess_nasa_weather|function|f5c92fafb420500d|||||||||||||||
preprocess_wahis_rvf_outbreaks|function|1739270cf02b72d6|||||||||||||||
process_ndvi_data|function|8a56ce9bd504bbec|||||||||||||||
process_weather_data|function|62e3a04160d9cd83|||||||||||||||
process_weather_data|function|e87a43122f67858e|||||||||||||||
random_select_model_dates|function|75d79de28b5c2e87|||||||||||||||
read_transform_raster|function|f7518264efa394ed|||||||||||||||
report_targets|object|1eb1bc8d77111ded|||||||||||||||
Expand Down Expand Up @@ -4969,7 +4969,7 @@ sentinel_ndvi_transformed_rasters_fefd5c9f|branch|891b7ee44128fe79|7490e5717bc38
sentinel_ndvi_transformed_upload_aws_s3|stem|138c6bf81e1f5512|14599b5def8fdb12|a880a8df380b19ae|1391835690|bucket=open-rvfcast-data*region=NULL*key=_targets/sentinel_ndvi_transformed_upload_aws_s3*endpoint=TlVMTA*version=|t19520.9852950252s||19579|qs|aws|vector|||159.826||
sentinel_ndvi_upload_aws_s3|stem|1a358ba66cfb3375|b9c6a884566d5476|826f421e76bc51b5|-1245746504|bucket=open-rvfcast-data*region=NULL*key=_targets/sentinel_ndvi_upload_aws_s3*endpoint=TlVMTA*version=|t19517.6998651133s||20237|qs|aws|vector|||397.502||
set_ecmwf_api_parameter|function|e3e4962883690ed5|||||||||||||||
set_model_dates|function|b0923ca118fec651|||||||||||||||
set_model_dates|function|e64f4ff47f178768|||||||||||||||
set_nasa_api_parameter|function|ac1cc420c9c9242c|||||||||||||||
static_targets|object|cbd54d1aeedd375e|||||||||||||||
submit_modis_ndvi_bundle_request|function|70d5dcdcf3510fa0|||||||||||||||
Expand All @@ -4980,10 +4980,11 @@ test_targets|object|1eb1bc8d77111ded|||||||||||||||
transform_nasa_weather|function|e80c244fb32ef2bd|||||||||||||||
transform_raster|function|47f20ba2b9ef9722|||||||||||||||
transform_sentinel_ndvi|function|92a19330c7f2bff2|||||||||||||||
user_rprof|object|f5d6e573fd1bc8a3|||||||||||||||
user_rprof|object|b2d9e1567041392f|||||||||||||||
wahis_rvf_outbreaks_preprocessed|stem|30ccd988b415d773|3ea98184b5887c93|275a59d310ff2a63|2127878318|bucket=open-rvfcast-data*region=NULL*key=_targets/wahis_rvf_outbreaks_preprocessed*endpoint=TlVMTA*version=|t19517.6952212142s||172965|qs|aws|vector|||0.043||
wahis_rvf_outbreaks_raw|stem|6fc7e6c7238977b3|b988ec4215d4213c|5ed4661ae3efb1aa|1933416983|bucket=open-rvfcast-data*region=NULL*key=_targets/wahis_rvf_outbreaks_raw*endpoint=TlVMTA*version=|t19517.6952047733s||173410|qs|aws|vector|||29.629||
wahis_rvf_query|function|9836433f6f1061fb|||||||||||||||
weather_data|pattern|5d7636ed3ef8bce6|494243a125cbc271||655573160||||29371159|file|local|vector||weather_data_94f732f8*weather_data_5e501efa|323.933||
weather_data_5e501efa|branch|7da64b6f42c158f3|650ca452b036d8a5|c5b75a7d4b91bf19|-504004841|data/nasa_weather_anomalies_dataset/2005-04-28.gz.parquet|t19625.8078600598s|839d3c31da852455|14917125|file|local|vector|weather_data||0||
weather_data_94f732f8|branch|ba91c0e4320203ae|650ca452b036d8a5|c9754ab7a773d644|-562021073|data/nasa_weather_anomalies_dataset/2005-04-14.gz.parquet|t19625.8058865201s|57a671ac1fe41409|14454034|file|local|vector|weather_data||0.024||
weather_data|pattern|97f55e00d91c5a97|650ca452b036d8a5||655573160||||15183084|file|local|vector||weather_data_5e501efa|695.411||
weather_data_34327510|branch|4930118303ab6913|650ca452b036d8a5|f716aafd8133e127|1793884029|data/nasa_weather_anomalies_dataset/2005-05-19.gz.parquet|t19625.9314582121s|c05c09546f589fcc|39359411|file|local|vector|weather_data||2135.612||
weather_data_5e501efa|branch|b122541af7ee8c04|650ca452b036d8a5|43f7beb891b0543f|-504004841|data/nasa_weather_anomalies_dataset/2005-04-28.gz.parquet|t19626.0949577789s|94c8f72a458d361e|15183084|file|local|vector|weather_data||695.411||
weather_data_94f732f8|branch|8b43e134f8eed04d|650ca452b036d8a5|44ee5b3ca3f1758a|-562021073|data/nasa_weather_anomalies_dataset/2005-04-14.gz.parquet|t19626.0853789808s|820c2f04d85e32a3|12630358|file|local|vector|weather_data||382.848||
Empty file.

0 comments on commit 4821e08

Please sign in to comment.