Feature/ndvi anomalies #65

Merged
merged 7 commits on Oct 30, 2023
83 changes: 83 additions & 0 deletions R/calculate_ndvi_anomalies.R
@@ -0,0 +1,83 @@
#' Calculate NDVI anomalies for a selected model date
#'
#' For each lag interval, calculates the per-pixel mean NDVI over the preceding
#' days, compares it to the historical day-of-year mean, and saves raw and
#' scaled anomalies as a parquet file.
#'
#' @param ndvi_date_lookup lookup table mapping each NDVI interval (and the dates it covers) to a transformed NDVI file
#' @param ndvi_historical_means paths to the historical day-of-year NDVI mean/sd parquet files
#' @param ndvi_anomalies_directory directory in which to save the anomalies parquet file
#' @param model_dates tibble of model dates with day_of_year
#' @param model_dates_selected the model date for which to calculate anomalies
#' @param lag_intervals endpoints of the lag windows preceding the selected date
#' @param overwrite whether to recalculate if the output file already exists
#' @return Path to the saved parquet file
#' @author Emma Mendelsohn
#' @export
calculate_ndvi_anomalies <- function(ndvi_date_lookup, ndvi_historical_means,
ndvi_anomalies_directory, model_dates,
model_dates_selected, lag_intervals,
overwrite = FALSE) {

# Set filename
date_selected <- model_dates_selected
save_filename <- glue::glue("ndvi_anomaly_{date_selected}.gz.parquet")
message(paste0("Calculating NDVI anomalies for ", date_selected))

# Check if file already exists
existing_files <- list.files(ndvi_anomalies_directory)
if(save_filename %in% existing_files && !overwrite) {
message("file already exists, skipping calculation")
return(file.path(ndvi_anomalies_directory, save_filename))
}

# Get historical means for DOY
doy <- model_dates |> filter(date == date_selected) |> pull(day_of_year)
doy_frmt <- str_pad(doy, width = 3, side = "left", pad = "0")
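# Illustrative example (assumed value): day_of_year 5 is padded to "005", which
# matches the historical_ndvi_mean_doy_005.gz.parquet file written by
# calculate_ndvi_historical_means()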
historical_means <- read_parquet(ndvi_historical_means[str_detect(ndvi_historical_means, doy_frmt)]) |>
select(-day_of_year)

# Get the lagged anomalies for selected dates, mapping over the lag intervals
row_select <- which(model_dates$date == date_selected)

lag_intervals_start <- c(1, 1 + lag_intervals[-length(lag_intervals)])
lag_intervals_end <- lag_intervals
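# Illustrative example (assumed values): with lag_intervals = c(30, 60, 90),
# lag_intervals_start is c(1, 31, 61) and lag_intervals_end is c(30, 60, 90),
# i.e. three non-overlapping lag windows over the preceding model_dates rows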

anomalies <- map2(lag_intervals_start, lag_intervals_end, function(start, end){

lag_dates <- model_dates |> slice((row_select - start):(row_select - end))

# get files and weights for the calculations
weights <- ndvi_date_lookup |>
mutate(lag_date = map(lookup_dates, ~. %in% lag_dates$date)) |>
mutate(weight = unlist(map(lag_date, sum))) |>
filter(weight > 0) |>
select(start_date, filename, weight)
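# weight is the number of lag-window days that fall within each NDVI interval;
# intervals that do not overlap the lag window are dropped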

ndvi_dataset <- open_dataset(weights$filename)

# Lag: calculate mean by pixel for the preceding x days
lagged_means <- ndvi_dataset |>
left_join(weights |> select(-filename), by = "start_date") |>
group_by(x, y) |>
summarize(lag_ndvi_mean = sum(ndvi * weight) / sum(weight)) |>
ungroup()
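# i.e. a per-pixel average of interval NDVI values, weighted by the number of
# lag-window days each interval covers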

# Join in historical means to calculate raw and scaled anomalies
full_join(lagged_means, historical_means, by = c("x", "y")) |>
mutate(!!paste0("anomaly_ndvi_", end) := lag_ndvi_mean - historical_ndvi_mean,
!!paste0("anomaly_ndvi_scaled_", end) := (lag_ndvi_mean - historical_ndvi_mean)/historical_ndvi_sd) |>
select(-starts_with("lag"), -starts_with("historical"))
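# With the illustrative lag_intervals above, end = 30 would produce columns
# anomaly_ndvi_30 and anomaly_ndvi_scaled_30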
}) |>
reduce(left_join, by = c("x", "y")) |>
mutate(date = date_selected) |>
relocate(date)

# Save as parquet
write_parquet(anomalies, here::here(ndvi_anomalies_directory, save_filename), compression = "gzip", compression_level = 5)

return(file.path(ndvi_anomalies_directory, save_filename))

}

53 changes: 53 additions & 0 deletions R/calculate_ndvi_historical_means.R
@@ -0,0 +1,53 @@
#' Calculate historical NDVI means and standard deviations for a day of year
#'
#' Pools all NDVI intervals that cover the given day of year and calculates the
#' per-pixel historical mean and standard deviation, saved as a parquet file.
#'
#' @param ndvi_historical_means_directory directory in which to save the output parquet file
#' @param ndvi_date_lookup lookup table mapping each NDVI interval (and the dates it covers) to a transformed NDVI file
#' @param days_of_year day of year to process
#' @param overwrite whether to recalculate if the output file already exists
#' @return Path to the saved parquet file
#' @author Emma Mendelsohn
#' @export
calculate_ndvi_historical_means <- function(ndvi_historical_means_directory,
ndvi_date_lookup, days_of_year,
overwrite = FALSE) {

# Set filename
doy <- days_of_year
doy_frmt <- str_pad(doy, width = 3, side = "left", pad = "0")
save_filename <- glue::glue("historical_ndvi_mean_doy_{doy_frmt}.gz.parquet")
message(paste("Calculating historical NDVI means and standard deviations for doy", doy_frmt))

# Check if file already exists
existing_files <- list.files(ndvi_historical_means_directory)
if(save_filename %in% existing_files && !overwrite) {
message("file already exists, skipping calculation")
return(file.path(ndvi_historical_means_directory, save_filename))
}

# Get relevant NDVI intervals
doy_lookup <- ndvi_date_lookup |>
filter(map_lgl(lookup_day_of_year, ~any(. == doy)))

# Create dataset of relevant files
ndvi_dataset <- open_dataset(doy_lookup$filename)

# Calculate historical means and standard deviations
historical_means <- ndvi_dataset |>
mutate(day_of_year = doy) |>
group_by(x, y, day_of_year) |>
summarize(historical_ndvi_mean = mean(ndvi),
historical_ndvi_sd = sd(ndvi)) |>
ungroup()
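# For the given day of year this pools per-pixel values from every NDVI
# interval (across all years) that covers that day, so the outputs are
# historical per-pixel summaries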

# Save as parquet
write_parquet(historical_means, here::here(ndvi_historical_means_directory, save_filename), compression = "gzip", compression_level = 5)

return(file.path(ndvi_historical_means_directory, save_filename))

}
113 changes: 113 additions & 0 deletions R/create_ndvi_date_lookup.R
@@ -0,0 +1,113 @@
#' Create a lookup table of NDVI intervals and the dates they cover
#'
#' Combines Sentinel and MODIS NDVI intervals into a single table mapping each
#' transformed NDVI file to the dates (and days of year) it covers, using MODIS
#' up to the start of the Sentinel record and Sentinel (2019 onward) after that.
#'
#' @param sentinel_ndvi_transformed paths to the transformed Sentinel NDVI parquet files
#' @param sentinel_ndvi_transformed_directory directory containing the transformed Sentinel NDVI files
#' @param modis_ndvi_transformed paths to the transformed MODIS NDVI parquet files
#' @param modis_ndvi_transformed_directory directory containing the transformed MODIS NDVI files
#' @return Tibble with one row per NDVI interval: satellite, start_date, filename, lookup_dates, lookup_day_of_year
#' @author Emma Mendelsohn
#' @export
create_ndvi_date_lookup <- function(sentinel_ndvi_transformed,
sentinel_ndvi_transformed_directory,
modis_ndvi_transformed,
modis_ndvi_transformed_directory) {

# Connect to Sentinel and Modis datasets
sentinel_dataset <- open_dataset(sentinel_ndvi_transformed_directory)
modis_dataset <- open_dataset(modis_ndvi_transformed_directory)


# Sentinel dates handling -------------------------------------------------

# Get start and end dates from sentinel, as reported by the source
# create a list column of all the dates covered by the interval
sentinel_dates <- sentinel_dataset |>
distinct(start_date, end_date) |>
arrange(start_date) |>
collect() |>
mutate(lookup_dates = map2(start_date, end_date, ~seq(.x, .y, by = "1 day"))) |>
mutate(satellite = "sentinel") |>
mutate(filename = sort(sentinel_ndvi_transformed))

# Visual inspection of the rasters shows that the 2018 Sentinel data is not on the same scale/format as 2019 onward, so filter it out
sentinel_dates <- sentinel_dates |>
filter(year(start_date) > 2018)

# Note that the intervals are mostly 9 or 10 days
# 7-day intervals run 2/21 - 2/28
# 9-day intervals: the end date does not overlap with the next start date
# 10-day intervals: the end date does overlap with the next start date
sentinel_interval_lengths <- map_int(sentinel_dates$lookup_dates, length)

# Note that there is a three-day overlap at one point in 2020
sentinel_overlap_dates <- sentinel_dates |> filter(start_date %in% c("2020-05-04", "2020-05-11"))

# Because of the above, to avoid overlaps, assume the end date is the day before the next start date
sentinel_dates_diffs <- diff(sentinel_dates$start_date)
sentinel_dates_diffs <- c(sentinel_dates_diffs, NA) # NA for the last start date

# Now get end dates in date format and replace existing end dates
sentinel_dates_diffs_as_date <- sentinel_dates$start_date + sentinel_dates_diffs - 1
sentinel_dates <- sentinel_dates |>
mutate(end_date_reported = end_date) |>
mutate(end_date = coalesce(sentinel_dates_diffs_as_date, end_date_reported)) |> # replace last NA with reported date
mutate(lookup_dates = map2(start_date, end_date, ~seq(.x, .y, by = "1 day")))
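# Worked example from the overlap noted above: the interval starting 2020-05-04
# now ends on 2020-05-10 (start_date + 7-day diff - 1), the day before the next
# start date 2020-05-11, so the intervals no longer overlap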

# When did sentinel data start? We'll use this to filter the MODIS data
min_sentinel_start <- min(sentinel_dates$start_date)

# MODIS dates handling -------------------------------------------------

# Get start dates from MODIS
# end dates are not provided by source and will need to be calculated
modis_dates <- modis_dataset |>
distinct(start_date) |>
arrange(start_date) |>
collect()

# Get days in between start dates
# we are assuming that the end date is the day before the next start date
modis_dates_diffs <- diff(modis_dates$start_date)
modis_dates_diffs <- c(modis_dates_diffs, NA) # NA for the last start date

# Note that some intervals are 13 or 14 days:
# there is always a report on December 19th (or 18th in a leap year),
# and the next is January 1st, which makes that interval 13 (or 14) days instead of 16.

# Now get end dates in date format
modis_dates_diffs_as_date <- modis_dates$start_date + modis_dates_diffs - 1
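# Illustrative example (dates assumed, consistent with the note above): a start
# date of 2021-12-19 followed by a start date of 2022-01-01 gives a 13-day diff
# and an end_date of 2021-12-31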

# Add end date and create a list column of all the dates covered by the interval
# filter to end where sentinel starts
modis_dates <- modis_dates |>
mutate(end_date = modis_dates_diffs_as_date) |>
mutate(satellite = "modis") |>
mutate(filename = sort(modis_ndvi_transformed)) |>
drop_na(end_date) |> # we could assume an end date, but not necessary for modis because we're using sentinel past 2018
mutate(lookup_dates = map2(start_date, end_date, ~seq(.x, .y, by = "1 day"))) |>
mutate(lookup_dates = map(lookup_dates, ~na.omit(if_else(. >= min_sentinel_start, NA, .)))) |>
filter(map_lgl(lookup_dates, ~length(.) > 0))


# Combine MODIS and Sentinel --------------------------------------------------

# Create a lookup table so we know which rows to query, without expanding the actual data
ndvi_dates <- bind_rows(modis_dates, sentinel_dates) |>
mutate(lookup_day_of_year = map(lookup_dates, yday)) |>
relocate(satellite) |>
select(-end_date_reported, -end_date)
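# The resulting lookup table has one row per NDVI interval with columns:
# satellite, start_date, filename, lookup_dates (dates covered by the interval),
# and lookup_day_of_year (their days of year)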

# Check there is no overlap in the dates
all_dates <- reduce(ndvi_dates$lookup_dates, c)
assertthat::assert_that(length(all_dates) == n_distinct(all_dates))

# Check that all dates are there
all_dates_check <- seq(from = min(all_dates), to = max(all_dates), by = 1)
assertthat::assert_that(all(all_dates_check %in% all_dates))

return(ndvi_dates)

}
8 changes: 3 additions & 5 deletions R/transform_modis_ndvi.R
@@ -19,10 +19,9 @@ transform_modis_ndvi <- function(modis_ndvi_downloaded_subset,
filename <- basename(modis_ndvi_downloaded_subset)
year_doy <- sub(".*doy(\\d+).*", "\\1", filename)
start_date <- as.Date(year_doy, format = "%Y%j") # confirmed this is start date through manual download tests
end_date <- start_date + 16


# Set filename for saving
save_filename <- glue::glue("transformed_modis_NDVI_{start_date}_to_{end_date}.gz.parquet")
save_filename <- glue::glue("transformed_modis_NDVI_{start_date}.gz.parquet")
message(paste0("Transforming ", save_filename))

# Check if file already exists
@@ -40,8 +39,7 @@
dat_out <- as.data.frame(transformed_raster, xy = TRUE) |>
as_tibble() |>
rename(ndvi = 3) |>
mutate(start_date = start_date,
end_date = end_date)
mutate(start_date = start_date)

# Save as parquet
write_parquet(dat_out, here::here(modis_ndvi_transformed_directory, save_filename), compression = "gzip", compression_level = 5)
2 changes: 2 additions & 0 deletions R/transform_sentinel_ndvi.R
@@ -15,6 +15,8 @@ transform_sentinel_ndvi <- function(sentinel_ndvi_downloaded,
overwrite = FALSE) {

# Extract start and end dates from the raw downloaded file name
# naming conventions
# https://sentinels.copernicus.eu/web/sentinel/user-guides/sentinel-3-synergy/naming-conventions
filename <- basename(sentinel_ndvi_downloaded)
assertthat::are_equal(nchar(filename), 97)
start_date <- as.Date(str_sub(filename, 17, 24), format = "%Y%m%d")
47 changes: 46 additions & 1 deletion _targets.R
@@ -297,7 +297,8 @@ data_targets <- tar_plan(
tar_target(model_dates, set_model_dates(start_year = 2005, end_year = 2022, n_per_month = 2, lag_intervals, seed = 212)),
tar_target(model_dates_selected, model_dates |> filter(select_date) |> pull(date)),

# weather data

# weather anomalies --------------------------------------------------
tar_target(weather_historical_means_directory,
create_data_directory(directory_path = "data/weather_historical_means")),

@@ -334,6 +335,50 @@ data_targets <- tar_plan(
pattern = weather_anomalies,
cue = tar_cue("never")), # only run this if you need to upload new data

# ndvi anomalies --------------------------------------------------
tar_target(ndvi_date_lookup,
create_ndvi_date_lookup(sentinel_ndvi_transformed,
sentinel_ndvi_transformed_directory,
modis_ndvi_transformed,
modis_ndvi_transformed_directory)),


tar_target(ndvi_historical_means_directory,
create_data_directory(directory_path = "data/ndvi_historical_means")),

tar_target(ndvi_historical_means, calculate_ndvi_historical_means(ndvi_historical_means_directory,
ndvi_date_lookup,
days_of_year,
overwrite = FALSE),
pattern = days_of_year,
format = "file",
repository = "local"),

tar_target(ndvi_anomalies_directory,
create_data_directory(directory_path = "data/ndvi_anomalies")),

tar_target(ndvi_anomalies, calculate_ndvi_anomalies(ndvi_date_lookup,
ndvi_historical_means,
ndvi_anomalies_directory,
model_dates,
model_dates_selected,
lag_intervals,
overwrite = FALSE),
pattern = model_dates_selected,
format = "file",
repository = "local"),


# save anomalies to AWS bucket
tar_target(ndvi_anomalies_upload_aws_s3,
aws_s3_upload(path = ndvi_anomalies,
bucket = aws_bucket,
key = ndvi_anomalies,
check = TRUE),
pattern = ndvi_anomalies,
cue = tar_cue("never")), # only run this if you need to upload new data


)

# Model -----------------------------------------------------------