Feature/ndvi anomalies #65

Merged
merged 7 commits on Oct 30, 2023
83 changes: 83 additions & 0 deletions R/calculate_ndvi_anomalies.R
@@ -0,0 +1,83 @@
#' Calculate NDVI anomalies for a selected model date
#'
#' For each lag interval, calculates the per-pixel mean NDVI over the preceding
#' days, compares it to the historical day-of-year mean, and saves raw and
#' scaled anomalies as a parquet file.
#'
#' @param ndvi_date_lookup lookup table mapping each NDVI interval (and the dates it covers) to a transformed NDVI file
#' @param ndvi_historical_means paths to the historical day-of-year NDVI mean/sd parquet files
#' @param ndvi_anomalies_directory directory in which to save the anomalies parquet file
#' @param model_dates tibble of model dates with day_of_year
#' @param model_dates_selected the model date for which to calculate anomalies
#' @param lag_intervals endpoints of the lag windows preceding the selected date
#' @param overwrite whether to recalculate if the output file already exists
#' @return Path to the saved parquet file
#' @author Emma Mendelsohn
#' @export
calculate_ndvi_anomalies <- function(ndvi_date_lookup, ndvi_historical_means,
ndvi_anomalies_directory, model_dates,
model_dates_selected, lag_intervals,
overwrite = FALSE) {

# Set filename
date_selected <- model_dates_selected
save_filename <- glue::glue("ndvi_anomaly_{date_selected}.gz.parquet")
message(paste0("Calculating NDVI anomalies for ", date_selected))

# Check if file already exists
existing_files <- list.files(ndvi_anomalies_directory)
if(save_filename %in% existing_files && !overwrite) {
message("file already exists, skipping calculation")
return(file.path(ndvi_anomalies_directory, save_filename))
}

# Get historical means for DOY
doy <- model_dates |> filter(date == date_selected) |> pull(day_of_year)
doy_frmt <- str_pad(doy, width = 3, side = "left", pad = "0")
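# Illustrative example (assumed value): day_of_year 5 is padded to "005", which
# matches the historical_ndvi_mean_doy_005.gz.parquet file written by
# calculate_ndvi_historical_means()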
historical_means <- read_parquet(ndvi_historical_means[str_detect(ndvi_historical_means, doy_frmt)]) |>
select(-day_of_year)

# Get the lagged anomalies for selected dates, mapping over the lag intervals
row_select <- which(model_dates$date == date_selected)

lag_intervals_start <- c(1, 1 + lag_intervals[-length(lag_intervals)])
lag_intervals_end <- lag_intervals
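# Illustrative example (assumed values): with lag_intervals = c(30, 60, 90),
# lag_intervals_start is c(1, 31, 61) and lag_intervals_end is c(30, 60, 90),
# i.e. three non-overlapping lag windows over the preceding model_dates rows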

anomalies <- map2(lag_intervals_start, lag_intervals_end, function(start, end){

lag_dates <- model_dates |> slice((row_select - start):(row_select - end))

# get files and weights for the calculations
weights <- ndvi_date_lookup |>
mutate(lag_date = map(lookup_dates, ~. %in% lag_dates$date)) |>
mutate(weight = unlist(map(lag_date, sum))) |>
filter(weight > 0) |>
select(start_date, filename, weight)
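# weight is the number of lag-window days that fall within each NDVI interval;
# intervals that do not overlap the lag window are dropped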

ndvi_dataset <- open_dataset(weights$filename)

# Lag: calculate mean by pixel for the preceding x days
lagged_means <- ndvi_dataset |>
left_join(weights |> select(-filename), by = "start_date") |>
group_by(x, y) |>
summarize(lag_ndvi_mean = sum(ndvi * weight) / sum(weight)) |>
ungroup()
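# i.e. a per-pixel average of interval NDVI values, weighted by the number of
# lag-window days each interval covers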

# Join in historical means to calculate raw and scaled anomalies
full_join(lagged_means, historical_means, by = c("x", "y")) |>
mutate(!!paste0("anomaly_ndvi_", end) := lag_ndvi_mean - historical_ndvi_mean,
!!paste0("anomaly_ndvi_scaled_", end) := (lag_ndvi_mean - historical_ndvi_mean)/historical_ndvi_sd) |>
select(-starts_with("lag"), -starts_with("historical"))
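# With the illustrative lag_intervals above, end = 30 would produce columns
# anomaly_ndvi_30 and anomaly_ndvi_scaled_30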
}) |>
reduce(left_join, by = c("x", "y")) |>
mutate(date = date_selected) |>
relocate(date)

# Save as parquet
write_parquet(anomalies, here::here(ndvi_anomalies_directory, save_filename), compression = "gzip", compression_level = 5)

return(file.path(ndvi_anomalies_directory, save_filename))

}

53 changes: 53 additions & 0 deletions R/calculate_ndvi_historical_means.R
@@ -0,0 +1,53 @@
#' Calculate historical NDVI means and standard deviations for a day of year
#'
#' Pools all NDVI intervals that cover the given day of year and calculates the
#' per-pixel historical mean and standard deviation, saved as a parquet file.
#'
#' @param ndvi_historical_means_directory directory in which to save the output parquet file
#' @param ndvi_date_lookup lookup table mapping each NDVI interval (and the dates it covers) to a transformed NDVI file
#' @param days_of_year day of year to process
#' @param overwrite whether to recalculate if the output file already exists
#' @return Path to the saved parquet file
#' @author Emma Mendelsohn
#' @export
calculate_ndvi_historical_means <- function(ndvi_historical_means_directory,
ndvi_date_lookup, days_of_year,
overwrite = FALSE) {

# Set filename
doy <- days_of_year
doy_frmt <- str_pad(doy, width = 3, side = "left", pad = "0")
save_filename <- glue::glue("historical_ndvi_mean_doy_{doy_frmt}.gz.parquet")
message(paste("Calculating historical NDVI means and standard deviations for doy", doy_frmt))

# Check if file already exists
existing_files <- list.files(ndvi_historical_means_directory)
if(save_filename %in% existing_files && !overwrite) {
message("file already exists, skipping calculation")
return(file.path(ndvi_historical_means_directory, save_filename))
}

# Get relevant NDVI intervals
doy_lookup <- ndvi_date_lookup |>
filter(map_lgl(lookup_day_of_year, ~any(. == doy)))

# Create dataset of relevant files
ndvi_dataset <- open_dataset(doy_lookup$filename)

# Calculate historical means and standard deviations
historical_means <- ndvi_dataset |>
mutate(day_of_year = doy) |>
group_by(x, y, day_of_year) |>
summarize(historical_ndvi_mean = mean(ndvi),
historical_ndvi_sd = sd(ndvi)) |>
ungroup()
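# For the given day of year this pools per-pixel values from every NDVI
# interval (across all years) that covers that day, so the outputs are
# historical per-pixel summaries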

# Save as parquet
write_parquet(historical_means, here::here(ndvi_historical_means_directory, save_filename), compression = "gzip", compression_level = 5)

return(file.path(ndvi_historical_means_directory, save_filename))

}
113 changes: 113 additions & 0 deletions R/create_ndvi_date_lookup.R
@@ -0,0 +1,113 @@
#' Create a lookup table of NDVI intervals and the dates they cover
#'
#' Combines Sentinel and MODIS NDVI intervals into a single table mapping each
#' transformed NDVI file to the dates (and days of year) it covers, using MODIS
#' up to the start of the Sentinel record and Sentinel (2019 onward) after that.
#'
#' @param sentinel_ndvi_transformed paths to the transformed Sentinel NDVI parquet files
#' @param sentinel_ndvi_transformed_directory directory containing the transformed Sentinel NDVI files
#' @param modis_ndvi_transformed paths to the transformed MODIS NDVI parquet files
#' @param modis_ndvi_transformed_directory directory containing the transformed MODIS NDVI files
#' @return Tibble with one row per NDVI interval: satellite, start_date, filename, lookup_dates, lookup_day_of_year
#' @author Emma Mendelsohn
#' @export
create_ndvi_date_lookup <- function(sentinel_ndvi_transformed,
sentinel_ndvi_transformed_directory,
modis_ndvi_transformed,
modis_ndvi_transformed_directory) {

# Connect to Sentinel and Modis datasets
sentinel_dataset <- open_dataset(sentinel_ndvi_transformed_directory)
modis_dataset <- open_dataset(modis_ndvi_transformed_directory)


# Sentinel dates handling -------------------------------------------------

# Get start and end dates from sentinel, as reported by the source
# create a list column of all the dates covered by the interval
sentinel_dates <- sentinel_dataset |>
distinct(start_date, end_date) |>
arrange(start_date) |>
collect() |>
mutate(lookup_dates = map2(start_date, end_date, ~seq(.x, .y, by = "1 day"))) |>
mutate(satellite = "sentinel") |>
mutate(filename = sort(sentinel_ndvi_transformed))

# Visual inspection of the rasters shows that the 2018 Sentinel data is not on the same scale/format as 2019 onward, so filter it out
sentinel_dates <- sentinel_dates |>
filter(year(start_date) > 2018)

# Note that the intervals are mostly 9 or 10 days
# 7-day intervals run 2/21 - 2/28
# 9-day intervals: the end date does not overlap with the next start date
# 10-day intervals: the end date does overlap with the next start date
sentinel_interval_lengths <- map_int(sentinel_dates$lookup_dates, length)

# Note that there is a three-day overlap at one point in 2020
sentinel_overlap_dates <- sentinel_dates |> filter(start_date %in% c("2020-05-04", "2020-05-11"))

# Because of the above, to avoid overlaps, assume the end date is the day before the next start date
sentinel_dates_diffs <- diff(sentinel_dates$start_date)
sentinel_dates_diffs <- c(sentinel_dates_diffs, NA) # NA for the last start date

# Now get end dates in date format and replace existing end dates
sentinel_dates_diffs_as_date <- sentinel_dates$start_date + sentinel_dates_diffs - 1
sentinel_dates <- sentinel_dates |>
mutate(end_date_reported = end_date) |>
mutate(end_date = coalesce(sentinel_dates_diffs_as_date, end_date_reported)) |> # replace last NA with reported date
mutate(lookup_dates = map2(start_date, end_date, ~seq(.x, .y, by = "1 day")))
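# Worked example from the overlap noted above: the interval starting 2020-05-04
# now ends on 2020-05-10 (start_date + 7-day diff - 1), the day before the next
# start date 2020-05-11, so the intervals no longer overlap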

# When did sentinel data start? We'll use this to filter the MODIS data
min_sentinel_start <- min(sentinel_dates$start_date)

# MODIS dates handling -------------------------------------------------

# Get start dates from MODIS
# end dates are not provided by source and will need to be calculated
modis_dates <- modis_dataset |>
distinct(start_date) |>
arrange(start_date) |>
collect()

# Get days in between start dates
# we are assuming that the end date is the day before the next start date
modis_dates_diffs <- diff(modis_dates$start_date)
modis_dates_diffs <- c(modis_dates_diffs, NA) # NA for the last start date

# Note that some intervals are 13 or 14 days:
# there is always a report on December 19th (or 18th in a leap year),
# and the next is January 1st, which makes that interval 13 (or 14) days instead of 16.

# Now get end dates in date format
modis_dates_diffs_as_date <- modis_dates$start_date + modis_dates_diffs - 1
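# Illustrative example (dates assumed, consistent with the note above): a start
# date of 2021-12-19 followed by a start date of 2022-01-01 gives a 13-day diff
# and an end_date of 2021-12-31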

# Add end date and create a list column of all the dates covered by the interval
# filter to end where sentinel starts
modis_dates <- modis_dates |>
mutate(end_date = modis_dates_diffs_as_date) |>
mutate(satellite = "modis") |>
mutate(filename = sort(modis_ndvi_transformed)) |>
drop_na(end_date) |> # we could assume an end date, but not necessary for modis because we're using sentinel past 2018
mutate(lookup_dates = map2(start_date, end_date, ~seq(.x, .y, by = "1 day"))) |>
mutate(lookup_dates = map(lookup_dates, ~na.omit(if_else(. >= min_sentinel_start, NA, .)))) |>
filter(map_lgl(lookup_dates, ~length(.) > 0))


# Combine MODIS and Sentinel --------------------------------------------------

# Create a lookup table so we know which rows to query, without expanding the actual data
ndvi_dates <- bind_rows(modis_dates, sentinel_dates) |>
mutate(lookup_day_of_year = map(lookup_dates, yday)) |>
relocate(satellite) |>
select(-end_date_reported, -end_date)
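# The resulting lookup table has one row per NDVI interval with columns:
# satellite, start_date, filename, lookup_dates (dates covered by the interval),
# and lookup_day_of_year (their days of year)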

# Check there is no overlap in the dates
all_dates <- reduce(ndvi_dates$lookup_dates, c)
assertthat::assert_that(length(all_dates) == n_distinct(all_dates))

# Check that all dates are there
all_dates_check <- seq(from = min(all_dates), to = max(all_dates), by = 1)
assertthat::assert_that(all(all_dates_check %in% all_dates))

return(ndvi_dates)

}
8 changes: 3 additions & 5 deletions R/transform_modis_ndvi.R
@@ -19,10 +19,9 @@ transform_modis_ndvi <- function(modis_ndvi_downloaded_subset,
filename <- basename(modis_ndvi_downloaded_subset)
year_doy <- sub(".*doy(\\d+).*", "\\1", filename)
start_date <- as.Date(year_doy, format = "%Y%j") # confirmed this is start date through manual download tests
end_date <- start_date + 16


# Set filename for saving
save_filename <- glue::glue("transformed_modis_NDVI_{start_date}_to_{end_date}.gz.parquet")
save_filename <- glue::glue("transformed_modis_NDVI_{start_date}.gz.parquet")
message(paste0("Transforming ", save_filename))

# Check if file already exists
@@ -40,8 +39,7 @@
dat_out <- as.data.frame(transformed_raster, xy = TRUE) |>
as_tibble() |>
rename(ndvi = 3) |>
mutate(start_date = start_date,
end_date = end_date)
mutate(start_date = start_date)

# Save as parquet
write_parquet(dat_out, here::here(modis_ndvi_transformed_directory, save_filename), compression = "gzip", compression_level = 5)
2 changes: 2 additions & 0 deletions R/transform_sentinel_ndvi.R
@@ -15,6 +15,8 @@ transform_sentinel_ndvi <- function(sentinel_ndvi_downloaded,
overwrite = FALSE) {

# Extract start and end dates from the raw downloaded file name
# naming conventions
# https://sentinels.copernicus.eu/web/sentinel/user-guides/sentinel-3-synergy/naming-conventions
filename <- basename(sentinel_ndvi_downloaded)
assertthat::are_equal(nchar(filename), 97)
start_date <- as.Date(str_sub(filename, 17, 24), format = "%Y%m%d")
47 changes: 46 additions & 1 deletion _targets.R
@@ -297,7 +297,8 @@ data_targets <- tar_plan(
tar_target(model_dates, set_model_dates(start_year = 2005, end_year = 2022, n_per_month = 2, lag_intervals, seed = 212)),
tar_target(model_dates_selected, model_dates |> filter(select_date) |> pull(date)),

# weather data

# weather anomalies --------------------------------------------------
tar_target(weather_historical_means_directory,
create_data_directory(directory_path = "data/weather_historical_means")),

@@ -334,6 +335,50 @@ data_targets <- tar_plan(
pattern = weather_anomalies,
cue = tar_cue("never")), # only run this if you need to upload new data

# ndvi anomalies --------------------------------------------------
tar_target(ndvi_date_lookup,
create_ndvi_date_lookup(sentinel_ndvi_transformed,
sentinel_ndvi_transformed_directory,
modis_ndvi_transformed,
modis_ndvi_transformed_directory)),


tar_target(ndvi_historical_means_directory,
create_data_directory(directory_path = "data/ndvi_historical_means")),

tar_target(ndvi_historical_means, calculate_ndvi_historical_means(ndvi_historical_means_directory,
ndvi_date_lookup,
days_of_year,
overwrite = FALSE),
pattern = days_of_year,
format = "file",
repository = "local"),

tar_target(ndvi_anomalies_directory,
create_data_directory(directory_path = "data/ndvi_anomalies")),

tar_target(ndvi_anomalies, calculate_ndvi_anomalies(ndvi_date_lookup,
ndvi_historical_means,
ndvi_anomalies_directory,
model_dates,
model_dates_selected,
lag_intervals,
overwrite = FALSE),
pattern = model_dates_selected,
format = "file",
repository = "local"),


# save anomalies to AWS bucket
tar_target(ndvi_anomalies_upload_aws_s3,
aws_s3_upload(path = ndvi_anomalies,
bucket = aws_bucket,
key = ndvi_anomalies,
check = TRUE),
pattern = ndvi_anomalies,
cue = tar_cue("never")), # only run this if you need to upload new data


)

# Model -----------------------------------------------------------