diff --git a/DESCRIPTION b/DESCRIPTION index 238ea81..7c3dfe8 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,14 +18,12 @@ Imports: aws.s3, usethis, ncdf4, - distributional, dplyr, tidyr, arrow, rlang, tibble, - read4cast (>= 0.0.0.9000), - score4cast (>= 0.0.0.9000) + read4cast (>= 0.0.0.9000) RoxygenNote: 7.2.3 Language: en-US Suggests: @@ -51,5 +49,4 @@ Additional_repositories: https://cboettig.github.io/drat Config/testthat/edition: 3 Remotes: github::eco4cast/read4cast, - github::eco4cast/score4cast, github::cboettig/aws.s3 diff --git a/NAMESPACE b/NAMESPACE index 5c47bfe..9ee553e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,30 +1,12 @@ # Generated by roxygen2: do not edit by hand -export(check_submission) -export(combined_scores) -export(crps_logs_score) -export(download_forecast) -export(download_scores) export(efi_format) export(efi_format_ensemble) export(forecast_output_validator) -export(get_target) -export(include_horizon) export(noaa_stage1) export(noaa_stage2) export(noaa_stage3) -export(pivot_forecast) -export(pivot_target) -export(read_forecast) -export(score) export(submit) -export(theme_statistics) importFrom(dplyr,`%>%`) -importFrom(read4cast,read_forecast) importFrom(rlang,.data) importFrom(rlang,`:=`) -importFrom(score4cast,crps_logs_score) -importFrom(score4cast,include_horizon) -importFrom(score4cast,pivot_forecast) -importFrom(score4cast,pivot_target) -importFrom(score4cast,score) diff --git a/R/combine_scores.R b/R/combine_scores.R deleted file mode 100644 index 1e2effb..0000000 --- a/R/combine_scores.R +++ /dev/null @@ -1,70 +0,0 @@ -#' Combining scores from a theme together -#' -#' @param theme theme name -#' @param collect TRUE/FALSE to download results -#' @return a data.frame of scores -#' @export -combined_scores <- function(theme, collect = TRUE){ - - vars <- arrow_env_vars() - - #GENERALIZATION: THIS IS A SPECIFIC ENDPOINT - s3 <- arrow::s3_bucket(bucket = paste0("neon4cast-scores/parquet/", theme), - endpoint_override = "data.ecoforecast.org", - anonymous = TRUE) - ds <- arrow::open_dataset(s3, schema=score4cast::score_schema()) - if (collect) { - ds <- dplyr::collect(ds) - } - on.exit(unset_arrow_vars(vars)) - ds -} - - -#' Calculating forecast challenge submission statistics -#' -#' @param themes theme names -#' @return a data.frame of challenge statistics -#' @export - -theme_statistics <- function(themes){ - -theme_stats <- purrr::map_dfr(themes, function(theme){ - - message(theme) - - theme_scores <- combined_scores(theme = theme, collect = FALSE) - - teams <- theme_scores |> - dplyr::summarise(n = dplyr::n_distinct(model_id)) |> - dplyr::collect() |> - dplyr::pull(n) - - forecasts <- theme_scores |> - dplyr::select(model_id,reference_datetime, variable) |> - dplyr::distinct() |> - dplyr::summarise(n = dplyr::n(), .groups = "drop") |> - dplyr::summarise(total = sum(n)) |> - dplyr::collect() |> - dplyr::pull(total) - - forecast_obs <- theme_scores |> - dplyr::filter(!is.na(crps)) |> - dplyr::summarise(n = n(), .groups = "drop") |> - dplyr::summarise(total = sum(n)) |> - dplyr::collect() |> - dplyr::pull(total) - - output <- tibble::tibble(theme = theme, - n_teams = teams, - n_submissions = forecasts, - n_obs_forecasts_pairs = forecast_obs) - - return(output) - }) - -return(theme_stats) - -} -globalVariables(c("theme", "n", "crps", "total"), "neon4cast") - diff --git a/R/download_forecast.R b/R/download_forecast.R deleted file mode 100644 index 50ea283..0000000 --- a/R/download_forecast.R +++ /dev/null @@ -1,52 +0,0 @@ -# DEPRECATE, mark for removal - - - -## grab data -#project = "phenology" -#dir = "~/efi_neon_challenge/forecasts/" - -#' Download forecasts for NEON sites from the EFI server -#' -#' @param theme string of the theme -#' @param date start date for the forecast -#' @param dir storage location. Use tempdir unless you want to keep this -#' data around on your computer, in which case, `neonstore::neon_dir()` might -#' be a convenient choice. -#' @param s3_region data -#' @param s3_endpoint ecoforecast.org -#' @export -#' @examples -#' download_forecast("phenology") -download_forecast <- function(theme, - date = Sys.Date()-2, - dir = tempdir(), - s3_region = "data", - s3_endpoint = "ecoforecast.org"){ - lapply(theme, download_forecast_, date, dir) - invisible(dir) -} -download_forecast_ <- function(theme, - date = Sys.Date()-2, - dir = tempdir(), - s3_region = "data", - s3_endpoint = "ecoforecast.org"){ - - dir.create(dir, FALSE, TRUE) - parent_theme <- unlist(stringr::str_split(theme, "_"))[1] - prefix <- paste(parent_theme, paste0(theme,"-", date), sep="/") - #GENERALIZATION: Specific AWS info - object <- aws.s3::get_bucket("neon4cast-forecasts", - prefix = prefix, - region = s3_region, - base_url = s3_endpoint) - - #GENERALIZATION: Specific AWS info - for(i in seq_along(object)){ - aws.s3::save_object(object[[i]], - bucket = "neon4cast-forecasts", - file = file.path(dir, object[[i]]$Key), - region = s3_region, - base_url = s3_endpoint) - } -} diff --git a/R/download_scores.R b/R/download_scores.R deleted file mode 100644 index 18610a2..0000000 --- a/R/download_scores.R +++ /dev/null @@ -1,43 +0,0 @@ -# DEPRECATE, mark for removal - - -## grab data -#project = "phenology" -#dir = "~/efi_neon_challenge/forecasts/" - -#' Download score for NEON sites from the EFI server -#' -#' @param theme string of the theme -#' @param date start date for the forecast -#' @param dir storage location. Use tempdir unless you want to keep this -#' data around on your computer, in which case, `neonstore::neon_dir()` might -#' be a convenient choice. -#' @export -#' @examples -#' download_scores("phenology") -download_scores <- function(theme, - date = Sys.Date()-2, - dir = tempdir()){ - lapply(theme, download_forecast_, date, dir) - invisible(dir) -} -download_scores_ <- function(theme, - date = Sys.Date()-2, - dir = tempdir()){ - - dir.create(dir, FALSE, TRUE) - parent_theme <- unlist(stringr::str_split(theme, "_"))[1] - prefix <- paste(parent_theme, paste0("scores-",theme,"-", date), sep="/") - object <- aws.s3::get_bucket("neon4cast-scores", - prefix = prefix, - region = "data", - base_url = "ecoforecast.org") - - for(i in seq_along(object)){ - aws.s3::save_object(object[[i]], - bucket = "neon4cast-scores", - file = file.path(dir, object[[i]]$Key), - region = "data", - base_url = "ecoforecast.org") - } -} diff --git a/R/download_target.R b/R/download_target.R deleted file mode 100644 index 047e7a2..0000000 --- a/R/download_target.R +++ /dev/null @@ -1,31 +0,0 @@ -# DEPRECATE, mark for removal - -#GENERALIZATION: SPECIFC THEMES, TARGET FILES, AND TARGET FILE URL - - - -download_target <- function(theme = c("aquatics", "beetles", - "phenology", "terrestrial_30min", - "terrestrial_daily","ticks"), - download_url = NA){ - - if(is.na(download_url)){ - theme <- match.arg(theme) - - target_file <- switch(theme, - aquatics = "aquatics-targets.csv.gz", - beetles = "beetles-targets.csv.gz", - phenology = "phenology-targets.csv.gz", - terrestrial_daily = "terrestrial_daily-targets.csv.gz", - terrestrial_30min = "terrestrial_30min-targets.csv.gz", - ticks = "ticks-targets.csv.gz" - ) - download_url <- paste0("https://data.ecoforecast.org/neon4cast-targets/", - theme, "/", target_file) - } - - readr::read_csv(download_url, show_col_types = FALSE, - lazy = FALSE, progress = FALSE) - -} - diff --git a/R/forecast_output_validator.R b/R/forecast_output_validator.R index 81ff118..b90165c 100644 --- a/R/forecast_output_validator.R +++ b/R/forecast_output_validator.R @@ -1,279 +1,106 @@ -#' forecast_output_validator +#' Validate forecast file #' -#' @param forecast_file Your forecast csv or nc file -#' @param target_variables Possible target variables -#' @param theme_names valid EFI theme names +#' @param forecast_file forecast csv or csv.gz file #' @export -#' -#' @examples -#' -#' forecast_file <- system.file("extdata/aquatics-2021-02-01-EFInull.csv.gz", -#' package = "neon4cast") -#' forecast_output_validator(forecast_file) -#' -forecast_output_validator <- function(forecast_file, - target_variables = c("oxygen", - "temperature", - "richness", - "abundance", - "nee", - "le", - "vswc", - "gcc_90", - "rcc_90", - "ixodes_scapularis", - "amblyomma_americanum", - "prediction", - "observed"), - #GENERALIZATION: Specific themes - theme_names = c("aquatics", "beetles", - "phenology", "terrestrial_30min", - "terrestrial_daily","ticks")){ + +forecast_output_validator <- function(forecast_file){ + + file_in <- forecast_file valid <- TRUE message(file_in) - #usethis::ui_todo("Checking validity of file name...") - file_basename <- basename(file_in) - parsed_basename <- unlist(stringr::str_split(file_basename, "-")) - file_name_parsable <- TRUE - - if(!(parsed_basename[1] %in% theme_names)){ - usethis::ui_warn(paste0("first position of file name (before first -) is not one of the following : ", - paste(theme_names, collapse = " "))) - valid <- FALSE - file_name_parsable <- FALSE - } - - date_string <- lubridate::as_date(paste(parsed_basename[2:4], collapse = "-")) - - if(is.na(date_string)){ - usethis::ui_warn("file name does not contain parsable date") - file_name_parsable <- FALSE - valid <- FALSE - } - - if(file_name_parsable){ - usethis::ui_done("file name is correct") - } - - if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){ + if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){ # if file is csv zip file out <- readr::read_csv(file_in, guess_max = 1e6, show_col_types = FALSE) + if(lexists(out, c("model_id"))){ + usethis::ui_done("file has model_id column") + }else{ + usethis::ui_warn("file missing model_id column ") + } + + if("variable" %in% names(out) & "prediction" %in% names(out)){ usethis::ui_done("forecasted variables found correct variable + prediction column") - }else if("variable" %in% names(out) & "predicted" %in% names(out)){ - usethis::ui_warn("file as predicted column. change column name to prediction") - valid <- FALSE }else{ usethis::ui_warn("missing the variable and prediction columns") valid <- FALSE } - #usethis::ui_todo("Checking that file contains either ensemble or statistic column...") - if(lexists(out, "ensemble")){ usethis::ui_warn("ensemble dimension should be named parameter") valid <- FALSE }else if(lexists(out, "family")){ - if("normal" %in% unique(out$family)){ - usethis::ui_done("file has normal distribution in family column") - }else if("ensemble" %in% unique(out$family)){ - usethis::ui_done("file has ensemble distribution in family column") - }else{ - usethis::ui_warn("only normal or ensemble distributions in family columns are currently supported") - valid <- FALSE - } - if(lexists(out, "parameter")){ - if("mu" %in% unique(out$parameter) & "sigma" %in% unique(out$parameter)){ - usethis::ui_done("file has parameter and family column with normal distribution") - }else if("ensemble" %in% unique(out$family) | "sample" %in% unique(out$family) ){ - usethis::ui_done("file has parameter and family column with ensemble generated distribution") - }else{ - usethis::ui_warn("file does not have parameter column is not a normal or ensemble distribution") - valid <- FALSE - } + usethis::ui_done("file has correct family and parameter columns") }else{ - usethis::ui_warn("file does not have parameter and family column ") + usethis::ui_warn("file does not have parameter column ") valid <- FALSE } }else{ - usethis::ui_warn("file does not have ensemble or family + parameter column") + usethis::ui_warn("file does not have ensemble or family and/or parameter column") valid <- FALSE } #usethis::ui_todo("Checking that file contains siteID column...") if(lexists(out, c("site_id"))){ usethis::ui_done("file has site_id column") - }else if(lexists(out, c("siteID"))){ - usethis::ui_warn("file siteID column should be named site_id") }else{ usethis::ui_warn("file missing site_id column") } - #usethis::ui_todo("Checking that file contains parsable time column...") if(lexists(out, c("datetime"))){ - usethis::ui_done("file has time column") - if(!stringr::str_detect(out$datetime[1], "-")){ - usethis::ui_done("time column format is not in the correct YYYY-MM-DD format") + usethis::ui_done("file has datetime column") + if(!grepl("-", out$datetime[1])){ + usethis::ui_done("datetime column format is not in the correct YYYY-MM-DD format") valid <- FALSE }else{ if(sum(class(out$datetime) %in% c("Date","POSIXct")) > 0){ - usethis::ui_done("file has correct time column") + usethis::ui_done("file has correct datetime column") }else{ - usethis::ui_done("time column format is not in the correct YYYY-MM-DD format") + usethis::ui_done("datetime column format is not in the correct YYYY-MM-DD format") valid <- FALSE } } - }else if(lexists(out, c("time"))){ - usethis::ui_warn("time dimension should be named datetime. We are converting it during processing but please update your submission format") - valid <- TRUE }else{ - usethis::ui_warn("file missing time column") + usethis::ui_warn("file missing datetime column") valid <- FALSE } - if(lexists(out, c("reference_datetime"))){ - usethis::ui_done("file has reference_datetime column") - }else if(lexists(out, c("start_time"))){ - usethis::ui_warn("file start_time column should be named reference_datetime. We are converting it during processing but please update your submission format") - }else{ - usethis::ui_warn("file missing reference_datetime column") - } - - } else if(grepl("[.]nc", file_in)){ #if file is nc - - nc <- ncdf4::nc_open(file_in) - - #usethis::ui_todo("Checking that file contains correct variables...") - if(lexists(nc$var, target_variables) > 0){ - usethis::ui_done("target variables found") - var_dim <- dim(ncdf4::ncvar_get(nc, varid = names(nc$var[which(names(nc$var) %in% target_variables)][1]))) + if(lexists(out, c("duration"))){ + usethis::ui_done("file has duration column") }else{ - usethis::ui_warn(paste0("no target variables in found in possible list: ", paste(target_variables, collapse = " "))) - valid <- FALSE + usethis::ui_warn("file missing duration column (values for the column: daily = P1D, hourly = PT1H)") } - #usethis::ui_todo("Checking that time variable exist and is parseable...") - - if(lexists(nc$dim, c("time", "datetime"))){ - usethis::ui_done("file has time dimension") - if("time" %in% names(nc$dim)){ - usethis::ui_warn("time dimension should be named datetime we are converting it during processing but please update your submission format") - time <- ncdf4::ncvar_get(nc, "time") - time_dim <- length(time) - valid <- TRUE - }else{ - time <- ncdf4::ncvar_get(nc, "datetime") - tustr<-strsplit(ncdf4::ncatt_get(nc, varid = "datetime", "units")$value, " ") - t_string <- strsplit(ncdf4::ncatt_get(nc, varid = "datetime", "units")$value, " ")[[1]][1] - time_dim <- length(time) - time <-lubridate::as_date(time,origin=unlist(tustr)[3]) - if(t_string %in% c("days","seconds")){ - usethis::ui_done("file has correct time dimension") - }else{ - usethis::ui_warn("time dimension is in correct format") - valid <- FALSE - } - } - }else{ - usethis::ui_warn("file missing time dimension") - valid <- FALSE - } - - #usethis::ui_todo("Checking that siteID variable exists...") - #GENERALIZATION: using siteID here - should be site_id - if(lexists(nc$var, c("siteID", "site_id"))){ - usethis::ui_done("file has siteID variable") - }else{ - usethis::ui_warn("file missing siteID variable") - valid <- FALSE - } - - #usethis::ui_todo("Checking that netcdf contains site dimension...") - - if(lexists(nc$dim, c("site")) > 0){ - usethis::ui_done("file has site dimension") - site_dim <- length(ncdf4::ncvar_get(nc, "site")) - + if(lexists(out, c("project_id"))){ + usethis::ui_done("file has project_id column") }else{ - usethis::ui_warn("file missing site dimension") - valid <- FALSE + usethis::ui_warn("file missing project_id column (use `vera4cast` as the project_id") } - #usethis::ui_todo("Checking that netcdf contains ensemble dimension...") - - if(lexists(nc$dim, "ensemble")){ - usethis::ui_warn("ensemble dimension should be named parameter") - ensemble_dim <- length(ncdf4::ncvar_get(nc, "ensemble")) - valid <- FALSE - }else if(lexists(nc$dim, "parameter")){ - usethis::ui_done("file has parameter dimension") - ensemble_dim <- length(ncdf4::ncvar_get(nc, "parameter")) + if(lexists(out, c("reference_datetime"))){ + usethis::ui_done("file has reference_datetime column") + }else if(lexists(out, c("start_time"))){ + usethis::ui_warn("file start_time column should be named reference_datetime. We are converting it during processing but please update your submission format") }else{ - usethis::ui_warn("file missing parameter dimension") - valid <- FALSE - } - - #usethis::ui_todo("Checking that netcdf dimensions are correct order...") - dim_order <- TRUE - - if(var_dim[1] != time_dim){ - usethis::ui_warn("time is not the first dimension") - valid <- FALSE - dim_order <- FALSE - } - - if(var_dim[2] != site_dim){ - usethis::ui_warn("site is not the second dimension") - valid <- FALSE - dim_order <- FALSE - } - - if(var_dim[3] != ensemble_dim){ - usethis::ui_warn("ensemble is not the third dimension") + usethis::ui_warn("file missing reference_datetime column") valid <- FALSE - dim_order <- FALSE - } - - if(dim_order){ - usethis::ui_done("dimensions are correct order") } - ncdf4::nc_close(nc) - - }else if(grepl("[.]xml", file_in)){ #if file is eml - - #usethis::ui_todo("Checking validity of metdata...") - - #out <- EML::read_eml(file_in) - - #valid_metadata <- tryCatch(EFIstandards::forecast_validator(out),error = function(e){ - # message(e) - # return(FALSE) - #}, - #finally = NULL) - - #if(!valid_metadata){ - # usethis::ui_warn("metadata is not correct") - # valid <- FALSE - #}else{ - # usethis::ui_done("metadata is correct") - #} - valid <- TRUE }else{ + usethis::ui_warn("incorrect file extension (csv or csv.gz are accepted)") valid <- FALSE } if(!valid){ - message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-docs/Submission-Instructions.html") + message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-ci/instructions.html#forecast-file-format") }else{ message("Forecast format is valid") } @@ -283,5 +110,4 @@ forecast_output_validator <- function(forecast_file, lexists <- function(list,name){ any(!is.na(match(name, names(list)))) -} - +} \ No newline at end of file diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index ad1b4c5..b5c2b42 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -61,12 +61,21 @@ noaa_stage1 <- function(cycle = 0, endpoint = "data.ecoforecast.org", verbose = TRUE, start_date = "") { - noaa_gefs_stage(file.path("stage1",cycle, start_date), - partitioning = "start_date", - version = version, - endpoint = endpoint, - verbose = verbose, - start_date = start_date) + + vars <- arrow_env_vars() + + bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) + + endpoint_override <- "https://sdsc.osn.xsede.org" + s3 <- arrow::s3_bucket(paste0(bucket), + endpoint_override = endpoint_override, + anonymous = TRUE) + + site_df <- arrow::open_dataset(s3) + + unset_arrow_vars(vars) + + return(site_df) } #' NOAA GEFS forecasts with EFI stage 2 processing @@ -79,18 +88,27 @@ noaa_stage1 <- function(cycle = 0, #' @export noaa_stage2 <- function(cycle = 0, version = "v12", - endpoint = "data.ecoforecast.org", + endpoint = NA, verbose = TRUE, start_date = "") { - noaa_gefs_stage(file.path("stage2/parquet",cycle, start_date), - partitioning = "start_date", - version = version, - endpoint = endpoint, - verbose = verbose, - start_date = start_date) - -} + vars <- arrow_env_vars() + + bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2/reference_datetime=",start_date) + + endpoint_override <- "https://sdsc.osn.xsede.org" + s3 <- arrow::s3_bucket(paste0(bucket), + endpoint_override = endpoint_override, + anonymous = TRUE) + + site_df <- arrow::open_dataset(s3) |> + dplyr::mutate(reference_datetime = lubridate::as_datetime(start_date)) + + unset_arrow_vars(vars) + + return(site_df) + +} #' NOAA GEFS forecasts with EFI stage 3 processing #' @@ -108,45 +126,25 @@ noaa_stage2 <- function(cycle = 0, noaa_stage3 <- function(version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE) { - noaa_gefs_stage("stage3/parquet", - partitioning = "site_id", - version = version, - endpoint = endpoint, - verbose = verbose, - start_date = NA) -} -noaa_gefs_stage <- function(stage = "stage1", - partitioning = c("cycle","start_date"), - cycle = 0, - version = "v12", - endpoint = "data.ecoforecast.org", - verbose = getOption("verbose", TRUE), - start_date = start_date) { - if(verbose) - message(paste("establishing connection to", stage, "at", endpoint, "...")) - s3 <- noaa_gefs(version, endpoint) - if (!is.na(as.Date(start_date))) { - ds <- arrow::open_dataset(s3$path(stage)) |> dplyr::filter(parameter <= 31) - } else { - ds <- arrow::open_dataset(s3$path(stage), partitioning = partitioning) |> dplyr::filter(parameter <= 31) - } - if(verbose) - message(paste0("connected! Use dplyr functions to filter and summarise.\n", - "Then, use collect() to read result into R\n")) - ds -} +vars <- arrow_env_vars() -noaa_gefs <- function(version = "v12", - endpoint = "data.ecoforecast.org") { + bucket <- "bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3" - vars <- arrow_env_vars() - gefs <- arrow::s3_bucket(paste0("neon4cast-drivers/noaa/gefs-", version), - endpoint_override = endpoint, - anonymous = TRUE) - on.exit(unset_arrow_vars(vars)) - gefs + endpoint_override <- "https://sdsc.osn.xsede.org" + s3 <- arrow::s3_bucket(bucket, + endpoint_override = endpoint_override, + anonymous = TRUE) + + site_df <- arrow::open_dataset(s3) |> + dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> + dplyr::rename(parameter = ensemble) + + unset_arrow_vars(vars) + + return(site_df) + } arrow_env_vars <- function(){ diff --git a/R/re_exports.R b/R/re_exports.R deleted file mode 100644 index 02861ba..0000000 --- a/R/re_exports.R +++ /dev/null @@ -1,28 +0,0 @@ -#' @importFrom read4cast read_forecast -#' @export -read4cast::read_forecast - -#' @importFrom score4cast score -#' @export -score4cast::score - -#' @importFrom score4cast crps_logs_score -#' @export -score4cast::crps_logs_score - -#' @importFrom score4cast include_horizon -#' @export -score4cast::include_horizon - -#' @importFrom score4cast pivot_forecast -#' @export -score4cast::pivot_forecast - -#' @importFrom score4cast pivot_target -#' @export -score4cast::pivot_target - -# @importFrom dplyr `%>%` - -#globalVariables("forecast_start_time", "horizon", "observed", -# "predicted","statistic", "target_url", "theme") \ No newline at end of file diff --git a/R/s3_helpers.R b/R/s3_helpers.R index 297e3f1..6f0c56f 100644 --- a/R/s3_helpers.R +++ b/R/s3_helpers.R @@ -1,42 +1,20 @@ -# DEPRECATE and mark for removal +get_target <- function(variable, duration, project_id = "neon4cast", lazy = FALSE){ -#' Download target data from s3 -#' -#' @param dir full path to working directory -#' @param theme forecast theme -#' @param s3_region s3 region -#' @export -#' -get_target <- function(dir, theme, s3_region = Sys.getenv("AWS_DEFAULT_REGION")){ - download_s3_objects(dir, - bucket = "neon4cast-targets", - prefix = theme) -} + s3_targets <- arrow::s3_bucket("bio230014-bucket01/challenges/targets", endpoint_override = "sdsc.osn.xsede.org") + + target <- arrow::open_csv_dataset(s3_targets, + schema = arrow::schema( + project_id = arrow::string(), + site_id = arrow::string(), + datetime = arrow::timestamp(unit = "ns", timezone = "UTC"), + duration = arrow::string(), + variable = arrow::string(), + observation = arrow::float()), + skip = 1) |> + dplyr::filter(variable %in% variable, + duration == duration, + project_id == project_id) -#### + if(!lazy) target <- target |> dplyr::collect() -download_s3_objects <- function(dir, bucket, prefix, s3_region = Sys.getenv("AWS_DEFAULT_REGION")){ - - files <- aws.s3::get_bucket(bucket = bucket, - prefix = prefix, - region = s3_region, - use_https = as.logical(Sys.getenv("USE_HTTPS")), - max = Inf) - keys <- vapply(files, `[[`, "", "Key", USE.NAMES = FALSE) - empty <- grepl("/$", keys) - keys <- keys[!empty] - files_present <- TRUE - if(length(keys) > 0){ - for(i in 1:length(keys)){ - aws.s3::save_object(object = keys[i], - bucket = bucket, - file = file.path(dir, bucket, keys[i]), - region = s3_region, - use_https = as.logical(Sys.getenv("USE_HTTPS"))) - } - }else{ - message("Requested files are not available on the s3 bucket") - files_present <- FALSE - } - invisible(files_present) } diff --git a/R/submit.R b/R/submit.R index f1a9eaa..faed250 100644 --- a/R/submit.R +++ b/R/submit.R @@ -1,18 +1,18 @@ ## Technically this could become arrow-based #' submit forecast to EFI -#' +#' #' @inheritParams forecast_output_validator #' @param metadata path to metadata file #' @param ask should we prompt for a go before submission? #' @param s3_region subdomain (leave as is for EFI challenge) #' @param s3_endpoint root domain (leave as is for EFI challenge) #' @export -submit <- function(forecast_file, - metadata = NULL, - ask = interactive(), - s3_region = "data", - s3_endpoint = "ecoforecast.org" +submit <- function(forecast_file, + metadata = NULL, + ask = interactive(), + s3_region = "submit", + s3_endpoint = "ecoforecast.org" ){ if(file.exists("~/.aws")){ warning(paste("Detected existing AWS credentials file in ~/.aws,", @@ -20,6 +20,7 @@ submit <- function(forecast_file, } message("validating that file matches required standard") go <- forecast_output_validator(forecast_file) + if(!go){ warning(paste0("forecasts was not in a valid format and was not submitted\n", @@ -28,87 +29,80 @@ submit <- function(forecast_file, return(NULL) } - if(go & ask){ - go <- utils::askYesNo("Forecast file is valid, ready to submit?") - } - - #GENERALIZATION: Here are specific AWS INFO - exists <- aws.s3::put_object(file = forecast_file, - object = basename(forecast_file), - bucket = "neon4cast-submissions", - region= s3_region, - base_url = s3_endpoint) - - if(exists){ - message("Successfully submitted forecast to server") - }else{ - warning("Forecasts was not sucessfully submitted to server") - } - - - - - if(!is.null(metadata)){ - if(tools::file_ext(metadata) == "xml"){ - EFIstandards::forecast_validator(metadata) - aws.s3::put_object(file = metadata, - object = basename(metadata), - bucket = "neon4cast-submissions", - region= s3_region, - base_url = s3_endpoint) + check_model_id <- FALSE + if(check_model_id){ + googlesheets4::gs4_deauth() + message("Checking if model_id is registered") + registered_model_id <- suppressMessages(googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing", range = "Sheet1!A:V")) + + registered_project_id <- registered_model_id$`What forecasting challenge are you registering for?` + registered_model_id <- registered_model_id$model_id + + registered_model_project_id <- paste(registered_project_id, registered_model_id, sep = "-") + + df <- read4cast::read_forecast(forecast_file) + model_id <- df$model_id[1] + model_project_id <- paste("neon4cast", model_id, sep = "-") + + if(grepl("(example)", model_id)){ + message(paste0("You are submitting a forecast with 'example' in the model_id. As an example forecast, it will be processed but not used in future analyses.\n", + "No registration is required to submit an example forecast.\n", + "If you want your forecast to be retained, please select a different model_id that does not contain `example` and register you model id at https://forms.gle/kg2Vkpho9BoMXSy57\n")) + } + + if(!(model_project_id %in% registered_model_project_id) & !grepl("(example)",model_id)){ + + message("Checking if model_id for neon4cast is already used in submissions") + + submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) + submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-") + + + if(model_project_id %in% submitted_project_model_id){ + + stop(paste0("Your model_id (",model_id,") has not been registered yet but is already used in other submissions. Please use and register another model_id\n", + " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", + "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) + + }else{ + + stop(paste0("Your model_id (",model_id,") has not been registered\n", + " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", + "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) + + } + } + + if(!grepl("(example)",model_id)){ + if(first_submission & model_project_id %in% registered_model_project_id){ + submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) + submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-") + + if(model_project_id %in% submitted_project_model_id){ + stop(paste0("Your model_id (",model_id,") is already used in other submitted forecasts. There are two causes for this error: \n + - If you have previously submitted a forecast, set the argument `first_submission = FALSE` to remove this error\n + - If you have not previously submitted a forecast, this error message means that the model_id has already been registered and used for submissions. Please register and use another model_id at [https://forms.gle/kg2Vkpho9BoMXSy57](https://forms.gle/kg2Vkpho9BoMXSy57)")) + } + } }else{ - warning(paste("Metadata file is not an .xml file", - "Did you incorrectly submit the model description yml file instead of an xml file")) + message("Since `example` is in your model_id, you are submitting an example forecast that will be processed but not used in future analyses.") } } -} - -#' Check that submission was successfully processed -#' -#' @param forecast_file Your forecast csv or nc file -#' @param s3_region subdomain (leave as is for EFI challenge) -#' @param s3_endpoint root domain (leave as is for EFI challenge) -#' @export -check_submission <- function(forecast_file, - s3_region = "data", - s3_endpoint = "ecoforecast.org"){ - theme <- stringr::str_split_fixed(basename(forecast_file), "-", n = 2) - - #All forecats are converted into a common file format when they are processed. This generates that name. - #if (grepl("[.]nc$", forecast_file)) { - # base_name <- paste0(tools::file_path_sans_ext(basename(forecast_file)), - # ".csv.gz") - #}else if (grepl("[.]csv$", forecast_file)) { - # base_name <- paste0(tools::file_path_sans_ext(basename(forecast_file)), - # ".csv.gz") - #}else if (grepl("[.]csv\\.gz$", forecast_file)) { - # base_name <- basename(forecast_file) - #}else { - # message("File is not a .nc, .cvs, or .csv.gz file") - # base_name <- forecast_file - #} - - base_name <- forecast_file - - exists <- suppressMessages(aws.s3::object_exists(object = file.path("raw", theme[,1], base_name), - bucket = "neon4cast-forecasts", - region= s3_region, - base_url = s3_endpoint)) + if(go & ask){ + go <- utils::askYesNo("Forecast file is valid, ready to submit?") + } + #GENERALIZATION: Here are specific AWS INFO + exists <- aws.s3::put_object(file = forecast_file, + object = basename(forecast_file), + bucket = "submissions", + region= s3_region, + base_url = s3_endpoint) if(exists){ - message("Submission was successfully processed") + message("Thank you for submitting!") }else{ - not_in_standard <- suppressMessages(aws.s3::object_exists(object = file.path("not_in_standard", basename(forecast_file)), - bucket = "neon4cast-forecasts", - region= s3_region, - base_url = s3_endpoint)) - if(not_in_standard){ - message("Submission is not in required format. Try running neon4cast::forecast_output_validator on your file to see what the issue may be") - }else{ - message("Your forecast is still in queue to be processed by the server. Please check again in a few hours") - } + warning("Forecasts was not sucessfully submitted to server. Try again, then contact the Challenge organizers.") } - invisible(exists) } diff --git a/R/to_hourly.R b/R/to_hourly.R new file mode 100644 index 0000000..f5f0b5c --- /dev/null +++ b/R/to_hourly.R @@ -0,0 +1,134 @@ +to_hourly <- function(df, + use_solar_geom = TRUE, + psuedo = FALSE){ + + if(!psuedo){ + reference_datetime <- lubridate::as_datetime(df$reference_datetime)[1] + }else{ + reference_datetime <- NA + } + +var_order <- names(df) + +ensemble_maxtime <- df |> + dplyr::group_by(site_id, family, ensemble) |> + dplyr::summarise(max_time = max(datetime), .groups = "drop") + +ensembles <- unique(df$ensemble) +datetime <- seq(min(df$datetime), max(df$datetime), by = "1 hour") +variables <- unique(df$variable) +sites <- unique(df$site_id) + +full_time <- expand.grid(sites, ensembles, datetime, variables) |> + dplyr::rename(site_id = Var1, + ensemble = Var2, + datetime = Var3, + variable = Var4) |> + dplyr::mutate(datetime = lubridate::as_datetime(datetime)) |> + dplyr::arrange(site_id, ensemble, variable, datetime) |> + dplyr::left_join(ensemble_maxtime, by = c("site_id","ensemble")) |> + dplyr::filter(datetime <= max_time) |> + dplyr::select(-c("max_time")) |> + dplyr::distinct() + +states <- df |> + dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> + dplyr::filter(!psuedo | (psuedo & horizon != "006") | (psuedo & datetime == max(df$datetime))) |> + dplyr::select(-horizon) |> + dplyr::group_by(site_id, family, ensemble, variable) |> + dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> + dplyr::filter(variable %in% c("PRES", "RH", "TMP", "UGRD", "VGRD")) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + dplyr::mutate(prediction = imputeTS::na_interpolation(prediction, option = "linear")) |> + dplyr::mutate(prediction = ifelse(variable == "TMP", prediction + 273, prediction)) |> + dplyr::mutate(prediction = ifelse(variable == "RH", prediction/100, prediction)) |> + dplyr::ungroup() + +fluxes <- df |> + dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> + dplyr::filter(horizon != "003") |> + dplyr::select(-horizon) |> + dplyr::group_by(site_id, family, ensemble, variable) |> + dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> + dplyr::filter(variable %in% c("APCP","DSWRF","DLWRF")) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + tidyr::fill(prediction, .direction = "up") |> + dplyr::mutate(prediction = ifelse(variable == "APCP", prediction / (6 * 60 * 60), prediction), + variable = ifelse(variable == "APCP", "PRATE", variable)) |> + dplyr::ungroup() + +if(use_solar_geom){ + + site_list <- readr::read_csv(paste0("https://github.com/eco4cast/", + "neon4cast-noaa-download/", + "raw/master/noaa_download_site_list.csv"), + show_col_types = FALSE) |> + dplyr::select(-site_name) + + fluxes <- fluxes |> + dplyr::left_join(site_list, by = "site_id") |> + dplyr::mutate(hour = lubridate::hour(datetime), + date = lubridate::as_date(datetime), + doy = lubridate::yday(datetime) + hour/24, + longitude = ifelse(longitude < 0, 360 + longitude, longitude), + rpot = downscale_solar_geom(doy, longitude, latitude)) |> # hourly sw flux calculated using solar geometry + dplyr::group_by(site_id, family, ensemble, date, variable) |> + dplyr::mutate(avg.rpot = mean(rpot, na.rm = TRUE), + avg.SW = mean(prediction, na.rm = TRUE))|> # daily sw mean from solar geometry + dplyr::ungroup() |> + dplyr::mutate(prediction = ifelse(variable == "DSWRF" & avg.rpot > 0.0, rpot * (avg.SW/avg.rpot),prediction)) |> + dplyr::select(any_of(var_order)) +} + +hourly_df <- dplyr::bind_rows(states, fluxes) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + dplyr::mutate(variable = ifelse(variable == "TMP", "air_temperature", variable), + variable = ifelse(variable == "PRES", "air_pressure", variable), + variable = ifelse(variable == "RH", "relative_humidity", variable), + variable = ifelse(variable == "DLWRF", "surface_downwelling_longwave_flux_in_air", variable), + variable = ifelse(variable == "DSWRF", "surface_downwelling_shortwave_flux_in_air", variable), + variable = ifelse(variable == "PRATE", "precipitation_flux", variable), + variable = ifelse(variable == "VGRD", "eastward_wind", variable), + variable = ifelse(variable == "UGRD", "northward_wind", variable), + variable = ifelse(variable == "APCP", "precipitation_amount", variable), + reference_datetime = reference_datetime) |> + dplyr::select(any_of(var_order)) + + return(hourly_df) + +} + +cos_solar_zenith_angle <- function(doy, lat, lon, dt, hr) { + et <- equation_of_time(doy) + merid <- floor(lon / 15) * 15 + merid[merid < 0] <- merid[merid < 0] + 15 + lc <- (lon - merid) * -4/60 ## longitude correction + tz <- merid / 360 * 24 ## time zone + midbin <- 0.5 * dt / 86400 * 24 ## shift calc to middle of bin + t0 <- 12 + lc - et - tz - midbin ## solar time + h <- pi/12 * (hr - t0) ## solar hour + dec <- -23.45 * pi / 180 * cos(2 * pi * (doy + 10) / 365) ## declination + cosz <- sin(lat * pi / 180) * sin(dec) + cos(lat * pi / 180) * cos(dec) * cos(h) + cosz[cosz < 0] <- 0 + return(cosz) +} + +equation_of_time <- function(doy) { + stopifnot(doy <= 367) + f <- pi / 180 * (279.5 + 0.9856 * doy) + et <- (-104.7 * sin(f) + 596.2 * sin(2 * f) + 4.3 * + sin(4 * f) - 429.3 * cos(f) - 2 * + cos(2 * f) + 19.3 * cos(3 * f)) / 3600 # equation of time -> eccentricity and obliquity + return(et) +} + +downscale_solar_geom <- function(doy, lon, lat) { + + dt <- median(diff(doy)) * 86400 # average number of seconds in time interval + hr <- (doy - floor(doy)) * 24 # hour of day for each element of doy + + ## calculate potential radiation + cosz <- cos_solar_zenith_angle(doy, lat, lon, dt, hr) + rpot <- 1366 * cosz + return(rpot) +} diff --git a/man/check_submission.Rd b/man/check_submission.Rd deleted file mode 100644 index 614be98..0000000 --- a/man/check_submission.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/submit.R -\name{check_submission} -\alias{check_submission} -\title{Check that submission was successfully processed} -\usage{ -check_submission( - forecast_file, - s3_region = "data", - s3_endpoint = "ecoforecast.org" -) -} -\arguments{ -\item{forecast_file}{Your forecast csv or nc file} - -\item{s3_region}{subdomain (leave as is for EFI challenge)} - -\item{s3_endpoint}{root domain (leave as is for EFI challenge)} -} -\description{ -Check that submission was successfully processed -} diff --git a/man/combined_scores.Rd b/man/combined_scores.Rd deleted file mode 100644 index ee6a3d1..0000000 --- a/man/combined_scores.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/combine_scores.R -\name{combined_scores} -\alias{combined_scores} -\title{Combining scores from a theme together} -\usage{ -combined_scores(theme, collect = TRUE) -} -\arguments{ -\item{theme}{theme name} - -\item{collect}{TRUE/FALSE to download results} -} -\value{ -a data.frame of scores -} -\description{ -Combining scores from a theme together -} diff --git a/man/download_forecast.Rd b/man/download_forecast.Rd deleted file mode 100644 index 1c3eb21..0000000 --- a/man/download_forecast.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/download_forecast.R -\name{download_forecast} -\alias{download_forecast} -\title{Download forecasts for NEON sites from the EFI server} -\usage{ -download_forecast( - theme, - date = Sys.Date() - 2, - dir = tempdir(), - s3_region = "data", - s3_endpoint = "ecoforecast.org" -) -} -\arguments{ -\item{theme}{string of the theme} - -\item{date}{start date for the forecast} - -\item{dir}{storage location. Use tempdir unless you want to keep this -data around on your computer, in which case, \code{neonstore::neon_dir()} might -be a convenient choice.} - -\item{s3_region}{data} - -\item{s3_endpoint}{ecoforecast.org} -} -\description{ -Download forecasts for NEON sites from the EFI server -} -\examples{ -download_forecast("phenology") -} diff --git a/man/download_scores.Rd b/man/download_scores.Rd deleted file mode 100644 index b250899..0000000 --- a/man/download_scores.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/download_scores.R -\name{download_scores} -\alias{download_scores} -\title{Download score for NEON sites from the EFI server} -\usage{ -download_scores(theme, date = Sys.Date() - 2, dir = tempdir()) -} -\arguments{ -\item{theme}{string of the theme} - -\item{date}{start date for the forecast} - -\item{dir}{storage location. Use tempdir unless you want to keep this -data around on your computer, in which case, \code{neonstore::neon_dir()} might -be a convenient choice.} -} -\description{ -Download score for NEON sites from the EFI server -} -\examples{ -download_scores("phenology") -} diff --git a/man/forecast_output_validator.Rd b/man/forecast_output_validator.Rd index 1e2c55f..e46fbae 100644 --- a/man/forecast_output_validator.Rd +++ b/man/forecast_output_validator.Rd @@ -2,31 +2,13 @@ % Please edit documentation in R/forecast_output_validator.R \name{forecast_output_validator} \alias{forecast_output_validator} -\title{forecast_output_validator} +\title{Validate forecast file} \usage{ -forecast_output_validator( - forecast_file, - target_variables = c("oxygen", "temperature", "richness", "abundance", "nee", "le", - "vswc", "gcc_90", "rcc_90", "ixodes_scapularis", "amblyomma_americanum", - "prediction", "observed"), - theme_names = c("aquatics", "beetles", "phenology", "terrestrial_30min", - "terrestrial_daily", "ticks") -) +forecast_output_validator(forecast_file) } \arguments{ -\item{forecast_file}{Your forecast csv or nc file} - -\item{target_variables}{Possible target variables} - -\item{theme_names}{valid EFI theme names} +\item{forecast_file}{forecast csv or csv.gz file} } \description{ -forecast_output_validator -} -\examples{ - -forecast_file <- system.file("extdata/aquatics-2021-02-01-EFInull.csv.gz", - package = "neon4cast") -forecast_output_validator(forecast_file) - +Validate forecast file } diff --git a/man/get_target.Rd b/man/get_target.Rd deleted file mode 100644 index f7d5fa2..0000000 --- a/man/get_target.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_helpers.R -\name{get_target} -\alias{get_target} -\title{Download target data from s3} -\usage{ -get_target(dir, theme, s3_region = Sys.getenv("AWS_DEFAULT_REGION")) -} -\arguments{ -\item{dir}{full path to working directory} - -\item{theme}{forecast theme} - -\item{s3_region}{s3 region} -} -\description{ -Download target data from s3 -} diff --git a/man/noaa_stage2.Rd b/man/noaa_stage2.Rd index 1938d5e..d503783 100644 --- a/man/noaa_stage2.Rd +++ b/man/noaa_stage2.Rd @@ -13,7 +13,7 @@ Stage2 processing involves the following transforms of the data: noaa_stage2( cycle = 0, version = "v12", - endpoint = "data.ecoforecast.org", + endpoint = NA, verbose = TRUE, start_date = "" ) diff --git a/man/reexports.Rd b/man/reexports.Rd deleted file mode 100644 index 1107f89..0000000 --- a/man/reexports.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/re_exports.R -\docType{import} -\name{reexports} -\alias{reexports} -\alias{read_forecast} -\alias{score} -\alias{crps_logs_score} -\alias{include_horizon} -\alias{pivot_forecast} -\alias{pivot_target} -\title{Objects exported from other packages} -\keyword{internal} -\description{ -These objects are imported from other packages. Follow the links -below to see their documentation. - -\describe{ - \item{read4cast}{\code{\link[read4cast]{read_forecast}}} - - \item{score4cast}{\code{\link[score4cast]{crps_logs_score}}, \code{\link[score4cast]{include_horizon}}, \code{\link[score4cast]{pivot_forecast}}, \code{\link[score4cast]{pivot_target}}, \code{\link[score4cast]{score}}} -}} - diff --git a/man/submit.Rd b/man/submit.Rd index ce56063..608dfe0 100644 --- a/man/submit.Rd +++ b/man/submit.Rd @@ -8,12 +8,12 @@ submit( forecast_file, metadata = NULL, ask = interactive(), - s3_region = "data", + s3_region = "submit", s3_endpoint = "ecoforecast.org" ) } \arguments{ -\item{forecast_file}{Your forecast csv or nc file} +\item{forecast_file}{forecast csv or csv.gz file} \item{metadata}{path to metadata file} diff --git a/man/theme_statistics.Rd b/man/theme_statistics.Rd deleted file mode 100644 index 920edbb..0000000 --- a/man/theme_statistics.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/combine_scores.R -\name{theme_statistics} -\alias{theme_statistics} -\title{Calculating forecast challenge submission statistics} -\usage{ -theme_statistics(themes) -} -\arguments{ -\item{themes}{theme names} -} -\value{ -a data.frame of challenge statistics -} -\description{ -Calculating forecast challenge submission statistics -}