functions work with all sites from database

SCCWRP · Nov 9, 2018 · ae76718 · ae76718
1 parent c48a26e
commit ae76718
Show file tree

Hide file tree

Showing 8 changed files with 242 additions and 191 deletions.
diff --git a/R/chkinp.R b/R/chkinp.R
@@ -58,7 +58,8 @@ chkinp <- function(data, purge = FALSE, msgs = FALSE){
   algae <- c('Microalgae Thickness', 'Macrophyte Cover', 'Macroalgae Cover, Attached', 'Macroalgae Cover, Unattached')
   channelsinuosity <- c('Slope', 'Length, Segment', 'Elevation Difference', 'Bearing', 'Proportion', 'Length, Reach')
   densiometer <- c('Canopy Cover')
-  sels <- c(algae, channelsinuosity, densiometer)
+  ripveg <- c('Riparian GroundCover Barren', 'Riparian GroundCover NonWoody Plants', 'Riparian GroundCover Woody Shrubs', 'Riparian Lower Canopy All Vegetation', 'Riparian Upper Canopy All Trees', 'Riparian Lower Canopy All Vegetation', 'Riparian Upper Canopy All Trees', 'Riparian GroundCover Woody Shrubs', 'Riparian GroundCover NonWoody Plants')
+  sels <- c(algae, channelsinuosity, densiometer, ripveg)
 
   # see if duplicate id, locationcode, analytename
   chk <- data %>% 

diff --git a/R/misc.R b/R/misc.R
@@ -9,7 +9,7 @@
 #' @examples 
 #' misc(sampdat)
 misc <- function(data){
-  
+
   data <- data[which(data$AnalyteName %in% c('Riffle/Run Channel Alteration', 'Riffle/Run Epifaunal Substrate', 'Riffle/Run Sediment Deposition', 'Dominant Land Use', 'Evidence of Fire', 'Evidence of Recent Rainfall')),]
   ###Report###
   data$VariableResult[data$ResQualCode=="NR"] <- NA
@@ -43,7 +43,7 @@ misc <- function(data){
     dplyr::full_join(RBP_SED, by = 'id') %>% 
     as.data.frame(stringsAsFactors = F)
 
-  rownames(misc_metrics) <- gsub('\\s.*$', '', misc_metrics$id)
+  rownames(misc_metrics) <- misc_metrics$id
   misc_metrics <- misc_metrics[, !names(misc_metrics) %in% 'id']
 
   colnames(misc_metrics)<- c("NFC_DLU.result", "NFC_EFR.result", "NFC_ERN.result",

diff --git a/R/phabmetrics.R b/R/phabmetrics.R
@@ -10,6 +10,7 @@
 #' }
 phabmetrics <- function(data){
   data <- phabformat(data)
+  data <- chkinp(data, purge = TRUE)
   metrics <- list(bankmorph(data), channelmorph(data), channelsinuosity(data),
                   densiometer(data),  habitat(data), disturbance(data), flow(data),
                   misc(data), bankstability(data), quality(data), ripveg(data),

diff --git a/R/ripveg.R b/R/ripveg.R
@@ -14,16 +14,19 @@ ripveg <- function(data){
   barren <- data.frame(cbind(data$id[barrenindex], as.character(data$AnalyteName[barrenindex]),
                              as.character(data$VariableResult[barrenindex])))
   colnames(barren) <- c("id", "AnalyteName", "result")
-  barren$result <- as.numeric(as.character(barren$result))
+  barren$result <- as.character(barren$result)
 
   ###Compute XGB###
-
-  for(i in (1:length(barren$result))[which(!is.na(barren$result))]){
-    if(barren$result[i] == 1){barren$result[i] <- 5} else
-      if(barren$result[i] == 2){barren$result[i] <- 25} else
-        if(barren$result[i] == 3){barren$result[i] <- 57.5} else
-          if(barren$result[i] == 4){barren$result[i] <- 87.5}
-  }
+
+  barren$result <- dplyr::case_when(
+    barren$result == '0' ~ 0, 
+    barren$result == '1' ~ 5, 
+    barren$result == '2' ~ 25, 
+    barren$result == '3' ~ 57.5, 
+    barren$result == '4' ~ 87.5,
+    TRUE ~ NaN
+  )
+
   sumna <- function(data){
     sum(data, na.rm = T)
   }
@@ -45,15 +48,18 @@ ripveg <- function(data){
   nonwoody <- data.frame(cbind(data$id[nonwoodyindex], as.character(data$AnalyteName[nonwoodyindex]),
                                as.character(data$VariableResult[nonwoodyindex])))
   colnames(nonwoody) <- c("id", "AnalyteName", "result")
-  nonwoody$result <- as.numeric(as.character(nonwoody$result))
+  nonwoody$result <- as.character(nonwoody$result)
 
   ###Compute XGH###
-  for(i in (1:length(nonwoody$result))[which(!is.na(nonwoody$result))]){
-    if(nonwoody$result[i] == 1){nonwoody$result[i] <- 5} else
-      if(nonwoody$result[i] == 2){nonwoody$result[i] <- 25} else
-        if(nonwoody$result[i] == 3){nonwoody$result[i] <- 57.5} else
-          if(nonwoody$result[i] == 4){nonwoody$result[i] <- 87.5}
-  }
+  nonwoody$result <- dplyr::case_when(
+    nonwoody$result == '0' ~ 0, 
+    nonwoody$result == '1' ~ 5, 
+    nonwoody$result == '2' ~ 25, 
+    nonwoody$result == '3' ~ 57.5, 
+    nonwoody$result == '4' ~ 87.5,
+    TRUE ~ NaN
+  )
+
   sumna <- function(data){
     sum(data, na.rm = T)
   }
@@ -78,16 +84,18 @@ ripveg <- function(data){
 
   woody$result[woody$result %in% 'Not Recorded'] <- NA 
   woody$result <- as.numeric(as.character(woody$result))
-  head(woody)
 
   ###Compute XGW###
 
-  for(i in (1:length(woody$result))[which(!is.na(woody$result))]){
-    if(woody$result[i] == 1){woody$result[i] <- 5} else
-      if(woody$result[i] == 2){woody$result[i] <- 25} else
-        if(woody$result[i] == 3){woody$result[i] <- 57.5} else
-          if(woody$result[i] == 4){woody$result[i] <- 87.5}
-  }
+  woody$result <- dplyr::case_when(
+    woody$result == '0' ~ 0, 
+    woody$result == '1' ~ 5, 
+    woody$result == '2' ~ 25, 
+    woody$result == '3' ~ 57.5, 
+    woody$result == '4' ~ 87.5,
+    TRUE ~ NaN
+  )
+
   sumna <- function(data){
     sum(data, na.rm = T)
   }
@@ -109,17 +117,19 @@ ripveg <- function(data){
   lowercanopy <- data.frame(cbind(data$id[lowercanopyindex], as.character(data$AnalyteName[lowercanopyindex]),
                                   as.character(data$VariableResult[lowercanopyindex])))
   colnames(lowercanopy) <- c("id", "AnalyteName", "result")
-  lowercanopy$result <- as.numeric(as.character(lowercanopy$result))
-  head(lowercanopy)
+  lowercanopy$result <- as.character(lowercanopy$result)
 
   ###Compute XM###
 
-  for(i in (1:length(lowercanopy$result))[which(!is.na(lowercanopy$result))]){
-    if(lowercanopy$result[i] == 1){lowercanopy$result[i] <- 5} else
-      if(lowercanopy$result[i] == 2){lowercanopy$result[i] <- 25} else
-        if(lowercanopy$result[i] == 3){lowercanopy$result[i] <- 57.5} else
-          if(lowercanopy$result[i] == 4){lowercanopy$result[i] <- 87.5}
-  }
+  lowercanopy$result <- dplyr::case_when(
+    lowercanopy$result == '0' ~ 0, 
+    lowercanopy$result == '1' ~ 5, 
+    lowercanopy$result == '2' ~ 25, 
+    lowercanopy$result == '3' ~ 57.5, 
+    lowercanopy$result == '4' ~ 87.5,
+    TRUE ~ NaN
+  )
+
   sumna <- function(data){
     sum(data, na.rm = T)
   }
@@ -142,17 +152,19 @@ ripveg <- function(data){
                                   as.character(data$VariableResult[uppercanopyindex])))
   colnames(uppercanopy) <- c("id", "AnalyteName", "result")
 
-  uppercanopy$result <- as.numeric(as.character(uppercanopy$result))
-  head(uppercanopy)
+  uppercanopy$result <- as.character(uppercanopy$result)
 
   ###Compute XC###
 
-  for(i in (1:length(uppercanopy$result))[which(!is.na(uppercanopy$result))]){
-    if(uppercanopy$result[i] == 1){uppercanopy$result[i] <- 5} else
-      if(uppercanopy$result[i] == 2){uppercanopy$result[i] <- 25} else
-        if(uppercanopy$result[i] == 3){uppercanopy$result[i] <- 57.5} else
-          if(uppercanopy$result[i] == 4){uppercanopy$result[i] <- 87.5}
-  }
+  uppercanopy$result <- dplyr::case_when(
+    uppercanopy$result == '0' ~ 0, 
+    uppercanopy$result == '1' ~ 5, 
+    uppercanopy$result == '2' ~ 25, 
+    uppercanopy$result == '3' ~ 57.5, 
+    uppercanopy$result == '4' ~ 87.5,
+    TRUE ~ NaN
+  )
+
   sumna <- function(data){
     sum(data, na.rm = T)
   }
@@ -185,6 +197,7 @@ ripveg <- function(data){
   lowercanopy <- data.frame(cbind(data$id[lowercanopyindex], as.character(data$AnalyteName[lowercanopyindex]),
                                   as.character(data$VariableResult[lowercanopyindex])))
   colnames(lowercanopy) <- c("id", "AnalyteName", "result")
+  lowercanopy$result[lowercanopy$result %in% 'Not Recorded'] <- NA 
   lowercanopy$result <- as.numeric(as.character(lowercanopy$result))
 
   XPMID_total <- tapply(lowercanopy$result, lowercanopy$id, lengthna)
@@ -202,8 +215,8 @@ ripveg <- function(data){
   uppercanopy <- data.frame(cbind(data$id[uppercanopyindex], as.character(data$AnalyteName[uppercanopyindex]),
                                   as.character(data$VariableResult[uppercanopyindex])))
   colnames(uppercanopy) <- c("id", "AnalyteName", "result")
+  uppercanopy$result[uppercanopy$result %in% 'Not Recorded'] <- NA 
   uppercanopy$result <- as.numeric(as.character(uppercanopy$result))
-  head(uppercanopy)
 
   XPCAN_total <- tapply(uppercanopy$result, uppercanopy$id, lengthna)
 
@@ -213,7 +226,7 @@ ripveg <- function(data){
   }
   XPCAN_subcount <- tapply(uppercanopy$result, uppercanopy$id, XPCAN_subcountf)
   XPCAN.result <- XPCAN_subcount/XPCAN_total
-  
+
   ###Compute XPGVEG###
   woodyindex <- which(data$AnalyteName == "Riparian GroundCover Woody Shrubs")
   woody <- data.frame(cbind(data$id[woodyindex], as.character(data$AnalyteName[woodyindex]),
@@ -226,6 +239,7 @@ ripveg <- function(data){
   nonwoody <- data.frame(cbind(data$id[nonwoodyindex], as.character(data$AnalyteName[nonwoodyindex]),
                                as.character(data$VariableResult[nonwoodyindex])))
   colnames(nonwoody) <- c("id", "AnalyteName", "result")
+  nonwoody$result[nonwoody$result %in% 'Not Recorded'] <- NA 
   nonwoody$result <- as.numeric(as.character(nonwoody$result))
   head(nonwoody)
   woody$XPGVEG<-apply((cbind(woody$result, nonwoody$result)), 1, sum, na.rm=T)

diff --git a/R/substrate.R b/R/substrate.R
@@ -4,10 +4,21 @@
 #' 
 #' @export
 #' 
+#' @importFrom magrittr "%>%"
+#' 
 #' @examples 
 #' substrate(sampdat)
 substrate <- function(data){
   data <- data[which(data$AnalyteName %in% c('Substrate Size Class', 'Embeddedness', 'CPOM')),]
+
+  data <- data %>%
+    dplyr::select(id, LocationCode, AnalyteName, VariableResult, Result) %>%
+    unique %>%
+    tidyr::complete(id, LocationCode, AnalyteName) %>%
+    dplyr::group_by(id) %>%
+    dplyr::filter(!all(is.na(VariableResult) & !all(is.na(Result)))) %>%
+    dplyr::ungroup()
+
   data$VariableResult <- as.character(data$VariableResult)
   data$VariableResult[data$VariableResult=="a"]<-"SA"
   data$VariableResult[data$VariableResult=="as"]<-"SA"
@@ -33,12 +44,11 @@ substrate <- function(data){
   ###Compute
 
   metric <- c('RS', 'RR', 'RC', 'XB', 'SB', 'CB', 'GC', 'GF', 'SA', 'FN', 'HP', 'WD', 'OT')
-
   sub$VariableResult <- lapply(sub$VariableResult, toupper)
-  
+
   lengths <- function(data){
     length(which(((data != "NOT RECORDED") &(data != "NA"))&(data != "FNOT RECORDED")))}
-  totals <- tapply(sub$VariableResult, sub$id, lengths)
+  totals <- tapply(unlist(sub$VariableResult), sub$id, lengths)
   tnames <- as.vector(dimnames(totals))
   qq <-unlist(tnames)
   l <- matrix(NA, ncol=length(metric), nrow=length(totals))
@@ -140,13 +150,13 @@ substrate <- function(data){
   result$XEMBED.result <- XEMBED_sum/XEMBED_count
   result$XEMBED.count <- XEMBED_count
   result$XEMBED.sd <- tapply(embed$Result, embed$id, sdna)
-  
+
   cpom <- data[data$AnalyteName=="CPOM",]
   present <- function(data){
-    sum(data == "Present")
+    sum(data == "Present", na.rm = TRUE)
   }
   cpomtotal <- function(data){
-    sum((data == "Present") | (data == "Absent"))
+    sum((data == "Present") | (data == "Absent"), na.rm = TRUE)
   }
   cpresent <- tapply(cpom$VariableResult, cpom$id, present)
   ctotal <- tapply(cpom$VariableResult, cpom$id, cpomtotal)

diff --git a/README.Rmd b/README.Rmd
@@ -41,14 +41,25 @@ phabmetrics(sampdat)
 
 # Required data checks
 
-* Every unique `id`, `LocationCode`, `AnalyteName`, and `VariableResult` combination should have only one entry in `VariableResult` for `AnalyteName %in% c('Microalgae Thickness', 'Macrophyte Cover', 'Macroalgae Cover, Attached', 'Macroalgae Cover, Unattached')`.  The `algae.R` function will remove duplicate entries but a checker should be built that verifies a unique value can be determined.  
+* For every function, make sure there are no duplicate or conflicting values for every unique combination of `id`, `LocationCode`, `AnalyteName`, and `VariableResult` (or `Result`).  This should be specific to the metric classes just to be safe.  For example, every combination should have only one entry in `VariableResult` for `AnalyteName %in% c('Microalgae Thickness', 'Macrophyte Cover', 'Macroalgae Cover, Attached', 'Macroalgae Cover, Unattached')` for the algae metrics.  The `algae.R` function will remove duplicate entries but a checker should be built that verifies a unique value can be determined.  
 
 * Required column names, see those in `sampdat`.  
 
 * Check for required values in `AnalyteName` (note that `chkinp()` can check of the columns exist but we'll need a checker on data input to check for these and only these): 
      * `c('Microalgae Thickness', 'Macrophyte Cover', 'Macroalgae Cover, Attached', 'Macroalgae Cover, Unattached')` for `algae()`
      * `c('Bankfull Height', 'Bankfull Width', 'StationWaterDepth', 'Wetted Width')` for `bankmorph()`
-     * `c(Slope', 'Length, Segment', 'Elevation Difference', 'Bearing', 'Proportion', 'Length, Reach')` for `channelsinuosity()`
      * `c('Cascade/Falls', 'Dry', 'Glide', 'Pool', 'Rapid', 'Riffle', 'Run'))` for `channelmorph()` 
+     * `c(Slope', 'Length, Segment', 'Elevation Difference', 'Bearing', 'Proportion', 'Length, Reach')` for `channelsinuosity()`
+     * `c('Canopy Cover')` for `densiometer()`
+     * `c('Distance from Bank', 'StationWaterDepth', 'Velocity', 'Distance, Float', 'Float Time', 'Wetted Width')` for `flow()`
+     * `c('Fish Cover Macrophytes', 'Fish Cover Artificial Structures', 'Fish Cover Boulders', 'Fish Cover Filamentous Algae', 'Fish Cover Woody Debris >0.3 m', 'Fish Cover Live Trees/Roots', 'Fish Cover Overhang.Veg', 'Fish Cover Woody Debris <0.3 m', 'Fish Cover Undercut Banks')` for `habitat()`
+     * `c('Riparian Bridges/Abutments', 'Riparian Buildings', 'Riparian Landfill/Trash', 'Riparian Logging', 'Riparian Mining', 'Riparian Orchards/Vineyards', 'Riparian Park/Lawn', 'Riparian Pasture/Range', 'Riparian Pavement', 'Riparian Pipes', 'Riparian Road', 'Riparian Row Crops', 'Riparian Vegetation Management', 'Riparian Wall/Dike')` for `disturbance()`
+     * `c('Riffle/Run Channel Alteration', 'Riffle/Run Epifaunal Substrate', 'Riffle/Run Sediment Deposition', 'Dominant Land Use', 'Evidence of Fire', 'Evidence of Recent Rainfall')` for `misc()`
+     * `c('Bank Stability')` for `bankstability()`
+     * `c("Alkalinity as CaCO3", "Oxygen, Dissolved", "pH", "Salinity", "SpecificConductivity", "Temperature", "Turbidity")` for `quality()`
+     * `c('Riparian GroundCover Barren', 'Riparian GroundCover NonWoody Plants', 'Riparian GroundCover Woody Shrubs', 'Riparian Lower Canopy All Vegetation', 'Riparian Upper Canopy All Trees', 'Riparian Lower Canopy All Vegetation', 'Riparian Upper Canopy All Trees', 'Riparian GroundCover Woody Shrubs', 'Riparian GroundCover NonWoody Plants')` for `ripveg()` 
+     * `c('Substrate Size Class', 'Embeddedness', 'CPOM')` for `substrate()`
 
-* Check for required values in `LocationCode` (note that `chkinp()` can check of the columns exist but we'll need a checker on data input to check for these and only these)
+* Check for required values in `LocationCode` (note that `chkinp()` can check of the columns exist but we'll need a checker on data input to check for these and only these)
+
+* Maybe we need to add a checker to make sure all values in each field are present but with appropriate NA values for `Result`, `VariableResult`, this can be done with `tidyr::complete()` but may be unnecessary since this will increase data volume