-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBeeToolJames.R
210 lines (157 loc) · 8.45 KB
/
BeeToolJames.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# Install necessary packages
install.packages('lubridate')
install.packages('dplyr')
install.packages('tidyverse')
install.packages('tidyr')
install.packages('sf')
install.packages('rgbif')
install.packages('data.table')
# Setting system memory
Sys.setenv('R_MAX_VSIZE'=64000000000)
# Clear unused memory
gc()
# If you just need to import a data frame so you can spatial join
all_bees <- read.delim("clipboard")
# Once you download a GBIF dataset, save the .csv file into your working directory
importFilename <- "may1.csv"
# This will be what the final exported .csv file will be called
exportFilename <- "export.csv"
# This is the largest GBIF ID in the pollinator database
largestGbifID <- 0
# This is the location of the county shapefile for the entire United States which is needed for spatial join
# NORTH AMERICA
require(sf)
require(dplyr)
counties_shp <- st_read("C:/Users/jrweaver/OneDrive - DOI/BEE_RICHNESS_TOOL/NA_Shapefile/NA_Shapefile.shp")
counties_shp <- select(counties_shp, -c(STATEFP, COUNTYFP, COUNTYNS, GEOID, NAME, NAMELSAD, LSAD, CLASSFP, MTFCC, CSAFP, CBSAFP, METDIVFP, FUNCSTAT, ALAND, AWATER, INTPTLAT, INTPTLON, new_value, PRUID, PRNAME, PRENAME, PRFNAME, PREABBR, PRFABBR, ADM1_ES, ADM1_PCODE, ADM1_REF, ADM1ALT1ES, ADM1ALT2ES, ADM0_ES, ADM0_PCODE, date, validOn, validTo, Shape_Leng, Shape_Le_1, Shape_Area))
# EPA L3
EPAL3 <- st_read("C:\\Users\\jrweaver\\OneDrive - DOI\\EPA LEVEL 3\\NA_CEC_Eco_Level3.shp")
EPAL3 <- select(EPAL3, -c(NA_L3NAME, NA_L2CODE, NA_L2NAME, NA_L1CODE, NA_L1NAME, NA_L3KEY, NA_L2KEY, NA_L1KEY, Shape_Leng, Shape_Area))
# This batch code filters and edits the raw GBIF dataset
for(i in 1:1){
require(sf)
require(lubridate)
require(dplyr)
require(tidyverse)
require(tidyr)
require(data.table)
# Import raw dataset from GBIF (6 families in North America)
all_bees <- fread(importFilename, quote = "", select = c("gbifID", "individualCount", "decimalLatitude", "decimalLongitude", "speciesKey", "basisOfRecord", "institutionCode", "collectionCode", "identifiedBy","eventDate", "day","month","year", "species", "infraspecificEpithet"), na.strings = "")
# When updating a GBIF dataset, this is how you delete all records that are less than the highest GBIF record in he database
all_bees <- all_bees[!(all_bees$gbifID <= largestGbifID), ]
# Remove all records that do not have coordinates (should be zero if the GBIF data query requires coordinates)
all_bees <- filter(all_bees, decimalLatitude != "NA")
all_bees <- filter(all_bees, decimalLongitude != "NA")
# Remove all records that are not identified to species (this tends to remove quite a few occurrences)
all_bees <- filter(all_bees, species != "")
# Remove all records that have taxonRank marked as UNRANKED
# all_bees <- filter(all_bees, taxonRank != "UNRANKED")
# Replace "NA" values in subspecies with blank field
all_bees$infraspecificEpithet <- sapply(all_bees$infraspecificEpithet, as.character)
all_bees$infraspecificEpithet[is.na(all_bees$infraspecificEpithet)] <- ""
# If there is a variety or subspecies, add it to full name.
all_bees$fullName <- paste(all_bees$species, all_bees$infraspecificEpithet)
# Trim whitespace (if any) from fullName field
all_bees$fullName <- str_trim(all_bees$fullName, side = c("both", "left", "right"))
# Finding all values in Individual count that are NA and making them 1
all_bees$individualCount[is.na(all_bees$individualCount)] <- 1
# Finding all values in Individual count that are 0 and making them 1
# This may seem counterintuitive but 0 does not mean absence data.
# There are many museum curated specimens erroneously marked as 0 for individualCount
all_bees$individualCount[all_bees$individualCount == 0] <- 1
# Create a date field
all_bees$date <- as.Date(paste(all_bees$month,all_bees$day,all_bees$year,sep="-"),format = "%m-%d-%Y")
all_bees$date <- format(strptime(all_bees$date, format = "%Y-%m-%d"), "%m/%d/%Y")
# Remove unneeded date and species/subspecies fields
all_bees <- select(all_bees, -c(species, infraspecificEpithet))
all_bees <- select(all_bees, -c(day, month, year, eventDate))
# Replace NA values in the date field with blanks
all_bees$date[is.na(all_bees$date)] <- ""
}
# NORTH AMERICA DATASET - This batch code spatially joins the records to a county field (location_ID) and exports a CSV
for (i in 1:1){
# Add county fields with GIS spatial join
all_bees <- st_as_sf(all_bees,
coords = c(x = "decimalLongitude",
y = "decimalLatitude"),
crs = "NAD83",
remove = FALSE)
# Only use the below code if you need to convert the coordinate system to NAD83 or something else
# st_transform(counties_shp, crs = st_crs("NAD83"))
# Set the counties_shp to NAD83
st_crs(counties_shp) <- "NAD83"
# Make any invalid lines in the shapefile valid (this takes a little while)
counties_shp <- st_make_valid(counties_shp)
# Use S2 geometry TRUE or FALSE
sf_use_s2(TRUE)
# This is the actual join function
all_bees <- st_join(all_bees, counties_shp, join = st_within)
# all_bees <- st_join(all_bees, refuges, join = st_within)
# This removes all the unnecessary columns added by the join function
all_bees <- data.frame(all_bees)
all_bees <- select(all_bees, -c(geometry))
# names(all_bees)[names(all_bees) == "date.x"] <- "date"
# Remove all records where the occurrence was not geo-referenced within a county boundary
# all_bees <- filter(all_bees, GEOID != "NA")
all_bees <- filter(all_bees, ID != "NA")
names(all_bees)[names(all_bees) == "ID"] <- "location_ID"
# Export CSV file
write.csv(all_bees, exportFilename, row.names = FALSE)
}
# NORTH AMERICA DATASET - This batch code spatially joins the records to a EPA LIII field (NA_L3CODE) and exports a CSV
for (i in 1:1){
# Add county fields with GIS spatial join
all_bees <- st_as_sf(all_bees,
coords = c(x = "decimalLongitude", y = "decimalLatitude"),
crs = "NAD83",
remove = FALSE)
# Only use the below code if you need to convert the coordinate system to NAD83 or something else
# st_transform(counties_shp, crs = st_crs("NAD83"))
# Set the counties_shp to NAD83
st_crs(EPAL3) <- "NAD83"
EPAL3 <- st_transform(EPAL3, "NAD83")
# Make any invalid lines in the shapefile valid (take a little while)
EPAL3 <- st_make_valid(EPAL3)
# refuges <- st_make_valid(refuges)
sf_use_s2(TRUE)
# This is the actual join function
all_bees <- st_join(all_bees, EPAL3, join = st_within)
# all_bees <- st_join(all_bees, refuges, join = st_within)
# This removes all the unnecessary columns added by the join function
all_bees <- data.frame(all_bees)
all_bees <- select(all_bees, -c(geometry))
names(all_bees)[names(all_bees) == "date.x"] <- "date"
# Remove all records where the occurrence was not geo-referenced within a county boundary
# all_bees <- filter(all_bees, GEOID != "NA")
all_bees <- filter(all_bees, ID != "NA")
names(all_bees)[names(all_bees) == "ID"] <- "location_ID"
# Export CSV file
write.csv(all_bees, exportFilename, row.names = FALSE)
}
# IF YOU JUST NEED TO CREATE A TAXONOMIC LIST OF UNIQUE SPECIES
for(i in 1:1){
require(lubridate)
require(dplyr)
require(tidyverse)
require(tidyr)
require(sf)
require(data.table)
# Import raw dataset from GBIF (Present, 6 families, United States)
# Just change the CSV filename to whatever I called it
all_bees <- fread(importFilename, quote = "", select = c("family", "genus", "species", "infraspecificEpithet", "taxonRank", "scientificName", "speciesKey"), na.strings = "")
# Remove all records that are not identified to species
all_bees <- filter(all_bees, species != "")
# Remove all records that have taxonRank marked as UNRANKED
all_bees <- filter(all_bees, taxonRank != "UNRANKED")
# Replace "NA" values in subspecies with blank field
all_bees$infraspecificEpithet <- sapply(all_bees$infraspecificEpithet, as.character)
all_bees$infraspecificEpithet[is.na(all_bees$infraspecificEpithet)] <- ""
# If there is a variety or subspecies, add it to full name.
all_bees$fullName <- paste(all_bees$species, all_bees$infraspecificEpithet)
# Trim whitespace from right side of fullName field
all_bees$fullName <- str_trim(all_bees$fullName, side = c("both", "left", "right"))
# Filter the taxa dataset to only include unique values
all_bees <- unique(all_bees)
#Export the taxa list
write.csv(all_bees, exportFilename, row.names = FALSE)
}