-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathReadxlsx.R
127 lines (104 loc) · 4.4 KB
/
Readxlsx.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
install.packages()
## Select readxl, janitor, here, tidyverse, lubridate, broom, knitr, rmarkdown, cowplot, gridExtra,
## patchwork, scales, plotly, ggcorrplot
## Dir refers to the directory name you'd like to work from
## setwd(dir)
## setwd('C:/Users/roorbach_o/Documents/R/win-library/4.0')
library(readxl)
library(janitor)
library(here)
library(tidyverse)
library(lubridate)
library(broom)
library(knitr)
library(rmarkdown)
library(cowplot)
library(gridExtra)
library(patchwork)
library(scales)
library(plotly)
library(ggcorrplot)
library(kableExtra)
## ???
here::here('data')
## :: means look in this package to use this function,
## %>% pass the left hand side of the operator to the first argument
## of the right hand side of the operator
##WORKS
dat <- readxl::read_xlsx(here::here('data', 'Guana_masterdata_2022.04.06.xlsx'),
sheet = 'Sheet1') %>%
janitor::clean_names()
## change column name to work with previously written code
dat <- rename(dat, date_sampled = sample_date)
# data dictionary with site-specific information
dict <- readr::read_csv(here::here('data', 'guana_data_dictionary.csv')) %>%
janitor::clean_names()
# inspect the data file
head(dat)
str(dat)
dplyr::glimpse(dat) # this one is my favorite to use
## remove dup samples
## cleaning up data, selecting columns we want
# removing all but wind and secchi, all component toupper (not sure why)
dat2 <- dat %>%
dplyr::filter(station_code != "GTMOLNUT_dup") %>% # remove the 'duplicate' station that was only sampled for a short while
dplyr::select(unit,
station_code,
date_sampled,
component_short,
component_long,
result,
remark,
flag) %>%
dplyr::filter(!(component_short %in% c("WIND_D", "SECCHI"))) %>% # remove wind direction and secchi
dplyr::mutate(component_long = toupper(component_long),
component_short = toupper(component_short))
# CAUTION: rewrites over dat2 dataframe created in previous lines
# keeps 'data' as ORIGINAL dat that you read in
dat2 <- dplyr::left_join(dat2, dict, by = "station_code")
# make sure both short and long have the same number of entries (~69)
# should be the same number of entries!
unique(dat2$component_short) # will pull out all the unique component names
unique(dat2$component_long) # will pull out the unique component names
##check for duplicates
janitor::get_dupes(dat2)
##check dat for NA or BLANKS ?? I have quite a few??
View(dat2 %>% dplyr::filter(is.na(result)))
## remove NA files
dat2 <- dat2 %>% dplyr::filter(result != "NA")
##rewrite data2, formatting time and sample and char
dat2 <- dat2 %>%
dplyr::mutate(date_sampled = as.POSIXct(date_sampled,
format = "%m/%d/%Y %H:%M",
tz = 'America/Regina'),
result = as.numeric(result),
month = month(date_sampled),
day = day(date_sampled),
year = as.character(year(date_sampled)), # set year as a character
site = factor(site,
levels = c("MICKLERS",
"DEPGL1",
"DEPGL2",
"LAKE MIDDLE",
"DEPGL4",
"LAKE SOUTH",
"RIVER NORTH",
"DEPGR1",
"GUANA RIVER",
"DEPGR3")),
site_friendly = factor(site_friendly,
levels = c("Micklers",
"GL1",
"GL2",
"Lake Middle",
"GL4",
"Lake South",
"River North",
"GR1",
"Guana River",
"GR3"))
)
#remove bad data - flagged data
dat2 <- dat2 %>% dplyr::filter(!grepl("-3", flag))
## Load Coloring Script
## Load plotting script