.Rhistory

# Build a 2 x 2 numeric matrix and probe several index forms on it.
amat <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2)
amat
# Two-index form with a zero row index.
amat[0, 1]
# Single-vector (linear) index forms, some containing a zero.
amat[c(0, 1)]
amat[c(1, 1)]
amat[c(0, 2)]
amat[c(0, 3)]
amat[3]
# Two-index (row, column) form.
amat[2, 2]
amat[1, 2]
# Dispatch on the string in `x`. An empty alternative (e.g. `b = ,`) falls
# through to the next non-empty one; the unnamed last argument is the default.
x <- "e"
switch(
  x,
  a = "option 1",
  b = ,
  c = "option 2",
  d = ,
  e = ,
  f = "option 3",
  stop("Invalid `x` value")
)
# Same dispatch as before, but without the `f` alternative. `e` is empty, so
# evaluation falls through to the next non-empty argument, which here is the
# unnamed default: this call therefore ends in the stop() error.
x="e"
switch(x,
a = "option 1",
b = ,
c = "option 2",
d = ,
e = ,
stop("Invalid `x` value")
)
# `clc` is MATLAB's clear-console command; base R defines neither an object
# nor a function with that name, so both `clc` and `clc()` fail. Writing a
# form feed clears the console in RStudio instead.
cat("\014")
# Work from the HW4 project folder so the relative file names below resolve.
setwd("~/Desktop/BU/Academics/Masters/F24/GRSMA615/HW/HW4/buoy-rainfall-hw4")
library(readr)
# Load the single rainfall table and open it in the data viewer.
Rainfall <- read_csv(file = "Rainfall.csv")
View(Rainfall)
# (Part A) Read one file per year, for every year whose file is present.
years <- 1985:2023
all_data <- list()
for (year in years) {
  file_name <- paste0("Rainfall_", year, ".csv")  # Example file naming
  # Years without a file are simply skipped.
  if (!file.exists(file_name)) {
    next
  }
  yearly_data <- read.csv(file_name, stringsAsFactors = FALSE)
  all_data[[as.character(year)]] <- yearly_data
}
# Stack the per-year tables row-wise.
combined_data <- do.call(rbind, all_data)
View(Rainfall)
# Read in the data
data <- read.csv("Rainfall.csv", stringsAsFactors = FALSE)
# Extract the year from the 'DATE' column (assuming the first four characters
# represent the year). YEAR is kept as a character column.
data$YEAR <- substr(data$DATE, 1, 4)
# Filter data to include only the range from 1985 to 2023.
# The comparison is done on integers: comparing the character YEAR column
# directly against a number makes R compare them as text, not as values.
year_num <- as.integer(data$YEAR)
in_range <- !is.na(year_num) & year_num >= 1985L & year_num <= 2023L
data_filtered <- data[in_range, , drop = FALSE]
# Check the filtered data
head(data_filtered)
# Read in the data
data <- read.csv("Rainfall.csv", stringsAsFactors = FALSE)
# Extract the year from the 'DATE' column (assuming the first four characters
# represent the year). YEAR is kept as a character column.
data$YEAR <- substr(data$DATE, 1, 4)
# Filter data to include only the range from 1985 to 2023.
# The comparison is done on integers: comparing the character YEAR column
# directly against a number makes R compare them as text, not as values.
year_num <- as.integer(data$YEAR)
in_range <- !is.na(year_num) & year_num >= 1985L & year_num <= 2023L
data_filtered <- data[in_range, , drop = FALSE]
# Check the filtered data
head(data_filtered)
# Part B
# Replace 999 values in relevant columns with NA (use actual column names as needed)
data_filtered[data_filtered == 999] <- NA
# Analyze the NA patterns by looking at the number of NA values in each column.
# vapply() always yields an integer vector, even for a table with no columns.
na_counts <- vapply(data_filtered, function(x) sum(is.na(x)), integer(1))
print(na_counts)
# Optionally visualize the missing values pattern.
# Install naniar only when it is missing, and before attaching it:
# library() fails for a package that has not been installed yet.
if (!requireNamespace("naniar", quietly = TRUE)) {
  install.packages("naniar")
}
library(naniar)
# Read in the data
data <- read.csv("Rainfall.csv", stringsAsFactors = FALSE)
# Extract the year from the 'DATE' column (assuming the first four characters
# represent the year). YEAR is kept as a character column.
data$YEAR <- substr(data$DATE, 1, 4)
# Filter data to include only the range from 1985 to 2023.
# The comparison is done on integers: comparing the character YEAR column
# directly against a number makes R compare them as text, not as values.
year_num <- as.integer(data$YEAR)
in_range <- !is.na(year_num) & year_num >= 1985L & year_num <= 2023L
data_filtered <- data[in_range, , drop = FALSE]
# Check the filtered data
head(data_filtered)
# Part B
# Replace 999 values in relevant columns with NA (use actual column names as needed)
data_filtered[data_filtered == 999] <- NA
# Analyze the NA patterns by looking at the number of NA values in each column.
# vapply() always yields an integer vector, even for a table with no columns.
na_counts <- vapply(data_filtered, function(x) sum(is.na(x)), integer(1))
print(na_counts)
# Optionally visualize the missing values pattern
library(naniar)
gg_miss_var(data_filtered)
View(Rainfall)
setwd("~/Desktop/BU/Academics/Masters/F24/GRSMA615/HW/HW4/buoy-rainfall-hw4")
# Show the column types and first values of the loaded table.
# `structure(Rainfall.csv)` fails because `Rainfall.csv` is a file name, not
# an R object, and `structure("Rainfall.csv")` only echoes the string back;
# str() is the base function that summarises an object's structure.
str(Rainfall)
# fread() comes from data.table, so the package has to be attached before
# the call; without it the read fails with "could not find function".
library(data.table)
# Assemble the URL of the 2023 file for station 44013 from its three parts.
file_root <- "https://www.ndbc.noaa.gov/view_text_file.php?filename=44013h"
year <- "2023"
tail <- ".txt.gz&dir=data/historical/stdmet/"
path <- paste0(file_root, year, tail)
# Read the first line as whitespace-separated column names.
header <- scan(path, what = "character", nlines = 1)
# Read the table body, skipping the first two lines of the file.
buoy <- fread(path, header = FALSE, skip = 2)
# R script to read in all the buoy data
library(data.table)
library(lubridate)
# URL pieces for the 2023 file of station 44013.
file_root <- "https://www.ndbc.noaa.gov/view_text_file.php?filename=44013h"
year <- "2023"
tail <- ".txt.gz&dir=data/historical/stdmet/"
path <- paste0(file_root, year, tail)
# First line of the file supplies the column names.
header <- scan(path, what = "character", nlines = 1)
# Table body: everything after the first two lines, read without a header row.
buoy <- fread(path, header = FALSE, skip = 2)
colnames(buoy) <- header
View(buoy)
View(data)
# Empty the workspace, then run the same read again from a clean state.
rm(list = ls())
# R script to read in all the buoy data
library(data.table)
library(lubridate)
file_root <- "https://www.ndbc.noaa.gov/view_text_file.php?filename=44013h"
year <- "2023"
tail <- ".txt.gz&dir=data/historical/stdmet/"
path <- paste0(file_root, year, tail)
header <- scan(path, what = "character", nlines = 1)
buoy <- fread(path, header = FALSE, skip = 2)
colnames(buoy) <- header
View(buoy)
### TAHA H. ABABOU ###
### Homework 2                                         ###
### GGPlot Basics ###
#Put your code in this file. Make sure you assign the relevant values to the correct variable names, which are given below.
#Uncomment the variables as you assign your final values/functions/results to them.
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(stringr)# This loads the packages necessary to run your plots. Do not delete or comment this out.
library(readr)
### Exercise 1
# Switch to the HW2 folder first: the CSV is referenced by a relative path,
# so reading it before changing directory fails when started elsewhere.
setwd("~/Desktop/BU/Academics/Masters/F24/GRSMA615/HW/HW2")
# Read in the CSV file
sp500_data <- read.csv("SPX-1Month.csv")
### TAHA H. ABABOU ###
### Homework 2                                         ###
### GGPlot Basics ###
#Put your code in this file. Make sure you assign the relevant values to the correct variable names, which are given below.
#Uncomment the variables as you assign your final values/functions/results to them.
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(stringr)# This loads the packages necessary to run your plots. Do not delete or comment this out.
library(readr)
### Exercise 1
# Load the CSV into a data frame.
sp500_data <- read.csv(file = "SPX-1Month.csv")
# Basic plot (spx_plot1): `Close.Last` against `Date`, drawn as points joined
# by a line. `group = 1` puts every row in one group so a single line is drawn.
spx_plot1 <- ggplot(
  data = sp500_data,
  mapping = aes(x = Date, y = Close.Last, group = 1)
) +
  geom_point() +
  geom_line()
print(spx_plot1)
### TAHA H. ABABOU ###
### Homework 2                                         ###
### GGPlot Basics ###
#Put your code in this file. Make sure you assign the relevant values to the correct variable names, which are given below.
#Uncomment the variables as you assign your final values/functions/results to them.
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(stringr)# This loads the packages necessary to run your plots. Do not delete or comment this out.
library(readr)
### Exercise 1
# Read in the CSV file
sp500_data <- read.csv("SPX-1Month.csv")
# Create the basic plot (spx_plot1) with points only.
# The expression must not end in `+`: a trailing `+` pulls the following
# print() call into the plot expression instead of running it afterwards.
spx_plot1 <- ggplot(sp500_data, aes(x = Date, y = `Close.Last`, group = 1)) +
  geom_point()
print(spx_plot1)
### TAHA H. ABABOU ###
### Homework 2                                         ###
### GGPlot Basics ###
#Put your code in this file. Make sure you assign the relevant values to the correct variable names, which are given below.
#Uncomment the variables as you assign your final values/functions/results to them.
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(stringr)# This loads the packages necessary to run your plots. Do not delete or comment this out.
library(readr)
### Exercise 1
# Load the CSV into a data frame.
sp500_data <- read.csv("SPX-1Month.csv")
# Basic plot (spx_plot1), assembled layer by layer: the base mapping first,
# then a point layer, then a line layer.
spx_plot1 <- ggplot(sp500_data, aes(x = Date, y = Close.Last, group = 1))
spx_plot1 <- spx_plot1 + geom_point()
spx_plot1 <- spx_plot1 + geom_line()
print(spx_plot1)
### TAHA H. ABABOU ###
### Homework 2                                         ###
### GGPlot Basics ###
#Put your code in this file. Make sure you assign the relevant values to the correct variable names, which are given below.
#Uncomment the variables as you assign your final values/functions/results to them.
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(stringr)# This loads the packages necessary to run your plots. Do not delete or comment this out.
library(readr)
### Exercise 1
# Load the CSV into a data frame.
sp500_data <- read.csv(file = "SPX-1Month.csv")
# Basic plot (spx_plot1): `Close.Last` against `Date` as a single line.
spx_plot1 <- ggplot(
  data = sp500_data,
  mapping = aes(x = Date, y = Close.Last, group = 1)
) +
  geom_line()
print(spx_plot1)
# Look up the ggplot help page.
?ggplot()
#| label: load libraries
#| warning: false
#| message: false
library(knitr)
library(kableExtra)
install.packages("kableExtra")
#| label: load libraries
#| warning: false
#| message: false
library(knitr)
library(kableExtra)
library(tidyverse)
library(stringr)
#| label: read data - glimpse
strawberry <- read_csv("strawberries25_v3.csv", col_names = TRUE)
glimpse(strawberry)
## is every line associated with a state?
state_all <- strawberry |> distinct(State)
state_all1 <- strawberry |> group_by(State) |> count()
## The per-state counts always add up to the number of rows, because a
## missing State forms its own group. The second check is the one that
## actually confirms every row has a state.
sum(state_all1$n) == dim(strawberry)[1]
!anyNA(strawberry$State)
## to get an idea of the data -- looking at california only
calif_census <- strawberry |> filter((State == "CALIFORNIA") & (Program == "CENSUS"))
calif_census <- calif_census |> select(Year, `Data Item`, Value)
###
calif_survey <- strawberry |> filter((State == "CALIFORNIA") & (Program == "SURVEY"))
## Select from the filtered California survey rows; selecting from
## `strawberry` here would discard the filter applied on the previous line.
calif_survey <- calif_survey |> select(Year, Period, `Data Item`, Value)
#|label: drop 1-item columns
# Drop every column of `df` that holds exactly one distinct value.
#
# df: a data frame or tibble.
#
# Returns `df` without its single-valued columns (invisibly), after printing
# the names of the dropped columns. When no column qualifies, returns the
# string "none" instead of a table, so callers should check the result
# before assigning it back over their data.
drop_one_value_col <- function(df) {
  # Count distinct values per column; NA counts as a value of its own.
  # Iterating over the columns themselves also handles a zero-column table,
  # where an index loop over 1:ncol(df) would run with invalid indices.
  n_values <- vapply(df, function(col) length(unique(col)), integer(1))
  drop <- unname(which(n_values == 1L))
  if (length(drop) == 0L) {
    return("none")
  }
  print("Columns dropped:")
  print(colnames(df)[drop])
  # drop = FALSE keeps a table even when a single column remains.
  invisible(df[, -drop, drop = FALSE])
}
## use the function
## drop_one_value_col() returns the string "none" when there is nothing to
## drop, so only take its result when it is a table; assigning it
## unconditionally would replace the data with that string.
cleaned <- drop_one_value_col(strawberry)
if (is.data.frame(cleaned)) {
  strawberry <- cleaned
}
## second pass: confirms no single-valued columns remain
drop_one_value_col(strawberry)
#|label: split Data Item
## Break `Data Item` apart at each comma into four named columns.
strawberry <- separate_wider_delim(
  strawberry,
  cols = `Data Item`,
  delim = ",",
  names = c("Fruit", "Category", "Item", "Metric"),
  too_many = "error",
  too_few = "align_start"
)
## Use too_many and too_few to set up the separation operation.
#|label: fix the leading space
# Look at one value before trimming.
strawberry$Category[1]
# strawberry$Item[2]
# strawberry$Metric[6]
# strawberry$Domain[1]
##
## Strip whitespace from both ends of the three text columns in one step.
strawberry <- strawberry |>
  mutate(
    across(
      c(Category, Item, Metric),
      function(col) str_trim(col, side = "both")
    )
  )
unique(strawberry$Fruit)
## generate a list of rows with the production and price information
is_prod_price <- strawberry$Fruit %in%
  c("STRAWBERRIES - PRODUCTION", "STRAWBERRIES - PRICE RECEIVED")
spr <- which(is_prod_price)
strw_prod_price <- strawberry |> slice(spr)
## this has the census data, too
## Select the complement by position rather than with slice(-1 * spr):
## when `spr` is empty, a negated empty index selects no rows at all
## instead of keeping every row.
strw_chem <- strawberry |> slice(which(!is_prod_price))  ## too soon
#|label: split srawberry into census and survey pieces
## One table per Program value.
strw_b_sales <- filter(strawberry, Program == "CENSUS")
strw_b_chem <- filter(strawberry, Program == "SURVEY")
## Do the two pieces account for every row?
nrow(strawberry) == (nrow(strw_b_chem) + nrow(strw_b_sales))
## Move marketing-related rows in strw_b_chem
## to strw_b_sales
#|label: plot 1
## Rows for 2021 in the "ORGANIC - OPERATIONS WITH SALES" category.
plot1_data <- strawberry |>
  select(c(Year, State, Category, Value)) |>
  filter((Year == 2021) & (Category == "ORGANIC - OPERATIONS WITH SALES"))
## Remove thousands separators before converting, as plot 2 does; a value
## such as "1,234" would otherwise become NA.
plot1_data$Value <- as.numeric(gsub(",", "", plot1_data$Value))
plot1_data <- plot1_data |> arrange(desc(Value))
## Bars ordered from the largest value to the smallest.
ggplot(plot1_data, aes(x = reorder(State, -Value), y = Value)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "States", y = "Count",
       title = "Number of Organic Strawberry operations with Sales in 2021")
## plot 2
## Rows for 2021 organic sales measured in dollars, excluding values
## recorded as "(D)".
plot2_data <- strawberry |>
  select(c(Year, State, Category, Item, Value)) |>
  filter((Year == 2021) &
           (Category == "ORGANIC - SALES") &
           (Item == "MEASURED IN $") &
           (Value != "(D)"))
## Remove thousands separators, then convert to numeric.
plot2_data$Value <- as.numeric(gsub(",", "", plot2_data$Value))
## Sort this plot's own data; sorting `plot1_data` here would replace the
## sales figures with the operation counts from plot 1.
plot2_data <- plot2_data |> arrange(desc(Value))
ggplot(plot2_data, aes(x = reorder(State, -Value), y = Value)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "States", y = "Sales",
       title = "Organic Strawberry Sales ($) in 2021")
## Distinct values of Category and Domain before splitting.
cc <- distinct(strawberry, Category)
cca <- distinct(strawberry, Domain)
## Split domain into two columns "type" and "subtype"
strawberry <- separate_wider_delim(
  strawberry,
  cols = Domain,
  delim = ",",
  names = c("type", "subtype"),
  too_many = "error",
  too_few = "align_start"
)
## check the result
ctype <- distinct(strawberry, type)
csubtype <- distinct(strawberry, subtype)
##
##
## Distinct values of Year, Category and Domain.
## NOTE(review): `Domain` is the column split into "type" and "subtype"
## earlier in this log; if that split succeeded, `Domain` presumably no
## longer exists and the last call fails -- confirm before reusing.
yr <- strawberry |> distinct(Year)
cc <- strawberry |> distinct(Category)
cca <- strawberry |>
distinct(Domain)
## columns need descriptive names
## Distinct values of `Domain Category` before it is split.
doc_cat <- distinct(strawberry, `Domain Category`)
## Break `Domain Category` apart at each comma into four columns.
## NOTE(review): "datail3" looks like a typo for "detail3"; kept as is in
## case other code refers to the column by this name.
strawberry <- separate_wider_delim(
  strawberry,
  cols = `Domain Category`,
  delim = ",",
  names = c("type1", "detail1", "detail2", "datail3"),
  too_many = "error",
  too_few = "align_start"
)
## columns need descriptive names
## Break `type1` apart at the colon into two columns.
strawberry <- separate_wider_delim(
  strawberry,
  cols = type1,
  delim = ":",
  names = c("type1a", "type1b"),
  too_many = "error",
  too_few = "align_start"
)
# dat1 <- strawberry |> filter(type=="CHEMICAL")
#
# dat2 <- strawberry |> filter(strawberry$type!=strawberry$type1a)
#
#
# data_f21 <- strawberry |>
#   filter((subtype == " FUNGICIDE") & (State == "CALIFORNIA") & (Year == "2021")  )
#
# data_f20 <- strawberry |>
#   filter((subtype == " FUNGICIDE") & (State == "CALIFORNIA") & (Year == "2020")  )
#
# data_f19 <- strawberry |>
#   filter((subtype == " FUNGICIDE") & (State == "CALIFORNIA") & (Year == "2019")  )
#
# data_f18 <- strawberry |>
#   filter((subtype == " FUNGICIDE") & (State == "CALIFORNIA") & (Year == "2018")  )
#
# data_f17 <- strawberry |>
#   filter((subtype == " FUNGICIDE") & (State == "CALIFORNIA") & (Year == "2017")  )
#
## columns need descriptive names
## Break `detail1` apart at the colon into two columns.
strawberry <- separate_wider_delim(
  strawberry,
  cols = detail1,
  delim = ":",
  names = c("detail1a", "detail1b"),
  too_many = "error",
  too_few = "align_start"
)
## Trim surrounding whitespace, then drop the first and last character of
## each `detail1b` value.
strawberry$detail1b <- str_sub(
  str_trim(strawberry$detail1b, side = "both"),
  start = 2,
  end = -2
)
## Non-missing `detail1b` values.
aa <- na.omit(strawberry$detail1b)
# Character vector of chemical names, one entry per line.
group1 <- c(
  "captafol",
  "ethylene dibromide",
  "glyphosate",
  "malathion",
  "diazinon",
  "dichlorophenyltrichloroethane",
  "DDT"
)
# Switch the working directory to the HW4 buoy/rainfall project folder; the
# path is specific to this machine's home directory layout.
setwd("~/Desktop/BU/Academics/Masters/F24/GRSMA615/HW/HW4/buoy-rainfall-hw4")