install.packages('dplyr'); install.packages('ggplot2'); install.packages('tidyverse'); install.packages('xts'); install.packages('zoo'); library('dplyr'); library('ggplot2'); library('tidyverse'); install.packages("PerformanceAnalytics"); library(PerformanceAnalytics);
wine <- read.csv("~/Downloads/BurgundySip.csv",na.strings = c("","NA","N.V."));
head(wine); dim(wine); # 7500 rows and 14 cols when loaded. nrow(wine); str(wine); summary(wine);
wine$RSG <- as.numeric(gsub(" ","",wine$RSG)); wine$AL <- as.numeric(gsub(" ","",wine$AL)); wine$DN <- as.numeric(gsub(" ","",wine$DN));
length(unique(wine$TP)); # 22 length(unique(wine$NAME)); # 481 length(unique(wine$WINE)); # 848 length(unique(wine$REG)); # 77
replace <- function(dataFrame, var) { wineCount <- dplyr::count(dataFrame, var = dataFrame[,var],sort = TRUE); top10 <- head(wineCount,10); countLess <- wineCount[!(wineCount[,1] %in% top10[,1]),1] dataFrame[dataFrame[,var] %in% countLess, var] <- "other"; return(dataFrame); }
wine <- replace(wine, "REG"); wine <- replace(wine, "NAME"); wine <- replace(wine, "WINE"); wine <- replace(wine, "TP");
wine <- transform(wine, NAME = factor(NAME), WINE = factor(WINE),
TP = factor(TP),
REG = factor(REG)
);
str(wine);
anyDuplicated(wine$SN); # returns the index of first duplicate entry
sum(duplicated(wine[,"SN"])); #It has 3834 duplicate values
nrow(wine[duplicated(wine[,"SN"]),]) ;
(sum(duplicated(wine[1]))/nrow(wine)) * 100; # 51.12 % of data is will be taken out
dup2 <-duplicated(wine[,"SN"]) | duplicated(wine[,"SN"],fromLast=TRUE); wineDup <- wine[dup2,]; wineDup;
wine <- wine[!duplicated(wine[,"SN"],fromLast=TRUE),]; # bottom to top nrow(wine) # after cleaning 3666
anyDuplicated(wine[,"SN"]);
rownames(wine) <- NULL;
####### From here, the data set has no duplicates in SN.
wine %>% select_if(is.numeric) %>% chart.Correlation();
summary(wine);
summary(wine$NUMR);
hist(wine$NUMR, xlab = "NUMR", main = "Number of testers", breaks = sqrt(nrow(wine)));
hist(wine[wine$NUMR > 3000,"NUMR"], xlab = "NUMR", main = "Number of testers", breaks = sqrt(nrow(wine)));
boxplot(wine$NUMR, main = "Number of testers"); boxplot.stats(wine$NUMR);
length(boxplot.stats(wine$NUMR)$out)/length(na.omit(wine$NUMR))*100; min(boxplot.stats(wine$NUMR)$out); max(boxplot.stats(wine$NUMR)$out);
outlier <- boxplot.stats(wine$NUMR)$out; outlier; outind <- which(wine$NUMR %in% outlier); outind;
wine$NUMR[outind] <- summary(wine$NUMR)[5] + 1.5*(summary(wine$NUMR)[5]-summary(wine$NUMR)[2]); outind; wine$NUMR[outind];
boxplot(wine$NUMR);
summary(wine$PR);
hist(wine$PR, xlab = "PR", main = "Price of wine, €", breaks = sqrt(nrow(wine)));
hist(wine[wine$PR > 100,"PR"], xlab = "PR", main = "Price of wine, €", breaks = sqrt(nrow(wine)));
boxplot(wine$PR, main = "Price of wine, €"); boxplot.stats(wine$PR);
length(boxplot.stats(wine$PR)$out)/length(na.omit(wine$PR))*100; min(boxplot.stats(wine$PR)$out); max(boxplot.stats(wine$PR)$out);
plot(wine$RT, wine$PR, , main = "Price vs average rating");
plot(wine$RSG, wine$PR, main = "Price vs residual sugar level");
plot(wine$TP, wine$PR, main = "Price by wine variety"); plot(wine$YR, wine$PR, main = "Price by year"); plot(wine$REG, wine$PR, main = "Price by region");
summary(wine$RT);
hist(wine$RT, xlab = "RT", main = "Average rating given by the test users", breaks = sqrt(nrow(wine)));
boxplot(wine$RT, main = "Average rating given by the test users"); boxplot.stats(wine$RT);
length(boxplot.stats(wine$RT)$out)/length(na.omit(wine$RT))*100; min(boxplot.stats(wine$RT)$out); max(boxplot.stats(wine$RT)$out);
plot(wine$RSG, wine$RT, main = "Average rating vs residual sugar level");
plot(wine$AL, wine$RT, main = "Average rating vs alcohol percentage");
summary(wine$BD);
hist(wine$BD, xlab = "BD", main = "Body score of wine", breaks = sqrt(nrow(wine)));
boxplot(wine$BD, main = "Body score of wine"); boxplot.stats(wine$BD);
length(boxplot.stats(wine$BD)$out)/length(na.omit(wine$BD))*100; min(boxplot.stats(wine$BD)$out); max(boxplot.stats(wine$BD)$out);
summary(wine$ACD);
hist(wine$ACD, xlab = "ACD", main = "Acidity score", breaks = sqrt(nrow(wine)));
boxplot(wine$ACD, main = "Acidity score"); boxplot.stats(wine$ACD);
length(boxplot.stats(wine$ACD)$out)/length(na.omit(wine$ACD))*100; min(boxplot.stats(wine$ACD)$out); max(boxplot.stats(wine$ACD)$out);
summary(wine$RSG);
hist(wine$RSG, xlab = "RSG", main = "Residual sugar level of wine");
boxplot(wine$RSG, main = "Residual sugar level of wine"); boxplot.stats(wine$RSG);
length(boxplot.stats(wine$RSG)$out)/length(na.omit(wine$RSG))*100; min(boxplot.stats(wine$RSG)$out); max(boxplot.stats(wine$RSG)$out);
summary(wine$AL);
hist(wine$AL, xlab = "AL", main = "Alcohol percentage of wine");
boxplot(wine$AL, main = "Alcohol percentage of wine"); boxplot.stats(wine$AL);
length(boxplot.stats(wine$AL)$out)/length(na.omit(wine$AL))*100; min(boxplot.stats(wine$AL)$out); max(boxplot.stats(wine$AL)$out);
plot(wine$RSG, wine$AL);
4-8 VARIABLE DN - The typical density or specific gravity of the wine is generally between 1.080 and 1.090 / DEP
the original gravity, the gravity just before adding the yeast, of a typical wine will be 1.075 to 1.090.
summary(wine$DN);
hist(wine$DN, xlab = "DN", main = "Typical density or specific gravity of the wine");
boxplot(wine$DN); boxplot.stats(wine$DN);
length(boxplot.stats(wine$DN)$out)/length(na.omit(wine$DN))*100; min(boxplot.stats(wine$DN)$out); max(boxplot.stats(wine$DN)$out);
plot(wine$AL, wine$DN);
####### OUTLIERS HANDLED! From here, we build a df to show the qty of NAs per variable
#STEP 5: Handling missing values
count_na<-function(data){ require(tidyverse); require(cowplot);
df.na.count <- map(data,~sum(is.na(.))) %>% simplify() %>% tibble(col = names(.), NAs = .) %>% mutate(percentage = round(NAs/nrow(data)*100));
print(df.na.count %>% as.data.frame());
p1<-ggplot(df.na.count,aes(x = col , y = NAs))+ geom_col()+ theme(axis.text.x = element_text(angle = 90))
p2<- ggplot(df.na.count,aes(x = col, y = percentage))+ geom_col()+ theme(axis.text.x = element_text(angle = 90,))+ scale_y_continuous(limits = c(0,100))
#plot_grid(p1,p2,nrow = 2) } count_na(wine);
#check the cor table of all numeric variables wineCorrelation <- cbind(wine$YR, wine$RT, wine$NUMR, wine$PR, wine$BD, wine$ACD, wine$RSG, wine$AL, wine$DN); colnames(wineCorrelation) <- c('year', 'rating', 'number', 'price', 'body', 'acidity', 'rsg', 'alcohol', 'dn'); head(wineCorrelation); cor(wineCorrelation, use = "complete.obs");
#identify the row with NA in SN and drop it. #The observation with NA in NAME is the same with NA in SN, to both cases are solved here wine[is.na(wine$SN),]; wine <- wine[-which(is.na(wine$SN)),]; wine[is.na(wine$SN),];
#STEP 5.2 #check NAs in year wine[is.na(wine$YR),];
#function to fill the NA in years based on the wine name fillYearNA <- function(df){ ##use the random only for the years that already exists in df
SNfromYearNA <- wine[is.na(wine$YR),]$SN #get all SNs from the NAs in year allWinesNameYr <- aggregate(YR ~ NAME, wine, FUN=max) #sumamrize the maximum years by NAME #merge the summarizations between max/min of year by NAME
#as there is no correlation at all between year and any other numeric variable #so we will fill the year NAs by assigning a random number between the min and the max year for the NAME
for(i in 1:length(SNfromYearNA)){ curWineName <- allWinesNameYr[allWinesNameYr$NAME==wine[wine$SN==SNfromYearNA[i],]$NAME,] #print(wine[wine$SN==SNfromYearNA[i],]) if(nrow(curWineName)==0){ #if there is no year register for NAME, we will get a random value between the min and max from all wines wine[wine$SN==SNfromYearNA[i], ]$YR <<- round(runif(1, min(na.omit(wine$YR)), max(na.omit(wine$YR))))
} else{
#if there is at least one other observation of the same NAME with any year, we will generate a random
#number between the min and the max year from this NAME
allYearsFromName <- unique(wine[wine$NAME=="Contino",]$YR)
wine[wine$SN==SNfromYearNA[i], ]$YR <<- sample(allYearsFromName,1) ### round(runif(1, curWineName$MINYEAR, curWineName$MAXYEAR))
}
} }
fillYearNA(wine) sum(is.na(wine$YR)) #no NAs #check NAs again: count_na(wine)
#function to find the mode getmode <- function(variable) { uniqv <- unique(variable) #get uniques return (uniqv[which.max(tabulate(match(variable, uniqv)))]) #get modes }
#STEP 5.3 Handle NAs in TP (153 NAs) fillTypeNA <- function(df){ SNfromTypeNA <- wine[is.na(wine$TP),]$SN #get all SNs from the NAs in year allWinesRegTp <- aggregate(TP ~ REG, wine, FUN=getmode) #get mode from type aggregated by REG #when aggregated by region, only 24 regions do not have a correspondent type mode. If we aggregate by NAME, it would be 101.
for(i in 1:length(SNfromTypeNA)){ curWineReg <- allWinesRegTp[allWinesRegTp$REG==wine[wine$SN==SNfromTypeNA[i],]$REG,]
#print(wine[wine$SN==SNfromTypeNA[i],])
if(nrow(curWineReg)==0){
#if there is no year register for REG, then we will get the mode from all wines
wine[wine$SN==SNfromTypeNA[i], ]$TP <<- getmode(wine$TP)
} else{
#if there is at least one other observation of the same REG with any TP, we will take the mode from region
wine[wine$SN==SNfromTypeNA[i], ]$TP <<- curWineReg$TP
}
}
}
fillTypeNA(wine); sum(is.na(wine$TP));
sum(is.na(wine$DN));
hist(wine$DN);
as we can see , this variable values are in normal distribution, so mean is good representation of the missing values :
wine$DN[is.na(wine$DN)]<- mean(wine$DN,na.rm = T);
sum(is.na(wine$DN));
#STEP 5.5
sum(is.na(wine$RSG)); hist(wine$RSG); sd(wine$RSG,na.rm = T);
RSG has normal distribution,with small standard deviation , mean is good representation for the missing values :
wine$RSG[is.na(wine$RSG)]<- mean(wine$RSG,na.rm = T);
sum(is.na(wine$RSG));
sum(is.na(wine$AL)); wine_no_na<- na.omit(wine); cor(wine_no_na$AL,wine_no_na$RSG);
al_train<-wine[!is.na(wine$AL),];
test_set2<-wine[is.na(wine$AL),];
al_model <- lm(AL~RSG,al_train); summary(al_model); al_model$fitted.values;
al_predict<- predict(al_model,test_set2); al_predict;
wine$AL[is.na(wine$AL)]<-al_predict;
sum(is.na(wine$AL));
sum(is.na(wine$ACD));
sum(is.na(wine$ACD)); hist(wine$ACD);
count_values<-wine %>% na.omit() %>% group_by(ACD) %>% summarise(count = n()) %>% mutate(percent = count/sum(count));
we will treat missing values using sampling from original dataset by applying probabilities as shown above :
na_fill_values<- sample(sample(c(3,2,1),size = 438,replace = T,prob = c(0.968,0.0233,0.00853)));
wine$ACD[is.na(wine$ACD)]<- na_fill_values;
sum(is.na(wine$ACD));
sum(is.na(wine$BD));
cor(wine4$BD,wine4$ACD);
hist(wine$BD);
count_values_bd<-wine %>% na.omit() %>% group_by(BD) %>% summarise(count = n()) %>% mutate(percent = count/sum(count));
we will treat missing values using sampling from original dataset by applying probabilities as shown above :
na_fill_values_bd<- sample(sample(c(4,5,3,2),size = 438,replace = T,prob = c(0.578,0.286,0.124,0.0108)));
wine$BD[is.na(wine$BD)]<- na_fill_values_bd;
sum(is.na(wine$BD));
sum(is.na(wine$PR));
train_set1<- wine[!is.na(wine$PR),]; #create test set : test_set1<- wine[is.na(wine$PR),];
pr_model <- lm(PR~log(YR), data =train_set1);
summary(pr_model); pr_model$fitted;
pr_predict<- predict(pr_model,test_set1); pr_predict;
wine$PR[is.na(wine$PR)]<- pr_predict;
sum(is.na(wine$PR));
count_na(wine);
wine$YRFactor <- cut(wine$YR, breaks = c(1900,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010,2020,2030), right = T, labels = c("1900s","1910s","1920s","1930s","1940s","1950s","1960s","1970s", "1980s", "1990s", "2000s", "2010s", "2020s"));
avg_rating<-wine %>% #grouping by winery group_by(NAME) %>% #calculating the mean for each group summarise(avg_rating = mean(RT)) %>% ungroup() %>%
arrange(desc(avg_rating)) %>%
mutate(NAME = fct_inorder(NAME)) %>% #picking the top 5 head(5) ; p1<-avg_rating%>% #plot ggplot()+ geom_bar(aes(x = NAME, y = avg_rating, fill = NAME), stat = "identity")+ scale_fill_viridis_d(option = "magma", direction = -1)+ xlab("Name of the winery")+ ylab("Average rating")+
geom_label(aes(x = NAME, y = avg_rating, label = round(avg_rating,2), hjust = .5), size = 2.5)+ scale_x_discrete(label = NULL)+ ggtitle("Top 5 average rating Wineries")+ theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45), panel.grid = element_line(linewidth = 0)); p1;
expensive_wine<-wine %>% #group by NAME and WINE group_by(NAME,WINE) %>% #Find the max price summarise(max_price = max(PR)) %>% ungroup() %>% #order by max price desc arrange(desc(max_price)) %>% #factorizing WINE to sort it in plot mutate(WINE = fct_inorder(NAME)) %>% #pick the top 5 head(5);
p2<-expensive_wine%>% ggplot()+ geom_bar(aes(x = NAME, y = max_price, fill = NAME), stat = "identity")+ scale_fill_viridis_d(option = "magma", direction = -1)+ xlab("Wine")+ ylab("Price")+ #add labels to the plot : geom_label(aes(x = NAME, y = max_price, label = round(max_price,0), hjust = .5), size = 2.5)+ scale_x_discrete(label = NULL)+ ggtitle("Top 5 most expensive wine")+ theme(plot.title = element_text(hjust = 0.5), panel.grid = element_line(linewidth = 0)); p2;
exp_wine_var<-wine %>% #grouping by WINE and TP : group_by(WINE,TP) %>%
summarise(max_price = max(PR)) %>% ungroup() %>% #sorting DESC by max price : arrange(desc(max_price)) %>%
mutate(TP = fct_inorder(TP)) %>% #picking the top 10 head(10);
p3<-exp_wine_var%>% ggplot()+ geom_bar(aes(x = TP, y = max_price, fill = TP), stat = "identity")+ scale_fill_viridis_d(option = "magma", direction = -1)+ xlab("Wine Variety")+ ylab("Price")+ scale_x_discrete(label = NULL)+ ggtitle("Top 6 most expensive wine",subtitle = "By wine Variety")+ theme(plot.title = element_text(hjust = 0.5), panel.grid = element_line(linewidth = 0), plot.subtitle = element_text(hjust = 0.5)); p3;
analyse_cor<-function(data){ require(tidyverse); require(PerformanceAnalytics);
wine_numeric<-lapply(X = data,FUN = is.numeric) %>% simplify();
wine_numeric<-data[,wine_numeric];
wine_numeric %>% chart.Correlation(); print(wine_numeric %>% na.omit() %>% cor()); wine_numeric %>% na.omit %>% cor() %>% heatmap(); }
analyse_cor(wine);
ggplot(data = wine,aes(x = RSG, y = RT,colour = TP,size = PR)) + geom_point() + facet_grid(.~YRFactor)+ ggtitle("Affect of Sugar and Rating on Price");
ggplot(data = wine, aes( x = RSG, y = AL, size = PR,colour = TP)) + geom_point(alpha = 0.5) + ggtitle("Correlation of Sugar and Alcohol")
meanRSG <- mean(wine$RSG,na.rm = T); sdRSG <- sd(wine$RSG,na.rm = T); n <- nrow(wine); meanRSG; sdRSG;
RSGZscore <- round(abs(wine$RSG - meanRSG)/sdRSG,2);
RSGZscore95Per <- quantile(RSGZscore,0.95,na.rm = T); RSGZscore95Per;
c <- RSGZscore95Per * sdRSG/sqrt(n); c; a_confidence <- meanRSG-c; b_confidence <- meanRSG+c; print(paste("Residual sugar level with 95% confidence interval [",round(a_confidence,2),",",round(b_confidence,2), "]"));
hist(wine$RSG, xlab = "RSG", main = "Residual sugar level of wine");
abline(v=meanRSG, col="red"); abline(v=a_confidence, col="blue"); abline(v=b_confidence, col="blue");
wine %>% ggplot()+ geom_line(aes(x = RSG, y = AL))
df_train <- na.omit(wine)
sl_model<- lm(AL~RSG,data = df_train)
summary(sl_model)
ggplot(na.omit(wine))+ geom_line(aes(x = RSG, y = AL,color = "purple"))+ geom_line(aes(x = RSG, y = sl_model$fitted.values,color = "forestgreen"))
q_model<-lm(AL~RSG+RSG^2,data =df_train)
summary(q_model)
ggplot(na.omit(wine))+ geom_line(aes(x = RSG, y = AL,color = "purple"))+ geom_line(aes(x = RSG, y = q_model$fitted.values,color = "forestgreen"))
newdata<- data.frame(RSG = runif(n = 100,min = 1,max = 3),AL = rep(NA,100))
pred_sl<- predict(sl_model,newdata = newdata)
newdata[,2]<- pred_sl; newdata;
wine2<-df_train[,c(12,13)] forecast<- rbind(wine2,newdata) forecast<- forecast %>% arrange(RSG) %>% mutate(id = row_number())
#plot our predictions
ggplot()+ geom_line(data = wine,aes(x = RSG, y = AL,color = "forestgreen"))+ geom_line(data = df_train,aes(x = RSG, y = sl_model$fitted.values))+ geom_line(data = newdata,aes(x = RSG, y = AL, color = "red"))+
geom_vline(xintercept = 3 ,color = "blue");
ggplot(wine)+ geom_line(aes(x = YR, y = PR));
df2_train<-na.omit(wine);
ml_model <- lm(PR~log(YR)+ACD+ACD^2+AL+AL^2+log(BD), data =df2_train); summary(ml_model); ml_model$fitted;
as we can see from the comparison between the two models ,second one has better Residual standard error, so we will run our prediction using the Second one :
newdata_pr<- data.frame(PR = rep(NA,100), YR = seq(1811,1910,1), ACD = runif(100,1,5), AL = sample(wine$AL,size = 100,replace = F), BD = runif(100,1,5));
pred_ml<- predict(ml_model,newdata_pr); pred_ml; newdata_pr[,1]<-pred_ml; newdata_pr;
wine4<-df2_train[,c(9,4,11,13,10)]; forecast2<- rbind(wine4,newdata_pr);
#plot final result
ggplot()+ geom_line(data = wine,aes(x = YR, y = PR),color = "forestgreen")+ geom_line(data = df2_train,aes(x = YR, y = ml_model$fitted.values))+ geom_line(data = newdata_pr,aes(x = YR, y = PR), color = "red")+
geom_vline(xintercept = 1910 ,color = "blue");
#-------------------------------------------------------------------------------------------- #STEP 9 CLUSTERING
fit <- kmeans(wine$PR, 3); # 3 cluster solution fit;
fit$cluster; fit$iter;
#get the clusters cluster1 <- wine[fit$cluster==1,]; cluster1; cluster2 <- wine[fit$cluster==2,]; cluster2; cluster3 <- wine[fit$cluster==3,]; cluster3;
wine$PRClass <- fit$cluster; wine$PRClass[wine$PRClass==1] <- "LOW"; wine$PRClass[wine$PRClass==2] <- "MEDIUM"; wine$PRClass[wine$PRClass==3] <- "HIGH";
#Plot the clusters plot(wine$PR, pch=21, bg=fit$cluster * 2 + 3 );
plot(wine[12:13], pch=21);
fit <- kmeans(wine[12:13], 4) # 4 cluster solution fit;
wine$RsgAlClass <- fit$cluster; wine$RsgAlClass[wine$RsgAlClass==1] <- "Cluster1"; wine$RsgAlClass[wine$RsgAlClass==2] <- "Cluster2"; wine$RsgAlClass[wine$RsgAlClass==3] <- "Cluster3"; wine$RsgAlClass[wine$RsgAlClass==4] <- "Cluster4"; wine$RsgAlClass <- factor(wine$RsgAlClass);
plot(wine[12:13], pch=21, bg=fit$cluster * 2 + 3 ); wine;