diff --git a/AllModels.Rmd b/AllModels.Rmd new file mode 100644 index 0000000..b9160b2 --- /dev/null +++ b/AllModels.Rmd @@ -0,0 +1,759 @@ +--- +title: "DATA 311, All models" +author: "Barret Jackson, Emily Medema, Kat Lecha, Lauren St. Clair" +date: "March 30th, 2019" +output: pdf_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r load-data, include=FALSE} +# Chunk labels must be unique (knitr stops on a duplicate); this chunk only loads the data set. +pokemon<-read.csv("pokemon_alopez247.csv") +``` + +#Number of Pokemon per Type +```{r} +library(ggplot2) +# Refer to columns by bare name inside aes(); geom_bar() counts the rows of each Type_1 level. +type<-ggplot(pokemon, aes(Type_1, fill = Type_1)) + geom_bar(color = "black") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +type +``` + +Pokemon dataset split into training and testing sets +```{r} +set.seed(1995) +train<-sample(1:nrow(pokemon),432) +poke.test<-pokemon[-train,] +poke.train<-pokemon[train,] +``` + +#Linear Model +Linear model, Total as response and HP, Attack, and Defense are predictors: +```{r} +library(DAAG) +# Fit with data= and bare column names so predict() can evaluate the same terms on newdata. +linmod <- lm(Total~HP+Attack+Defense, data=poke.train) +summary(linmod) +#plot(linmod) +plot(poke.train$HP+poke.train$Attack+poke.train$Defense, poke.train$Total) +abline(linmod, h = 0.5, col = "red") +#mmmm tasty sig values +predicted<-predict(linmod, newdata=poke.test) +# Training MSE first, then MSE on the held-out test rows. +mean(linmod$residuals^2) +mean((poke.test$Total-predicted)^2) +``` + +#Clustering +Single, Average, and Complete linkage respectively modeled below +```{r} +eucdist<-dist(pokemon, method="euclidean") +clusPokemon<-hclust(eucdist, method = "single") +plot(clusPokemon) +clusPokemonAvg<-hclust(eucdist, method = "average") +plot(clusPokemonAvg) +clusComplete<-hclust(eucdist, method = "complete") +plot(clusComplete) +``` +We see that a complete linkage method appears to fit our dataset best. 
+ +#Regression Tree, Total as response and HP, Attack, Defense, Sp_Atk, Sp_Def, and Speed as predictors +```{r} +library(tree) +poke<-data.frame(pokemon) +attach(poke) +pocl<-tree(Total~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke) +plot(pocl) +text(pocl) +``` + +Let's try pruning back our tree from above, +```{r} +cv.pocl<-cv.tree(pocl, FUN=prune.tree) +plot(cv.pocl,type="b") +p.pocl<-prune.tree(pocl,best=10) +plot(p.pocl) +text(p.pocl) +summary(p.pocl) +``` +We can see that the lowest is MSE is given with 12 nodes, suggesting that pruning may be unnecessary. + +#Random Forests +```{r} +library(randomForest) +set.seed(1995) +pokebag<-randomForest(Total~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke,mtry=6,importance=FALSE) +pokebag +``` + +Random forest where m = 3 +```{r} +pokeRF<-randomForest(Total~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke,mtry=3,importance=TRUE) +pokeRF +``` + +Let's see if we can predict if a Pokemon is legendary using the Total predictor. In using the predictor Total, we are under the assumption that legendary Pokemon have high totals. From looking at the data set this appears to be true. It also appears from the data set that legendary Pokemon do not have a gender expecept for a couple outliers. Let's see if this is the case, +```{r} +#https://www.kaggle.com/excaliburzero/predicting-legendary-pokemon +maxTotal<-order(pokemon$Total, decreasing = TRUE) +head(pokemon[maxTotal,]) +``` +It does appear that the Pokemon with the highest total are in fact of the legendary type. + +```{r} +library(ggplot2) +plot<-ggplot(pokemon, aes(x =Total, fill = isLegendary)) + geom_histogram() +plot +``` +From this graph we can see that the higher the total the more likely a pokemon is to be legendary. In fact, it appears that a pokemon is only legendary when it is above 650 in total and most likely legendary from around 550-625. 
+ +Let's now check the correlation between gender and legendary status, +```{r} +pokemon$hasGender<-factor(pokemon$hasGender) +plot2<-ggplot(pokemon, aes(x =hasGender, fill = isLegendary)) + geom_bar() +plot2 +``` +As our first assumption suggested, the plot too suggests that majority of legendary pokemon (isLegendary = TRUE), do not have a gender (hasGender = FALSE). + +Let's see if there are any linear relationships within our Pokemon dataset. Name and Number will be excluded from examination as these will likely have no effect on the data. + +#K-Means +```{r} +library(mclust) +library(cluster) +library(dplyr) +library(fpc) +pokeNum<-select_if(pokemon, is.numeric) +distPoke<-daisy(pokemon) +#distPoke<-daisy(pokeNum) +summary(distPoke) +pokeDist<-cmdscale(distPoke) +plot(pokeDist, type = "n") +text(pokeDist, rownames(pokeDist)) +set.seed(413) +clustore<-matrix(0, nrow = 721, ncol=25) +wsstore<-NULL +for(i in 1:10){ + km<-kmeans(pokeDist, i, nstart=10) + clustore[,i]<-km$cluster + wsstore[i]<-km$tot.withinss +} +plot(wsstore) +kPoke2<-kmeans(pokeDist, 7, nstart=25) +plot(pokeDist, col = kPoke2$cluster) +points(kPoke2$centers, col = 1:4, pch=8, cex=2) +out <- cbind(pokemon, clusterNum = kPoke2$cluster) +clusterGroups<-order(out$clusterNum, decreasing = TRUE) +head(out[clusterGroups,]) +``` + + +#KNN Classification +```{r} +library(class) +# pokeDist holds one row per row of pokemon, while the labels come from poke.train (= pokemon[train,]); +# subset the coordinates with the same train indices so features and labels line up. +knnrun<-knn.cv(pokeDist[train,], cl = poke.train$isLegendary, k = 5, prob = TRUE) +table(poke.train$isLegendary, knnrun) +``` + +#Linear Discriminant Analysis +```{r} +library(MASS) +library(MLmetrics) +poke.train$hasGender<-factor(poke.train$hasGender) +poke.train$isLegendary<-factor(poke.train$isLegendary) +pokelda<-lda(poke.train$isLegendary~poke.train$hasGender+poke.train$Total) +table(poke.train$isLegendary, predict(pokelda)$class) +Sensitivity(poke.train$isLegendary, predict(pokelda)$class) +Recall(poke.train$isLegendary, predict(pokelda)$class) #same as sensitivity +Precision(poke.train$isLegendary, predict(pokelda)$class) 
+Specificity(poke.train$isLegendary, predict(pokelda)$class) +F1_Score(poke.train$isLegendary, predict(pokelda)$class) +``` + +#QDA +```{r} +pokeqda<-qda(poke.train$isLegendary~poke.train$hasGender+poke.train$Total) +table(poke.train$isLegendary, predict(pokeqda)$class) +# Every metric in this chunk scores the QDA fit (pokeqda), not the LDA fit from the previous chunk. +Sensitivity(poke.train$isLegendary, predict(pokeqda)$class) +Recall(poke.train$isLegendary, predict(pokeqda)$class) #same as sensitivity +Precision(poke.train$isLegendary, predict(pokeqda)$class) +Specificity(poke.train$isLegendary, predict(pokeqda)$class) +F1_Score(poke.train$isLegendary, predict(pokeqda)$class) +``` + +#Logistic Regression +```{r} +simlog<-glm(factor(poke.train$isLegendary)~poke.train$hasGender+poke.train$Total, family = "binomial") +table(predict(simlog, type = "response")>0.5, poke.train$isLegendary) +``` + +```{r} +#https://www.kaggle.com/excaliburzero/predicting-legendary-pokemon +poke<-data.frame(pokemon) +pokeLegend<-poke[which(isLegendary=='True'),] +plot(Generation~isLegendary) +TheLegends<-as.data.frame(table(pokeLegend$Generation)) +colnames(TheLegends)<-c("Generation", "Legends") +TheLegends +summary(TheLegends) +plot<-ggplot(TheLegends, aes(Generation, Legends))+geom_bar(stat="identity") +plot +TheMan<-as.data.frame(table(pokeLegend$Type_1)) +colnames(TheMan)<-c("Type 1", "Legends") +TheMan +summary(TheMan) +plot(TheMan) +maxTotalL<-order(TheMan$Legends, decreasing = TRUE) +head(TheMan[maxTotalL,]) +#Of Type 2 +TheMyth<-as.data.frame(table(pokeLegend$Type_2)) +colnames(TheMyth)<-c("Type 2", "Legends") +TheMyth +summary(TheMyth) +plot(TheMyth) +maxTotalL2<-order(TheMyth$Legends, decreasing = TRUE) +head(TheMyth[maxTotalL2,]) +``` + +```{r} +poke<-data.frame(pokemon) +poke1<-poke[which(hasGender=='True'),] +attach(poke) +head(poke1) +``` + +Let's see if there is a relationship between Score and Pr_Male, a predictor for the probability of gender according to male +```{r} +set.seed(983457) +pokeG<-tree(Pr_Male~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed + Total,data=poke1) 
+plot(pokeG) +text(pokeG, pretty=0) +cv.pokeG<-cv.tree(pokeG, FUN=prune.tree) +plot(cv.pokeG) +prunePokeG<-prune.tree(pokeG, best=12) +plot(prunePokeG) +text(prunePokeG, pretty=0) +``` + +#K-Means +```{r} +library(mclust) +library(cluster) +library(dplyr) +library(fpc) +pokeNum<-select_if(pokemon, is.numeric) +distPoke<-daisy(pokemon) +summary(distPoke) +pokeDist<-cmdscale(distPoke) +plot(pokeDist, type = "n") +text(pokeDist, rownames(pokeDist)) +set.seed(413) +clustore<-matrix(0, nrow = 721, ncol=25) +wsstore<-NULL +for(i in 1:10){ + km<-kmeans(pokeDist, i, nstart=10) + clustore[,i]<-km$cluster + wsstore[i]<-km$tot.withinss +} +plot(wsstore) +kPoke2<-kmeans(pokeDist, 7, nstart=25) +plot(pokeDist, col = kPoke2$cluster) +points(kPoke2$centers, col = 1:4, pch=8, cex=2) +out <- cbind(pokemon, clusterNum = kPoke2$cluster) +clusterGroups<-order(out$clusterNum, decreasing = TRUE) +out[clusterGroups,] +``` + +Ok, let's check out the mean for Total for each cluster + +```{r} +for(i in 1:7) { + print(paste("Mean for total for cluster ",i)) + print(mean(out[which(out$clusterNum==i),]$Total)) +} +``` + +Ok, how about the number of isLegendary in each cluster + +```{r} +for(i in 1:7) { + print(paste("Number of isLegendary for cluster ",i)) + legendTemp<-out[which(out$clusterNum==i),] + print(count(legendTemp,vars=isLegendary)) +} +``` + +Ok, that didnt' look great, but it looks like isLegendary==TRUE are mostly in clusters 2 and 4. + + +##Creating a subset with only pokemon that have a gender + +```{r} +pokemon<-read.csv("pokemon_alopez247.csv") +poke<-data.frame(pokemon) +``` + + +```{r} +attach(poke) +poke1<-poke[which(hasGender=='True'),] +head(poke1) +length(poke1[,1]) +``` + +##Trees on the new data set + +```{r} +attach(poke1) +library(tree) +pocl<-tree(Pr_Male~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke1) +plot(pocl) +text(pocl) +``` + +Ok, let's prune this tree down now... 
+ +```{r} +# Repeat the cross-validated pruning under 100 different seeds and record the tree size with the lowest deviance each time. +size<-numeric(100) +for(i in 1:100) { + set.seed(i) + cv.pocl<-cv.tree(pocl, FUN=prune.tree) + thing<-cv.pocl$size[which.min(cv.pocl$dev)] + size[i]<-thing +} +hist(size) +sort(table(size),decreasing=TRUE)[1:3] +``` + +```{r} +p.pocl<-prune.tree(pocl,best=3) +plot(p.pocl) +text(p.pocl) +summary(p.pocl) +``` + +Alright, let's use bagging now... + +```{r} +library(randomForest) +set.seed(1995) +pokebag<-randomForest(Pr_Male~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke1,mtry=6,importance=TRUE) +pokebag +varImpPlot(pokebag) +``` + +Well that didn't work out very well... + +How about random forest... + +```{r} +pokeRF<-randomForest(Pr_Male~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke1,mtry=3,importance=TRUE) +pokeRF +varImpPlot(pokeRF) +``` +Very slightly better, still not a lot of evidence that this model is any good. + + + +```{r} +pokemon<-read.csv("pokemon_alopez247.csv") +poke <- data.frame(pokemon, stringsAsFactors = TRUE) +poke[is.na(poke)] <- 0 +poke$isLegendary<-(as.integer(factor(poke$isLegendary))-1) +poke$hasGender<-(as.integer(factor(poke$hasGender))-1) +poke<-poke[,-c(1,2)] +set.seed(1995) +train<-sample(1:nrow(poke),432) +poke.train<-poke[train,] +poke.test<-poke[-train,] +``` + +##LDA +```{r} +#install.packages(MASS) +library(MASS) +pkmlda<- lda(poke$hasMegaEvolution~poke$hasGender+poke$Type_1+poke$Total+poke$Generation+poke$Pr_Male+poke$isLegendary, data=poke, CV=TRUE) +table(poke$hasMegaEvolution, pkmlda$class) +``` +```{r} +pkmlda<- lda(poke$isLegendary~poke$hasMegaEvolution+poke$Total+poke$hasGender+poke$Pr_Male, data=poke, CV=TRUE) +# The response of this model is isLegendary, so tabulate the CV predictions against isLegendary. +table(poke$isLegendary, pkmlda$class) +``` +```{r} +pkmlda<- lda(poke$isLegendary~poke$hasGender+poke$Pr_Male, data=poke, CV=TRUE) +table(poke$isLegendary, pkmlda$class) +``` + +```{r} +pkmlda<- lda(poke$isLegendary~poke$Type_1+poke$Type_2, data=poke, CV=TRUE) +table(poke$isLegendary, pkmlda$class) +``` + +##KNN +```{r} +poke<-data.frame(pokemon) +#remove unique identifiers 
+poke<-poke[,-c(1,2)] +``` +This block removes all NA values for Pr_Male. +```{r} +#the new dataset poke2 has all na values fr Pr_Male removed +poke2<-poke[which(hasGender=='True'),] +poke2 +``` +```{r} +for(j in 1:ncol(poke2)){ + if(!is.numeric(poke2[,j]) ){ + poke2[,j]<-(as.numeric(poke2[,j])) + } +} +poke2$isLegendary <- (poke2$isLegendary - 1) +poke2$hasMegaEvolution <- (poke2$hasMegaEvolution - 1) +poke2$hasGender <- (poke2$hasGender - 1) +``` + +```{r} +poke<-data.frame(pokemon) +#remove unique identifiers +poke<-poke[,-c(1,2)] +``` + +```{r} +for(j in 1:ncol(poke)){ + if(!is.numeric(poke[,j]) ){ + poke[,j]<-(as.numeric(poke[,j])) + } +} +poke$isLegendary <- (poke$isLegendary - 1) +poke$hasMegaEvolution <- (poke$hasMegaEvolution - 1) +poke$hasGender <- (poke$hasGender - 1) +``` + + +## Logistic Regression + +```{r} +library(class) +library(boot) +library("gclus") +# typeglm <- glm(poke.train$hasGender~poke.train$Type_1 + poke.train$Type_2, data=poke.train) +# typeglm +# predgend<- predict(typeglm, newdata = poke.test, type= "response") +# predgend +# predgend2<- predgend[c(1:289)] +# length(poke.test$hasGender) +# table(predgend2>0.5, poke.test$hasGender) +``` +isLegendary ~ hasGender + Catch_Rate + +```{r} +pokeglm<- glm(isLegendary ~ hasGender + Catch_Rate, family = "binomial", data = poke) +summary(pokeglm) +``` + +ok so the t test variable selection says all of the variables are important. +This might be Type 1 error??? (probs nah tbh why would a legendary pokemon need a gender or be easy to catch?) + + +Leave One Out Cross Validation! 
+```{r} +attach(poke2) +# Leave-one-out CV: refit the logistic model on every row except i, then score the single held-out row i. +pokeglm <- vector("list", nrow(poke)) +cv.mse <- numeric(nrow(poke)) +for(i in seq_len(nrow(poke))){ + pokeglm[[i]]<- glm(isLegendary ~ hasGender + Catch_Rate, family = "binomial", data = poke[-i,]) + # newdata must carry the predictor columns; type = "response" returns a probability instead of log-odds. + heldout <- predict(pokeglm[[i]], newdata = poke[i,,drop=FALSE], type = "response") + cv.mse[i] <- (heldout - poke$isLegendary[i])^2 +} +mean(cv.mse) +``` + +See what a regression tree looks like using total as the response and hp, attack, defense, sp_atk, sp_def, and speed as predictors. +```{r} +pokemon<-read.csv("pokemon_alopez247.csv") +library(tree) +poke<-data.frame(pokemon) +attach(poke) +pocl<-tree(Total~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke) +plot(pocl) +text(pocl) +``` + +Now let's try pruning it back + +```{r} +cv.pocl<-cv.tree(pocl, FUN=prune.tree) +plot(cv.pocl,type="b") +p.pocl<-prune.tree(pocl,best=10) +plot(p.pocl) +text(p.pocl) +summary(p.pocl) +``` + +Looks like pruning was unnecessary since the lowest MSE is with 12 nodes... + +How about with bagging... + +```{r} +library(randomForest) +set.seed(1995) +pokebag<-randomForest(Total~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke,mtry=6,importance=FALSE) +pokebag +``` +Random forest where m=3 +```{r} +pokeRF<-randomForest(Total~HP+Attack+Defense+Sp_Atk+Sp_Def+Speed,data=poke,mtry=3,importance=TRUE) +pokeRF +``` + + + +##PCA + +Alright, let's check out PCA on Pr_Male response with ...stats as predictors + +```{r} +head(poke) +``` + +```{r} +pcapoke <- prcomp(as.matrix(poke[,6:11]), scale.=TRUE) +summary(pcapoke) +biplot(pcapoke) +``` + +Ok cool, two principal components satisfy the Kaiser criterion. Let's take a look at which predictors influence these components... + +```{r} +round(pcapoke$rotation[,1:2], 2) +``` + +Ok, so looks like PC1 refers to kind of all around, balanced pokemon, and PC2 refers to slow defenders with bad HP? I don't think this model is all that great... But, let's see which pokemon each component is referring to. 
+ +```{r} +poke[order(pcapoke$x[,1], decreasing=TRUE)[1:4] , 1:11] +``` + +```{r} +poke[order(pcapoke$x[,2], decreasing=TRUE)[1:4] , 1:11] +``` + +The first component doesn't really seem to refer to much at all, just kind of all around generalists maybe. The totals are quite high though, so maybe these are the powerhouses? Wait, let's see how many of them are legendary... + +```{r} +poke[order(pcapoke$x[,1], decreasing=TRUE)[1:20],] +``` + + +The first 13 are legendary, this is a good sign. Let's see how PC1 correlates with isLegendary... + +```{r} +library(MASS) +pcleg<-data.frame(pcapoke$x) +pcleg[1:20,] +leglda <- lda(factor(poke$isLegendary)~PC1+PC2,data=pcleg) +leglda +``` + +I might just be high, but I'm pretty sure this indicates PC1 is a pretty good predictor for isLegendary. PC2 doesn't really seem to refer to anything here... + + +##Gender PCA + +Ok, now let's run PCA on the subset that has a gender + +```{r} +pcagenpoke <- prcomp(as.matrix(poke1[,6:11]), scale.=TRUE) +summary(pcagenpoke) +biplot(pcagenpoke) +``` + +Ok cool, two principal components satisfy the Kaiser criterion. Let's take a look at which predictors influence these components... + +```{r} +round(pcagenpoke$rotation[,1:2], 2) +``` +This is looking pretty similar to the full dataset! But, let's see which pokemon each component is referring to. + +```{r} +# pcagenpoke was fitted on poke1, so its score rows index poke1 (not the full poke data frame). +poke1[order(pcagenpoke$x[,1], decreasing=TRUE)[1:4] , 1:11] +``` + + +```{r} +poke1[order(pcagenpoke$x[,2], decreasing=TRUE)[1:4] , 1:11] +``` + +The first component doesn't really seem to refer to much at all, just kind of all around generalists maybe. The totals are quite high though, so maybe these are the powerhouses? Wait, let's see how many of them are legendary... + +```{r} +poke1[order(pcagenpoke$x[,1], decreasing=TRUE)[1:20],] +``` + + +The first 13 are legendary, this is a good sign. Let's see how PC1 correlates with isLegendary... 
+ +```{r} +library(MASS) +pcgenleg<-data.frame(pcagenpoke$x) +pcgenleg[1:20,] +leggenlda <- lda(factor(poke1$isLegendary)~pcgenleg[,1]+pcgenleg[,2],data=pcgenleg) +leggenlda +``` + +Ok now let's look at a classification table: + +```{r} +lda.pred<-predict(leggenlda,poke1) +lda.class<-lda.pred$class +table(lda.class,poke1$isLegendary) +``` + +So LDA using PC1 and PC2 basically amounts to a naive classifier classifying everything "False" for isLegendary. Let's see if univariate LDA with PC1 only does any better. + +```{r} +leggenlda1 <- lda(factor(poke1$isLegendary)~pcgenleg[,1],data=pcgenleg) +leggenlda1 +``` + +```{r} +lda.pred<-predict(leggenlda1,poke1) +lda.class<-lda.pred$class +table(lda.class,poke1$isLegendary) +``` + +Yea, this is still a naive classifier, NOT VERY USEFUL! + +Let's try a linear model, see if PC1 and PC2 are any good at predicting Pr_Male: + +```{r} +linmod<-lm(poke1$Pr_Male~pcgenleg[,1]+pcgenleg[,2]) +summary(linmod) +linmod<-lm(poke1$Pr_Male~pcgenleg[,1]) +summary(linmod) +plot(pcgenleg[,1],poke1$Pr_Male) +abline(linmod) +``` + +Ok, so the second model is statistically significant. So let's try to interpret this now. The intercept on this linear model is 0.55, which is already above 50%. Oh man that graph looks like garbage. I don't think PCA really did anything here... + + +```{r} +# linmod<-lm(poke1$Catch_Rate~pcgenleg[,1]+pcgenleg[,2]) +# summary(linmod) +linmod<-lm(poke1$Catch_Rate~pcgenleg[,1]) +summary(linmod) +plot(pcgenleg[,1],poke1$Catch_Rate) +abline(linmod) +``` + +Ok, now we're talking. So it looks like PC1 is correlated with the harder to catch Pokemon rather than the legendary ones. Probably a good time to start a new file, this is getting messy... 
+ +```{r} +set.seed(1995) +train<-sample(1:nrow(poke),432) +poke.test<-poke[-train,] +poke.train<-poke[train,] +``` + +#Neural Networks +```{r} +library(gclus) +library(nnet) +library(NeuralNetTools) +set.seed(1995) +spoke <- cbind(scale(trainset[,6:11]), factor(trainset$isLegendary)) +colnames(spoke)[7] <- "isLegendary" +spoke<-data.frame(spoke) +nnpoke <- nnet(factor(isLegendary)~., data=spoke, size=5) +table(trainset$isLegendary, predict(nnpoke, type="class")) +plotnet(nnpoke) +``` + +```{r} +spoke +``` + +```{r} +spoketest <- cbind(scale(testset[,6:11]), factor(testset$isLegendary)) +colnames(spoketest)[7] <- "isLegendary" +spoketest<-data.frame(spoketest) +table(spoketest$isLegendary, predict(nnpoke, newdata=spoketest, type="class")) +``` + +```{r} +attach(data) +trainsetg<-trainset[which(hasGender=='True'),] +testsetg<-testset[which(hasGender=='True'),] +trainsetg<-na.omit(trainsetg) +trainsetg +testsetg<-na.omit(testsetg) +testsetg +``` + +#Neural Net predicting Pr_Male + +Optimizing number of nodes in first layer +```{r} +for(i in 1:5){ + nnmaletr <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=c(i,3), threshold=0.01) + print(paste("Number of hidden layer variables in first layer:", i)) + print(paste("MSE: ", mean((compute(nnmaletr, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2))) +} +``` + +Optimizing number of nodes in second layer +```{r} +for(i in 1:5){ + nnmaletr <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=c(4,i), threshold=0.01) + print(paste("Number of hidden layer variables in second layer:", i)) + print(paste("MSE: ", mean((compute(nnmaletr, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2))) +} +``` + +```{r} +set.seed(906534) +nnmale <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=3, threshold=0.01) +plotnet(nnmale) +mse<-mean((compute(nnmale, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2) +mse +``` +MSE 
with 2 hidden layers and 4 and 3 nodes: 0.04011755 +MSE with 1 hidden layer and 3 nodes: 0.03988659 + + +```{r} +linmod<-lm(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg) +mean((predict(linmod,newdata=testsetg)-testsetg$Pr_Male)^2) +``` +This is pretty close to our neural net modeled above when we use 1 hidden layer and 3 nodes. + +#Neural Net, predicting Generation +```{r} +set.seed(12345) +library(neuralnet) +nnGen <- neuralnet(Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=4, threshold=0.01) +plotnet(nnGen) +``` + +```{r} +mse<-mean((compute(nnGen, testsetg[,6:11])$net.result-testsetg$Generation)^2) +mse +``` +MSE without Pr_Male, 2 hidden 4 and 3 nodes +1.28986 +MSE without Pr_Male included, 5 nodes 1 hidden +1.349612 +MSE with Pr_Male included, 5 nodes 1 hidden +1.466631 + +```{r} +spoketest <- cbind(scale(testsetg[,6:11]), factor(testsetg$Generation)) +colnames(spoketest)[7] <- "Generation" +spoketest<-data.frame(spoketest) +table(spoketest$isLegendary, predict(nnGen, newdata=spoketest, type="class")) +``` diff --git a/Neural.Rmd b/Neural.Rmd new file mode 100644 index 0000000..947961b --- /dev/null +++ b/Neural.Rmd @@ -0,0 +1,155 @@ +--- +title: "Neural networks, Pokemon" +output: html_notebook +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r} +data = read.csv("pokemon_alopez247.csv", header=T) +``` + +#Neural Networks +```{r} +library(nnet) +library(NeuralNetTools) +library(neuralnet) +set.seed(53747958) +numeric_col <- c(5:12, 16, 20:21) +pokemon[,numeric_col] <- scale(pokemon[,numeric_col]) +train<-sample(1:nrow(pokemon),505) +testset<-poke[-train,] +trainset<-poke[train,] +``` + +```{r} +library(gclus) +library(nnet) +library(NeuralNetTools) +set.seed(1995) +spoke <- cbind(scale(trainset[,6:11]), factor(trainset$isLegendary)) +colnames(spoke)[7] <- "isLegendary" +spoke<-data.frame(spoke) +nnpoke <- nnet(factor(isLegendary)~., data=spoke, size=9) 
+table(trainset$isLegendary, predict(nnpoke, type="class")) +plotnet(nnpoke) +``` + +```{r} +spoke +``` + +```{r} +spoketest <- cbind(scale(testset[,6:11]), factor(testset$isLegendary)) +colnames(spoketest)[7] <- "isLegendary" +spoketest<-data.frame(spoketest) +table(spoketest$isLegendary, predict(nnpoke, newdata=spoketest, type="class")) +``` + +```{r} +# Keep only Pokemon that have a gender. The mask is taken from each subset's own hasGender column: +# a mask built on the full, attached data set does not line up with the sampled train/test rows. +trainsetg<-trainset[which(trainset$hasGender=='True'),] +testsetg<-testset[which(testset$hasGender=='True'),] +trainsetg<-na.omit(trainsetg) +trainsetg +testsetg<-na.omit(testsetg) +testsetg +``` + +##Neural Net predicting Pr_Male +```{r} +set.seed(906534) +library(nnet) +library(NeuralNetTools) +library(neuralnet) +nnmale <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=3, threshold=0.01) +#nnmale <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed + Weight_kg + Height_m,data=trainsetg, hidden=5, threshold=0.01) +plotnet(nnmale) +mse<-mean((compute(nnmale, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2) +#mse<-mean((compute(nnmale, testsetg[,c(6:11, 20:21)])$net.result-testsetg$Pr_Male)^2) +mse +``` + +Optimizing number of nodes in first layer +```{r} +for(i in 1:5){ + nnmaletr <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=c(i,3), threshold=0.01) + print(paste("Number of hidden layer variables in first layer:", i)) + print(paste("MSE: ", mean((compute(nnmaletr, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2))) +} +``` + +Optimizing number of nodes in second layer +```{r} +for(i in 1:5){ + nnmaletr <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=c(4,i), threshold=0.01) + print(paste("Number of hidden layer variables in second layer:", i)) + print(paste("MSE: ", mean((compute(nnmaletr, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2))) +} +``` +MSE with 2 hidden layers and 4 and 3 nodes: 0.04011755 +MSE with 1 hidden layer and 3 nodes: 0.03988659 + +```{r} 
+linmod<-lm(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed + Weight_kg + Height_m,data=trainsetg) +mean((predict(linmod,newdata=testsetg)-testsetg$Pr_Male)^2) +``` +This is pretty close to our neural net modeled above when we use 1 hidden layer and 3 nodes. + +##Neural net predicting hasGender +```{r} +library(gclus) +library(nnet) +library(NeuralNetTools) +set.seed(1995) +gpoke <- cbind(scale(trainset[,c(6:11, 20:22)]), factor(trainset$hasGender)) +colnames(gpoke)[10] <- "hasGender" +gpoke<-data.frame(gpoke) +nngend <- nnet(factor(hasGender)~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed + Catch_Rate + Height_m + Weight_kg, data=gpoke, size=5) +plotnet(nngend) +table(trainset$hasGender, predict(nngend, type="class")) +``` +It appears that our neural net is effective in predicting hasGender, and also easy to generate an overfitted model for. +1 hidden layer with 11 nodes appears to overfit the model, + 1 2 + False 45 0 + True 0 460 + +Instead 1 hidden layer with 5 nodes appears to have a reasonable misclassification without totally overfitting. 
Let's try this on gpoketest, + +```{r} +# Build the evaluation frame from the TEST rows; features and labels must come from the same subset. +gpoketest <- cbind(scale(testset[,c(6:11, 20:22)]), factor(testset$hasGender)) +colnames(gpoketest)[10] <- "hasGender" +gpoketest<-data.frame(gpoketest) +table(gpoketest$hasGender, predict(nngend, newdata=gpoketest, type="class")) +``` + +##Neural net predicting Generation +```{r} +library(gclus) +library(nnet) +library(neuralnet) +library(NeuralNetTools) +set.seed(19127395) +#nnGeneration <- neuralnet(Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed + Pr_Male,data=trainsetg, hidden=5, threshold=0.01) +nnGeneration <- neuralnet(Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=5, threshold=0.01) +plotnet(nnGeneration) +mse<-mean((compute(nnGeneration, testsetg[,6:11])$net.result-testsetg$Generation)^2) +mse +``` + +Optimizing number of nodes in first layer +```{r} +# Vary the first hidden layer with i (second layer fixed at 3, as in the Pr_Male tuning loop) +# and score against Generation, the response this network is trained on. +for(i in 1:5){ + nnmaletr <- neuralnet(Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=c(i,3), threshold=0.01) + print(paste("Number of hidden layer variables in first layer:", i)) + print(paste("MSE: ", mean((compute(nnmaletr, testsetg[,6:11])$net.result-testsetg$Generation)^2))) +} +``` + +```{r} +linmod<-lm(Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg) +mean((predict(linmod,newdata=testsetg)-testsetg$Generation)^2) +``` \ No newline at end of file diff --git a/Neural.nb.html b/Neural.nb.html new file mode 100644 index 0000000..14c5408 --- /dev/null +++ b/Neural.nb.html @@ -0,0 +1,655 @@ + + + + +
+ + + + + + + + +data = read.csv("pokemon_alopez247.csv", header=T)
+
+
+
+library(nnet)
+library(NeuralNetTools)
+library(neuralnet)
+set.seed(53747958)
+numeric_col <- c(5:12, 16, 20:21)
+pokemon[,numeric_col] <- scale(pokemon[,numeric_col])
+train<-sample(1:nrow(pokemon),505)
+testset<-poke[-train,]
+trainset<-poke[train,]
+
+
+
+
+
+
+library(gclus)
+library(nnet)
+library(NeuralNetTools)
+set.seed(1995)
+spoke <- cbind(scale(trainset[,6:11]), factor(trainset$isLegendary))
+colnames(spoke)[7] <- "isLegendary"
+spoke<-data.frame(spoke)
+nnpoke <- nnet(factor(isLegendary)~., data=spoke, size=9)
+
+
+# weights: 73
+initial value 230.803477
+iter 10 value 21.386442
+iter 20 value 10.225302
+iter 30 value 5.443762
+iter 40 value 5.124080
+iter 50 value 5.117497
+iter 60 value 5.096526
+iter 70 value 5.032397
+iter 80 value 4.957708
+iter 90 value 4.956312
+iter 100 value 4.953087
+final value 4.953087
+stopped after 100 iterations
+
+
+table(trainset$isLegendary, predict(nnpoke, type="class"))
+
+
+
+ 1 2
+ False 479 1
+ True 1 24
+
+
+plotnet(nnpoke)
+
+
+spoke
+
+
+spoketest <- cbind(scale(testset[,6:11]), factor(testset$isLegendary))
+colnames(spoketest)[7] <- "isLegendary"
+spoketest<-data.frame(spoketest)
+table(spoketest$isLegendary, predict(nnpoke, newdata=spoketest, type="class"))
+
+
+
+ 1 2
+ 1 193 2
+ 2 15 6
+
+
+
+
+
+
+attach(data)
+
+
+The following object is masked _by_ .GlobalEnv:
+
+ Pr_Male
+
+The following objects are masked from data (pos = 3):
+
+ Attack, Body_Style, Catch_Rate, Color, Defense, Egg_Group_1, Egg_Group_2, Generation, hasGender,
+ hasMegaEvolution, Height_m, HP, isLegendary, Name, Number, Pr_Male, Sp_Atk, Sp_Def, Speed,
+ Total, Type_1, Type_2, Weight_kg
+
+The following objects are masked from data (pos = 4):
+
+ Attack, Body_Style, Catch_Rate, Color, Defense, Egg_Group_1, Egg_Group_2, Generation, hasGender,
+ hasMegaEvolution, Height_m, HP, isLegendary, Name, Number, Pr_Male, Sp_Atk, Sp_Def, Speed,
+ Total, Type_1, Type_2, Weight_kg
+
+
+trainsetg<-trainset[which(hasGender=='True'),]
+testsetg<-testset[which(hasGender=='True'),]
+trainsetg<-na.omit(trainsetg)
+trainsetg
+
+
+testsetg<-na.omit(testsetg)
+testsetg
+
+
+set.seed(906534)
+library(nnet)
+library(NeuralNetTools)
+library(neuralnet)
+nnmale <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=3, threshold=0.01)
+#nnmale <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed + Weight_kg + Height_m,data=trainsetg, hidden=5, threshold=0.01)
+plotnet(nnmale)
+
+
+mse<-mean((compute(nnmale, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2)
+#mse<-mean((compute(nnmale, testsetg[,c(6:11, 20:21)])$net.result-testsetg$Pr_Male)^2)
+mse
+
+
+[1] 0.03509201
+
+
+
+Optimizing number of nodes in first layer
+ + + +for(i in 1:5){
+ nnmaletr <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=c(i,3), threshold=0.01)
+ print(paste("Number of hidden layer variables in first layer:", i))
+ print(paste("MSE: ", mean((compute(nnmaletr, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2)))
+}
+
+
+[1] "Number of hidden layer variables in first layer: 1"
+[1] "MSE: 0.0350882335552937"
+[1] "Number of hidden layer variables in first layer: 2"
+[1] "MSE: 0.0350818416116905"
+[1] "Number of hidden layer variables in first layer: 3"
+[1] "MSE: 0.035248926798519"
+[1] "Number of hidden layer variables in first layer: 4"
+[1] "MSE: 0.0395926526903969"
+[1] "Number of hidden layer variables in first layer: 5"
+[1] "MSE: 0.0354457058825359"
+
+
+
+Optimizing number of nodes in second layer
+ + + +for(i in 1:5){
+ nnmaletr <- neuralnet(Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=c(4,i), threshold=0.01)
+ print(paste("Number of hidden layer variables in second layer:", i))
+ print(paste("MSE: ", mean((compute(nnmaletr, testsetg[,6:11])$net.result-testsetg$Pr_Male)^2)))
+}
+
+
+[1] "Number of hidden layer variables in second layer: 1"
+[1] "MSE: 0.0431050100780546"
+[1] "Number of hidden layer variables in second layer: 2"
+[1] "MSE: 0.0414010218086393"
+[1] "Number of hidden layer variables in second layer: 3"
+[1] "MSE: 0.0481998685468267"
+[1] "Number of hidden layer variables in second layer: 4"
+[1] "MSE: 0.0352346597808724"
+
+
+Algorithm did not converge in 1 of 1 repetition(s) within the stepmax.
+
+
+[1] "Number of hidden layer variables in second layer: 5"
+
+
+Error in cbind(1, pred) %*% weights[[num_hidden_layers + 1]] :
+ requires numeric/complex matrix/vector arguments
+
+
+
+MSE with 2 hidden layers and 4 and 3 nodes: 0.04011755 MSE with 1 hidden layer and 3 nodes: 0.03988659
+ + + +# Linear baseline for the neural nets: least-squares fit of Pr_Male on the six
+# stat columns plus weight and height, scored by mean squared error on testsetg.
+linmod <- lm(
+  Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed + Weight_kg + Height_m,
+  data = trainsetg
+)
+mean((testsetg$Pr_Male - predict(linmod, newdata = testsetg))^2)
+
+
+[1] 0.03378086
+
+
+
+This is pretty close to our neural net modeled above when we use 1 hidden layer and 3 nodes.
+library(gclus)
+library(nnet)
+library(NeuralNetTools)
+set.seed(1995)
+# Standardise the nine numeric predictors and append hasGender. cbind() onto a
+# numeric matrix stores the factor as its integer codes, hence the factor()
+# wrapper in the formula below.
+gpoke <- data.frame(cbind(
+  scale(trainset[, c(6:11, 20:22)]),
+  hasGender = factor(trainset$hasGender)
+))
+# Classification net with a single hidden layer of 5 nodes.
+nngend <- nnet(
+  factor(hasGender) ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed +
+    Catch_Rate + Height_m + Weight_kg,
+  data = gpoke, size = 5
+)
+
+
+# weights: 56
+initial value 525.566326
+iter 10 value 111.914061
+iter 20 value 95.010360
+iter 30 value 79.510878
+iter 40 value 74.458633
+iter 50 value 69.719439
+iter 60 value 63.374675
+iter 70 value 62.255317
+iter 80 value 59.258941
+iter 90 value 57.722880
+iter 100 value 55.807885
+final value 55.807885
+stopped after 100 iterations
+
+
+plotnet(nngend)
+
+
+table(trainset$hasGender, predict(nngend, type="class"))
+
+
+
+ 1 2
+ False 36 9
+ True 25 435
+
+
+
+It appears that our neural net is effective in predicting hasGender, but it is also easy to overfit. With 1 hidden layer of 11 nodes the model classifies the training set perfectly (False: 45 and 0; True: 0 and 460 in predicted classes 1 and 2), which suggests overfitting.
+Instead, 1 hidden layer with 5 nodes appears to give a reasonable misclassification rate without totally overfitting. Let’s try this on gpoketest:
+ + + +# Build the test matrix from testset (not trainset) so its rows line up with
+# testset$hasGender; using trainset recycled the labels over the training rows.
+# Standardise with the training-set centre and spread so the inputs are on the
+# scale nngend was fitted on.
+gpoke_scaling <- scale(trainset[, c(6:11, 20:22)])
+gpoketest <- cbind(scale(testset[, c(6:11, 20:22)],
+                         center = attr(gpoke_scaling, "scaled:center"),
+                         scale = attr(gpoke_scaling, "scaled:scale")),
+                   factor(testset$hasGender))
+
+
+number of rows of result is not a multiple of vector length (arg 2)
+
+
+colnames(gpoketest)[10] <- "hasGender"
+gpoketest<-data.frame(gpoketest)
+table(gpoketest$hasGender, predict(nngend, newdata=gpoketest, type="class"))
+
+
+
+ 1 2
+ 1 14 57
+ 2 47 387
+
+
+
+library(gclus)
+library(nnet)
+library(neuralnet)
+library(NeuralNetTools)
+# Fix the RNG seed so the fit below is repeatable.
+set.seed(19127395)
+# Earlier variant that also used Pr_Male as a predictor, kept for reference:
+#nnGeneration <- neuralnet(Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed + Pr_Male,data=trainsetg, hidden=5, threshold=0.01)
+# Network with one hidden layer of 5 nodes predicting Generation from the six stat columns.
+nnGeneration <- neuralnet(Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,data=trainsetg, hidden=5, threshold=0.01)
+# Draw the fitted network.
+plotnet(nnGeneration)
+
+
+mse<-mean((compute(nnGeneration, testsetg[,6:11])$net.result-testsetg$Generation)^2)
+mse
+
+
+[1] 2.756416
+
+
+
+Optimizing number of nodes in first layer
+ + + +# Sweep the hidden-layer width (1-5 nodes) for the Generation network and print
+# the test-set MSE against Generation for each width.
+for (i in 1:5) {
+  # hidden follows the loop index; a constant here would refit the same
+  # architecture five times.
+  nnmaletr <- neuralnet(Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,
+                        data = trainsetg, hidden = i, threshold = 0.01)
+  print(paste("Number of hidden layer variables in first layer:", i))
+  # A fit that does not converge has no weights; skip it rather than let
+  # compute() abort the sweep.
+  if (is.null(nnmaletr$weights)) {
+    print("MSE: NA (network did not converge)")
+    next
+  }
+  # Score against the response the network was trained on (Generation, not Pr_Male).
+  print(paste("MSE: ", mean((compute(nnmaletr, testsetg[,6:11])$net.result-testsetg$Generation)^2)))
+}
+
+
+
+
+
+
+# Linear baseline for the Generation network: same six predictors, scored by
+# mean squared error on testsetg.
+linmod <- lm(
+  Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,
+  data = trainsetg
+)
+mean((testsetg$Generation - predict(linmod, newdata = testsetg))^2)
+
+
+[1] 2.834874
+
+
+pokemon<-read.csv("pokemon_alopez247.csv")
+poke<-data.frame(pokemon)
+
+
+
+
+
+
+pcapoke <- prcomp(as.matrix(poke[,c(6:11, 20:21)]), scale.=TRUE)
+summary(pcapoke)
+biplot(pcapoke)
+
+
+
+
+
+
+round(pcapoke$rotation[,1:2], 2)
+
+
+ PC1 PC2
+HP 0.36 0.00
+Attack 0.38 -0.02
+Defense 0.33 -0.41
+Sp_Atk 0.35 0.47
+Sp_Def 0.35 0.10
+Speed 0.23 0.68
+Height_m 0.40 -0.17
+Weight_kg 0.41 -0.34
+
+
+
+
+
+
+poke[order(pcapoke$x[,1], decreasing=TRUE)[1:4] , c(1:11, 20:21)]
+
+
+poke[order(pcapoke$x[,2], decreasing=TRUE)[1:4] , c(1:11, 20:21)]
+
+
+poke[order(pcapoke$x[,1], decreasing=TRUE)[1:20],]
+
+
+Majority are of the legendary type
+ + + +lda.pred<-predict(leggenlda1,poke1)
+# NOTE(review): in the recorded run this call fails with "object 'leggenlda1'
+# not found"; neither leggenlda1 nor poke1 is created in the visible part of
+# this file -- restore the chunk that fits the LDA model before predicting.
+
+
+Error in predict(leggenlda1, poke1) : object 'leggenlda1' not found
+
+
+
+Yea, this is still a naive classifier, NOT VERY USEFUL!
+Let’s try a linear model, see if PC1 and PC2 are any good at predicting Pr_Male:
+ + + +linmod<-lm(poke1$Pr_Male~pcgenleg[,1]+pcgenleg[,2])
+summary(linmod)
+linmod<-lm(poke1$Pr_Male~pcgenleg[,1])
+summary(linmod)
+plot(pcgenleg[,1],poke1$Pr_Male)
+abline(linmod)
+
+
+
+Ok, so the second model is statistically significant. So let’s try to interpret this now. The intercept on this linear model is 0.55, which is already above 50%. Oh man that graph looks like garbage. I don’t think PCA really did anything here…
+ + + +# linmod<-lm(poke1$Catch_Rate~pcgenleg[,1]+pcgenleg[,2])
+# summary(linmod)
+linmod<-lm(poke1$Catch_Rate~pcgenleg[,1])
+summary(linmod)
+plot(pcgenleg[,1],poke1$Catch_Rate)
+abline(linmod)
+
+
+
+Ok, now we’re talking. So it looks like PC1 is correlated with the harder to catch Pokemon rather than the legendary ones. Probably a good time to start a new file, this is getting messy…
+ + + +# Reproducible split: 432 randomly drawn rows train, every other row tests.
+set.seed(1995)
+train <- sample(seq_len(nrow(poke)), 432)
+poke.train <- poke[train, ]
+poke.test <- poke[-train, ]
+
+
+
+data = read.csv("pokemon_alopez247.csv", header=T)
+
+
+
+
+
+
+summary(data)
+
+
+
+# Read the data from the working directory, like the other read.csv() calls in
+# this file, rather than from a machine-specific home-directory path.
+pokemon <- read.csv("pokemon_alopez247.csv", stringsAsFactors = FALSE)
+# prcomp() cannot handle missing values, so keep complete rows only.
+cleanPoke <- na.omit(pokemon)
+# PCA on columns 5:8 and 16, each standardised to unit variance.
+pcPoke <- prcomp(as.matrix(cleanPoke[, c(5:8, 16)]), scale. = TRUE)
+summary(pcPoke)
+
+
+
+
+
+
+biplot(pcPoke)
+
+
+
+
+
+
+plot(pcPoke$x[,1:2])
+
+
+
+
+
+
+# Empty PC1-vs-PC2 canvas, then print each observation's row id at its score.
+plot(pcPoke$x[, 1:2], type = "n")
+# pcPoke was fitted on cleanPoke, which has fewer rows than pokemon after
+# na.omit(); labelling with 1:nrow(pokemon) puts the wrong id on most points.
+# rownames(cleanPoke) carries each kept row's original row number.
+text(pcPoke$x[, 1], pcPoke$x[, 2], labels = rownames(cleanPoke))
+
+
+
+
+
+
+round(pcPoke$rotation[,1:5], 3)
+
+
+
+
+
+
+# Four observations with the highest PC1 score. order() returns positions within
+# cleanPoke (the rows pcPoke was fitted on), so index cleanPoke; indexing the
+# full pokemon frame returns different rows once NA rows have been dropped.
+cleanPoke[order(pcPoke$x[, 1], decreasing = TRUE)[1:4], 1:3]
+# Scree plot with a horizontal reference line at variance 1.
+plot(pcPoke, type = "lines")
+abline(h = 1, col = "blue", lwd = 3)
+
+
+
+
+
+
+# Hierarchical clustering (hclust defaults) on the standardised columns of
+# pokemon that remain after dropping columns 2-4, 12-19 and 23.
+test1 <- hclust(dist(scale(pokemon[,-c(2, 3, 4, 12, 13, 14, 15, 16, 17, 18, 19, 23)])))
+plot(test1)
+# The same clustering applied to the PCA scores.
+# NOTE(review): pcPoke was fitted on cleanPoke (NA rows removed) and on a
+# different column set, so test1 and test2 cluster different data -- confirm
+# that this comparison is intended.
+test2 <- hclust(dist(pcPoke$x))
+plot(test2)
+
+
+
+
+
+
+# Keeping every component, PCA only rotates the standardised data, so pairwise
+# distances between scores should equal those between the scaled inputs.
+# Compare on the same rows and columns pcPoke was fitted on; the full pokemon
+# frame has a different number of rows and columns, so the check cannot succeed.
+all.equal(dist(scale(cleanPoke[, c(5:8, 16)])), dist(pcPoke$x), check.attributes = FALSE)
+
+
+data = read.csv("pokemon_alopez247.csv", header=T)
+
+
+
+
+
+
+summary(data)
+
+
+
+library(scatterplot3d)
+# 3D scatterplot with vertical drop lines (type = "h"). Columns are resolved
+# with with() instead of attach(): repeated attach() calls stack copies of
+# pokemon on the search path that go stale once pokemon is modified.
+sp <- with(pokemon, scatterplot3d(Total, Defense, Pr_Male, pch = 16,
+                                  highlight.3d = TRUE, type = "h",
+                                  main = "3D Scatterplot"))
+
+
+
+3d Scatterplot without line markers
+ + + +library(scatterplot3d)
+# Plain 3D scatterplot; with() resolves the columns without putting another
+# copy of pokemon on the search path via attach().
+with(pokemon, scatterplot3d(Total, Defense, Pr_Male, main = "3D Scatterplot"))
+
+
+
+library(gclus)
+# Scatterplot matrix of columns 5:8 and 16 on complete rows, with panels
+# coloured and variables ordered by absolute correlation.
+cleanPoke <- na.omit(pokemon)
+dta <- cleanPoke[, c(5:8, 16)]
+dta.r <- abs(cor(dta))
+#dta.r[is.na(dta.r)] <- 0.5
+dta.o <- order.single(dta.r)  # variable order for the panels
+dta.col <- dmat.color(dta.r)  # panel colours derived from the correlations
+cpairs(
+  dta, dta.o,
+  panel.colors = dta.col,
+  gap = .5,
+  main = "Variables Ordered and Colored by Correlation"
+)
+
+
+
+It may be worth exploring how the correlation changes depending on what value is assigned to the NA entries of Pr_Male, e.g. via “dta.r[is.na(dta.r)] <- 0.5”. Below is the correlation just between columns 5:8, for additional clarity.
+ + + +library(gclus)
+#cleanPoke <- na.omit(pokemon)
+# Scatterplot matrix of columns 5:8 of the full data, with panels coloured and
+# variables ordered by absolute correlation.
+dta <- pokemon[, 5:8]
+dta.r <- abs(cor(dta))
+dta.o <- order.single(dta.r)  # variable order for the panels
+dta.col <- dmat.color(dta.r)  # panel colours derived from the correlations
+cpairs(
+  dta, dta.o,
+  panel.colors = dta.col,
+  gap = .5,
+  main = "Variables Ordered and Colored by Correlation"
+)
+
+
+data = read.csv("pokemon_alopez247.csv", header=T)
+
+
+
+
+
+
+summary(data)
+
+
+
+library(scatterplot3d)
+# 3D scatterplot with vertical drop lines (type = "h"). Columns are resolved
+# with with() instead of attach(): repeated attach() calls stack copies of
+# pokemon on the search path that go stale once pokemon is modified.
+sp <- with(pokemon, scatterplot3d(Total, Defense, Pr_Male, pch = 16,
+                                  highlight.3d = TRUE, type = "h",
+                                  main = "3D Scatterplot"))
+
+
+
+3d Scatterplot without line markers
+ + + +library(scatterplot3d)
+# Plain 3D scatterplot; with() resolves the columns without putting another
+# copy of pokemon on the search path via attach().
+with(pokemon, scatterplot3d(Total, Defense, Pr_Male, main = "3D Scatterplot"))
+
+
+
+library(gclus)
+# Scatterplot matrix of columns 6:8 and 16 on complete rows, with panels
+# coloured and variables ordered by absolute correlation.
+cleanPoke <- na.omit(pokemon)
+dta <- cleanPoke[, c(6:8, 16)]
+dta.r <- abs(cor(dta))
+#dta.r[is.na(dta.r)] <- 0.5
+dta.o <- order.single(dta.r)  # variable order for the panels
+dta.col <- dmat.color(dta.r)  # panel colours derived from the correlations
+cpairs(
+  dta, dta.o,
+  panel.colors = dta.col,
+  gap = .5,
+  main = "Variables Ordered and Colored by Correlation"
+)
+
+
+It may be worth exploring how the correlation changes depending on what value is assigned to the NA entries of Pr_Male, e.g. via “dta.r[is.na(dta.r)] <- 0.5”. Below is the correlation just between columns 5:8, for additional clarity.
+ + + +library(gclus)
+#cleanPoke <- na.omit(pokemon)
+# Scatterplot matrix of columns 5:8 of the full data, with panels coloured and
+# variables ordered by absolute correlation.
+dta <- pokemon[, 5:8]
+dta.r <- abs(cor(dta))
+dta.o <- order.single(dta.r)  # variable order for the panels
+dta.col <- dmat.color(dta.r)  # panel colours derived from the correlations
+cpairs(
+  dta, dta.o,
+  panel.colors = dta.col,
+  gap = .5,
+  main = "Variables Ordered and Colored by Correlation"
+)
+
+
+Neural net testing isLegendary w/ all
+ + + +library(nnet)
+library(NeuralNetTools)
+set.seed(53747958)
+# Standardise the numeric columns so that no input dominates through its units.
+numeric_col <- c(5:12, 16, 20:21)
+pokemon[, numeric_col] <- scale(pokemon[, numeric_col])
+# Fit on the scaled pokemon frame. The previous code fitted on the unscaled
+# `data` frame after renaming its column 22 to "isLegendary"; elsewhere in this
+# file columns 20:22 are used as Height_m, Weight_kg and Catch_Rate, so that
+# rename overwrote the Catch_Rate name and duplicated the response's name.
+legendary_data <- pokemon
+# A factor response makes nnet() fit a classifier.
+legendary_data$isLegendary <- factor(legendary_data$isLegendary)
+# NOTE(review): "." still includes identifier-like columns such as Number and
+# Name -- confirm they are wanted as predictors.
+nnpokemon <- nnet(isLegendary ~ ., data = legendary_data, size = 1)
+plotnet(nnpokemon)
+mean(nnpokemon$residuals^2)
+
+
+
+Neural net testing isLegendary w/ Defense+Pr_Male+Attack+HP
+ + + +library(nnet)
+library(neuralnet)
+library(NeuralNetTools)
+set.seed(53747958)
+numeric_col <- c(5:12, 16, 20:21)
+# Fill missing Pr_Male values with 0.5 before scaling. The earlier
+# `is.na(numeric_col) <- 0.5` acted on the vector of column indices, not on the
+# data, so the NAs were left in place.
+# NOTE(review): 0.5 is meant on the original 0-1 scale; if pokemon was already
+# standardised by an earlier chunk, re-read it first.
+pokemon$Pr_Male[is.na(pokemon$Pr_Male)] <- 0.5
+pokemon[, numeric_col] <- scale(pokemon[, numeric_col])
+# First 505 rows train the network; rows 506-721 are held out.
+trainset <- pokemon[1:505, ]
+testset <- pokemon[506:721, ]
+# The rename of column 22 of `data` to "isLegendary" was dropped: `data` is not
+# used by this model and column 22 is not the legendary flag.
+# Fit on trainset only, so predictions made from testset are not scored on
+# rows the network was trained on.
+nnpokemon <- neuralnet(isLegendary ~ Defense + Pr_Male + Attack + HP,
+                       data = trainset, hidden = 3, act.fct = "logistic",
+                       linear.output = FALSE)
+plotnet(nnpokemon)
+nnpokemon$result.matrix
+
+
+
+*DOUBLE CHECK
+ + + +# Take the four model inputs straight from pokemon. attach() is avoided because
+# bare names can resolve to an older attached copy of the data.
+test <- pokemon[, c("Defense", "Pr_Male", "Attack", "HP")]
+# Run the fitted network over every row and show its raw outputs.
+Predict <- compute(nnpokemon, test)
+Predict$net.result
+
+
+
+
+
+
+# Threshold the network outputs at 0.5 to get hard 0/1 predictions. No attach()
+# is needed: nothing in this chunk refers to a pokemon column.
+prob <- Predict$net.result
+pred <- ifelse(prob > 0.5, 1, 0)
+pred
+
+
+
+
+
+
+#Test the resulting output
+# Pull the four model inputs out of the hold-out rows and preview them.
+temp_test <- testset[, c("Defense", "Pr_Male", "Attack", "HP")]
+head(temp_test)
+# Network outputs for the hold-out rows, shown next to the true labels.
+nn.results <- compute(nnpokemon, temp_test)
+results <- data.frame(
+  actual = testset$isLegendary,
+  predicted = nn.results$net.result
+)
+results
+
+
+
+
+
+
+results <- data.frame(actual = testset$isLegendary, prediction = nn.results$net.result)
+results
+
+
+
+*DOUBLE CHECK
+ + + +#mse <- sum((compute(nnpokemon, testset[,2:15])$net.result-test$Price)^2)
+#mse
+
+
+
+# Read the data from the working directory, like the other read.csv() calls in
+# this file, rather than from a machine-specific home-directory path.
+pokemon <- read.csv("pokemon_alopez247.csv", stringsAsFactors = FALSE)
+# prcomp() cannot handle missing values, so keep complete rows only.
+cleanPoke <- na.omit(pokemon)
+# PCA on columns 6:11 (HP, Attack, Defense, Sp_Atk, Sp_Def, Speed), each
+# standardised to unit variance.
+pcPoke <- prcomp(as.matrix(cleanPoke[, 6:11]), scale. = TRUE)
+summary(pcPoke)
+
+
+Importance of components:
+ PC1 PC2 PC3 PC4 PC5 PC6
+Standard deviation 1.5292 1.0865 0.9572 0.8876 0.69979 0.53602
+Proportion of Variance 0.3898 0.1967 0.1527 0.1313 0.08162 0.04789
+Cumulative Proportion 0.3898 0.5865 0.7392 0.8705 0.95211 1.00000
+
+
+
+
+
+
+biplot(pcPoke)
+
+
+plot(pcPoke$x[,1:2])
+
+
+# Empty PC1-vs-PC2 canvas, then print each observation's row id at its score.
+plot(pcPoke$x[, 1:2], type = "n")
+# pcPoke was fitted on cleanPoke, which has fewer rows than pokemon after
+# na.omit(); labelling with 1:nrow(pokemon) puts the wrong id on most points.
+# rownames(cleanPoke) carries each kept row's original row number.
+text(pcPoke$x[, 1], pcPoke$x[, 2], labels = rownames(cleanPoke))
+
+
+round(pcPoke$rotation[,1:2], 3)
+
+
+ PC1 PC2
+HP 0.414 0.080
+Attack 0.434 0.111
+Defense 0.365 0.621
+Sp_Atk 0.453 -0.361
+Sp_Def 0.464 0.114
+Speed 0.293 -0.673
+
+
+
+
+
+
+# Four observations with the highest PC1 score. order() returns positions within
+# cleanPoke (the rows pcPoke was fitted on), so index cleanPoke; indexing the
+# full pokemon frame returns different rows once NA rows have been dropped.
+cleanPoke[order(pcPoke$x[, 1], decreasing = TRUE)[1:4], 1:11]
+
+
+# Scree plot of the component variances; the blue horizontal line marks variance 1.
+plot(pcPoke, type = "lines")
+abline(h = 1, col = "blue", lwd = 3)
+
+
+# Hierarchical clustering (hclust defaults) on the standardised columns of
+# pokemon that remain after dropping columns 2-4, 12-19 and 23.
+test1 <- hclust(dist(scale(pokemon[,-c(2, 3, 4, 12, 13, 14, 15, 16, 17, 18, 19, 23)])))
+plot(test1)
+# The same clustering applied to the PCA scores.
+# NOTE(review): pcPoke was fitted on cleanPoke (NA rows removed) and on
+# columns 6:11 only, so test1 and test2 cluster different data -- confirm that
+# this comparison is intended.
+test2 <- hclust(dist(pcPoke$x))
+plot(test2)
+
+
+
+
+
+
+# Keeping every component, PCA only rotates the standardised data, so pairwise
+# distances between scores should equal those between the scaled inputs.
+# Compare on the same rows and columns pcPoke was fitted on (cleanPoke, columns
+# 6:11); against the full pokemon frame the recorded result was
+# "lengths (259560, 207046) differ".
+all.equal(dist(scale(cleanPoke[, 6:11])), dist(pcPoke$x), check.attributes = FALSE)
+
+
+[1] "Numeric: lengths (259560, 207046) differ"
+
+
+