Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
759 changes: 759 additions & 0 deletions AllModels.Rmd

Large diffs are not rendered by default.

155 changes: 155 additions & 0 deletions Neural.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
---
title: "Neural networks, Pokemon"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
# Load the Pokemon dataset from the working directory; the first row of the
# CSV holds the column names.
data <- read.csv("pokemon_alopez247.csv", header = TRUE)
```

# Neural Networks
```{r}
library(nnet)
library(NeuralNetTools)
library(neuralnet)
set.seed(53747958)

# `pokemon` and `poke` are not created anywhere in this notebook (the CSV is
# loaded into `data`), so knitting from a clean session stopped here with
# "object not found". Derive both from `data` to make the notebook
# self-contained.
pokemon <- data
poke <- data

# Standardised copy of the numeric columns, selected by position.
numeric_col <- c(5:12, 16, 20:21)
pokemon[, numeric_col] <- scale(pokemon[, numeric_col])

# Random split: 505 rows for training, every remaining row for testing.
# The split is drawn from the unscaled `poke`, as before; the classification
# chunks below rescale the columns they use.
# NOTE(review): the scaled `pokemon` copy is not used by the split — confirm
# whether the split was meant to come from it instead.
train <- sample(seq_len(nrow(pokemon)), 505)
testset <- poke[-train, ]
trainset <- poke[train, ]
```

```{r}
# Single-hidden-layer network (9 units) that classifies isLegendary from
# the standardised columns 6:11 of the training split.
library(gclus)
library(nnet)
library(NeuralNetTools)
set.seed(1995)

# cbind() onto a numeric matrix stores the factor as its integer codes, so
# the response column holds numeric codes rather than the original labels.
spoke <- data.frame(
  cbind(
    scale(trainset[, 6:11]),
    isLegendary = factor(trainset$isLegendary)
  )
)

nnpoke <- nnet(factor(isLegendary) ~ ., data = spoke, size = 9)

# Training confusion table: rows are the true labels, columns the fitted
# class.
table(trainset$isLegendary, predict(nnpoke, type = "class"))
plotnet(nnpoke)
```

```{r}
spoke
```

```{r}
# Evaluate nnpoke on the held-out rows.
# The test predictors are standardised with the TRAINING centre and spread:
# nnpoke was fitted on trainset-scaled inputs, so scaling the test set by its
# own mean/sd would feed the network differently shifted values.
train_stats <- scale(trainset[, 6:11])
spoketest <- cbind(
  scale(
    testset[, 6:11],
    center = attr(train_stats, "scaled:center"),
    scale = attr(train_stats, "scaled:scale")
  ),
  factor(testset$isLegendary)
)
colnames(spoketest)[7] <- "isLegendary"
spoketest <- data.frame(spoketest)

# Confusion table: rows are the true label codes, columns the predicted
# class.
table(
  spoketest$isLegendary,
  predict(nnpoke, newdata = spoketest, type = "class")
)
```

```{r}
# Keep only the Pokemon that have a gender, then drop rows with missing
# values.
# The filter is evaluated on each split's own hasGender column. Indexing
# trainset/testset with positions computed from the full, attached `data`
# picked unrelated rows and produced all-NA rows for positions past the end
# of the split. attach() is no longer needed, so it is dropped.
# as.character() lets the test match either the "True" label or a logical
# TRUE, whichever read.csv() produced for this column.
has_gender <- function(df) {
  as.character(df$hasGender) %in% c("True", "TRUE")
}

trainsetg <- na.omit(trainset[has_gender(trainset), ])
trainsetg
testsetg <- na.omit(testset[has_gender(testset), ])
testsetg
```

## Neural Net predicting Pr_Male
```{r}
# Regression network for Pr_Male: six stat columns in, one hidden layer of
# three units, scored by mean squared error on the gendered test rows.
set.seed(906534)
library(nnet)
library(NeuralNetTools)
library(neuralnet)

nnmale <- neuralnet(
  Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,
  data = trainsetg,
  hidden = 3,
  threshold = 0.01
)
# Alternative kept for reference (adds weight and height, 5 hidden units):
# nnmale <- neuralnet(
#   Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed +
#     Weight_kg + Height_m,
#   data = trainsetg, hidden = 5, threshold = 0.01
# )
plotnet(nnmale)

# Pass the test inputs by name, in the formula's order. Older neuralnet
# releases match compute() covariates to the fitted inputs by position, and
# the positional slice 6:11 need not be in that order (TODO confirm against
# the CSV header). Selecting by name is correct either way.
nn_inputs <- c("Attack", "Defense", "HP", "Sp_Atk", "Sp_Def", "Speed")
male_pred <- compute(nnmale, testsetg[, nn_inputs])$net.result
mse <- mean((male_pred - testsetg$Pr_Male)^2)
# For the alternative model above, add "Weight_kg" and "Height_m" to
# nn_inputs before calling compute().
mse
```

Optimizing number of nodes in first layer
```{r}
# Compare test MSE for 1-5 units in the first hidden layer, with the second
# layer fixed at 3 units.
# Test inputs are selected by name in the formula's order: older neuralnet
# releases match compute() covariates by position, and the positional slice
# 6:11 need not be in that order (TODO confirm against the CSV header).
nn_inputs <- c("Attack", "Defense", "HP", "Sp_Atk", "Sp_Def", "Speed")
for (i in seq_len(5)) {
  nnmaletr <- neuralnet(
    Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,
    data = trainsetg,
    hidden = c(i, 3),
    threshold = 0.01
  )
  male_pred <- compute(nnmaletr, testsetg[, nn_inputs])$net.result
  print(paste("Number of hidden layer variables in first layer:", i))
  print(paste("MSE: ", mean((male_pred - testsetg$Pr_Male)^2)))
}
```

Optimizing number of nodes in second layer
```{r}
# Compare test MSE for 1-5 units in the second hidden layer, with the first
# layer fixed at 4 units.
# Test inputs are selected by name in the formula's order: older neuralnet
# releases match compute() covariates by position, and the positional slice
# 6:11 need not be in that order (TODO confirm against the CSV header).
nn_inputs <- c("Attack", "Defense", "HP", "Sp_Atk", "Sp_Def", "Speed")
for (i in seq_len(5)) {
  nnmaletr <- neuralnet(
    Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,
    data = trainsetg,
    hidden = c(4, i),
    threshold = 0.01
  )
  male_pred <- compute(nnmaletr, testsetg[, nn_inputs])$net.result
  print(paste("Number of hidden layer variables in second layer:", i))
  print(paste("MSE: ", mean((male_pred - testsetg$Pr_Male)^2)))
}
```
MSE with 2 hidden layers and 4 and 3 nodes: 0.04011755
MSE with 1 hidden layer and 3 nodes: 0.03988659

```{r}
# Linear-regression baseline for Pr_Male; prints its test-set MSE.
# This model also includes Weight_kg and Height_m as predictors.
linmod <- lm(
  Pr_Male ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed +
    Weight_kg + Height_m,
  data = trainsetg
)
male_lin_pred <- predict(linmod, newdata = testsetg)
mean((male_lin_pred - testsetg$Pr_Male)^2)
```
This is pretty close to our neural net modeled above when we use 1 hidden layer and 3 nodes.

## Neural net predicting hasGender
```{r}
# Single-hidden-layer network (5 units) that classifies hasGender from the
# standardised columns 6:11 and 20:22 of the training split.
library(gclus)
library(nnet)
library(NeuralNetTools)
set.seed(1995)

# cbind() onto a numeric matrix stores the factor as its integer codes.
gpoke <- data.frame(
  cbind(
    scale(trainset[, c(6:11, 20:22)]),
    hasGender = factor(trainset$hasGender)
  )
)

nngend <- nnet(
  factor(hasGender) ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed +
    Catch_Rate + Height_m + Weight_kg,
  data = gpoke,
  size = 5
)
plotnet(nngend)

# Training confusion table: rows are the true labels, columns the fitted
# class.
table(trainset$hasGender, predict(nngend, type = "class"))
```
It appears that our neural net is effective at predicting hasGender, but it is also easy to produce an overfitted model.
1 hidden layer with 11 nodes appears to overfit the model,
1 2
False 45 0
True 0 460

Instead, 1 hidden layer with 5 nodes appears to give a reasonable misclassification rate without totally overfitting. Let's try this on gpoketest:

```{r}
# Score nngend on the held-out rows.
# The predictors must come from testset (not trainset) so that every row of
# features lines up with its own hasGender label. They are standardised with
# the training centre/scale because nngend was fitted on trainset-scaled
# inputs.
gender_cols <- c(6:11, 20:22)
train_scaled <- scale(trainset[, gender_cols])
gpoketest <- cbind(
  scale(
    testset[, gender_cols],
    center = attr(train_scaled, "scaled:center"),
    scale = attr(train_scaled, "scaled:scale")
  ),
  factor(testset$hasGender)
)
colnames(gpoketest)[10] <- "hasGender"
gpoketest <- data.frame(gpoketest)

# Confusion table: rows are the true label codes, columns the predicted
# class.
table(
  gpoketest$hasGender,
  predict(nngend, newdata = gpoketest, type = "class")
)
```

## Neural net predicting Generation
```{r}
# Regression network for Generation: six stat columns in, one hidden layer
# of five units, scored by mean squared error on the gendered test rows.
library(gclus)
library(nnet)
library(neuralnet)
library(NeuralNetTools)
set.seed(19127395)

# Alternative kept for reference (adds Pr_Male as a predictor):
# nnGeneration <- neuralnet(
#   Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed + Pr_Male,
#   data = trainsetg, hidden = 5, threshold = 0.01
# )
nnGeneration <- neuralnet(
  Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,
  data = trainsetg,
  hidden = 5,
  threshold = 0.01
)
plotnet(nnGeneration)

# Pass the test inputs by name, in the formula's order. Older neuralnet
# releases match compute() covariates to the fitted inputs by position, and
# the positional slice 6:11 need not be in that order (TODO confirm against
# the CSV header). Selecting by name is correct either way.
nn_inputs <- c("Attack", "Defense", "HP", "Sp_Atk", "Sp_Def", "Speed")
gen_pred <- compute(nnGeneration, testsetg[, nn_inputs])$net.result
mse <- mean((gen_pred - testsetg$Generation)^2)
mse
```

Optimizing number of nodes in first layer
```{r}
# Compare test MSE on Generation for 1-5 units in a single hidden layer.
# Test inputs are selected by name in the formula's order, because older
# neuralnet releases match compute() covariates by position.
nn_inputs <- c("Attack", "Defense", "HP", "Sp_Atk", "Sp_Def", "Speed")
for (i in seq_len(5)) {
  # hidden = i: the loop variable has to drive the layer size. With the
  # previous constant hidden = 3, every iteration refitted the same
  # architecture while the printed label claimed otherwise.
  nngentr <- neuralnet(
    Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,
    data = trainsetg,
    hidden = i,
    threshold = 0.01
  )
  gen_pred <- compute(nngentr, testsetg[, nn_inputs])$net.result
  print(paste("Number of hidden layer variables in first layer:", i))
  # Score against Generation, the variable this network predicts; the
  # previous version compared the predictions with Pr_Male.
  print(paste("MSE: ", mean((gen_pred - testsetg$Generation)^2)))
}
```

```{r}
# Linear-regression baseline for Generation on the same six predictors as
# the network; prints its test-set MSE.
linmod <- lm(
  Generation ~ Attack + Defense + HP + Sp_Atk + Sp_Def + Speed,
  data = trainsetg
)
gen_lin_pred <- predict(linmod, newdata = testsetg)
mean((gen_lin_pred - testsetg$Generation)^2)
```
655 changes: 655 additions & 0 deletions Neural.nb.html

Large diffs are not rendered by default.

80 changes: 80 additions & 0 deletions PCAWithWH.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
---
title: "R Notebook"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
pokemon<-read.csv("pokemon_alopez247.csv")
poke<-data.frame(pokemon)
```

```{r}
# PCA on columns 6:11 and 20:21 of `poke`, with every variable standardised
# to unit variance before the decomposition.
pca_input <- as.matrix(poke[, c(6:11, 20:21)])
pcapoke <- prcomp(pca_input, scale. = TRUE)

# Variance explained per component, then the biplot of the first two.
summary(pcapoke)
biplot(pcapoke)
```


```{r}
round(pcapoke$rotation[,1:2], 2)
```


```{r}
# Four rows with the largest PC1 scores, showing columns 1:11 and 20:21.
pc1_rank <- order(pcapoke$x[, 1], decreasing = TRUE)
poke[pc1_rank[1:4], c(1:11, 20:21)]
```

```{r}
# Four rows with the largest PC2 scores, showing columns 1:11 and 20:21.
pc2_rank <- order(pcapoke$x[, 2], decreasing = TRUE)
poke[pc2_rank[1:4], c(1:11, 20:21)]
```

```{r}
# Twenty rows with the largest PC1 scores, all columns shown.
pc1_rank <- order(pcapoke$x[, 1], decreasing = TRUE)
poke[pc1_rank[1:20], ]
```

Majority are of the legendary type


```{r}
# Predict with `leggenlda1` (presumably an LDA fit, judging by the names) on
# `poke1`, then cross-tabulate the predicted class against
# poke1$isLegendary.
# NOTE(review): neither `leggenlda1` nor `poke1` is created in the visible
# part of this notebook — confirm they exist before this chunk runs,
# otherwise knitting from a clean session fails here.
lda.pred<-predict(leggenlda1,poke1)
lda.class<-lda.pred$class
table(lda.class,poke1$isLegendary)
```

Yea, this is still a naive classifier, NOT VERY USEFUL!

Let's try a linear model, see if PC1 and PC2 are any good at predicting Pr_Male:

```{r}
# Regress poke1$Pr_Male on the first two columns of `pcgenleg`, then on the
# first column alone, and plot the single-predictor fit.
# NOTE(review): `pcgenleg` and `poke1` are not created in the visible part
# of this notebook; the surrounding prose suggests pcgenleg holds principal
# component scores — confirm.
linmod<-lm(poke1$Pr_Male~pcgenleg[,1]+pcgenleg[,2])
summary(linmod)
# Refit with the first column only; this overwrites the two-predictor model.
linmod<-lm(poke1$Pr_Male~pcgenleg[,1])
summary(linmod)
# Scatter of the response against the first column, with the fitted line.
plot(pcgenleg[,1],poke1$Pr_Male)
abline(linmod)
```

Ok, so the second model is statistically significant. So let's try to interpret this now. The intercept on this linear model is 0.55, which is already above 50%. Oh man that graph looks like garbage. I don't think PCA really did anything here...


```{r}
# Regress poke1$Catch_Rate on the first column of `pcgenleg` and plot the
# fit. The two-column variant is kept commented out below.
# NOTE(review): `pcgenleg` and `poke1` are not created in the visible part
# of this notebook — confirm they exist before this chunk runs.
# linmod<-lm(poke1$Catch_Rate~pcgenleg[,1]+pcgenleg[,2])
# summary(linmod)
linmod<-lm(poke1$Catch_Rate~pcgenleg[,1])
summary(linmod)
# Scatter of the response against the first column, with the fitted line.
plot(pcgenleg[,1],poke1$Catch_Rate)
abline(linmod)
```

Ok, now we're talking. So it looks like PC1 is correlated with the harder to catch Pokemon rather than the legendary ones. Probably a good time to start a new file, this is getting messy...

```{r}
# Reproducible split of `poke`: 432 randomly chosen rows for training, all
# remaining rows for testing.
set.seed(1995)
train <- sample(seq_len(nrow(poke)), 432)
poke.train <- poke[train, ]
poke.test <- poke[-train, ]
```
416 changes: 416 additions & 0 deletions PCAWithWH.nb.html

Large diffs are not rendered by default.

63 changes: 63 additions & 0 deletions Principle.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
---
title: "R Notebook"
output: html_notebook
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

#laurenedits

```{r}
# Load the Pokemon dataset from the working directory; the first row of the
# CSV holds the column names.
data <- read.csv("pokemon_alopez247.csv", header = TRUE)
```

```{r}
summary(data)
```


# Principal Component Analysis
```{r}
# Read from the working directory, like the read at the top of this
# notebook. The previous "~/" path only resolved on machines that keep the
# CSV in the user's home directory.
pokemon <- read.csv("pokemon_alopez247.csv", stringsAsFactors = FALSE)

# Drop every row that contains an NA so prcomp() receives complete cases.
cleanPoke <- na.omit(pokemon)

# PCA on columns 5:8 and 16, standardised to unit variance.
pcPoke <- prcomp(as.matrix(cleanPoke[, c(5:8, 16)]), scale. = TRUE)
summary(pcPoke)
```

```{r}
biplot(pcPoke)
```

```{r}
plot(pcPoke$x[,1:2])
```

```{r}
# Empty PC1-vs-PC2 frame, then draw every observation as a text label.
# The labels are taken from the score matrix itself. pcPoke was fitted on
# the NA-free rows, so 1:nrow(pokemon) is longer than the score matrix and
# text() recycled the coordinates, stamping extra, mislabelled points.
pc_scores <- pcPoke$x

# Row names carried over from the data frame are used when present
# (presumably the original row positions in `pokemon` — confirm); otherwise
# fall back to a simple running index.
point_labels <- rownames(pc_scores)
if (is.null(point_labels)) {
  point_labels <- seq_len(nrow(pc_scores))
}

plot(pc_scores[, 1:2], type = "n")
text(pc_scores[, 1], pc_scores[, 2], labels = point_labels)
```

```{r}
round(pcPoke$rotation[,1:5], 3)
```

```{r}
# Four observations with the largest PC1 scores (first three columns).
# The scores index rows of cleanPoke, the NA-free data pcPoke was fitted on,
# so the lookup has to happen there: the same positions in `pokemon` point
# at different rows once any NA row has been dropped.
pc1_rank <- order(pcPoke$x[, 1], decreasing = TRUE)
cleanPoke[pc1_rank[1:4], 1:3]

# Scree plot with a horizontal reference line at variance = 1.
plot(pcPoke, type = "lines")
abline(a = 1, b = 0, col = "blue", lwd = 3)
```

```{r}
# Hierarchical clustering (dist()/hclust() defaults) on two inputs:
#   test1 - distances between the standardised columns of `pokemon` that
#           remain after dropping the listed column positions;
#   test2 - distances between the PCA scores in pcPoke$x.
# NOTE(review): pcPoke was fitted on cleanPoke (NA rows removed, columns 5:8
# and 16 only), so the two dendrograms are built from different rows and
# columns — confirm that this comparison is intended.
test1 <- hclust(dist(scale(pokemon[,-c(2, 3, 4, 12, 13, 14, 15, 16, 17, 18, 19, 23)])))
plot(test1)
test2 <- hclust(dist(pcPoke$x))
plot(test2)
```

```{r}
# Check whether distances on the standardised raw columns equal distances
# on the PCA scores, ignoring attributes.
# NOTE(review): the two distance objects come from different rows and
# columns (pcPoke was fitted on cleanPoke[, c(5:8, 16)]) — confirm intent.
excluded_cols <- c(2, 3, 4, 12, 13, 14, 15, 16, 17, 18, 19, 23)
dist_scaled <- dist(scale(pokemon[, -excluded_cols]))
dist_scores <- dist(pcPoke$x)
all.equal(dist_scaled, dist_scores, check.attributes = FALSE)
```
376 changes: 376 additions & 0 deletions Principle.nb.html

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions Scatters.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
---
title: "R Notebook"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


```{r}
# Load the Pokemon dataset from the working directory; the first row of the
# CSV holds the column names.
data <- read.csv("pokemon_alopez247.csv", header = TRUE)
```

```{r}
summary(data)
```

# Basic 3D Scatterplots
## Evaluating the variables Total, Defense, and Pr_Male
```{r}
library(scatterplot3d)

# 3D scatterplot of Total, Defense and Pr_Male with vertical drop lines
# (type = "h").
# The columns are looked up inside `data` with with() instead of attach():
# `pokemon` is never created in this notebook (the CSV is loaded into
# `data`), and attach() leaves the columns on the search path for every
# later chunk.
sp <- with(
  data,
  scatterplot3d(
    Total, Defense, Pr_Male,
    pch = 16, highlight.3d = TRUE, type = "h",
    main = "3D Scatterplot"
  )
)
```

3d Scatterplot without line markers
```{r}
library(scatterplot3d)

# Same three variables as a plain point cloud, without drop lines.
# The columns are looked up inside `data` with with() instead of attach():
# `pokemon` is never created in this notebook (the CSV is loaded into
# `data`), and attach() leaves the columns on the search path for every
# later chunk.
with(data, scatterplot3d(Total, Defense, Pr_Male, main = "3D Scatterplot"))
```

# Correlation
```{r}
library(gclus)

# Pairs plot of columns 5:8 and 16, ordered and coloured by absolute
# correlation. Rows containing an NA are removed first.
# `pokemon` is never created in this notebook; the CSV is loaded into
# `data`, so the clean copy is built from that.
cleanPoke <- na.omit(data)
dta <- cleanPoke[c(5:8, 16)] # data, selected by column number
dta.r <- abs(cor(dta)) # absolute correlation matrix
#dta.r[is.na(dta.r)] <- 0.5
dta.col <- dmat.color(dta.r) # panel colours
dta.o <- order.single(dta.r) # variable ordering
cpairs(
  dta, dta.o,
  panel.colors = dta.col, gap = .5,
  main = "Variables Ordered and Colored by Correlation"
)
```
It may be worth exploring how the correlation changes depending on what value is substituted for the Pr_Male NA values when using `dta.r[is.na(dta.r)] <- 0.5`. Below is the correlation just between columns 5:8, for additional clarity.

```{r}
library(gclus)

# Pairs plot of columns 5:8 only, ordered and coloured by absolute
# correlation. No rows are removed here.
# `pokemon` is never created in this notebook; the CSV is loaded into
# `data`, so the columns are taken from that.
#cleanPoke <- na.omit(data)
dta <- data[c(5:8)] # data, selected by column number
dta.r <- abs(cor(dta)) # absolute correlation matrix
dta.col <- dmat.color(dta.r) # panel colours
dta.o <- order.single(dta.r) # variable ordering
cpairs(
  dta, dta.o,
  panel.colors = dta.col, gap = .5,
  main = "Variables Ordered and Colored by Correlation"
)
```
363 changes: 363 additions & 0 deletions Scatters.nb.html

Large diffs are not rendered by default.

Loading