-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path05-05-ML-Overfitting.Rmd
107 lines (81 loc) · 3.56 KB
/
05-05-ML-Overfitting.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
## Overfitting {-}
```{r f1, comment=NA, message=FALSE, warning=FALSE, fig.width=8, fig.height=6}
library(pander)
library(e1071)
## Digits of pi as a numeric vector: the leading 3 followed by its first 500
## decimal places, i.e. 501 values in total (18 lines of 27 plus a final 15).
## The name dta is reused further down for the data frame built from it.
dta = c(3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3, 2, 3, 8, 4, 6, 2, 6, 4, 3, 3, 8,
3, 2, 7, 9, 5, 0, 2, 8, 8, 4, 1, 9, 7, 1, 6, 9, 3, 9, 9, 3, 7, 5, 1, 0, 5, 8, 2,
0, 9, 7, 4, 9, 4, 4, 5, 9, 2, 3, 0, 7, 8, 1, 6, 4, 0, 6, 2, 8, 6, 2, 0, 8, 9, 9,
8, 6, 2, 8, 0, 3, 4, 8, 2, 5, 3, 4, 2, 1, 1, 7, 0, 6, 7, 9, 8, 2, 1, 4, 8, 0, 8,
6, 5, 1, 3, 2, 8, 2, 3, 0, 6, 6, 4, 7, 0, 9, 3, 8, 4, 4, 6, 0, 9, 5, 5, 0, 5, 8,
2, 2, 3, 1, 7, 2, 5, 3, 5, 9, 4, 0, 8, 1, 2, 8, 4, 8, 1, 1, 1, 7, 4, 5, 0, 2, 8,
4, 1, 0, 2, 7, 0, 1, 9, 3, 8, 5, 2, 1, 1, 0, 5, 5, 5, 9, 6, 4, 4, 6, 2, 2, 9, 4,
8, 9, 5, 4, 9, 3, 0, 3, 8, 1, 9, 6, 4, 4, 2, 8, 8, 1, 0, 9, 7, 5, 6, 6, 5, 9, 3,
3, 4, 4, 6, 1, 2, 8, 4, 7, 5, 6, 4, 8, 2, 3, 3, 7, 8, 6, 7, 8, 3, 1, 6, 5, 2, 7,
1, 2, 0, 1, 9, 0, 9, 1, 4, 5, 6, 4, 8, 5, 6, 6, 9, 2, 3, 4, 6, 0, 3, 4, 8, 6, 1,
0, 4, 5, 4, 3, 2, 6, 6, 4, 8, 2, 1, 3, 3, 9, 3, 6, 0, 7, 2, 6, 0, 2, 4, 9, 1, 4,
1, 2, 7, 3, 7, 2, 4, 5, 8, 7, 0, 0, 6, 6, 0, 6, 3, 1, 5, 5, 8, 8, 1, 7, 4, 8, 8,
1, 5, 2, 0, 9, 2, 0, 9, 6, 2, 8, 2, 9, 2, 5, 4, 0, 9, 1, 7, 1, 5, 3, 6, 4, 3, 6,
7, 8, 9, 2, 5, 9, 0, 3, 6, 0, 0, 1, 1, 3, 3, 0, 5, 3, 0, 5, 4, 8, 8, 2, 0, 4, 6,
6, 5, 2, 1, 3, 8, 4, 1, 4, 6, 9, 5, 1, 9, 4, 1, 5, 1, 1, 6, 0, 9, 4, 3, 3, 0, 5,
7, 2, 7, 0, 3, 6, 5, 7, 5, 9, 5, 9, 1, 9, 5, 3, 0, 9, 2, 1, 8, 6, 1, 1, 7, 3, 8,
1, 9, 3, 2, 6, 1, 1, 7, 9, 3, 1, 0, 5, 1, 1, 8, 5, 4, 8, 0, 7, 4, 4, 6, 2, 3, 7,
9, 9, 6, 2, 7, 4, 9, 5, 6, 7, 3, 5, 1, 8, 8, 5, 7, 5, 2, 7, 2, 4, 8, 9, 1, 2, 2,
7, 9, 3, 8, 1, 8, 3, 0, 1, 1, 9, 4, 9, 1, 2)
## Create 5 predictors, x1..x5, from lagged values of the digit series:
## xk holds the digit that sits k positions before y in the same row.
## The first k rows of xk have no such digit and are left as NA.
dta <- data.frame(y = dta)
for (k in 1:5) {
  ## Shift y down by k rows: pad k NAs on top, then cut back to the number
  ## of rows so the column length always matches the data frame.
  dta[[paste0("x", k)]] <- head(c(rep(NA_real_, k), dta$y), nrow(dta))
}
## Show the first rows so the NA padding of the lags is visible
head(dta)
## Keep rows 6 to 500 only: the first five rows lack one or more lagged
## values, and nothing past row 500 is used. 495 rows remain.
dta <- dta[seq(6, 500), ]
## Treat every column (the response y and all lags) as a categorical digit
dta[] <- lapply(dta, function(digit_col) factor(digit_col))
## Split in series order: the first 475 rows train the model,
## the remaining 20 rows (476 to 495) are held out for testing
train <- dta[seq_len(475), ]
test <- dta[seq(476, 495), ]
## Tune an SVM model: grid-search cost and gamma with tune(), plot the
## tuning result, then refit on the training set with the selected pair.
## NOTE(review): the gamma grid starts at 0; with svm()'s default radial
## kernel that would presumably make the kernel constant -- confirm that
## including 0 is intended.
## NOTE(review): tune() presumably assigns cross-validation folds at random
## (its default tune.control), so the selected pair may differ between runs;
## consider fixing a seed upstream if the output must be reproducible.
mdl.svm <- tune(svm, y ~ ., data = train,
                ranges = list(
                  cost = seq(1, 20, 2),
                  gamma = seq(0, 1, .1)
                ))
plot(mdl.svm)
## best.parameters is a one-row data frame with one column per tuned
## parameter. Extract by name so each value is a plain numeric scalar and
## does not depend on the order of the entries in `ranges`.
best.cost <- mdl.svm$best.parameters$cost
best.gamma <- mdl.svm$best.parameters$gamma
## Refit with the chosen parameters; probability = TRUE fits the extra
## probability model that predict(..., probability = TRUE) relies on
mdl.svm <- svm(y ~ ., data = train, cost = best.cost, gamma = best.gamma,
               probability = TRUE)
## Score the held-out rows with the refitted model (class probabilities
## are requested alongside the predicted digit)
tmp <- predict(mdl.svm, test, probability = TRUE)
## Pair each actual digit with its prediction
results.svm <- data.frame(actual = test$y, predicted = tmp)
## Result is TRUE where the prediction matches the actual digit and FALSE
## everywhere else (a comparison that yields NA also counts as FALSE)
results.svm$Result <- (results.svm$actual == results.svm$predicted) %in% TRUE
## Predict the training set and flag every row the model gets right
train.results.svm <- data.frame(actual = train$y, pred = predict(mdl.svm, train))
train.results.svm$Result <-
  (train.results.svm$actual == train.results.svm$pred) %in% TRUE

## Shares of incorrect and correct predictions, always returned in that
## order as c(incorrect, correct), rounded to 3 decimals.
## Both outcomes are counted explicitly, so when one of them never occurs
## its share is 0 in the correct slot (tabulating the flags and padding a
## missing cell by position can attach the lone value to the wrong label).
## The denominator is the number of rows scored rather than a fixed count.
outcome_shares <- function(is_correct) {
  round(c(sum(!is_correct), sum(is_correct)) / length(is_correct), 3)
}
## x: training-set shares, y: test-set shares
x <- outcome_shares(train.results.svm$Result)
y <- outcome_shares(results.svm$Result)
## Gather the four shares into one named list: training first, then test,
## each as incorrect followed by correct
results <- setNames(
  as.list(c(x[1], x[2], y[1], y[2])),
  c("Train.Incorrect", "Train.Correct", "Test.Incorrect", "Test.Correct")
)
## Print the list via pander; split.tables = Inf is passed so the output is
## not split by width
pandoc.table(results, split.tables = Inf)
```