-
Notifications
You must be signed in to change notification settings - Fork 0
/
Chapter 2: Statistical learning and predicting SSPL using linear regression
133 lines (122 loc) · 6.08 KB
/
Chapter 2: Statistical learning and predicting SSPL using linear regression
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Chapter 2: Statistical learning and model adequacy
##%-----------------------------------------------------------------------------------------------------
# 1. Read the data (airfoil data set)
#%----------------------------------------------------------------------------------------------------
airfoil <- read.csv(’C:/Users/MAMSL/Downloads/AirfoilSelfNoise.csv’)
airfoil
dim(airfoil) # n = 1503 observations, p= 5 predictors.
#%-----------------------------------------------------------------------------------------------------
# 1. Create a training set and test set:
#%----------------------------------------------------------------------------------------------------
set.seed(123)
h <- runif(nrow(airfoil)) # generate n numbers from some uniform distribution
airfoilr<- airfoil[order(h),] # randomized training set
dim(airfoilr)
trains <- airfoilr[1:1202,] #training set
tests <- airfoilr[1203:1503,] #testing set
dim(tests) # 301 x 6
dim(trains) # 1202 x 6
#%----------------------------------------------------------------------------
#Graphical training set example 2.1.2
#%----------------------------------------------------------------------------
#For example 2.1.2 , we take p=1 and n= 50 from the training set.
train_example <- trains[,5:6] # example training set
plot(train_example[1:50,],
main=’Scaled sound pressure level vs Suction side displacement thickness’, color=’blue’,
pch=20, xlab= ’delta (in dB)’ ,ylab=’SSPL (in m)’)
dim(train_example) # n=1202 p=2
#%--------------------------------------------------------------------------
# 2.Linear regression (Without scaling)
#%--------------------------------------------------------------------------
# fitting linear model on training set called trains
model_airfoil <- lm(SSPL~ f+alpha+c+U_infinity+delta, data =trains )
# linear regression summary
summary(model_airfoil)
par(mfrow=c(2,2)) # 2 by 2 window
plot(model_airfoil) # Give graphs to verify the three assumptions of a good linear model.
# Training error
MSE_train = mean((trains$SSPL- model_airfoil$fitted.values)^2)
MSE_train # MSE_train = 23.47078
#Other possible manner to compute the training error is using the
#regression residuals and averaging them
n <- dim(trains)[1]
d <- sum(model_airfoil$residuals^2)/n
d # d = 23.47078 Same results as before
#Testing error
par(mfrow=c(1,1))
input_test <- tests[,1:5] # separate inputs from the output SSPL in testing set.
T_hat <- predict(model_airfoil,input_test) # prediction on input test data by the linear fitted model
Y_test <- tests$SSPL # output values SSPL on test set
MSE_test <- mean((T_hat-tests$SSPL)^2) # MSE_test computed
MSE_test # MSE_test = 21.33534
#---------------------------------------------------------------------------------------
# Plot the predicted values vs the actual values of the test sets
#---------------------------------------------------------------------------------------
plot(T_hat,Y_test, col= ifelse(T_hat == T_hat[25], "green", "black"),
pch=ifelse(T_hat == T_hat[25], 19, 1),
cex=ifelse(T_hat == T_hat[25], 1.4, 1),xlab = ’Predicted output value (T_hat)’,
ylab= ’True test output value (y)’, main = ’True output values vs predicted values on
the testing set for the same inputs ’)
line <- abline(0,1, col=’red’) # line T(x)=x giving some indication of relative distance between predicted values vs actual values
identify(T_hat,Y_test) # identify some points close to the line and check their output values afterwards.
Y_test[297] # y= 118.084
T_hat[297] # T_hat= 117.9099
Y_test[25] # y= 122.845
T_hat[25] # T_hat= 122.8342
# Very close observations indeed for the 25th predicted output.
#--------------------------------------------------------------------------------------------------------------
# Simulation: Bias-Variance Trade-off
#-----------------------------------------------------------------------------------------------------------------
# define function
n <- 100
x <- seq(0, 1, length= n)
Tx <- sin(2 * pi * x)
Tx
# generate noisy data
set.seed(1)
y <- Tx + rnorm(n,mean = 0, sd = 0.3)
# Divide between training and testing outputs
datasimul <- data.frame(x,y) # Create data.frame
train_simul <- datasimul[1:80,] #training set
test_simul <- datasimul[81:100,] # Testing set
# plot data and T(x)
plot(train_simul)
lines(x, Tx, lwd = 2) # T(x)
legend("topright", legend = "T(x)", lty = 1, lwd = 2, bty = "n")
# Linear model on data
ml <- lm(y~x ,data = train_simul)
summary(ml)
abline(ml, col=’red’)
legend("topright", legend = c(’T(x)’,"Linear fitted line",
’Smoothing spline 1’,’Smoothing spline 2’), c(lty = 1,lty=1,lty=1,lty=2) , lwd=2,
col=c(’black’,’red’,’green4’,’navy’))
# Non linear fitting (smoothing spline 1)
non_linear_fit <- smooth.spline(train_simul)
non_linear_fit
lines(x[1:80], non_linear_fit$y, lty = 1, col = ’green4’, lwd = 2)
pred_ss1 <- predict(non_linear_fit,x[1:80])
# Prediction on training inputs of the red smoothing spline
# Non linear fitting (smoothing spline 2)
non_linear_fit2 <- smooth.spline(train_simul,spar = 0.0000001)
non_linear_fit2
lines(x[1:80], non_linear_fit2$y, lty = 1, col = ’navy’, lwd = 2)
pred_ss2 <- predict(non_linear_fit2,x[1:80]) # Prediction on training inputs
of the blue smoothing spline
# training error 3 models
MSE_train = mean((train_simul$y- ml$fitted.values)^2) # training error for linear model
MSE_train # MSE_train = 0.2417237
MSE_train_ss1 = mean((train_simul$y-pred_ss1$y)^2) #training error for smoothing spline 1
MSE_train_ss1 # 0.07083181
MSE_train_ss2 <- mean((train_simul$y-pred_ss2$y)^2) # training error for smoothing spline 2
MSE_train_ss2 # 0.02419351
# Testing error 3 models
# new smoothing spline prediction on test inputs:
pred_ss1_test <- predict(non_linear_fit,x[81:100])
pred_ss2_test <- predict(non_linear_fit2,x[81:100])
ml_testr <- lm(y~x, data= test_simul)
MSE_test = mean((test_simul$y- ml_testr$fitted.values)^2) # for linear model
MSE_test # MSE_test = 0.06327598
MSE_test_ss1 = mean((test_simul$y-pred_ss1_test$y)^2) #testing error for smoothing spline 1
MSE_test_ss1 # 1.238867
MSE_test_ss2 <- mean((test_simul$y-pred_ss2_test$y)^2) # testing error for smoothing spline 2
MSE_test_ss2 # 59.10941