From 2eabbf5f2f6a1b90c43e947e6473dc14451768a8 Mon Sep 17 00:00:00 2001
From: George Fisher <george@georgefisher.com>
Date: Fri, 19 Jun 2015 14:45:18 -0400
Subject: [PATCH] Single hidden-layer neural network

---
 nnet/nnet_benchmark_fit.Rmd | 109 +++++++++++++++
 nnet/nnet_cv.Rmd            | 259 ++++++++++++++++++++++++++++++++++++
 2 files changed, 368 insertions(+)
 create mode 100644 nnet/nnet_benchmark_fit.Rmd
 create mode 100644 nnet/nnet_cv.Rmd

diff --git a/nnet/nnet_benchmark_fit.Rmd b/nnet/nnet_benchmark_fit.Rmd
new file mode 100644
index 0000000..d631c76
--- /dev/null
+++ b/nnet/nnet_benchmark_fit.Rmd
@@ -0,0 +1,109 @@
+---
+title: "nnet benchmark fit"
+author: "George Fisher"
+date: "June 16, 2015"
+output: 
+  pdf_document: 
+    toc: yes
+    toc_depth: 1
+---
+
+#SETUP
+
+```{r setup, message=FALSE}
+library(nnet)
+
+library(plyr)
+library(caret)
+library(psych)
+library(pryr)
+library(ggplot2)
+library(foreach)
+library(doParallel)
+library(readr)
+library(data.table)
+
+rm(list = setdiff(ls(), lsf.str())) # clear variables, leave functions
+ptm <- proc.time()                  # start timer
+opar = par(no.readonly=TRUE)
+
+# ############################ PARAMETER SETUP ##################################
+# ===============================================================================
+
+deskewed = FALSE                        # deskewed (TRUE) or original (FALSE)
+source("load_TrainTest.R")              # load the data
+
+best_M        = 120
+best_maxit    = 250
+best_decay    = 0.5
+
+# ===============================================================================
+# ############################ PARAMETER SETUP ##################################
+# ################################## END ########################################
+
+# calculate the length of the Wts vector
+# ======================================
+num.Wts = function(p, M, K) {
+    # returns the length of the Wts vector
+    #
+    # p = ncol(X) # number of predictor variables
+    # M = size    # number of hidden units
+    # K = 1       # number of classes
+    return ((p + 1) * M + K * (M + 1))
+}
+p = ncol(trainX) # number of predictor variables
+K = 10           # x, y input the number of output classes
+```
+
+#TRAIN WITH THE FULL TRAINING DATASET   
+
+```{r fit_best, message=FALSE}
+nnet.fit = nnet(x=trainX, y=class.ind(trainY),
+                softmax=TRUE,
+                size=best_M, decay=best_decay, maxit=best_maxit,
+                bag=TRUE, MaxNWts=num.Wts(p, best_M, K), 
+                Wts=runif (num.Wts(p, best_M, K), -0.5, 0.5),
+                trace=TRUE)
+```
+
+#FIT THE TEST DATASET
+
+```{r pred_test,message=FALSE}
+nnet.pred = predict(nnet.fit, newdata=testX, type="class")
+
+(matrix = table(actual    = as.character(testY),
+                predicted = nnet.pred))
+
+# tr() expects a square matrix
+# if predict() does not produce all values 0...9 an error is thrown
+correct.entries = 0
+tr_error = tryCatch({
+    correct.entries = tr(matrix)
+    }, warning = function(w) {
+        #warning-handler-code
+        }, error = function(e) {
+            print(e)
+            }, finally = {
+                #cleanup code
+                })
+(model.accuracy  = correct.entries / sum(matrix))
+(model.misclass  = 1 - model.accuracy)
+
+# which were the hardest to detect?
+# =================================
+if (correct.entries > 0) {
+    results = data.frame(number=numeric(0), percent=numeric(0))
+    for (i in seq(from=0,to=9)){
+         results[nrow(results)+1,] = c(i, round(prop.table(matrix,1),digits=3)[i+1,i+1])
+    }
+    
+    results[nrow(results)+1,] = c(100,model.accuracy)
+    print(arrange(results,percent))
+}
+
+# run time
+run_time = proc.time() - ptm
+print(paste("elapsed minutes",round(run_time[3]/60,digits=2),
+             "; elapsed hours",round(run_time[3]/(60*60),digits=2)))
+```
+
diff --git a/nnet/nnet_cv.Rmd b/nnet/nnet_cv.Rmd
new file mode 100644
index 0000000..e7ef0e3
--- /dev/null
+++ b/nnet/nnet_cv.Rmd
@@ -0,0 +1,259 @@
+---
+title: "nnet cross validation"
+author: "George Fisher"
+date: "June 17, 2015"
+output: 
+  pdf_document: 
+    toc: yes
+    toc_depth: 1
+---
+
+#SETUP
+
+```{r setup, message=FALSE}
+library(nnet)
+
+library(psych)
+library(plyr)
+library(caret)
+library(pryr)
+library(ggplot2)
+library(foreach)
+library(doParallel)
+library(readr)
+library(data.table)
+library(NMF)
+library(RColorBrewer)
+
+rm(list = setdiff(ls(), lsf.str())) # clear variables, leave functions
+ptm <- proc.time()                  # start timer
+opar = par(no.readonly=TRUE)
+
+# ############################ PARAMETER SETUP ##################################
+# ===============================================================================
+
+deskewed = TRUE                        # deskewed (TRUE) or original (FALSE)
+source("../load_TrainTest.R")           # load the data
+
+k = 5                                   # k=5 fold CV
+fold.idx = createFolds(trainY, k = k)   # the indexes of the hold-out folds
+
+registerDoParallel(cores=k)             # each fold gets its own core
+
+if (deskewed) {
+    cv_filename = "nnet_cv_deskewed.txt"
+} else {
+    cv_filename = "nnet_cv_original.txt"
+}
+
+# start a new file 
+# ------> comment out to append to an existing file <------
+# ========================================================================
+header="fold,M,maxit,decay,avg.misclass,avg.mem_used,parms,date,elapsed"
+write.table(header, file=cv_filename, quote=FALSE, sep="", append=FALSE,
+            row.names = FALSE, col.names = FALSE)
+# ========================================================================
+
+params = expand.grid(M = 40, maxit = c(250,300,400), decay = 1)
+
+#params = expand.grid(M = 5, maxit = 50, decay = 0.5)   # for script testing only
+
+# ===============================================================================
+# ############################ PARAMETER SETUP ##################################
+# ################################## END ########################################
+
+# calculate the length of the Wts vector
+# ======================================
+num.Wts = function(p, M, K) {
+    # returns the length of the Wts vector
+    #
+    # p = ncol(X) # number of predictor variables
+    # M = size    # number of hidden units
+    # K = 1       # number of classes
+    return ((p + 1) * M + K * (M + 1))
+}
+p = ncol(trainX) # number of predictor variables
+K = 10           # x, y input the number of output classes
+```
+
+#CROSS VALIDATION
+
+```{r cv, message=FALSE}
+# results of each k-fold CV loop
+# ==============================
+cv_results = data.frame(M=numeric(0),
+                        maxit=numeric(0),
+                        decay=numeric(0),
+                        avg.misclass=numeric(0),
+                        avg.mem_used=numeric(0),
+                        parms=character(0))
+
+# ================================= k-fold CV loop =================================
+for (i in 1:nrow(params)) {
+    # for each set of parameters ...
+    M     = params[i,"M"]
+    maxit = params[i,"maxit"]
+    decay = params[i,"decay"]
+    
+    # ... run k-fold CV, each fold on a separate processor
+    results = foreach (fold = seq(k), .combine = 'rbind', .inorder = FALSE) %dopar% {
+        fold_start = proc.time()
+        
+        idx = fold.idx[[fold]]
+        # fit 4/5 of the training data
+        nnet.fit = nnet(x = trainX[-idx,], y = class.ind(trainY[-idx]),
+                        softmax = TRUE,
+                        size = M, decay = decay, maxit = maxit,
+                        bag = TRUE, MaxNWts = num.Wts(p, M, K),#  #MaxNWts=1000000,
+                        Wts = runif (num.Wts(p, M, K),-0.5, 0.5),
+                        trace = FALSE)
+        # predict 1/5 of the training data
+        nnet.pred = predict(nnet.fit, newdata = trainX[idx,],type = "class")
+        matrix = table(actual    = as.character(trainY[idx]),
+                       predicted = nnet.pred)
+        # tr() expects a square matrix
+        # if predict() does not produce all values 0...9 an error is thrown
+        model.misclass = tryCatch({
+                            # code to try
+                            # 'tryCatch()' will return the last evaluated expression 
+                            # in case the "try" part was completed successfully
+                            1 - (tr(matrix) / sum(matrix))
+                            }, 
+                            warning = function(w) {
+                                # warning-handler-code 
+                                print(w)
+                                print(matrix)
+                                return(1.0)
+                                }, 
+                            error = function(e) {
+                                # error-handler-code
+                                print(e)
+                                print(matrix)
+                                return(1.0)
+                                }, 
+                            finally = { 
+                                # NOTE:
+                                # Here goes everything that should be executed at the end,
+                                # regardless of success or error. 
+                                } )
+        
+        # write data to disk for later evaluation
+        fold_row = c(fold,  M, maxit, decay, model.misclass, mem_used())
+        frt      = proc.time() - fold_start    # fold run time
+        frte     = as.numeric(frt["elapsed"])  # fold run time elapsed
+        write.table(matrix(c(fold_row, 
+                             paste0("M",M,"it",maxit,"d",decay), 
+                             date(), frte), ncol=9, nrow=1),
+                    file=cv_filename, 
+                    append=TRUE, quote=TRUE, sep=",",
+                    row.names = FALSE, col.names = FALSE)
+        # each fold outputs one row of "results"
+        fold_row
+    }
+
+    # after each k-fold CV save the information in a data.frame
+    cv_results[nrow(cv_results)+1,] = c(M, maxit, decay, 
+                                        mean(as.numeric(results[,5])), # model.misclass
+                                        mean(as.numeric(results[,6])), # mem_used()
+                                        paste0("M",M,"it",maxit,"d",decay))
+}
+# ================================= k-fold CV loop =================================
+
+cv_results[,1:5] = sapply(cv_results[,1:5],as.numeric)
+print(cv_results)
+
+# find the parameters that produced the lowest average misclassification rate
+# ===========================================================================
+best_cv_row   = which.min(cv_results$avg.misclass)
+best_results  = cv_results[best_cv_row,]
+best_M        = best_results$M
+best_maxit    = best_results$maxit
+best_decay    = best_results$decay
+best_misclass = best_results$avg.misclass
+```
+
+
+```{r cv_runtime}
+# run time
+run_time = proc.time() - ptm
+print(paste(
+    "elapsed minutes",round(run_time[3] / 60,digits = 2),
+    "; elapsed hours",round(run_time[3] / (60 * 60),digits = 2),
+    "; user/elapsed",round((run_time[1]+run_time[4])/run_time[3],digits=0)
+))
+mem_range = prettyNum(range(cv_results$avg.mem_used),big.mark=",",scientific=FALSE)
+print(paste("Range of R memory usage",mem_range[1],":",mem_range[2]))
+```
+
+#TRAIN WITH THE FULL TRAINING DATASET   
+using the best parameters found in the CV steps
+
+```{r fit_best, message=FALSE}
+# fit the full training dataset
+# with the best parameters found by CV
+nnet.fit = nnet(x=trainX, y=class.ind(trainY),
+                softmax=TRUE,
+                size=best_M, decay=best_decay, maxit=best_maxit,
+                bag=TRUE, MaxNWts=num.Wts(p, best_M, K), 
+                Wts=runif (num.Wts(p, best_M, K), -0.5, 0.5),
+                trace=TRUE)
+```
+
+#FIT THE TEST DATASET
+
+```{r pred_test,message=FALSE}
+# get the specific class predictions
+nnet.pred = predict(nnet.fit, newdata=testX, type="class")
+
+matrix = table(actual    = as.character(testY),
+               predicted = nnet.pred)
+
+# heatmap of the range of probabilities
+aheatmap(prop.table(matrix,margin=1), Rowv=NA, Colv=NA)
+
+confusionMatrix(data=nnet.pred, reference=as.character(testY))
+
+# tr() expects a square matrix
+# if predict() does not produce all values 0...9 an error is thrown
+correct.entries = tryCatch({
+                    # code to try
+                    # 'tryCatch()' will return the last evaluated expression 
+                    # in case the "try" part was completed successfully
+                    tr(matrix)
+                    }, 
+                    warning = function(w) {
+                        # warning-handler-code
+                        print(w)
+                        return(0.0)
+                        }, 
+                    error = function(e) {
+                        # error-handler-code
+                        print(e)
+                        return(0.0)
+                        }, 
+                    finally = {
+                        # NOTE:
+                        # Here goes everything that should be executed at the end,
+                        # regardless of success or error. 
+                        } )
+(model.accuracy  = correct.entries / sum(matrix))
+(model.misclass  = 1 - model.accuracy)
+
+# which were the hardest to detect?
+# =================================
+if (correct.entries > 0) {
+    results = data.frame(number=numeric(0), percent=numeric(0))
+    for (i in seq(from=0,to=9)){
+         results[nrow(results)+1,] = c(i, round(prop.table(matrix,1),digits=3)[i+1,i+1])
+    }
+    
+    results[nrow(results)+1,] = c(100,model.accuracy)
+    print(arrange(results,percent))
+}
+
+# run time
+run_time = proc.time() - ptm
+print(paste("elapsed minutes",round(run_time[3]/60,digits=2),
+             "; elapsed hours",round(run_time[3]/(60*60),digits=2)))
+```
+