Project initiation

grfiv · Jun 19, 2015 · 73f36b4 · 73f36b4
commit 73f36b4
Show file tree

Hide file tree

Showing 6 changed files with 619 additions and 0 deletions.
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) [year] [fullname]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MNIST.pdf b/MNIST.pdf
diff --git a/README.md b/README.md
@@ -0,0 +1,8 @@
+# Analyses of MNIST database of handwritten digits
+
+See **MNIST.pdf** in the repo for the documentation. As of June 2015 the project is just getting underway.
+
+The US Post Office’s desire to automate the routing of mail by handwritten ZIP code motivated the
+creation of the MNIST database of handwritten digits. The database contains 60,000 training images and 10,000 testing images, each a 28 × 28 grayscale image of a single digit, and is widely used for benchmarking machine learning algorithms, the best of which is reported to have achieved a 0.23% misclassification error rate using convolutional neural networks. Kaggle has a training contest using a variation of the MNIST database. 
+
+This project is my attempt to beat the benchmarks set for the various algorithms described in the 1998 paper by LeCun et al., plus a few other algorithms of interest to me, as well as my own results on Kaggle.
diff --git a/deskew.ipynb b/deskew.ipynb
diff --git a/load_TrainTest.R b/load_TrainTest.R
@@ -0,0 +1,34 @@
+# Load the MNIST training and test sets from CSV files and shuffle them.
+#
+# Reads (global):   deskewed - optional flag; when true the deskewed image
+#                   files are loaded, otherwise the original ones.
+# Creates (global): trainX, testX - image tables as returned by fread()
+#                   trainY, testY - label vectors taken from column V1
+set.seed(1009)  # fixed seed so the shuffles below are reproducible
+library(data.table)
+library(caret)
+
+# `deskewed` is expected to be set by whoever sources this script; fall back
+# to the original images instead of failing with "object not found".
+if (!exists("deskewed")) {
+    deskewed <- FALSE
+}
+
+# read the data from the csv files
+# ################################
+if (deskewed) {
+    trainX <- fread(input = '../data/train-images_deskewed.csv',
+                    sep = ",", header = FALSE, verbose = FALSE)
+    testX  <- fread(input = '../data/t10k-images_deskewed.csv',
+                    sep = ",", header = FALSE, verbose = FALSE)
+    print("deskewed data loaded")
+} else {
+    trainX <- fread(input = '../data/train-images.csv',
+                    sep = ",", header = FALSE, verbose = FALSE)
+    testX  <- fread(input = '../data/t10k-images.csv',
+                    sep = ",", header = FALSE, verbose = FALSE)
+    print("original data loaded")
+}
+
+# labels are read as a one-column table and flattened to plain vectors
+trainY <- read.table(file = '../data/train-labels.csv', sep = "", header = FALSE)
+testY  <- read.table(file = '../data/t10k-labels.csv',  sep = "", header = FALSE)
+
+trainY <- as.vector(trainY$V1)
+testY  <- as.vector(testY$V1)
+
+# images and labels are shuffled with the same permutation below, so a
+# count mismatch would silently misalign them; fail loudly instead
+stopifnot(nrow(trainX) == length(trainY),
+          nrow(testX)  == length(testY))
+
+# shuffle the data to help any CV process
+# #######################################
+train.shuffle <- sample(nrow(trainX))
+trainX <- trainX[train.shuffle, ]
+trainY <- trainY[train.shuffle]
+
+test.shuffle <- sample(nrow(testX))
+testX <- testX[test.shuffle, ]
+testY <- testY[test.shuffle]
+
+# drop the temporary permutation vectors from the workspace
+rm(train.shuffle, test.shuffle)
diff --git a/load_mnist.R b/load_mnist.R
@@ -0,0 +1,55 @@
+# see https://gist.github.com/brendano/39760
+
+# Load the MNIST digit recognition dataset into R
+# http://yann.lecun.com/exdb/mnist/
+# assume you have all 4 files and gunzip'd them
+# creates train$n, train$x, train$y  and test$n, test$x, test$y
+# e.g. train$x is a 60000 x 784 matrix, each row is one digit (28x28)
+# call:  show_digit(train$x[5,])   to see a digit.
+# brendan o'connor - gist.github.com/39760 - anyall.org
+
+# Load the four gunzip'd MNIST IDX files from the 'data' directory.
+#
+# Takes no arguments. As a side effect it creates two global lists:
+#   train, test - each with $n (image count read from the file header),
+#                 $x (matrix with one image per row) and $y (label vector).
+load_mnist <- function() {
+    # Read an image file: three big-endian 4-byte header integers (count,
+    # rows, cols) follow the leading 4-byte field, then one unsigned byte
+    # per pixel. Returns list(n = count, x = count x (rows*cols) matrix).
+    load_image_file <- function(filename) {
+        f <- file(filename, 'rb')
+        # close the connection even if a read below fails
+        on.exit(close(f), add = TRUE)
+        readBin(f, 'integer', n = 1, size = 4, endian = 'big')  # leading field is skipped
+        n_images <- readBin(f, 'integer', n = 1, size = 4, endian = 'big')
+        n_rows   <- readBin(f, 'integer', n = 1, size = 4, endian = 'big')
+        n_cols   <- readBin(f, 'integer', n = 1, size = 4, endian = 'big')
+        pixels <- readBin(f, 'integer', n = n_images * n_rows * n_cols,
+                          size = 1, signed = FALSE)
+        # byrow = TRUE: consecutive pixels in the file fill one matrix row
+        list(n = n_images,
+             x = matrix(pixels, ncol = n_rows * n_cols, byrow = TRUE))
+    }
+    # Read a label file: a skipped 4-byte field, a big-endian 4-byte count,
+    # then one unsigned byte per label. Returns the labels as a vector.
+    load_label_file <- function(filename) {
+        f <- file(filename, 'rb')
+        # close the connection even if a read below fails
+        on.exit(close(f), add = TRUE)
+        readBin(f, 'integer', n = 1, size = 4, endian = 'big')  # leading field is skipped
+        n_labels <- readBin(f, 'integer', n = 1, size = 4, endian = 'big')
+        readBin(f, 'integer', n = n_labels, size = 1, signed = FALSE)
+    }
+    # `<<-` is deliberate: the documented interface is the global
+    # `train` and `test` lists, not a return value.
+    train <<- load_image_file('data/train-images.idx3-ubyte')
+    test <<- load_image_file('data/t10k-images.idx3-ubyte')
+
+    train$y <<- load_label_file('data/train-labels.idx1-ubyte')
+    test$y <<- load_label_file('data/t10k-labels.idx1-ubyte')
+}
+
+
+# Draw one digit with image().
+#   arr784 - pixel values, reshaped column-wise into a matrix with 28 rows
+#   col    - colour ramp handed to image(); defaults to a 12-step gray scale
+#   ...    - further arguments forwarded to image()
+show_digit <- function(arr784, col=gray(12:1/12), ...) {
+    pixels <- matrix(arr784, nrow = 28)
+    # reverse the column order before plotting
+    reversed <- pixels[, 28:1]
+    image(reversed, col = col, ...)
+}
+
+# Plot up to 16 consecutive digits in a 4x4 grid.
+#   starting_at - index of the first row of X to draw
+#   X           - images, one per row (defaults to the global trainX)
+#   Y           - labels, used as plot titles (defaults to the global trainY)
+# The previous graphics parameters are restored when the function exits,
+# including when a plot call fails part-way through.
+print_16 <- function(starting_at=1, X=trainX, Y=trainY) {
+    opar <- par(no.readonly = TRUE)
+    on.exit(par(opar), add = TRUE)
+    par(mfrow = c(4, 4))
+
+    indices <- seq(from = starting_at, length.out = 16)
+    # do not run past the last image when fewer than 16 remain
+    n_rows <- nrow(X)
+    if (!is.null(n_rows)) {
+        indices <- indices[indices <= n_rows]
+    }
+
+    for (i in indices) {
+        show_digit(matrix(as.numeric(X[i, ]), 28, 28),
+                   main = Y[i],
+                   xlab = paste("index", i))
+    }
+    invisible(NULL)
+}