From 752a50d98d5c7b72ba24d4675f1c5bf3de11bf8e Mon Sep 17 00:00:00 2001
From: ericphanson <5846501+ericphanson@users.noreply.github.com>
Date: Mon, 14 Dec 2020 17:08:12 +0100
Subject: [PATCH 1/7] WIP MNIST example

---
 .gitignore                |   1 +
 docs/Project.toml         |   9 ++
 docs/src/mnist_example.jl | 239 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 docs/src/mnist_example.jl

diff --git a/.gitignore b/.gitignore
index e03551c..715e883 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ docs/build/
 docs/site/
 .DS_Store
 tmpdir/*
+docs/logs
diff --git a/docs/Project.toml b/docs/Project.toml
index 47fabd2..f814317 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,6 +1,15 @@
 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+Lighthouse = "ac2c24cd-07f0-4848-96b2-1b82c3ea0e59"
 LighthouseFlux = "56a5d6c5-c9a8-4db3-ae3d-7c3fdb50c563"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
+MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f"
 
 [compat]
 Documenter = "0.25"
diff --git a/docs/src/mnist_example.jl b/docs/src/mnist_example.jl
new file mode 100644
index 0000000..cb025bb
--- /dev/null
+++ b/docs/src/mnist_example.jl
@@ -0,0 +1,239 @@
+# from https://github.com/FluxML/model-zoo/blob/b4732e5a3158391f2fd737470ff63986420e42cd/vision/mnist/conv.jl
+
+# Classifies MNIST digits with a convolutional network.
+# Returns the trained model parameters rather than saving them to disk.
+# Demonstrates basic model construction, training,
+# conditional early-exit, and learning rate scheduling.
+#
+# This model, while simple, should hit around 99% test
+# accuracy after training for approximately 20 epochs.
+
+using Flux, Flux.Data.MNIST, Statistics
+using Flux: onehotbatch, onecold, logitcrossentropy
+using Base.Iterators: partition
+using Printf
+using CUDA
+using LighthouseFlux, Lighthouse
+using TensorBoardLogger
+using Dates
+
+# for headless plotting with GR
+ENV["GKSwstype"] = "100"
+
+if has_cuda()
+    @info "CUDA is on"
+    CUDA.allowscalar(false)
+end
+
+Base.@kwdef mutable struct Args
+    lr::Float64 = 3e-3
+    epochs::Int = 20
+    batch_size = 128
+    savepath::String = "./output/run"
+    run_name::String = "abc"
+    logger = LearnLogger(savepath, run_name)
+end
+
+# Bundle images together with labels and group into minibatches
+function make_minibatch(X, Y, idxs)
+    X_batch = Array{Float32}(undef, size(X[1])..., 1, length(idxs))
+    for i in 1:length(idxs)
+        X_batch[:, :, :, i] = Float32.(X[idxs[i]])
+    end
+    Y_batch = onehotbatch(Y[idxs], 0:9)
+    return (X_batch, Y_batch)
+end
+
+function get_processed_data(args)
+    # Load labels and images from Flux.Data.MNIST
+    train_labels = MNIST.labels()
+    train_imgs = MNIST.images()
+    mb_idxs = partition(1:length(train_imgs), args.batch_size)
+    train_set = [make_minibatch(train_imgs, train_labels, i) for i in mb_idxs]
+
+    # Prepare test set as one giant minibatch:
+    test_imgs = MNIST.images(:test)
+    test_labels = MNIST.labels(:test)
+    test_set = make_minibatch(test_imgs, test_labels, 1:length(test_imgs))
+
+    return train_set, test_set, test_labels
+end
+
+function make_rater_labels(true_labels; error_rate = 0.1, n_classes = 10)
+    out_labels = similar(true_labels)
+    for i = eachindex(out_labels, true_labels)
+        if rand() < error_rate
+            out_labels[i] = mod(true_labels[i] + 1, n_classes)
+        else
+            out_labels[i] = true_labels[i]
+        end
+    end
+    return out_labels
+end
+
+# Build model
+
+struct SimpleModel{C}
+    chain::C
+    function SimpleModel(; imgsize = (28,28,1), nclasses = 10)
+        cnn_output_size = Int.(floor.([imgsize[1]/8,imgsize[2]/8,32]))
+
+        chain = Chain(
+            # First convolution, operating upon a 28x28 image
+            Conv((3, 3), imgsize[3]=>16, pad=(1,1), relu),
+            MaxPool((2,2)),
+
+            # Second convolution, operating upon a 14x14 image
+            Conv((3, 3), 16=>32, pad=(1,1), relu),
+            MaxPool((2,2)),
+
+            # Third convolution, operating upon a 7x7 image
+            Conv((3, 3), 32=>32, pad=(1,1), relu),
+            MaxPool((2,2)),
+
+            # Reshape 3d tensor into a 2d one using `Flux.flatten`, at this point it should be (3, 3, 32, N)
+            flatten,
+            Dense(prod(cnn_output_size), 10))
+        chain = gpu(chain)
+        return new{typeof(chain)}(chain)
+    end
+end
+
+# make callable
+(sm::SimpleModel)(args...) = sm.chain(args...)
+
+# We augment `x` a little bit here, adding in random noise.
+augment(x) = x .+ gpu(0.1f0*randn(eltype(x), size(x)))
+
+# Returns a vector of all parameters used in the model
+paramvec(m) = vcat(map(p->reshape(p, :), params(m))...)
+
+# Check whether any element is NaN
+anynan(x) = any(isnan, x)
+
+accuracy(x, y, model) = mean(onecold(cpu(model(x))) .== onecold(cpu(y)))
+
+
+function LighthouseFlux.loss_and_prediction(model::SimpleModel, x, y)
+    # We augment the data a bit here, adding
+    # Gaussian random noise to the images to make the model more robust.
+    x̂ = augment(x)
+    ŷ = model(x̂) # prediction
+    return logitcrossentropy(ŷ, y), ŷ
+end
+
+LighthouseFlux.loss(model::SimpleModel, x, y) = LighthouseFlux.loss_and_prediction(model, x, y)[1]
+
+function train(; kws...)
+    args = Args(; kws...)
+
+    _info_and_log = (msg::String) -> begin
+        msg = Dates.format(now(), "HH:MM:SS ") * msg
+        @info msg
+        Lighthouse.log_event!(args.logger, msg)
+        return nothing
+    end
+
+
+    isdir(args.savepath) || mkpath(args.savepath)
+
+    _info_and_log("Loading data set")
+    train_set, test_set, test_labels = get_processed_data(args)
+
+    # Define our model. We will use a simple convolutional architecture with
+    # three iterations of Conv -> ReLU -> MaxPool, followed by a final Dense layer.
+    _info_and_log("Building model...")
+    model = SimpleModel()
+
+    # Load model and datasets onto GPU, if enabled
+    train_set = gpu.(train_set)
+    test_set = gpu.(test_set)
+
+    # Make sure our model is compiled before starting our training loop
+    model(train_set[1][1])
+
+    # Train our model with the given training set using the ADAM optimizer,
+    # printing out performance against the test set as we go.
+    opt = ADAM(args.lr)
+
+    classifier = FluxClassifier(model, opt, 0:9)
+    _info_and_log("Beginning `learn!`...")
+
+    votes = reduce(hcat, [ make_rater_labels(test_labels, error_rate = 0.1) for _ = 1:5 ])
+
+    learn!(classifier, args.logger,
+           () -> train_set, () -> [(test_set, 1:length(test_labels))], votes)
+
+    return cpu.(params(model))
+
+    _info_and_log("Beginning training loop...")
+
+    best_acc = 0.0
+    last_improvement = 0
+    best_params = cpu.(params(model))
+
+    for epoch_idx in 1:args.epochs
+        # Train for a single epoch
+        Lighthouse.train!(classifier, train_set, args.logger)
+
+        # Terminate on NaN
+        if anynan(paramvec(model))
+            @error "NaN params"
+            break
+        end
+
+        # Calculate accuracy:
+        acc = accuracy(test_set..., model)
+
+        _info_and_log(@sprintf("[%d]: Test accuracy: %.4f", epoch_idx, acc))
+        # If our accuracy is good enough, quit early.
+        if acc >= 0.999
+            _info_and_log(" -> Early-exiting: We reached our target accuracy of 99.9%")
+            break
+        end
+
+        # If this is the best accuracy we've seen so far, save the model out
+        if acc >= best_acc
+            _info_and_log("Best epoch yet (epoch $(epoch_idx))")
+            best_params = cpu.(params(model))
+            best_acc = acc
+            last_improvement = epoch_idx
+        end
+
+        # If we haven't seen improvement in 5 epochs, drop our learning rate:
+        if epoch_idx - last_improvement >= 5 && opt.eta > 1e-6
+            opt.eta /= 10.0
+            _info_and_log(" -> Haven't improved in a while, dropping learning rate to $(opt.eta)!")
+
+            # After dropping learning rate, give it a few epochs to improve
+            last_improvement = epoch_idx
+        end
+
+        if epoch_idx - last_improvement >= 10
+            _info_and_log(" -> We're calling this converged.")
+            break
+        end
+    end
+    return best_params
+end
+
+# Test the model using saved parameters
+function test(params; kws...)
+    args = Args(; kws...)
+
+    # Loading the test data
+    _, test_set = get_processed_data(args)
+
+    # Reconstructing the model with random initial weights
+    model = SimpleModel()
+
+    # Loading parameters onto the model
+    Flux.loadparams!(model, params)
+
+    test_set = gpu.(test_set)
+    model = gpu(model)
+    @show accuracy(test_set..., model)
+end
+
+best_params = train(; epochs=1)
+test(best_params)

From 5e9246922876081eeadabad125e084a6ccdf6b17 Mon Sep 17 00:00:00 2001
From: ericphanson <5846501+ericphanson@users.noreply.github.com>
Date: Mon, 14 Dec 2020 17:10:29 +0100
Subject: [PATCH 2/7] save logs into gitignored directory

---
 docs/src/mnist_example.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/mnist_example.jl b/docs/src/mnist_example.jl
index cb025bb..4d424ca 100644
--- a/docs/src/mnist_example.jl
+++ b/docs/src/mnist_example.jl
@@ -29,7 +29,7 @@ Base.@kwdef mutable struct Args
     lr::Float64 = 3e-3
     epochs::Int = 20
     batch_size = 128
-    savepath::String = "./output/run"
+    savepath::String = joinpath(@__DIR__, "logs", "run")
     run_name::String = "abc"
     logger = LearnLogger(savepath, run_name)
 end

From 3fd43e2ca98d0949b4e79d9f08a50405f04c6ded Mon Sep 17 00:00:00 2001
From: ericphanson <5846501+ericphanson@users.noreply.github.com>
Date: Mon, 14 Dec 2020 18:07:30 +0100
Subject: [PATCH 3/7] wip

---
 docs/src/mnist_example.jl | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/src/mnist_example.jl b/docs/src/mnist_example.jl
index 4d424ca..11f3a65 100644
--- a/docs/src/mnist_example.jl
+++ b/docs/src/mnist_example.jl
@@ -29,7 +29,7 @@ Base.@kwdef mutable struct Args
     lr::Float64 = 3e-3
     epochs::Int = 20
     batch_size = 128
-    savepath::String = joinpath(@__DIR__, "logs", "run")
+    savepath::String = joinpath(@__DIR__, "..", "logs", "run")
     run_name::String = "abc"
     logger = LearnLogger(savepath, run_name)
 end
@@ -99,6 +99,8 @@ struct SimpleModel{C}
     end
 end
 
+Flux.@functor SimpleModel (chain,)
+
 # make callable
 (sm::SimpleModel)(args...) = sm.chain(args...)
 
@@ -166,8 +168,11 @@ function train(; kws...)
     return cpu.(params(model))
 
+    # the following is dead code, from the original model zoo example
+    # I haven't deleted it yet because I wanted to port the functionality to
+    # Lighthouse callbacks, to show how the same loop can be done with Lighthouse
+
     _info_and_log("Beginning training loop...")
-
     best_acc = 0.0
     last_improvement = 0
     best_params = cpu.(params(model))

From 8fee861a0d6bd681cf69af7dbe2b48766f814630 Mon Sep 17 00:00:00 2001
From: ericphanson <5846501+ericphanson@users.noreply.github.com>
Date: Mon, 14 Dec 2020 21:13:17 +0100
Subject: [PATCH 4/7] wip

---
 docs/Project.toml         |  1 +
 docs/src/mnist_example.jl | 53 ++++++++++++++++++++++-----------------
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index f814317..8f07968 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -9,6 +9,7 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f"
 
 [compat]
diff --git a/docs/src/mnist_example.jl b/docs/src/mnist_example.jl
index 11f3a65..0315015 100644
--- a/docs/src/mnist_example.jl
+++ b/docs/src/mnist_example.jl
@@ -75,28 +75,29 @@ end
 
 struct SimpleModel{C}
     chain::C
-    function SimpleModel(; imgsize = (28,28,1), nclasses = 10)
-        cnn_output_size = Int.(floor.([imgsize[1]/8,imgsize[2]/8,32]))
-
-        chain = Chain(
-            # First convolution, operating upon a 28x28 image
-            Conv((3, 3), imgsize[3]=>16, pad=(1,1), relu),
-            MaxPool((2,2)),
-
-            # Second convolution, operating upon a 14x14 image
-            Conv((3, 3), 16=>32, pad=(1,1), relu),
-            MaxPool((2,2)),
-
-            # Third convolution, operating upon a 7x7 image
-            Conv((3, 3), 32=>32, pad=(1,1), relu),
-            MaxPool((2,2)),
-
-            # Reshape 3d tensor into a 2d one using `Flux.flatten`, at this point it should be (3, 3, 32, N)
-            flatten,
-            Dense(prod(cnn_output_size), 10))
-        chain = gpu(chain)
-        return new{typeof(chain)}(chain)
-    end
+end
+
+function SimpleModel(; imgsize = (28,28,1), nclasses = 10)
+    cnn_output_size = Int.(floor.([imgsize[1]/8,imgsize[2]/8,32]))
+
+    chain = Chain(
+        # First convolution, operating upon a 28x28 image
+        Conv((3, 3), imgsize[3]=>16, pad=(1,1), relu),
+        MaxPool((2,2)),
+
+        # Second convolution, operating upon a 14x14 image
+        Conv((3, 3), 16=>32, pad=(1,1), relu),
+        MaxPool((2,2)),
+
+        # Third convolution, operating upon a 7x7 image
+        Conv((3, 3), 32=>32, pad=(1,1), relu),
+        MaxPool((2,2)),
+
+        # Reshape 3d tensor into a 2d one using `Flux.flatten`, at this point it should be (3, 3, 32, N)
+        flatten,
+        Dense(prod(cnn_output_size), 10))
+    chain = gpu(chain)
+    return SimpleModel{typeof(chain)}(chain)
 end
 
 Flux.@functor SimpleModel (chain,)
@@ -120,7 +121,13 @@ function LighthouseFlux.loss_and_prediction(model::SimpleModel, x, y)
     # We augment the data a bit here, adding
     # Gaussian random noise to the images to make the model more robust.
     x̂ = augment(x)
+
     ŷ = model(x̂) # prediction
+
+    # actually, ignore the model, and output y + 1 with 10% probability
+    # mask = rand(length(y)) .< 0.1
+    # ŷ = y + mask
+
     return logitcrossentropy(ŷ, y), ŷ
 end
 
@@ -171,7 +178,7 @@ function train(; kws...)
     # the following is dead code, from the original model zoo example
     # I haven't deleted it yet because I wanted to port the functionality to
     # Lighthouse callbacks, to show how the same loop can be done with Lighthouse
-
+
     _info_and_log("Beginning training loop...")
     best_acc = 0.0
     last_improvement = 0

From 19537398f91ddacaa850b4bfc4d41229f12b3957 Mon Sep 17 00:00:00 2001
From: Eric Hanson <5846501+ericphanson@users.noreply.github.com>
Date: Mon, 14 Dec 2020 22:34:44 +0100
Subject: [PATCH 5/7] Update docs/src/mnist_example.jl

Co-authored-by: Hannah Robertson
---
 docs/src/mnist_example.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/mnist_example.jl b/docs/src/mnist_example.jl
index 0315015..186a9b9 100644
--- a/docs/src/mnist_example.jl
+++ b/docs/src/mnist_example.jl
@@ -165,7 +165,7 @@ function train(; kws...)
     # printing out performance against the test set as we go.
     opt = ADAM(args.lr)
 
-    classifier = FluxClassifier(model, opt, 0:9)
+    classifier = FluxClassifier(model, opt, ["class_$i" for i in 0:9])
     _info_and_log("Beginning `learn!`...")
 
     votes = reduce(hcat, [ make_rater_labels(test_labels, error_rate = 0.1) for _ = 1:5 ])

From cffc76e56775d5fe227d31234f95b1d9da3fe516 Mon Sep 17 00:00:00 2001
From: ericphanson
Date: Mon, 14 Dec 2020 21:42:01 +0000
Subject: [PATCH 6/7] tweak labeller error path

---
 docs/src/mnist_example.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/mnist_example.jl b/docs/src/mnist_example.jl
index 186a9b9..54bf1f2 100644
--- a/docs/src/mnist_example.jl
+++ b/docs/src/mnist_example.jl
@@ -63,7 +63,7 @@ function make_rater_labels(true_labels; error_rate = 0.1, n_classes = 10)
     out_labels = similar(true_labels)
     for i = eachindex(out_labels, true_labels)
         if rand() < error_rate
-            out_labels[i] = mod(true_labels[i] + 1, n_classes)
+            out_labels[i] = mod(true_labels[i] + 2, n_classes)
         else
             out_labels[i] = true_labels[i]
         end

From d63325d96506b92c08a812539ca915afa5ca8c73 Mon Sep 17 00:00:00 2001
From: ericphanson
Date: Mon, 14 Dec 2020 23:15:34 +0000
Subject: [PATCH 7/7] fix classes vs indices errors

---
 docs/src/mnist_example.jl | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/docs/src/mnist_example.jl b/docs/src/mnist_example.jl
index 54bf1f2..6c35a41 100644
--- a/docs/src/mnist_example.jl
+++ b/docs/src/mnist_example.jl
@@ -1,4 +1,10 @@
-# from https://github.com/FluxML/model-zoo/blob/b4732e5a3158391f2fd737470ff63986420e42cd/vision/mnist/conv.jl
+# For now, running this requires:
+# * Julia 1.4.2
+# * dev LighthouseFlux (i.e. ..) into the docs project
+# * dev https://github.com/beacon-biosignals/Lighthouse.jl/pull/15
+
+# The following example has been modified from
+# https://github.com/FluxML/model-zoo/blob/b4732e5a3158391f2fd737470ff63986420e42cd/vision/mnist/conv.jl
 
 # Classifies MNIST digits with a convolutional network.
 # Returns the trained model parameters rather than saving them to disk.
@@ -40,20 +46,20 @@ function make_minibatch(X, Y, idxs) for i in 1:length(idxs) X_batch[:, :, :, i] = Float32.(X[idxs[i]]) end - Y_batch = onehotbatch(Y[idxs], 0:9) + Y_batch = onehotbatch(Y[idxs], 1:10) return (X_batch, Y_batch) end function get_processed_data(args) # Load labels and images from Flux.Data.MNIST - train_labels = MNIST.labels() + train_labels = MNIST.labels() .+ 1 train_imgs = MNIST.images() mb_idxs = partition(1:length(train_imgs), args.batch_size) train_set = [make_minibatch(train_imgs, train_labels, i) for i in mb_idxs] # Prepare test set as one giant minibatch: test_imgs = MNIST.images(:test) - test_labels = MNIST.labels(:test) + test_labels = MNIST.labels(:test) .+ 1 test_set = make_minibatch(test_imgs, test_labels, 1:length(test_imgs)) return train_set, test_set, test_labels
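
A closing note on [PATCH 7/7] ("fix classes vs indices errors"): the labels are shifted from `0:9` to `1:10` so that each hard label can double as a 1-based index into the classifier's ten classes (`["class_$i" for i in 0:9]` from [PATCH 5/7]), the same convention `onecold` uses. A minimal standalone sketch of that convention (not part of the diffs above; `digit_labels` is an illustrative name):

    using Flux: onehotbatch, onecold

    # MNIST digits 0-9 become 1-based class indices 1-10.
    digit_labels = [0, 7, 9]
    labels = digit_labels .+ 1         # class indices 1, 8, 10
    Y = onehotbatch(labels, 1:10)      # 10x3 one-hot matrix
    @assert onecold(Y) == labels       # onecold recovers the 1-based indices

One caveat the shift leaves open: `make_rater_labels` still wraps simulated rater errors with `mod(true_labels[i] + 2, n_classes)`, which maps the 1-based label 8 to `mod(10, 10) == 0`, outside `1:10`. If strictly in-range votes are needed, `mod1` keeps the wrap inside `1:n_classes`; for example, `out_labels[i] = mod1(true_labels[i] + 1, n_classes)` sends 10 back to 1 and every other label to its successor.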