diff --git a/.gitignore b/.gitignore
index e03551c..715e883 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ docs/build/
 docs/site/
 .DS_Store
 tmpdir/*
+docs/logs
diff --git a/docs/Project.toml b/docs/Project.toml
index 47fabd2..8f07968 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,6 +1,16 @@
 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+Lighthouse = "ac2c24cd-07f0-4848-96b2-1b82c3ea0e59"
 LighthouseFlux = "56a5d6c5-c9a8-4db3-ae3d-7c3fdb50c563"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
+MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+TensorBoardLogger = "899adc3e-224a-11e9-021f-63837185c80f"
 
 [compat]
 Documenter = "0.25"
diff --git a/docs/src/mnist_example.jl b/docs/src/mnist_example.jl
new file mode 100644
index 0000000..6c35a41
--- /dev/null
+++ b/docs/src/mnist_example.jl
@@ -0,0 +1,257 @@
+# For now, running this requires (see the setup sketch just below):
+# * Julia 1.4.2
+# * dev LighthouseFlux (i.e. ..) into the docs project
+# * dev https://github.com/beacon-biosignals/Lighthouse.jl/pull/15
+
+# The following example has been adapted from
+# https://github.com/FluxML/model-zoo/blob/b4732e5a3158391f2fd737470ff63986420e42cd/vision/mnist/conv.jl
+
+# Classifies MNIST digits with a convolutional network.
+# Demonstrates basic model construction, training,
+# conditional early-exit, and learning rate scheduling.
+#
+# This model, while simple, should hit around 99% test
+# accuracy after training for approximately 20 epochs.
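+
+# A rough sketch of the `dev` setup described at the top of this file, using the
+# Pkg API from a fresh checkout (hypothetical commands, kept as comments so they
+# don't run when this script is included; adjust the Lighthouse path to a local
+# checkout of the PR referenced above):
+#
+#     using Pkg
+#     Pkg.activate(joinpath(@__DIR__, ".."))               # the docs project
+#     Pkg.develop(PackageSpec(path=".."))                  # dev LighthouseFlux itself
+#     Pkg.develop(PackageSpec(path="path/to/Lighthouse"))  # checkout of the Lighthouse PR
+#     Pkg.instantiate()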
+
+using Flux, Flux.Data.MNIST, Statistics
+using Flux: onehotbatch, onecold, logitcrossentropy
+using Base.Iterators: partition
+using Printf
+using CUDA
+using LighthouseFlux, Lighthouse
+using TensorBoardLogger
+using Dates
+
+# for headless plotting with GR
+ENV["GKSwstype"] = "100"
+
+if has_cuda()
+    @info "CUDA is on"
+    CUDA.allowscalar(false)
+end
+
+Base.@kwdef mutable struct Args
+    lr::Float64 = 3e-3
+    epochs::Int = 20
+    batch_size = 128
+    savepath::String = joinpath(@__DIR__, "..", "logs", "run")
+    run_name::String = "abc"
+    logger = LearnLogger(savepath, run_name)
+end
+
+# Bundle images together with labels and group into minibatches
+function make_minibatch(X, Y, idxs)
+    X_batch = Array{Float32}(undef, size(X[1])..., 1, length(idxs))
+    for i in 1:length(idxs)
+        X_batch[:, :, :, i] = Float32.(X[idxs[i]])
+    end
+    Y_batch = onehotbatch(Y[idxs], 1:10)
+    return (X_batch, Y_batch)
+end
+
+function get_processed_data(args)
+    # Load labels and images from Flux.Data.MNIST
+    train_labels = MNIST.labels() .+ 1
+    train_imgs = MNIST.images()
+    mb_idxs = partition(1:length(train_imgs), args.batch_size)
+    train_set = [make_minibatch(train_imgs, train_labels, i) for i in mb_idxs]
+
+    # Prepare test set as one giant minibatch:
+    test_imgs = MNIST.images(:test)
+    test_labels = MNIST.labels(:test) .+ 1
+    test_set = make_minibatch(test_imgs, test_labels, 1:length(test_imgs))
+
+    return train_set, test_set, test_labels
+end
+
+function make_rater_labels(true_labels; error_rate = 0.1, n_classes = 10)
+    out_labels = similar(true_labels)
+    for i in eachindex(out_labels, true_labels)
+        if rand() < error_rate
+            out_labels[i] = mod(true_labels[i] + 2, n_classes)
+        else
+            out_labels[i] = true_labels[i]
+        end
+    end
+    return out_labels
+end
+
+# Build model
+
+struct SimpleModel{C}
+    chain::C
+end
+
+function SimpleModel(; imgsize = (28, 28, 1), nclasses = 10)
+    cnn_output_size = Int.(floor.([imgsize[1] / 8, imgsize[2] / 8, 32]))
+
+    chain = Chain(
+        # First convolution, operating upon a 28x28 image
+        Conv((3, 3), imgsize[3] => 16, pad = (1, 1), relu),
+        MaxPool((2, 2)),
+
+        # Second convolution, operating upon a 14x14 image
+        Conv((3, 3), 16 => 32, pad = (1, 1), relu),
+        MaxPool((2, 2)),
+
+        # Third convolution, operating upon a 7x7 image
+        Conv((3, 3), 32 => 32, pad = (1, 1), relu),
+        MaxPool((2, 2)),
+
+        # Reshape the 4d tensor into a 2d one using `Flux.flatten`; at this point
+        # the activations should be of size (3, 3, 32, N)
+        flatten,
+        Dense(prod(cnn_output_size), nclasses))
+    chain = gpu(chain)
+    return SimpleModel{typeof(chain)}(chain)
+end
+
+Flux.@functor SimpleModel (chain,)
+
+# make callable
+(sm::SimpleModel)(args...) = sm.chain(args...)
+
+# We augment `x` a little bit here, adding in random noise.
+augment(x) = x .+ gpu(0.1f0 * randn(eltype(x), size(x)))
+
+# Returns a vector of all parameters used in model
+paramvec(m) = vcat(map(p -> reshape(p, :), params(m))...)
+
+# Returns true if any element is NaN
+anynan(x) = any(isnan.(x))
+
+accuracy(x, y, model) = mean(onecold(cpu(model(x))) .== onecold(cpu(y)))
+
+function LighthouseFlux.loss_and_prediction(model::SimpleModel, x, y)
+    # We augment the data a bit, adding gaussian random noise to our image
+    # to make it more robust.
+    x̂ = augment(x)
+
+    ŷ = model(x̂) # prediction
+
+    # alternatively, ignore the model and output y + 1 with 10% probability:
+    # mask = rand(length(y)) .< 0.1
+    # ŷ = y + mask
+
+    return logitcrossentropy(ŷ, y), ŷ
+end
+
+LighthouseFlux.loss(model::SimpleModel, x, y) = LighthouseFlux.loss_and_prediction(model, x, y)[1]
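+
+# A quick sanity check of the pieces defined above (a sketch, kept as comments
+# since it is not part of the training flow; the batch of 32 is arbitrary):
+#
+#     sm = SimpleModel()
+#     x, y = make_minibatch(MNIST.images(), MNIST.labels() .+ 1, 1:32)
+#     l, ŷ = LighthouseFlux.loss_and_prediction(sm, gpu(x), gpu(y))
+#     @show l size(ŷ)   # a scalar loss and a 10×32 prediction matrix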
+
+function train(; kws...)
+    args = Args(; kws...)
+
+    _info_and_log = (msg::String) -> begin
+        msg = Dates.format(now(), "HH:MM:SS ") * msg
+        @info msg
+        Lighthouse.log_event!(args.logger, msg)
+        return nothing
+    end
+
+    isdir(args.savepath) || mkpath(args.savepath)
+
+    _info_and_log("Loading data set")
+    train_set, test_set, test_labels = get_processed_data(args)
+
+    # Define our model. We will use a simple convolutional architecture with
+    # three iterations of Conv -> ReLU -> MaxPool, followed by a final Dense layer.
+    _info_and_log("Building model...")
+    model = SimpleModel()
+
+    # Load model and datasets onto GPU, if enabled
+    train_set = gpu.(train_set)
+    test_set = gpu.(test_set)
+
+    # Make sure our model is nicely precompiled before starting our training loop
+    model(train_set[1][1])
+
+    # Train our model with the given training set using the ADAM optimizer,
+    # printing out performance against the test set as we go.
+    opt = ADAM(args.lr)
+
+    classifier = FluxClassifier(model, opt, ["class_$i" for i in 0:9])
+    _info_and_log("Beginning `learn!`...")
+
+    votes = reduce(hcat, [make_rater_labels(test_labels, error_rate = 0.1) for _ in 1:5])
+
+    learn!(classifier, args.logger,
+           () -> train_set, () -> [(test_set, 1:length(test_labels))], votes)
+
+    return cpu.(params(model))
+
+    # The following is dead code from the original model zoo example. It hasn't
+    # been deleted yet because the intent is to port this functionality to
+    # Lighthouse callbacks, to show how the same loop can be done with Lighthouse.
+
+    _info_and_log("Beginning training loop...")
+    best_acc = 0.0
+    last_improvement = 0
+    best_params = cpu.(params(model))
+
+    for epoch_idx in 1:args.epochs
+        # Train for a single epoch
+        Lighthouse.train!(classifier, train_set, args.logger)
+
+        # Terminate on NaN
+        if anynan(paramvec(model))
+            @error "NaN params"
+            break
+        end
+
+        # Calculate accuracy:
+        acc = accuracy(test_set..., model)
+
+        _info_and_log(@sprintf("[%d]: Test accuracy: %.4f", epoch_idx, acc))
+        # If our accuracy is good enough, quit out.
+        if acc >= 0.999
+            _info_and_log(" -> Early-exiting: We reached our target accuracy of 99.9%")
+            break
+        end
+
+        # If this is the best accuracy we've seen so far, save the model out
+        if acc >= best_acc
+            _info_and_log("Best epoch yet (epoch $(epoch_idx))")
+            best_params = cpu.(params(model))
+            best_acc = acc
+            last_improvement = epoch_idx
+        end
+
+        # If we haven't seen improvement in 5 epochs, drop our learning rate:
+        if epoch_idx - last_improvement >= 5 && opt.eta > 1e-6
+            opt.eta /= 10.0
+            _info_and_log(" -> Haven't improved in a while, dropping learning rate to $(opt.eta)!")
+
+            # After dropping learning rate, give it a few epochs to improve
+            last_improvement = epoch_idx
+        end
+
+        if epoch_idx - last_improvement >= 10
+            _info_and_log(" -> We're calling this converged.")
+            break
+        end
+    end
+    return best_params
+end
+
+# Test the model from saved parameters
+function test(params; kws...)
+    args = Args(; kws...)
+
+    # Load the test data
+    _, test_set = get_processed_data(args)
+
+    # Re-construct the model with random initial weights
+    model = SimpleModel()
+
+    # Load the trained parameters onto the model
+    Flux.loadparams!(model, params)
+
+    test_set = gpu.(test_set)
+    model = gpu(model)
+    @show accuracy(test_set..., model)
+end
+
+best_params = train(; epochs = 1)
+test(best_params)
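+
+# The original model-zoo script persisted the trained model to "mnist_conv.bson".
+# A similar sketch for the parameters returned by `train` above, kept as comments
+# since BSON is not in the docs project and the path is arbitrary:
+#
+#     using BSON
+#     BSON.@save joinpath(@__DIR__, "..", "logs", "mnist_conv.bson") best_params
+#     best_params = BSON.load(joinpath(@__DIR__, "..", "logs", "mnist_conv.bson"))[:best_params]
+#     test(best_params)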