From af3435a118fc10f04a63a3abfdf9321ad79a29fa Mon Sep 17 00:00:00 2001
From: Ke Tran <ketranmanh@gmail.com>
Date: Wed, 9 Nov 2016 01:14:35 +0100
Subject: [PATCH 1/4] add eve optimizer

---
 eve.lua           | 67 +++++++++++++++++++++++++++++++++++++++++++++++
 test/test_eve.lua | 22 ++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 eve.lua
 create mode 100644 test/test_eve.lua

diff --git a/eve.lua b/eve.lua
new file mode 100644
index 0000000..5275902
--- /dev/null
+++ b/eve.lua
@@ -0,0 +1,67 @@
+--[[ EVE implementation https://arxiv.org/pdf/1611.01505v1.pdf
+
+ARGS:
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before update
+]]
+function optim.eve(opfunc, x, config, state)
+    -- (0) get/update state
+    if config == nil and state == nil then
+        print('no state table, EVE initializing')
+    end
+    local config = config or {}
+    local state = state or {}
+    local lr = config.learningRate or 1e-3
+    local beta1 = config.beta1 or 0.9
+    local beta2 = config.beta2 or 0.999
+    local eps = config.epsilon or 1e-8
+    local beta3 = config.beta3 or 0.999
+    local thl = config.thl or 0.1
+    local thu = config.thu or 10
+    state.d = state.d or 1
+    state.t = state.t or 0
+    state.fhat = state.fhat or 0
+    -- (2) evaluate f(x) and df/dx
+
+
+    local fx, dfdx = opfunc(x)
+
+    state.m = state.m or x.new(dfdx:size()):zero()
+    state.m:mul(beta1):add(1-beta1, dfdx)
+
+    state.v = state.v or x.new(dfdx:size()):zero()
+    state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
+
+    state.denom = state.denom or x.new(dfdx:size()):zero()
+    state.denom:copy(state.v)
+    -- update
+    state.t = state.t + 1
+
+    if state.t == 1 then
+        state.d = 1
+        state.fhat = fx
+        state.t = 1
+    else
+        local l, u = 0, 0 -- lowerbound and upperbound
+        if fx > state.fhat then
+            l, u = thl + 1, thu + 1
+        else
+            l, u = 1 / (thu+1), 1 / (thl + 1)
+        end
+        local fhat = state.fhat * math.min(math.max(l, fx / state.fhat), u)
+        local r = math.abs(fhat - state.fhat) / math.min(fhat, state.fhat)
+        state.fhat = fhat
+        state.d = beta3 * state.d + (1 - beta3) * r
+    end
+
+
+    local biasCorrection1 = 1 - beta1^state.t
+    local biasCorrection2 = 1 - beta2^state.t
+    local alpha = lr * state.d / math.sqrt(biasCorrection2) * biasCorrection1
+    state.denom:sqrt():add(eps)
+    x:addcdiv(-alpha, state.m, state.denom)
+
+    -- return x*, f(x) before optimization
+    return x, {fx}
+end
diff --git a/test/test_eve.lua b/test/test_eve.lua
new file mode 100644
index 0000000..98f8365
--- /dev/null
+++ b/test/test_eve.lua
@@ -0,0 +1,22 @@
+require 'torch'
+require 'optim'
+require 'rosenbrock'
+require 'l2'
+local x = torch.Tensor(2):fill(0)
+local fx = {}
+local config = {thl = 1e-1, thu = 10}
+local state = {}
+for i = 1, 10001 do
+    local _, f = optim.eve(rosenbrock, x, config, state) -- x is updated in place
+    if (i-1)%1000 == 0 then
+        table.insert(fx,f[1])
+    end
+end
+print()
+print('Rosenbrock test')
+print()
+print('x=');print(x)
+print('fx=')
+for i = 1, #fx do
+    print((i-1)*1000+1, fx[i])
+end

From 6a3000e88888c2f834ce97c1655ba14bf2910e30 Mon Sep 17 00:00:00 2001
From: Ke Tran <ketranmanh@gmail.com>
Date: Wed, 9 Nov 2016 01:14:57 +0100
Subject: [PATCH 2/4] add eve optimizer

---
 init.lua | 1 +
 1 file changed, 1 insertion(+)

diff --git a/init.lua b/init.lua
index a045bd8..8455dc0 100644
--- a/init.lua
+++ b/init.lua
@@ -18,6 +18,7 @@ require('optim.rmsprop')
 require('optim.adadelta')
 require('optim.cmaes')
 require('optim.de')
+require('optim.eve')
 
 -- line search functions
 require('optim.lswolfe')

From b299ac11da4c3ceaf17b7b6fdb110f0e2a61ad30 Mon Sep 17 00:00:00 2001
From: Ke Tran <ketranmanh@gmail.com>
Date: Wed, 9 Nov 2016 01:22:30 +0100
Subject: [PATCH 3/4] nicer format

---
 eve.lua | 61 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/eve.lua b/eve.lua
index 5275902..6e8c131 100644
--- a/eve.lua
+++ b/eve.lua
@@ -1,6 +1,21 @@
 --[[ EVE implementation https://arxiv.org/pdf/1611.01505v1.pdf
 
 ARGS:
+- `opfunc` : a function that takes a single input (X), the point
+             of evaluation, and returns f(X) and df/dX
+- `x`      : the initial point
+- `config` : a table with configuration parameters for the optimizer
+- `config.learningRate`      : learning rate (default 1e-3)
+- `config.learningRateDecay` : learning rate decay (default 0)
+- `config.beta1`             : first moment coefficient (default 0.9)
+- `config.beta2`             : second moment coefficient (default 0.999)
+- `config.beta3`             : exponential decay rate for relative change (default 0.999)
+- `config.epsilon`           : for numerical stability (default 1e-8)
+- `config.thl`               : lower-bound threshold (default 0.1)
+- `config.thu`               : upper-bound threshold (default 10)
+- `config.weightDecay`       : weight decay (default 0)
+- `state`                    : a table describing the state of the optimizer; after each
+                              call the state is modified
 RETURN:
 - `x` : the new x vector
 - `f(x)` : the function, evaluated before update
@@ -10,23 +25,38 @@ function optim.eve(opfunc, x, config, state)
     if config == nil and state == nil then
         print('no state table, EVE initializing')
     end
+
     local config = config or {}
-    local state = state or {}
-    local lr = config.learningRate or 1e-3
-    local beta1 = config.beta1 or 0.9
-    local beta2 = config.beta2 or 0.999
-    local eps = config.epsilon or 1e-8
-    local beta3 = config.beta3 or 0.999
-    local thl = config.thl or 0.1
-    local thu = config.thu or 10
+    local state  = state or config  -- keep state in config when no state table is passed
+
+    local lr     = config.learningRate or 1e-3
+    local lrd    = config.learningRateDecay or 0
+    local beta1  = config.beta1 or 0.9
+    local beta2  = config.beta2 or 0.999
+    local beta3  = config.beta3 or 0.999
+    local eps    = config.epsilon or 1e-8
+    local thl    = config.thl or 0.1
+    local thu    = config.thu or 10
+    local wd     = config.weightDecay or 0
+
+    -- (1) evaluate f(x) and df/dx
+    local fx, dfdx = opfunc(x)
+
+    -- (2) weight decay
+    if wd ~= 0 then
+      dfdx:add(wd, x)
+    end
+
+    -- Initialize state
     state.d = state.d or 1
     state.t = state.t or 0
     state.fhat = state.fhat or 0
-    -- (2) evaluate f(x) and df/dx
-
 
-    local fx, dfdx = opfunc(x)
+    -- (3) learning rate decay (annealing)
+    local clr = lr / (1 + state.t*lrd)
+    state.t = state.t + 1
 
+    -- Decay the first and second moment running average coefficient
     state.m = state.m or x.new(dfdx:size()):zero()
     state.m:mul(beta1):add(1-beta1, dfdx)
 
@@ -35,8 +65,6 @@ function optim.eve(opfunc, x, config, state)
 
     state.denom = state.denom or x.new(dfdx:size()):zero()
     state.denom:copy(state.v)
-    -- update
-    state.t = state.t + 1
 
     if state.t == 1 then
         state.d = 1
@@ -52,15 +80,18 @@ function optim.eve(opfunc, x, config, state)
         local fhat = state.fhat * math.min(math.max(l, fx / state.fhat), u)
         local r = math.abs(fhat - state.fhat) / math.min(fhat, state.fhat)
         state.fhat = fhat
+        -- Decay the relative change
         state.d = beta3 * state.d + (1 - beta3) * r
     end
 
 
     local biasCorrection1 = 1 - beta1^state.t
     local biasCorrection2 = 1 - beta2^state.t
-    local alpha = lr * state.d / math.sqrt(biasCorrection2) * biasCorrection1
+    local stepSize = clr * state.d/math.sqrt(biasCorrection2) * biasCorrection1
     state.denom:sqrt():add(eps)
-    x:addcdiv(-alpha, state.m, state.denom)
+
+    -- (4) update x
+    x:addcdiv(-stepSize, state.m, state.denom)
 
     -- return x*, f(x) before optimization
     return x, {fx}

From 2d8e4823e1ee5a4fe092b8c88189bcf429920f63 Mon Sep 17 00:00:00 2001
From: Ke Tran <ketranmanh@gmail.com>
Date: Wed, 9 Nov 2016 01:33:45 +0100
Subject: [PATCH 4/4] eve

---
 eve.lua | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/eve.lua b/eve.lua
index 6e8c131..6a67dbe 100644
--- a/eve.lua
+++ b/eve.lua
@@ -64,7 +64,6 @@ function optim.eve(opfunc, x, config, state)
     state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
 
     state.denom = state.denom or x.new(dfdx:size()):zero()
-    state.denom:copy(state.v)
 
     if state.t == 1 then
         state.d = 1
@@ -87,11 +86,11 @@ function optim.eve(opfunc, x, config, state)
 
     local biasCorrection1 = 1 - beta1^state.t
     local biasCorrection2 = 1 - beta2^state.t
-    local stepSize = clr * state.d/math.sqrt(biasCorrection2) * biasCorrection1
-    state.denom:sqrt():add(eps)
+    local alpha = clr * math.sqrt(biasCorrection2) / biasCorrection1 / state.d
+    state.denom:copy(state.v):sqrt():add(eps)
 
     -- (4) update x
-    x:addcdiv(-stepSize, state.m, state.denom)
+    x:addcdiv(-alpha, state.m, state.denom)
 
     -- return x*, f(x) before optimization
     return x, {fx}