
Commit

First pass optimisation routines. L-BFGS and gradient descent seem to work on simple test functions.
Ati Sharma committed Jun 25, 2020
1 parent 54b0053 commit 44a89d3
Showing 1 changed file with 171 additions and 111 deletions.
282 changes: 171 additions & 111 deletions src/matlib/optim.clj
@@ -1,5 +1,6 @@
(ns matlib.optim
"Various optimisation algorithms.
L-BFGS and gradient descent are implemented.
[NW-06]
Numerical Optimization (second Ed.)
@@ -15,8 +16,7 @@
Mathematical Programming Vol. 45, pp. 503-528 (1989).
"
(:require
[matlib.core :refer [eps sq-eps shift-update dge-eye last-cols col-vector take-cols drop-cols]]
[data.plot :as plot]
[matlib.core :refer [eps sq-eps shift-update dge-eye last-cols col-vector ones take-cols drop-cols]]
[uncomplicate.neanderthal
[core :refer [transfer! copy! copy scal! scal xpy axpy axpy! dot nrm1 nrm2 nrmi sum mm dia dim col ncols mrows trans view-vctr subvector submatrix vctr?]]
[vect-math :refer [sqrt inv mul div]]
@@ -28,12 +28,19 @@
(def ip (/ (- (Math/sqrt 5.0) 1.0) 2.0))
(def ip2 (/ (- 3.0 (Math/sqrt 5.0)) 2.0))

(defn- tolerance
"Default tolerance for iterative functions, `sqrt eps * |x|²`."
[x]
(if (vctr? x)
(* sq-eps (max 1.0 (nrm2 x)))
(* sq-eps (max 1.0 (Math/abs x)))))
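
;; A minimal sketch of the defaults, assuming `sq-eps` from matlib.core is
;; the square root of machine epsilon (≈ 1.49e-8):
(comment
  (tolerance 0.5)       ; ≈ 1.49e-8, since max(1, |0.5|) = 1
  (tolerance (dv 3 4))) ; ≈ 7.45e-8, since max(1, ‖(3 4)‖₂) = 5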

(defn vctr-grad
"Gradient of `f` at `x`, approximated by central finite-difference,
where `x` is a Neanderthal vctr.
`f: ℝⁿ -> ℝ`."
([f x]
(vctr-grad f x (* sq-eps (max 1.0 (nrm2 x)))))
(vctr-grad f x (tolerance x)))
([f x h]
(let [df_dx (dv (repeat (dim x) 0))]
(doseq [i (range (dim x))]
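
;; A usage sketch: each component of `x` is perturbed by `h` in turn, so `f`
;; is evaluated 2n times. For f(x) = xᵀx the gradient is 2x:
(comment
  (vctr-grad #(dot % %) (dv 1 2 3))) ; ≈ (dv 2 4 6)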
@@ -49,7 +56,7 @@
"Approximate gradient of a function `phi` by forward finite-difference, where
`phi: ℝ -> ℝ."
([phi ^double x]
(scalar-grad phi x (* sq-eps (max 1.0 (Math/abs x)))))
(scalar-grad phi x (tolerance x)))
([phi ^double x ^double h]
(let [x+ (+ x h)
x- (- x h)]
@@ -66,170 +73,223 @@
"Return double `x` that minimises differentiable `phi(x)` using golden mean search.
If bounds `x-` and `x+` are not given, they are assumed to be +-1e10."
([phi]
(golden-section phi -1e10 1e10 {}))
([phi x- x+ args]
(golden-section phi -1e10 1e10))
([phi x- x+ & args]
(let [a (min x- x+)
b (max x- x+)
{:keys [h c d fc fd tol] :or {h (- b a)
c (+ a (* ip2 h))
d (+ a (* ip h))
fc (phi c)
fd (phi d)
tol sq-eps}} args]
(cond (<= h tol) a
(< fc fd) (recur phi a d {:h (* h ip) :d c :fd fc :tol tol})
(>= fc fd) (recur phi c b {:h (* h ip) :c d :fc fd :tol tol})))))

(defn optimal?
"Check for local optimality, using infinity norm of the gradient."
[f x tol]
(< (nrmi (grad f x)) tol))
{:keys [h c d fc fd k] :or {h (- b a)
c (+ a (* ip2 h))
d (+ a (* ip h))
fc (phi c)
fd (phi d)
k 0}} args]
(cond (<= h (tolerance x+)) {:sol (/ (+ a b) 2.0) :iterations k}
(< fc fd) (recur phi a d {:h (* h ip) :d c :fd fc :k (inc k)})
(>= fc fd) (recur phi c b {:h (* h ip) :c d :fc fd :k (inc k)})))))

(defn- zoom
"Algorithm 3.6 of [NW-06] using bisection.
`phi` phi: ℝ -> ℝ, usually `phi(a) = f(x + ap)`
`a-` lower bound on `a`
`a+` upper bound on `a`
`phi0`, `phi'0` the values of `phi(0)` and `phi'(0)`
`0 < c1 < c2 < 1`.
[phi a- a+ c1 c2 i]
(let [max-iter 20
aj (/ (+ a+ a-) 2.0)
phiaj (phi aj)
phi'0 (scalar-grad phi 0.0)
phi'aj (scalar-grad phi aj)]
(cond (> i max-iter) aj
(> phiaj (+ (phi 0.0) (* c1 aj phi'0))) (recur phi a- aj c1 c2 (inc i))
(>= phiaj (phi a-)) (recur phi a- aj c1 c2 (inc i))
(> (Math/abs phi'aj) (- (* c2 phi'0))) aj
(>= (* phi'aj (- a+ a-)) 0.0) (recur phi aj a- c1 c2 (inc i))
:else (recur phi aj a+ c1 c2 (inc i)))))
[phi a- a+ phi0 phi'0 c1 c2 i]
(let [maxiter 20
a (/ (+ a+ a-) 2.0)
phia (phi a)
phi'a (scalar-grad phi a)]
(cond (> i maxiter) {:sol a :success false}
(> phia (+ phi0 (* c1 a phi'0))) (recur phi a- a phi0 phi'0 c1 c2 (inc i))
(>= phia (phi a-)) (recur phi a- a phi0 phi'0 c1 c2 (inc i))
(<= (Math/abs phi'a) (* -1 c2 phi'0)) {:sol a :success true}
(>= (* phi'a (- a+ a-)) 0.0) (recur phi a a- phi0 phi'0 c1 c2 (inc i))
:else (recur phi a a+ phi0 phi'0 c1 c2 (inc i)))))
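
;; Bisection keeps the invariant of Alg. 3.6: the interval between `a-` and
;; `a+` always brackets a step length satisfying the strong Wolfe conditions,
;; so halving it must eventually trigger one of the acceptance tests above.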

(defn- wolfe
(defn wolfe
"Perform a line search to find a step length `a` satisfying the strong Wolfe
conditions.
`phi` function of a scalar step length `a`.
`:a` starting value (1.0). Returns `{:sol a :success <bool>}`.
See p.60 [NW-06]."
([phi]
(wolfe phi 1.0 0.0 1))
([phi a_i a_i-1 i]
(let [c1 1e-4 ; from [NW-06]
c2 0.9 ; from [NW-06]
a_max 1.0
maxiter 20
phi_a_i (phi a_i)
phi'a_i (scalar-grad phi a_i)
phi'0 (scalar-grad phi 0)
phi_a_i-1 (phi a_i-1)]
(cond (> i maxiter) a_i
(> phi_a_i (+ (phi 0) (* c1 a_i phi'0))) (zoom phi a_i-1 a_i c1 c2 0)
(and (> phi_a_i phi_a_i-1) (> i 1)) (zoom phi a_i-1 a_i c1 c2 0)
(<= (Math/abs phi'a_i) (* -1 c2 phi'0)) a_i
(>= phi'a_i 0) (zoom phi a_i a_i-1 c1 c2 0)
:else (recur phi (/ (+ a_i a_max) 2.0) a_i (inc i))))))
([phi & options]
(let [{:keys [a a- amax maxiter k phi0 phi'0 c1 c2]
:or {a 1.0
a- 0.0
amax 2.0
maxiter 20
k 0
phi0 (phi 0)
phi'0 (scalar-grad phi 0)
c1 1e-4
c2 0.9}} options
phi_a (phi a)
phi'a (scalar-grad phi a)
phi_a- (phi a-)]
(cond (> k maxiter) {:sol a :success false}
(> phi_a (+ phi0 (* c1 a phi'0))) (zoom phi a- a phi0 phi'0 c1 c2 0)
(and (> phi_a phi_a-) (> k 1)) (zoom phi a- a phi0 phi'0 c1 c2 0)
(<= (Math/abs phi'a) (* -1 c2 phi'0)) {:sol a :success true}
(>= phi'a 0.0) (zoom phi a a- phi0 phi'0 c1 c2 0)
:else (recur phi {:a (/ (+ a amax) 2.0)
:a- a
:amax amax
:maxiter maxiter
:k (inc k)
:phi0 phi0
:phi'0 phi'0
:c1 c1
:c2 c2})))))
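
;; The strong Wolfe conditions tested above, for a descent direction and
;; 0 < c1 < c2 < 1:
;;   sufficient decrease:  phi(a) ≤ phi(0) + c1·a·phi'(0)
;;   curvature:            |phi'(a)| ≤ c2·|phi'(0)|
;; A usage sketch along a direction `p` from a point `x`, as in
;; `gradient-descent` below:
(comment
  (:sol (wolfe #(f (axpy % p x)))))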

(defn backtrack
"Backtracking line search."
([phi] (backtrack phi 5.0 0))
([phi a j]
(let [tau 0.5
c 0.5
m (scalar-grad phi a)
t (* -1 c m)]
(if (or (< (- (phi 0) (phi a)) (* a t)) (< a 1e-8))
a
(recur phi (* tau a) j)))))
([phi & options]
(let [{:keys [a maxiter k tau c phi0 t]
:or {a 10.0
maxiter 1000
k 0
tau 0.5
c 0.5
phi0 (phi 0.0)
t (* -1 c (scalar-grad phi 0))}} options
armijo (>= (- phi0 (phi a)) (* a t))]
(cond armijo {:sol a :success true :iterations k}
(> k maxiter) {:sol a :success false :iterations k}
:else (recur phi {:a (* tau a)
:maxiter maxiter
:k (inc k)
:tau tau
:c c
:phi0 phi0
:t t})))))
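
;; A sketch with the defaults (a = 10, tau = 0.5, c = 0.5): for
;; phi(a) = (a - 1)², t ≈ 1 and the Armijo test phi(0) - phi(a) ≥ a·t
;; first holds after four halvings of the step:
(comment
  (backtrack #(Math/pow (- % 1.0) 2))) ; => {:sol 0.625 :success true :iterations 4}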

(defn gradient-descent
"Gradient descent with line search.
`f` function to be solved, `f: ℝⁿ -> ℝ`.
`x` initial solution guess
options (default):
`:tol` solution converges when `(nrm1 (grad f x)) < tol` (`sqrt eps * max(1, |x₀|)`)
`:maxiter` maximum iterations (1000)
`:output` print progress every iteration (`false`)
`:lsmethod` line-search method for step length, one of `:wolfe`, `:gs`, `:backtrack` (`:wolfe`)
"
([f x & options]
(let [{:keys [tol maxiter output k lsmethod]
:or {tol (tolerance x)
maxiter 1000
output false
k 0
lsmethod :wolfe}} options
linesearch (get {:wolfe wolfe :gs golden-section :backtrack backtrack} lsmethod wolfe)
g (vctr-grad f x)
q (scal -1 g)
a (:sol (linesearch #(f (axpy % q x))))
x+ (axpy a q x)
success (< (nrm1 g) tol)]
(when output
(when (= k 0) (print options "\n"))
(printf "k: %05d\ta: %8.5f\t|grad| %10.3f\t|dx|: %10.3f\t|f(x+)|: %10.4f\n"
k a (nrmi g) (nrm2 (axpy -1 x x+)) (f x+)))
(cond success (merge options {:sol x+ :f (f x+) :grad g :iterations k :success true :lsmethod lsmethod})
(> k maxiter) (merge options {:sol x+ :f (f x+) :grad g :iterations k :success false :lsmethod lsmethod})
:else (recur f x+ {:tol tol :maxiter maxiter :output output :k (inc k) :lsmethod lsmethod})))))
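
;; A usage sketch, assuming Clojure ≥ 1.11 so the trailing keyword options
;; destructure into the options map. With `f` the Booth test function at the
;; end of this file, the result should be near {:sol (1, 3) :success true}:
(comment
  (gradient-descent f (dv 0 0) :maxiter 500 :lsmethod :backtrack))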

(defn- alg-7-4
"Two-loop recursion in L-BFGS to calculate product of Hessian with gradient.
"Two-loop recursion in L-BFGS to calculate descent direction as the
product of Hessian with the gradient.
`S` is a matrix whose columns are `sᵢ`
`Y` is a matrix whose columns are `yᵢ`
`q` is the supplied gradient of `f` at `xₖ`.
See Algorithm 7.4 of [NW-06]."
[S Y q k]
(assert (= (ncols S) (ncols Y)))
(assert (= (mrows S) (mrows Y) (dim q)))
; i = k-1, ..., k-m
(let [c (ncols S)
m (min c k)
is (range (- c m) c) ; k-m, ..., k-1
rhos (dv (repeat c 0.0))
as (copy rhos)
sl (last-cols S)
yl (last-cols Y)
irhos (dv (repeat c 0.0))
as (dv (repeat c 0.0))
sl (col S (- c 1))
yl (col Y (- c 1))
yl2 (dot yl yl)
gamma (/ (dot yl sl) (max yl2 1e-2))
gamma (/ (dot yl sl) yl2)
r (dv (repeat (dim q) 0.0))]
; i = k-1, ... k-m
(doseq [i (reverse is)]
(let [s (col S i)
y (col Y i)
rho (/ 1.0 (dot y s))
a (* rho (dot s q))]
; update as, rhos, q in-place
(entry! rhos i rho)
irho (dot y s)
a (/ (dot s q) irho)]
; update as, 1/rhos, q in-place
(entry! irhos i irho)
(entry! as i a)
(axpy! (- a) y q)))
(copy! (scal gamma q) r) ; r = H⁰_k q, with H⁰_k = gamma I
; i = k-m, ..., k-1
(doseq [i is]
(let [rho (entry rhos i)
(let [irho (entry irhos i)
a (entry as i)
s (col S i)
y (col Y i)
b (* rho (dot y r))]
(axpy! (- b a) s r)))
r))
b (/ (dot y r) irho)]
(axpy! (- a b) s r)))
r))
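
;; In the standard form of the recursion (Algorithm 7.4, [NW-06]), with
;; ρᵢ = 1/(yᵢᵀsᵢ) and γ = (yᵀs)/(yᵀy) taken from the newest pair:
;;   first loop, newest to oldest:   aᵢ = ρᵢ sᵢᵀq ;  q ← q - aᵢ yᵢ
;;   scaling:                        r ← γ q
;;   second loop, oldest to newest:  b = ρᵢ yᵢᵀr ;  r ← r + (aᵢ - b) sᵢ
;; which yields r = Hₖ∇f for the approximate inverse Hessian Hₖ. Here `Y`
;; stores ∇f(x) - ∇f(x+) (the negative of the usual yᵢ, see alg-7-5 below),
;; which flips the sign of γ and hence of r, so the value returned is
;; -Hₖ∇f: a descent direction that alg-7-5 can use directly.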

(defn- alg-7-5
"Main L-BFGS loop. See Algorithm 7.5 of [NW-06]."
([f x S Y k tol maxiter]
"Compute L-BFGS descent direction. See Algorithm 7.5 of [NW-06]."
([f x S Y k linesearch tol maxiter output]
(let [q (vctr-grad f x)
p (alg-7-4 S Y q k)
;a (backtrack #(f (axpy % p x)))
;a (wolfe #(f (axpy % p x)))
a (golden-section #(f (axpy % p x)))
x+ (axpy (- a) p x)
; search direction, start off downhill
p (if (= 0 k) (scal -1 q) (alg-7-4 S Y q k))
a (:sol (linesearch #(f (axpy % p x))))
x+ (axpy a p x)
s (axpy -1 x x+)
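; note: y is stored as ∇f(x) - ∇f(x+), the negative of the usual yₖ;
; with this convention alg-7-4 above returns a descent direction directly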
y (axpy -1 (grad f x+) (grad f x))
X (dge (dim x) maxiter)]
X (dge (dim x) maxiter)
success (< (nrmi q) tol)]
(when output
(printf "k: %5d\ta: %8.5f\t|q| %10.3f\tp.s: %10.3f\t|f(x+)|: %10.4f\n"
k a (nrmi q) (dot p s) (f x+)))
(shift-update S (col-vector s))
(shift-update Y (col-vector y))
(shift-update X (col-vector x))
(printf "k: %05d\ta: %8.5f\t|s|: %10.3f\t|y|: %10.3f\t|q| %10.3f\t|p|: %10.3f\t|f(x+)|: %10.4f\n"
k a (nrm2 s) (nrm2 y) (nrmi q) (nrm2 p) (f x+))
;(print "p, s: " p s "\n")
(cond (< (nrmi q) tol) {:sol x+ :k k :X X :S S :Y Y :success true}
(> k maxiter) {:sol x+ :k k :X X :S S :Y Y :success false}
:else (recur f x+ S Y (inc k) tol maxiter)))))
(cond success {:sol x+ :f (f x+) :iterations k :tol tol :maxiter maxiter :output output :X X :S S :Y Y :success true}
(> k maxiter) {:sol x+ :f (f x+) :iterations k :tol tol :maxiter maxiter :output output :X X :S S :Y Y :success false}
:else (recur f x+ S Y (inc k) linesearch tol maxiter output)))))

(defn l-bfgs
"A pure Neanderthal implementation of L-BFGS, using finite-difference gradient
approximation.
"L-BFGS, using finite-difference gradient approximation.
`f` function to be solved, `f: ℝⁿ -> ℝ`.
`x` initial solution guess
`m` number of last iterations to store."
([f x m & options]
(let [{:keys [tol maxiter] :or {tol (* sq-eps (nrm2 x))}, maxiter 100} options
X (rand-normal! (dge (dim x) (+ m 1))) ; some random states
S (axpy -1.0 (submatrix X (dim x) m) (submatrix X 0 1 (dim x) m))
options (default):
`:m` number of last iterations to store for approximate Hessian (20)
`:tol` solution converges when `(nrm1 (grad f x)) < tol` (`sqrt eps * max(1, |x₀|)`)
`:maxiter` maximum iterations (1000)
`:output` print progress every iteration (`false`)
`:lsmethod` line-search method for step length, one of `:wolfe`, `:gs`, `:backtrack` (`:wolfe`)
"
([f x & options]
(let [{:keys [tol maxiter output m lsmethod]
:or {tol (tolerance x)
maxiter 1000
output false
m 20
lsmethod :wolfe}} options
linesearch (get {:wolfe wolfe :gs golden-section :backtrack backtrack} lsmethod wolfe)
S (dge (dim x) m)
Y (dge (dim x) m)]
(printf "tol %f\tmaxiter: %d\n" tol maxiter)
(print "\t\ta: steplength\ts: step\t\ty: ddf/dx\tq: df/dx\tp: search dirn\n")
(doseq [i (range m)]
(copy!
(axpy -1
(vctr-grad f (col X i))
(vctr-grad f (col X (+ i 1))))
(col Y i)))
(alg-7-5 f (copy x) S Y 0 tol maxiter))))

(defn f
"A test function f: ℝⁿ -> ℝ."
[x]
(+ 10.0 (nrm2 x)))
(when output
(print options "\n")
(print "\t\ta: steplength\ts: step\t\ty: ddf/dx\tq: df/dx\tp: search dirn\n"))
(merge (alg-7-5 f (copy x) S Y 0 linesearch tol maxiter output) {:lsmethod lsmethod}))))

(defn- f
"Booth function f: ℝⁿ -> ℝ, minimum at f(1, 3) = 0."
[v]
(let [x (entry v 0)
y (entry v 1)]
(+ (Math/pow (+ x y y -7), 2)
(Math/pow (+ x x y -5), 2))))
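
;; A usage sketch (keyword options as in `gradient-descent`): minimising the
;; Booth function from the origin should converge near (1, 3) with `f` ≈ 0
;; in a handful of iterations:
(comment
  (l-bfgs f (dv 0 0))
  (l-bfgs f (dv 0 0) :m 5 :lsmethod :gs :output true))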

(defn make-phi
(defn- make-phi
"A test function phi(a) = f(x + ap)."
[f x p]
#(f (axpy % p x)))
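
;; `make-phi` restricts `f` to the ray through `x` along `p`, which is the
;; one-dimensional function the line searches above expect:
(comment
  (let [x (dv 0 0)
        p (scal -1 (vctr-grad f x))] ; steepest-descent direction
    (:sol (wolfe (make-phi f x p)))))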
