433 lines
15 KiB
Python
433 lines
15 KiB
Python
|
"""Nearly exact trust-region optimization subproblem."""
|
||
|
from __future__ import division, print_function, absolute_import
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy.linalg import (norm, get_lapack_funcs, solve_triangular,
|
||
|
cho_solve)
|
||
|
from ._trustregion import (_minimize_trust_region, BaseQuadraticSubproblem)
|
||
|
|
||
|
__all__ = ['_minimize_trustregion_exact',
|
||
|
'estimate_smallest_singular_value',
|
||
|
'singular_leading_submatrix',
|
||
|
'IterativeSubproblem']
|
||
|
|
||
|
|
||
|
def _minimize_trustregion_exact(fun, x0, args=(), jac=None, hess=None,
|
||
|
**trust_region_options):
|
||
|
"""
|
||
|
Minimization of scalar function of one or more variables using
|
||
|
a nearly exact trust-region algorithm.
|
||
|
|
||
|
Options
|
||
|
-------
|
||
|
initial_tr_radius : float
|
||
|
Initial trust-region radius.
|
||
|
max_tr_radius : float
|
||
|
Maximum value of the trust-region radius. No steps that are longer
|
||
|
than this value will be proposed.
|
||
|
eta : float
|
||
|
Trust region related acceptance stringency for proposed steps.
|
||
|
gtol : float
|
||
|
Gradient norm must be less than ``gtol`` before successful
|
||
|
termination.
|
||
|
"""
|
||
|
|
||
|
if jac is None:
|
||
|
raise ValueError('Jacobian is required for trust region '
|
||
|
'exact minimization.')
|
||
|
if hess is None:
|
||
|
raise ValueError('Hessian matrix is required for trust region '
|
||
|
'exact minimization.')
|
||
|
return _minimize_trust_region(fun, x0, args=args, jac=jac, hess=hess,
|
||
|
subproblem=IterativeSubproblem,
|
||
|
**trust_region_options)
|
||
|
|
||
|
|
||
|
def estimate_smallest_singular_value(U):
|
||
|
"""Given upper triangular matrix ``U`` estimate the smallest singular
|
||
|
value and the correspondent right singular vector in O(n**2) operations.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
U : ndarray
|
||
|
Square upper triangular matrix.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
s_min : float
|
||
|
Estimated smallest singular value of the provided matrix.
|
||
|
z_min : ndarray
|
||
|
Estimatied right singular vector.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The procedure is based on [1]_ and is done in two steps. First it finds
|
||
|
a vector ``e`` with components selected from {+1, -1} such that the
|
||
|
solution ``w`` from the system ``U.T w = e`` is as large as possible.
|
||
|
Next it estimate ``U v = w``. The smallest singular value is close
|
||
|
to ``norm(w)/norm(v)`` and the right singular vector is close
|
||
|
to ``v/norm(v)``.
|
||
|
|
||
|
The estimation will be better more ill-conditioned is the matrix.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] Cline, A. K., Moler, C. B., Stewart, G. W., Wilkinson, J. H.
|
||
|
An estimate for the condition number of a matrix. 1979.
|
||
|
SIAM Journal on Numerical Analysis, 16(2), 368-375.
|
||
|
"""
|
||
|
|
||
|
U = np.atleast_2d(U)
|
||
|
m, n = U.shape
|
||
|
|
||
|
if m != n:
|
||
|
raise ValueError("A square triangular matrix should be provided.")
|
||
|
|
||
|
# A vector `e` with components selected from {+1, -1}
|
||
|
# is selected so that the solution `w` to the system
|
||
|
# `U.T w = e` is as large as possible. Implementation
|
||
|
# based on algorithm 3.5.1, p. 142, from reference [2]
|
||
|
# adapted for lower triangular matrix.
|
||
|
|
||
|
p = np.zeros(n)
|
||
|
w = np.empty(n)
|
||
|
|
||
|
# Implemented according to: Golub, G. H., Van Loan, C. F. (2013).
|
||
|
# "Matrix computations". Forth Edition. JHU press. pp. 140-142.
|
||
|
for k in range(n):
|
||
|
wp = (1-p[k]) / U.T[k, k]
|
||
|
wm = (-1-p[k]) / U.T[k, k]
|
||
|
pp = p[k+1:] + U.T[k+1:, k]*wp
|
||
|
pm = p[k+1:] + U.T[k+1:, k]*wm
|
||
|
|
||
|
if abs(wp) + norm(pp, 1) >= abs(wm) + norm(pm, 1):
|
||
|
w[k] = wp
|
||
|
p[k+1:] = pp
|
||
|
else:
|
||
|
w[k] = wm
|
||
|
p[k+1:] = pm
|
||
|
|
||
|
# The system `U v = w` is solved using backward substitution.
|
||
|
v = solve_triangular(U, w)
|
||
|
|
||
|
v_norm = norm(v)
|
||
|
w_norm = norm(w)
|
||
|
|
||
|
# Smallest singular value
|
||
|
s_min = w_norm / v_norm
|
||
|
|
||
|
# Associated vector
|
||
|
z_min = v / v_norm
|
||
|
|
||
|
return s_min, z_min
|
||
|
|
||
|
|
||
|
def gershgorin_bounds(H):
|
||
|
"""
|
||
|
Given a square matrix ``H`` compute upper
|
||
|
and lower bounds for its eigenvalues (Gregoshgorin Bounds).
|
||
|
Defined ref. [1].
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] Conn, A. R., Gould, N. I., & Toint, P. L.
|
||
|
Trust region methods. 2000. Siam. pp. 19.
|
||
|
"""
|
||
|
|
||
|
H_diag = np.diag(H)
|
||
|
H_diag_abs = np.abs(H_diag)
|
||
|
H_row_sums = np.sum(np.abs(H), axis=1)
|
||
|
lb = np.min(H_diag + H_diag_abs - H_row_sums)
|
||
|
ub = np.max(H_diag - H_diag_abs + H_row_sums)
|
||
|
|
||
|
return lb, ub
|
||
|
|
||
|
|
||
|
def singular_leading_submatrix(A, U, k):
|
||
|
"""
|
||
|
Compute term that makes the leading ``k`` by ``k``
|
||
|
submatrix from ``A`` singular.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
A : ndarray
|
||
|
Symmetric matrix that is not positive definite.
|
||
|
U : ndarray
|
||
|
Upper triangular matrix resulting of an incomplete
|
||
|
Cholesky decomposition of matrix ``A``.
|
||
|
k : int
|
||
|
Positive integer such that the leading k by k submatrix from
|
||
|
`A` is the first non-positive definite leading submatrix.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
delta : float
|
||
|
Amount that should be added to the element (k, k) of the
|
||
|
leading k by k submatrix of ``A`` to make it singular.
|
||
|
v : ndarray
|
||
|
A vector such that ``v.T B v = 0``. Where B is the matrix A after
|
||
|
``delta`` is added to its element (k, k).
|
||
|
"""
|
||
|
|
||
|
# Compute delta
|
||
|
delta = np.sum(U[:k-1, k-1]**2) - A[k-1, k-1]
|
||
|
|
||
|
n = len(A)
|
||
|
|
||
|
# Inicialize v
|
||
|
v = np.zeros(n)
|
||
|
v[k-1] = 1
|
||
|
|
||
|
# Compute the remaining values of v by solving a triangular system.
|
||
|
if k != 1:
|
||
|
v[:k-1] = solve_triangular(U[:k-1, :k-1], -U[:k-1, k-1])
|
||
|
|
||
|
return delta, v
|
||
|
|
||
|
|
||
|
class IterativeSubproblem(BaseQuadraticSubproblem):
|
||
|
"""Quadratic subproblem solved by nearly exact iterative method.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
This subproblem solver was based on [1]_, [2]_ and [3]_,
|
||
|
which implement similar algorithms. The algorithm is basically
|
||
|
that of [1]_ but ideas from [2]_ and [3]_ were also used.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] A.R. Conn, N.I. Gould, and P.L. Toint, "Trust region methods",
|
||
|
Siam, pp. 169-200, 2000.
|
||
|
.. [2] J. Nocedal and S. Wright, "Numerical optimization",
|
||
|
Springer Science & Business Media. pp. 83-91, 2006.
|
||
|
.. [3] J.J. More and D.C. Sorensen, "Computing a trust region step",
|
||
|
SIAM Journal on Scientific and Statistical Computing, vol. 4(3),
|
||
|
pp. 553-572, 1983.
|
||
|
"""
|
||
|
|
||
|
# UPDATE_COEFF appears in reference [1]_
|
||
|
# in formula 7.3.14 (p. 190) named as "theta".
|
||
|
# As recommended there it value is fixed in 0.01.
|
||
|
UPDATE_COEFF = 0.01
|
||
|
|
||
|
EPS = np.finfo(float).eps
|
||
|
|
||
|
def __init__(self, x, fun, jac, hess, hessp=None,
|
||
|
k_easy=0.1, k_hard=0.2):
|
||
|
|
||
|
super(IterativeSubproblem, self).__init__(x, fun, jac, hess)
|
||
|
|
||
|
# When the trust-region shrinks in two consecutive
|
||
|
# calculations (``tr_radius < previous_tr_radius``)
|
||
|
# the lower bound ``lambda_lb`` may be reused,
|
||
|
# facilitating the convergence. To indicate no
|
||
|
# previous value is known at first ``previous_tr_radius``
|
||
|
# is set to -1 and ``lambda_lb`` to None.
|
||
|
self.previous_tr_radius = -1
|
||
|
self.lambda_lb = None
|
||
|
|
||
|
self.niter = 0
|
||
|
|
||
|
# ``k_easy`` and ``k_hard`` are parameters used
|
||
|
# to determine the stop criteria to the iterative
|
||
|
# subproblem solver. Take a look at pp. 194-197
|
||
|
# from reference _[1] for a more detailed description.
|
||
|
self.k_easy = k_easy
|
||
|
self.k_hard = k_hard
|
||
|
|
||
|
# Get Lapack function for cholesky decomposition.
|
||
|
# The implemented Scipy wrapper does not return
|
||
|
# the incomplete factorization needed by the method.
|
||
|
self.cholesky, = get_lapack_funcs(('potrf',), (self.hess,))
|
||
|
|
||
|
# Get info about Hessian
|
||
|
self.dimension = len(self.hess)
|
||
|
self.hess_gershgorin_lb,\
|
||
|
self.hess_gershgorin_ub = gershgorin_bounds(self.hess)
|
||
|
self.hess_inf = norm(self.hess, np.Inf)
|
||
|
self.hess_fro = norm(self.hess, 'fro')
|
||
|
|
||
|
# A constant such that for vectors smaler than that
|
||
|
# backward substituition is not reliable. It was stabilished
|
||
|
# based on Golub, G. H., Van Loan, C. F. (2013).
|
||
|
# "Matrix computations". Forth Edition. JHU press., p.165.
|
||
|
self.CLOSE_TO_ZERO = self.dimension * self.EPS * self.hess_inf
|
||
|
|
||
|
def _initial_values(self, tr_radius):
|
||
|
"""Given a trust radius, return a good initial guess for
|
||
|
the damping factor, the lower bound and the upper bound.
|
||
|
The values were chosen accordingly to the guidelines on
|
||
|
section 7.3.8 (p. 192) from [1]_.
|
||
|
"""
|
||
|
|
||
|
# Upper bound for the damping factor
|
||
|
lambda_ub = max(0, self.jac_mag/tr_radius + min(-self.hess_gershgorin_lb,
|
||
|
self.hess_fro,
|
||
|
self.hess_inf))
|
||
|
|
||
|
# Lower bound for the damping factor
|
||
|
lambda_lb = max(0, -min(self.hess.diagonal()),
|
||
|
self.jac_mag/tr_radius - min(self.hess_gershgorin_ub,
|
||
|
self.hess_fro,
|
||
|
self.hess_inf))
|
||
|
|
||
|
# Improve bounds with previous info
|
||
|
if tr_radius < self.previous_tr_radius:
|
||
|
lambda_lb = max(self.lambda_lb, lambda_lb)
|
||
|
|
||
|
# Initial guess for the damping factor
|
||
|
if lambda_lb == 0:
|
||
|
lambda_initial = 0
|
||
|
else:
|
||
|
lambda_initial = max(np.sqrt(lambda_lb * lambda_ub),
|
||
|
lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
|
||
|
|
||
|
return lambda_initial, lambda_lb, lambda_ub
|
||
|
|
||
|
def solve(self, tr_radius):
|
||
|
"""Solve quadratic subproblem"""
|
||
|
|
||
|
lambda_current, lambda_lb, lambda_ub = self._initial_values(tr_radius)
|
||
|
n = self.dimension
|
||
|
hits_boundary = True
|
||
|
already_factorized = False
|
||
|
self.niter = 0
|
||
|
|
||
|
while True:
|
||
|
|
||
|
# Compute Cholesky factorization
|
||
|
if already_factorized:
|
||
|
already_factorized = False
|
||
|
else:
|
||
|
H = self.hess+lambda_current*np.eye(n)
|
||
|
U, info = self.cholesky(H, lower=False,
|
||
|
overwrite_a=False,
|
||
|
clean=True)
|
||
|
|
||
|
self.niter += 1
|
||
|
|
||
|
# Check if factorization succeeded
|
||
|
if info == 0 and self.jac_mag > self.CLOSE_TO_ZERO:
|
||
|
# Successful factorization
|
||
|
|
||
|
# Solve `U.T U p = s`
|
||
|
p = cho_solve((U, False), -self.jac)
|
||
|
|
||
|
p_norm = norm(p)
|
||
|
|
||
|
# Check for interior convergence
|
||
|
if p_norm <= tr_radius and lambda_current == 0:
|
||
|
hits_boundary = False
|
||
|
break
|
||
|
|
||
|
# Solve `U.T w = p`
|
||
|
w = solve_triangular(U, p, trans='T')
|
||
|
|
||
|
w_norm = norm(w)
|
||
|
|
||
|
# Compute Newton step accordingly to
|
||
|
# formula (4.44) p.87 from ref [2]_.
|
||
|
delta_lambda = (p_norm/w_norm)**2 * (p_norm-tr_radius)/tr_radius
|
||
|
lambda_new = lambda_current + delta_lambda
|
||
|
|
||
|
if p_norm < tr_radius: # Inside boundary
|
||
|
s_min, z_min = estimate_smallest_singular_value(U)
|
||
|
|
||
|
ta, tb = self.get_boundaries_intersections(p, z_min,
|
||
|
tr_radius)
|
||
|
|
||
|
# Choose `step_len` with the smallest magnitude.
|
||
|
# The reason for this choice is explained at
|
||
|
# ref [3]_, p. 6 (Immediately before the formula
|
||
|
# for `tau`).
|
||
|
step_len = min([ta, tb], key=abs)
|
||
|
|
||
|
# Compute the quadratic term (p.T*H*p)
|
||
|
quadratic_term = np.dot(p, np.dot(H, p))
|
||
|
|
||
|
# Check stop criteria
|
||
|
relative_error = (step_len**2 * s_min**2) / (quadratic_term + lambda_current*tr_radius**2)
|
||
|
if relative_error <= self.k_hard:
|
||
|
p += step_len * z_min
|
||
|
break
|
||
|
|
||
|
# Update uncertanty bounds
|
||
|
lambda_ub = lambda_current
|
||
|
lambda_lb = max(lambda_lb, lambda_current - s_min**2)
|
||
|
|
||
|
# Compute Cholesky factorization
|
||
|
H = self.hess + lambda_new*np.eye(n)
|
||
|
c, info = self.cholesky(H, lower=False,
|
||
|
overwrite_a=False,
|
||
|
clean=True)
|
||
|
|
||
|
# Check if the factorization have succeeded
|
||
|
#
|
||
|
if info == 0: # Successful factorization
|
||
|
# Update damping factor
|
||
|
lambda_current = lambda_new
|
||
|
already_factorized = True
|
||
|
else: # Unsuccessful factorization
|
||
|
# Update uncertanty bounds
|
||
|
lambda_lb = max(lambda_lb, lambda_new)
|
||
|
|
||
|
# Update damping factor
|
||
|
lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
|
||
|
lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
|
||
|
|
||
|
else: # Outside boundary
|
||
|
# Check stop criteria
|
||
|
relative_error = abs(p_norm - tr_radius) / tr_radius
|
||
|
if relative_error <= self.k_easy:
|
||
|
break
|
||
|
|
||
|
# Update uncertanty bounds
|
||
|
lambda_lb = lambda_current
|
||
|
|
||
|
# Update damping factor
|
||
|
lambda_current = lambda_new
|
||
|
|
||
|
elif info == 0 and self.jac_mag <= self.CLOSE_TO_ZERO:
|
||
|
# jac_mag very close to zero
|
||
|
|
||
|
# Check for interior convergence
|
||
|
if lambda_current == 0:
|
||
|
p = np.zeros(n)
|
||
|
hits_boundary = False
|
||
|
break
|
||
|
|
||
|
s_min, z_min = estimate_smallest_singular_value(U)
|
||
|
step_len = tr_radius
|
||
|
|
||
|
# Check stop criteria
|
||
|
if step_len**2 * s_min**2 <= self.k_hard * lambda_current * tr_radius**2:
|
||
|
p = step_len * z_min
|
||
|
break
|
||
|
|
||
|
# Update uncertanty bounds
|
||
|
lambda_ub = lambda_current
|
||
|
lambda_lb = max(lambda_lb, lambda_current - s_min**2)
|
||
|
|
||
|
# Update damping factor
|
||
|
lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
|
||
|
lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
|
||
|
|
||
|
else: # Unsuccessful factorization
|
||
|
|
||
|
# Compute auxiliary terms
|
||
|
delta, v = singular_leading_submatrix(H, U, info)
|
||
|
v_norm = norm(v)
|
||
|
|
||
|
# Update uncertanty interval
|
||
|
lambda_lb = max(lambda_lb, lambda_current + delta/v_norm**2)
|
||
|
|
||
|
# Update damping factor
|
||
|
lambda_current = max(np.sqrt(lambda_lb * lambda_ub),
|
||
|
lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
|
||
|
|
||
|
self.lambda_lb = lambda_lb
|
||
|
self.lambda_current = lambda_current
|
||
|
self.previous_tr_radius = tr_radius
|
||
|
|
||
|
return p, hits_boundary
|