# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2025 Antoine COLLET
"""
An *implicit* representation of the BFGS approximation to the Hessian matrix B
B = theta * I - W * M * W'
H = inv(B)
Classes
^^^^^^^
.. autosummary::
:toctree: _autosummary
LBFGSB_MATRICES
Functions
^^^^^^^^^
.. autosummary::
:toctree: _autosummary
bmv
form_invMfactors
update_lbfgs_matrices
update_X_and_G
Reference:
:cite:`nocedalUpdatingQuasiNewtonMatrices1980`
:cite:`byrdRepresentationsQuasiNewtonMatrices1994`
:cite:`byrdLimitedMemoryAlgorithm1995`
"""
import logging
from collections import deque
from typing import Deque, Optional, Tuple
import numpy as np
import scipy as sp
from lbfgsb._numba_helpers import njit
from lbfgsb.types import NDArrayFloat
[docs]
class LBFGSB_MATRICES:
"""
Represent the L-BFGS matrices.
Attributes
----------
S : NDArrayFloat
# shape (n, m)
Y : NDArrayFloat
# shape (n, m)
D : NDArrayFloat
# shape (m, m)
L : NDArrayFloat
# shape (m, m)
W : NDArrayFloat
# shape (m, 2m)
invMfactors: Tuple[NDArrayFloat, NDArrayFloat]
# shape (2m, 2m)
theta : float
L-BFGS float parameter (multiply the identity matrix).
"""
__slots__ = ["S", "Y", "D", "L", "W", "invMfactors", "theta"]
[docs]
def __init__(self, n: int) -> None:
"""
Initialize the instance.
Parameters
----------
n : int
Number of adjusted variables.
"""
if n < 1:
raise ValueError("n must be an integer > 0.")
self.S: NDArrayFloat = np.zeros([n, 1])
self.Y: NDArrayFloat = np.zeros([n, 1])
self.D: NDArrayFloat = np.zeros([n, 1])
self.L: NDArrayFloat = np.zeros([n, 1])
self.W: NDArrayFloat = np.zeros([n, 1])
self.invMfactors: Tuple[NDArrayFloat, NDArrayFloat] = (
np.zeros([1, 1]),
np.zeros([1, 1]),
)
self.theta: float = 1.0
@property
def use_factor(self) -> bool:
"""If the factors are null then matrix factorization will fail."""
return self.invMfactors[0].size != 1 or self.invMfactors[0][0, 0] != 0
[docs]
def bmv(
invMfactors: Tuple[NDArrayFloat, NDArrayFloat], v: NDArrayFloat
) -> NDArrayFloat:
"""
Return the product of the 2m x 2m middle matrix with a vector v.
In the compact L-BFGS formula of B and a 2m vector `v`;
it returns the product in `p`.
Parameters
----------
invMfactors : Tuple[NDArrayFloat, NDArrayFloat]
LU factorization of the inverse of the middle matrix M.
v : NDArrayFloat
Vector with size 2m.
Returns
-------
NDArrayFloat
Product of the 2m x 2m middle matrix with a vector v.
"""
# PART I: solve [ D^(1/2) O ] [ p1 ] = [ v1 ]
# [ -L*D^(-1/2) J ] [ p2 ] [ v2 ].
# sp.linalg.solve_triangular(invMfactors[0], v, lower=True)
# PART II: solve [ -D^(1/2) D^(-1/2)*L' ] [ p1 ] = [ p1 ]
# [ 0 J' ] [ p2 ] [ p2 ].
return sp.linalg.solve_triangular(
invMfactors[1],
sp.linalg.solve_triangular(invMfactors[0], v, lower=True),
lower=False,
)
@njit(cache=True)
def bmv_numba(L: NDArrayFloat, U: NDArrayFloat, v: NDArrayFloat) -> NDArrayFloat:
"""
Numba replacement for:
solve_triangular(U, solve_triangular(L, v, lower=True), lower=False)
"""
n = v.size
y = np.empty(n)
p = np.empty(n)
# Forward solve: L y = v
for i in range(n):
s = v[i]
for j in range(i):
s -= L[i, j] * y[j]
y[i] = s / L[i, i]
# Backward solve: U p = y
for i in range(n - 1, -1, -1):
s = y[i]
for j in range(i + 1, n):
s -= U[i, j] * p[j]
p[i] = s / U[i, i]
return p
[docs]
def update_lbfgs_matrices(
xk: NDArrayFloat,
gk: NDArrayFloat,
X: Deque[NDArrayFloat],
G: Deque[NDArrayFloat],
maxcor: int,
mats: LBFGSB_MATRICES,
is_force_update: bool,
eps: float = 2.2e-16,
is_check_factorization: bool = False,
is_use_numba_jit: bool = False,
) -> LBFGSB_MATRICES:
r"""
Update lists S and Y, and form the L-BFGS Hessian approximation.
Instead of storing sk and yk, we store the gradients and the parameters.
2 conditions for update
- The current step update is accepted
- The all sequence of x and g has been modified (reg case)
Parameters
----------
xk : NDArrayFloat
New x parameter.
gk : NDArrayFloat
New gradient parameter g.
X : Deque[NDArrayFloat]
List of successive parameters x.
G : Deque[NDArrayFloat]
List of successive gradients.
maxcor : int
The maximum number of variable metric corrections used to
define the limited memory matrix. (The limited memory BFGS
method does not store the full hessian but uses this many terms
in an approximation to it.)
is_force_update: bool
Whether to perform an update even if the current step update is rejected.
This is useful if the sequence of X and G has been modified during the
optimization. See TODO: add ref, for the use.
eps : float, optional
Positive stability parameter for accepting current step for updating.
By default 2.2e-16.
is_force_update: bool
This is to recompute fully the matrices.
is_check_factorization: bool
Whether to check the cholesky factorization of M. The default is False.
is_use_numba_jit: bool
Whether to use `numba` just-in-time compilation to speed-up the computation
intensive part of the algorithm. The default is False.
.. versionadded:: 1.0
Returns
-------
Tuple[NDArrayFloat, Tuple[NDArrayFloat, NDArrayFloat], float]
Updated [W, M, invMfactors, theta]
References
----------
* R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound
Constrained Optimization, (1995), SIAM Journal on Scientific and
Statistical Computing, 16, 5, pp. 1190-1208.
* C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B,
FORTRAN routines for large scale bound constrained optimization (1997),
ACM Transactions on Mathematical Software, 23, 4, pp. 550 - 560.
* J.L. Morales and J. Nocedal. L-BFGS-B: Remark on Algorithm 778: L-BFGS-B,
FORTRAN routines for large scale bound constrained optimization (2011),
ACM Transactions on Mathematical Software, 38, 1.
"""
# Case of a vector
is_current_update_accepted: bool = update_X_and_G(
xk, gk, X, G, maxcor, eps, is_use_numba_jit
)
# two conditions to update the inverse Hessian approximation
if is_force_update or is_current_update_accepted:
# yk and sk: These correction pairs contain information about the curvature of
# the
# function and, in conjunction with the BFGS formula, define the limited-memory
# iteration matrix Bk. The question is how to best represent these matrices
# without explicitly forming them. In [6] it is proposed to use a compact
# (or outer product) form to define the limited-memory matrix Bk in terms of
# the n x m correction matrices
# 1) Update theta
sk = X[-1] - X[-2]
yk = G[-1] - G[-2]
# sk = X[-1] - X[-2]
sTy = sk @ yk
yTy = yk @ yk
mats.theta = (yTy / sTy).item()
m = len(X) - 1
n = np.size(X[-1])
# Update the lbfgsb matrices
if is_force_update:
mats.S = np.diff(np.array(X), axis=0).T # shape (n, m)
mats.Y = np.diff(np.array(G), axis=0).T # shape (n ,m)
else:
if np.shape(mats.S)[1] == m:
# shift left
mats.S[:, :-1] = mats.S[:, 1:]
mats.Y[:, :-1] = mats.Y[:, 1:]
else:
_S = np.zeros((n, m), dtype=np.float64)
_Y = np.zeros((n, m), dtype=np.float64)
_S[:, :-1] = mats.S[:, :]
_Y[:, :-1] = mats.Y[:, :]
mats.S = _S
mats.Y = _Y
# append newest column
mats.S[:, -1] = sk
mats.Y[:, -1] = yk
STS = mats.S.T @ mats.S # shape (m, m)
SY = mats.S.T @ mats.Y
# We can build a dense matrix because shape is (m, m) with m usually small ~10
mats.D = np.diag(np.diag(SY)) # shape (m, m)
mats.L = np.tril(SY, -1) # shape (m, m)
# W = [Yk, \theta Sk]
n, m = np.shape(mats.S)
mats.W = np.zeros((n, 2 * m))
mats.W[:, :m] = mats.Y
mats.W[:, m:] = mats.theta * mats.S # shape (n, 2m)
# To avoid forming the limited-memory iteration matrix Bk and allow fast
# matrix vector products, we represent it as eq. (3.2) [1].
# B = theta * I - W @ M @ W.T
# M (or Mk) can be obtained with
# M = np.linalg.inv(
# np.hstack([np.vstack([-D, L]), np.vstack([L.T, theta * STS])])
# )
# However, we can also factorize its inverse and obtain very fast matrix
# products: lower triangle of M inverse
mats.invMfactors = form_invMfactors(mats.theta, STS, mats.L, mats.D)
# Test the factorization on the fly.
if is_check_factorization:
np.testing.assert_allclose(
mats.invMfactors[0] @ mats.invMfactors[1],
np.hstack(
[
np.vstack([-mats.D, mats.L]),
np.vstack([mats.L.T, mats.theta * STS]),
]
),
)
return mats
def is_update_X_and_G(
xk: NDArrayFloat,
gk: NDArrayFloat,
x_old: NDArrayFloat,
g_old: NDArrayFloat,
eps: float = 2.2e-16,
) -> bool:
"""
Update the sequence of parameters X and gradients G with a strong wolfe condition.
Parameters
----------
xk : NDArrayFloat
New adjusted values vector (at iteration k).
gk : NDArrayFloat
New gradient vector (at iteration k).
x_old : NDArrayFloat
Previous adjusted values vector (at iteration k-1).
g_old : NDArrayFloat
Previous gradient vector (at iteration k-1).
maxcor : int
Maximum number of corrections stored (m).
eps : float, optional
Positive stability parameter for accepting current step for updating.
By default 2.2e-16.
Returns
-------
bool
Whether the current step as been accepted.
"""
yk = gk - g_old
sTy = (xk - x_old).dot(yk)
yTy = (yk).dot(yk)
# See eq. (3.9) in [1].
# One can show that BFGS update (2.19) generates positive definite approximations
# whenever the initial approximation B0 is positive definite and sT k yk > 0.
# We discuss these issues further in Chapter 6. (See Numerical optimization in
# Noecedal and Wright)
return sTy > eps * yTy
@njit(cache=True)
def is_update_X_and_G_numba(
xk: NDArrayFloat,
gk: NDArrayFloat,
x_old: NDArrayFloat,
g_old: NDArrayFloat,
eps: float = 2.2e-16,
) -> bool:
"""
Update the sequence of parameters X and gradients G with a strong wolfe condition.
Parameters
----------
xk : NDArrayFloat
New adjusted values vector (at iteration k).
gk : NDArrayFloat
New gradient vector (at iteration k).
x_old : NDArrayFloat
Previous adjusted values vector (at iteration k-1).
g_old : NDArrayFloat
Previous gradient vector (at iteration k-1).
maxcor : int
Maximum number of corrections stored (m).
eps : float, optional
Positive stability parameter for accepting current step for updating.
By default 2.2e-16.
Returns
-------
bool
Whether the current step as been accepted.
"""
sTy = 0.0
yTy = 0.0
for i in range(xk.size):
y = gk[i] - g_old[i]
sTy += (xk[i] - x_old[i]) * y
yTy += y * y
# See eq. (3.9) in [1].
# One can show that BFGS update (2.19) generates positive definite approximations
# whenever the initial approximation B0 is positive definite and sT k yk > 0.
# We discuss these issues further in Chapter 6. (See Numerical optimization in
# Noecedal and Wright)
return sTy > eps * yTy
[docs]
def update_X_and_G(
xk: NDArrayFloat,
gk: NDArrayFloat,
X: Deque[NDArrayFloat],
G: Deque[NDArrayFloat],
maxcor: int,
eps: float = 2.2e-16,
is_use_numba_jit: bool = False,
) -> bool:
"""
Parameters
----------
xk : NDArrayFloat
New adjusted values vector.
gk : NDArrayFloat
New gradient vector.
X : Deque[NDArrayFloat]
Sequence of past adjusted values vectors respecting the strong wolfe conditions.
G : Deque[NDArrayFloat]
Sequence of past gradient vectors respecting the strong wolfe conditions.
maxcor : int
Maximum number of corrections stored (m).
eps : float, optional
Positive stability parameter for accepting current step for updating.
By default 2.2e-16.
is_use_numba_jit: bool
Whether to use `numba` just-in-time compilation to speed-up the computation
intensive part of the algorithm. The default is False.
.. versionadded:: 1.0
Returns
-------
bool
True if the matrices have been updated, False otherwise.
"""
if is_use_numba_jit:
if not is_update_X_and_G_numba(xk, gk, X[-1], G[-1], eps):
return False
else:
if not is_update_X_and_G(xk, gk, X[-1], G[-1], eps):
return False
X.append(xk)
G.append(gk)
# maxcor is the number of corrections m (see S and Y shapes),
# so we must keep one more gradient and parameter vectors.
if len(X) > maxcor + 1:
X.popleft()
G.popleft()
return True
def make_X_and_G_respect_strong_wolfe(
X: Deque[NDArrayFloat],
G: Deque[NDArrayFloat],
eps: float = 2.2e-16,
logger: Optional[logging.Logger] = None,
is_use_numba_jit: bool = False,
) -> Tuple[Deque[NDArrayFloat], Deque[NDArrayFloat]]:
"""
Parameters
----------
X : Deque[NDArrayFloat]
Sequence of past adjusted values vectors respecting the strong wolfe conditions.
G : Deque[NDArrayFloat]
Sequence of past gradient vectors respecting the strong wolfe conditions.
eps : float, optional
Positive stability parameter for accepting current step for updating.
By default 2.2e-16.
logger: Optional[Logger], optional
:class:`logging.Logger` instance. If None, nothing is displayed, no matter the
value of `iprint`, by default None.
is_use_numba_jit: bool
Whether to use `numba` just-in-time compilation to speed-up the computation
intensive part of the algorithm. The default is False.
.. versionadded:: 1.0
Returns
-------
Tuple[Deque[NDArrayFloat], Deque[NDArrayFloat]]
Updated X and G.
"""
ncor: int = len(X) - 1
_X, _G = deque([X[-1]]), deque([G[-1]])
for i in range(ncor):
k = ncor - i - 1 # start at 1
if not is_update_X_and_G(X[k], G[k], _X[0], _G[0], eps):
if logger is not None:
logger.info(f"Dropping update #{-i - 2}")
else:
_X.appendleft(X[k])
_G.appendleft(G[k])
# This is for debug
if len(_G) != len(G) and logger is not None:
# if logger is not None:
logger.info(f"len(newG) = {len(_G)}, len(oldG) = {len(G)}")
return _X, _G