Source code for lbfgsb.bfgsmats

# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2025 Antoine COLLET

"""
An *implicit* representation of the BFGS approximation to the Hessian matrix B

B = theta * I - W * M * W'
H = inv(B)

Classes
^^^^^^^

.. autosummary::
   :toctree: _autosummary

    LBFGSB_MATRICES

Functions
^^^^^^^^^

.. autosummary::
   :toctree: _autosummary

    bmv
    form_invMfactors
    update_lbfgs_matrices
    update_X_and_G

Reference:
:cite:`nocedalUpdatingQuasiNewtonMatrices1980`
:cite:`byrdRepresentationsQuasiNewtonMatrices1994`
:cite:`byrdLimitedMemoryAlgorithm1995`
"""

import logging
from collections import deque
from typing import Deque, Optional, Tuple

import numpy as np
import scipy as sp

from lbfgsb._numba_helpers import njit
from lbfgsb.types import NDArrayFloat


[docs] class LBFGSB_MATRICES: """ Represent the L-BFGS matrices. Attributes ---------- S : NDArrayFloat # shape (n, m) Y : NDArrayFloat # shape (n, m) D : NDArrayFloat # shape (m, m) L : NDArrayFloat # shape (m, m) W : NDArrayFloat # shape (m, 2m) invMfactors: Tuple[NDArrayFloat, NDArrayFloat] # shape (2m, 2m) theta : float L-BFGS float parameter (multiply the identity matrix). """ __slots__ = ["S", "Y", "D", "L", "W", "invMfactors", "theta"]
[docs] def __init__(self, n: int) -> None: """ Initialize the instance. Parameters ---------- n : int Number of adjusted variables. """ if n < 1: raise ValueError("n must be an integer > 0.") self.S: NDArrayFloat = np.zeros([n, 1]) self.Y: NDArrayFloat = np.zeros([n, 1]) self.D: NDArrayFloat = np.zeros([n, 1]) self.L: NDArrayFloat = np.zeros([n, 1]) self.W: NDArrayFloat = np.zeros([n, 1]) self.invMfactors: Tuple[NDArrayFloat, NDArrayFloat] = ( np.zeros([1, 1]), np.zeros([1, 1]), ) self.theta: float = 1.0
@property def use_factor(self) -> bool: """If the factors are null then matrix factorization will fail.""" return self.invMfactors[0].size != 1 or self.invMfactors[0][0, 0] != 0
[docs] def bmv( invMfactors: Tuple[NDArrayFloat, NDArrayFloat], v: NDArrayFloat ) -> NDArrayFloat: """ Return the product of the 2m x 2m middle matrix with a vector v. In the compact L-BFGS formula of B and a 2m vector `v`; it returns the product in `p`. Parameters ---------- invMfactors : Tuple[NDArrayFloat, NDArrayFloat] LU factorization of the inverse of the middle matrix M. v : NDArrayFloat Vector with size 2m. Returns ------- NDArrayFloat Product of the 2m x 2m middle matrix with a vector v. """ # PART I: solve [ D^(1/2) O ] [ p1 ] = [ v1 ] # [ -L*D^(-1/2) J ] [ p2 ] [ v2 ]. # sp.linalg.solve_triangular(invMfactors[0], v, lower=True) # PART II: solve [ -D^(1/2) D^(-1/2)*L' ] [ p1 ] = [ p1 ] # [ 0 J' ] [ p2 ] [ p2 ]. return sp.linalg.solve_triangular( invMfactors[1], sp.linalg.solve_triangular(invMfactors[0], v, lower=True), lower=False, )
@njit(cache=True) def bmv_numba(L: NDArrayFloat, U: NDArrayFloat, v: NDArrayFloat) -> NDArrayFloat: """ Numba replacement for: solve_triangular(U, solve_triangular(L, v, lower=True), lower=False) """ n = v.size y = np.empty(n) p = np.empty(n) # Forward solve: L y = v for i in range(n): s = v[i] for j in range(i): s -= L[i, j] * y[j] y[i] = s / L[i, i] # Backward solve: U p = y for i in range(n - 1, -1, -1): s = y[i] for j in range(i + 1, n): s -= U[i, j] * p[j] p[i] = s / U[i, i] return p
[docs] def form_invMfactors(theta, STS, L, D) -> Tuple[NDArrayFloat, NDArrayFloat]: r""" Return upper triangle of the cholesky factorization of the inverse of M_k. This is defined in eq. (3.4) [1]. Although Mk is not positive definite, but its inverse reads .. math:: \mathbf{M}^{-1} = \begin{bmatrix} -\mathbf{D} & \mathbf{L}^{\mathrm{T}} \\ \mathbf{L} & \theta \mathbf{S}^{\mathrm{T}}\mathbf{S} \end{bmatrix} as given in (3.4) of [1]. Hence its inverse can be factorized almost `symmetrically` by using Cholesky factorizations of the submatrices. Now, the inverse of Mk, the middle matrix in B reads: [ D^(1/2) O ] [ -D^(1/2) D^(-1/2)*L' ] [ -L*D^(-1/2) J ] [ 0 J' ] With J @ J' = T = theta*Ss + L*D^(-1)*L'; T being definite positive, J is obtained by Cholesky factorization of T. REF: see algo 3.2 in :cite:t:`byrdRepresentationsQuasiNewtonMatrices1994`. """ invD = np.zeros_like(D) # Add 1/D on diagonal invD.flat[:: D.shape[0] + 1] = 1 / np.diag(D) # Cholesky factorization J = sp.linalg.cholesky(theta * STS + L @ invD @ L.T, lower=True) m = D.shape[0] dtype = D.dtype sqrtD = np.sqrt(D) sqrtInvD = np.sqrt(invD) # Precompute products L_sqrtInvD = L @ sqrtInvD # (m, m) sqrtInvD_LT = sqrtInvD @ L.T # (m, m) L = np.zeros((2 * m, 2 * m), dtype=dtype) U = np.zeros((2 * m, 2 * m), dtype=dtype) # -------- Lower matrix -------- L[:m, :m] = sqrtD L[m:, :m] = -L_sqrtInvD L[m:, m:] = J # -------- Upper matrix -------- U[:m, :m] = -sqrtD U[:m, m:] = sqrtInvD_LT U[m:, m:] = J.T # Note we form the upper triangle and then transpose it to get the lower one return L, U
[docs] def update_lbfgs_matrices( xk: NDArrayFloat, gk: NDArrayFloat, X: Deque[NDArrayFloat], G: Deque[NDArrayFloat], maxcor: int, mats: LBFGSB_MATRICES, is_force_update: bool, eps: float = 2.2e-16, is_check_factorization: bool = False, is_use_numba_jit: bool = False, ) -> LBFGSB_MATRICES: r""" Update lists S and Y, and form the L-BFGS Hessian approximation. Instead of storing sk and yk, we store the gradients and the parameters. 2 conditions for update - The current step update is accepted - The all sequence of x and g has been modified (reg case) Parameters ---------- xk : NDArrayFloat New x parameter. gk : NDArrayFloat New gradient parameter g. X : Deque[NDArrayFloat] List of successive parameters x. G : Deque[NDArrayFloat] List of successive gradients. maxcor : int The maximum number of variable metric corrections used to define the limited memory matrix. (The limited memory BFGS method does not store the full hessian but uses this many terms in an approximation to it.) is_force_update: bool Whether to perform an update even if the current step update is rejected. This is useful if the sequence of X and G has been modified during the optimization. See TODO: add ref, for the use. eps : float, optional Positive stability parameter for accepting current step for updating. By default 2.2e-16. is_force_update: bool This is to recompute fully the matrices. is_check_factorization: bool Whether to check the cholesky factorization of M. The default is False. is_use_numba_jit: bool Whether to use `numba` just-in-time compilation to speed-up the computation intensive part of the algorithm. The default is False. .. versionadded:: 1.0 Returns ------- Tuple[NDArrayFloat, Tuple[NDArrayFloat, NDArrayFloat], float] Updated [W, M, invMfactors, theta] References ---------- * R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound Constrained Optimization, (1995), SIAM Journal on Scientific and Statistical Computing, 16, 5, pp. 1190-1208. * C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B, FORTRAN routines for large scale bound constrained optimization (1997), ACM Transactions on Mathematical Software, 23, 4, pp. 550 - 560. * J.L. Morales and J. Nocedal. L-BFGS-B: Remark on Algorithm 778: L-BFGS-B, FORTRAN routines for large scale bound constrained optimization (2011), ACM Transactions on Mathematical Software, 38, 1. """ # Case of a vector is_current_update_accepted: bool = update_X_and_G( xk, gk, X, G, maxcor, eps, is_use_numba_jit ) # two conditions to update the inverse Hessian approximation if is_force_update or is_current_update_accepted: # yk and sk: These correction pairs contain information about the curvature of # the # function and, in conjunction with the BFGS formula, define the limited-memory # iteration matrix Bk. The question is how to best represent these matrices # without explicitly forming them. In [6] it is proposed to use a compact # (or outer product) form to define the limited-memory matrix Bk in terms of # the n x m correction matrices # 1) Update theta sk = X[-1] - X[-2] yk = G[-1] - G[-2] # sk = X[-1] - X[-2] sTy = sk @ yk yTy = yk @ yk mats.theta = (yTy / sTy).item() m = len(X) - 1 n = np.size(X[-1]) # Update the lbfgsb matrices if is_force_update: mats.S = np.diff(np.array(X), axis=0).T # shape (n, m) mats.Y = np.diff(np.array(G), axis=0).T # shape (n ,m) else: if np.shape(mats.S)[1] == m: # shift left mats.S[:, :-1] = mats.S[:, 1:] mats.Y[:, :-1] = mats.Y[:, 1:] else: _S = np.zeros((n, m), dtype=np.float64) _Y = np.zeros((n, m), dtype=np.float64) _S[:, :-1] = mats.S[:, :] _Y[:, :-1] = mats.Y[:, :] mats.S = _S mats.Y = _Y # append newest column mats.S[:, -1] = sk mats.Y[:, -1] = yk STS = mats.S.T @ mats.S # shape (m, m) SY = mats.S.T @ mats.Y # We can build a dense matrix because shape is (m, m) with m usually small ~10 mats.D = np.diag(np.diag(SY)) # shape (m, m) mats.L = np.tril(SY, -1) # shape (m, m) # W = [Yk, \theta Sk] n, m = np.shape(mats.S) mats.W = np.zeros((n, 2 * m)) mats.W[:, :m] = mats.Y mats.W[:, m:] = mats.theta * mats.S # shape (n, 2m) # To avoid forming the limited-memory iteration matrix Bk and allow fast # matrix vector products, we represent it as eq. (3.2) [1]. # B = theta * I - W @ M @ W.T # M (or Mk) can be obtained with # M = np.linalg.inv( # np.hstack([np.vstack([-D, L]), np.vstack([L.T, theta * STS])]) # ) # However, we can also factorize its inverse and obtain very fast matrix # products: lower triangle of M inverse mats.invMfactors = form_invMfactors(mats.theta, STS, mats.L, mats.D) # Test the factorization on the fly. if is_check_factorization: np.testing.assert_allclose( mats.invMfactors[0] @ mats.invMfactors[1], np.hstack( [ np.vstack([-mats.D, mats.L]), np.vstack([mats.L.T, mats.theta * STS]), ] ), ) return mats
def is_update_X_and_G( xk: NDArrayFloat, gk: NDArrayFloat, x_old: NDArrayFloat, g_old: NDArrayFloat, eps: float = 2.2e-16, ) -> bool: """ Update the sequence of parameters X and gradients G with a strong wolfe condition. Parameters ---------- xk : NDArrayFloat New adjusted values vector (at iteration k). gk : NDArrayFloat New gradient vector (at iteration k). x_old : NDArrayFloat Previous adjusted values vector (at iteration k-1). g_old : NDArrayFloat Previous gradient vector (at iteration k-1). maxcor : int Maximum number of corrections stored (m). eps : float, optional Positive stability parameter for accepting current step for updating. By default 2.2e-16. Returns ------- bool Whether the current step as been accepted. """ yk = gk - g_old sTy = (xk - x_old).dot(yk) yTy = (yk).dot(yk) # See eq. (3.9) in [1]. # One can show that BFGS update (2.19) generates positive definite approximations # whenever the initial approximation B0 is positive definite and sT k yk > 0. # We discuss these issues further in Chapter 6. (See Numerical optimization in # Noecedal and Wright) return sTy > eps * yTy @njit(cache=True) def is_update_X_and_G_numba( xk: NDArrayFloat, gk: NDArrayFloat, x_old: NDArrayFloat, g_old: NDArrayFloat, eps: float = 2.2e-16, ) -> bool: """ Update the sequence of parameters X and gradients G with a strong wolfe condition. Parameters ---------- xk : NDArrayFloat New adjusted values vector (at iteration k). gk : NDArrayFloat New gradient vector (at iteration k). x_old : NDArrayFloat Previous adjusted values vector (at iteration k-1). g_old : NDArrayFloat Previous gradient vector (at iteration k-1). maxcor : int Maximum number of corrections stored (m). eps : float, optional Positive stability parameter for accepting current step for updating. By default 2.2e-16. Returns ------- bool Whether the current step as been accepted. """ sTy = 0.0 yTy = 0.0 for i in range(xk.size): y = gk[i] - g_old[i] sTy += (xk[i] - x_old[i]) * y yTy += y * y # See eq. (3.9) in [1]. # One can show that BFGS update (2.19) generates positive definite approximations # whenever the initial approximation B0 is positive definite and sT k yk > 0. # We discuss these issues further in Chapter 6. (See Numerical optimization in # Noecedal and Wright) return sTy > eps * yTy
[docs] def update_X_and_G( xk: NDArrayFloat, gk: NDArrayFloat, X: Deque[NDArrayFloat], G: Deque[NDArrayFloat], maxcor: int, eps: float = 2.2e-16, is_use_numba_jit: bool = False, ) -> bool: """ Parameters ---------- xk : NDArrayFloat New adjusted values vector. gk : NDArrayFloat New gradient vector. X : Deque[NDArrayFloat] Sequence of past adjusted values vectors respecting the strong wolfe conditions. G : Deque[NDArrayFloat] Sequence of past gradient vectors respecting the strong wolfe conditions. maxcor : int Maximum number of corrections stored (m). eps : float, optional Positive stability parameter for accepting current step for updating. By default 2.2e-16. is_use_numba_jit: bool Whether to use `numba` just-in-time compilation to speed-up the computation intensive part of the algorithm. The default is False. .. versionadded:: 1.0 Returns ------- bool True if the matrices have been updated, False otherwise. """ if is_use_numba_jit: if not is_update_X_and_G_numba(xk, gk, X[-1], G[-1], eps): return False else: if not is_update_X_and_G(xk, gk, X[-1], G[-1], eps): return False X.append(xk) G.append(gk) # maxcor is the number of corrections m (see S and Y shapes), # so we must keep one more gradient and parameter vectors. if len(X) > maxcor + 1: X.popleft() G.popleft() return True
def make_X_and_G_respect_strong_wolfe( X: Deque[NDArrayFloat], G: Deque[NDArrayFloat], eps: float = 2.2e-16, logger: Optional[logging.Logger] = None, is_use_numba_jit: bool = False, ) -> Tuple[Deque[NDArrayFloat], Deque[NDArrayFloat]]: """ Parameters ---------- X : Deque[NDArrayFloat] Sequence of past adjusted values vectors respecting the strong wolfe conditions. G : Deque[NDArrayFloat] Sequence of past gradient vectors respecting the strong wolfe conditions. eps : float, optional Positive stability parameter for accepting current step for updating. By default 2.2e-16. logger: Optional[Logger], optional :class:`logging.Logger` instance. If None, nothing is displayed, no matter the value of `iprint`, by default None. is_use_numba_jit: bool Whether to use `numba` just-in-time compilation to speed-up the computation intensive part of the algorithm. The default is False. .. versionadded:: 1.0 Returns ------- Tuple[Deque[NDArrayFloat], Deque[NDArrayFloat]] Updated X and G. """ ncor: int = len(X) - 1 _X, _G = deque([X[-1]]), deque([G[-1]]) for i in range(ncor): k = ncor - i - 1 # start at 1 if not is_update_X_and_G(X[k], G[k], _X[0], _G[0], eps): if logger is not None: logger.info(f"Dropping update #{-i - 2}") else: _X.appendleft(X[k]) _G.appendleft(G[k]) # This is for debug if len(_G) != len(G) and logger is not None: # if logger is not None: logger.info(f"len(newG) = {len(_G)}, len(oldG) = {len(G)}") return _X, _G