Source code for lbfgsb.main

# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2025 Antoine COLLET

"""
This code is a python port of the famous implementation of Limited-memory
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS), algorithm 778 written in Fortran [2,3]
(last update in 2011).
Note that this is not a wrapper such as minimize in scipy but a complete
reimplementation (nevertheless relying heavily on numpy and scipy to
maintain correct performances).
The original code can be found here: https://dl.acm.org/doi/10.1145/279232.279236

The aim of this reimplementation was threefold. First, familiarize ourselves with
the code, its logic and inner optimizations. Second, gain access to certain
parameters that are hard-coded in the Fortran code and cannot be modified (typically
wolfe conditions parameters for the line search). Third,
implement additional functionalities that require significant modification of
the code core.

Additional features
--------------------
Explain about objective function update on the fly.
TODO: point to the doc of the main routine.
TODO:
https://towardsdatascience.com/numerical-optimization-based-on-the-l-bfgs-method-f6582135b0ca

References
----------
[1] R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound
    Constrained Optimization, (1995), SIAM Journal on Scientific and
    Statistical Computing, 16, 5, pp. 1190-1208.
[2] C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B,
    FORTRAN routines for large scale bound constrained optimization (1997),
    ACM Transactions on Mathematical Software, 23, 4, pp. 550 - 560.
[3] J.L. Morales and J. Nocedal. L-BFGS-B: Remark on Algorithm 778: L-BFGS-B,
    FORTRAN routines for large scale bound constrained optimization (2011),
    ACM Transactions on Mathematical Software, 38, 1.
"""

import copy
import logging
import warnings
from collections import deque
from dataclasses import dataclass
from typing import Callable, Deque, Optional, Tuple, Union

import numpy as np
from numpy.typing import ArrayLike
from scipy.optimize import (
    LbfgsInvHessProduct,  # noqa : F401
    OptimizeResult,
)

from lbfgsb._numba_helpers import NUMBA_AVAILABLE
from lbfgsb.base import (
    clip2bounds,
    count_var_at_bounds,
    display_iter,
    display_results,
    display_start,
    get_bounds,
    is_any_inf,
    projgr,
)
from lbfgsb.bfgsmats import (
    LBFGSB_MATRICES,
    make_X_and_G_respect_strong_wolfe,
    update_lbfgs_matrices,
)
from lbfgsb.cauchy import get_cauchy_point
from lbfgsb.linesearch import line_search
from lbfgsb.scalar_function import (
    JacOption,
    Objective,
    ObjectiveAndGradient,
    ScalarFunction,
    prepare_scalar_function,
)
from lbfgsb.subspacemin import get_freev, subspace_minimization
from lbfgsb.types import NDArrayFloat


@dataclass
class InternalState:
    """Class to keep track of internal state."""

    # keep track of some values (best, init)
    nit: int = 0
    status: str = "IDLE."
    task_str: str = "START"
    is_success: bool = False
    warnflag: int = 2



[docs]
def minimize_lbfgsb(
    *,
    x0: NDArrayFloat,
    fun: Union[Objective, ObjectiveAndGradient],
    jac: JacOption,
    update_fun_def: Optional[
        Callable[
            [
                NDArrayFloat,
                float,
                float,
                NDArrayFloat,
                Deque[NDArrayFloat],
                Deque[NDArrayFloat],
            ],
            Tuple[float, float, NDArrayFloat, Deque[NDArrayFloat]],
        ]
    ] = None,
    bounds: Optional[ArrayLike] = None,
    checkpoint: Optional[OptimizeResult] = None,
    maxcor: int = 10,
    ftarget: Optional[Union[float, Callable[[], float]]] = None,
    ftol: float = 1e-5,
    gtol: Union[float, Callable[[], float]] = 1e-5,
    maxiter: int = 50,
    eps: float = 1e-8,
    maxfun: int = 15000,
    callback: Optional[Callable[[NDArrayFloat, OptimizeResult], bool]] = None,
    maxls: int = 20,
    finite_diff_rel_step: Optional[float] = None,
    max_steplength: float = 1e8,
    ftol_linesearch: float = 1e-3,
    gtol_linesearch: float = 0.9,
    xtol_linesearch: float = 1e-1,
    eps_SY: float = 2.2e-16,
    iprint: int = -1,
    logger: Optional[logging.Logger] = None,
    is_check_factorizations: bool = False,
    is_use_numba_jit: bool = True,
) -> OptimizeResult:
    r"""
    Minimize a scalar function of one or more variables using the L-BFGS-B algorithm.

    This routine solves bound-constrained optimization problems using a pure Python
    implementation of the limited-memory Broyden-Fletcher-Goldfarb-Shanno algorithm
    with bound constraints, commonly known as L-BFGS-B.

    The problem solved is

    .. math::

        \min_x f(x)

    subject to

    .. math::

        l_i \leq x_i \leq u_i,\quad i = 1, \ldots, n.

    The algorithm stores only a limited number of correction pairs to approximate the
    inverse Hessian, making it suitable for large-scale problems.

    Parameters
    ----------
    x0 : ndarray, shape (n,)
        Initial guess. The array contains the starting values of the ``n``
        independent variables. If bounds are provided, ``x0`` is clipped to the
        feasible box before the first evaluation.

    fun : callable
        Objective function to minimize.

        If ``jac`` is ``None``, ``False``, a finite-difference scheme, or a callable,
        ``fun`` must return only the scalar objective value:

        .. code-block:: python

            fun(x) -> float

        If ``jac`` is ``True``, ``fun`` must return both the objective value and the
        gradient:

        .. code-block:: python

            fun(x) -> tuple[float, ndarray]

        In all cases, ``x`` is a one-dimensional array with shape ``(n,)``. If
        additional arguments are required by the objective function, use a closure,
        ``functools.partial``, or another wrapper.

    jac : {callable, bool, '2-point', '3-point', 'cs'}, optional
        Method used to compute the gradient vector.

        If ``jac`` is a callable, it must return the gradient vector:

        .. code-block:: python

            jac(x) -> ndarray, shape (n,)

        If ``jac`` is ``True``, ``fun`` is assumed to return a pair ``(f, g)``, where
        ``f`` is the scalar objective value and ``g`` is the gradient vector. This is
        compatible with the convention used by :func:`scipy.optimize.minimize`.

        If ``jac`` is one of ``'2-point'``, ``'3-point'``, or ``'cs'``, the gradient
        is approximated numerically using the corresponding finite-difference scheme.
        These finite-difference schemes obey the specified bounds.

        If ``jac`` is ``None`` or ``False``, the gradient is approximated using
        forward finite differences with absolute step size ``eps``.

        Default is ``None``.

    update_fun_def : callable, optional
        Experimental callback used to update the objective-function definition during
        optimization and to modify the stored gradient sequence accordingly.

        This is primarily intended for problems where the objective function changes
        during optimization, for example regularized problems where the regularization
        weight is adapted on the fly. Since the L-BFGS approximation depends on past
        gradient differences, the stored gradient sequence must be updated to remain
        consistent with the new objective definition.

        The callable must have signature:

        .. code-block:: python

            update_fun_def(
                x,
                f0,
                f0_old,
                grad,
                x_deque,
                grad_deque,
            ) -> tuple[float, float, ndarray, deque[ndarray]]

        where ``x`` is the current point, ``f0`` is the current objective value,
        ``f0_old`` is the previous objective value, ``grad`` is the current gradient,
        ``x_deque`` is the stored sequence of past iterates, and ``grad_deque`` is the
        stored sequence of past gradients.

    bounds : sequence or scipy.optimize.Bounds, optional
        Bounds on the variables.

        Bounds may be specified in one of two forms:

        1. An instance of :class:`scipy.optimize.Bounds`.
        2. A sequence of ``(min, max)`` pairs for each variable.

        Use ``None`` for one side of a bound to indicate no bound in that direction.

    checkpoint : scipy.optimize.OptimizeResult, optional
        Previous optimization result used to restart the solver.

        When provided, the solver restarts from the checkpoint without discarding the
        stored L-BFGS correction pairs. The previous objective value, gradient, and
        inverse-Hessian approximation are reused. The objective-function definition
        must be unchanged between the original run and the restarted run.

        This is useful when an optimization was stopped early, when stopping criteria
        need to be modified, or when parameters such as ``maxcor`` or ``ftol`` need to
        be changed between runs.

        .. versionadded:: 1.0

    maxcor : int, optional
        Maximum number of variable-metric corrections used to approximate the inverse
        Hessian. The L-BFGS-B algorithm does not store the full Hessian matrix; it
        stores at most ``maxcor`` correction pairs. Default is ``10``.

    ftarget : float or callable, optional
        Target objective-function value.

        The optimization stops when

        .. math::

            f^{k+1} \leq f_{\mathrm{target}}.

        If ``ftarget`` is callable, it is called once after the first objective and
        gradient evaluation. This makes it possible to define a target value based on
        the initial objective value, for example when scaling is applied.

        If ``None``, this stopping criterion is disabled. Default is ``None``.

    ftol : float, optional
        Relative objective-function decrease tolerance.

        The optimization stops when

        .. math::

            \frac{f^k - f^{k+1}}
                {\max(|f^k|, |f^{k+1}|, 1)}
            \leq \mathrm{ftol}.

        In the original Fortran implementation, this corresponds to
        ``factr * epsmch``. Typical values are:

        - ``5e-3`` for low accuracy,
        - ``5e-8`` for moderate accuracy,
        - ``5e-14`` for high accuracy.

        If ``ftol`` is ``0``, the test stops only when the objective function remains
        unchanged after one iteration. Default is ``1e-5``.

    gtol : float or callable, optional
        Projected-gradient tolerance.

        The optimization stops when

        .. math::

            \max_i |\operatorname{proj} g_i| \leq \mathrm{gtol},

        where ``proj g`` is the projected gradient. If ``gtol`` is callable, it is
        called once after the first objective and gradient evaluation. This allows the
        stopping tolerance to depend on the initial objective value. Default is
        ``1e-5``.

    eps : float, optional
        Absolute step size used for forward finite-difference approximation of the
        gradient when ``jac`` is ``None`` or ``False``. Default is ``1e-8``.

    maxfun : int, optional
        Maximum number of objective-function evaluations.

        The limit may be exceeded when gradients are estimated by numerical
        differentiation. Interruptions due to ``maxfun`` are postponed until the end
        of the current minimization iteration. Default is ``15000``.

    maxiter : int, optional
        Maximum number of iterations. Default is ``50``.

    callback : callable, optional
        Callback called after each iteration.

        The callable must have signature:

        .. code-block:: python

            callback(xk, state) -> bool

        where ``xk`` is the current parameter vector and ``state`` is an
        :class:`scipy.optimize.OptimizeResult` containing the current solver state.

        If the callback returns ``True``, the optimization stops.

    maxls : int, optional
        Maximum number of line-search steps per iteration. Default is ``20``.

    finite_diff_rel_step : float, optional
        Relative step size used for numerical gradient approximation when ``jac`` is
        ``'2-point'``, ``'3-point'``, or ``'cs'``.

        The absolute step size is computed as

        .. math::

            h = \mathrm{rel\_step} \operatorname{sign}(x) \max(1, |x|),

        and is adjusted if necessary to respect the bounds. For ``jac='3-point'``,
        the sign of ``h`` is ignored. If ``None``, the step size is selected
        automatically. Default is ``None``.

    max_steplength : float, optional
        Maximum allowed step length during the line search. Default is ``1e8``.

    ftol_linesearch : float, optional
        Tolerance for the sufficient-decrease condition in the line search.

        This is the Wolfe condition parameter :math:`c_1` in

        .. math::

            f(x_k + \alpha_k p_k)
            \leq
            f(x_k) + c_1 \alpha_k p_k^\mathrm{T} \nabla f(x_k).

        Usually, :math:`c_1` is small and satisfies
        :math:`0 < c_1 < 1`. In the original L-BFGS-B Fortran implementation, this
        parameter is hard-coded to ``1e-3``. Default is ``1e-3``.

    gtol_linesearch : float, optional
        Tolerance for the curvature condition in the line search.

        This is the Wolfe condition parameter :math:`c_2` in

        .. math::

            \left|
            p_k^\mathrm{T} \nabla f(x_k + \alpha_k p_k)
            \right|
            \leq
            c_2
            \left|
            p_k^\mathrm{T} \nabla f(x_k)
            \right|.

        The parameters should satisfy :math:`0 < c_1 < c_2 < 1`. Usually,
        :math:`c_2` is much larger than :math:`c_1`. In the original L-BFGS-B
        Fortran implementation, this parameter is hard-coded to ``0.9``. Default is
        ``0.9``.

    xtol_linesearch : float, optional
        Relative tolerance for acceptable steps in the line-search procedure. In the
        original L-BFGS-B Fortran implementation, this parameter is hard-coded to
        ``0.1``. Default is ``0.1``.

    eps_SY : float, optional
        Numerical threshold used when updating the L-BFGS correction matrices.
        Default is ``2.2e-16``.

    iprint : int, optional
        Verbosity level.

        - ``iprint < 0``: no output.
        - ``iprint == 0``: print only the final summary.
        - ``0 < iprint < 99``: print objective value and projected-gradient norm
        every ``iprint`` iterations.
        - ``iprint >= 99``: print detailed information at every iteration, except
        full vectors.

        Default is ``-1``.

    logger : logging.Logger, optional
        Logger used for textual output. If ``None``, no output is emitted regardless
        of ``iprint``. Default is ``None``.

    is_check_factorizations : bool, optional
        Whether to run additional consistency checks on matrix factorizations.
        Intended for development and debugging only. Default is ``False``.

    is_use_numba_jit : bool, optional
        Whether to use ``numba`` just-in-time compilation for performance-critical
        parts of the algorithm.

        If ``True`` but ``numba`` is unavailable, a warning is raised and the solver
        falls back to the pure Python implementation. The expected speed-up for large
        problems is typically between two and five times. Default is ``True``.

        .. versionadded:: 1.0

    Returns
    -------
    scipy.optimize.OptimizeResult
        Optimization result represented as an :class:`scipy.optimize.OptimizeResult`.

        Important fields include:

        ``x`` : ndarray
            Final solution.
        ``fun`` : float
            Objective value at the final solution.
        ``jac`` : ndarray
            Gradient at the final solution.
        ``nfev`` : int
            Number of objective-function evaluations.
        ``njev`` : int
            Number of gradient evaluations.
        ``nit`` : int
            Number of iterations.
        ``status`` : int
            Solver status code.
        ``message`` : str
            Termination message.
        ``success`` : bool
            Whether the solver terminated successfully.
        ``hess_inv`` : scipy.optimize.LbfgsInvHessProduct
            Limited-memory inverse-Hessian approximation.

    Raises
    ------
    ValueError
        Raised when the provided checkpoint is inconsistent with ``x0`` or when
        restored correction vectors have incompatible dimensions.

    Notes
    -----
    This implementation is a Python reimplementation of the L-BFGS-B algorithm,
    rather than a wrapper around the original Fortran code. It exposes several
    parameters that are fixed in the historical implementation, including line-search
    Wolfe-condition parameters.

    The convention ``jac=True`` follows :func:`scipy.optimize.minimize`: in that
    case, ``fun`` must return both the objective value and the gradient. For example:

    .. code-block:: python

        def fun(x):
            f = x[0] ** 2 + x[1] ** 2
            g = np.array([2 * x[0], 2 * x[1]])
            return f, g

        res = minimize_lbfgsb(x0=x0, fun=fun, jac=True)

    If ``jac`` is ``None`` or ``False``, finite differences are used.

    Examples
    --------
    Minimize a quadratic function with a separate gradient function:

    .. code-block:: python

        import numpy as np
        from lbfgsb import minimize_lbfgsb

        def objective(x):
            return x[0] ** 2 + x[1] ** 2

        def gradient(x):
            return np.array([2 * x[0], 2 * x[1]])

        x0 = np.array([10.0, 10.0])

        res = minimize_lbfgsb(
            x0=x0,
            fun=objective,
            jac=gradient,
        )

        print(res.x)
        print(res.fun)

    Use the SciPy-compatible ``jac=True`` convention:

    .. code-block:: python

        import numpy as np
        from lbfgsb import minimize_lbfgsb

        def objective_and_gradient(x):
            f = x[0] ** 2 + x[1] ** 2
            g = np.array([2 * x[0], 2 * x[1]])
            return f, g

        x0 = np.array([10.0, 10.0])

        res = minimize_lbfgsb(
            x0=x0,
            fun=objective_and_gradient,
            jac=True,
        )

        print(res.x)
        print(res.fun)

    Use finite differences with bounds:

    .. code-block:: python

        import numpy as np
        from lbfgsb import minimize_lbfgsb

        def objective(x):
            return x[0] ** 2 + x[1] ** 2

        x0 = np.array([10.0, 10.0])
        bounds = [(0.0, None), (0.0, None)]

        res = minimize_lbfgsb(
            x0=x0,
            fun=objective,
            jac="2-point",
            bounds=bounds,
        )

    References
    ----------
    .. [1] R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound
        Constrained Optimization. SIAM Journal on Scientific and Statistical
        Computing, 16(5), pp. 1190-1208, 1995.

    .. [2] C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778:
        L-BFGS-B, FORTRAN routines for large scale bound constrained
        optimization. ACM Transactions on Mathematical Software, 23(4),
        pp. 550-560, 1997.

    .. [3] J. L. Morales and J. Nocedal. L-BFGS-B: Remark on Algorithm 778:
        L-BFGS-B, FORTRAN routines for large scale bound constrained
        optimization. ACM Transactions on Mathematical Software, 38(1), 2011.
    """
    if not NUMBA_AVAILABLE and is_use_numba_jit:
        warnings.warn(
            "`is_use_numba_jit` set to `True` (default) but numba cannot be imported!"
            " Falling back to pure python (slower)."
        )
        is_use_numba_jit = False

    lb, ub = get_bounds(x0, bounds)
    max_steplength_user: float = copy.copy(max_steplength)

    # True if all values have lower and upper bounds
    is_boxed: bool = not is_any_inf([lb, ub])

    # applying the bounds to the initial guess x0
    n = x0.size
    x = clip2bounds(x0, lb, ub)

    # Some display about the problem at hand. The display depends on the value of iprint
    display_start(
        np.finfo(float).eps, n, maxcor, count_var_at_bounds(x, lb, ub), iprint
    )

    X, G = initialize_X_and_G(x, checkpoint, maxcor)

    # Initialization of the matrices
    mats = LBFGSB_MATRICES(n)

    # wrapper storing the calls to f and g and handling finite difference approximation
    sf: ScalarFunction = prepare_scalar_function(
        fun,
        x,
        jac=jac,
        epsilon=eps,
        bounds=(lb, ub),
        finite_diff_rel_step=finite_diff_rel_step,
    )

    # restore the number of iterations and objective function evaluation
    if checkpoint is not None:
        sf.nfev = checkpoint.nfev
        sf.ngev = checkpoint.njev

    # First evaluation of the objective function if no checkpoint provided
    if checkpoint is None:
        f0 = sf.fun(x)
    else:
        f0 = checkpoint.fun

    # potential update of stop criterion
    if ftarget is not None:
        try:
            _ftarget: Optional[float] = ftarget()  # type: ignore
        except TypeError:
            _ftarget = ftarget  # type: ignore
    else:
        _ftarget = None

    try:
        _gtol: float = gtol()  # type: ignore
    except TypeError:
        _gtol = gtol  # type: ignore

    # Create an internal state instance
    istate = InternalState()

    if checkpoint is not None:
        istate.nit = checkpoint.nit

    # early check of stop criterion -> Extreme case in which x0 satisfies the
    # criterion, then no optimization is needed and one does not need to compute
    # anything else.
    if is_f0_target_reached(f0, _ftarget, istate):
        # leave the optimization routine
        if checkpoint is None:
            if len(X) == 0:
                X.append(x)
                G.append(np.zeros_like(x))
            return OptimizeResult(
                fun=f0,
                jac=G[0],
                nfev=sf.nfev,
                njev=sf.ngev,
                nit=istate.nit,
                status=istate.warnflag,
                message=istate.task_str,
                x=x,
                success=istate.is_success,
                hess_inv=LbfgsInvHessProduct(
                    np.diff(np.array(X), axis=0), np.diff(np.array(G), axis=0)
                ),
            )
        else:
            return checkpoint

    # Compute the first gradient if no checkpoint provided
    if checkpoint is None:
        grad = sf.grad(x)
    else:
        grad = checkpoint.jac

    # Note, no need to further update anything because the scaling is handled by the
    # ScalarFunction instance

    # perform an early potential update of the objective function definition and
    # upgrade the gradient and the past sequence of gradients accordingly
    if update_fun_def is not None:
        f0, f0_old, grad, G = update_fun_def(x, f0, copy.copy(f0), grad, X, G)

    if len(X) > 0:
        # only happens if checkpoint is provided (L-BFGS-B restart)
        mats = update_lbfgs_matrices(
            x.copy(),  # copy otherwise x might be changed in X when updated
            grad,
            X,
            G,
            maxcor,
            mats,
            is_force_update=True,
            eps=eps_SY,
            is_check_factorization=is_check_factorizations,
            is_use_numba_jit=is_use_numba_jit,
        )
    else:
        # Store first res to X and G
        X.append(np.copy(x))
        G.append(grad)

    # For now the free variables at the cauchy points is an empty set
    free_vars = np.array([], dtype=np.int_)

    # Check the infinity norm of the projected gradient
    sbgnrm = projgr(x, grad, lb, ub)
    display_iter(istate.nit, sbgnrm, f0, iprint, logger=logger)

    # bool indicating if results of the current iteration have been displayed. It
    # avoid duplicates for the last round.
    has_displayed_results = False

    # Note that interruptions due to maxfun are postponed
    # until the completion of the current minimization iteration.
    while (
        projgr(x, grad, lb, ub) > _gtol
        and istate.nit < maxiter
        and sf.nfev < maxfun
        and not istate.is_success
    ):
        if iprint > 99 and logger is not None:
            logger.info("\n")
            logger.info(f"ITERATION {istate.nit + 1}\n")

        f0_old = f0 + 0.0  # copy

        # find cauchy point
        x_cp, c = get_cauchy_point(
            x, grad, lb, ub, mats, iprint, logger, is_use_numba_jit=is_use_numba_jit
        )

        # Get the free variables for the GCP
        # Note: no numba needed here
        free_vars, active_vars = get_freev(
            x_cp, lb, ub, istate.nit, free_vars, iprint, logger
        )

        # subspace minimization: find the search direction for the minimization problem
        xbar: NDArrayFloat = subspace_minimization(
            x,
            x_cp,
            free_vars,
            active_vars,
            c,
            grad,
            lb,
            ub,
            mats,
            is_check_factorizations=is_check_factorizations,
            is_use_numba_jit=is_use_numba_jit,
        )
        d = xbar - x

        steplength = line_search(
            x,
            f0,
            grad,
            d,
            lb,
            ub,
            istate.nit,
            max_steplength_user,
            is_boxed,
            sf,
            ftol_linesearch,
            gtol_linesearch,
            xtol_linesearch,
            # The maximum number of function evaluation in linesearch must take into
            # account maxfun and the number of call already performed.
            min(maxls, maxfun - sf.nfev),
            iprint,
            logger,
            is_use_numba_jit=is_use_numba_jit,
        )
        if steplength is None:
            if len(X) == 1:
                # Hessian already rebooted: abort.
                istate.task_str = "ABNORMAL_TERMINATION_IN_LNSRCH"
                istate.warnflag = 2
                istate.is_success = False
                break  # leave the while and finish the program.
            else:
                istate.task_str = "RESTART_FROM_LNSRCH"
                # Keep only the last correction
                X = deque([X[-1]])
                G = deque([G[-1]])
                # Reboot BFGS-Hessian
                mats = LBFGSB_MATRICES(n)
        else:
            # x update
            x += steplength * d

            # new evaluation -> normally, the function has been updated in
            # the linesearch step
            f0, grad = sf.fun_and_grad(x)

            if update_fun_def is None:
                if is_f0_target_reached(f0, _ftarget, istate):
                    break  # the while loop
                elif is_f0_min_change_reached(f0, f0_old, ftol, istate):
                    break  # the while loop

            # perform a potential update of the objective function definition and
            # upgrade the gradient and the past sequence of gradients accordingly
            else:
                f0, f0_old, grad, G = update_fun_def(x, f0, f0_old, grad, X, G)

                # Check stop criterion: minimum relative change in the
                # objective function
                if is_f0_min_change_reached(f0, f0_old, ftol, istate):
                    break  # the while loop

                # Check stop criterion: minimum objective function value
                elif is_f0_target_reached(f0, _ftarget, istate):
                    break  # the while loop

                # We must check if the updated G satisfy the strong wolfe condition
                X, G = make_X_and_G_respect_strong_wolfe(
                    X, G, eps_SY, logger=logger, is_use_numba_jit=is_use_numba_jit
                )

            mats = update_lbfgs_matrices(
                x.copy(),  # copy otherwise x might be changed in X when updated
                grad,
                X,
                G,
                maxcor,
                mats,
                is_force_update=False,
                eps=eps_SY,
                is_check_factorization=is_check_factorizations,
                is_use_numba_jit=is_use_numba_jit,
            )

            # callback is a user defined mechanism to stop optimization
            # if callback returns True, then it stops.
            # Note: no need to callback if an other stop criterion has already been
            # reached (istate.is_success)
            if callback is not None and not istate.is_success:
                if callback(
                    np.copy(x),
                    OptimizeResult(
                        fun=f0,
                        jac=grad,
                        nfev=sf.nfev,
                        njev=sf.ngev,
                        nit=istate.nit,
                        status=istate.warnflag,
                        message=istate.task_str,
                        x=x,
                        success=istate.is_success,
                        hess_inv=LbfgsInvHessProduct(
                            np.atleast_2d(np.diff(np.array(X), axis=0)),
                            np.atleast_2d(np.diff(np.array(G), axis=0)),
                        ),
                    ),
                ):
                    istate.task_str = "STOP: USER CALLBACK"
                    istate.is_success = True

        display_iter(istate.nit + 1, projgr(x, grad, lb, ub), f0, iprint, logger)

        # Result display
        has_displayed_results = display_results(
            istate.nit + 1, maxiter, x, grad, lb, ub, f0, _gtol, False, iprint, logger
        )

        istate.nit += 1

    # Final display. If is_success, then it already happened
    if not has_displayed_results:
        display_results(
            istate.nit, maxiter, x, grad, lb, ub, f0, _gtol, True, iprint, logger
        )

    if projgr(x, grad, lb, ub) <= _gtol:
        istate.task_str = "CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL"
        istate.is_success = True
        istate.warnflag = 1
    elif istate.nit == maxiter:
        istate.task_str = "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT"
        istate.is_success = True
        istate.warnflag = 1
    elif sf.nfev >= maxfun:
        istate.task_str = "STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT"
        istate.is_success = True
        istate.warnflag = 1

    # error: b'ERROR: STPMAX .LT. STPMIN'
    return OptimizeResult(
        fun=f0,
        jac=grad,
        nfev=sf.nfev,
        njev=sf.ngev,
        nit=istate.nit,
        status=istate.warnflag,
        message=istate.task_str,
        x=x,
        success=istate.is_success,
        hess_inv=LbfgsInvHessProduct(
            np.atleast_2d(np.diff(np.array(X), axis=0)),
            np.atleast_2d(np.diff(np.array(G), axis=0)),
        ),
    )



def initialize_X_and_G(
    x: NDArrayFloat, checkpoint: Optional[OptimizeResult], maxcor: int
) -> Tuple[Deque[NDArrayFloat], Deque[NDArrayFloat]]:
    """
    Initialize the sequence of adjusted values and associated gradients.

    This routine is mainly dedicated to restore the sequence from a checkpoint.
    The sequence is stored in the `hess_inv` attribute as "sk" and "yk" which
    are differences, e.g., sk = np.atleast_2d(np.diff(np.array(X), axis=0)).

    Parameters
    ----------
    x : NDArrayFloat
        Adjusted values.
    checkpoint : Optional[OptimizeResult]
        Optional checkpoint (see solver restart).
    maxcor : int
        Maximum number of corrections stored.

    Returns
    -------
    Tuple[Deque[NDArrayFloat], Deque[NDArrayFloat]]
        X and G.
    """
    # Initialize X and G
    # Deque = similar to list but with faster operations to remove and add
    # values to extremities
    X: Deque[NDArrayFloat] = deque()
    G: Deque[NDArrayFloat] = deque()

    # if it is a L-BFGS-B restart (checkpoint is provided), then X and G are restored
    # from x, jac and the sequence of differences sk and yk stored in the inverse
    # Hessian approximation instance (LbfgsInvHessProduct).
    if checkpoint is None:
        return X, G

    # x0 and checkpoint.x should be the same otherwisee there is an issue
    try:
        np.testing.assert_equal(x, checkpoint.x)
    except AssertionError as e:
        raise ValueError(
            "When 'checkpoint' is provided (L-BFGS-B restart), x0 and checkpoint.x"
            " should be equal!"
        ) from e
    n_corrs, n = checkpoint.hess_inv.sk.shape
    if n_corrs == 0:
        return X, G

    if n != x.size:
        raise ValueError(
            f"The size of correction vector ({n}) does"
            f" not match the size of x ({x.size})!"
        )
    # restore the past X and G
    for x, g in zip(
        checkpoint.x - np.cumsum(checkpoint.hess_inv.sk, axis=0),
        checkpoint.jac - np.cumsum(checkpoint.hess_inv.yk, axis=0),
    ):
        if len(X) > maxcor:
            X.popleft()
            G.popleft()
        X.append(x)
        G.append(g)
    # at this point, X and G do not have x nor jac -> it is added a bit later
    return X, G


def is_f0_min_change_reached(
    f0: float, f0_old: float, ftol: float, istate: InternalState
) -> bool:
    """
    Return whether the minimum obj fun change has been reached.

    It updates the internal state.
    """
    if (f0_old - f0) / max(abs(f0_old), abs(f0), 1) < ftol:
        istate.task_str = "CONVERGENCE: REL_REDUCTION_OF_F_<=_FTOL"
        istate.is_success = True
        istate.warnflag = 0
        return True
    return False


def is_f0_target_reached(
    f0: float, ftarget: Optional[float], istate: InternalState
) -> bool:
    """
    Return whether the obj fun target has been reached.

    It updates the internal state.
    """
    if ftarget is None:
        return False
    if f0 > ftarget:
        return False
    istate.task_str = "CONVERGENCE: F_<=_TARGET"
    istate.is_success = True
    istate.warnflag = 0
    return True