Source code for botorch.acquisition.logei

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

r"""
Monte-Carlo variants of the LogEI family of improvement-based acquisition functions;
see [Ament2023logei]_ for details.

References

.. [Ament2023logei]
    S. Ament, S. Daulton, D. Eriksson, M. Balandat, and E. Bakshy.
    Unexpected Improvements to Expected Improvement for Bayesian Optimization. Advances
    in Neural Information Processing Systems 36, 2023.
"""

from __future__ import annotations

from copy import deepcopy

from functools import partial

from typing import Any, Callable, List, Optional, Tuple, TypeVar, Union

import torch
from botorch.acquisition.cached_cholesky import CachedCholeskyMCSamplerMixin
from botorch.acquisition.monte_carlo import SampleReducingMCAcquisitionFunction
from botorch.acquisition.objective import (
    ConstrainedMCObjective,
    MCAcquisitionObjective,
    PosteriorTransform,
)
from botorch.acquisition.utils import (
    compute_best_feasible_objective,
    prune_inferior_points,
)
from botorch.exceptions.errors import BotorchError
from botorch.models.model import Model
from botorch.sampling.base import MCSampler
from botorch.utils.safe_math import (
    fatmax,
    log_fatplus,
    log_softplus,
    logmeanexp,
    smooth_amax,
)
from botorch.utils.transforms import match_batch_shape
from torch import Tensor

"""
NOTE: On the default temperature parameters:

tau_relu: It is generally important to set `tau_relu` to be very small, in particular
smaller than the expected improvement value; otherwise, the optimization can stagnate.
With the default of `tau_relu=1e-6`, stagnation due to the smoothed ReLU approximation
is exceedingly unlikely for practical applications of BO.
IDEA: We could consider shrinking `tau_relu` as the optimization progresses.

tau_max: This is only relevant for the batch (`q > 1`) case, and `tau_max=1e-2` is
sufficient to get a good approximation to the maximum improvement in the batch of
candidates. If `fat=False`, the smooth approximation to the maximum can saturate
numerically. It is therefore recommended to use `fat=True` when optimizing batches
of `q > 1` points.
"""
TAU_RELU = 1e-6
TAU_MAX = 1e-2
FloatOrTensor = TypeVar("FloatOrTensor", float, Tensor)


class LogImprovementMCAcquisitionFunction(SampleReducingMCAcquisitionFunction):
    r"""
    Abstract base class for Monte-Carlo-based batch LogEI acquisition functions.

    :meta private:
    """

    _log: bool = True

    def __init__(
        self,
        model: Model,
        sampler: Optional[MCSampler] = None,
        objective: Optional[MCAcquisitionObjective] = None,
        posterior_transform: Optional[PosteriorTransform] = None,
        X_pending: Optional[Tensor] = None,
        constraints: Optional[List[Callable[[Tensor], Tensor]]] = None,
        eta: Union[Tensor, float] = 1e-3,
        fat: bool = True,
        tau_max: float = TAU_MAX,
    ) -> None:
        r"""
        Args:
            model: A fitted model.
            sampler: The sampler used to draw base samples. If not given,
                a sampler is generated using `get_sampler`.
                NOTE: For posteriors that do not support base samples,
                a sampler compatible with the intended use case must be provided.
                See `ForkedRNGSampler` and `StochasticSampler` as examples.
            objective: The MCAcquisitionObjective under which the samples are
                evaluated. Defaults to `IdentityMCObjective()`.
            posterior_transform: A PosteriorTransform (optional).
            X_pending: A `batch_shape x m x d`-dim Tensor of `m` design points
                that have been submitted for function evaluation but have not yet
                been evaluated.
            constraints: A list of constraint callables which map a Tensor of posterior
                samples of dimension `sample_shape x batch-shape x q x m`-dim to a
                `sample_shape x batch-shape x q`-dim Tensor. The associated constraints
                are satisfied if `constraint(samples) < 0`.
            eta: Temperature parameter(s) governing the smoothness of the sigmoid
                approximation to the constraint indicators. See the docs of
                `compute_(log_)constraint_indicator` for more details on this parameter.
            fat: Toggles the logarithmic / linear asymptotic behavior of the smooth
                approximation to the ReLU.
            tau_max: Temperature parameter controlling the sharpness of the
                approximation to the `max` operator over the `q` candidate points.
        """
        if isinstance(objective, ConstrainedMCObjective):
            raise BotorchError(
                "Log-Improvement should not be used with `ConstrainedMCObjective`."
                "Please pass the `constraints` directly to the constructor of the "
                "acquisition function."
            )
        q_reduction = partial(fatmax if fat else smooth_amax, tau=tau_max)
        super().__init__(
            model=model,
            sampler=sampler,
            objective=objective,
            posterior_transform=posterior_transform,
            X_pending=X_pending,
            sample_reduction=logmeanexp,
            q_reduction=q_reduction,
            constraints=constraints,
            eta=eta,
            fat=fat,
        )
        self.tau_max = tau_max

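# A minimal sketch of the `constraints` convention used above: each callable maps
# `sample_shape x batch_shape x q x m` posterior samples to a
# `sample_shape x batch_shape x q` Tensor that is negative wherever the constraint is
# satisfied. The callable and the output index below are hypothetical.
#
#   >>> def example_constraint(samples: Tensor) -> Tensor:
#   ...     return samples[..., 1] - 0.5  # feasible iff output 1 is at most 0.5
#
# A list such as `[example_constraint]` can then be passed as `constraints` to any of
# the subclasses below.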

class qLogExpectedImprovement(LogImprovementMCAcquisitionFunction):
    r"""MC-based batch Log Expected Improvement.

    This computes qLogEI by
    (1) sampling the joint posterior over q points,
    (2) evaluating the smoothed log improvement over the current best for each sample,
    (3) smoothly maximizing over q, and
    (4) averaging over the samples in log space.

    See [Ament2023logei]_ for details. Formally,

    `qLogEI(X) ~ log(qEI(X)) = log(E(max(max Y - best_f, 0)))`,

    where `Y ~ f(X)` and `X = (x_1,...,x_q)`.

    Example:
        >>> model = SingleTaskGP(train_X, train_Y)
        >>> best_f = train_Y.max()
        >>> sampler = SobolQMCNormalSampler(sample_shape=torch.Size([1024]))
        >>> qLogEI = qLogExpectedImprovement(model, best_f, sampler)
        >>> qei = qLogEI(test_X)
    """

    def __init__(
        self,
        model: Model,
        best_f: Union[float, Tensor],
        sampler: Optional[MCSampler] = None,
        objective: Optional[MCAcquisitionObjective] = None,
        posterior_transform: Optional[PosteriorTransform] = None,
        X_pending: Optional[Tensor] = None,
        constraints: Optional[List[Callable[[Tensor], Tensor]]] = None,
        eta: Union[Tensor, float] = 1e-3,
        fat: bool = True,
        tau_max: float = TAU_MAX,
        tau_relu: float = TAU_RELU,
    ) -> None:
        r"""q-Log Expected Improvement.

        Args:
            model: A fitted model.
            best_f: The best objective value observed so far (assumed noiseless). Can
                be a scalar, or a `batch_shape`-dim tensor. In case of a batched model,
                the tensor can specify different values for each element of the batch.
            sampler: The sampler used to draw base samples. See `MCAcquisitionFunction`
                for more details.
            objective: The MCAcquisitionObjective under which the samples are
                evaluated. Defaults to `IdentityMCObjective()`.
            posterior_transform: A PosteriorTransform (optional).
            X_pending: A `m x d`-dim Tensor of `m` design points that have been
                submitted for function evaluation but have not yet been evaluated.
                Concatenated into `X` upon forward call. Copied and set to have no
                gradient.
            constraints: A list of constraint callables which map a Tensor of posterior
                samples of dimension `sample_shape x batch-shape x q x m`-dim to a
                `sample_shape x batch-shape x q`-dim Tensor. The associated constraints
                are satisfied if `constraint(samples) < 0`.
            eta: Temperature parameter(s) governing the smoothness of the sigmoid
                approximation to the constraint indicators. See the docs of
                `compute_(log_)smoothed_constraint_indicator` for details.
            fat: Toggles the logarithmic / linear asymptotic behavior of the smooth
                approximation to the ReLU.
            tau_max: Temperature parameter controlling the sharpness of the smooth
                approximations to max.
            tau_relu: Temperature parameter controlling the sharpness of the smooth
                approximations to ReLU.
        """
        super().__init__(
            model=model,
            sampler=sampler,
            objective=objective,
            posterior_transform=posterior_transform,
            X_pending=X_pending,
            constraints=constraints,
            eta=eta,
            tau_max=check_tau(tau_max, name="tau_max"),
            fat=fat,
        )
        self.register_buffer("best_f", torch.as_tensor(best_f, dtype=float))
        self.tau_relu = check_tau(tau_relu, name="tau_relu")

    def _sample_forward(self, obj: Tensor) -> Tensor:
        r"""Evaluate qLogExpectedImprovement per sample on the candidate set `X`.

        Args:
            obj: `mc_shape x batch_shape x q`-dim Tensor of MC objective values.

        Returns:
            A `mc_shape x batch_shape x q`-dim Tensor of log improvement values.
        """
        li = _log_improvement(
            Y=obj,
            best_f=self.best_f,
            tau=self.tau_relu,
            fat=self._fat,
        )
        return li
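
# How the pieces of qLogExpectedImprovement compose (a sketch, assuming an identity
# objective and no constraints): `_sample_forward` produces per-sample log improvements,
# the `q_reduction` (`fatmax` with `tau_max`) smoothly maximizes over the `q` candidates,
# and the `sample_reduction` (`logmeanexp`) averages over the MC samples in log space:
#
#   qLogEI(X) ~ logmeanexp( fatmax( log_fatplus(obj - best_f, tau=tau_relu), tau=tau_max ) )
#
# which is the log-space counterpart of `mean(amax(relu(obj - best_f)))` in qEI.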

class qLogNoisyExpectedImprovement(
    LogImprovementMCAcquisitionFunction, CachedCholeskyMCSamplerMixin
):
    r"""MC-based batch Log Noisy Expected Improvement.

    This function does not assume a `best_f` is known (which would require
    noiseless observations). Instead, it uses samples from the joint posterior
    over the `q` test points and previously observed points. A smooth approximation
    to the canonical improvement over previously observed points is computed
    for each sample and the logarithm of the average is returned.

    See [Ament2023logei]_ for details. Formally,

    `qLogNEI(X) ~ log(qNEI(X)) = log(E(max(max Y - max Y_baseline, 0)))`,

    where `(Y, Y_baseline) ~ f((X, X_baseline))` and `X = (x_1,...,x_q)`.

    Example:
        >>> model = SingleTaskGP(train_X, train_Y)
        >>> sampler = SobolQMCNormalSampler(sample_shape=torch.Size([1024]))
        >>> qLogNEI = qLogNoisyExpectedImprovement(model, train_X, sampler)
        >>> acqval = qLogNEI(test_X)
    """

    def __init__(
        self,
        model: Model,
        X_baseline: Tensor,
        sampler: Optional[MCSampler] = None,
        objective: Optional[MCAcquisitionObjective] = None,
        posterior_transform: Optional[PosteriorTransform] = None,
        X_pending: Optional[Tensor] = None,
        constraints: Optional[List[Callable[[Tensor], Tensor]]] = None,
        eta: Union[Tensor, float] = 1e-3,
        fat: bool = True,
        prune_baseline: bool = False,
        cache_root: bool = True,
        tau_max: float = TAU_MAX,
        tau_relu: float = TAU_RELU,
        **kwargs: Any,
    ) -> None:
        r"""q-Log Noisy Expected Improvement.

        Args:
            model: A fitted model.
            X_baseline: A `batch_shape x r x d`-dim Tensor of `r` design points
                that have already been observed. These points are considered as
                the potential best design point.
            sampler: The sampler used to draw base samples. See `MCAcquisitionFunction`
                for more details.
            objective: The MCAcquisitionObjective under which the samples are
                evaluated. Defaults to `IdentityMCObjective()`.
            posterior_transform: A PosteriorTransform (optional).
            X_pending: A `batch_shape x m x d`-dim Tensor of `m` design points
                that have been submitted for function evaluation but have not yet
                been evaluated. Concatenated into `X` upon forward call. Copied and
                set to have no gradient.
            constraints: A list of constraint callables which map a Tensor of posterior
                samples of dimension `sample_shape x batch-shape x q x m`-dim to a
                `sample_shape x batch-shape x q`-dim Tensor. The associated constraints
                are satisfied if `constraint(samples) < 0`.
            eta: Temperature parameter(s) governing the smoothness of the sigmoid
                approximation to the constraint indicators. See the docs of
                `compute_(log_)smoothed_constraint_indicator` for details.
            fat: Toggles the logarithmic / linear asymptotic behavior of the smooth
                approximation to the ReLU.
            prune_baseline: If True, remove points in `X_baseline` that are
                highly unlikely to be the best point. This can significantly
                improve performance and is generally recommended. In order to
                customize pruning parameters, instead manually call
                `botorch.acquisition.utils.prune_inferior_points` on `X_baseline`
                before instantiating the acquisition function.
            cache_root: A boolean indicating whether to cache the root
                decomposition over `X_baseline` and use low-rank updates.
            tau_max: Temperature parameter controlling the sharpness of the smooth
                approximations to max.
            tau_relu: Temperature parameter controlling the sharpness of the smooth
                approximations to ReLU.
            kwargs: Here for compatibility with qNEI.

        TODO: similar to qNEHVI, when we are using sequential greedy candidate
        selection, we could incorporate pending points X_baseline and compute the
        incremental q(Log)NEI from the new point. This would greatly increase
        efficiency for large batches.
        """
        # TODO: separate out baseline variables initialization and other functions
        # in qNEI to avoid duplication of both code and work at runtime.
        super().__init__(
            model=model,
            sampler=sampler,
            objective=objective,
            posterior_transform=posterior_transform,
            X_pending=X_pending,
            constraints=constraints,
            eta=eta,
            fat=fat,
            tau_max=tau_max,
        )
        self.tau_relu = tau_relu
        self._init_baseline(
            model=model,
            X_baseline=X_baseline,
            sampler=sampler,
            objective=objective,
            posterior_transform=posterior_transform,
            prune_baseline=prune_baseline,
            cache_root=cache_root,
            **kwargs,
        )

    def _sample_forward(self, obj: Tensor) -> Tensor:
        r"""Evaluate qLogNoisyExpectedImprovement per sample on the candidate set `X`.

        Args:
            obj: `mc_shape x batch_shape x q`-dim Tensor of MC objective values.

        Returns:
            A `sample_shape x batch_shape x q`-dim Tensor of log noisy expected
            smoothed improvement values.
        """
        return _log_improvement(
            Y=obj,
            best_f=self.compute_best_f(obj),
            tau=self.tau_relu,
            fat=self._fat,
        )

    def _init_baseline(
        self,
        model: Model,
        X_baseline: Tensor,
        sampler: Optional[MCSampler] = None,
        objective: Optional[MCAcquisitionObjective] = None,
        posterior_transform: Optional[PosteriorTransform] = None,
        prune_baseline: bool = False,
        cache_root: bool = True,
        **kwargs: Any,
    ) -> None:
        CachedCholeskyMCSamplerMixin.__init__(
            self, model=model, cache_root=cache_root, sampler=sampler
        )
        if prune_baseline:
            X_baseline = prune_inferior_points(
                model=model,
                X=X_baseline,
                objective=objective,
                posterior_transform=posterior_transform,
                marginalize_dim=kwargs.get("marginalize_dim"),
                constraints=self._constraints,
            )
        self.register_buffer("X_baseline", X_baseline)
        # registering buffers for _get_samples_and_objectives in the next `if` block
        self.register_buffer("baseline_samples", None)
        self.register_buffer("baseline_obj", None)
        if self._cache_root:
            self.q_in = -1
            # set baseline samples
            with torch.no_grad():  # this is _get_samples_and_objectives(X_baseline)
                posterior = self.model.posterior(
                    X_baseline, posterior_transform=self.posterior_transform
                )
                # Note: The root decomposition is cached in two different places. It
                # may be confusing to have two different caches, but this is not
                # trivial to change since each is needed for a different reason:
                # - LinearOperator caching to `posterior.mvn` allows for reuse within
                #   this function, which may be helpful if the same root decomposition
                #   is produced by the calls to `self.base_sampler` and
                #   `self._cache_root_decomposition`.
                # - self._baseline_L allows a root decomposition to be persisted outside
                #   this method.
                self.baseline_samples = self.get_posterior_samples(posterior)
                self.baseline_obj = self.objective(self.baseline_samples, X=X_baseline)

            # We make a copy here because we will write an attribute `base_samples`
            # to `self.base_sampler.base_samples`, and we don't want to mutate
            # `self.sampler`.
            self.base_sampler = deepcopy(self.sampler)
            self.register_buffer(
                "_baseline_best_f",
                self._compute_best_feasible_objective(
                    samples=self.baseline_samples, obj=self.baseline_obj
                ),
            )
            self._baseline_L = self._compute_root_decomposition(posterior=posterior)

    def compute_best_f(self, obj: Tensor) -> Tensor:
        """Computes the best (feasible) noisy objective value.

        Args:
            obj: `sample_shape x batch_shape x q`-dim Tensor of objectives in forward.

        Returns:
            A `sample_shape x batch_shape`-dim Tensor of best feasible objectives.
        """
        if self._cache_root:
            val = self._baseline_best_f
        else:
            val = self._compute_best_feasible_objective(
                samples=self.baseline_samples, obj=self.baseline_obj
            )
        # ensuring shape, dtype, device compatibility with obj
        n_sample_dims = len(self.sample_shape)
        view_shape = torch.Size(
            [
                *val.shape[:n_sample_dims],  # sample dimensions
                *(1,) * (obj.ndim - val.ndim - 1),  # pad to match obj without `q`-dim
                *val.shape[n_sample_dims:],  # the rest
            ]
        )
        return val.view(view_shape).to(obj)  # obj.shape[:-1], i.e. without `q`-dim

    def _get_samples_and_objectives(self, X: Tensor) -> Tuple[Tensor, Tensor]:
        r"""Compute samples at new points, using the cached root decomposition.

        Args:
            X: A `batch_shape x q x d`-dim tensor of inputs.

        Returns:
            A two-tuple `(samples, obj)`, where `samples` is a tensor of posterior
            samples with shape `sample_shape x batch_shape x q x m`, and `obj` is a
            tensor of MC objective values with shape `sample_shape x batch_shape x q`.
        """
        n_baseline, q = self.X_baseline.shape[-2], X.shape[-2]
        X_full = torch.cat([match_batch_shape(self.X_baseline, X), X], dim=-2)
        # TODO: Implement more efficient way to compute posterior over both training
        # and test points in GPyTorch
        # (https://github.com/cornellius-gp/gpytorch/issues/567)
        posterior = self.model.posterior(
            X_full, posterior_transform=self.posterior_transform
        )
        if not self._cache_root:
            samples_full = super().get_posterior_samples(posterior)
            obj_full = self.objective(samples_full, X=X_full)
            # assigning baseline buffers so `best_f` can be computed in _sample_forward
            self.baseline_samples, samples = samples_full.split(
                [n_baseline, q], dim=-2
            )
            self.baseline_obj, obj = obj_full.split([n_baseline, q], dim=-1)
            return samples, obj

        # handle one-to-many input transforms
        n_plus_q = X_full.shape[-2]
        n_w = posterior._extended_shape()[-2] // n_plus_q
        q_in = q * n_w
        self._set_sampler(q_in=q_in, posterior=posterior)
        samples = self._get_f_X_samples(posterior=posterior, q_in=q_in)
        obj = self.objective(samples, X=X_full[..., -q:, :])
        return samples, obj

    def _compute_best_feasible_objective(self, samples: Tensor, obj: Tensor) -> Tensor:
        r"""Computes best feasible objective value from samples.

        Args:
            samples: `sample_shape x batch_shape x q x m`-dim posterior samples.
            obj: A `sample_shape x batch_shape x q`-dim Tensor of MC objective values.

        Returns:
            A `sample_shape x batch_shape`-dim Tensor of best feasible objectives.
        """
        return compute_best_feasible_objective(
            samples=samples,
            obj=obj,
            constraints=self._constraints,
            model=self.model,
            objective=self.objective,
            posterior_transform=self.posterior_transform,
            X_baseline=self.X_baseline,
        )
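
# Illustrative example (hypothetical `model`, `train_X`, and `test_X`) of the pruning
# and caching options documented above; `prune_baseline=True` removes baseline points
# that are highly unlikely to yield the incumbent best before the acquisition function
# is constructed.
#
#   >>> acqf = qLogNoisyExpectedImprovement(
#   ...     model, X_baseline=train_X, prune_baseline=True, cache_root=True
#   ... )
#   >>> acqval = acqf(test_X)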
""" ###################################### utils ########################################## """ def _log_improvement( Y: Tensor, best_f: Tensor, tau: Union[float, Tensor], fat: bool, ) -> Tensor: """Computes the logarithm of the softplus-smoothed improvement, i.e. `log_softplus(Y - best_f, beta=(1 / tau))`. Note that softplus is an approximation to the regular ReLU objective whose maximum pointwise approximation error is linear with respect to tau as tau goes to zero. Args: obj: `mc_samples x batch_shape x q`-dim Tensor of output samples. best_f: Best previously observed objective value(s), broadcastable with `mc_samples x batch_shape`-dim Tensor, i.e. `obj`'s dims without `q`. tau: Temperature parameter for smooth approximation of ReLU. as `tau -> 0`, maximum pointwise approximation error is linear w.r.t. `tau`. fat: Toggles the logarithmic / linear asymptotic behavior of the smooth approximation to ReLU. Returns: A `mc_samples x batch_shape x q`-dim Tensor of improvement values. """ log_soft_clamp = log_fatplus if fat else log_softplus Z = Y - best_f.unsqueeze(-1).to(Y) return log_soft_clamp(Z, tau=tau) # ~ ((Y - best_f) / Y_std).clamp(0)

def check_tau(tau: FloatOrTensor, name: str) -> FloatOrTensor:
    """Checks the validity of the tau arguments of the functions above, and returns
    `tau` if it is valid."""
    if isinstance(tau, Tensor) and tau.numel() != 1:
        raise ValueError(name + f" is not a scalar: {tau.numel() = }.")
    if not (tau > 0):
        raise ValueError(name + f" is non-positive: {tau = }.")
    return tau
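
# Example usage (illustrative): `check_tau` passes valid scalar temperatures through
# unchanged and raises a ValueError otherwise.
#
#   >>> check_tau(1e-6, name="tau_relu")  # returns 1e-6
#   >>> check_tau(-1.0, name="tau_max")   # raises ValueError ("... is non-positive")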