#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

r"""
Acquisition functions for Max-value Entropy Search (MES), General-purpose
Information-Based Bayesian Optimization (GIBBON), and
multi-fidelity MES with noisy observations and trace observations.

References

.. [Moss2021gibbon]
    Moss, H. B., et al.,
    GIBBON: General-purpose Information-Based Bayesian OptimisatioN.
    Journal of Machine Learning Research, 2021.

.. [Takeno2020mfmves]
    S. Takeno, H. Fukuoka, Y. Tsukada, T. Koyama, M. Shiga, I. Takeuchi,
    M. Karasuyama. Multi-fidelity Bayesian Optimization with Max-value Entropy
    Search and its Parallelization. Proceedings of the 37th International
    Conference on Machine Learning, 2020.

.. [Wang2017mves]
    Z. Wang, S. Jegelka, Max-value Entropy Search for Efficient
    Bayesian Optimization. Proceedings of the 34th International
    Conference on Machine Learning, 2017.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from copy import deepcopy
from math import log
from typing import Callable, Optional

import numpy as np
import torch
from botorch.acquisition.acquisition import AcquisitionFunction, MCSamplerMixin
from botorch.acquisition.cost_aware import CostAwareUtility, InverseCostWeightedUtility
from botorch.acquisition.objective import PosteriorTransform
from botorch.exceptions.errors import UnsupportedError
from botorch.models.cost import AffineFidelityCostModel
from botorch.models.model import Model
from botorch.models.utils import check_no_nans
from botorch.sampling.normal import SobolQMCNormalSampler
from botorch.utils.transforms import match_batch_shape, t_batch_mode_transform

from linear_operator.functions import inv_quad
from linear_operator.utils.cholesky import psd_safe_cholesky
from scipy.optimize import brentq
from scipy.stats import norm
from torch import Tensor


CLAMP_LB = 1.0e-8


class MaxValueBase(AcquisitionFunction, ABC):
    r"""Abstract base class for acquisition functions based on Max-value Entropy Search.

    This class provides the basic building blocks for constructing max-value
    entropy-based acquisition functions along the lines of [Wang2017mves]_.

    Subclasses need to implement the `_sample_max_values` and
    `_compute_information_gain` methods.

    :meta private:
    """

    def __init__(
        self,
        model: Model,
        num_mv_samples: int,
        posterior_transform: Optional[PosteriorTransform] = None,
        maximize: bool = True,
        X_pending: Optional[Tensor] = None,
    ) -> None:
        r"""Single-outcome max-value entropy search-based acquisition functions.

        Args:
            model: A fitted single-outcome model.
            num_mv_samples: Number of max value samples.
            posterior_transform: A PosteriorTransform. If using a multi-output model,
                a PosteriorTransform that transforms the multi-output posterior into a
                single-output posterior is required.
            maximize: If True, consider the problem a maximization problem.
            X_pending: A `m x d`-dim Tensor of `m` design points that have been
                submitted for function evaluation but have not yet been evaluated.
        """
        super().__init__(model=model)

        if posterior_transform is None and model.num_outputs != 1:
            raise UnsupportedError(
                "Must specify a posterior transform when using a multi-output model."
            )

        # Batched GP models are not currently supported
        try:
            batch_shape = model.batch_shape
        except NotImplementedError:
            batch_shape = torch.Size()
        if len(batch_shape) > 0:
            raise NotImplementedError(
                "Batched GP models (e.g., fantasized models) are not yet "
                f"supported by `{self.__class__.__name__}`."
            )
        self.num_mv_samples = num_mv_samples
        self.posterior_transform = posterior_transform
        self.maximize = maximize
        self.weight = 1.0 if maximize else -1.0
        self.set_X_pending(X_pending)

    @t_batch_mode_transform(expected_q=1)
    def forward(self, X: Tensor) -> Tensor:
        r"""Compute max-value entropy at the design points `X`.

        Args:
            X: A `batch_shape x 1 x d`-dim Tensor of `batch_shape` t-batches
                with `1` `d`-dim design points each.

        Returns:
            A `batch_shape`-dim Tensor of MVE values at the given design points `X`.
        """
        # Compute the posterior, posterior mean, variance and std
        posterior = self.model.posterior(
            X.unsqueeze(-3),
            observation_noise=False,
            posterior_transform=self.posterior_transform,
        )
        # batch_shape x num_fantasies x (m) x 1
        mean = self.weight * posterior.mean.squeeze(-1).squeeze(-1)
        variance = posterior.variance.clamp_min(CLAMP_LB).view_as(mean)
        ig = self._compute_information_gain(
            X=X, mean_M=mean, variance_M=variance, covar_mM=variance.unsqueeze(-1)
        )
        return ig.mean(dim=0)  # average over fantasies

    def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None:
        r"""Set pending design points.

        Set "pending points" to inform the acquisition function of the candidate
        points that have been generated but are pending evaluation.

        Args:
            X_pending: `n x d` Tensor with `n` `d`-dim design points that have
                been submitted for evaluation but have not yet been evaluated.
        """
        if X_pending is not None:
            X_pending = X_pending.detach().clone()
        self._sample_max_values(num_samples=self.num_mv_samples, X_pending=X_pending)
        self.X_pending = X_pending

    # ------- Abstract methods that need to be implemented by subclasses ------- #

    @abstractmethod
    def _compute_information_gain(self, X: Tensor) -> Tensor:
        r"""Compute the information gain at the design points `X`.

        `num_fantasies = 1` for non-fantasized models.

        Args:
            X: A `batch_shape x 1 x d`-dim Tensor of `batch_shape` t-batches
                with `1` `d`-dim design point each.

        Returns:
            A `num_fantasies x batch_shape`-dim Tensor of information gains at the
            given design points `X` (`num_fantasies=1` for non-fantasized models).
        """
        pass  # pragma: no cover

    @abstractmethod
    def _sample_max_values(
        self, num_samples: int, X_pending: Optional[Tensor] = None
    ) -> None:
        r"""Draw samples from the posterior over maximum values.

        These samples are used to compute Monte Carlo approximations of expectations
        over the posterior over the function maximum. This function sets
        `self.posterior_max_values`.

        Args:
            num_samples: The number of samples to draw.
            X_pending: A `m x d`-dim Tensor of `m` design points that have been
                submitted for function evaluation but have not yet been evaluated.

        Returns:
            A `num_samples x num_fantasies` Tensor of posterior max value samples
            (`num_fantasies=1` for non-fantasized models).
        """
        pass  # pragma: no cover


class DiscreteMaxValueBase(MaxValueBase):
    r"""Abstract base class for MES-like methods using discrete max posterior sampling.

    This class provides basic functionality for sampling posterior maximum values
    from a surrogate Gaussian process model using a discrete set of candidates.
    It supports either exact (w.r.t. the candidate set) sampling, or using a
    Gumbel approximation.
    """

    def __init__(
        self,
        model: Model,
        candidate_set: Tensor,
        num_mv_samples: int = 10,
        posterior_transform: Optional[PosteriorTransform] = None,
        use_gumbel: bool = True,
        maximize: bool = True,
        X_pending: Optional[Tensor] = None,
        train_inputs: Optional[Tensor] = None,
    ) -> None:
        r"""Single-outcome MES-like acquisition functions based on discrete MV sampling.

        Args:
            model: A fitted single-outcome model.
            candidate_set: A `n x d` Tensor including `n` candidate points to
                discretize the design space. Max values are sampled from the
                (joint) model posterior over these points.
            num_mv_samples: Number of max value samples.
            posterior_transform: A PosteriorTransform. If using a multi-output model,
                a PosteriorTransform that transforms the multi-output posterior into a
                single-output posterior is required.
            use_gumbel: If True, use Gumbel approximation to sample the max values.
            maximize: If True, consider the problem a maximization problem.
            X_pending: A `m x d`-dim Tensor of `m` design points that have been
                submitted for function evaluation but have not yet been evaluated.
            train_inputs: A `n_train x d` Tensor that the model has been fitted on.
                Not required if the model is an instance of a GPyTorch ExactGP model.
        """
        self.use_gumbel = use_gumbel

        if train_inputs is None and hasattr(model, "train_inputs"):
            train_inputs = model.train_inputs[0]
        if train_inputs is not None:
            if train_inputs.ndim > 2:
                raise NotImplementedError(
                    "Batch GP models (e.g. fantasized models) "
                    "are not yet supported by `MaxValueBase`"
                )
            train_inputs = match_batch_shape(train_inputs, candidate_set)
            candidate_set = torch.cat([candidate_set, train_inputs], dim=0)

        self.candidate_set = candidate_set

        super().__init__(
            model=model,
            num_mv_samples=num_mv_samples,
            posterior_transform=posterior_transform,
            maximize=maximize,
            X_pending=X_pending,
        )

    def _sample_max_values(
        self, num_samples: int, X_pending: Optional[Tensor] = None
    ) -> None:
        r"""Draw samples from the posterior over maximum values on a discrete set.

        These samples are used to compute Monte Carlo approximations of expectations
        over the posterior over the function maximum. This function sets
        `self.posterior_max_values`.

        Args:
            num_samples: The number of samples to draw.
            X_pending: A `m x d`-dim Tensor of `m` design points that have been
                submitted for function evaluation but have not yet been evaluated.

        Returns:
            A `num_samples x num_fantasies` Tensor of posterior max value samples
            (`num_fantasies=1` for non-fantasized models).
        """
        if self.use_gumbel:
            sample_max_values = _sample_max_value_Gumbel
        else:
            sample_max_values = _sample_max_value_Thompson
        candidate_set = self.candidate_set

        with torch.no_grad():
            if X_pending is not None:
                # Append X_pending to candidate set
                X_pending = match_batch_shape(X_pending, self.candidate_set)
                candidate_set = torch.cat([self.candidate_set, X_pending], dim=0)

            # project the candidate_set to the highest fidelity,
            # which is needed for the multi-fidelity MES
            try:
                candidate_set = self.project(candidate_set)
            except AttributeError:
                pass

            self.posterior_max_values = sample_max_values(
                model=self.model,
                candidate_set=candidate_set,
                num_samples=self.num_mv_samples,
                posterior_transform=self.posterior_transform,
                maximize=self.maximize,
            )


class qMaxValueEntropy(DiscreteMaxValueBase, MCSamplerMixin):
    r"""The acquisition function for Max-value Entropy Search.

    This acquisition function computes the mutual information of max values and
    a candidate point X. See [Wang2017mves]_ for a detailed discussion.

    The model must be single-outcome. The batch case `q > 1` is supported
    through cyclic optimization and fantasies.

    Example:
        >>> model = SingleTaskGP(train_X, train_Y)
        >>> candidate_set = torch.rand(1000, bounds.size(1))
        >>> candidate_set = bounds[0] + (bounds[1] - bounds[0]) * candidate_set
        >>> MES = qMaxValueEntropy(model, candidate_set)
        >>> mes = MES(test_X)
    """

    def __init__(
        self,
        model: Model,
        candidate_set: Tensor,
        num_fantasies: int = 16,
        num_mv_samples: int = 10,
        num_y_samples: int = 128,
        posterior_transform: Optional[PosteriorTransform] = None,
        use_gumbel: bool = True,
        maximize: bool = True,
        X_pending: Optional[Tensor] = None,
        train_inputs: Optional[Tensor] = None,
    ) -> None:
        r"""Single-outcome max-value entropy search acquisition function.

        Args:
            model: A fitted single-outcome model.
            candidate_set: A `n x d` Tensor including `n` candidate points to
                discretize the design space. Max values are sampled from the
                (joint) model posterior over these points.
            num_fantasies: Number of fantasies to generate. The higher this
                number the more accurate the model (at the expense of model
                complexity, wall time and memory). Ignored if `X_pending` is `None`.
            num_mv_samples: Number of max value samples.
            num_y_samples: Number of posterior samples at specific design point `X`.
            posterior_transform: A PosteriorTransform. If using a multi-output model,
                a PosteriorTransform that transforms the multi-output posterior into a
                single-output posterior is required.
            use_gumbel: If True, use Gumbel approximation to sample the max values.
            maximize: If True, consider the problem a maximization problem.
            X_pending: A `m x d`-dim Tensor of `m` design points that have been
                submitted for function evaluation but have not yet been evaluated.
            train_inputs: A `n_train x d` Tensor that the model has been fitted on.
                Not required if the model is an instance of a GPyTorch ExactGP model.
        """
        super().__init__(
            model=model,
            candidate_set=candidate_set,
            num_mv_samples=num_mv_samples,
            posterior_transform=posterior_transform,
            use_gumbel=use_gumbel,
            maximize=maximize,
            X_pending=X_pending,
            train_inputs=train_inputs,
        )
        MCSamplerMixin.__init__(
            self,
            sampler=SobolQMCNormalSampler(sample_shape=torch.Size([num_y_samples])),
        )
        self._init_model = model  # used for `fantasize()` when setting `X_pending`
        self.fantasies_sampler = SobolQMCNormalSampler(
            sample_shape=torch.Size([num_fantasies])
        )
        self.num_fantasies = num_fantasies
        self.set_X_pending(X_pending)  # this did not happen in the super constructor

    def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None:
        r"""Set pending points.

        Informs the acquisition function about pending design points,
        fantasizes the model on the pending points and draws max-value samples
        from the fantasized model posterior.

        Args:
            X_pending: `m x d` Tensor with `m` `d`-dim design points that have
                been submitted for evaluation but have not yet been evaluated.
        """
        try:
            init_model = self._init_model
        except AttributeError:
            # Short-circuit (this allows calling the super constructor)
            return
        if X_pending is not None:
            # fantasize the model and use this as the new model
            self.model = init_model.fantasize(
                X=X_pending,
                sampler=self.fantasies_sampler,
            )
        else:
            self.model = init_model
        super().set_X_pending(X_pending)

    def _compute_information_gain(
        self, X: Tensor, mean_M: Tensor, variance_M: Tensor, covar_mM: Tensor
    ) -> Tensor:
        r"""Computes the information gain at the design points `X`.

        Approximately computes the information gain at the design points `X`,
        for both MES with noisy observations and multi-fidelity MES with noisy
        observations and trace observations.

        The implementation is inspired by the papers on multi-fidelity MES by
        [Takeno2020mfmves]_. The notation in the comments in this function follows
        Appendix C of [Takeno2020mfmves]_.

        `num_fantasies = 1` for non-fantasized models.

        Args:
            X: A `batch_shape x 1 x d`-dim Tensor of `batch_shape` t-batches
                with `1` `d`-dim design point each.
            mean_M: A `batch_shape x num_fantasies x (m)`-dim Tensor of means.
            variance_M: A `batch_shape x num_fantasies x (m)`-dim Tensor of variances.
            covar_mM: A
                `batch_shape x num_fantasies x (m) x (1 + num_trace_observations)`-dim
                Tensor of covariances.

        Returns:
            A `num_fantasies x batch_shape`-dim Tensor of information gains at the
            given design points `X` (`num_fantasies=1` for non-fantasized models).
        """
        # compute the std_m, variance_m with noisy observation
        posterior_m = self.model.posterior(
            X.unsqueeze(-3),
            observation_noise=True,
            posterior_transform=self.posterior_transform,
        )
        # batch_shape x num_fantasies x (m) x (1 + num_trace_observations)
        mean_m = self.weight * posterior_m.mean.squeeze(-1)
        # batch_shape x num_fantasies x (m) x (1 + num_trace_observations)
        variance_m = posterior_m.distribution.covariance_matrix
        check_no_nans(variance_m)

        # compute mean and std for fM|ym, x, Dt ~ N(u, s^2)
        samples_m = self.weight * self.get_posterior_samples(posterior_m).squeeze(-1)
        # s_m x batch_shape x num_fantasies x (m) (1 + num_trace_observations)
        L = psd_safe_cholesky(variance_m)
        temp_term = torch.cholesky_solve(covar_mM.unsqueeze(-1), L).transpose(-2, -1)
        # equivalent to torch.matmul(covar_mM.unsqueeze(-2), torch.inverse(variance_m))
        # batch_shape x num_fantasies (m) x 1 x (1 + num_trace_observations)
        mean_pt1 = torch.matmul(temp_term, (samples_m - mean_m).unsqueeze(-1))
        mean_new = mean_pt1.squeeze(-1).squeeze(-1) + mean_M
        # s_m x batch_shape x num_fantasies x (m)
        variance_pt1 = torch.matmul(temp_term, covar_mM.unsqueeze(-1))
        variance_new = variance_M - variance_pt1.squeeze(-1).squeeze(-1)
        # batch_shape x num_fantasies x (m)
        stdv_new = variance_new.clamp_min(CLAMP_LB).sqrt()
        # batch_shape x num_fantasies x (m)

        # define normal distribution to compute cdf and pdf
        normal = torch.distributions.Normal(
            torch.zeros(1, device=X.device, dtype=X.dtype),
            torch.ones(1, device=X.device, dtype=X.dtype),
        )

        # Compute p(fM <= f* | ym, x, Dt)
        view_shape = torch.Size(
            [
                self.posterior_max_values.shape[0],
                # add 1s to broadcast across the batch_shape of X
                *[1 for _ in range(X.ndim - self.posterior_max_values.ndim)],
                *self.posterior_max_values.shape[1:],
            ]
        )
        # s_M x batch_shape x num_fantasies x (m)
        max_vals = self.posterior_max_values.view(view_shape).unsqueeze(1)
        # s_M x 1 x batch_shape x num_fantasies x (m)
        normalized_mvs_new = (max_vals - mean_new) / stdv_new
        # s_M x s_m x batch_shape x num_fantasies x (m) =
        # s_M x 1 x batch_shape x num_fantasies x (m)
        # - s_m x batch_shape x num_fantasies x (m)
        cdf_mvs_new = normal.cdf(normalized_mvs_new).clamp_min(CLAMP_LB)

        # Compute p(fM <= f* | x, Dt)
        stdv_M = variance_M.sqrt()
        normalized_mvs = (max_vals - mean_M) / stdv_M
        # s_M x 1 x batch_shape x num_fantasies x (m) =
        # s_M x 1 x 1 x num_fantasies x (m) - batch_shape x num_fantasies x (m)
        cdf_mvs = normal.cdf(normalized_mvs).clamp_min(CLAMP_LB)
        # s_M x 1 x batch_shape x num_fantasies x (m)

        # Compute log(p(ym | x, Dt))
        log_pdf_fm = posterior_m.distribution.log_prob(
            self.weight * samples_m
        ).unsqueeze(0)
        # 1 x s_m x batch_shape x num_fantasies x (m)

        # H0 = H(ym | x, Dt)
        H0 = posterior_m.distribution.entropy()  # batch_shape x num_fantasies x (m)

        # regression adjusted H1 estimation, H1_hat = H1_bar - beta * (H0_bar - H0)
        # H1 = E_{f*|x, Dt}[H(ym|f*, x, Dt)]
        Z = cdf_mvs_new / cdf_mvs  # s_M x s_m x batch_shape x num_fantasies x (m)
        # s_M x s_m x batch_shape x num_fantasies x (m)
        h1 = -Z * Z.log() - Z * log_pdf_fm
        check_no_nans(h1)
        dim = [0, 1]  # dimension of fm samples, fM samples
        H1_bar = h1.mean(dim=dim)
        h0 = -log_pdf_fm
        H0_bar = h0.mean(dim=dim)
        cov = ((h1 - H1_bar) * (h0 - H0_bar)).mean(dim=dim)
        beta = cov / (h0.var(dim=dim) * h1.var(dim=dim)).sqrt()
        H1_hat = H1_bar - beta * (H0_bar - H0)
        ig = H0 - H1_hat  # batch_shape x num_fantasies x (m)
        if self.posterior_max_values.ndim == 2:
            permute_idcs = [-1, *range(ig.ndim - 1)]
        else:
            permute_idcs = [-2, *range(ig.ndim - 2), -1]
        ig = ig.permute(*permute_idcs)  # num_fantasies x batch_shape x (m)
        return ig
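
# NOTE: Illustrative sketch (not part of the library API): for `q > 1`,
# `qMaxValueEntropy` accounts for previously selected batch points by
# fantasizing the model on them via `set_X_pending` (see the method above).
# `model`, `candidate_set`, `prev_candidates` and `test_X` are assumed to be
# defined as in the class Example.
#
#     >>> MES = qMaxValueEntropy(model, candidate_set)
#     >>> MES.set_X_pending(prev_candidates)  # fantasizes on the pending points
#     >>> mes_vals = MES(test_X)              # averaged over the fantasy models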


class qLowerBoundMaxValueEntropy(DiscreteMaxValueBase):
    r"""The acquisition function for General-purpose Information-Based
    Bayesian Optimisation (GIBBON).

    This acquisition function provides a computationally cheap approximation of
    the mutual information between max values and a batch of candidate points
    `X`. See [Moss2021gibbon]_ for a detailed discussion.

    The model must be single-outcome, unless using a PosteriorTransform.
    q > 1 is supported through greedy batch filling.

    Example:
        >>> model = SingleTaskGP(train_X, train_Y)
        >>> candidate_set = torch.rand(1000, bounds.size(1))
        >>> candidate_set = bounds[0] + (bounds[1] - bounds[0]) * candidate_set
        >>> qGIBBON = qLowerBoundMaxValueEntropy(model, candidate_set)
        >>> candidates, _ = optimize_acqf(qGIBBON, bounds, q=5)
    """

    def _compute_information_gain(
        self, X: Tensor, mean_M: Tensor, variance_M: Tensor, covar_mM: Tensor
    ) -> Tensor:
        r"""Compute GIBBON's approximation of information gain at the design points `X`.

        When using GIBBON for batch optimization (i.e `q > 1`), we calculate the
        additional information provided by adding a new candidate point to the current
        batch of design points (`X_pending`), rather than calculating the information
        provided by the whole batch. This allows a modest computational saving.

        Args:
            X: A `batch_shape x 1 x d`-dim Tensor of `batch_shape` t-batches
                with `1` `d`-dim design point each.
            mean_M: A `batch_shape x 1`-dim Tensor of means.
            variance_M: A `batch_shape x 1`-dim Tensor of variances
                consisting of `batch_shape` t-batches with `num_fantasies` fantasies.
            covar_mM: A `batch_shape x num_fantasies x (1 + num_trace_observations)`
                -dim Tensor of covariances.

        Returns:
            A `num_fantasies x batch_shape`-dim Tensor of information gains at the
            given design points `X`.
        """
        # TODO: give the Posterior API an add_observation_noise function to avoid
        # doing posterior computations twice

        # compute the mean_m, variance_m with noisy observation
        posterior_m = self.model.posterior(
            X, observation_noise=True, posterior_transform=self.posterior_transform
        )
        mean_m = self.weight * posterior_m.mean.squeeze(-1)
        # batch_shape x 1
        variance_m = posterior_m.variance.clamp_min(CLAMP_LB).squeeze(-1)
        # batch_shape x 1
        check_no_nans(variance_m)

        # get stdv of noiseless variance
        stdv = variance_M.sqrt()
        # batch_shape x 1

        # define normal distribution to compute cdf and pdf
        normal = torch.distributions.Normal(
            torch.zeros(1, device=X.device, dtype=X.dtype),
            torch.ones(1, device=X.device, dtype=X.dtype),
        )

        # prepare max value quantities required by GIBBON
        mvs = torch.transpose(self.posterior_max_values, 0, 1)
        # 1 x s_M
        normalized_mvs = (mvs - mean_m) / stdv
        # batch_shape x s_M
        cdf_mvs = normal.cdf(normalized_mvs).clamp_min(CLAMP_LB)
        pdf_mvs = torch.exp(normal.log_prob(normalized_mvs))
        ratio = pdf_mvs / cdf_mvs
        check_no_nans(ratio)

        # prepare squared correlation between current and target fidelity
        rhos_squared = torch.pow(covar_mM.squeeze(-1), 2) / (variance_m * variance_M)
        # batch_shape x 1
        check_no_nans(rhos_squared)

        # calculate quality contribution to the GIBBON acquisition function
        inner_term = 1 - rhos_squared * ratio * (normalized_mvs + ratio)
        acq = -0.5 * inner_term.clamp_min(CLAMP_LB).log()
        # average over posterior max samples
        acq = acq.mean(dim=1).unsqueeze(0)

        if self.X_pending is None:
            # for q=1, no repulsion term required
            return acq

        # for q>1 GIBBON requires repulsion terms r_i, where
        # r_i = log |C_i| for the predictive
        # correlation matrices C_i between each candidate point in X and
        # the m current batch elements in X_pending.
        # Each predictive covariance matrix can be expressed as
        # V_i = [[v_i, A_i], [A_i, B]] for a shared m x m tensor B.
        # So we can efficiently calculate |V_i| using the formula for
        # determinant of block matrices, i.e.
        # |V_i| = (v_i - A_i^T * B^{-1} * A_i) * |B|
        # As the |B| term does not depend on X and we later take its log,
        # it provides only a translation of the acquisition function surface
        # and can thus be ignored.

        if self.posterior_transform is not None:
            raise UnsupportedError(
                "qLowerBoundMaxValueEntropy does not support PosteriorTransforms "
                "when X_pending is not None."
            )

        X_batches = torch.cat(
            [X, self.X_pending.unsqueeze(0).repeat(X.shape[0], 1, 1)], 1
        )
        # batch_shape x (1 + m) x d
        # NOTE: This is the blocker for supporting posterior transforms.
        # We would have to process this MVN, applying whatever operations
        # are typically applied for the corresponding posterior, then applying
        # the posterior transform onto the resulting object.
        V = self.model(X_batches)
        # Evaluate terms required for A
        A = V.lazy_covariance_matrix[:, 0, 1:].unsqueeze(1)
        # batch_shape x 1 x m
        # Evaluate terms required for B
        B = self.model.posterior(
            self.X_pending,
            observation_noise=True,
            posterior_transform=self.posterior_transform,
        ).distribution.covariance_matrix.unsqueeze(0)
        # 1 x m x m

        # use determinant of block matrix formula
        inv_quad_term = inv_quad(B, A.transpose(1, 2)).unsqueeze(1)
        # NOTE: Even when using Cholesky to compute inv_quad, `V_determinant` can be
        # negative due to numerical issues. To avoid this, we clamp the variance
        # so that `V_determinant` > 0, while still allowing gradients to be
        # propagated through `inv_quad_term`, as well as through `variance_m`
        # in the expression for `r` below.
        # choosing eps to be small while avoiding numerical underflow
        eps = 1e-6 if inv_quad_term.dtype == torch.float32 else 1e-12
        V_determinant = variance_m.clamp(inv_quad_term * (1 + eps)) - inv_quad_term
        # batch_shape x 1

        # Take logs and convert covariances to correlations.
        r = V_determinant.log() - variance_m.log()  # = log(1 - inv_quad / var)
        r = 0.5 * r.transpose(0, 1)
        return acq + r
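
# NOTE: Illustrative sketch (not part of the library API): the repulsion term
# above relies on the block-matrix determinant identity
# |[[v, A], [A^T, B]]| = (v - A B^{-1} A^T) * |B|, which the code evaluates via
# `inv_quad`. A quick numerical check of the identity, with a hypothetical
# batch size `m`:
#
#     >>> m = 4
#     >>> R = torch.randn(m + 1, m + 1, dtype=torch.float64)
#     >>> V = R @ R.T + torch.eye(m + 1, dtype=torch.float64)  # random PSD matrix
#     >>> v, A, B = V[0, 0], V[0, 1:], V[1:, 1:]
#     >>> lhs = torch.linalg.det(V)
#     >>> rhs = (v - A @ torch.linalg.solve(B, A)) * torch.linalg.det(B)
#     >>> assert torch.allclose(lhs, rhs)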


class qMultiFidelityMaxValueEntropy(qMaxValueEntropy):
    r"""Multi-fidelity max-value entropy.

    The acquisition function for multi-fidelity max-value entropy search
    with support for trace observations. See [Takeno2020mfmves]_
    for a detailed discussion of the basic ideas on multi-fidelity MES
    (note that this implementation is somewhat different).

    The model must be single-outcome, unless using a PosteriorTransform.
    The batch case `q > 1` is supported through cyclic optimization and fantasies.

    Example:
        >>> model = SingleTaskGP(train_X, train_Y)
        >>> candidate_set = torch.rand(1000, bounds.size(1))
        >>> candidate_set = bounds[0] + (bounds[1] - bounds[0]) * candidate_set
        >>> MF_MES = qMultiFidelityMaxValueEntropy(model, candidate_set)
        >>> mf_mes = MF_MES(test_X)
    """

    def __init__(
        self,
        model: Model,
        candidate_set: Tensor,
        num_fantasies: int = 16,
        num_mv_samples: int = 10,
        num_y_samples: int = 128,
        posterior_transform: Optional[PosteriorTransform] = None,
        use_gumbel: bool = True,
        maximize: bool = True,
        X_pending: Optional[Tensor] = None,
        cost_aware_utility: Optional[CostAwareUtility] = None,
        project: Callable[[Tensor], Tensor] = lambda X: X,
        expand: Callable[[Tensor], Tensor] = lambda X: X,
    ) -> None:
        r"""Single-outcome max-value entropy search acquisition function.

        Args:
            model: A fitted single-outcome model.
            candidate_set: A `n x d` Tensor including `n` candidate points to
                discretize the design space, which will be used to sample the
                max values from their posteriors.
            num_fantasies: Number of fantasies to generate. The higher this
                number the more accurate the model (at the expense of model
                complexity and performance). It is only used when `X_pending`
                is not `None`.
            num_mv_samples: Number of max value samples.
            num_y_samples: Number of posterior samples at specific design point `X`.
            posterior_transform: A PosteriorTransform. If using a multi-output model,
                a PosteriorTransform that transforms the multi-output posterior into a
                single-output posterior is required.
            use_gumbel: If True, use Gumbel approximation to sample the max values.
            maximize: If True, consider the problem a maximization problem.
            X_pending: A `m x d`-dim Tensor of `m` design points that have been
                submitted for function evaluation but have not yet been evaluated.
            cost_aware_utility: A CostAwareUtility computing the cost-transformed
                utility from a candidate set and samples of increases in utility.
            project: A callable mapping a `batch_shape x q x d` tensor of design
                points to a tensor of the same shape projected to the desired
                target set (e.g. the target fidelities in case of multi-fidelity
                optimization).
            expand: A callable mapping a `batch_shape x q x d` input tensor to
                a `batch_shape x (q + q_e)' x d`-dim output tensor, where the
                `q_e` additional points in each q-batch correspond to
                additional ("trace") observations.
        """
        super().__init__(
            model=model,
            candidate_set=candidate_set,
            num_fantasies=num_fantasies,
            num_mv_samples=num_mv_samples,
            num_y_samples=num_y_samples,
            posterior_transform=posterior_transform,
            use_gumbel=use_gumbel,
            maximize=maximize,
            X_pending=X_pending,
        )
        if cost_aware_utility is None:
            cost_model = AffineFidelityCostModel(fidelity_weights={-1: 1.0})
            cost_aware_utility = InverseCostWeightedUtility(cost_model=cost_model)

        self.cost_aware_utility = cost_aware_utility
        self.expand = expand
        self.project = project
        self._cost_sampler = None
        # @TODO make sure fidelity_dims align in project, expand & cost_aware_utility
        # It seems very difficult due to the current way of handling project/expand

        # resample max values after initializing self.project
        # so that the max value samples are at the highest fidelity
        self._sample_max_values(self.num_mv_samples)

    @property
    def cost_sampler(self):
        if self._cost_sampler is None:
            # Note: Using the deepcopy here is essential. Removing this poses a
            # problem if the base model and the cost model have a different number
            # of outputs or test points (this would be caused by expand), as this
            # would trigger re-sampling the base samples in the fantasy sampler.
            # By cloning the sampler here, the right thing will happen if the
            # sizes are compatible; if they are not, this will result in
            # samples being drawn using different base samples, but it will at
            # least avoid changing the state of the fantasy sampler.
            self._cost_sampler = deepcopy(self.fantasies_sampler)
        return self._cost_sampler

    @t_batch_mode_transform(expected_q=1)
    def forward(self, X: Tensor) -> Tensor:
        r"""Evaluates `qMultiFidelityMaxValueEntropy` at the design points `X`.

        Args:
            X: A `batch_shape x 1 x d`-dim Tensor of `batch_shape` t-batches
                with `1` `d`-dim design point each.

        Returns:
            A `batch_shape`-dim Tensor of MF-MVES values at the design points `X`.
        """
        X_expand = self.expand(X)  # batch_shape x (1 + num_trace_observations) x d
        X_max_fidelity = self.project(X)  # batch_shape x 1 x d
        X_all = torch.cat((X_expand, X_max_fidelity), dim=-2).unsqueeze(-3)
        # batch_shape x num_fantasies x (2 + num_trace_observations) x d

        # Compute the posterior, posterior mean, variance without noise.
        # `_m` and `_M` in the var names denote the current and the max fidelity.
        posterior = self.model.posterior(
            X_all,
            observation_noise=False,
            posterior_transform=self.posterior_transform,
        )
        mean_M = self.weight * posterior.mean[..., -1, 0]
        # batch_shape x num_fantasies
        variance_M = posterior.variance[..., -1, 0].clamp_min(CLAMP_LB)
        # get the covariance between the low fidelities and max fidelity
        covar_mM = posterior.distribution.covariance_matrix[..., :-1, -1]
        # batch_shape x num_fantasies x (1 + num_trace_observations)

        check_no_nans(mean_M)
        check_no_nans(variance_M)
        check_no_nans(covar_mM)

        # compute the information gain (IG)
        ig = self._compute_information_gain(
            X=X_expand, mean_M=mean_M, variance_M=variance_M, covar_mM=covar_mM
        )
        ig = self.cost_aware_utility(X=X, deltas=ig, sampler=self.cost_sampler)
        return ig.mean(dim=0)  # average over the fantasies
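
# NOTE: Illustrative sketch (not part of the library API): `project` and
# `expand` are user-supplied callables. For a problem whose last input column
# is the fidelity parameter, a minimal `project` mapping designs to a target
# fidelity of 1.0 could look as follows (the column index and target value are
# assumptions for the example):
#
#     >>> def project_to_max_fidelity(X: Tensor) -> Tensor:
#     ...     X_proj = X.clone()
#     ...     X_proj[..., -1] = 1.0  # set the fidelity feature to its target value
#     ...     return X_proj
#     >>> MF_MES = qMultiFidelityMaxValueEntropy(
#     ...     model, candidate_set, project=project_to_max_fidelity
#     ... )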


class qMultiFidelityLowerBoundMaxValueEntropy(qMultiFidelityMaxValueEntropy):
    r"""Multi-fidelity acquisition function for General-purpose Information-Based
    Bayesian optimization (GIBBON).

    The acquisition function for multi-fidelity max-value entropy search
    with support for trace observations. See [Takeno2020mfmves]_
    for a detailed discussion of the basic ideas on multi-fidelity MES
    (note that this implementation is somewhat different).

    This acquisition function is similar to `qMultiFidelityMaxValueEntropy` but
    computes the information gain from the lower bound described in
    [Moss2021gibbon]_.

    The model must be single-outcome, unless using a PosteriorTransform.
    The batch case `q > 1` is supported through cyclic optimization and fantasies.

    Example:
        >>> model = SingleTaskGP(train_X, train_Y)
        >>> candidate_set = torch.rand(1000, bounds.size(1))
        >>> candidate_set = bounds[0] + (bounds[1] - bounds[0]) * candidate_set
        >>> MF_qGIBBON = qMultiFidelityLowerBoundMaxValueEntropy(model, candidate_set)
        >>> mf_gibbon = MF_qGIBBON(test_X)
    """

    def __init__(
        self,
        model: Model,
        candidate_set: Tensor,
        num_fantasies: int = 16,
        num_mv_samples: int = 10,
        num_y_samples: int = 128,
        posterior_transform: Optional[PosteriorTransform] = None,
        use_gumbel: bool = True,
        maximize: bool = True,
        cost_aware_utility: Optional[CostAwareUtility] = None,
        project: Callable[[Tensor], Tensor] = lambda X: X,
        expand: Callable[[Tensor], Tensor] = lambda X: X,
    ) -> None:
        r"""Single-outcome max-value entropy search acquisition function.

        Args:
            model: A fitted single-outcome model.
            candidate_set: A `n x d` Tensor including `n` candidate points to
                discretize the design space, which will be used to sample the
                max values from their posteriors.
            num_fantasies: Number of fantasies to generate. The higher this
                number the more accurate the model (at the expense of model
                complexity and performance). It is only used when `X_pending`
                is not `None`.
            num_mv_samples: Number of max value samples.
            num_y_samples: Number of posterior samples at specific design point `X`.
            posterior_transform: A PosteriorTransform. If using a multi-output model,
                a PosteriorTransform that transforms the multi-output posterior into a
                single-output posterior is required.
            use_gumbel: If True, use Gumbel approximation to sample the max values.
            maximize: If True, consider the problem a maximization problem.
            cost_aware_utility: A CostAwareUtility computing the cost-transformed
                utility from a candidate set and samples of increases in utility.
            project: A callable mapping a `batch_shape x q x d` tensor of design
                points to a tensor of the same shape projected to the desired
                target set (e.g. the target fidelities in case of multi-fidelity
                optimization).
            expand: A callable mapping a `batch_shape x q x d` input tensor to
                a `batch_shape x (q + q_e)' x d`-dim output tensor, where the
                `q_e` additional points in each q-batch correspond to
                additional ("trace") observations.
        """
        super().__init__(
            model=model,
            candidate_set=candidate_set,
            num_fantasies=num_fantasies,
            num_mv_samples=num_mv_samples,
            num_y_samples=num_y_samples,
            posterior_transform=posterior_transform,
            use_gumbel=use_gumbel,
            maximize=maximize,
            cost_aware_utility=cost_aware_utility,
            project=project,
            expand=expand,
        )

    def _compute_information_gain(
        self, X: Tensor, mean_M: Tensor, variance_M: Tensor, covar_mM: Tensor
    ) -> Tensor:
        r"""Compute GIBBON's approximation of information gain at the design points `X`.

        When using GIBBON for batch optimization (i.e `q > 1`), we calculate the
        additional information provided by adding a new candidate point to the current
        batch of design points (`X_pending`), rather than calculating the information
        provided by the whole batch. This allows a modest computational saving.

        Args:
            X: A `batch_shape x 1 x d`-dim Tensor of `batch_shape` t-batches
                with `1` `d`-dim design point each.
            mean_M: A `batch_shape x 1`-dim Tensor of means.
            variance_M: A `batch_shape x 1`-dim Tensor of variances
                consisting of `batch_shape` t-batches with `num_fantasies` fantasies.
            covar_mM: A `batch_shape x num_fantasies x (1 + num_trace_observations)`
                -dim Tensor of covariances.

        Returns:
            A `num_fantasies x batch_shape`-dim Tensor of information gains at the
            given design points `X`.
        """
        return qLowerBoundMaxValueEntropy._compute_information_gain(
            self, X=X, mean_M=mean_M, variance_M=variance_M, covar_mM=covar_mM
        )


def _sample_max_value_Thompson(
    model: Model,
    candidate_set: Tensor,
    num_samples: int,
    posterior_transform: Optional[PosteriorTransform] = None,
    maximize: bool = True,
) -> Tensor:
    """Samples the max values by discrete Thompson sampling.

    Should generally be called within a `with torch.no_grad()` context.

    Args:
        model: A fitted single-outcome model.
        candidate_set: A `n x d` Tensor including `n` candidate points to
            discretize the design space.
        num_samples: Number of max value samples.
        posterior_transform: A PosteriorTransform. If using a multi-output model,
            a PosteriorTransform that transforms the multi-output posterior into a
            single-output posterior is required.
        maximize: If True, consider the problem a maximization problem.

    Returns:
        A `num_samples x num_fantasies` Tensor of posterior max value samples.
    """
    posterior = model.posterior(candidate_set, posterior_transform=posterior_transform)
    weight = 1.0 if maximize else -1.0
    samples = weight * posterior.rsample(torch.Size([num_samples])).squeeze(-1)
    # samples is num_samples x (num_fantasies) x n
    max_values, _ = samples.max(dim=-1)
    if len(samples.shape) == 2:
        max_values = max_values.unsqueeze(-1)  # num_samples x num_fantasies
    return max_values


def _sample_max_value_Gumbel(
    model: Model,
    candidate_set: Tensor,
    num_samples: int,
    posterior_transform: Optional[PosteriorTransform] = None,
    maximize: bool = True,
) -> Tensor:
    """Samples the max values by Gumbel approximation.

    Should generally be called within a `with torch.no_grad()` context.

    Args:
        model: A fitted single-outcome model.
        candidate_set: A `n x d` Tensor including `n` candidate points to
            discretize the design space.
        num_samples: Number of max value samples.
        posterior_transform: A PosteriorTransform. If using a multi-output model,
            a PosteriorTransform that transforms the multi-output posterior into a
            single-output posterior is required.
        maximize: If True, consider the problem a maximization problem.

    Returns:
        A `num_samples x num_fantasies` Tensor of posterior max value samples.
    """
    # define the approximate CDF for the max value under the independence assumption
    posterior = model.posterior(candidate_set, posterior_transform=posterior_transform)
    weight = 1.0 if maximize else -1.0
    mu = weight * posterior.mean
    sigma = posterior.variance.clamp_min(1e-8).sqrt()
    # mu, sigma is (num_fantasies) X n X 1
    if len(mu.shape) == 3 and mu.shape[-1] == 1:
        mu = mu.squeeze(-1).T
        sigma = sigma.squeeze(-1).T
    # mu, sigma is now n X num_fantasies or n X 1

    # bisect search to find the quantiles 25, 50, 75
    lo = (mu - 3 * sigma).min(dim=0).values
    hi = (mu + 5 * sigma).max(dim=0).values
    num_fantasies = mu.shape[1]
    device = candidate_set.device
    dtype = candidate_set.dtype
    quantiles = torch.zeros(num_fantasies, 3, device=device, dtype=dtype)
    for i in range(num_fantasies):
        lo_, hi_ = lo[i], hi[i]
        N = norm(mu[:, i].cpu().numpy(), sigma[:, i].cpu().numpy())
        quantiles[i, :] = torch.tensor(
            [
                brentq(lambda y: np.exp(np.sum(N.logcdf(y))) - p, lo_, hi_)
                for p in [0.25, 0.50, 0.75]
            ]
        )
    q25, q50, q75 = quantiles[:, 0], quantiles[:, 1], quantiles[:, 2]
    # q25, q50, q75 are 1 dimensional tensors with size of either 1 or num_fantasies

    # parameter fitting based on matching percentiles for the Gumbel distribution
    b = (q25 - q75) / (log(log(4.0 / 3.0)) - log(log(4.0)))
    a = q50 + b * log(log(2.0))

    # inverse sampling from the fitted Gumbel CDF distribution
    sample_shape = (num_samples, num_fantasies)
    eps = torch.rand(*sample_shape, device=device, dtype=dtype)
    max_values = a - b * eps.log().mul(-1.0).log()

    return max_values  # num_samples x num_fantasies