#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
r"""
Monte-Carlo Acquisition Functions for Multi-objective Bayesian optimization.
In particular, this module contains implementations of
1) qEHVI [Daulton2020qehvi]_, and
2) qNEHVI [Daulton2021nehvi]_.
References
.. [Daulton2020qehvi]
S. Daulton, M. Balandat, and E. Bakshy. Differentiable Expected Hypervolume
Improvement for Parallel Multi-Objective Bayesian Optimization. Advances in Neural
Information Processing Systems 33, 2020.
.. [Daulton2021nehvi]
S. Daulton, M. Balandat, and E. Bakshy. Parallel Bayesian Optimization of
Multiple Noisy Objectives with Expected Hypervolume Improvement. Advances
in Neural Information Processing Systems 34, 2021.
"""
from __future__ import annotations
from collections.abc import Callable
import torch
from botorch.acquisition.multi_objective.base import MultiObjectiveMCAcquisitionFunction
from botorch.acquisition.multi_objective.objective import MCMultiOutputObjective
from botorch.exceptions.warnings import legacy_ei_numerics_warning
from botorch.models.model import Model
from botorch.sampling.base import MCSampler
from botorch.utils.multi_objective.box_decompositions.non_dominated import (
NondominatedPartitioning,
)
from botorch.utils.multi_objective.hypervolume import (
NoisyExpectedHypervolumeMixin,
SubsetIndexCachingMixin,
)
from botorch.utils.objective import compute_smoothed_feasibility_indicator
from botorch.utils.transforms import (
concatenate_pending_points,
is_ensemble,
match_batch_shape,
t_batch_mode_transform,
)
from torch import Tensor
[docs]
class qExpectedHypervolumeImprovement(
MultiObjectiveMCAcquisitionFunction, SubsetIndexCachingMixin
):
def __init__(
self,
model: Model,
ref_point: list[float] | Tensor,
partitioning: NondominatedPartitioning,
sampler: MCSampler | None = None,
objective: MCMultiOutputObjective | None = None,
constraints: list[Callable[[Tensor], Tensor]] | None = None,
X_pending: Tensor | None = None,
eta: Tensor | float = 1e-3,
fat: bool = False,
) -> None:
r"""q-Expected Hypervolume Improvement supporting m>=2 outcomes.
See [Daulton2020qehvi]_ for details.
Example:
>>> model = SingleTaskGP(train_X, train_Y)
>>> ref_point = [0.0, 0.0]
>>> qEHVI = qExpectedHypervolumeImprovement(model, ref_point, partitioning)
>>> qehvi = qEHVI(test_X)
Args:
model: A fitted model.
ref_point: A list or tensor with `m` elements representing the reference
point (in the outcome space) w.r.t. to which compute the hypervolume.
This is a reference point for the objective values (i.e. after
applying`objective` to the samples).
partitioning: A `NondominatedPartitioning` module that provides the non-
dominated front and a partitioning of the non-dominated space in hyper-
rectangles. If constraints are present, this partitioning must only
include feasible points.
sampler: The sampler used to draw base samples. If not given,
a sampler is generated using `get_sampler`.
objective: The MCMultiOutputObjective under which the samples are evaluated.
Defaults to `IdentityMCMultiOutputObjective()`.
constraints: A list of callables, each mapping a Tensor of dimension
`sample_shape x batch-shape x q x m` to a Tensor of dimension
`sample_shape x batch-shape x q`, where negative values imply
feasibility. The acquisition function will compute expected feasible
hypervolume.
X_pending: A `batch_shape x m x d`-dim Tensor of `m` design points that have
points that have been submitted for function evaluation but have not yet
been evaluated. Concatenated into `X` upon forward call. Copied and set
to have no gradient.
eta: The temperature parameter for the sigmoid function used for the
differentiable approximation of the constraints. In case of a float the
same eta is used for every constraint in constraints. In case of a
tensor the length of the tensor must match the number of provided
constraints. The i-th constraint is then estimated with the i-th
eta value.
fat: A Boolean flag indicating whether to use the heavy-tailed approximation
of the constraint indicator.
"""
legacy_ei_numerics_warning(legacy_name=type(self).__name__)
if len(ref_point) != partitioning.num_outcomes:
raise ValueError(
"The length of the reference point must match the number of outcomes. "
f"Got ref_point with {len(ref_point)} elements, but expected "
f"{partitioning.num_outcomes}."
)
ref_point = torch.as_tensor(
ref_point,
dtype=partitioning.pareto_Y.dtype,
device=partitioning.pareto_Y.device,
)
super().__init__(
model=model,
sampler=sampler,
objective=objective,
constraints=constraints,
eta=eta,
X_pending=X_pending,
)
self.register_buffer("ref_point", ref_point)
cell_bounds = partitioning.get_hypercell_bounds()
self.register_buffer("cell_lower_bounds", cell_bounds[0])
self.register_buffer("cell_upper_bounds", cell_bounds[1])
SubsetIndexCachingMixin.__init__(self)
self.fat = fat
def _compute_qehvi(self, samples: Tensor, X: Tensor | None = None) -> Tensor:
r"""Compute the expected (feasible) hypervolume improvement given MC samples.
Args:
samples: A `n_samples x batch_shape x q' x m`-dim tensor of samples.
X: A `batch_shape x q x d`-dim tensor of inputs.
Returns:
A `batch_shape x (model_batch_shape)`-dim tensor of expected hypervolume
improvement for each batch.
"""
# Note that the objective may subset the outcomes (e.g. this will usually happen
# if there are constraints present).
obj = self.objective(samples, X=X)
q = obj.shape[-2]
if self.constraints is not None:
feas_weights = compute_smoothed_feasibility_indicator(
constraints=self.constraints,
samples=samples,
eta=self.eta,
fat=self.fat,
) # `sample_shape x batch-shape x q`
device = self.ref_point.device
q_subset_indices = self.compute_q_subset_indices(q_out=q, device=device)
batch_shape = obj.shape[:-2]
# this is n_samples x input_batch_shape x
areas_per_segment = torch.zeros(
*batch_shape,
self.cell_lower_bounds.shape[-2],
dtype=obj.dtype,
device=device,
)
cell_batch_ndim = self.cell_lower_bounds.ndim - 2
sample_batch_view_shape = torch.Size(
[
batch_shape[0] if cell_batch_ndim > 0 else 1,
*[1 for _ in range(len(batch_shape) - max(cell_batch_ndim, 1))],
*self.cell_lower_bounds.shape[1:-2],
]
)
view_shape = (
*sample_batch_view_shape,
self.cell_upper_bounds.shape[-2],
1,
self.cell_upper_bounds.shape[-1],
)
for i in range(1, self.q_out + 1):
# TODO: we could use batches to compute (q choose i) and (q choose q-i)
# simultaneously since subsets of size i and q-i have the same number of
# elements. This would decrease the number of iterations, but increase
# memory usage.
q_choose_i = q_subset_indices[f"q_choose_{i}"]
# this tensor is mc_samples x batch_shape x i x q_choose_i x m
obj_subsets = obj.index_select(dim=-2, index=q_choose_i.view(-1))
obj_subsets = obj_subsets.view(
obj.shape[:-2] + q_choose_i.shape + obj.shape[-1:]
)
# since all hyperrectangles share one vertex, the opposite vertex of the
# overlap is given by the component-wise minimum.
# take the minimum in each subset
overlap_vertices = obj_subsets.min(dim=-2).values
# add batch-dim to compute area for each segment (pseudo-pareto-vertex)
# this tensor is mc_samples x batch_shape x num_cells x q_choose_i x m
overlap_vertices = torch.min(
overlap_vertices.unsqueeze(-3), self.cell_upper_bounds.view(view_shape)
)
# subtract cell lower bounds, clamp min at zero
lengths_i = (
overlap_vertices - self.cell_lower_bounds.view(view_shape)
).clamp_min(0.0)
# take product over hyperrectangle side lengths to compute area
# sum over all subsets of size i
areas_i = lengths_i.prod(dim=-1)
# if constraints are present, apply a differentiable approximation of
# the indicator function
if self.constraints is not None:
feas_subsets = feas_weights.index_select(
dim=-1, index=q_choose_i.view(-1)
).view(feas_weights.shape[:-1] + q_choose_i.shape)
areas_i = areas_i * feas_subsets.unsqueeze(-3).prod(dim=-1)
areas_i = areas_i.sum(dim=-1)
# Using the inclusion-exclusion principle, set the sign to be positive
# for subsets of odd sizes and negative for subsets of even size
areas_per_segment += (-1) ** (i + 1) * areas_i
# sum over segments and average over MC samples
return areas_per_segment.sum(dim=-1).mean(dim=0)
[docs]
@concatenate_pending_points
@t_batch_mode_transform()
def forward(self, X: Tensor) -> Tensor:
posterior = self.model.posterior(X)
samples = self.get_posterior_samples(posterior)
return self._compute_qehvi(samples=samples, X=X)
[docs]
class qNoisyExpectedHypervolumeImprovement(
NoisyExpectedHypervolumeMixin, qExpectedHypervolumeImprovement
):
def __init__(
self,
model: Model,
ref_point: list[float] | Tensor,
X_baseline: Tensor,
sampler: MCSampler | None = None,
objective: MCMultiOutputObjective | None = None,
constraints: list[Callable[[Tensor], Tensor]] | None = None,
X_pending: Tensor | None = None,
eta: Tensor | float = 1e-3,
fat: bool = False,
prune_baseline: bool = False,
alpha: float = 0.0,
cache_pending: bool = True,
max_iep: int = 0,
incremental_nehvi: bool = True,
cache_root: bool = True,
marginalize_dim: int | None = None,
) -> None:
r"""q-Noisy Expected Hypervolume Improvement supporting m>=2 outcomes.
See [Daulton2021nehvi]_ for details.
Example:
>>> model = SingleTaskGP(train_X, train_Y)
>>> ref_point = [0.0, 0.0]
>>> qNEHVI = qNoisyExpectedHypervolumeImprovement(model, ref_point, train_X)
>>> qnehvi = qNEHVI(test_X)
Args:
model: A fitted model.
ref_point: A list or tensor with `m` elements representing the reference
point (in the outcome space) w.r.t. to which compute the hypervolume.
This is a reference point for the objective values (i.e. after
applying `objective` to the samples).
X_baseline: A `r x d`-dim Tensor of `r` design points that have already
been observed. These points are considered as potential approximate
pareto-optimal design points.
sampler: The sampler used to draw base samples. If not given,
a sampler is generated using `get_sampler`.
Note: a pareto front is created for each mc sample, which can be
computationally intensive for `m` > 2.
objective: The MCMultiOutputObjective under which the samples are
evaluated. Defaults to `IdentityMCMultiOutputObjective()`.
constraints: A list of callables, each mapping a Tensor of dimension
`sample_shape x batch-shape x q x m` to a Tensor of dimension
`sample_shape x batch-shape x q`, where negative values imply
feasibility. The acquisition function will compute expected feasible
hypervolume.
X_pending: A `batch_shape x m x d`-dim Tensor of `m` design points that
have points that have been submitted for function evaluation, but
have not yet been evaluated.
eta: The temperature parameter for the sigmoid function used for the
differentiable approximation of the constraints. In case of a float the
same `eta` is used for every constraint in constraints. In case of a
tensor the length of the tensor must match the number of provided
constraints. The i-th constraint is then estimated with the i-th
`eta` value. For more details, on this parameter, see the docs of
`compute_smoothed_feasibility_indicator`.
fat: A Boolean flag indicating whether to use the heavy-tailed approximation
of the constraint indicator.
prune_baseline: If True, remove points in `X_baseline` that are
highly unlikely to be the pareto optimal and better than the
reference point. This can significantly improve computation time and
is generally recommended. In order to customize pruning parameters,
instead manually call `prune_inferior_points_multi_objective` on
`X_baseline` before instantiating the acquisition function.
alpha: The hyperparameter controlling the approximate non-dominated
partitioning. The default value of 0.0 means an exact partitioning
is used. As the number of objectives `m` increases, consider increasing
this parameter in order to limit computational complexity.
cache_pending: A boolean indicating whether to use cached box
decompositions (CBD) for handling pending points. This is
generally recommended.
max_iep: The maximum number of pending points before the box
decompositions will be recomputed.
incremental_nehvi: A boolean indicating whether to compute the
incremental NEHVI from the `i`th point where `i=1, ..., q`
under sequential greedy optimization, or the full qNEHVI over
`q` points.
cache_root: A boolean indicating whether to cache the root
decomposition over `X_baseline` and use low-rank updates.
marginalize_dim: A batch dimension that should be marginalized. For example,
this is useful when using a batched fully Bayesian model.
"""
legacy_ei_numerics_warning(legacy_name=type(self).__name__)
MultiObjectiveMCAcquisitionFunction.__init__(
self,
model=model,
sampler=sampler,
objective=objective,
constraints=constraints,
eta=eta,
)
SubsetIndexCachingMixin.__init__(self)
NoisyExpectedHypervolumeMixin.__init__(
self,
model=model,
ref_point=ref_point,
X_baseline=X_baseline,
sampler=self.sampler,
objective=self.objective,
constraints=self.constraints,
X_pending=X_pending,
prune_baseline=prune_baseline,
alpha=alpha,
cache_pending=cache_pending,
max_iep=max_iep,
incremental_nehvi=incremental_nehvi,
cache_root=cache_root,
marginalize_dim=marginalize_dim,
)
self.fat = fat
[docs]
@concatenate_pending_points
@t_batch_mode_transform()
def forward(self, X: Tensor) -> Tensor:
X_full = torch.cat([match_batch_shape(self.X_baseline, X), X], dim=-2)
# NOTE: To ensure that we correctly sample `f(X)` from the joint distribution
# `f((X_baseline, X)) ~ P(f | D)`, it is critical to compute the joint posterior
# over X *and* X_baseline -- which also contains pending points whenever there
# are any -- since the baseline and pending values `f(X_baseline)` are
# generally pre-computed and cached before the `forward` call, see the docs of
# `cache_pending` for details.
# TODO: Improve the efficiency by not re-computing the X_baseline-X_baseline
# covariance matrix, but only the covariance of
# 1) X and X, and
# 2) X and X_baseline.
posterior = self.model.posterior(X_full)
# Account for possible one-to-many transform and the MCMC batch dimension in
# `SaasFullyBayesianSingleTaskGP`
event_shape_lag = 1 if is_ensemble(self.model) else 2
n_w = (
posterior._extended_shape()[X_full.dim() - event_shape_lag]
// X_full.shape[-2]
)
q_in = X.shape[-2] * n_w
self._set_sampler(q_in=q_in, posterior=posterior)
samples = self._get_f_X_samples(posterior=posterior, q_in=q_in)
# Add previous nehvi from pending points.
return self._compute_qehvi(samples=samples, X=X) + self._prev_nehvi