I'm trying to understand the hyperparameter optimization implemented in scikit-learn. I'm using the basic example presented here with an alternative data set of 100 observations of the Rastrigin test function (see code below). I copy-pasted the internal functions of GP.fit and GP.log_marginal_likelihood to play around with them.
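For context, the observations used below appear to be noise-free evaluations of the 1-D Rastrigin function f(x) = 10 + x² − 10·cos(2πx) at roughly 100 points in [−5.12, 5.12]. A comparable data set could be generated with a sketch like the following (the exact x-locations used in the question are not reproduced here):

```python
# Sketch: generate a data set comparable to the one in the question.
# Assumption: the values below are noise-free 1-D Rastrigin samples.
import numpy as np

def rastrigin_1d(x, A=10.0):
    """1-D Rastrigin test function: f(x) = A + x**2 - A*cos(2*pi*x)."""
    return A + x ** 2 - A * np.cos(2.0 * np.pi * x)

rng = np.random.default_rng(1)
X_demo = np.sort(rng.uniform(-5.12, 5.12, size=100)).reshape(-1, 1)
y_demo = rastrigin_1d(X_demo.ravel())
```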
When optimizing this model, I normally get a log-marginal-likelihood value of 569.619, which leads to the following GP, whose confidence interval looks pretty messy:
Since I have often heard that the log-marginal-likelihood value should be positive, I added the following if-condition to the respective function to penalize negative LML values (disregarding that this is extremely bad style, especially for the optimizer):

```python
if log_likelihood < 0:
    log_likelihood = -np.inf
```

When I re-tested the fitting, I got LMLs of about 404.084 and the following GP, which seems to be the better-fitting model:
- Which is the better-fitting model?
- Why does the model with the lower LML seem to fit better, given that we are minimizing the negative LML, i.e., maximizing the LML?
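Incidentally, the same penalty could presumably be applied without copy-pasting the scikit-learn internals, by overriding `log_marginal_likelihood` in a subclass. A minimal sketch (PenalizedGPR is a hypothetical name, not part of scikit-learn, and I have not verified that it matches the copied code below exactly):

```python
# Sketch: apply the same "negative LML is infeasible" penalty by subclassing
# GaussianProcessRegressor instead of copy-pasting its internals.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

class PenalizedGPR(GaussianProcessRegressor):
    def log_marginal_likelihood(self, theta=None, eval_gradient=False,
                                clone_kernel=True):
        result = super().log_marginal_likelihood(
            theta, eval_gradient=eval_gradient, clone_kernel=clone_kernel)
        if eval_gradient:
            lml, grad = result
            # treat negative LML values as infeasible for the optimizer
            return (-np.inf, grad) if lml < 0 else (lml, grad)
        return -np.inf if result < 0 else result
```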
Full code:

```python
print(__doc__)
import warnings
from operator import itemgetter

import numpy as np
import scipy.optimize
from matplotlib import pyplot as plt
from scipy.linalg import cholesky, cho_solve, solve_triangular
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.base import MultiOutputMixin
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_array
from sklearn.utils.optimize import _check_optimize_result
from sklearn.utils.validation import _deprecate_positional_args

np.random.seed(1)


def alt_log_marginal_likelihood(gp, theta=None, eval_gradient=False, clone_kernel=True):
    if theta is None:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated for theta!=None")
        return gp.log_marginal_likelihood_value_

    if clone_kernel:
        kernel = gp.kernel_.clone_with_theta(theta)
    else:
        kernel = gp.kernel_
        kernel.theta = theta

    if eval_gradient:
        K, K_gradient = kernel(gp.X_train_, eval_gradient=True)
    else:
        K = kernel(gp.X_train_)

    K[np.diag_indices_from(K)] += gp.alpha
    try:
        L = cholesky(K, lower=True)  # Line 2
    except np.linalg.LinAlgError:
        return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf

    y_train = gp.y_train_
    if y_train.ndim == 1:
        y_train = y_train[:, np.newaxis]

    alpha = cho_solve((L, True), y_train)  # Line 3

    log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
    log_likelihood_dims -= np.log(np.diag(L)).sum()
    log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
    log_likelihood = log_likelihood_dims.sum(-1)  # sum over dimensions

    if eval_gradient:  # compare Equation 5.9 from GPML
        tmp = np.einsum("ik,jk->ijk", alpha, alpha)  # k: output-dimension
        tmp -= cho_solve((L, True), np.eye(K.shape[0]))[:, :, np.newaxis]
        log_likelihood_gradient_dims = 0.5 * np.einsum("ijl,jik->kl", tmp, K_gradient)
        log_likelihood_gradient = log_likelihood_gradient_dims.sum(-1)

    # the added if-condition described above
    if log_likelihood < 0:
        log_likelihood = -np.inf

    if eval_gradient:
        return log_likelihood, log_likelihood_gradient
    else:
        return log_likelihood


def alt_fit(gp, X, y):
    if gp.kernel is None:  # Use an RBF kernel as default
        gp.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(1.0, length_scale_bounds="fixed")
    else:
        gp.kernel_ = clone(gp.kernel)

    gp._rng = check_random_state(gp.random_state)

    if gp.normalize_y:
        gp._y_train_mean = np.mean(y, axis=0)
        gp._y_train_std = np.std(y, axis=0)
        y = (y - gp._y_train_mean) / gp._y_train_std
    else:
        gp._y_train_mean = np.zeros(1)
        gp._y_train_std = 1

    if np.iterable(gp.alpha) and gp.alpha.shape[0] != y.shape[0]:
        if gp.alpha.shape[0] == 1:
            gp.alpha = gp.alpha[0]
        else:
            raise ValueError("alpha must be a scalar or an array"
                             " with same number of entries as y.(%d != %d)"
                             % (gp.alpha.shape[0], y.shape[0]))

    gp.X_train_ = np.copy(X) if gp.copy_X_train else X
    gp.y_train_ = np.copy(y) if gp.copy_X_train else y

    if gp.optimizer is not None and gp.kernel_.n_dims > 0:
        def obj_func(theta, eval_gradient=True):
            if eval_gradient:
                lml, grad = alt_log_marginal_likelihood(gp, theta, eval_gradient=True, clone_kernel=True)
                return -lml, -grad
            else:
                return -alt_log_marginal_likelihood(gp, theta, clone_kernel=True)

        optima = [gp._constrained_optimization(obj_func, gp.kernel_.theta, gp.kernel_.bounds)]

        if gp.n_restarts_optimizer > 0:
            if not np.isfinite(gp.kernel_.bounds).all():
                raise ValueError("Multiple optimizer restarts (n_restarts_optimizer>0) "
                                 "requires that all bounds are finite.")
            bounds = gp.kernel_.bounds
            for iteration in range(gp.n_restarts_optimizer):
                theta_initial = gp._rng.uniform(bounds[:, 0], bounds[:, 1])
                optima.append(gp._constrained_optimization(obj_func, theta_initial, bounds))

        lml_values = list(map(itemgetter(1), optima))
        gp.kernel_.theta = optima[np.argmin(lml_values)][0]
        gp.log_marginal_likelihood_value_ = -np.min(lml_values)
    else:
        gp.log_marginal_likelihood_value_ = alt_log_marginal_likelihood(gp, gp.kernel_.theta, clone_kernel=True)

    K = gp.kernel_(gp.X_train_)
    K[np.diag_indices_from(K)] += gp.alpha
    try:
        gp.L_ = cholesky(K, lower=True)  # Line 2
        gp._K_inv = None
    except np.linalg.LinAlgError as exc:
        exc.args = ("The kernel, %s, is not returning a "
                    "positive definite matrix. Try gradually "
                    "increasing the 'alpha' parameter of your "
                    "GaussianProcessRegressor estimator." % gp.kernel_,) + exc.args
        raise
    gp.alpha_ = cho_solve((gp.L_, True), gp.y_train_)  # Line 3
    return gp


X = np.array((-5.09717005e+00, -4.97371994e+00, -4.90522017e+00, -4.76561902e+00, -4.67691785e+00,
              -4.54340855e+00, -4.47258867e+00, -4.37597423e+00, -4.26686261e+00, -4.17197691e+00,
              -3.99791328e+00, -3.97123439e+00, -3.79411596e+00, -3.74365655e+00, -3.63353746e+00,
              -3.52356937e+00, -3.46841329e+00, -3.28546064e+00, -3.22201979e+00, -3.09407224e+00,
              -2.97760593e+00, -2.93012849e+00, -2.78277215e+00, -2.69112708e+00, -2.66146750e+00,
              -2.55933162e+00, -2.38965976e+00, -2.29507762e+00, -2.16632907e+00, -2.14792914e+00,
              -2.00087737e+00, -1.93692937e+00, -1.76566894e+00, -1.73169083e+00, -1.54399602e+00,
              -1.47022061e+00, -1.34075771e+00, -1.32753621e+00, -1.15529292e+00, -1.09658954e+00,
              -9.66129799e-01, -8.36047383e-01, -7.37663792e-01, -6.70379935e-01, -5.77235907e-01,
              -5.00116512e-01, -3.62700012e-01, -2.51988573e-01, -1.39697953e-01, -4.37808815e-02,
              +8.03814053e-02, +1.83303592e-01, +3.00678855e-01, +3.82356967e-01, +4.74142871e-01,
              +5.80786139e-01, +6.56322792e-01, +7.48888247e-01, +8.30761981e-01, +9.92463873e-01,
              +1.08551669e+00, +1.20559816e+00, +1.28581523e+00, +1.35538103e+00, +1.51986166e+00,
              +1.57229312e+00, +1.67100322e+00, +1.75514038e+00, +1.84515657e+00, +2.02161206e+00,
              +2.11086824e+00, +2.19530943e+00, +2.29571550e+00, +2.41376546e+00, +2.51742391e+00,
              +2.63982950e+00, +2.73989004e+00, +2.82681192e+00, +2.89377000e+00, +3.03091821e+00,
              +3.07864347e+00, +3.26376424e+00, +3.37361362e+00, +3.39946483e+00, +3.49132403e+00,
              +3.61596989e+00, +3.70512010e+00, +3.79136456e+00, +3.98995451e+00, +4.01805376e+00,
              +4.17349187e+00, +4.23815199e+00, +4.33461262e+00, +4.49142727e+00, +4.53227252e+00,
              +4.67593419e+00, +4.79455493e+00, +4.82796882e+00, +4.92600534e+00, +5.01985017e+00))
X = X.reshape(-1, 1)

y = np.array((+2.77877424e+01, +2.48739078e+01, +2.57826114e+01, +3.17313269e+01, +3.63057704e+01,
              +4.02729149e+01, +3.98560991e+01, +3.62633692e+01, +2.92636442e+01, +2.26970723e+01,
              +1.59841701e+01, +1.59335926e+01, +2.16587881e+01, +2.44134295e+01, +2.98843638e+01,
              +3.23060871e+01, +3.18335943e+01, +2.30039205e+01, +1.86324052e+01, +1.12698526e+01,
              +8.96496488e+00, +9.53394799e+00, +1.56992062e+01, +2.08574751e+01, +2.23635987e+01,
              +2.58633207e+01, +2.34019619e+01, +1.80619753e+01, +9.67462276e+00, +8.63098130e+00,
              +4.00366220e+00, +4.52668010e+00, +1.21346680e+01, +1.41466166e+01, +2.20042686e+01,
              +2.19870090e+01, +1.71960353e+01, +1.64436611e+01, +5.72909944e+00, +2.98825178e+00,
              +1.15899978e+00, +5.55202711e+00, +1.13184788e+01, +1.52460131e+01, +1.91786100e+01,
              +2.02501138e+01, +1.66355828e+01, +1.01884407e+01, +3.63066416e+00, +3.77891270e-01,
              +1.25496446e+00, +5.96452667e+00, +1.32211156e+01, +1.75364564e+01, +2.00931269e+01,
              +1.90764765e+01, +1.59826585e+01, +1.06306866e+01, +5.83072947e+00, +9.96192975e-01,
              +2.58749742e+00, +8.69966674e+00, +1.38847129e+01, +1.79850279e+01, +2.22322121e+01,
              +2.14580925e+01, +1.75544557e+01, +1.27575941e+01, +7.77563531e+00, +4.17897190e+00,
              +6.78552043e+00, +1.14503031e+01, +1.81033636e+01, +2.43939397e+01, +2.62775562e+01,
              +2.33511903e+01, +1.81417978e+01, +1.33498195e+01, +1.05199577e+01, +9.37456658e+00,
              +1.06742362e+01, +2.15159119e+01, +2.83904739e+01, +2.96267206e+01, +3.21744890e+01,
              +3.05359097e+01, +2.65105789e+01, +2.18045944e+01, +1.59396496e+01, +1.62090247e+01,
              +2.27939041e+01, +2.72181869e+01, +3.38583153e+01, +4.01584157e+01, +4.03366107e+01,
              +3.63518883e+01, +3.02247111e+01, +2.86039723e+01, +2.53269640e+01, +2.52765732e+01))

x = np.atleast_2d(np.linspace(-5.12, 5.12, 1000)).T

kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, normalize_y=True, alpha=1.0E-10)
gp = alt_fit(gp, X, y)
print("LML: " + str(gp.log_marginal_likelihood_value_))

y_pred, sigma = gp.predict(x, return_std=True)

plt.figure()
plt.plot(X, y, 'r.', markersize=10, label='Observations')
plt.plot(x, y_pred, 'b-', label='Prediction')
plt.fill(np.concatenate([x, x[::-1]]),
         np.concatenate([y_pred - 1.9600 * sigma,
                         (y_pred + 1.9600 * sigma)[::-1]]),
         alpha=.5, fc='b', ec='None', label='95% confidence interval')
plt.xlabel('$x$')
plt.ylabel('$f(x)$')
plt.ylim(-20, 50)
plt.legend(loc='upper left')
plt.show()
```
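To compare the two fits on a common scale, the unpenalized LML at the fitted hyperparameters can also be queried from the fitted estimator via scikit-learn's public `log_marginal_likelihood` method. A small sketch, run after either fitting variant:

```python
# Sketch: query the standard (unpenalized) LML at the fitted hyperparameters,
# so both fitting variants can be compared using the same definition.
theta_opt = gp.kernel_.theta
print("theta:", theta_opt)
print("LML at theta:", gp.log_marginal_likelihood(theta_opt))
```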
