Commit

Remove BayesSearchCV(iid=) parameter deprecated in sklearn 0.24 (scikit-optimize#988)

* Remove BayesSearchCV(iid=) parameter deprecated in sklearn

Fixes scikit-optimize#978

* Remove now unused weights= parameter

* Update skopt/searchcv.py

Co-authored-by: Tim Head <[email protected]>

* Fix BayesSearchCV repr/pprint; adapt to the changed sklearn.model_selection._validation._fit_and_score

* Update searchcv.py

Further improvements on kernc's previous commits. iid is removed entirely to prevent trouble with sklearn's pretty-printing utils, which look up constructor params when printing to the REPL. sklearn 0.24 also changed what the cross-validation helper returns, so the dict destructuring was adjusted (a little dirty); see the sketch after this commit message.

* Update searchcv.py

* Revert unrelated changes

* PEP8 format; add comment

* Revert reverting "unrelated changes"

This is required to pass tests/test_searchcv.py
with scikit-learn 0.24+.

* Migrate BayesSearchCV to sklearn BaseSearchCV._run_search() API (sketched after this commit message)

Fixes scikit-optimize#718

* add todo item

* Add WhatsNew entry

Co-authored-by: Tim Head <[email protected]>
Co-authored-by: bole1 <[email protected]>
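
For context on the _fit_and_score change referenced above: scikit-learn < 0.24 returned one tuple per (candidate, split) that callers unpacked positionally, while scikit-learn >= 0.24 returns one dict per fit that is destructured by key. A rough sketch of the two shapes (illustrative only; _fit_and_score is a private sklearn helper, and the exact fields and key names depend on the return_* flags):

# scikit-learn < 0.24: positional unpacking of result tuples
out = [(0.81, 13, 0.02, 0.001, {"C": 1.0})]
(test_scores, test_sample_counts,
 fit_time, score_time, parameters) = zip(*out)

# scikit-learn >= 0.24: one result dict per fit, destructured by key
out = [{"test_scores": 0.81, "n_test_samples": 13,
        "fit_time": 0.02, "score_time": 0.001, "parameters": {"C": 1.0}}]
test_scores = [result["test_scores"] for result in out]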
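
And a minimal sketch of the BaseSearchCV._run_search() hook that the migration commit adopts: BaseSearchCV.fit() hands subclasses an evaluate_candidates() closure that fits each candidate on every CV split and returns the cv_results_ accumulated so far (toy subclass for illustration, not the actual BayesSearchCV code):

from sklearn.model_selection._search import BaseSearchCV

class MinimalSearchCV(BaseSearchCV):
    """Toy subclass showing the hook BayesSearchCV now implements."""

    def _run_search(self, evaluate_candidates):
        # evaluate_candidates() accepts a list of parameter dicts and
        # returns the running cv_results_ dict after scoring them.
        results = evaluate_candidates([{"C": 1.0}, {"C": 10.0}])
        print(max(results["mean_test_score"]))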
3 people authored May 4, 2021
1 parent c95a3e5 commit 9461bfe
Showing 2 changed files with 44 additions and 222 deletions.
7 changes: 7 additions & 0 deletions doc/whats_new/v0.9.rst
@@ -7,3 +7,10 @@
 Version 0.9.0
 =============
 **In Development**
+
+:mod:`skopt.searchcv`
+---------------------
+- |Fix| Fix :obj:`skopt.searchcv.BayesSearchCV` for scikit-learn >= 0.24.
+  :pr:`988`
+- |API| Deprecate :class:`skopt.searchcv.BayesSearchCV` parameter `iid=`.
+  :pr:`988`
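
For users, the net effect of the two entries above: constructing BayesSearchCV with iid= still works but emits the deprecation warning shown in the diff below, and the value is otherwise ignored. A small sketch against the patched class (the estimator and search space are arbitrary examples):

from sklearn.svm import SVC
from skopt import BayesSearchCV

# Warns: "The `iid` parameter has been deprecated and will be ignored."
opt = BayesSearchCV(SVC(), {"C": (1e-3, 1e3, "log-uniform")},
                    n_iter=16, iid=True)

# Forward-compatible spelling: simply drop the argument.
opt = BayesSearchCV(SVC(), {"C": (1e-3, 1e3, "log-uniform")}, n_iter=16)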
259 changes: 37 additions & 222 deletions skopt/searchcv.py
@@ -1,21 +1,17 @@
 import warnings
 
 try:
     from collections.abc import Sized
 except ImportError:
     from collections import Sized
-from collections import defaultdict
-from functools import partial
 
 import numpy as np
 from scipy.stats import rankdata
 
-import sklearn
-from sklearn.base import is_classifier, clone
-from joblib import Parallel, delayed
 from sklearn.model_selection._search import BaseSearchCV
 from sklearn.utils import check_random_state
-from sklearn.utils.fixes import MaskedArray
 
-from sklearn.utils.validation import indexable, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
 try:
     from sklearn.metrics import check_scoring
 except ImportError:
@@ -115,11 +111,6 @@ class BayesSearchCV(BaseSearchCV):
           - A string, giving an expression as a function of n_jobs,
             as in '2*n_jobs'
 
-    iid : boolean, default=True
-        If True, the data is assumed to be identically distributed across
-        the folds, and the loss minimized is the total loss per sample,
-        and not the mean loss across the folds.
-
     cv : int, cross-validation generator or an iterable, optional
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
@@ -289,7 +280,7 @@ class BayesSearchCV(BaseSearchCV):
 
     def __init__(self, estimator, search_spaces, optimizer_kwargs=None,
                  n_iter=50, scoring=None, fit_params=None, n_jobs=1,
-                 n_points=1, iid=True, refit=True, cv=None, verbose=0,
+                 n_points=1, iid='deprecated', refit=True, cv=None, verbose=0,
                  pre_dispatch='2*n_jobs', random_state=None,
                  error_score='raise', return_train_score=False):
 
@@ -305,9 +296,14 @@ def __init__(self, estimator, search_spaces, optimizer_kwargs=None,
         # in the constructor and be passed in ``fit``.
         self.fit_params = fit_params
 
+        if iid != "deprecated":
+            warnings.warn("The `iid` parameter has been deprecated "
+                          "and will be ignored.")
+        self.iid = iid  # For sklearn repr pprint
+
         super(BayesSearchCV, self).__init__(
             estimator=estimator, scoring=scoring,
-            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
+            n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score)
 
@@ -364,170 +360,11 @@ def _check_search_space(self, search_space):
                 "Search space should be provided as a dict or list of dict,"
                 "got %s" % search_space)
 
-    # copied for compatibility with 0.19 sklearn from 0.18 BaseSearchCV
-    @property
-    def best_score_(self):
-        check_is_fitted(self, 'cv_results_')
-        return self.cv_results_['mean_test_score'][self.best_index_]
-
-    # copied for compatibility with 0.19 sklearn from 0.18 BaseSearchCV
-    @property
-    def best_params_(self):
-        check_is_fitted(self, 'cv_results_')
-        return self.cv_results_['params'][self.best_index_]
-
     @property
     def optimizer_results_(self):
         check_is_fitted(self, '_optim_results')
         return self._optim_results
 
-    # copied for compatibility with 0.19 sklearn from 0.18 BaseSearchCV
-    def _fit(self, X, y, groups, parameter_iterable):
-        """
-        Actual fitting, performing the search over parameters.
-        Taken from https://github.com/scikit-learn/scikit-learn/blob/0.18.X
-        .../sklearn/model_selection/_search.py
-        """
-        estimator = self.estimator
-        cv = sklearn.model_selection._validation.check_cv(
-            self.cv, y, classifier=is_classifier(estimator))
-        self.scorer_ = check_scoring(
-            self.estimator, scoring=self.scoring)
-
-        X, y, groups = indexable(X, y, groups)
-        n_splits = cv.get_n_splits(X, y, groups)
-        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
-            n_candidates = len(parameter_iterable)
-            print("Fitting {0} folds for each of {1} candidates, totalling"
-                  " {2} fits".format(n_splits, n_candidates,
-                                     n_candidates * n_splits))
-
-        base_estimator = clone(self.estimator)
-        pre_dispatch = self.pre_dispatch
-
-        cv_iter = list(cv.split(X, y, groups))
-        out = Parallel(
-            n_jobs=self.n_jobs, verbose=self.verbose,
-            pre_dispatch=pre_dispatch
-        )(delayed(sklearn.model_selection._validation._fit_and_score)(
-                clone(base_estimator),
-                X, y, self.scorer_,
-                train, test, self.verbose, parameters,
-                fit_params=self.fit_params,
-                return_train_score=self.return_train_score,
-                return_n_test_samples=True,
-                return_times=True, return_parameters=True,
-                error_score=self.error_score
-            )
-            for parameters in parameter_iterable
-            for train, test in cv_iter)
-
-        # if one choose to see train score, "out" will contain train score info
-        if self.return_train_score:
-            (train_scores, test_scores, test_sample_counts,
-             fit_time, score_time, parameters) = zip(*out)
-        else:
-            (test_scores, test_sample_counts,
-             fit_time, score_time, parameters) = zip(*out)
-
-        candidate_params = parameters[::n_splits]
-        n_candidates = len(candidate_params)
-
-        results = dict()
-
-        def _store(key_name, array, weights=None, splits=False, rank=False):
-            """A small helper to store the scores/times to the cv_results_"""
-            array = np.array(array, dtype=np.float64).reshape(n_candidates,
-                                                              n_splits)
-            if splits:
-                for split_i in range(n_splits):
-                    results["split%d_%s"
-                            % (split_i, key_name)] = array[:, split_i]
-
-            array_means = np.average(array, axis=1, weights=weights)
-            results['mean_%s' % key_name] = array_means
-            # Weighted std is not directly available in numpy
-            array_stds = np.sqrt(np.average((array -
-                                             array_means[:, np.newaxis]) ** 2,
-                                            axis=1, weights=weights))
-            results['std_%s' % key_name] = array_stds
-
-            if rank:
-                results["rank_%s" % key_name] = np.asarray(
-                    rankdata(-array_means, method='min'), dtype=np.int32)
-
-        # Computed the (weighted) mean and std for test scores alone
-        # NOTE test_sample counts (weights) remain the same for all candidates
-        test_sample_counts = np.array(test_sample_counts[:n_splits],
-                                      dtype=np.int)
-
-        _store('test_score', test_scores, splits=True, rank=True,
-               weights=test_sample_counts if self.iid else None)
-        if self.return_train_score:
-            _store('train_score', train_scores, splits=True)
-        _store('fit_time', fit_time)
-        _store('score_time', score_time)
-
-        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
-        best_parameters = candidate_params[best_index]
-
-        # Use one MaskedArray and mask all the places where the param is not
-        # applicable for that candidate. Use defaultdict as each candidate may
-        # not contain all the params
-        param_results = defaultdict(partial(np.ma.array,
-                                            np.empty(n_candidates,),
-                                            mask=True,
-                                            dtype=object))
-        for cand_i, params in enumerate(candidate_params):
-            for name, value in params.items():
-                # An all masked empty array gets created for the key
-                # `"param_%s" % name` at the first occurence of `name`.
-                # Setting the value at an index also unmasks that index
-                param_results["param_%s" % name][cand_i] = value
-
-        results.update(param_results)
-
-        # Store a list of param dicts at the key 'params'
-        results['params'] = candidate_params
-
-        self.cv_results_ = results
-        self.best_index_ = best_index
-        self.n_splits_ = n_splits
-
-        if self.refit:
-            # fit the best estimator using the entire dataset
-            # clone first to work around broken estimators
-            best_estimator = clone(base_estimator).set_params(
-                **best_parameters)
-            if y is not None:
-                best_estimator.fit(X, y, **self.fit_params)
-            else:
-                best_estimator.fit(X, **self.fit_params)
-            self.best_estimator_ = best_estimator
-        return self
-
-    def _fit_best_model(self, X, y):
-        """Fit the estimator copy with best parameters found to the
-        provided data.
-
-        Parameters
-        ----------
-        X : array-like, shape = [n_samples, n_features]
-            Input data, where n_samples is the number of samples and
-            n_features is the number of features.
-
-        y : array-like, shape = [n_samples] or [n_samples, n_output],
-            Target relative to X for classification or regression.
-
-        Returns
-        -------
-        self
-        """
-        self.best_estimator_ = clone(self.estimator)
-        self.best_estimator_.set_params(**self.best_params_)
-        self.best_estimator_.fit(X, y, **(self.fit_params or {}))
-        return self
-
     def _make_optimizer(self, params_space):
         """Instantiate skopt Optimizer class.
@@ -556,10 +393,9 @@ def _make_optimizer(self, params_space):
 
         return optimizer
 
-    def _step(self, X, y, search_space, optimizer, groups=None, n_points=1):
+    def _step(self, search_space, optimizer, evaluate_candidates, n_points=1):
        """Generate n_jobs parameters and evaluate them in parallel.
        """
-
         # get parameter values to evaluate
         params = optimizer.ask(n_points=n_points)
 
@@ -569,33 +405,10 @@ def _step(self, X, y, search_space, optimizer, groups=None, n_points=1):
 
         # make lists into dictionaries
         params_dict = [point_asdict(search_space, p) for p in params]
 
-        # HACK: self.cv_results_ is reset at every call to _fit, keep current
-        all_cv_results = self.cv_results_
-
-        # HACK: this adds compatibility with different versions of sklearn
-        refit = self.refit
-        self.refit = False
-        self._fit(X, y, groups, params_dict)
-        self.refit = refit
-
-        # merge existing and new cv_results_
-        for k in self.cv_results_:
-            all_cv_results[k].extend(self.cv_results_[k])
-
-        all_cv_results["rank_test_score"] = list(np.asarray(
-            rankdata(-np.array(all_cv_results['mean_test_score']),
-                     method='min'), dtype=np.int32))
-        if self.return_train_score:
-            all_cv_results["rank_train_score"] = list(np.asarray(
-                rankdata(-np.array(all_cv_results['mean_train_score']),
-                         method='min'), dtype=np.int32))
-        self.cv_results_ = all_cv_results
-        self.best_index_ = np.argmax(self.cv_results_['mean_test_score'])
-
-        # feed the point and objective back into optimizer
-        local_results = self.cv_results_['mean_test_score'][-len(params):]
-
-        # optimizer minimizes objective, hence provide negative score
+        all_results = evaluate_candidates(params_dict)
+        # Feed the point and objective value back into optimizer
+        # Optimizer minimizes objective, hence provide negative score
+        local_results = all_results["mean_test_score"][-len(params):]
         return optimizer.tell(params, [-score for score in local_results])
 
     @property
@@ -621,10 +434,8 @@ def total_iterations(self):
 
         return total_iter
 
-    def _run_search(self, x):
-        pass
-
-    def fit(self, X, y=None, groups=None, callback=None):
+    # TODO: Accept callbacks via the constructor?
+    def fit(self, X, y=None, *, groups=None, callback=None, **fit_params):
         """Run fit on the estimator with randomly drawn parameters.
 
         Parameters
@@ -645,18 +456,31 @@ def fit(self, X, y=None, groups=None, callback=None):
             combination tested. If list of callables, then each callable in
             the list is called.
         """
+        self._callbacks = check_callback(callback)
+
+        if self.optimizer_kwargs is None:
+            self.optimizer_kwargs_ = {}
+        else:
+            self.optimizer_kwargs_ = dict(self.optimizer_kwargs)
+
+        super().fit(X=X, y=y, groups=groups, **fit_params)
+
+        # BaseSearchCV never ranked train scores,
+        # but apparently we used to ship this (back-compat)
+        if self.return_train_score:
+            self.cv_results_["rank_train_score"] = \
+                rankdata(-np.array(self.cv_results_["mean_train_score"]),
+                         method='min').astype(int)
+        return self
+
+    def _run_search(self, evaluate_candidates):
         # check if space is a single dict, convert to list if so
         search_spaces = self.search_spaces
         if isinstance(search_spaces, dict):
             search_spaces = [search_spaces]
 
-        callbacks = check_callback(callback)
+        callbacks = self._callbacks
 
-        if self.optimizer_kwargs is None:
-            self.optimizer_kwargs_ = {}
-        else:
-            self.optimizer_kwargs_ = dict(self.optimizer_kwargs)
         random_state = check_random_state(self.random_state)
         self.optimizer_kwargs_['random_state'] = random_state
 
@@ -668,9 +492,6 @@ def fit(self, X, y=None, groups=None, callback=None):
             optimizers.append(self._make_optimizer(search_space))
         self.optimizers_ = optimizers  # will save the states of the optimizers
 
-        self.cv_results_ = defaultdict(list)
-        self.best_index_ = None
-        self.multimetric_ = False
         self._optim_results = []
 
         n_points = self.n_points
@@ -689,17 +510,11 @@ def fit(self, X, y=None, groups=None, callback=None):
                 n_points_adjusted = min(n_iter, n_points)
 
                 optim_result = self._step(
-                    X, y, search_space, optimizer,
-                    groups=groups, n_points=n_points_adjusted
+                    search_space, optimizer,
+                    evaluate_candidates, n_points=n_points_adjusted
                 )
                 n_iter -= n_points
 
                 if eval_callbacks(callbacks, optim_result):
                     break
             self._optim_results.append(optim_result)
-
-        # Refit the best model on the the whole dataset
-        if self.refit:
-            self._fit_best_model(X, y)
-
-        return self