Adding stratified bootstrapping #179

Open · wants to merge 1 commit into master
50 changes: 39 additions & 11 deletions modAL/models/base.py
@@ -99,14 +99,15 @@ def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.cs
# concatenate all transformations and return
return data_hstack(Xt)

def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner':
def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, stratify: bool = False, **fit_kwargs) -> 'BaseLearner':
"""
Fits self.estimator to the given data and labels.

Args:
X: The new samples for which the labels are supplied by the expert.
y: Labels corresponding to the new instances in X.
bootstrap: If True, the method trains the model on a set bootstrapped from X.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if the bootstrap parameter is True.
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.

Returns:
@@ -116,9 +117,31 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f
if not bootstrap:
self.estimator.fit(X, y, **fit_kwargs)
else:
bootstrap_idx = np.random.choice(
range(X.shape[0]), X.shape[0], replace=True)
self.estimator.fit(X[bootstrap_idx], y[bootstrap_idx])
if not stratify:
bootstrap_idx = np.random.choice(
range(X.shape[0]), X.shape[0], replace=True)
self.estimator.fit(X[bootstrap_idx], y[bootstrap_idx], **fit_kwargs)
else:
classes, y_indices = np.unique(y, return_inverse=True)
n_classes = classes.shape[0]

class_counts = np.bincount(y_indices)

# Find the sorted list of instances for each class:
# (np.unique above performs a sort, so the code is O(n log n) already)
class_indices = np.split(
np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
)

indices = []

for i in range(n_classes):
indices_i = np.random.choice(class_indices[i], class_counts[i], replace=True)
indices.extend(indices_i)

indices = np.random.permutation(indices)

self.estimator.fit(X[indices], y[indices], **fit_kwargs)

return self
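To make the stratified branch concrete, here is a minimal standalone sketch of the same index construction on invented toy data (only numpy is assumed; the array y below is illustrative, not from the patch). Each class is resampled with replacement at its original count, so the bootstrap sample preserves the label distribution exactly, whereas a plain bootstrap can over- or under-represent rare classes:

    import numpy as np

    # Toy labels (illustrative): three classes with counts 3, 2 and 1
    y = np.array(["a", "b", "a", "a", "b", "c"])

    classes, y_indices = np.unique(y, return_inverse=True)
    class_counts = np.bincount(y_indices)  # -> [3 2 1]

    # Sorted instance indices per class, exactly as in the patch above
    class_indices = np.split(
        np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
    )

    indices = []
    for i in range(classes.shape[0]):
        # Resample each class with replacement, keeping its original count
        indices.extend(np.random.choice(class_indices[i], class_counts[i], replace=True))
    indices = np.random.permutation(indices)

    # The stratified bootstrap preserves the class distribution exactly
    assert np.array_equal(np.bincount(y_indices[indices]), class_counts)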

@@ -245,28 +268,30 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None:
for learner in self.learner_list:
learner._add_training_data(X, y)

def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> None:
def _fit_to_known(self, bootstrap: bool = False, stratify: bool = False, **fit_kwargs) -> None:
"""
Fits all learners to the training data and labels provided to it so far.
Args:
bootstrap: If True, each estimator is trained on a bootstrapped dataset. Useful when
using bagging to build the ensemble.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if the bootstrap parameter is True.
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.
"""
for learner in self.learner_list:
learner._fit_to_known(bootstrap=bootstrap, **fit_kwargs)
learner._fit_to_known(bootstrap=bootstrap, stratify=stratify, **fit_kwargs)

def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> None:
def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, stratify: bool = False, **fit_kwargs) -> None:
"""
Fits all learners to the given data and labels.
Args:
X: The new samples for which the labels are supplied by the expert.
y: Labels corresponding to the new instances in X.
bootstrap: If True, the method trains the model on a set bootstrapped from X.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if the bootstrap parameter is True.
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.
"""
for learner in self.learner_list:
learner._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs)
learner._fit_on_new(X, y, bootstrap=bootstrap, stratify=stratify, **fit_kwargs)

def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee':
"""
@@ -334,28 +359,31 @@ def rebag(self, **fit_kwargs) -> None:
"""
Refits every learner with a dataset bootstrapped from its training instances. Contrary to .bag(), it bootstraps
the training data for each learner based on its own examples.
Args:
stratify: If True, samples are bootstrapped in a stratified fashion.
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.
Todo:
Where is .bag()?
"""
self._fit_to_known(bootstrap=True, **fit_kwargs)

def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None:
def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, stratify: bool = False, only_new: bool = False, **fit_kwargs) -> None:
"""
Adds X and y to the known training data for each learner and retrains learners with the augmented dataset.
Args:
X: The new samples for which the labels are supplied by the expert.
y: Labels corresponding to the new instances in X.
bootstrap: If True, trains each learner on a bootstrapped set. Useful when building the ensemble by bagging.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if the bootstrap parameter is True.
only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples.
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.
"""
self._add_training_data(X, y)
if not only_new:
self._fit_to_known(bootstrap=bootstrap, **fit_kwargs)
self._fit_to_known(bootstrap=bootstrap, stratify=stratify, **fit_kwargs)
else:
self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs)
self._fit_on_new(X, y, bootstrap=bootstrap, stratify=stratify, **fit_kwargs)

@abc.abstractmethod
def predict(self, X: modALinput) -> Any:
56 changes: 42 additions & 14 deletions modAL/models/learners.py
@@ -29,6 +29,7 @@ class ActiveLearner(BaseLearner):
y_training: Initial training labels corresponding to initial training samples.
bootstrap_init: If initial training data is available, bootstrapping can be done during the first training.
Useful when building Committee models with bagging.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if bootstrap_init is True.
on_transformed: Whether to transform samples with the pipeline defined by the estimator
when applying the query strategy.
**fit_kwargs: keyword arguments.
@@ -74,6 +75,7 @@ def __init__(self,
X_training: Optional[modALinput] = None,
y_training: Optional[modALinput] = None,
bootstrap_init: bool = False,
stratify: bool = False,
on_transformed: bool = False,
**fit_kwargs
) -> None:
@@ -83,7 +85,7 @@ def __init__(self,
self.y_training = y_training

if X_training is not None:
self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)
self._fit_to_known(bootstrap=bootstrap_init, stratify=stratify, **fit_kwargs)
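A hedged usage sketch of the extended constructor (the estimator choice and the toy data are illustrative, not part of the patch):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from modAL.models import ActiveLearner

    X_initial = np.random.rand(100, 4)             # hypothetical features
    y_initial = np.random.randint(0, 2, size=100)  # hypothetical binary labels

    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_initial,
        y_training=y_initial,
        bootstrap_init=True,  # fit on a resampled training set...
        stratify=True,        # ...drawn per class, as added in this PR
    )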

def _add_training_data(self, X: modALinput, y: modALinput) -> None:
"""
@@ -111,12 +113,13 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None:
raise ValueError('the dimensions of the new training data and label must '
'agree with the training data and labels provided so far')

def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner':
def _fit_to_known(self, bootstrap: bool = False, stratify: bool = False, **fit_kwargs) -> 'BaseLearner':
"""
Fits self.estimator to the training data and labels provided to it so far.

Args:
bootstrap: If True, the method trains the model on a set bootstrapped from the known training instances.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if the bootstrap parameter is True.
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.

Returns:
@@ -125,15 +128,37 @@ def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner':
if not bootstrap:
self.estimator.fit(self.X_training, self.y_training, **fit_kwargs)
else:
n_instances = self.X_training.shape[0]
bootstrap_idx = np.random.choice(
range(n_instances), n_instances, replace=True)
self.estimator.fit(
self.X_training[bootstrap_idx], self.y_training[bootstrap_idx], **fit_kwargs)
if not stratify:
n_instances = self.X_training.shape[0]
bootstrap_idx = np.random.choice(
range(n_instances), n_instances, replace=True)
self.estimator.fit(
self.X_training[bootstrap_idx], self.y_training[bootstrap_idx], **fit_kwargs)
else:
classes, y_indices = np.unique(self.y_training, return_inverse=True)
n_classes = classes.shape[0]

class_counts = np.bincount(y_indices)

# Find the sorted list of instances for each class:
# (np.unique above performs a sort, so the code is O(n log n) already)
class_indices = np.split(
np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
)

indices = []

for i in range(n_classes):
indices_i = np.random.choice(class_indices[i], class_counts[i], replace=True)
indices.extend(indices_i)

indices = np.random.permutation(indices)

self.estimator.fit(self.X_training[indices], self.y_training[indices], **fit_kwargs)

return self

def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner':
def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, stratify: bool = False, **fit_kwargs) -> 'BaseLearner':
"""
Interface for the fit method of the predictor. Fits the predictor to the supplied data, then stores it
internally for the active learning loop.
@@ -143,6 +168,7 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg
y: The corresponding labels.
bootstrap: If True, trains the estimator on a set bootstrapped from X.
Useful for building Committee models with bagging.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if the bootstrap parameter is True.
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.

Note:
@@ -155,9 +181,9 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
force_all_finite=self.force_all_finite)
self.X_training, self.y_training = X, y
return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs)
return self._fit_to_known(bootstrap=bootstrap, stratify=stratify, **fit_kwargs)
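Continuing the sketch above, retraining from scratch with a stratified bootstrap would look as follows; note that fit() replaces the stored training data rather than augmenting it:

    # Replace the stored training set and refit on a stratified bootstrap of it
    learner.fit(X_initial, y_initial, bootstrap=True, stratify=True)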

def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None:
def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, stratify: bool = False, only_new: bool = False, **fit_kwargs) -> None:
"""
Adds X and y to the known training data and retrains the predictor with the augmented dataset.

@@ -166,18 +192,19 @@ def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new:
y: Labels corresponding to the new instances in X.
bootstrap: If True, training is done on a bootstrapped dataset. Useful for building Committee models
with bagging.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if the bootstrap parameter is True.
only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples.
Useful when working with models where the .fit() method doesn't retrain the model from scratch (e.g. in
TensorFlow or Keras).
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.
"""
if not only_new:
self._add_training_data(X, y)
self._fit_to_known(bootstrap=bootstrap, **fit_kwargs)
self._fit_to_known(bootstrap=bootstrap, stratify=stratify, **fit_kwargs)
else:
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
force_all_finite=self.force_all_finite)
self._fit_on_new(X, y, bootstrap=bootstrap, **fit_kwargs)
self._fit_on_new(X, y, bootstrap=bootstrap, stratify=stratify, **fit_kwargs)
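Continuing the sketch, a single active learning step with the new flag; X_pool and y_pool are hypothetical pool data invented for the example:

    X_pool = np.random.rand(500, 4)             # hypothetical unlabelled pool
    y_pool = np.random.randint(0, 2, size=500)  # oracle labels for the demo

    # Query the most informative instance, label it, then retrain on a
    # stratified bootstrap of the augmented training set
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx], y_pool[query_idx], bootstrap=True, stratify=True)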


class DeepActiveLearner(BaseLearner):
@@ -524,17 +551,18 @@ def fit(self, X: modALinput, y: modALinput, **fit_kwargs) -> 'BaseCommittee':
super().fit(X, y, **fit_kwargs)
self._set_classes()

def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None:
def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, stratify: bool = False, only_new: bool = False, **fit_kwargs) -> None:
"""
Adds X and y to the known training data for each learner and retrains learners with the augmented dataset.
Args:
X: The new samples for which the labels are supplied by the expert.
y: Labels corresponding to the new instances in X.
bootstrap: If True, trains each learner on a bootstrapped set. Useful when building the ensemble by bagging.
stratify: If True, samples are bootstrapped in a stratified fashion. Only takes effect if the bootstrap parameter is True.
only_new: If True, the model is retrained using only X and y, ignoring the previously provided examples.
**fit_kwargs: Keyword arguments to be passed to the fit method of the predictor.
"""
super().teach(X, y, bootstrap=bootstrap, only_new=only_new, **fit_kwargs)
super().teach(X, y, bootstrap=bootstrap, stratify=stratify, only_new=only_new, **fit_kwargs)
self._set_classes()
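Finally, a hedged sketch of the bagging use case this override serves: each Committee member is built and retrained on its own stratified bootstrap sample (data names continue the earlier sketches and are again illustrative):

    from sklearn.ensemble import RandomForestClassifier
    from modAL.models import ActiveLearner, Committee

    committee = Committee(learner_list=[
        ActiveLearner(estimator=RandomForestClassifier(),
                      X_training=X_initial, y_training=y_initial,
                      bootstrap_init=True, stratify=True)
        for _ in range(3)
    ])

    # Every member retrains on its own stratified bootstrap of the new data
    committee.teach(X_pool[query_idx], y_pool[query_idx], bootstrap=True, stratify=True)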

def predict(self, X: modALinput, **predict_proba_kwargs) -> Any: