Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Diverse Mini-batch Active Learning #134

Open
wants to merge 6 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 85 additions & 1 deletion modAL/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@

import numpy as np
import scipy.sparse as sp
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min

from modAL.utils.data import data_vstack, modALinput, data_shape
from modAL.models.base import BaseCommittee, BaseLearner
from modAL.uncertainty import classifier_uncertainty
from modAL.uncertainty import classifier_margin, classifier_uncertainty


def select_cold_start_instance(X: modALinput,
Expand Down Expand Up @@ -216,3 +217,86 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
n_instances=n_instances, metric=metric, n_jobs=n_jobs)


def kmeans_batch(
    classifier: Union[BaseLearner, BaseCommittee],
    unlabeled: modALinput,
    uncertainty_scores: np.ndarray,
    n_instances: int,
    filter_param: int,
) -> np.ndarray:
    """
    Pick a diverse batch of informative samples using weighted K-Means.

    The pool is first reduced to the `n_instances * filter_param` records with
    the highest score, K-Means (weighted by the scores) is fitted on that
    subset, and the record closest to each cluster center is returned.

    Refer to Zhdanov's "Diverse mini-batch Active Learning":
    https://arxiv.org/pdf/1901.05954.pdf

    Args:
        classifier: One of modAL's supported active learning models.
        unlabeled: Set of records to be considered for our active learning model.
        uncertainty_scores: One score per record in `unlabeled`; records with
            higher scores are kept by the filter and weigh more in K-Means.
        n_instances: Limit on the number of records to query from our unlabeled set.
        filter_param: Controls number of examples to use for sampling. Limits K-Means dataset to top
            `n_instances * filter_param` most informative examples.

    Returns:
        Indices into the original `unlabeled` pool, one per cluster center.
        The same index may appear more than once if a single record is the
        closest one to several centers.
    """

    # transform unlabeled data if needed
    if classifier.on_transformed:
        unlabeled = classifier.transform_without_estimating(unlabeled)

    uncertainty_scores = np.asarray(uncertainty_scores)
    n_samples = uncertainty_scores.shape[0]

    # Limit data set based on n_instances and filter_param.
    # Only the membership of the top `record_limit` scores matters, not their
    # order, so an O(n) partition is used instead of an O(n log n) full sort
    # (as requested in review of this change).
    record_limit = filter_param * n_instances
    if 0 < record_limit < n_samples:
        keep_args = np.argpartition(uncertainty_scores, -record_limit)[-record_limit:]
    else:
        # Nothing to filter out: keep every record.
        keep_args = np.arange(n_samples)

    kept_scores = uncertainty_scores[keep_args]
    kept_records = unlabeled[keep_args]

    # Avoids ValueErrors when we try to sample more instances than we have data points
    n_clusters = min(n_instances, kept_records.shape[0])

    # Fit kmeans to data
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(kept_records, sample_weight=kept_scores)

    # Closest point to each cluster center, as a position inside the filtered subset.
    closest_in_subset = np.argmin(kmeans.transform(kept_records), axis=0)

    # Translate subset positions back to indices of the caller's pool; returning
    # the subset positions directly would point at the wrong records.
    return keep_args[closest_in_subset]


def diverse_batch_kmeans(classifier: Union[BaseLearner, BaseCommittee],
                         X: Union[np.ndarray, sp.csr_matrix],
                         n_instances: int = 20,
                         filter_param: int = 10,
                         **uncertainty_measure_kwargs
                         ) -> np.ndarray:
    """
    Batch sampling query strategy that tries to consider both diversity and informativeness.

    This strategy uses weighted K-Means (the weights being some uncertainty measure) to determine
    a batch of samples to label that are both informative and diverse. Margin-based uncertainty
    has been found to perform best, so that is what we use here.

    Refer to Zhdanov's "Diverse mini-batch Active Learning":
    https://arxiv.org/pdf/1901.05954.pdf

    Args:
        classifier: One of modAL's supported active learning models.
        X: Set of records to be considered for our active learning model.
        n_instances: Number of records to return for labeling from `X`.
        filter_param: Controls number of examples to use for sampling. Limits K-Means dataset to top
            `n_instances * filter_param` most informative examples
        **uncertainty_measure_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.

    Returns:
        Indices of the instances from `X` chosen to be labelled
    """
    margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs)

    # A small margin between the two most likely classes means the classifier
    # is unsure, so informativeness is the complement of the margin. Passing
    # the raw margin would make the filter and the K-Means weights favour the
    # samples the classifier is most confident about.
    informativeness = 1.0 - np.asarray(margin, dtype=float)

    # K-Means cannot work with weights that are all zero; fall back to
    # uniform weights so the batch is chosen on diversity alone.
    if not np.any(informativeness > 0):
        informativeness = np.ones_like(informativeness)

    # NOTE(review): a reviewer suggested accepting the uncertainty measure as
    # a callable parameter defaulting to classifier_margin; only margin-based
    # informativeness is supported here.
    return kmeans_batch(
        classifier,
        unlabeled=X,
        uncertainty_scores=informativeness,
        n_instances=n_instances,
        filter_param=filter_param
    )
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@
url='https://modAL-python.github.io/',
packages=['modAL', 'modAL.models', 'modAL.utils'],
classifiers=['Development Status :: 4 - Beta'],
install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0'],
install_requires=['numpy>=1.13', 'scikit-learn>=0.20', 'scipy>=0.18', 'pandas>=1.1.0'],
)
10 changes: 7 additions & 3 deletions tests/core_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,7 +799,8 @@ def test_on_transformed(self):
n_samples = 10
n_features = 5
query_strategies = [
modAL.batch.uncertainty_batch_sampling
modAL.batch.uncertainty_batch_sampling,
modAL.batch.diverse_batch_kmeans,
# add further strategies which work with instance representations
# no further ones as of 25.09.2020
]
Expand Down Expand Up @@ -831,7 +832,8 @@ def test_on_transformed_with_variable_transformation(self):
properly for on_transformed=True query strategies.
"""
query_strategies = [
modAL.batch.uncertainty_batch_sampling
modAL.batch.uncertainty_batch_sampling,
modAL.batch.diverse_batch_kmeans,
# add further strategies which work with instance representations
# no further ones as of 09.12.2020
]
Expand Down Expand Up @@ -1152,7 +1154,8 @@ def test_on_transformed(self):
n_samples = 10
n_features = 5
query_strategies = [
modAL.batch.uncertainty_batch_sampling
modAL.batch.uncertainty_batch_sampling,
modAL.batch.diverse_batch_kmeans,
# add further strategies which work with instance representations
# no further ones as of 25.09.2020
]
Expand Down Expand Up @@ -1318,6 +1321,7 @@ def test_examples(self):
import example_tests.information_density
import example_tests.bayesian_optimization
import example_tests.ranked_batch_mode
import example_tests.diverse_batch_kmeans


if __name__ == '__main__':
Expand Down
79 changes: 79 additions & 0 deletions tests/example_tests/diverse_batch_kmeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from functools import partial

from modAL.batch import diverse_batch_kmeans
from modAL.models import ActiveLearner

# Example / smoke test for the diverse_batch_kmeans query strategy on the
# iris dataset: a cold-start query, then a short pool-based sampling loop.

# Set our RNG for reproducibility.
RANDOM_STATE_SEED = 123
np.random.seed(RANDOM_STATE_SEED)

iris = load_iris()
X_raw = iris['data']
y_raw = iris['target']

# Define our PCA transformer and fit it onto our raw dataset.
pca = PCA(n_components=2, random_state=RANDOM_STATE_SEED)
transformed_iris = pca.fit_transform(X=X_raw)

# Isolate the data we'll need for plotting.
x_component, y_component = transformed_iris[:, 0], transformed_iris[:, 1]

# Isolate our examples for our labeled dataset.
# `high` is exclusive in np.random.randint, so passing the number of rows
# keeps every drawn index inside X_raw (n + 1 could yield an out-of-range index).
n_labeled_examples = X_raw.shape[0]
training_indices = np.random.randint(low=0, high=n_labeled_examples, size=3)

X_train = X_raw[training_indices]
y_train = y_raw[training_indices]

# Isolate the non-training examples we'll be querying.
X_pool = np.delete(X_raw, training_indices, axis=0)
y_pool = np.delete(y_raw, training_indices, axis=0)

# Pre-set our batch sampling to retrieve 3 samples at a time.
BATCH_SIZE = 3
preset_batch = partial(diverse_batch_kmeans, n_instances=BATCH_SIZE)

# Testing the cold-start: query with a learner that has no training data yet.
learner = ActiveLearner(
    estimator=KNeighborsClassifier(n_neighbors=3),
    query_strategy=preset_batch
)
cold_start_idx, cold_start_inst = learner.query(X_raw)
learner.teach(X_raw[cold_start_idx], y_raw[cold_start_idx])

# Specify our active learning model.
learner = ActiveLearner(
    estimator=KNeighborsClassifier(n_neighbors=3),
    X_training=X_train,
    y_training=y_train,
    query_strategy=preset_batch
)

predictions = learner.predict(X_raw)

# Record our learner's score on the raw data.
unqueried_score = learner.score(X_raw, y_raw)

# Pool-based sampling
N_RAW_SAMPLES = 20
N_QUERIES = N_RAW_SAMPLES // BATCH_SIZE

for index in range(N_QUERIES):
    query_index, query_instance = learner.query(X_pool)

    # Teach our ActiveLearner model the record it has requested.
    X, y = X_pool[query_index], y_pool[query_index]
    learner.teach(X=X, y=y)

    # Remove the queried instance from the unlabeled pool.
    X_pool = np.delete(X_pool, query_index, axis=0)
    y_pool = np.delete(y_pool, query_index)

    # Calculate and report our model's accuracy.
    model_accuracy = learner.score(X_raw, y_raw)

predictions = learner.predict(X_raw)