diff --git a/modAL/batch.py b/modAL/batch.py
index 38fa732..25a0a43 100644
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -6,11 +6,12 @@
 import numpy as np
 import scipy.sparse as sp
+from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min
 
 from modAL.utils.data import data_vstack, modALinput, data_shape
 from modAL.models.base import BaseCommittee, BaseLearner
-from modAL.uncertainty import classifier_uncertainty
+from modAL.uncertainty import classifier_margin, classifier_uncertainty
 
 
 def select_cold_start_instance(X: modALinput,
@@ -216,3 +217,86 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
     return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                         n_instances=n_instances, metric=metric, n_jobs=n_jobs)
+
+
+def kmeans_batch(
+    classifier: Union[BaseLearner, BaseCommittee],
+    unlabeled: modALinput,
+    uncertainty_scores: np.ndarray,
+    n_instances: int,
+    filter_param: int,
+) -> np.ndarray:
+    """
+    Query the top `n_instances` records to request labels for, using weighted
+    K-Means to balance informativeness and diversity.
+
+    Refer to Zhdanov's "Diverse mini-batch Active Learning":
+    https://arxiv.org/pdf/1901.05954.pdf
+
+    Args:
+        classifier: One of modAL's supported active learning models.
+        unlabeled: Set of records to be considered for our active learning model.
+        uncertainty_scores: Informativeness score for each record in `unlabeled`,
+            used as the K-Means sample weights (higher means more informative).
+        n_instances: Limit on the number of records to query from our unlabeled set.
+        filter_param: Controls the size of the candidate pool: K-Means is fit only
+            on the top `n_instances * filter_param` most informative examples.
+
+    Returns:
+        Indices into `unlabeled` of the `n_instances` samples chosen for labeling.
+    """
+    # Transform unlabeled data if needed
+    if classifier.on_transformed:
+        unlabeled = classifier.transform_without_estimating(unlabeled)
+
+    # Limit the candidate pool based on n_instances and filter_param
+    record_limit = filter_param * n_instances
+    keep_args = np.argsort(uncertainty_scores)[-record_limit:]
+    uncertainty_scores = uncertainty_scores[keep_args]
+    unlabeled = unlabeled[keep_args]
+
+    # Avoids ValueErrors when we try to sample more instances than we have data points
+    n_clusters = min(n_instances, unlabeled.shape[0])
+
+    # Fit weighted K-Means to the candidate pool
+    kmeans = KMeans(n_clusters=n_clusters)
+    kmeans.fit(unlabeled, sample_weight=uncertainty_scores)
+
+    # Take the point closest to each cluster center; keep_args maps positions in
+    # the filtered pool back to indices into the original unlabeled set
+    return keep_args[np.argmin(kmeans.transform(unlabeled), axis=0)]
+
+
+def diverse_batch_kmeans(classifier: Union[BaseLearner, BaseCommittee],
+                         X: Union[np.ndarray, sp.csr_matrix],
+                         n_instances: int = 20,
+                         filter_param: int = 10,
+                         **uncertainty_measure_kwargs
+                         ) -> np.ndarray:
+    """
+    Batch sampling query strategy that considers both diversity and informativeness.
+
+    This strategy uses weighted K-Means (the weights being an informativeness
+    measure) to determine a batch of samples to label that is both informative
+    and diverse. Margin-based uncertainty has been found to perform best, so that
+    is what we use here.
+
+    Refer to Zhdanov's "Diverse mini-batch Active Learning":
+    https://arxiv.org/pdf/1901.05954.pdf
+
+    Args:
+        classifier: One of modAL's supported active learning models.
+        X: Set of records to be considered for our active learning model.
+        n_instances: Number of records to return for labeling from `X`.
+        filter_param: Controls the size of the candidate pool: K-Means is fit only
+            on the top `n_instances * filter_param` most informative examples.
+        **uncertainty_measure_kwargs: Keyword arguments to be passed to the
+            classifier's :meth:`predict_proba`.
+
+    Returns:
+        Indices of the instances from `X` chosen to be labelled.
+    """
+    # classifier_margin returns the gap between the two most likely classes
+    # (higher = more confident), so invert it to get an informativeness score
+    uncertainty = 1 - classifier_margin(classifier, X, **uncertainty_measure_kwargs)
+    unlabeled_batch = kmeans_batch(
+        classifier,
+        unlabeled=X,
+        uncertainty_scores=uncertainty,
+        n_instances=n_instances,
+        filter_param=filter_param
+    )
+    return unlabeled_batch
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c3f2b60..5af5bdf 100644
--- a/setup.py
+++ b/setup.py
@@ -10,5 +10,5 @@
     url='https://modAL-python.github.io/',
     packages=['modAL', 'modAL.models', 'modAL.utils'],
     classifiers=['Development Status :: 4 - Beta'],
-    install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0'],
+    install_requires=['numpy>=1.13', 'scikit-learn>=0.20', 'scipy>=0.18', 'pandas>=1.1.0'],
 )
diff --git a/tests/core_tests.py b/tests/core_tests.py
index 1ed4f95..217c58d 100644
--- a/tests/core_tests.py
+++ b/tests/core_tests.py
@@ -799,7 +799,8 @@ def test_on_transformed(self):
         n_samples = 10
         n_features = 5
         query_strategies = [
-            modAL.batch.uncertainty_batch_sampling
+            modAL.batch.uncertainty_batch_sampling,
+            modAL.batch.diverse_batch_kmeans,
             # add further strategies which work with instance representations
             # no further ones as of 25.09.2020
         ]
@@ -831,7 +832,8 @@ def test_on_transformed_with_variable_transformation(self):
         properly for on_transformed=True query strategies.
         """
         query_strategies = [
-            modAL.batch.uncertainty_batch_sampling
+            modAL.batch.uncertainty_batch_sampling,
+            modAL.batch.diverse_batch_kmeans,
             # add further strategies which work with instance representations
             # no further ones as of 09.12.2020
         ]
@@ -1152,7 +1154,8 @@ def test_on_transformed(self):
         n_samples = 10
         n_features = 5
         query_strategies = [
-            modAL.batch.uncertainty_batch_sampling
+            modAL.batch.uncertainty_batch_sampling,
+            modAL.batch.diverse_batch_kmeans,
             # add further strategies which work with instance representations
             # no further ones as of 25.09.2020
         ]
@@ -1318,6 +1321,7 @@ def test_examples(self):
         import example_tests.information_density
         import example_tests.bayesian_optimization
         import example_tests.ranked_batch_mode
+        import example_tests.diverse_batch_kmeans
 
 
 if __name__ == '__main__':
diff --git a/tests/example_tests/diverse_batch_kmeans.py b/tests/example_tests/diverse_batch_kmeans.py
new file mode 100644
index 0000000..1fe5606
--- /dev/null
+++ b/tests/example_tests/diverse_batch_kmeans.py
@@ -0,0 +1,79 @@
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.decomposition import PCA
+from sklearn.neighbors import KNeighborsClassifier
+from functools import partial
+
+from modAL.batch import diverse_batch_kmeans
+from modAL.models import ActiveLearner
+
+# Set our RNG for reproducibility.
+RANDOM_STATE_SEED = 123
+np.random.seed(RANDOM_STATE_SEED)
+
+iris = load_iris()
+X_raw = iris['data']
+y_raw = iris['target']
+
+# Define our PCA transformer and fit it onto our raw dataset.
+pca = PCA(n_components=2, random_state=RANDOM_STATE_SEED)
+transformed_iris = pca.fit_transform(X=X_raw)
+
+# Isolate the data we'll need for plotting.
+x_component, y_component = transformed_iris[:, 0], transformed_iris[:, 1]
+
+# Isolate a few examples for our initial labeled dataset.
+n_labeled_examples = X_raw.shape[0]
+training_indices = np.random.randint(low=0, high=n_labeled_examples, size=3)
+
+X_train = X_raw[training_indices]
+y_train = y_raw[training_indices]
+
+# Isolate the non-training examples we'll be querying.
+X_pool = np.delete(X_raw, training_indices, axis=0)
+y_pool = np.delete(y_raw, training_indices, axis=0)
+
+# Pre-set our batch sampling to retrieve 3 samples at a time.
+BATCH_SIZE = 3
+preset_batch = partial(diverse_batch_kmeans, n_instances=BATCH_SIZE)
+
+# Testing the cold start: an unfitted learner falls back to uniform weights.
+learner = ActiveLearner(
+    estimator=KNeighborsClassifier(n_neighbors=3),
+    query_strategy=preset_batch
+)
+cold_start_idx, cold_start_inst = learner.query(X_raw)
+learner.teach(X_raw[cold_start_idx], y_raw[cold_start_idx])
+
+# Specify our active learning model.
+learner = ActiveLearner(
+    estimator=KNeighborsClassifier(n_neighbors=3),
+    X_training=X_train,
+    y_training=y_train,
+    query_strategy=preset_batch
+)
+
+predictions = learner.predict(X_raw)
+
+# Record our learner's score on the raw data.
+unqueried_score = learner.score(X_raw, y_raw)
+
+# Pool-based sampling
+N_RAW_SAMPLES = 20
+N_QUERIES = N_RAW_SAMPLES // BATCH_SIZE
+
+for index in range(N_QUERIES):
+    query_index, query_instance = learner.query(X_pool)
+
+    # Teach our ActiveLearner model the record it has requested.
+    X, y = X_pool[query_index], y_pool[query_index]
+    learner.teach(X=X, y=y)
+
+    # Remove the queried instance from the unlabeled pool.
+    X_pool = np.delete(X_pool, query_index, axis=0)
+    y_pool = np.delete(y_pool, query_index)
+
+    # Calculate and report our model's accuracy.
+    model_accuracy = learner.score(X_raw, y_raw)
+
+predictions = learner.predict(X_raw)
\ No newline at end of file
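
For reviewers who want to sanity-check the strategy outside the test suite, here is a minimal standalone sketch (not part of the patch) of the filter/cluster/map-back pipeline that kmeans_batch implements, using only NumPy and scikit-learn. The random `informativeness` array and all variable names are stand-ins chosen for this sketch; in the real strategy the scores come from `1 - classifier_margin(...)`.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

rng = np.random.default_rng(0)
X = load_iris()['data']
# Stand-in informativeness scores (higher = more informative).
informativeness = rng.random(X.shape[0])

n_instances, filter_param = 3, 10

# Step 1: keep the n_instances * filter_param most informative rows.
record_limit = n_instances * filter_param
keep_args = np.argsort(informativeness)[-record_limit:]
X_filtered, weights = X[keep_args], informativeness[keep_args]

# Step 2: weighted K-Means, one cluster per requested instance.
kmeans = KMeans(n_clusters=min(n_instances, X_filtered.shape[0]), random_state=0)
kmeans.fit(X_filtered, sample_weight=weights)

# Step 3: take the row nearest each center, then map positions in the
# filtered pool back to indices into the original X -- without the
# keep_args lookup the caller would label the wrong rows.
batch = keep_args[np.argmin(kmeans.transform(X_filtered), axis=0)]
assert batch.shape == (n_instances,)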