Add new centroid generation techniques (#417)

HenryChen4 · btjanaka · web-flow · commit 1e434ae2ae41 · 2023-11-17T14:09:01.000-08:00
## Description Added new centroid generation techniques in ribs/archives/_cvt_archive.py and benchmarked them in benchmarks/centroid_quality.py. These techniques were studied in Mouret 2023: https://dl.acm.org/doi/10.1145/3583133.3590726 Notably, this PR bumps scipy to 1.7.0 since that is when scipy.stats.qmc was first introduced, but this should not be an issue for most users since scipy 1.7.0 supports Python 3.7+. ## TODO  ## Questions  ## Status - [ ] I have read the guidelines in [CONTRIBUTING.md](https://github.com/icaros-usc/pyribs/blob/master/CONTRIBUTING.md) - [ ] I have formatted my code using `yapf` - [ ] I have tested my code by running `pytest` - [ ] I have linted my code with `pylint` - [ ] I have added a one-line description of my change to the changelog in `HISTORY.md` - [ ] This PR is ready to go --------- Co-authored-by: Bryon Tjanaka <bryon.tjanaka@gmail.com>
diff --git a/benchmarks/centroid_quality.py b/benchmarks/centroid_quality.py
@@ -62,30 +62,31 @@ def main():
     techniques used in the aforementioned paper.
     """
 
-    score_seed = 1
-    num_samples = 10000
-    archive = CVTArchive(
-        solution_dim=20,
-        cells=512,
-        ranges=[(0., 1.), (0., 1.)],
-    )
-    cvt_centroids = archive.centroids
-    print(
-        "Score for CVT generation: ",
-        get_score(centroids=cvt_centroids,
-                  num_samples=num_samples,
-                  seed=score_seed))
-
-    centroid_gen_seed = 100
-    num_centroids = 1024
-    dim = 2
-    rng = np.random.default_rng(seed=centroid_gen_seed)
-    random_centroids = rng.random((num_centroids, dim))
-    print(
-        "Score for random generation: ",
-        get_score(centroids=random_centroids,
-                  num_samples=num_samples,
-                  seed=score_seed))
+    # Default settings to benchmark different centroid generation techniques.
+    score_seed = 1823170571
+    num_samples = 100000
+
+    # Settings for creating the CVTArchive.
+    solution_dim = 20
+    cells = 512
+    ranges = [(0., 1.), (0., 1.)]
+
+    # Different methods for generating centroids.
+    generation_methods = [
+        "kmeans", "random", "sobol", "scrambled_sobol", "halton"
+    ]
+
+    # Benchmark each centroid generation technique.
+    for method in generation_methods:
+        archive = CVTArchive(solution_dim=solution_dim,
+                             cells=cells,
+                             ranges=ranges,
+                             centroid_method=method)
+        print(
+            f"Score for {method} generation: ",
+            get_score(centroids=archive.centroids,
+                      num_samples=num_samples,
+                      seed=score_seed))
 
 
 if __name__ == "__main__":
diff --git a/pinned_reqs/install.txt b/pinned_reqs/install.txt
@@ -3,5 +3,5 @@ numba==0.51.0
 pandas==1.0.0
 sortedcontainers==2.0.0
 scikit-learn==1.1.0
-scipy==1.4.0
+scipy==1.7.0
 threadpoolctl==3.0.0
diff --git a/ribs/archives/_cvt_archive.py b/ribs/archives/_cvt_archive.py
@@ -1,6 +1,9 @@
 """Contains the CVTArchive class."""
+import numbers
+
 import numpy as np
 from scipy.spatial import cKDTree  # pylint: disable=no-name-in-module
+from scipy.stats.qmc import Halton, Sobol
 from sklearn.cluster import k_means
 
 from ribs._utils import check_batch_shape, check_finite
@@ -90,17 +93,22 @@ class CVTArchive(ArchiveBase):
             and a "bar" field that contains 10D values. Note that field names
             must be valid Python identifiers, and names already used in the
             archive are not allowed.
+        custom_centroids (array-like): If passed in, this (cells, measure_dim)
+            array will be used as the centroids of the CVT instead of generating
+            new ones. In this case, ``samples`` will be ignored, and
+            ``archive.samples`` will be None. This can be useful when one wishes
+            to use the same CVT across experiments for fair comparison.
+        centroid_method (str): Method for generating centroids. Must be one
+            of "kmeans" (default), "random", "sobol", "scrambled_sobol", or
+            "halton". These methods are derived from Mouret 2023:
+            https://dl.acm.org/doi/pdf/10.1145/3583133.3590726.
+            Note: ``samples`` is only used when the method is "kmeans".
         samples (int or array-like): If it is an int, this specifies the number
             of samples to generate when creating the CVT. Otherwise, this must
             be a (num_samples, measure_dim) array where samples[i] is a sample
             to use when creating the CVT. It can be useful to pass in custom
             samples when there are restrictions on what samples in the measure
             space are (physically) possible.
-        custom_centroids (array-like): If passed in, this (cells, measure_dim)
-            array will be used as the centroids of the CVT instead of generating
-            new ones. In this case, ``samples`` will be ignored, and
-            ``archive.samples`` will be None. This can be useful when one wishes
-            to use the same CVT across experiments for fair comparison.
         k_means_kwargs (dict): kwargs for :func:`~sklearn.cluster.k_means`. By
             default, we pass in `n_init=1`, `init="random"`,
             `algorithm="lloyd"`, and `random_state=seed`.
@@ -128,12 +136,13 @@ def __init__(self,
                  seed=None,
                  dtype=np.float64,
                  extra_fields=None,
-                 samples=100_000,
                  custom_centroids=None,
-                 chunk_size=None,
+                 centroid_method="kmeans",
+                 samples=100_000,
                  k_means_kwargs=None,
                  use_kd_tree=True,
-                 ckdtree_kwargs=None):
+                 ckdtree_kwargs=None,
+                 chunk_size=None):
 
         ArchiveBase.__init__(
             self,
@@ -167,23 +176,55 @@ def __init__(self,
         self._k_means_kwargs.setdefault("algorithm", "lloyd")
         self._k_means_kwargs.setdefault("random_state", seed)
 
-        self._use_kd_tree = use_kd_tree
-        self._centroid_kd_tree = None
-        self._ckdtree_kwargs = ({} if ckdtree_kwargs is None else
-                                ckdtree_kwargs.copy())
-        self._chunk_size = chunk_size
-
         if custom_centroids is None:
-            if not isinstance(samples, int):
-                # Validate shape of custom samples. These are ignored when
-                # `custom_centroids` is provided.
-                samples = np.asarray(samples, dtype=self.dtype)
-                if samples.shape[1] != self._measure_dim:
-                    raise ValueError(
-                        f"Samples has shape {samples.shape} but must be of "
-                        f"shape (n_samples, len(ranges)={self._measure_dim})")
-            self._samples = samples
-            self._centroids = None
+            self._samples = None
+            if centroid_method == "kmeans":
+                if not isinstance(samples, numbers.Integral):
+                    # Validate shape of custom samples.
+                    samples = np.asarray(samples, dtype=self.dtype)
+                    if samples.shape[1] != self._measure_dim:
+                        raise ValueError(
+                            f"Samples has shape {samples.shape} but must be of "
+                            f"shape (n_samples, len(ranges)="
+                            f"{self._measure_dim})")
+                    self._samples = samples
+                else:
+                    self._samples = self._rng.uniform(
+                        self._lower_bounds,
+                        self._upper_bounds,
+                        size=(samples, self._measure_dim),
+                    ).astype(self.dtype)
+
+                self._centroids = k_means(self._samples, self._cells,
+                                          **self._k_means_kwargs)[0]
+
+                if self._centroids.shape[0] < self._cells:
+                    raise RuntimeError(
+                        "While generating the CVT, k-means clustering found "
+                        f"{self._centroids.shape[0]} centroids, but this "
+                        f"archive needs {self._cells} cells. This most "
+                        "likely happened because there are too few samples "
+                        "and/or too many cells.")
+            elif centroid_method == "random":
+                # Generate random centroids for the archive.
+                self._centroids = self._rng.uniform(self._lower_bounds,
+                                                    self._upper_bounds,
+                                                    size=(self._cells,
+                                                          self._measure_dim))
+            elif centroid_method == "sobol":
+                # Generate self._cells number of centroids as a Sobol sequence.
+                sampler = Sobol(d=self._measure_dim, scramble=False)
+                num_points = np.log2(self._cells).astype(int)
+                self._centroids = sampler.random_base2(num_points)
+            elif centroid_method == "scrambled_sobol":
+                # Generates centroids as a scrambled Sobol sequence.
+                sampler = Sobol(d=self._measure_dim, scramble=True)
+                num_points = np.log2(self._cells).astype(int)
+                self._centroids = sampler.random_base2(num_points)
+            elif centroid_method == "halton":
+                # Generates centroids using a Halton sequence.
+                sampler = Halton(d=self._measure_dim)
+                self._centroids = sampler.random(n=self._cells)
         else:
             # Validate shape of `custom_centroids` when they are provided.
             custom_centroids = np.asarray(custom_centroids, dtype=self.dtype)
@@ -195,24 +236,11 @@ def __init__(self,
             self._centroids = custom_centroids
             self._samples = None
 
-        if self._centroids is None:
-            self._samples = self._rng.uniform(
-                self._lower_bounds,
-                self._upper_bounds,
-                size=(self._samples, self._measure_dim),
-            ).astype(self.dtype) if isinstance(self._samples,
-                                               int) else self._samples
-
-            self._centroids = k_means(self._samples, self._cells,
-                                      **self._k_means_kwargs)[0]
-
-            if self._centroids.shape[0] < self._cells:
-                raise RuntimeError(
-                    "While generating the CVT, k-means clustering found "
-                    f"{self._centroids.shape[0]} centroids, but this archive "
-                    f"needs {self._cells} cells. This most likely happened "
-                    "because there are too few samples and/or too many cells.")
-
+        self._use_kd_tree = use_kd_tree
+        self._centroid_kd_tree = None
+        self._ckdtree_kwargs = ({} if ckdtree_kwargs is None else
+                                ckdtree_kwargs.copy())
+        self._chunk_size = chunk_size
         if self._use_kd_tree:
             self._centroid_kd_tree = cKDTree(self._centroids,
                                              **self._ckdtree_kwargs)
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
     "pandas>=1.0.0",
     "sortedcontainers>=2.0.0",  # Primarily used in SlidingBoundariesArchive.
     "scikit-learn>=1.1.0",  # Primarily used in CVTArchive.
-    "scipy>=1.4.0",  # Primarily used in CVTArchive.
+    "scipy>=1.7.0",  # Primarily used in CVTArchive.
     "threadpoolctl>=3.0.0",
 ]
 

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@`
`19`	`19`	`"pandas>=1.0.0",`
`20`	`20`	`"sortedcontainers>=2.0.0", # Primarily used in SlidingBoundariesArchive.`
`21`	`21`	`"scikit-learn>=1.1.0", # Primarily used in CVTArchive.`
`22`		`- "scipy>=1.4.0", # Primarily used in CVTArchive.`
	`22`	`+ "scipy>=1.7.0", # Primarily used in CVTArchive.`
`23`	`23`	`"threadpoolctl>=3.0.0",`
`24`	`24`	`]`
`25`	`25`