Implemented Gaussian and Sparse Random projections; dependencies of #48

lensacom · Jul 3, 2015 · bb9576a · bb9576a
1 parent 84459ca
commit bb9576a
Show file tree

Hide file tree

Showing 3 changed files with 224 additions and 4 deletions.
diff --git a/splearn/feature_selection/tests/test_variance_threshold.py b/splearn/feature_selection/tests/test_variance_threshold.py
@@ -1,10 +1,10 @@
 import numpy as np
 import scipy.sparse as sp
-from numpy.testing import assert_array_almost_equal
 from sklearn.feature_selection import VarianceThreshold
 from splearn.feature_selection import SparkVarianceThreshold
 from splearn.rdd import DictRDD
-from splearn.utils.testing import SplearnTestCase, assert_true
+from splearn.utils.testing import (SplearnTestCase, assert_array_almost_equal,
+                                   assert_true)
 from splearn.utils.validation import check_rdd_dtype
 
 
@@ -21,8 +21,8 @@ def test_same_variances(self):
                   ((1e4, 100), 600)]
 
         for shape, block_size in shapes:
-            X_dense, X_dense_rdd = self.make_dense_rdd()
-            X_sparse, X_sparse_rdd = self.make_sparse_rdd()
+            X_dense, X_dense_rdd = self.make_dense_rdd(shape, block_size)
+            X_sparse, X_sparse_rdd = self.make_sparse_rdd(shape, block_size)
             Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))
 
             local.fit(X_dense)

diff --git a/splearn/random_projection.py b/splearn/random_projection.py
@@ -0,0 +1,111 @@
+import warnings
+
+import numpy as np
+import scipy.sparse as sp
+from numpy.testing import assert_equal
+from sklearn.random_projection import (BaseRandomProjection,
+                                       GaussianRandomProjection,
+                                       SparseRandomProjection,
+                                       johnson_lindenstrauss_min_dim)
+from sklearn.utils import DataDimensionalityWarning
+
+from .base import SparkBroadcasterMixin
+from .rdd import DictRDD
+from .utils.validation import check_rdd
+
+
+class SparkBaseRandomProjection(BaseRandomProjection, SparkBroadcasterMixin):
+
+    __transient__ = ['components_']
+
+    def fit(self, Z):
+        """Generate a sparse random projection matrix
+        Parameters
+        ----------
+        X : numpy array or scipy.sparse of shape [n_samples, n_features]
+            Training set: only the shape is used to find optimal random
+            matrix dimensions based on the theory referenced in the
+            afore mentioned papers.
+        y : is not used: placeholder to allow for usage in a Pipeline.
+        Returns
+        -------
+        self
+        """
+        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
+        check_rdd(X, (np.ndarray, sp.spmatrix))
+
+        n_samples, n_features = X.shape
+
+        if self.n_components == 'auto':
+            self.n_components_ = johnson_lindenstrauss_min_dim(
+                n_samples=n_samples, eps=self.eps)
+
+            if self.n_components_ <= 0:
+                raise ValueError(
+                    'eps=%f and n_samples=%d lead to a target dimension of '
+                    '%d which is invalid' % (
+                        self.eps, n_samples, self.n_components_))
+
+            elif self.n_components_ > n_features:
+                raise ValueError(
+                    'eps=%f and n_samples=%d lead to a target dimension of '
+                    '%d which is larger than the original space with '
+                    'n_features=%d' % (self.eps, n_samples, self.n_components_,
+                                       n_features))
+        else:
+            if self.n_components <= 0:
+                raise ValueError("n_components must be greater than 0, got %s"
+                                 % self.n_components_)
+
+            elif self.n_components > n_features:
+                warnings.warn(
+                    "The number of components is higher than the number of"
+                    " features: n_features < n_components (%s < %s)."
+                    "The dimensionality of the problem will not be reduced."
+                    % (n_features, self.n_components),
+                    DataDimensionalityWarning)
+
+            self.n_components_ = self.n_components
+
+        # Generate a projection matrix of size [n_components, n_features]
+        self.components_ = self._make_random_matrix(self.n_components_,
+                                                    n_features)
+
+        # Check contract
+        assert_equal(
+            self.components_.shape,
+            (self.n_components_, n_features),
+            err_msg=('An error has occurred the self.components_ matrix has '
+                     ' not the proper shape.'))
+
+        return self
+
+    def transform(self, Z):
+        """Project the data by using matrix product with the random matrix
+        Parameters
+        ----------
+        X : numpy array or scipy.sparse of shape [n_samples, n_features]
+            The input data to project into a smaller dimensional space.
+        y : is not used: placeholder to allow for usage in a Pipeline.
+        Returns
+        -------
+        X_new : numpy array or scipy sparse of shape [n_samples, n_components]
+            Projected array.
+        """
+        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
+        check_rdd(X, (np.ndarray, sp.spmatrix))
+
+        dtype = np.ndarray if self.dense_output else None
+        mapper = self.broadcast(
+            super(SparkBaseRandomProjection, self).transform, Z.context)
+        return Z.transform(mapper, column='X', dtype=dtype)
+
+
+class SparkGaussianRandomProjection(GaussianRandomProjection,
+                                    SparkBaseRandomProjection):
+    pass
+
+
+class SparkSparseRandomProjection(SparseRandomProjection,
+                                  SparkBaseRandomProjection):
+    pass
diff --git a/splearn/tests/test_random_projection.py b/splearn/tests/test_random_projection.py
@@ -0,0 +1,109 @@
+import numpy as np
+import scipy.sparse as sp
+from sklearn.random_projection import (GaussianRandomProjection,
+                                       SparseRandomProjection)
+from splearn.random_projection import (SparkGaussianRandomProjection,
+                                       SparkSparseRandomProjection)
+from splearn.rdd import DictRDD
+from splearn.utils.testing import (SplearnTestCase, assert_array_almost_equal,
+                                   assert_true)
+from splearn.utils.validation import check_rdd_dtype
+
+
+class TestGaussianRandomProjection(SplearnTestCase):
+
+    def test_same_components(self):
+        local = GaussianRandomProjection(n_components=20, random_state=42)
+        dist = SparkGaussianRandomProjection(n_components=20, random_state=42)
+
+        shapes = [((1e3, 50), None),
+                  ((1e4, 100), 600)]
+
+        for shape, block_size in shapes:
+            X_dense, X_dense_rdd = self.make_dense_rdd(shape, block_size)
+            X_sparse, X_sparse_rdd = self.make_sparse_rdd(shape, block_size)
+            Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))
+
+            local.fit(X_dense)
+            dist.fit(X_dense_rdd)
+            assert_array_almost_equal(local.components_, dist.components_)
+
+            local.fit(X_sparse)
+            dist.fit(X_sparse_rdd)
+            assert_array_almost_equal(local.components_, dist.components_)
+
+            dist.fit(Z)
+            assert_array_almost_equal(local.components_, dist.components_)
+
+    def test_same_transform_result(self):
+        local = GaussianRandomProjection(n_components=4, random_state=42)
+        dist = SparkGaussianRandomProjection(n_components=4, random_state=42)
+
+        X_dense, X_dense_rdd = self.make_dense_rdd()
+        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
+        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))
+
+        result_local = local.fit_transform(X_dense)
+        result_dist = dist.fit_transform(X_dense_rdd)
+        assert_true(check_rdd_dtype(result_dist, (np.ndarray,)))
+        assert_array_almost_equal(result_local, result_dist.toarray())
+
+        result_local = local.fit_transform(X_sparse)
+        result_dist = dist.fit_transform(X_sparse_rdd)
+        assert_true(check_rdd_dtype(result_dist, (np.ndarray,)))
+        assert_array_almost_equal(result_local, result_dist.toarray())
+
+        result_dist = dist.fit_transform(Z_rdd)[:, 'X']
+        assert_true(check_rdd_dtype(result_dist, (np.ndarray,)))
+        assert_array_almost_equal(result_local, result_dist.toarray())
+
+
+class TestSparseRandomProjection(SplearnTestCase):
+
+    def test_same_components(self):
+        local = SparseRandomProjection(n_components=20, random_state=42)
+        dist = SparkSparseRandomProjection(n_components=20, random_state=42)
+
+        shapes = [((1e3, 50), None),
+                  ((1e4, 100), 600)]
+
+        for shape, block_size in shapes:
+            X_dense, X_dense_rdd = self.make_dense_rdd(shape, block_size)
+            X_sparse, X_sparse_rdd = self.make_sparse_rdd(shape, block_size)
+            Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))
+
+            local.fit(X_dense)
+            dist.fit(X_dense_rdd)
+            assert_array_almost_equal(local.components_.toarray(),
+                                      dist.components_.toarray())
+
+            local.fit(X_sparse)
+            dist.fit(X_sparse_rdd)
+            assert_array_almost_equal(local.components_.toarray(),
+                                      dist.components_.toarray())
+
+            dist.fit(Z)
+            assert_array_almost_equal(local.components_.toarray(),
+                                      dist.components_.toarray())
+
+    def test_same_transform_result(self):
+        local = SparseRandomProjection(n_components=4, random_state=42)
+        dist = SparkSparseRandomProjection(n_components=4, random_state=42)
+
+        X_dense, X_dense_rdd = self.make_dense_rdd()
+        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
+        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))
+
+        result_local = local.fit_transform(X_dense)
+        result_dist = dist.fit_transform(X_dense_rdd)
+        assert_true(check_rdd_dtype(result_dist, (np.ndarray,)))
+        assert_array_almost_equal(result_local, result_dist.toarray())
+
+        result_local = local.fit_transform(X_sparse)
+        result_dist = dist.fit_transform(X_sparse_rdd)
+        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
+        assert_array_almost_equal(result_local.toarray(), result_dist.toarray())
+
+        result_dist = dist.fit_transform(Z_rdd)[:, 'X']
+        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
+        assert_array_almost_equal(result_local.toarray(), result_dist.toarray())