Skip to content

Commit

Permalink
Implemented Gaussian and Sparse Random projections; dependencies of #48
Browse files Browse the repository at this point in the history
  • Loading branch information
kszucs committed Jul 3, 2015
1 parent 84459ca commit bb9576a
Show file tree
Hide file tree
Showing 3 changed files with 224 additions and 4 deletions.
8 changes: 4 additions & 4 deletions splearn/feature_selection/tests/test_variance_threshold.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_almost_equal
from sklearn.feature_selection import VarianceThreshold
from splearn.feature_selection import SparkVarianceThreshold
from splearn.rdd import DictRDD
from splearn.utils.testing import SplearnTestCase, assert_true
from splearn.utils.testing import (SplearnTestCase, assert_array_almost_equal,
assert_true)
from splearn.utils.validation import check_rdd_dtype


Expand All @@ -21,8 +21,8 @@ def test_same_variances(self):
((1e4, 100), 600)]

for shape, block_size in shapes:
X_dense, X_dense_rdd = self.make_dense_rdd()
X_sparse, X_sparse_rdd = self.make_sparse_rdd()
X_dense, X_dense_rdd = self.make_dense_rdd(shape, block_size)
X_sparse, X_sparse_rdd = self.make_sparse_rdd(shape, block_size)
Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

local.fit(X_dense)
Expand Down
111 changes: 111 additions & 0 deletions splearn/random_projection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import warnings

import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_equal
from sklearn.random_projection import (BaseRandomProjection,
GaussianRandomProjection,
SparseRandomProjection,
johnson_lindenstrauss_min_dim)
from sklearn.utils import DataDimensionalityWarning

from .base import SparkBroadcasterMixin
from .rdd import DictRDD
from .utils.validation import check_rdd


class SparkBaseRandomProjection(BaseRandomProjection, SparkBroadcasterMixin):
    """Random projection base class whose ``fit``/``transform`` accept RDDs.

    The projection matrix itself is produced by ``_make_random_matrix``,
    which this class does not define; it is expected to come from another
    class in the MRO of the concrete estimator.
    """

    # Attribute names declared for SparkBroadcasterMixin.
    # NOTE(review): presumably these are the attributes shipped to the
    # workers by ``self.broadcast`` -- confirm in ``.base``.
    __transient__ = ['components_']

    def fit(self, Z):
        """Generate the random projection matrix.

        Parameters
        ----------
        Z : DictRDD or RDD-like object exposing ``shape``
            Training set. For a ``DictRDD`` only its ``'X'`` column is used.
            Only the shape ``(n_samples, n_features)`` is read, to choose
            the dimensions of the random matrix.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_components`` is not strictly positive, or if
            ``n_components='auto'`` leads to a target dimension that is
            non-positive or larger than ``n_features``.
        """
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))

        n_samples, n_features = X.shape

        if self.n_components == 'auto':
            # Derive the target dimension from the Johnson-Lindenstrauss
            # bound for the requested distortion ``eps``.
            self.n_components_ = johnson_lindenstrauss_min_dim(
                n_samples=n_samples, eps=self.eps)

            if self.n_components_ <= 0:
                raise ValueError(
                    'eps=%f and n_samples=%d lead to a target dimension of '
                    '%d which is invalid' % (
                        self.eps, n_samples, self.n_components_))

            elif self.n_components_ > n_features:
                raise ValueError(
                    'eps=%f and n_samples=%d lead to a target dimension of '
                    '%d which is larger than the original space with '
                    'n_features=%d' % (self.eps, n_samples,
                                       self.n_components_, n_features))
        else:
            if self.n_components <= 0:
                # Report the user-supplied value: ``n_components_`` is not
                # assigned yet on this path, so reading it here would raise
                # AttributeError and hide the real problem.
                raise ValueError("n_components must be greater than 0, got %s"
                                 % self.n_components)

            elif self.n_components > n_features:
                warnings.warn(
                    "The number of components is higher than the number of"
                    " features: n_features < n_components (%s < %s)."
                    "The dimensionality of the problem will not be reduced."
                    % (n_features, self.n_components),
                    DataDimensionalityWarning)

            self.n_components_ = self.n_components

        # Generate a projection matrix of size [n_components, n_features]
        self.components_ = self._make_random_matrix(self.n_components_,
                                                    n_features)

        # Check contract
        assert_equal(
            self.components_.shape,
            (self.n_components_, n_features),
            err_msg=('An error has occurred the self.components_ matrix has '
                     ' not the proper shape.'))

        return self

    def transform(self, Z):
        """Project the data by matrix product with the random matrix.

        Parameters
        ----------
        Z : DictRDD or RDD-like object
            The input data to project. For a ``DictRDD`` the ``'X'`` column
            is the one validated and transformed.

        Returns
        -------
        Z_new
            The value returned by ``Z.transform``: the parent class's
            ``transform`` is applied through it with ``column='X'``.
        """
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))

        # Declare dense blocks only when dense output is requested;
        # otherwise leave the resulting block type unspecified.
        dtype = np.ndarray if self.dense_output else None
        mapper = self.broadcast(
            super(SparkBaseRandomProjection, self).transform, Z.context)
        return Z.transform(mapper, column='X', dtype=dtype)


class SparkGaussianRandomProjection(GaussianRandomProjection,
                                    SparkBaseRandomProjection):
    """Gaussian random projection usable with RDD inputs.

    Adds nothing of its own: it only combines scikit-learn's
    ``GaussianRandomProjection`` with ``SparkBaseRandomProjection``.
    NOTE(review): presumably the former supplies the constructor and the
    random-matrix generation while the latter supplies ``fit`` and
    ``transform`` -- confirm against the scikit-learn version in use.
    """


class SparkSparseRandomProjection(SparseRandomProjection,
                                  SparkBaseRandomProjection):
    """Sparse random projection usable with RDD inputs.

    Adds nothing of its own: it only combines scikit-learn's
    ``SparseRandomProjection`` with ``SparkBaseRandomProjection``.
    NOTE(review): presumably the former supplies the constructor and the
    random-matrix generation while the latter supplies ``fit`` and
    ``transform`` -- confirm against the scikit-learn version in use.
    """
109 changes: 109 additions & 0 deletions splearn/tests/test_random_projection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import numpy as np
import scipy.sparse as sp
from sklearn.random_projection import (GaussianRandomProjection,
SparseRandomProjection)
from splearn.random_projection import (SparkGaussianRandomProjection,
SparkSparseRandomProjection)
from splearn.rdd import DictRDD
from splearn.utils.testing import (SplearnTestCase, assert_array_almost_equal,
assert_true)
from splearn.utils.validation import check_rdd_dtype


class TestGaussianRandomProjection(SplearnTestCase):
    """Compare SparkGaussianRandomProjection with the scikit-learn one."""

    def test_same_components(self):
        """Identically seeded estimators must produce equal components."""
        reference = GaussianRandomProjection(n_components=20,
                                             random_state=42)
        distributed = SparkGaussianRandomProjection(n_components=20,
                                                    random_state=42)

        # (data shape, block size) pairs handed to the fixture helpers
        cases = [((1e3, 50), None),
                 ((1e4, 100), 600)]

        for shape, block_size in cases:
            dense, dense_rdd = self.make_dense_rdd(shape, block_size)
            sparse, sparse_rdd = self.make_sparse_rdd(shape, block_size)
            dict_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

            for local_data, rdd_data in ((dense, dense_rdd),
                                         (sparse, sparse_rdd)):
                reference.fit(local_data)
                distributed.fit(rdd_data)
                assert_array_almost_equal(reference.components_,
                                          distributed.components_)

            # The reference estimator keeps its fit on the sparse data; the
            # DictRDD carries that same sparse data in its 'X' column.
            distributed.fit(dict_rdd)
            assert_array_almost_equal(reference.components_,
                                      distributed.components_)

    def test_same_transform_result(self):
        """Distributed projection must match the local one, densely."""
        reference = GaussianRandomProjection(n_components=4, random_state=42)
        distributed = SparkGaussianRandomProjection(n_components=4,
                                                    random_state=42)

        dense, dense_rdd = self.make_dense_rdd()
        sparse, sparse_rdd = self.make_sparse_rdd()
        dict_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

        expected = None
        for local_data, rdd_data in ((dense, dense_rdd),
                                     (sparse, sparse_rdd)):
            expected = reference.fit_transform(local_data)
            projected = distributed.fit_transform(rdd_data)
            assert_true(check_rdd_dtype(projected, (np.ndarray,)))
            assert_array_almost_equal(expected, projected.toarray())

        # ``expected`` now holds the projection of the sparse data, which is
        # what the DictRDD stores under 'X'.
        projected = distributed.fit_transform(dict_rdd)[:, 'X']
        assert_true(check_rdd_dtype(projected, (np.ndarray,)))
        assert_array_almost_equal(expected, projected.toarray())


class TestSparseRandomProjection(SplearnTestCase):
    """Compare SparkSparseRandomProjection with the scikit-learn one."""

    def test_same_components(self):
        """Identically seeded estimators must produce equal components."""
        reference = SparseRandomProjection(n_components=20, random_state=42)
        distributed = SparkSparseRandomProjection(n_components=20,
                                                  random_state=42)

        # (data shape, block size) pairs handed to the fixture helpers
        cases = [((1e3, 50), None),
                 ((1e4, 100), 600)]

        for shape, block_size in cases:
            dense, dense_rdd = self.make_dense_rdd(shape, block_size)
            sparse, sparse_rdd = self.make_sparse_rdd(shape, block_size)
            dict_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

            for local_data, rdd_data in ((dense, dense_rdd),
                                         (sparse, sparse_rdd)):
                reference.fit(local_data)
                distributed.fit(rdd_data)
                # components_ is densified on both sides before comparing
                assert_array_almost_equal(reference.components_.toarray(),
                                          distributed.components_.toarray())

            # The reference estimator keeps its fit on the sparse data; the
            # DictRDD carries that same sparse data in its 'X' column.
            distributed.fit(dict_rdd)
            assert_array_almost_equal(reference.components_.toarray(),
                                      distributed.components_.toarray())

    def test_same_transform_result(self):
        """Distributed projection must match the local one per input kind."""
        reference = SparseRandomProjection(n_components=4, random_state=42)
        distributed = SparkSparseRandomProjection(n_components=4,
                                                  random_state=42)

        dense, dense_rdd = self.make_dense_rdd()
        sparse, sparse_rdd = self.make_sparse_rdd()
        dict_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

        # Dense input: the distributed blocks are checked as ndarrays and
        # the local result is compared as-is.
        expected = reference.fit_transform(dense)
        projected = distributed.fit_transform(dense_rdd)
        assert_true(check_rdd_dtype(projected, (np.ndarray,)))
        assert_array_almost_equal(expected, projected.toarray())

        # Sparse input: the blocks are checked as scipy sparse matrices and
        # the local result is densified before comparing.
        expected = reference.fit_transform(sparse)
        projected = distributed.fit_transform(sparse_rdd)
        assert_true(check_rdd_dtype(projected, (sp.spmatrix,)))
        assert_array_almost_equal(expected.toarray(), projected.toarray())

        # Same sparse data, this time wrapped as the 'X' column of a DictRDD.
        projected = distributed.fit_transform(dict_rdd)[:, 'X']
        assert_true(check_rdd_dtype(projected, (sp.spmatrix,)))
        assert_array_almost_equal(expected.toarray(), projected.toarray())

0 comments on commit bb9576a

Please sign in to comment.