-
Notifications
You must be signed in to change notification settings - Fork 255
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implemented Gaussian and Sparse Random projections; dependencies of #48
- Loading branch information
Showing
3 changed files
with
224 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
import warnings | ||
|
||
import numpy as np | ||
import scipy.sparse as sp | ||
from numpy.testing import assert_equal | ||
from sklearn.random_projection import (BaseRandomProjection, | ||
GaussianRandomProjection, | ||
SparseRandomProjection, | ||
johnson_lindenstrauss_min_dim) | ||
from sklearn.utils import DataDimensionalityWarning | ||
|
||
from .base import SparkBroadcasterMixin | ||
from .rdd import DictRDD | ||
from .utils.validation import check_rdd | ||
|
||
|
||
class SparkBaseRandomProjection(BaseRandomProjection, SparkBroadcasterMixin):
    """Random projection base class whose fit/transform accept RDD input.

    ``fit`` only reads the shape of the data to size the projection matrix;
    ``transform`` maps the parent class's ``transform`` over the blocks of
    the distributed input.
    """

    # NOTE(review): presumably consumed by SparkBroadcasterMixin to decide
    # which fitted attributes get broadcast -- confirm against the mixin.
    __transient__ = ['components_']

    def fit(self, Z):
        """Generate the random projection matrix.

        Parameters
        ----------
        Z : DictRDD or RDD of numpy arrays / scipy.sparse matrices
            Training set. When a DictRDD is given its 'X' column is used.
            Only the shape ``(n_samples, n_features)`` is read, to find the
            dimensions of the random matrix.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_components`` is not strictly positive, or if
            ``n_components='auto'`` leads to a target dimension that is
            non-positive or larger than ``n_features``.
        """
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))

        n_samples, n_features = X.shape

        if self.n_components == 'auto':
            self.n_components_ = johnson_lindenstrauss_min_dim(
                n_samples=n_samples, eps=self.eps)

            if self.n_components_ <= 0:
                raise ValueError(
                    'eps=%f and n_samples=%d lead to a target dimension of '
                    '%d which is invalid' % (
                        self.eps, n_samples, self.n_components_))

            elif self.n_components_ > n_features:
                raise ValueError(
                    'eps=%f and n_samples=%d lead to a target dimension of '
                    '%d which is larger than the original space with '
                    'n_features=%d' % (self.eps, n_samples,
                                       self.n_components_, n_features))
        else:
            if self.n_components <= 0:
                # Report the user-supplied value: the fitted attribute
                # ``n_components_`` has not been assigned yet on this path.
                raise ValueError("n_components must be greater than 0, got %s"
                                 % self.n_components)

            elif self.n_components > n_features:
                warnings.warn(
                    "The number of components is higher than the number of"
                    " features: n_features < n_components (%s < %s)."
                    "The dimensionality of the problem will not be reduced."
                    % (n_features, self.n_components),
                    DataDimensionalityWarning)

            self.n_components_ = self.n_components

        # Generate a projection matrix of size [n_components, n_features]
        self.components_ = self._make_random_matrix(self.n_components_,
                                                    n_features)

        # Check contract
        assert_equal(
            self.components_.shape,
            (self.n_components_, n_features),
            err_msg=('An error has occurred the self.components_ matrix has '
                     ' not the proper shape.'))

        return self

    def transform(self, Z):
        """Project the data by matrix product with the random matrix.

        Parameters
        ----------
        Z : DictRDD or RDD of numpy arrays / scipy.sparse matrices
            The input data to project. When a DictRDD is given its 'X'
            column is validated and transformed.

        Returns
        -------
        Result of ``Z.transform`` applied with the parent class's
        ``transform`` on column 'X'. The requested block dtype is
        ``np.ndarray`` when ``self.dense_output`` is set, otherwise it is
        left unspecified (``None``).
        """
        # X is extracted only to validate the block types; the mapping
        # itself is applied to Z.
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))

        dtype = np.ndarray if self.dense_output else None
        mapper = self.broadcast(
            super(SparkBaseRandomProjection, self).transform, Z.context)
        return Z.transform(mapper, column='X', dtype=dtype)
|
||
|
||
class SparkGaussianRandomProjection(GaussianRandomProjection,
                                    SparkBaseRandomProjection):
    """GaussianRandomProjection combined with SparkBaseRandomProjection.

    Defines no members of its own; all behaviour comes from the two bases.
    """
|
||
|
||
class SparkSparseRandomProjection(SparseRandomProjection,
                                  SparkBaseRandomProjection):
    """SparseRandomProjection combined with SparkBaseRandomProjection.

    Defines no members of its own; all behaviour comes from the two bases.
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
import numpy as np | ||
import scipy.sparse as sp | ||
from sklearn.random_projection import (GaussianRandomProjection, | ||
SparseRandomProjection) | ||
from splearn.random_projection import (SparkGaussianRandomProjection, | ||
SparkSparseRandomProjection) | ||
from splearn.rdd import DictRDD | ||
from splearn.utils.testing import (SplearnTestCase, assert_array_almost_equal, | ||
assert_true) | ||
from splearn.utils.validation import check_rdd_dtype | ||
|
||
|
||
class TestGaussianRandomProjection(SplearnTestCase):
    """Compare SparkGaussianRandomProjection with the local sklearn class."""

    def test_same_components(self):
        """Fitting with the same seed must yield the same components_."""
        sk_model = GaussianRandomProjection(n_components=20, random_state=42)
        spark_model = SparkGaussianRandomProjection(n_components=20,
                                                    random_state=42)

        # (shape, block_size) pairs handed to the RDD fixture helpers.
        # NOTE(review): the shapes use float literals (1e3, 1e4); confirm
        # the make_*_rdd helpers cope with non-integer dimensions.
        cases = (((1e3, 50), None),
                 ((1e4, 100), 600))

        for dims, blocks in cases:
            dense, dense_rdd = self.make_dense_rdd(dims, blocks)
            sparse, sparse_rdd = self.make_sparse_rdd(dims, blocks)
            dict_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

            # Dense first, then sparse: fit both estimators and compare.
            for local_input, rdd_input in ((dense, dense_rdd),
                                           (sparse, sparse_rdd)):
                sk_model.fit(local_input)
                spark_model.fit(rdd_input)
                assert_array_almost_equal(sk_model.components_,
                                          spark_model.components_)

            # The DictRDD's 'X' column holds the sparse data, which is what
            # the local model was fitted on last.
            spark_model.fit(dict_rdd)
            assert_array_almost_equal(sk_model.components_,
                                      spark_model.components_)

    def test_same_transform_result(self):
        """fit_transform must give the same projection locally and on RDDs."""
        sk_model = GaussianRandomProjection(n_components=4, random_state=42)
        spark_model = SparkGaussianRandomProjection(n_components=4,
                                                    random_state=42)

        dense, dense_rdd = self.make_dense_rdd()
        sparse, sparse_rdd = self.make_sparse_rdd()
        dict_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

        # Dense first, then sparse; both are expected to come back as
        # ndarray blocks.
        for local_input, rdd_input in ((dense, dense_rdd),
                                       (sparse, sparse_rdd)):
            expected = sk_model.fit_transform(local_input)
            projected = spark_model.fit_transform(rdd_input)
            assert_true(check_rdd_dtype(projected, (np.ndarray,)))
            assert_array_almost_equal(expected, projected.toarray())

        # `expected` still holds the local result for the sparse input,
        # matching the 'X' column of the DictRDD.
        projected = spark_model.fit_transform(dict_rdd)[:, 'X']
        assert_true(check_rdd_dtype(projected, (np.ndarray,)))
        assert_array_almost_equal(expected, projected.toarray())
|
||
|
||
class TestSparseRandomProjection(SplearnTestCase):
    """Compare SparkSparseRandomProjection with the local sklearn class."""

    def test_same_components(self):
        """Fitting with the same seed must yield the same components_."""
        sk_model = SparseRandomProjection(n_components=20, random_state=42)
        spark_model = SparkSparseRandomProjection(n_components=20,
                                                  random_state=42)

        # (shape, block_size) pairs handed to the RDD fixture helpers.
        # NOTE(review): the shapes use float literals (1e3, 1e4); confirm
        # the make_*_rdd helpers cope with non-integer dimensions.
        cases = (((1e3, 50), None),
                 ((1e4, 100), 600))

        for dims, blocks in cases:
            dense, dense_rdd = self.make_dense_rdd(dims, blocks)
            sparse, sparse_rdd = self.make_sparse_rdd(dims, blocks)
            dict_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

            # Dense first, then sparse: fit both estimators and compare the
            # densified component matrices.
            for local_input, rdd_input in ((dense, dense_rdd),
                                           (sparse, sparse_rdd)):
                sk_model.fit(local_input)
                spark_model.fit(rdd_input)
                assert_array_almost_equal(sk_model.components_.toarray(),
                                          spark_model.components_.toarray())

            # The DictRDD's 'X' column holds the sparse data, which is what
            # the local model was fitted on last.
            spark_model.fit(dict_rdd)
            assert_array_almost_equal(sk_model.components_.toarray(),
                                      spark_model.components_.toarray())

    def test_same_transform_result(self):
        """fit_transform must give the same projection locally and on RDDs."""
        sk_model = SparseRandomProjection(n_components=4, random_state=42)
        spark_model = SparkSparseRandomProjection(n_components=4,
                                                  random_state=42)

        dense, dense_rdd = self.make_dense_rdd()
        sparse, sparse_rdd = self.make_sparse_rdd()
        dict_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

        # Dense input: ndarray blocks are expected back.
        expected = sk_model.fit_transform(dense)
        projected = spark_model.fit_transform(dense_rdd)
        assert_true(check_rdd_dtype(projected, (np.ndarray,)))
        assert_array_almost_equal(expected, projected.toarray())

        # Sparse input: sparse blocks are expected back, so the local result
        # is densified before comparing.
        expected = sk_model.fit_transform(sparse)
        projected = spark_model.fit_transform(sparse_rdd)
        assert_true(check_rdd_dtype(projected, (sp.spmatrix,)))
        assert_array_almost_equal(expected.toarray(), projected.toarray())

        # `expected` still holds the local result for the sparse input,
        # matching the 'X' column of the DictRDD.
        projected = spark_model.fit_transform(dict_rdd)[:, 'X']
        assert_true(check_rdd_dtype(projected, (sp.spmatrix,)))
        assert_array_almost_equal(expected.toarray(), projected.toarray())