
Commit 00c746e

Merge pull request mlpack#1037 from shikharbhardwaj/parallel_sgd
Implementation of parallel SGD
2 parents 6a43191 + c60f3af commit 00c746e

24 files changed: +1127 −42 lines

src/mlpack/core/optimizers/lbfgs/test_functions.cpp

Lines changed: 11 additions & 0 deletions
@@ -196,6 +196,17 @@ void GeneralizedRosenbrockFunction::Gradient(const arma::mat& coordinates,
   gradient[i + 1] = 200 * (coordinates[i + 1] - std::pow(coordinates[i], 2));
 }
 
+void GeneralizedRosenbrockFunction::Gradient(const arma::mat& coordinates,
+                                             const size_t i,
+                                             arma::sp_mat& gradient) const
+{
+  gradient.set_size(n);
+
+  gradient[i] = 400 * (std::pow(coordinates[i], 3) - coordinates[i] *
+      coordinates[i + 1]) + 2 * (coordinates[i] - 1);
+  gradient[i + 1] = 200 * (coordinates[i + 1] - std::pow(coordinates[i], 2));
+}
+
 const arma::mat& GeneralizedRosenbrockFunction::GetInitialPoint() const
 {
   return initialPoint;

src/mlpack/core/optimizers/lbfgs/test_functions.hpp

Lines changed: 4 additions & 0 deletions
@@ -129,6 +129,10 @@ class GeneralizedRosenbrockFunction
                 const size_t i,
                 arma::mat& gradient) const;
 
+  void Gradient(const arma::mat& coordinates,
+                const size_t i,
+                arma::sp_mat& gradient) const;
+
   const arma::mat& GetInitialPoint() const;
 
  private:
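The sparse overload added above mirrors the existing dense Gradient() but writes the two affected entries into an arma::sp_mat. A minimal usage sketch, assuming the existing GeneralizedRosenbrockFunction(n) constructor, its GetInitialPoint() member, and the mlpack::optimization::test namespace it lives in:

// Sketch only: exercises the sparse partial-gradient overload added in this
// commit. Constructor argument (dimensionality) and namespace are assumptions
// taken from the existing test function and may need adjusting.
#include <mlpack/core.hpp>
#include <mlpack/core/optimizers/lbfgs/test_functions.hpp>

using namespace mlpack::optimization::test;

int main()
{
  GeneralizedRosenbrockFunction f(10);          // 10-dimensional problem.
  arma::mat coordinates = f.GetInitialPoint();  // Standard starting point.

  arma::sp_mat gradient;
  f.Gradient(coordinates, 3, gradient);  // Partial gradient of the 3rd term.

  // Only entries 3 and 4 are nonzero; that sparsity is what parallel SGD
  // exploits.
  gradient.print("Sparse partial gradient:");
}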
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+set(SOURCES
+  parallel_sgd.hpp
+  parallel_sgd_impl.hpp
+  sparse_test_function.hpp
+  sparse_test_function_impl.hpp
+)
+
+set(DIR_SRCS)
+foreach(file ${SOURCES})
+  set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
+endforeach()
+
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+set(SOURCES
+  constant_step.hpp
+  exponential_backoff.hpp
+)
+
+set(DIR_SRCS)
+foreach(file ${SOURCES})
+  set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
+endforeach()
+
+set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+/**
+ * @file constant_step.hpp
+ * @author Shikhar Bhardwaj
+ *
+ * Constant step size policy for parallel Stochastic Gradient Descent.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+#ifndef MLPACK_CORE_OPTIMIZERS_PARALLEL_SGD_CONSTANT_STEP_HPP
+#define MLPACK_CORE_OPTIMIZERS_PARALLEL_SGD_CONSTANT_STEP_HPP
+
+#include <mlpack/prereqs.hpp>
+
+namespace mlpack {
+namespace optimization {
+
+/**
+ * Implementation of the ConstantStep stepsize decay policy for parallel SGD.
+ */
+class ConstantStep
+{
+ public:
+  /**
+   * Member initialization constructor.
+   *
+   * The defaults here are not necessarily good for the given problem, so it
+   * is suggested that the values used be tailored to the task at hand.
+   *
+   * @param step The initial stepsize to use.
+   */
+  ConstantStep(const double step = 0.01) : step(step) { /* Nothing to do */ }
+
+  /**
+   * This function is called in each iteration before the gradient update.
+   *
+   * @param numEpoch The iteration number for which the stepsize is to be
+   *     calculated.
+   * @return The step size for the current iteration.
+   */
+  double StepSize(const size_t /* numEpoch */)
+  {
+    return step;
+  }
+
+ private:
+  //! The initial stepsize, which remains unchanged.
+  double step;
+};
+
+} // namespace optimization
+} // namespace mlpack
+
+#endif
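A decay policy only has to expose a StepSize(numEpoch) member, which the optimizer queries once per iteration before the gradient update. A minimal sketch of that contract with ConstantStep; the include path is inferred from the header guard and may differ:

// Sketch: ConstantStep returns the same stepsize on every query.
#include <cstddef>
#include <iostream>
#include <mlpack/core/optimizers/parallel_sgd/decay_policies/constant_step.hpp>

int main()
{
  mlpack::optimization::ConstantStep policy(0.01);  // Fixed stepsize of 0.01.

  // The returned stepsize never changes, regardless of the epoch passed in.
  for (size_t epoch = 0; epoch < 3; ++epoch)
    std::cout << "epoch " << epoch << ": step = " << policy.StepSize(epoch)
        << std::endl;
}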
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+/**
+ * @file exponential_backoff.hpp
+ * @author Shikhar Bhardwaj
+ *
+ * Exponential backoff step size decay policy for parallel Stochastic Gradient
+ * Descent.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+#ifndef MLPACK_CORE_OPTIMIZERS_PARALLEL_SGD_EXP_BACKOFF_HPP
+#define MLPACK_CORE_OPTIMIZERS_PARALLEL_SGD_EXP_BACKOFF_HPP
+
+#include <mlpack/prereqs.hpp>
+
+namespace mlpack {
+namespace optimization {
+
+/**
+ * Exponential backoff stepsize reduction policy for parallel SGD.
+ *
+ * For more information, see the following.
+ *
+ * @misc{1106.5730,
+ *   Author = {Feng Niu and Benjamin Recht and Christopher Re and Stephen J.
+ *             Wright},
+ *   Title = {HOGWILD!: A Lock-Free Approach to Parallelizing Stochastic
+ *            Gradient Descent},
+ *   Year = {2011},
+ *   Eprint = {arXiv:1106.5730},
+ * }
+ *
+ * This stepsize update scheme gives robust 1/k convergence rates for the
+ * parallel SGD implementation.
+ */
+class ExponentialBackoff
+{
+ public:
+  /**
+   * Member initializer constructor to construct the exponential backoff policy
+   * with the required parameters.
+   *
+   * @param firstBackoffEpoch The number of updates to run before the first
+   *     stepsize backoff.
+   * @param step The initial stepsize (gamma).
+   * @param beta The reduction factor. This should be a value in the range
+   *     (0, 1).
+   */
+  ExponentialBackoff(const size_t firstBackoffEpoch,
+                     const double step,
+                     const double beta) :
+      firstBackoffEpoch(firstBackoffEpoch),
+      cutoffEpoch(firstBackoffEpoch),
+      step(step),
+      beta(beta)
+  { /* Nothing to do. */ }
+
+  /**
+   * Get the step size for the current gradient update.
+   *
+   * @param numEpoch The iteration number of the current update.
+   * @return The stepsize for the current iteration.
+   */
+  double StepSize(const size_t numEpoch)
+  {
+    if (numEpoch >= cutoffEpoch)
+    {
+      step *= beta;
+      cutoffEpoch += firstBackoffEpoch / beta;
+    }
+    return step;
+  }
+
+ private:
+  //! The first iteration at which the stepsize should be reduced.
+  size_t firstBackoffEpoch;
+
+  //! The iteration at which the next decay will be performed.
+  size_t cutoffEpoch;
+
+  //! The initial stepsize.
+  double step;
+
+  //! The reduction factor; should be in the range (0, 1).
+  double beta;
+};
+
+} // namespace optimization
+} // namespace mlpack
+
+#endif
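To make the schedule concrete: with firstBackoffEpoch = 100, step = 0.1, and beta = 0.5, StepSize() returns 0.1 until epoch 100, where the step is multiplied by beta to give 0.05 and the next cutoff moves to epoch 100 + 100 / 0.5 = 300. A minimal sketch tracing this; the include path is inferred from the header guard:

// Sketch: trace the ExponentialBackoff stepsize over the first 1000 epochs.
#include <cstddef>
#include <iostream>
#include <mlpack/core/optimizers/parallel_sgd/decay_policies/exponential_backoff.hpp>

int main()
{
  // First backoff after 100 epochs, initial step 0.1, reduction factor 0.5.
  mlpack::optimization::ExponentialBackoff policy(100, 0.1, 0.5);

  for (size_t epoch = 0; epoch <= 1000; ++epoch)
  {
    // StepSize() is stateful: it is meant to be called once per epoch,
    // with increasing epoch numbers, just as the optimizer would do.
    const double step = policy.StepSize(epoch);
    if (epoch % 100 == 0)
      std::cout << "epoch " << epoch << ": step = " << step << std::endl;
  }
}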
Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
+/**
+ * @file parallel_sgd.hpp
+ * @author Shikhar Bhardwaj
+ *
+ * Parallel Stochastic Gradient Descent.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+#ifndef MLPACK_CORE_OPTIMIZERS_PARALLEL_SGD_HPP
+#define MLPACK_CORE_OPTIMIZERS_PARALLEL_SGD_HPP
+
+#include <mlpack/prereqs.hpp>
+#include <mlpack/core/math/random.hpp>
+#include "decay_policies/constant_step.hpp"
+
+namespace mlpack {
+namespace optimization {
+
+/**
+ * An implementation of parallel stochastic gradient descent using the
+ * lock-free HOGWILD! approach.
+ *
+ * For more information, see the following.
+ * @misc{1106.5730,
+ *   Author = {Feng Niu and Benjamin Recht and Christopher Re and Stephen J.
+ *             Wright},
+ *   Title = {HOGWILD!: A Lock-Free Approach to Parallelizing Stochastic
+ *            Gradient Descent},
+ *   Year = {2011},
+ *   Eprint = {arXiv:1106.5730},
+ * }
+ *
+ * For Parallel SGD to work, a SparseFunctionType template parameter is
+ * required. This class must implement the following functions:
+ *
+ *   size_t NumFunctions();
+ *   double Evaluate(const arma::mat& coordinates, const size_t i);
+ *   void Gradient(const arma::mat& coordinates,
+ *                 const size_t i,
+ *                 arma::sp_mat& gradient);
+ *
+ * In these functions, the parameter i refers to which individual function (or
+ * gradient) is being evaluated. In the case of a data-dependent function, i
+ * would refer to the index of the datapoint (or training example). The data
+ * is distributed uniformly among the threads made available to the program by
+ * the OpenMP runtime.
+ *
+ * The Gradient function interface is slightly changed from the
+ * DecomposableFunctionType interface; it takes in a sparse matrix as the
+ * out-param for the gradient, as ParallelSGD is only expected to be relevant
+ * in situations where the computed gradient is sparse.
+ *
+ * @tparam DecayPolicyType Step size update policy used by parallel SGD
+ *     to update the stepsize after each iteration.
+ */
+template <typename DecayPolicyType = ConstantStep>
+class ParallelSGD
+{
+ public:
+  /**
+   * Construct the parallel SGD optimizer to optimize the given function with
+   * the given parameters. One iteration means one batch of datapoints
+   * processed by each thread.
+   *
+   * The defaults here are not necessarily good for the given problem, so it
+   * is suggested that the values used be tailored to the task at hand.
+   *
+   * @param maxIterations Maximum number of iterations allowed (0 means no
+   *     limit).
+   * @param threadShareSize Number of datapoints to be processed in one
+   *     iteration by each thread.
+   * @param tolerance Maximum absolute tolerance to terminate the algorithm.
+   * @param shuffle If true, the function order is shuffled; otherwise, each
+   *     function is visited in linear order.
+   * @param decayPolicy The step size update policy to use.
+   */
+  ParallelSGD(const size_t maxIterations,
+              const size_t threadShareSize,
+              const double tolerance = 1e-5,
+              const bool shuffle = true,
+              const DecayPolicyType& decayPolicy = DecayPolicyType());
+
+  /**
+   * Optimize the given function using the parallel SGD algorithm. The given
+   * starting point will be modified to store the finishing point of the
+   * algorithm, and the value of the loss function at the final point is
+   * returned.
+   *
+   * @tparam SparseFunctionType Type of function to be optimized.
+   * @param function Function to be optimized (minimized).
+   * @param iterate Starting point (will be modified).
+   * @return Objective value at the final point.
+   */
+  template <typename SparseFunctionType>
+  double Optimize(SparseFunctionType& function, arma::mat& iterate);
+
+  //! Get the maximum number of iterations (0 indicates no limit).
+  size_t MaxIterations() const { return maxIterations; }
+  //! Modify the maximum number of iterations (0 indicates no limit).
+  size_t& MaxIterations() { return maxIterations; }
+
+  //! Get the number of datapoints to be processed in one iteration by each
+  //! thread.
+  size_t ThreadShareSize() const { return threadShareSize; }
+  //! Modify the number of datapoints to be processed in one iteration by each
+  //! thread.
+  size_t& ThreadShareSize() { return threadShareSize; }
+
+  //! Get the tolerance for termination.
+  double Tolerance() const { return tolerance; }
+  //! Modify the tolerance for termination.
+  double& Tolerance() { return tolerance; }
+
+  //! Get whether or not the individual functions are shuffled.
+  bool Shuffle() const { return shuffle; }
+  //! Modify whether or not the individual functions are shuffled.
+  bool& Shuffle() { return shuffle; }
+
+  //! Get the step size decay policy.
+  const DecayPolicyType& DecayPolicy() const { return decayPolicy; }
+  //! Modify the step size decay policy.
+  DecayPolicyType& DecayPolicy() { return decayPolicy; }
+
+ private:
+  //! The maximum number of allowed iterations.
+  size_t maxIterations;
+
+  //! The number of datapoints to be processed in one iteration by each thread.
+  size_t threadShareSize;
+
+  //! The tolerance for termination.
+  double tolerance;
+
+  //! Controls whether or not the individual functions are shuffled when
+  //! iterating.
+  bool shuffle;
+
+  //! The step size decay policy.
+  DecayPolicyType decayPolicy;
+};
+
+} // namespace optimization
+} // namespace mlpack
+
+// Include implementation.
+#include "parallel_sgd_impl.hpp"
+
+#endif
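To make the SparseFunctionType requirements above concrete, here is a hedged sketch of a toy objective f(x) = sum_i (x_i - i)^2, decomposed into one term per coordinate and wired into ParallelSGD with the ExponentialBackoff policy. The class name ToySparseFunction and the include paths are illustrative assumptions; the constructor arguments follow the declaration above (maxIterations, threadShareSize, tolerance, shuffle, decayPolicy), and the actual update loop lives in parallel_sgd_impl.hpp, which is not shown in this diff.

// Sketch: a toy SparseFunctionType (the name is hypothetical) for ParallelSGD.
#include <cmath>
#include <iostream>
#include <mlpack/core.hpp>
#include <mlpack/core/optimizers/parallel_sgd/parallel_sgd.hpp>
#include <mlpack/core/optimizers/parallel_sgd/decay_policies/exponential_backoff.hpp>

using namespace mlpack::optimization;

// f(x) = sum_i (x[i] - i)^2, split into one function per coordinate so that
// each partial gradient touches a single entry (i.e. it is sparse).
class ToySparseFunction
{
 public:
  size_t NumFunctions() { return 10; }

  double Evaluate(const arma::mat& coordinates, const size_t i)
  {
    return std::pow(coordinates[i] - i, 2);
  }

  // Sparse gradient: only coordinate i is affected by the i-th function.
  void Gradient(const arma::mat& coordinates,
                const size_t i,
                arma::sp_mat& gradient)
  {
    gradient.set_size(coordinates.n_elem, 1);
    gradient[i] = 2 * (coordinates[i] - i);
  }
};

int main()
{
  ToySparseFunction f;
  arma::mat iterate(10, 1, arma::fill::zeros);

  // At most 10000 iterations, 4 datapoints per thread per iteration,
  // tolerance 1e-5, shuffling enabled, exponential backoff stepsize decay.
  ParallelSGD<ExponentialBackoff> optimizer(10000, 4, 1e-5, true,
      ExponentialBackoff(100, 0.1, 0.5));

  const double objective = optimizer.Optimize(f, iterate);

  std::cout << "final objective: " << objective << std::endl;
  iterate.print("solution (expected to approach 0, 1, ..., 9):");
}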
