/**
 * @file parallel_sgd.hpp
 * @author Shikhar Bhardwaj
 *
 * Parallel Stochastic Gradient Descent.
 *
 * mlpack is free software; you may redistribute it and/or modify it under the
 * terms of the 3-clause BSD license. You should have received a copy of the
 * 3-clause BSD license along with mlpack. If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef MLPACK_CORE_OPTIMIZERS_PARALLEL_SGD_HPP
#define MLPACK_CORE_OPTIMIZERS_PARALLEL_SGD_HPP

#include <mlpack/prereqs.hpp>
#include <mlpack/core/math/random.hpp>
#include "decay_policies/constant_step.hpp"

namespace mlpack {
namespace optimization {
/**
 * An implementation of parallel stochastic gradient descent using the
 * lock-free HOGWILD! approach.
 *
 * For more information, see the following.
 *
 * @code
 * @misc{1106.5730,
 *   Author = {Feng Niu and Benjamin Recht and Christopher Re and Stephen J.
 *             Wright},
 *   Title = {HOGWILD!: A Lock-Free Approach to Parallelizing Stochastic
 *            Gradient Descent},
 *   Year = {2011},
 *   Eprint = {arXiv:1106.5730},
 * }
 * @endcode
 *
 * For Parallel SGD to work, a SparseFunctionType template parameter is
 * required. This class must implement the following functions:
 *
 *   size_t NumFunctions();
 *   double Evaluate(const arma::mat& coordinates, const size_t i);
 *   void Gradient(const arma::mat& coordinates,
 *                 const size_t i,
 *                 arma::sp_mat& gradient);
 *
 * In these functions, the parameter i refers to which individual function (or
 * gradient) is being evaluated. In the case of a data-dependent function, i
 * would refer to the index of the datapoint (or training example). The data
 * is distributed uniformly among the threads made available to the program by
 * the OpenMP runtime.
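 *
 * As a rough sketch of the HOGWILD! scheme (not the actual implementation,
 * which lives in parallel_sgd_impl.hpp), each iteration applies lock-free
 * updates of roughly the following form:
 *
 * @code
 * // stepSize would come from the decay policy in the real implementation.
 * const double stepSize = 0.01;
 * #pragma omp parallel for
 * for (size_t k = 0; k < function.NumFunctions(); ++k)
 * {
 *   arma::sp_mat gradient;
 *   function.Gradient(iterate, k, gradient);
 *   // Each thread applies its sparse update to the shared iterate without
 *   // locking; sparsity keeps collisions between threads rare.
 *   for (auto it = gradient.begin(); it != gradient.end(); ++it)
 *     iterate(it.row(), it.col()) -= stepSize * (*it);
 * }
 * @endcode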
 *
 * The Gradient function interface differs slightly from the
 * DecomposableFunctionType interface: it takes a sparse matrix as the
 * out-param for the gradient, because ParallelSGD is only expected to be
 * relevant in situations where the computed gradient is sparse.
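 *
 * A minimal sketch of a class satisfying this interface (the class name and
 * its data member are illustrative only, not part of mlpack):
 *
 * @code
 * class ExampleSparseFunction
 * {
 *  public:
 *   // The number of separable functions (i.e. the number of datapoints).
 *   size_t NumFunctions() { return data.n_cols; }
 *
 *   // Evaluate the objective at the i-th datapoint.
 *   double Evaluate(const arma::mat& coordinates, const size_t i);
 *
 *   // Store the (sparse) gradient at the i-th datapoint in 'gradient'.
 *   void Gradient(const arma::mat& coordinates,
 *                 const size_t i,
 *                 arma::sp_mat& gradient);
 *
 *  private:
 *   arma::mat data;
 * };
 * @endcode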
 *
 * @tparam DecayPolicyType Step size update policy used by parallel SGD
 *     to update the stepsize after each iteration.
 */
template <typename DecayPolicyType = ConstantStep>
class ParallelSGD
{
 public:
  /**
   * Construct the parallel SGD optimizer to optimize the given function with
   * the given parameters. One iteration means one batch of datapoints
   * processed by each thread.
   *
   * The defaults here are not necessarily good for the given problem, so it
   * is suggested that the values used be tailored to the task at hand.
   *
   * @param maxIterations Maximum number of iterations allowed (0 means no
   *     limit).
   * @param threadShareSize Number of datapoints to be processed in one
   *     iteration by each thread.
   * @param tolerance Maximum absolute tolerance to terminate the algorithm.
   * @param shuffle If true, the function order is shuffled; otherwise, each
   *     function is visited in linear order.
   * @param decayPolicy The step size update policy to use.
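   *
   * For instance, the following sketch constructs an optimizer limited to
   * 10000 iterations, with each thread processing 1000 datapoints per
   * iteration (the values and the ConstantStep step size are illustrative
   * only, not recommendations):
   *
   * @code
   * // Constant step size of 0.01 for every iteration.
   * ParallelSGD<ConstantStep> optimizer(10000, 1000, 1e-5, true,
   *     ConstantStep(0.01));
   * @endcode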
   */
  ParallelSGD(const size_t maxIterations,
              const size_t threadShareSize,
              const double tolerance = 1e-5,
              const bool shuffle = true,
              const DecayPolicyType& decayPolicy = DecayPolicyType());

  /**
   * Optimize the given function using the parallel SGD algorithm. The given
   * starting point will be modified to store the finishing point of the
   * algorithm, and the value of the loss function at the final point is
   * returned.
   *
   * @tparam SparseFunctionType Type of function to be optimized.
   * @param function Function to be optimized (minimized).
   * @param iterate Starting point (will be modified).
   * @return Objective value at the final point.
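   *
   * A typical call might look like the sketch below, with
   * ExampleSparseFunction standing in for any class satisfying the
   * SparseFunctionType interface (the sizes and parameters are illustrative
   * only):
   *
   * @code
   * ExampleSparseFunction function;
   * // Initial point; modified in place to hold the final coordinates.
   * arma::mat iterate(10, 1, arma::fill::zeros);
   * ParallelSGD<ConstantStep> optimizer(10000, 1000);
   * const double objective = optimizer.Optimize(function, iterate);
   * @endcode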
   */
  template <typename SparseFunctionType>
  double Optimize(SparseFunctionType& function, arma::mat& iterate);

  //! Get the maximum number of iterations (0 indicates no limit).
  size_t MaxIterations() const { return maxIterations; }
  //! Modify the maximum number of iterations (0 indicates no limit).
  size_t& MaxIterations() { return maxIterations; }

  //! Get the number of datapoints to be processed in one iteration by each
  //! thread.
  size_t ThreadShareSize() const { return threadShareSize; }
  //! Modify the number of datapoints to be processed in one iteration by each
  //! thread.
  size_t& ThreadShareSize() { return threadShareSize; }

  //! Get the tolerance for termination.
  double Tolerance() const { return tolerance; }
  //! Modify the tolerance for termination.
  double& Tolerance() { return tolerance; }

  //! Get whether or not the individual functions are shuffled.
  bool Shuffle() const { return shuffle; }
  //! Modify whether or not the individual functions are shuffled.
  bool& Shuffle() { return shuffle; }
  //! Get the step size decay policy.
  const DecayPolicyType& DecayPolicy() const { return decayPolicy; }
  //! Modify the step size decay policy.
  DecayPolicyType& DecayPolicy() { return decayPolicy; }

 private:
  //! The maximum number of allowed iterations.
  size_t maxIterations;

  //! The number of datapoints to be processed in one iteration by each thread.
  size_t threadShareSize;

  //! The tolerance for termination.
  double tolerance;

  //! Controls whether or not the individual functions are shuffled when
  //! iterating.
  bool shuffle;

  //! The step size decay policy.
  DecayPolicyType decayPolicy;
};

} // namespace optimization
} // namespace mlpack

// Include implementation.
#include "parallel_sgd_impl.hpp"

#endif