Lda v1 #311

Open · wants to merge 6 commits into lda_output_fix_final
1 change: 1 addition & 0 deletions .gitignore
@@ -25,6 +25,7 @@ auto
*.swp
*.fdb_latexmk
*.swo # vim swap file
\#*\# # emacs backup file

# Biblatex temporary files
*-blx.bib
3 changes: 2 additions & 1 deletion src/dbal/EigenIntegration/HandleMap_proto.hpp
@@ -19,6 +19,7 @@ namespace eigen_integration {
template <class EigenType, class Handle, int MapOptions = Eigen::Unaligned>
class HandleMap : public Eigen::Map<EigenType, MapOptions> {
public:
typedef EigenType PlainEigenType;
typedef Eigen::Map<EigenType, MapOptions> Base;
typedef typename Base::Scalar Scalar;
typedef typename Base::Index Index;
@@ -57,7 +58,7 @@ class HandleMap : public Eigen::Map<EigenType, MapOptions> {
*
* For example, this allows construction of MappedColumnVector from
* MappedMatrix::col(int) or NativeColumnVector, etc.
 */
template <class Derived>
HandleMap(const Eigen::MapBase<Derived>& inMappedData,
typename boost::enable_if_c<Derived::IsVectorAtCompileTime>::type* = 0)
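
For intuition, a minimal plain-Eigen sketch (assuming only Eigen, not MADlib's DBAL) of the zero-copy column view that the MapBase constructor above makes implicit:

#include <cassert>
#include <Eigen/Dense>

// Sketch: view one column of a mapped matrix as a vector without copying,
// which is the kind of conversion the MapBase constructor enables.
int main() {
    double buf[6] = {1, 2, 3, 4, 5, 6};
    Eigen::Map<Eigen::MatrixXd> m(buf, 3, 2);            // 3x2 column-major view
    Eigen::Map<Eigen::VectorXd> col0(m.col(0).data(), m.rows());
    assert(col0.sum() == 6.0);                           // sees {1, 2, 3}; no copy
    return 0;
}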
81 changes: 80 additions & 1 deletion src/modules/convex/algo/igd.hpp
@@ -34,7 +34,9 @@ class IGD {
typedef typename Task::model_type model_type;

static void transition(state_type &state, const tuple_type &tuple);
static void transitionInMiniBatch(state_type &state, const tuple_type &tuple);
static void merge(state_type &state, const_state_type &otherState);
static void mergeInPlace(state_type &state, const_state_type &otherState);
static void final(state_type &state);
};

@@ -56,6 +58,62 @@ IGD<State, ConstState, Task>::transition(state_type &state,
state.task.stepsize * tuple.weight);
}

/**
* @brief Update the transition state in mini-batches
*
* Note: We assume that
* 1. Task defines a model_eigen_type
* 2. A batch of tuple.indVar is a Matrix
* 3. A batch of tuple.depVar is a ColumnVector
* 4. Task defines a getLossAndUpdateModel method
*
*/
template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::transitionInMiniBatch(
state_type &state,
const tuple_type &tuple) {

madlib_assert(tuple.indVar.rows() == tuple.depVar.rows(),
std::runtime_error("Invalid data. Independent and dependent "
"batches don't have same number of rows."));

int batch_size = state.algo.batchSize;
int n_epochs = state.algo.nEpochs;

// n_rows/n_ind_cols are the rows/cols in a transition tuple.
int n_rows = tuple.indVar.rows();
int n_ind_cols = tuple.indVar.cols();
int n_batches = n_rows < batch_size ? 1 :
n_rows / batch_size +
int(n_rows%batch_size > 0);

for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) {
double loss = 0.0;
for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches;
curr_batch++, curr_batch_row_index += batch_size) {
Matrix X_batch;
ColumnVector y_batch;
if (curr_batch == n_batches-1) {
// last batch
X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index);
y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index);
} else {
X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols);
y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size);
}
loss += Task::getLossAndUpdateModel(
state.task.model, X_batch, y_batch, state.task.stepsize);
}

// The first epoch will most likely have the highest loss.
// Being pessimistic, use the total loss only from the first epoch.
if (curr_epoch==0) state.algo.loss += loss;
}
}
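
As a sanity check of the batching arithmetic above, a standalone sketch (plain C++, hypothetical sizes) that reproduces the batch boundaries, including the short final batch:

#include <cstdio>

// Sketch: ceil(n_rows / batch_size) batches, where the last batch picks up
// the n_rows % batch_size remainder, mirroring transitionInMiniBatch.
int main() {
    const int n_rows = 25, batch_size = 10;              // hypothetical sizes
    const int n_batches = n_rows < batch_size ? 1 :
        n_rows / batch_size + int(n_rows % batch_size > 0);
    for (int b = 0, row = 0; b < n_batches; b++, row += batch_size) {
        const int rows_in_batch =
            (b == n_batches - 1) ? n_rows - row : batch_size;
        std::printf("batch %d: rows [%d, %d)\n", b, row, row + rows_in_batch);
    }
    return 0;                                            // [0,10) [10,20) [20,25)
}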


template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::merge(state_type &state,
@@ -84,13 +142,34 @@ IGD<State, ConstState, Task>::merge(state_type &state,
static_cast<double>(totalNumRows);
}

template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::mergeInPlace(state_type &state,
const_state_type &otherState) {
// avoid division by zero
if (state.algo.numRows == 0) {
state.task.model = otherState.task.model;
return;
} else if (otherState.algo.numRows == 0) {
return;
}

// model averaging, weighted by rows seen
double leftRows = static_cast<double>(state.algo.numRows);
double rightRows = static_cast<double>(otherState.algo.numRows);
double totalNumRows = leftRows + rightRows;
state.task.model *= leftRows / rightRows;
state.task.model += otherState.task.model;
state.task.model *= rightRows / totalNumRows;
}
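
The three in-place operations in mergeInPlace compute a row-weighted model average; a scalar sketch (hypothetical values) makes the algebra explicit:

#include <cassert>
#include <cmath>

// Sketch: (m_left * L/R + m_right) * R/(L+R) == (L*m_left + R*m_right)/(L+R),
// i.e. the model average weighted by the rows each state has seen.
int main() {
    const double mLeft = 2.0, mRight = 5.0;   // hypothetical 1-d "models"
    const double L = 30.0, R = 70.0;          // rows seen by each state
    double m = mLeft;
    m *= L / R;
    m += mRight;
    m *= R / (L + R);
    assert(std::fabs(m - (L * mLeft + R * mRight) / (L + R)) < 1e-12);
    return 0;
}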

template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::final(state_type &state) {
// The task.model must remain untouched in the transition function because
// the loss computation needs the model from the last iteration intact.

state.task.model = state.algo.incrModel;

}

} // namespace convex
159 changes: 159 additions & 0 deletions src/modules/convex/linear_svm_igd.cpp
@@ -32,6 +32,10 @@ typedef IGD<GLMIGDState<MutableArrayHandle<double> >,
GLMIGDState<ArrayHandle<double> >,
LinearSVM<GLMModel, GLMTuple > > LinearSVMIGDAlgorithm;

typedef IGD<SVMMinibatchState<MutableArrayHandle<double> >,
SVMMinibatchState<ArrayHandle<double> >,
LinearSVM<GLMModel, SVMMiniBatchTuple > > LinearSVMIGDAlgoMiniBatch;

typedef Loss<GLMIGDState<MutableArrayHandle<double> >,
GLMIGDState<ArrayHandle<double> >,
LinearSVM<GLMModel, GLMTuple > > LinearSVMLossAlgorithm;
@@ -120,6 +124,98 @@ linear_svm_igd_transition::run(AnyType &args) {
return state;
}

/**
* @brief Perform the linear support vector machine transition step
*
* Called for each tuple.
*/
AnyType
linear_svm_igd_minibatch_transition::run(AnyType &args) {
// The real state.
// For the first tuple: args[0] is nothing more than a marker that
// indicates that we should do some initial operations.
// For subsequent tuples: args[0] holds the computation state carried
// over from the previous tuple.
SVMMinibatchState<MutableArrayHandle<double> > state = args[0];

// initialize the state if first tuple
if (state.algo.numRows == 0) {

LinearSVM<GLMModel, GLMTuple >::epsilon = args[9].getAs<double>();
LinearSVM<GLMModel, GLMTuple >::is_svc = args[10].getAs<bool>();
if (!args[3].isNull()) {
SVMMinibatchState<ArrayHandle<double> > previousState = args[3];
state.allocate(*this, previousState.task.nFeatures);
state = previousState;
} else {
// configuration parameters
uint32_t dimension = args[4].getAs<uint32_t>();
state.allocate(*this, dimension); // with zeros
}
// resetting in either case
// state.reset();
state.task.stepsize = args[5].getAs<double>();
const double lambda = args[6].getAs<double>();
const bool isL2 = args[7].getAs<bool>();
const int nTuples = args[8].getAs<int>();

// The regularization operations called below (scaling and clipping)
// need these class variables to be set.
L1<GLMModel>::n_tuples = nTuples;
L2<GLMModel>::n_tuples = nTuples;
if (isL2)
L2<GLMModel>::lambda = lambda;
else
L1<GLMModel>::lambda = lambda;
}

state.algo.nEpochs = args[12].getAs<int>();
state.algo.batchSize = args[13].getAs<int>();

// Skip the current record if args[1] (features) contains NULL values,
// or args[2] is NULL
try {
args[1].getAs<MappedMatrix>();
} catch (const ArrayWithNullException &e) {
return args[0];
}
if (args[2].isNull())
return args[0];

// tuple
using madlib::dbal::eigen_integration::MappedColumnVector;

MappedMatrix x(NULL);
MappedColumnVector y(NULL);
try {
new (&x) MappedMatrix(args[1].getAs<MappedMatrix>());
new (&y) MappedColumnVector(args[2].getAs<MappedColumnVector>());
} catch (const ArrayWithNullException &e) {
return args[0];
}
SVMMiniBatchTuple tuple;
tuple.indVar = trans(x);
tuple.depVar = y;

// Each tuple can be weighted - this can be a combination of the sample
// weight and the class weight. The calling function is responsible for
// combining the two into a single tuple weight. The default value for
// this parameter is 1, set in the definition of "tuple".
// The weight is used to increase the influence of a particular tuple
// during online learning; it is not used for the loss computation.
tuple.weight = args[11].getAs<double>();


// Now do the transition step: mini-batch updates bracketed by the
// regularization operations (L2 scaling before, L1 clipping after)
L2<GLMModel>::scaling(state.task.model, state.task.stepsize);
LinearSVMIGDAlgoMiniBatch::transitionInMiniBatch(state, tuple);
L1<GLMModel>::clipping(state.task.model, state.task.stepsize);

state.algo.numRows += x.cols();
return state;
}
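
The per-tuple order of operations is: L2 scaling of the model, the mini-batch gradient passes, then L1 clipping. A schematic sketch with hypothetical stand-in regularizers (not MADlib's actual L1/L2 implementations) that shows only this ordering:

#include <cmath>

// Hypothetical stand-ins for L2<Model>::scaling and L1<Model>::clipping,
// illustrating only the order of operations around the mini-batch update.
static void l2_scaling(double &w, double stepsize, double lambda, int n) {
    w *= 1.0 - stepsize * lambda / n;                  // shrink (assumed form)
}
static void l1_clipping(double &w, double stepsize, double lambda, int n) {
    const double t = stepsize * lambda / n;            // soft-threshold (assumed form)
    w = (w > t) ? w - t : (w < -t ? w + t : 0.0);
}

int main() {
    double w = 0.8;                                    // hypothetical 1-d model
    const double stepsize = 0.1, lambda = 0.01;
    const int n_tuples = 100;
    l2_scaling(w, stepsize, lambda, n_tuples);         // before the batch updates
    w += -stepsize * 0.5;                              // stand-in gradient step
    l1_clipping(w, stepsize, lambda, n_tuples);        // after the batch updates
    return 0;
}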


/**
* @brief Perform the preliminary aggregation function: merge transition states
*/
@@ -145,6 +241,30 @@ linear_svm_igd_merge::run(AnyType &args) {
return stateLeft;
}

/**
* @brief Perform the preliminary aggregation function: merge transition states
*/
AnyType
linear_svm_igd_minibatch_merge::run(AnyType &args) {
SVMMinibatchState<MutableArrayHandle<double> > stateLeft = args[0];
SVMMinibatchState<ArrayHandle<double> > stateRight = args[1];

// We first handle the trivial case where this function is called with one
// of the states being the initial state
if (stateLeft.algo.numRows == 0) { return stateRight; }
else if (stateRight.algo.numRows == 0) { return stateLeft; }

// Merge states together
LinearSVMIGDAlgoMiniBatch::mergeInPlace(stateLeft, stateRight);

// The numRows update below cannot be moved above the merge because the
// model averaging depends on the original values
stateLeft.algo.numRows += stateRight.algo.numRows;
stateLeft.algo.loss += stateRight.algo.loss;

return stateLeft;
}

/**
* @brief Perform the linear support vector machine final step
*/
@@ -171,6 +291,29 @@ linear_svm_igd_final::run(AnyType &args) {
return state;
}

/**
* @brief Perform the linear support vector machine final step
*/
AnyType
linear_svm_igd_minibatch_final::run(AnyType &args) {
// We request a mutable object. Depending on the backend, this might perform
// a deep copy.
SVMMinibatchState<MutableArrayHandle<double> > state = args[0];
// Aggregates that haven't seen any data just return Null.
if (state.algo.numRows == 0) { return Null(); }
state.algo.loss = state.algo.loss / state.algo.numRows;
return state;
}

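/**
 * @brief Return the relative difference in loss between two states
 */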
AnyType
internal_linear_svm_igd_minibatch_distance::run(AnyType &args) {
SVMMinibatchState<ArrayHandle<double> > stateLeft = args[0];
SVMMinibatchState<ArrayHandle<double> > stateRight = args[1];

return std::abs((stateLeft.algo.loss - stateRight.algo.loss)
/ stateLeft.algo.loss);
}
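
The distance above presumably feeds a relative-change stopping test in the driver; a minimal sketch of that usage (hypothetical tolerance and loss values):

#include <cmath>
#include <cstdio>

// Sketch: stop once the relative change in loss between consecutive
// iterations, |(prev - curr) / prev|, falls below a tolerance.
int main() {
    const double tol = 1e-3;                           // hypothetical tolerance
    const double losses[] = {10.0, 4.0, 3.9, 3.8995};  // hypothetical losses
    for (int i = 1; i < 4; ++i) {
        const double d = std::fabs((losses[i - 1] - losses[i]) / losses[i - 1]);
        if (d < tol) {
            std::printf("converged at iteration %d\n", i);
            break;
        }
    }
    return 0;
}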

/**
* @brief Return the difference in RMSE between two states
*/
@@ -199,6 +342,22 @@ internal_linear_svm_igd_result::run(AnyType &args) {
return tuple;
}

/**
* @brief Return the coefficients and diagnostic statistics of the state
*/
AnyType
internal_linear_svm_igd_minibatch_result::run(AnyType &args) {
SVMMinibatchState<ArrayHandle<double> > state = args[0];

AnyType tuple;
tuple << state.task.model
<< static_cast<double>(state.algo.loss)
<< 0.
<< static_cast<int64_t>(state.algo.numRows);

return tuple;
}

} // namespace convex

} // namespace modules
6 changes: 6 additions & 0 deletions src/modules/convex/linear_svm_igd.hpp
@@ -8,26 +8,32 @@
* @brief Linear support vector machine (incremental gradient): Transition function
*/
DECLARE_UDF(convex, linear_svm_igd_transition)
DECLARE_UDF(convex, linear_svm_igd_minibatch_transition)

/**
* @brief Linear support vector machine (incremental gradient): State merge function
*/
DECLARE_UDF(convex, linear_svm_igd_merge)
DECLARE_UDF(convex, linear_svm_igd_minibatch_merge)

/**
* @brief Linear support vector machine (incremental gradient): Final function
*/
DECLARE_UDF(convex, linear_svm_igd_final)
DECLARE_UDF(convex, linear_svm_igd_minibatch_final)

/**
* @brief Linear support vector machine (incremental gradient): Difference in
* log-likelihood between two transition states
*/
DECLARE_UDF(convex, internal_linear_svm_igd_distance)
DECLARE_UDF(convex, internal_linear_svm_igd_minibatch_distance)


/**
* @brief Linear support vector machine (incremental gradient): Convert
* transition state to result tuple
*/
DECLARE_UDF(convex, internal_linear_svm_igd_result)
DECLARE_UDF(convex, internal_linear_svm_igd_minibatch_result)
