Lda v1 #311

Open · wants to merge 6 commits into lda_output_fix_final
1 change: 1 addition & 0 deletions .gitignore
@@ -25,6 +25,7 @@ auto
*.swp
*.fdb_latexmk
*.swo # vim swap file
\#*\# # emacs backup file

# Biblatex temporary files
*-blx.bib
3 changes: 2 additions & 1 deletion src/dbal/EigenIntegration/HandleMap_proto.hpp
@@ -19,6 +19,7 @@ namespace eigen_integration {
template <class EigenType, class Handle, int MapOptions = Eigen::Unaligned>
class HandleMap : public Eigen::Map<EigenType, MapOptions> {
public:
typedef EigenType PlainEigenType;
typedef Eigen::Map<EigenType, MapOptions> Base;
typedef typename Base::Scalar Scalar;
typedef typename Base::Index Index;
@@ -57,7 +58,7 @@ class HandleMap : public Eigen::Map<EigenType, MapOptions> {
*
* For example, this allows construction of MappedColumnVector from
* MappedMatrix::col(int) or NativeColumnVector, etc.
 */
template <class Derived>
HandleMap(const Eigen::MapBase<Derived>& inMappedData,
typename boost::enable_if_c<Derived::IsVectorAtCompileTime>::type* = 0)
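
For intuition, a minimal plain-Eigen sketch (assuming only Eigen, not MADlib's DBAL) of the zero-copy column view that the MapBase constructor above makes implicit:

#include <cassert>
#include <Eigen/Dense>

// Sketch: view one column of a mapped matrix as a vector without copying,
// which is the kind of conversion the MapBase constructor enables.
int main() {
    double buf[6] = {1, 2, 3, 4, 5, 6};
    Eigen::Map<Eigen::MatrixXd> m(buf, 3, 2);            // 3x2 column-major view
    Eigen::Map<Eigen::VectorXd> col0(m.col(0).data(), m.rows());
    assert(col0.sum() == 6.0);                           // sees {1, 2, 3}; no copy
    return 0;
}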
81 changes: 80 additions & 1 deletion src/modules/convex/algo/igd.hpp
@@ -34,7 +34,9 @@ class IGD {
typedef typename Task::model_type model_type;

static void transition(state_type &state, const tuple_type &tuple);
static void transitionInMiniBatch(state_type &state, const tuple_type &tuple);
static void merge(state_type &state, const_state_type &otherState);
static void mergeInPlace(state_type &state, const_state_type &otherState);
static void final(state_type &state);
};

@@ -56,6 +58,62 @@ IGD<State, ConstState, Task>::transition(state_type &state,
state.task.stepsize * tuple.weight);
}

/**
* @brief Update the transition state in mini-batches
*
* Note: We assume that
* 1. Task defines a model_eigen_type
* 2. A batch of tuple.indVar is a Matrix
* 3. A batch of tuple.depVar is a ColumnVector
* 4. Task defines a getLossAndUpdateModel method
*
*/
template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::transitionInMiniBatch(
state_type &state,
const tuple_type &tuple) {

madlib_assert(tuple.indVar.rows() == tuple.depVar.rows(),
std::runtime_error("Invalid data. Independent and dependent "
"batches don't have same number of rows."));

int batch_size = state.algo.batchSize;
int n_epochs = state.algo.nEpochs;

// n_rows/n_ind_cols are the rows/cols in a transition tuple.
int n_rows = tuple.indVar.rows();
int n_ind_cols = tuple.indVar.cols();
int n_batches = n_rows < batch_size ? 1 :
n_rows / batch_size +
int(n_rows%batch_size > 0);

for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) {
double loss = 0.0;
for (int curr_batch=0, curr_batch_row_index=0; curr_batch < n_batches;
curr_batch++, curr_batch_row_index += batch_size) {
Matrix X_batch;
ColumnVector y_batch;
if (curr_batch == n_batches-1) {
// last batch
X_batch = tuple.indVar.bottomRows(n_rows-curr_batch_row_index);
y_batch = tuple.depVar.tail(n_rows-curr_batch_row_index);
} else {
X_batch = tuple.indVar.block(curr_batch_row_index, 0, batch_size, n_ind_cols);
y_batch = tuple.depVar.segment(curr_batch_row_index, batch_size);
}
loss += Task::getLossAndUpdateModel(
state.task.model, X_batch, y_batch, state.task.stepsize);
}

// The first epoch will most likely have the highest loss.
// Being pessimistic, use the total loss only from the first epoch.
if (curr_epoch==0) state.algo.loss += loss;
}
}
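
As a sanity check of the batching arithmetic above, a standalone sketch (plain C++, hypothetical sizes) that reproduces the batch boundaries, including the short final batch:

#include <cstdio>

// Sketch: ceil(n_rows / batch_size) batches, where the last batch picks up
// the n_rows % batch_size remainder, mirroring transitionInMiniBatch.
int main() {
    const int n_rows = 25, batch_size = 10;              // hypothetical sizes
    const int n_batches = n_rows < batch_size ? 1 :
        n_rows / batch_size + int(n_rows % batch_size > 0);
    for (int b = 0, row = 0; b < n_batches; b++, row += batch_size) {
        const int rows_in_batch =
            (b == n_batches - 1) ? n_rows - row : batch_size;
        std::printf("batch %d: rows [%d, %d)\n", b, row, row + rows_in_batch);
    }
    return 0;                                            // [0,10) [10,20) [20,25)
}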


template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::merge(state_type &state,
@@ -84,13 +142,34 @@ IGD<State, ConstState, Task>::merge(state_type &state,
static_cast<double>(totalNumRows);
}

template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::mergeInPlace(state_type &state,
const_state_type &otherState) {
// avoid division by zero
if (state.algo.numRows == 0) {
state.task.model = otherState.task.model;
return;
} else if (otherState.algo.numRows == 0) {
return;
}

// model averaging, weighted by rows seen
double leftRows = static_cast<double>(state.algo.numRows);
double rightRows = static_cast<double>(otherState.algo.numRows);
double totalNumRows = leftRows + rightRows;
state.task.model *= leftRows / rightRows;
state.task.model += otherState.task.model;
state.task.model *= rightRows / totalNumRows;
}
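
The three in-place operations in mergeInPlace compute a row-weighted model average; a scalar sketch (hypothetical values) makes the algebra explicit:

#include <cassert>
#include <cmath>

// Sketch: (m_left * L/R + m_right) * R/(L+R) == (L*m_left + R*m_right)/(L+R),
// i.e. the model average weighted by the rows each state has seen.
int main() {
    const double mLeft = 2.0, mRight = 5.0;   // hypothetical 1-d "models"
    const double L = 30.0, R = 70.0;          // rows seen by each state
    double m = mLeft;
    m *= L / R;
    m += mRight;
    m *= R / (L + R);
    assert(std::fabs(m - (L * mLeft + R * mRight) / (L + R)) < 1e-12);
    return 0;
}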

template <class State, class ConstState, class Task>
void
IGD<State, ConstState, Task>::final(state_type &state) {
// The task.model must remain untouched in the transition function because
// the loss computation needs the model from the last iteration intact.

state.task.model = state.algo.incrModel;

}

} // namespace convex
159 changes: 159 additions & 0 deletions src/modules/convex/linear_svm_igd.cpp
@@ -32,6 +32,10 @@ typedef IGD<GLMIGDState<MutableArrayHandle<double> >,
GLMIGDState<ArrayHandle<double> >,
LinearSVM<GLMModel, GLMTuple > > LinearSVMIGDAlgorithm;

typedef IGD<SVMMinibatchState<MutableArrayHandle<double> >,
SVMMinibatchState<ArrayHandle<double> >,
LinearSVM<GLMModel, SVMMiniBatchTuple > > LinearSVMIGDAlgoMiniBatch;

typedef Loss<GLMIGDState<MutableArrayHandle<double> >,
GLMIGDState<ArrayHandle<double> >,
LinearSVM<GLMModel, GLMTuple > > LinearSVMLossAlgorithm;
@@ -120,6 +124,98 @@ linear_svm_igd_transition::run(AnyType &args) {
return state;
}

/**
* @brief Perform the linear support vector machine transition step
*
* Called for each tuple.
*/
AnyType
linear_svm_igd_minibatch_transition::run(AnyType &args) {
// The real state.
// For the first tuple: args[0] is nothing more than a marker that
// indicates that we should do some initial operations.
// For subsequent tuples: args[0] holds the computation state carried
// over from the previous tuple.
SVMMinibatchState<MutableArrayHandle<double> > state = args[0];

// initialize the state if first tuple
if (state.algo.numRows == 0) {

LinearSVM<GLMModel, GLMTuple >::epsilon = args[9].getAs<double>();
LinearSVM<GLMModel, GLMTuple >::is_svc = args[10].getAs<bool>();
if (!args[3].isNull()) {
SVMMinibatchState<ArrayHandle<double> > previousState = args[3];
state.allocate(*this, previousState.task.nFeatures);
state = previousState;
} else {
// configuration parameters
uint32_t dimension = args[4].getAs<uint32_t>();
state.allocate(*this, dimension); // with zeros
}
// resetting in either case
// state.reset();
state.task.stepsize = args[5].getAs<double>();
const double lambda = args[6].getAs<double>();
const bool isL2 = args[7].getAs<bool>();
const int nTuples = args[8].getAs<int>();

// The regularization operations called below (scaling and clipping)
// need these class variables to be set.
L1<GLMModel>::n_tuples = nTuples;
L2<GLMModel>::n_tuples = nTuples;
if (isL2)
L2<GLMModel>::lambda = lambda;
else
L1<GLMModel>::lambda = lambda;
}

state.algo.nEpochs = args[12].getAs<int>();
state.algo.batchSize = args[13].getAs<int>();

// Skip the current record if args[1] (features) contains NULL values,
// or args[2] is NULL
try {
args[1].getAs<MappedMatrix>();
} catch (const ArrayWithNullException &e) {
return args[0];
}
if (args[2].isNull())
return args[0];

// tuple
using madlib::dbal::eigen_integration::MappedColumnVector;

MappedMatrix x(NULL);
MappedColumnVector y(NULL);
try {
new (&x) MappedMatrix(args[1].getAs<MappedMatrix>());
new (&y) MappedColumnVector(args[2].getAs<MappedColumnVector>());
} catch (const ArrayWithNullException &e) {
return args[0];
}
SVMMiniBatchTuple tuple;
tuple.indVar = trans(x);
tuple.depVar = y;

// Each tuple can be weighted - this can be a combination of the sample
// weight and the class weight. The calling function is responsible for
// combining the two into a single tuple weight. The default value for
// this parameter is 1, set in the definition of "tuple".
// The weight is used to increase the influence of a particular tuple
// during online learning; it is not used for the loss computation.
tuple.weight = args[11].getAs<double>();


// Now do the transition step: mini-batch updates bracketed by the
// regularization operations (L2 scaling before, L1 clipping after)
L2<GLMModel>::scaling(state.task.model, state.task.stepsize);
LinearSVMIGDAlgoMiniBatch::transitionInMiniBatch(state, tuple);
L1<GLMModel>::clipping(state.task.model, state.task.stepsize);

state.algo.numRows += x.cols();
return state;
}
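
The per-tuple order of operations is: L2 scaling of the model, the mini-batch gradient passes, then L1 clipping. A schematic sketch with hypothetical stand-in regularizers (not MADlib's actual L1/L2 implementations) that shows only this ordering:

#include <cmath>

// Hypothetical stand-ins for L2<Model>::scaling and L1<Model>::clipping,
// illustrating only the order of operations around the mini-batch update.
static void l2_scaling(double &w, double stepsize, double lambda, int n) {
    w *= 1.0 - stepsize * lambda / n;                  // shrink (assumed form)
}
static void l1_clipping(double &w, double stepsize, double lambda, int n) {
    const double t = stepsize * lambda / n;            // soft-threshold (assumed form)
    w = (w > t) ? w - t : (w < -t ? w + t : 0.0);
}

int main() {
    double w = 0.8;                                    // hypothetical 1-d model
    const double stepsize = 0.1, lambda = 0.01;
    const int n_tuples = 100;
    l2_scaling(w, stepsize, lambda, n_tuples);         // before the batch updates
    w += -stepsize * 0.5;                              // stand-in gradient step
    l1_clipping(w, stepsize, lambda, n_tuples);        // after the batch updates
    return 0;
}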


/**
* @brief Perform the preliminary aggregation function: merge transition states
*/
@@ -145,6 +241,30 @@ linear_svm_igd_merge::run(AnyType &args) {
return stateLeft;
}

/**
* @brief Perform the preliminary aggregation function: merge transition states
*/
AnyType
linear_svm_igd_minibatch_merge::run(AnyType &args) {
SVMMinibatchState<MutableArrayHandle<double> > stateLeft = args[0];
SVMMinibatchState<ArrayHandle<double> > stateRight = args[1];

// We first handle the trivial case where this function is called with one
// of the states being the initial state
if (stateLeft.algo.numRows == 0) { return stateRight; }
else if (stateRight.algo.numRows == 0) { return stateLeft; }

// Merge states together
LinearSVMIGDAlgoMiniBatch::mergeInPlace(stateLeft, stateRight);

// The numRows update below cannot be moved above the merge because the
// model averaging depends on the original values
stateLeft.algo.numRows += stateRight.algo.numRows;
stateLeft.algo.loss += stateRight.algo.loss;

return stateLeft;
}

/**
* @brief Perform the linear support vector machine final step
*/
@@ -171,6 +291,29 @@ linear_svm_igd_final::run(AnyType &args) {
return state;
}

/**
* @brief Perform the linear support vector machine final step
*/
AnyType
linear_svm_igd_minibatch_final::run(AnyType &args) {
// We request a mutable object. Depending on the backend, this might perform
// a deep copy.
SVMMinibatchState<MutableArrayHandle<double> > state = args[0];
// Aggregates that haven't seen any data just return Null.
if (state.algo.numRows == 0) { return Null(); }
state.algo.loss = state.algo.loss / state.algo.numRows;
return state;
}

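/**
 * @brief Return the relative difference in loss between two states
 */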
AnyType
internal_linear_svm_igd_minibatch_distance::run(AnyType &args) {
SVMMinibatchState<ArrayHandle<double> > stateLeft = args[0];
SVMMinibatchState<ArrayHandle<double> > stateRight = args[1];

return std::abs((stateLeft.algo.loss - stateRight.algo.loss)
/ stateLeft.algo.loss);
}
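
The distance above presumably feeds a relative-change stopping test in the driver; a minimal sketch of that usage (hypothetical tolerance and loss values):

#include <cmath>
#include <cstdio>

// Sketch: stop once the relative change in loss between consecutive
// iterations, |(prev - curr) / prev|, falls below a tolerance.
int main() {
    const double tol = 1e-3;                           // hypothetical tolerance
    const double losses[] = {10.0, 4.0, 3.9, 3.8995};  // hypothetical losses
    for (int i = 1; i < 4; ++i) {
        const double d = std::fabs((losses[i - 1] - losses[i]) / losses[i - 1]);
        if (d < tol) {
            std::printf("converged at iteration %d\n", i);
            break;
        }
    }
    return 0;
}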

/**
* @brief Return the difference in RMSE between two states
*/
@@ -199,6 +342,22 @@ internal_linear_svm_igd_result::run(AnyType &args) {
return tuple;
}

/**
* @brief Return the coefficients and diagnostic statistics of the state
*/
AnyType
internal_linear_svm_igd_minibatch_result::run(AnyType &args) {
SVMMinibatchState<ArrayHandle<double> > state = args[0];

AnyType tuple;
tuple << state.task.model
<< static_cast<double>(state.algo.loss)
<< 0.
<< static_cast<int64_t>(state.algo.numRows);

return tuple;
}

} // namespace convex

} // namespace modules
6 changes: 6 additions & 0 deletions src/modules/convex/linear_svm_igd.hpp
@@ -8,26 +8,32 @@
* @brief Linear support vector machine (incremental gradient): Transition function
*/
DECLARE_UDF(convex, linear_svm_igd_transition)
DECLARE_UDF(convex, linear_svm_igd_minibatch_transition)

/**
* @brief Linear support vector machine (incremental gradient): State merge function
*/
DECLARE_UDF(convex, linear_svm_igd_merge)
DECLARE_UDF(convex, linear_svm_igd_minibatch_merge)

/**
* @brief Linear support vector machine (incremental gradient): Final function
*/
DECLARE_UDF(convex, linear_svm_igd_final)
DECLARE_UDF(convex, linear_svm_igd_minibatch_final)

/**
* @brief Linear support vector machine (incremental gradient): Difference in
* log-likelihood between two transition states
*/
DECLARE_UDF(convex, internal_linear_svm_igd_distance)
DECLARE_UDF(convex, internal_linear_svm_igd_minibatch_distance)


/**
* @brief Linear support vector machine (incremental gradient): Convert
* transition state to result tuple
*/
DECLARE_UDF(convex, internal_linear_svm_igd_result)
DECLARE_UDF(convex, internal_linear_svm_igd_minibatch_result)
