#include "faster-rnnlm/layers/gru_layer.h"

#include "faster-rnnlm/layers/activation_functions.h"


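// Updater for a GRU (gated recurrent unit) hidden layer.
//
// Per-step forward computation, as implemented below (. is elementwise product):
//   reset_t         = sigmoid(W_r x_t + U_r h_{t-1} + b_r)
//   update_t        = sigmoid(W_u x_t + U_u h_{t-1} + b_u)
//   partialhidden_t = reset_t . h_{t-1}
//   quasihidden_t   = tanh(W_q x_t + U_q partialhidden_t)
//   h_t             = update_t . quasihidden_t + (1 - update_t) . h_{t-1}
// The input transforms W_* are applied only if use_input_weights_ is set
// (otherwise the input feeds the gates directly), and the biases b_* only
// if use_bias_ is set.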
class GRULayer::Updater : public IRecUpdater, public TruncatedBPTTMixin<GRULayer::Updater> {
 public:
  explicit Updater(GRULayer* layer)
      : IRecUpdater(layer->weights_.layer_size_)
      , layer_(*layer)
      , reset_(MAX_SENTENCE_WORDS, size_), reset_g_(reset_)
      , update_(MAX_SENTENCE_WORDS, size_), update_g_(update_)
      , partialhidden_(MAX_SENTENCE_WORDS, size_), partialhidden_g_(partialhidden_)
      , quasihidden_(MAX_SENTENCE_WORDS, size_), quasihidden_g_(quasihidden_)

      , syn_reset_in_(&layer_.weights_.syn_reset_in_)
      , syn_reset_out_(&layer_.weights_.syn_reset_out_)
      , syn_update_in_(&layer_.weights_.syn_update_in_)
      , syn_update_out_(&layer_.weights_.syn_update_out_)
      , syn_quasihidden_in_(&layer_.weights_.syn_quasihidden_in_)
      , syn_quasihidden_out_(&layer_.weights_.syn_quasihidden_out_)

      , bias_reset_(&layer_.weights_.bias_reset_)
      , bias_update_(&layer_.weights_.bias_update_)
  {
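    // Step 0 has no previous hidden state; the rows that would be derived
    // from h_{-1} are zeroed once here and are never written by the
    // step != 0 paths below.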
    partialhidden_.row(0).setZero();
    reset_g_.row(0).setZero();
  }

  void ForwardSubSequence(int start, int steps);

  void BackwardSequence(int steps, uint32_t truncation_seed, int bptt_period, int bptt);

  void UpdateWeights(int steps, Real lrate, Real l2reg, Real rmsprop, Real gradient_clipping);

  void BackwardStep(int step);

  void BackwardStepThroughTime(int step);

 private:
  GRULayer& layer_;

  RowMatrix reset_, reset_g_;
  RowMatrix update_, update_g_;
  RowMatrix partialhidden_, partialhidden_g_;
  RowMatrix quasihidden_, quasihidden_g_;

  WeightMatrixUpdater<RowMatrix> syn_reset_in_;
  WeightMatrixUpdater<RowMatrix> syn_reset_out_;
  WeightMatrixUpdater<RowMatrix> syn_update_in_;
  WeightMatrixUpdater<RowMatrix> syn_update_out_;
  WeightMatrixUpdater<RowMatrix> syn_quasihidden_in_;
  WeightMatrixUpdater<RowMatrix> syn_quasihidden_out_;

  WeightMatrixUpdater<RowVector> bias_reset_;
  WeightMatrixUpdater<RowVector> bias_update_;
};

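// Computes gates, candidate states, and outputs for steps [start, start + steps).
// The input-to-hidden products are batched over all rows up front; the
// recurrent terms must be applied sequentially, one step at a time.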
void GRULayer::Updater::ForwardSubSequence(int start, int steps) {
  reset_.middleRows(start, steps) = input_.middleRows(start, steps);
  update_.middleRows(start, steps) = input_.middleRows(start, steps);
  quasihidden_.middleRows(start, steps) = input_.middleRows(start, steps);

  if (layer_.use_input_weights_) {
    reset_.middleRows(start, steps) *= syn_reset_in_.W().transpose();
    update_.middleRows(start, steps) *= syn_update_in_.W().transpose();
    quasihidden_.middleRows(start, steps) *= syn_quasihidden_in_.W().transpose();
  }

  for (int step = start; step < start + steps; ++step) {
    if (layer_.use_bias_) {
      reset_.row(step) += bias_reset_.W();
      update_.row(step) += bias_update_.W();
    }

    if (step != 0) {
      reset_.row(step).noalias() += output_.row(step - 1) * syn_reset_out_.W().transpose();
      update_.row(step).noalias() += output_.row(step - 1) * syn_update_out_.W().transpose();
    }
    SigmoidActivation().Forward(reset_.row(step).data(), size_);
    SigmoidActivation().Forward(update_.row(step).data(), size_);

    if (step != 0) {
      partialhidden_.row(step).noalias() = output_.row(step - 1).cwiseProduct(reset_.row(step));
      quasihidden_.row(step).noalias() +=
          partialhidden_.row(step) * syn_quasihidden_out_.W().transpose();
    }
    TanhActivation().Forward(quasihidden_.row(step).data(), size_);

    if (step == 0) {
      // h_{-1} is zero, so only the update-gated candidate survives.
      output_.row(step).noalias()
          = quasihidden_.row(step).cwiseProduct(update_.row(step));
    } else {
      // These three lines mean:
      //   output_t = (quasihidden_t - output_{t - 1}) * update_t + output_{t - 1}
      output_.row(step).noalias() = quasihidden_.row(step) - output_.row(step - 1);
      output_.row(step).array() *= update_.row(step).array();
      output_.row(step) += output_.row(step - 1);
    }
  }
}

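// Backpropagates through the whole sequence (with truncated BPTT, provided by
// TruncatedBPTTMixin), then accumulates the gradients w.r.t. the layer input.
// Without input weights the input feeds the gates directly, so the gate
// gradients are summed as-is.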
void GRULayer::Updater::BackwardSequence(
    int steps, uint32_t truncation_seed, int bptt_period, int bptt) {
  BackwardSequenceTruncated(steps, truncation_seed, bptt_period, bptt);

  input_g_.topRows(steps).setZero();
  if (layer_.use_input_weights_) {
    input_g_.topRows(steps).noalias() += quasihidden_g_.topRows(steps) * syn_quasihidden_in_.W();
    input_g_.topRows(steps).noalias() += reset_g_.topRows(steps) * syn_reset_in_.W();
    input_g_.topRows(steps).noalias() += update_g_.topRows(steps) * syn_update_in_.W();
  } else {
    input_g_.topRows(steps) += quasihidden_g_.topRows(steps);
    input_g_.topRows(steps) += reset_g_.topRows(steps);
    input_g_.topRows(steps) += update_g_.topRows(steps);
  }
}

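// Propagates the output gradient at a single step back to the gate and
// candidate pre-activations (update_g_, quasihidden_g_, partialhidden_g_,
// reset_g_).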
void GRULayer::Updater::BackwardStep(int step) {
  update_g_.row(step) = quasihidden_.row(step);
  if (step != 0) {
    update_g_.row(step) -= output_.row(step - 1);
  }
  update_g_.row(step).array() *= output_g_.row(step).array();
  SigmoidActivation().Backward(update_.row(step).data(), size_, update_g_.row(step).data());

  quasihidden_g_.row(step) = output_g_.row(step).cwiseProduct(update_.row(step));
  TanhActivation().Backward(quasihidden_.row(step).data(), size_, quasihidden_g_.row(step).data());

  partialhidden_g_.row(step).noalias() = quasihidden_g_.row(step) * syn_quasihidden_out_.W();

  if (step != 0) {
    reset_g_.row(step) = partialhidden_g_.row(step).cwiseProduct(output_.row(step - 1));
  }
  SigmoidActivation().Backward(reset_.row(step).data(), size_, reset_g_.row(step).data());
}


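// Adds the contribution of step `step` to the hidden state gradient of the
// previous step; relies on the per-step gradients computed by BackwardStep.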
void GRULayer::Updater::BackwardStepThroughTime(int step) {
  // These two lines mean: h'_{t - 1} += (1 - u_t) * h'_t
  output_g_.row(step - 1) += output_g_.row(step);
  output_g_.row(step - 1) -= update_.row(step).cwiseProduct(output_g_.row(step));

  output_g_.row(step - 1) += reset_.row(step).cwiseProduct(partialhidden_g_.row(step));
  output_g_.row(step - 1).noalias() += reset_g_.row(step) * syn_reset_out_.W();
  output_g_.row(step - 1).noalias() += update_g_.row(step) * syn_update_out_.W();
}


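// Applies gradient updates to all weight matrices and biases accumulated over
// the given number of steps.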
void GRULayer::Updater::UpdateWeights(
    int steps, Real lrate, Real l2reg, Real rmsprop, Real gradient_clipping) {
  if (steps <= 1 || size_ == 0) {
    return;
  }

  // Recurrent weights: gradients at step t pair with activations at step t - 1,
  // hence the one-row offsets and the steps - 1 row counts.
  UpdateRecurrentSynWeights(steps - 1, lrate, l2reg, rmsprop, gradient_clipping,
      partialhidden_.middleRows(1, steps - 1), quasihidden_g_.middleRows(1, steps - 1),
      &syn_quasihidden_out_);
  UpdateRecurrentSynWeights(steps - 1, lrate, l2reg, rmsprop, gradient_clipping,
      output_, reset_g_.bottomRows(reset_g_.rows() - 1),
      &syn_reset_out_);
  UpdateRecurrentSynWeights(steps - 1, lrate, l2reg, rmsprop, gradient_clipping,
      output_, update_g_.bottomRows(update_g_.rows() - 1),
      &syn_update_out_);

  if (layer_.use_input_weights_) {
    UpdateRecurrentSynWeights(steps, lrate, l2reg, rmsprop, gradient_clipping,
        input_, quasihidden_g_,
        &syn_quasihidden_in_);
    UpdateRecurrentSynWeights(steps, lrate, l2reg, rmsprop, gradient_clipping,
        input_, reset_g_,
        &syn_reset_in_);
    UpdateRecurrentSynWeights(steps, lrate, l2reg, rmsprop, gradient_clipping,
        input_, update_g_,
        &syn_update_in_);
  }

  if (layer_.use_bias_) {
    *bias_reset_.GetGradients() = reset_g_.middleRows(1, steps - 1).colwise().mean();
    bias_reset_.ApplyGradients(lrate, l2reg, rmsprop, gradient_clipping);

    *bias_update_.GetGradients() = update_g_.middleRows(1, steps - 1).colwise().mean();
    bias_update_.ApplyGradients(lrate, l2reg, rmsprop, gradient_clipping);
  }
}


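// Virtual constructor for updaters; the caller takes ownership of the result.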
IRecUpdater* GRULayer::CreateUpdater() {
  return new Updater(this);
}