
Commit ce03805

Author: Anton Bakhtin
Refactor inner layer classes
1 parent 5707ecf commit ce03805

18 files changed: +1280 −1229 lines

build.sh

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ dst_folder="eigen3"
 function wget_or_curl() {
   [ $# -eq 2 ] || { echo "Usage: wget_or_curl <url> <fpath>" && exit 1; }
-  if type wgt &> /dev/null; then
+  if type wget &> /dev/null; then
     local download_cmd="wget -T 10 -t 3 -O"
   else
     local download_cmd="curl -L -o"

faster-rnnlm/Makefile

Lines changed: 22 additions & 9 deletions
@@ -27,7 +27,8 @@ else
 	LDFLAGS += -lrt
 endif

-OBJ_FILES = rnnlm.o hierarchical_softmax.o nce.o words.o maxent.o nnet.o recurrent.o
+LAYER_OBJ_FILES = layers/simple_layer.o layers/gru_layer.o layers/scrn_layer.o layers/layer_stack.o layers/interface.o
+OBJ_FILES = hierarchical_softmax.o nce.o words.o maxent.o nnet.o $(LAYER_OBJ_FILES)
 ifeq ($(NVCC_TEST), nvcc)
 OBJ_FILES += cuda_softmax.o
 endif
@@ -43,26 +44,38 @@ hierarchical_softmax.o : hierarchical_softmax.cc hierarchical_softmax.h maxent.h
 words.o : words.cc words.h settings.h
 	$(CC) $< -c -o $@ $(CFLAGS)

-rnnlm.o : rnnlm.cc maxent.h nnet.h nce.h settings.h hierarchical_softmax.h words.h recurrent.h util.h program_options.h
+rnnlm.o : rnnlm.cc maxent.h nnet.h nce.h settings.h hierarchical_softmax.h words.h layers/interface.h util.h program_options.h
 	$(CC) $< -c -o $@ $(CFLAGS)

-nce.o : nce.cc nce.h cuda_softmax.h maxent.h recurrent.h settings.h words.h util.h
+nce.o : nce.cc nce.h cuda_softmax.h maxent.h layers/interface.h settings.h words.h util.h
 	$(CC) $< -c -o $@ $(CFLAGS)

 maxent.o : maxent.cc maxent.h settings.h
 	$(CC) $< -c -o $@ $(CFLAGS)

-recurrent.o : recurrent.cc recurrent.h settings.h util.h
-	$(CC) $< -c -o $@ $(CFLAGS)
-
-nnet.o : nnet.cc nnet.h maxent.h settings.h hierarchical_softmax.h words.h recurrent.h util.h
+nnet.o : nnet.cc nnet.h maxent.h settings.h hierarchical_softmax.h words.h layers/interface.h util.h
 	$(CC) $< -c -o $@ $(CFLAGS)

 cuda_softmax.o : cuda_softmax.cu cuda_softmax.h settings.h
 	nvcc $< -c -Xcompiler "$(NVCC_CFLAGS)" -o $@

-tests : tests.cc $(OBJ_FILES)
-	$(CC) $^ -o $@ $(CFLAGS) $(LDFLAGS)
+layers/simple_layer.o : layers/simple_layer.cc layers/simple_layer.h layers/interface.h layers/activation_functions.h layers/util.h
+	$(CC) $< -c -o $@ $(CFLAGS)
+
+layers/scrn_layer.o : layers/scrn_layer.cc layers/scrn_layer.h layers/interface.h layers/util.h layers/activation_functions.h settings.h util.h
+	$(CC) $< -c -o $@ $(CFLAGS)
+
+layers/gru_layer.o : layers/gru_layer.cc layers/gru_layer.h layers/interface.h layers/util.h layers/activation_functions.h settings.h util.h
+	$(CC) $< -c -o $@ $(CFLAGS)
+
+layers/layer_stack.o : layers/layer_stack.cc layers/layer_stack.h layers/interface.h layers/util.h settings.h util.h
+	$(CC) $< -c -o $@ $(CFLAGS)
+
+layers/interface.o : layers/interface.cc layers/interface.h layers/util.h settings.h util.h $(LAYER_OBJ_FILES:.o=.h)
+	$(CC) $< -c -o $@ $(CFLAGS)
+
+test_gradients/test_gradients : test_gradients/test_gradients.cc $(OBJ_FILES)
+	$(CC) $^ -o $@ -std=gnu++0x $(CFLAGS) $(LDFLAGS)

 clean:
 	rm -f rnnlm $(OBJ_FILES)

faster-rnnlm/layers/activation_functions.h

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
#ifndef FASTER_RNNLM_LAYERS_ACTIVATION_FUNCTIONS_H_
#define FASTER_RNNLM_LAYERS_ACTIVATION_FUNCTIONS_H_

#include <math.h>

#include "faster-rnnlm/settings.h"


namespace {
const Real kReLUTruncation = 20;
};


// For each activation function, a class derived from IActivation must define
// two methods, Forward and Backward.
//
// Forward applies the function f to each element of the given array.
// Backward takes f(x) as its first argument and multiplies the elements
// of the second array by f'(x).
struct IActivation {
  virtual void Forward(Real* hidden, int size) = 0;
  virtual void Backward(const Real* hidden, int size, Real* hidden_g) = 0;
  virtual ~IActivation() {}
};


struct SigmoidActivation : public IActivation {
  void Forward(Real* hidden, int size) {
    for (int i = 0; i < size; i++) {
      hidden[i] = exp(hidden[i]) / (1 + exp(hidden[i]));
    }
  }

  void Backward(const Real* hidden, int size, Real* hidden_g) {
    for (int i = 0; i < size; ++i) {
      hidden_g[i] *= hidden[i] * (1 - hidden[i]);
    }
  }
};


struct TanhActivation : public IActivation {
  void Forward(Real* hidden, int size) {
    for (int i = 0; i < size; i++) {
      hidden[i] = tanh(hidden[i]);
    }
  }

  void Backward(const Real* hidden, int size, Real* hidden_g) {
    for (int i = 0; i < size; ++i) {
      hidden_g[i] *= (1 - hidden[i] * hidden[i]);
    }
  }
};


struct ReLUActivation : public IActivation {
  void Forward(Real* hidden, int size) {
    for (int i = 0; i < size; i++) {
      hidden[i] = (hidden[i] > 0) ? hidden[i] : 0;
    }
  }

  void Backward(const Real* hidden, int size, Real* hidden_g) {
    for (int i = 0; i < size; ++i) {
      hidden_g[i] *= static_cast<int>(hidden[i] > 0);
    }
  }
};


struct TruncatedReLUActivation : public IActivation {
  void Forward(Real* hidden, int size) {
    for (int i = 0; i < size; i++) {
      hidden[i] = (hidden[i] > 0 && hidden[i] < kReLUTruncation) ? hidden[i] : 0;
    }
  }

  void Backward(const Real* hidden, int size, Real* hidden_g) {
    for (int i = 0; i < size; ++i) {
      hidden_g[i] *= static_cast<int>(hidden[i] > 0 && hidden[i] < kReLUTruncation);
    }
  }
};

#endif  // FASTER_RNNLM_LAYERS_ACTIVATION_FUNCTIONS_H_
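
For reference, here is a minimal usage sketch (not part of the commit) of the contract these classes implement: Forward overwrites an array of pre-activations with f(x) in place, and Backward then scales an upstream gradient array by f'(x), taking the already-activated values as its first argument. The buffer size and values below are made up for illustration; Real comes from faster-rnnlm/settings.h.

// Hypothetical usage sketch for the IActivation contract (illustration only).
#include <cstdio>

#include "faster-rnnlm/layers/activation_functions.h"

int main() {
  Real hidden[3] = {-1.0, 0.5, 2.0};   // pre-activations x
  Real hidden_g[3] = {1.0, 1.0, 1.0};  // upstream gradients dL/df(x)

  SigmoidActivation sigmoid;
  sigmoid.Forward(hidden, 3);             // hidden[i] becomes f(x_i)
  sigmoid.Backward(hidden, 3, hidden_g);  // hidden_g[i] becomes dL/df(x_i) * f(x_i) * (1 - f(x_i))

  for (int i = 0; i < 3; ++i) {
    std::printf("f(x)=%f  dL/dx=%f\n", hidden[i], hidden_g[i]);
  }
  return 0;
}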

faster-rnnlm/layers/gru_layer.cc

Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
#include "faster-rnnlm/layers/gru_layer.h"

#include "faster-rnnlm/layers/activation_functions.h"


class GRULayer::Updater : public IRecUpdater, public TruncatedBPTTMixin<GRULayer::Updater> {
 public:
  explicit Updater(GRULayer* layer)
      : IRecUpdater(layer->weights_.layer_size_)
      , layer_(*layer)
      , reset_(MAX_SENTENCE_WORDS, size_), reset_g_(reset_)
      , update_(MAX_SENTENCE_WORDS, size_), update_g_(update_)
      , partialhidden_(MAX_SENTENCE_WORDS, size_), partialhidden_g_(partialhidden_)
      , quasihidden_(MAX_SENTENCE_WORDS, size_), quasihidden_g_(quasihidden_)

      , syn_reset_in_(&layer_.weights_.syn_reset_in_)
      , syn_reset_out_(&layer_.weights_.syn_reset_out_)
      , syn_update_in_(&layer_.weights_.syn_update_in_)
      , syn_update_out_(&layer_.weights_.syn_update_out_)
      , syn_quasihidden_in_(&layer_.weights_.syn_quasihidden_in_)
      , syn_quasihidden_out_(&layer_.weights_.syn_quasihidden_out_)

      , bias_reset_(&layer_.weights_.bias_reset_)
      , bias_update_(&layer_.weights_.bias_update_)
  {
    partialhidden_.row(0).setZero();
    reset_g_.row(0).setZero();
  }

  void ForwardSubSequence(int start, int steps);

  void BackwardSequence(int steps, uint32_t truncation_seed, int bptt_period, int bptt);

  void UpdateWeights(int steps, Real lrate, Real l2reg, Real rmsprop, Real gradient_clipping);

  void BackwardStep(int step);

  void BackwardStepThroughTime(int step);

 private:
  GRULayer& layer_;

  RowMatrix reset_, reset_g_;
  RowMatrix update_, update_g_;
  RowMatrix partialhidden_, partialhidden_g_;
  RowMatrix quasihidden_, quasihidden_g_;

  WeightMatrixUpdater<RowMatrix> syn_reset_in_;
  WeightMatrixUpdater<RowMatrix> syn_reset_out_;
  WeightMatrixUpdater<RowMatrix> syn_update_in_;
  WeightMatrixUpdater<RowMatrix> syn_update_out_;
  WeightMatrixUpdater<RowMatrix> syn_quasihidden_in_;
  WeightMatrixUpdater<RowMatrix> syn_quasihidden_out_;

  WeightMatrixUpdater<RowVector> bias_reset_;
  WeightMatrixUpdater<RowVector> bias_update_;
};

void GRULayer::Updater::ForwardSubSequence(int start, int steps) {
  reset_.middleRows(start, steps) = input_.middleRows(start, steps);
  update_.middleRows(start, steps) = input_.middleRows(start, steps);
  quasihidden_.middleRows(start, steps) = input_.middleRows(start, steps);

  if (layer_.use_input_weights_) {
    reset_.middleRows(start, steps) *= syn_reset_in_.W().transpose();
    update_.middleRows(start, steps) *= syn_update_in_.W().transpose();
    quasihidden_.middleRows(start, steps) *= syn_quasihidden_in_.W().transpose();
  }

  for (int step = start; step < start + steps; ++step) {
    if (layer_.use_bias_) {
      reset_.row(step) += bias_reset_.W();
      update_.row(step) += bias_update_.W();
    }

    if (step != 0) {
      reset_.row(step).noalias() += output_.row(step - 1) * syn_reset_out_.W().transpose();
      update_.row(step).noalias() += output_.row(step - 1) * syn_update_out_.W().transpose();
    }
    SigmoidActivation().Forward(reset_.row(step).data(), size_);
    SigmoidActivation().Forward(update_.row(step).data(), size_);

    if (step != 0) {
      partialhidden_.row(step).noalias() = output_.row(step - 1).cwiseProduct(reset_.row(step));
      quasihidden_.row(step).noalias() +=
          partialhidden_.row(step) * syn_quasihidden_out_.W().transpose();
    }
    TanhActivation().Forward(quasihidden_.row(step).data(), size_);

    if (step == 0) {
      output_.row(step).noalias() = quasihidden_.row(step).cwiseProduct(update_.row(step));
    } else {
      // these three lines mean:
      // output_t = (quasihidden_t - output_{t - 1}) * update_t + output_{t - 1}
      output_.row(step).noalias() = quasihidden_.row(step) - output_.row(step - 1);
      output_.row(step).array() *= update_.row(step).array();
      output_.row(step) += output_.row(step - 1);
    }
  }
}

void GRULayer::Updater::BackwardSequence(
    int steps, uint32_t truncation_seed, int bptt_period, int bptt) {
  BackwardSequenceTruncated(steps, truncation_seed, bptt_period, bptt);

  input_g_.topRows(steps).setZero();
  if (layer_.use_input_weights_) {
    input_g_.topRows(steps).noalias() += quasihidden_g_.topRows(steps) * syn_update_in_.W();
    input_g_.topRows(steps).noalias() += reset_g_.topRows(steps) * syn_reset_in_.W();
    input_g_.topRows(steps).noalias() += update_g_.topRows(steps) * syn_update_in_.W();
  } else {
    input_g_.topRows(steps) += quasihidden_g_.topRows(steps);
    input_g_.topRows(steps) += reset_g_.topRows(steps);
    input_g_.topRows(steps) += update_g_.topRows(steps);
  }
}

void GRULayer::Updater::BackwardStep(int step) {
  update_g_.row(step) = quasihidden_.row(step);
  if (step != 0) {
    update_g_.row(step) -= output_.row(step - 1);
  }
  update_g_.row(step).array() *= output_g_.row(step).array();
  SigmoidActivation().Backward(update_.row(step).data(), size_, update_g_.row(step).data());

  quasihidden_g_.row(step) = output_g_.row(step).cwiseProduct(update_.row(step));
  TanhActivation().Backward(quasihidden_.row(step).data(), size_, quasihidden_g_.row(step).data());

  partialhidden_g_.row(step).noalias() = quasihidden_g_.row(step) * syn_quasihidden_out_.W();

  if (step != 0) {
    reset_g_.row(step) = partialhidden_g_.row(step).cwiseProduct(output_.row(step - 1));
  }
  SigmoidActivation().Backward(reset_.row(step).data(), size_, reset_g_.row(step).data());
}


void GRULayer::Updater::BackwardStepThroughTime(int step) {
  // these two lines mean: h'_{t - 1} += (1 - u_t) * h'_t
  output_g_.row(step - 1) += output_g_.row(step);
  output_g_.row(step - 1) -= update_.row(step).cwiseProduct(output_g_.row(step));

  output_g_.row(step - 1) += reset_.row(step).cwiseProduct(partialhidden_g_.row(step));
  output_g_.row(step - 1).noalias() += reset_g_.row(step) * syn_reset_out_.W();
  output_g_.row(step - 1).noalias() += update_g_.row(step) * syn_update_out_.W();
}


void GRULayer::Updater::UpdateWeights(
    int steps, Real lrate, Real l2reg, Real rmsprop, Real gradient_clipping) {
  if (steps <= 1 || size_ == 0) {
    return;
  }

  UpdateRecurrentSynWeights(steps - 1, lrate, l2reg, rmsprop, gradient_clipping,
                            partialhidden_.middleRows(1, steps - 1), quasihidden_g_.middleRows(1, steps - 1),
                            &syn_quasihidden_out_);
  UpdateRecurrentSynWeights(steps - 1, lrate, l2reg, rmsprop, gradient_clipping,
                            output_, reset_g_.bottomRows(reset_g_.rows() - 1),
                            &syn_reset_out_);
  UpdateRecurrentSynWeights(steps - 1, lrate, l2reg, rmsprop, gradient_clipping,
                            output_, update_g_.bottomRows(update_g_.rows() - 1),
                            &syn_update_out_);

  if (layer_.use_input_weights_) {
    UpdateRecurrentSynWeights(steps, lrate, l2reg, rmsprop, gradient_clipping,
                              input_, quasihidden_g_,
                              &syn_quasihidden_in_);
    UpdateRecurrentSynWeights(steps, lrate, l2reg, rmsprop, gradient_clipping,
                              input_, reset_g_,
                              &syn_reset_in_);
    UpdateRecurrentSynWeights(steps, lrate, l2reg, rmsprop, gradient_clipping,
                              input_, update_g_,
                              &syn_update_in_);
  }

  if (layer_.use_bias_) {
    *bias_reset_.GetGradients() = reset_g_.middleRows(1, steps - 1).colwise().mean();
    bias_reset_.ApplyGradients(lrate, l2reg, rmsprop, gradient_clipping);

    *bias_update_.GetGradients() = update_g_.middleRows(1, steps - 1).colwise().mean();
    bias_update_.ApplyGradients(lrate, l2reg, rmsprop, gradient_clipping);
  }
}


IRecUpdater* GRULayer::CreateUpdater() {
  return new Updater(this);
}
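
For readers comparing this with the usual GRU formulation: per step t, ForwardSubSequence computes r_t = sigmoid(W_r x_t + U_r h_{t-1} + b_r), u_t = sigmoid(W_u x_t + U_u h_{t-1} + b_u), a candidate h~_t = tanh(W_q x_t + U_q (r_t * h_{t-1})), and h_t = u_t * h~_t + (1 - u_t) * h_{t-1} (elementwise products), where reset_, update_, quasihidden_, partialhidden_ and output_ hold r, u, h~, r * h_{t-1} and h respectively, and the input-side transforms are applied only when use_input_weights_ is set. The standalone scalar sketch below restates that recurrence for a single step. It is not part of the commit: the MatVec helper and the W_*/U_* names are hypothetical, and biases and the use_input_weights_ switch are omitted.

// Minimal single-step GRU sketch matching the recurrence above (illustration only).
#include <cmath>
#include <cstddef>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>;  // row-major: Mat[i][j]

static Vec MatVec(const Mat& w, const Vec& x) {
  Vec y(w.size(), 0.0);
  for (std::size_t i = 0; i < w.size(); ++i)
    for (std::size_t j = 0; j < x.size(); ++j) y[i] += w[i][j] * x[j];
  return y;
}

// One GRU step: h_t = u_t * h~_t + (1 - u_t) * h_{t-1}
Vec GruStep(const Vec& x, const Vec& h_prev,
            const Mat& W_r, const Mat& U_r,    // reset gate
            const Mat& W_u, const Mat& U_u,    // update gate
            const Mat& W_q, const Mat& U_q) {  // candidate ("quasihidden")
  const std::size_t n = h_prev.size();
  Vec r = MatVec(W_r, x), u = MatVec(W_u, x), hq = MatVec(W_q, x);
  Vec ur = MatVec(U_r, h_prev), uu = MatVec(U_u, h_prev);
  Vec partial(n), h(n);
  for (std::size_t i = 0; i < n; ++i) {
    r[i] = 1.0 / (1.0 + std::exp(-(r[i] + ur[i])));  // reset gate
    u[i] = 1.0 / (1.0 + std::exp(-(u[i] + uu[i])));  // update gate
    partial[i] = r[i] * h_prev[i];                   // "partialhidden"
  }
  Vec uq = MatVec(U_q, partial);
  for (std::size_t i = 0; i < n; ++i) {
    hq[i] = std::tanh(hq[i] + uq[i]);                // "quasihidden" (candidate)
    h[i] = u[i] * hq[i] + (1.0 - u[i]) * h_prev[i];  // new hidden state
  }
  return h;
}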
