
Commit 52e1756

Author: Anton Bakhtin (committed)
Add a few more command line options
New options: --gradient-clipping (sets the gradient clipping threshold), --learn-recurrent (set to 0 to keep the recurrent weights fixed, turning the RNN into an ESN), --learn-embeddings (set to 0 to freeze the embeddings, which could be useful for fine-tuning).
1 parent b95a279 commit 52e1756
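
The --learn-recurrent description refers to an Echo State Network: the recurrent (hidden-to-hidden) weights stay at their random initialization and only the layers reading out of the hidden state are trained. Purely as an illustration of that kind of gating (none of these names are from the repository; the real wiring is in the training code touched by this commit but not excerpted below):

```cpp
#include <cstddef>

// Illustrative gated SGD step: with learn_recurrent == false the recurrent
// weights are never touched, which is the ESN-style behaviour the commit
// message alludes to.
void MaybeUpdateRecurrent(float* weights, const float* grad, std::size_t n,
                          float lrate, bool learn_recurrent) {
  if (!learn_recurrent) return;  // --learn-recurrent 0: keep the random init fixed
  for (std::size_t i = 0; i < n; ++i) {
    weights[i] += lrate * grad[i];  // plain gradient step on the recurrent weights
  }
}
```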

14 files changed: 160 additions, 170 deletions

AUTHORS

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 The following authors have created the source code of "Faster RNNLM"
 published and distributed by YANDEX LLC as the owner:
 Anton Bakhtin <[email protected]>
-Ilia Edrenkin <[email protected]>
+Ilya Edrenkin <[email protected]>

README.md

Lines changed: 7 additions & 1 deletion
@@ -17,7 +17,7 @@ Run `./build.sh` to download Eigen library and build faster-rnnlm.
 
 To train a simple model with GRU hidden unit and Noise Contrastive Estimation, use the following command:
 
-`./rnnlm -rnnlm model_name -train train.txt -valid validation.txt -hidden 128 -hidden-type gru -nce 20 -alpha 0.01 -rmsprop 0.9`
+`./rnnlm -rnnlm model_name -train train.txt -valid validation.txt -hidden 128 -hidden-type gru -nce 20 -alpha 0.01`
 
 Files train.txt and test.txt must contain one sentence per line. All distinct words that are found in the training file will be used for the nnet vocab, their counts will determine Huffman tree structure and remain fixed for this nnet. If you prefer using limited vocabulary (say, top 1 million words) you should map all other words to <unk> or another token of your choice. Limited vocabulary is usually a good idea if it helps you to have enough training examples for each word.
 
@@ -138,6 +138,12 @@ Optimization options
 --rmsprop <float>
     RMSprop coefficient; rmsprop=1 disables rmsprop and rmsprop=0 equivalent to RMS
     (default: 1)
+--gradient-clipping <float>
+    Clip updates above the value (default: 1)
+--learn-recurrent (0 | 1)
+    Learn hidden layer weights (default: 1)
+--learn-embeddings (0 | 1)
+    Learn embedding weights (default: 1)
 --alpha <float>
     Learning rate for recurrent and embedding weights (default: 0.1)
 --maxent-alpha <float>
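
With these options in place, the training example near the top of the README can be extended to exercise the new flags, for instance `./rnnlm -rnnlm model_name -train train.txt -valid validation.txt -hidden 128 -hidden-type gru -nce 20 -alpha 0.01 --gradient-clipping 5 --learn-embeddings 0`. The flag names follow the help text added above; the values 5 and 0 are illustrative only, not recommendations from this commit.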

faster-rnnlm/Makefile

Lines changed: 2 additions & 1 deletion
@@ -4,6 +4,7 @@ CC = g++
 CFLAGS = -Wall -march=native -funroll-loops -g -D__STDC_FORMAT_MACROS
 CFLAGS += -I../ -DEIGEN_DONT_PARALLELIZE # for Eigen
 CFLAGS += $(shell $(CC) -dumpversion | awk '{if(NR==1 && $$1>="4.6") print "-Ofast -Wno-unused-result"; else print "-O3";}')
+NVCC_CFLAGS = -O3 -march=native -funroll-loops
 LDFLAGS = -lm -lrt
 ifeq ($(NOTHREAD), 1)
 CFLAGS += -DNOTHREAD
@@ -51,7 +52,7 @@ nnet.o : nnet.cc nnet.h maxent.h settings.h hierarchical_softmax.h words.h recur
 	$(CC) $< -c -o $@ $(CFLAGS)
 
 cuda_softmax.o : cuda_softmax.cu cuda_softmax.h settings.h
-	nvcc $< -c -Xcompiler "-Ofast -march=native -funroll-loops" -o $@
+	nvcc $< -c -Xcompiler "$(NVCC_CFLAGS)" -o $@
 
 
 clean:
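
Since the nvcc host-compiler flags are now collected in an ordinary make variable, they can also be overridden at build time without editing the Makefile, e.g. `make NVCC_CFLAGS="-O2"` (an illustrative invocation; any host-compiler flags accepted by -Xcompiler would work).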

faster-rnnlm/cuda_softmax.cu

Lines changed: 0 additions & 4 deletions
@@ -26,7 +26,6 @@ void AssertCudaSuccessLast(const char* message) {
   return AssertCudaSuccess(cudaGetLastError(), message);
 }
 
-extern "C"
 void InitCudaStorage(CudaStorage* cust, size_t layer_size, size_t vocab_size, size_t maxent_hash_size, Real lnz) {
   cust->layer_size = layer_size;
   cust->vocab_size = vocab_size;
@@ -88,7 +87,6 @@ void InitCudaStorage(CudaStorage* cust, size_t layer_size, size_t vocab_size, si
   }
 }
 
-extern "C"
 void FreeCudaStorage(CudaStorage* cust) {
   cudaFree(cust->sm_embedding);
   if (cust->maxent_hash_size != 0) {
@@ -115,7 +113,6 @@ void FreeCudaStorage(CudaStorage* cust) {
   delete cust->inner;
 }
 
-extern "C"
 void UploadNetWeights(CudaStorage* cust, const Real* sm_embedding_cpu, const Real* maxent_cpu) {
   cudaMemcpy(cust->sm_embedding, sm_embedding_cpu, cust->layer_size * cust->vocab_size * sizeof(Real), cudaMemcpyHostToDevice);
   cudaMemcpy(cust->maxent, maxent_cpu, cust->maxent_hash_size * sizeof(Real), cudaMemcpyHostToDevice);
@@ -174,7 +171,6 @@ void CublasMultiply_A_BT(cublasHandle_t* handle, float beta, int rows_a, int row
       dev_c, rows_b);
 }
 
-extern "C"
 void CalculateSoftMax(
     CudaStorage* cust, const Real* hidden_layers,
     const uint64_t* maxent_indices_all, const int* maxent_indices_count_all,

faster-rnnlm/cuda_softmax.h

Lines changed: 0 additions & 4 deletions
@@ -33,16 +33,12 @@ struct CudaStorage {
   CudaStorageInner* inner;
 };
 
-extern "C"
 void InitCudaStorage(CudaStorage* cust, size_t layer_size, size_t vocab_size, size_t maxent_hash_size, Real lnz);
 
-extern "C"
 void FreeCudaStorage(CudaStorage* cust);
 
-extern "C"
 void UploadNetWeights(CudaStorage* cust, const Real* sm_embedding_cpu, const Real* maxent);
 
-extern "C"
 void CalculateSoftMax(CudaStorage* cust, const Real* hidden_layers, const uint64_t* maxent_indices_all, const int* maxent_indices_count_all, size_t sentence_length, const WordIndex* sen, Real* logprobs);
 
 #endif // FASTER_RNNLM_CUDA_SOFTMAX_H_

faster-rnnlm/hierarchical_softmax.cc

Lines changed: 14 additions & 15 deletions
@@ -346,7 +346,7 @@ inline void PropagateNodeForwardByDepth(
 inline void PropagateNodeBackward(
     HSTree* hs, WordIndex target_word, int depth,
     const uint64_t* feature_hashes, int maxent_order,
-    Real lrate, Real maxent_lrate, Real l2reg, Real maxent_l2reg,
+    Real lrate, Real maxent_lrate, Real l2reg, Real maxent_l2reg, Real gradient_clipping,
     const double* state,
     const Real* hidden,
     Real* hidden_grad, MaxEnt* maxent
@@ -357,33 +357,32 @@ inline void PropagateNodeBackward(
   Real branch_gradient[ARITY - 1];
   for (int branch = 0; branch < ARITY - 1; ++branch) {
     const int match = (branch == selected_branch);
-    // gradient of softmax
-    // g = branch_softmax_prob[selected_branch] * (match - branch_softmax_prob[branch]);
     // gradient of logsoftmax
-    // g /= branch_softmax_prob[selected_branch];
-    Real g = (match - (Real) state[branch]);
-    // gradient clipping
-    g = Clip(g, GRAD_CLIPPING);
-    branch_gradient[branch] = g;
+    branch_gradient[branch] = (match - static_cast<Real>(state[branch]));
   }
 
   for (int branch = 0; branch < ARITY - 1; ++branch) {
-    Real g = branch_gradient[branch];
+    Real grad = branch_gradient[branch];
     int child_offset = hs->tree_->GetChildOffsetByDepth(target_word, depth, branch);
+    Real* sm_embedding = hs->weights_.row(child_offset).data();
 
     // Propagate errors output -> hidden
     for (int i = 0; i < hs->layer_size; ++i) {
-      hidden_grad[i] += g * hs->weights_(child_offset, i);
+      hidden_grad[i] += grad * sm_embedding[i];
     }
 
     // Learn weights hidden -> output
     for (int i = 0; i < hs->layer_size; ++i) {
-      Real update = g * lrate * hidden[i] - l2reg * hs->weights_(child_offset, i);
-      hs->weights_(child_offset, i) += Clip(update, GRAD_CLIPPING);
+      Real update = grad * hidden[i];
+      sm_embedding[i] *= (1 - l2reg);
+      sm_embedding[i] += lrate * Clip(update, gradient_clipping);
     }
+
+    // update maxent weights
+    Real maxent_grad = Clip(grad, gradient_clipping);
     for (int order = 0; order < maxent_order; ++order) {
       uint64_t maxent_index = feature_hashes[order] + child_offset;
-      maxent->UpdateValue(maxent_index, maxent_lrate, g, maxent_l2reg);
+      maxent->UpdateValue(maxent_index, maxent_lrate, maxent_grad, maxent_l2reg);
     }
   }
 }
@@ -392,7 +391,7 @@ inline void PropagateNodeBackward(
 Real HSTree::PropagateForwardAndBackward(
     bool calculate_probability, WordIndex target_word,
     const uint64_t* feature_hashes, int maxent_order,
-    Real lrate, Real maxent_lrate, Real l2reg, Real maxent_l2reg,
+    Real lrate, Real maxent_lrate, Real l2reg, Real maxent_l2reg, Real gradient_clipping,
    const Real* hidden,
     Real* hidden_grad, MaxEnt* maxent
 ) {
@@ -406,7 +405,7 @@ Real HSTree::PropagateForwardAndBackward(
 
   PropagateNodeBackward(
       this, target_word, depth, feature_hashes, maxent_order,
-      lrate, maxent_lrate, l2reg, maxent_l2reg,
+      lrate, maxent_lrate, l2reg, maxent_l2reg, gradient_clipping,
       softmax_state,
       hidden, hidden_grad, maxent);
 
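
Read together, the hunk above changes the hidden-to-output update so that L2 decay is applied to the weight directly and only the raw gradient `grad * hidden[i]` is clipped, with the threshold now coming from the new gradient_clipping parameter instead of the compile-time GRAD_CLIPPING constant. A self-contained paraphrase of that inner loop (not the repository's exact code; Clip is assumed to be the scalar clamp used throughout the diff and Real a floating-point typedef):

```cpp
#include <cstddef>

typedef float Real;  // assumption: Real is a float typedef as in the library's settings

// Scalar clamp to [-threshold, threshold], matching how Clip is used in the diff.
inline Real Clip(Real value, Real threshold) {
  if (value > threshold) return threshold;
  if (value < -threshold) return -threshold;
  return value;
}

// Paraphrase of the new hidden->output update for one softmax-tree node row.
void UpdateOutputRow(Real* sm_embedding, const Real* hidden, std::size_t layer_size,
                     Real grad, Real lrate, Real l2reg, Real gradient_clipping) {
  for (std::size_t i = 0; i < layer_size; ++i) {
    Real update = grad * hidden[i];                              // raw gradient for this weight
    sm_embedding[i] *= (1 - l2reg);                              // weight decay, applied to the weight itself
    sm_embedding[i] += lrate * Clip(update, gradient_clipping);  // clipped, scaled gradient step
  }
}
```

Compared with the removed lines, the per-branch gradient is no longer clipped when it is stored in branch_gradient; clipping now happens where the weight and maxent updates are formed.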

faster-rnnlm/hierarchical_softmax.h

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ class HSTree {
   Real PropagateForwardAndBackward(
       bool calculate_probability, WordIndex target_word,
       const uint64_t* feature_hashes, int maxent_order,
-      Real lrate, Real maxent_lrate, Real l2reg, Real maxent_l2reg,
+      Real lrate, Real maxent_lrate, Real l2reg, Real maxent_l2reg, Real gradient_clipping,
       const Real* hidden,
       Real* hidden_grad, MaxEnt* maxent);
 

faster-rnnlm/nce.cc

Lines changed: 5 additions & 4 deletions
@@ -92,7 +92,7 @@ void NCE::Updater::PropagateForwardAndBackward(
     const Ref<const RowVector> hidden, WordIndex target_word,
     const uint64_t* maxent_indices, size_t maxent_size,
     const NoiseSample& sample, Real lrate, Real l2reg,
-    Real maxent_lrate, Real maxent_l2reg,
+    Real maxent_lrate, Real maxent_l2reg, Real gradient_clipping,
     Ref<RowVector> hidden_grad, MaxEnt* maxent) {
 
   Real ln_sample_size = log(sample.size);
@@ -116,14 +116,15 @@ void NCE::Updater::PropagateForwardAndBackward(
 
     // update softmax weights
     embedding_grad_.noalias() = grad * hidden;
-    ClipMatrix(embedding_grad_);
+    ClipMatrix(embedding_grad_, gradient_clipping);
     nce_->sm_embedding_.row(word) *= (1 - l2reg);
     nce_->sm_embedding_.row(word) += embedding_grad_ * lrate;
 
-    // update softmax weights
+    // update maxent weights
+    Real maxent_grad = Clip(grad, gradient_clipping);
     for (size_t i = 0; i < maxent_size; i++) {
       uint64_t maxent_index = get_maxent_index(maxent_indices[i], nce_->maxent_hash_size_, word);
-      maxent->UpdateValue(maxent_index, maxent_lrate, grad, maxent_l2reg);
+      maxent->UpdateValue(maxent_index, maxent_lrate, maxent_grad, maxent_l2reg);
     }
   }
 }
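
The NCE path relies on a matrix variant of the same clipping: the whole embedding-gradient row is clamped before the weight update, and the scalar gradient is clamped once before the maxent loop. A hedged sketch of what the threshold-taking ClipMatrix overload could look like for Eigen row vectors (the repository's own helper lives elsewhere and may differ):

```cpp
#include <Eigen/Dense>

typedef float Real;  // assumption: Real is a float typedef in the library
typedef Eigen::Matrix<Real, 1, Eigen::Dynamic> RowVector;

// Clamp every coefficient of the gradient row to [-threshold, threshold].
inline void ClipMatrix(Eigen::Ref<RowVector> m, Real threshold) {
  m = m.cwiseMax(-threshold).cwiseMin(threshold);
}
```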

faster-rnnlm/nce.h

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ class NCE {
       const Ref<const RowVector> hidden, WordIndex target_word,
       const uint64_t* maxent_indices, size_t maxent_size,
       const NoiseSample& sample, Real lrate, Real l2reg,
-      Real maxent_lrate, Real maxent_l2reg,
+      Real maxent_lrate, Real maxent_l2reg, Real gradient_clipping,
       Ref<RowVector> hidden_grad, MaxEnt* maxent);
 
  private:
