Commit 63f8eee

fix doc newdistrib
1 parent 14f8202 commit 63f8eee

17 files changed (82 additions, 167 deletions)

README.md

Lines changed: 5 additions & 2 deletions
@@ -96,8 +96,11 @@ python train.py -data data/demo -save_model demo-model
 
 The main train command is quite simple. Minimally it takes a data file
 and a save file. This will run the default model, which consists of a
-2-layer LSTM with 500 hidden units on both the encoder/decoder. You
-can also add `-gpuid 1` to use (say) GPU 1.
+2-layer LSTM with 500 hidden units on both the encoder/decoder.
+If you want to train on a GPU, you need to set, for example:
+CUDA_VISIBLE_DEVICES=1,3 and
+`-world_size 2 -gpu_ranks 0 1` to use (say) GPUs 1 and 3 on this node only.
+To learn more about distributed training on single or multiple nodes, read the FAQ section.
 
 ### Step 3: Translate
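For reference, the GPU instructions added in this hunk boil down to a command like the following (a sketch only: it assumes the demo data from the quickstart and that the OS GPUs 1 and 3 are the ones to use):

```bash
# Expose OS GPUs 1 and 3 to the process; inside the job they become ranks 0 and 1.
export CUDA_VISIBLE_DEVICES=1,3
# Train the default 2-layer LSTM on both visible GPUs of this single node.
python train.py -data data/demo -save_model demo-model \
    -world_size 2 -gpu_ranks 0 1
```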

data/README.md

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@
 
 > python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000
 
-> python train.py -data data/data -save_model /n/rush_lab/data/tmp_ -gpuid 0 -rnn_size 100 -word_vec_size 50 -layers 1 -train_steps 100 -optim adam -learning_rate 0.001
+> python train.py -data data/data -save_model /n/rush_lab/data/tmp_ -world_size 1 -gpu_ranks 0 -rnn_size 100 -word_vec_size 50 -layers 1 -train_steps 100 -optim adam -learning_rate 0.001

docs/source/FAQ.md

Lines changed: 13 additions & 11 deletions
@@ -62,21 +62,22 @@ python train.py -save_model data/model \
 ```
 
 
-## How do I use the Transformer model?
+## How do I use the Transformer model? Do you support multi-gpu?
 
 The transformer model is very sensitive to hyperparameters. To run it
 effectively you need to set a bunch of different options that mimic the Google
 setup. We have confirmed the following command can replicate their WMT results.
 
 ```
-python train.py -data /tmp/de2/data -save_model /tmp/extra -gpuid 1 \
+python train.py -data /tmp/de2/data -save_model /tmp/extra \
 -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 \
 -encoder_type transformer -decoder_type transformer -position_encoding \
 -train_steps 200000 -max_generator_batches 2 -dropout 0.1 \
 -batch_size 4096 -batch_type tokens -normalization tokens -accum_count 2 \
 -optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 2 \
 -max_grad_norm 0 -param_init 0 -param_init_glorot \
--label_smoothing 0.1 -valid_steps 10000 -save_checkpoint_steps 10000 -gpuid 0 1 2 3
+-label_smoothing 0.1 -valid_steps 10000 -save_checkpoint_steps 10000 \
+-world_size 4 -gpu_ranks 0 1 2 3
 ```
 
 Here are what each of the parameters mean:
@@ -87,16 +88,17 @@ Here are what each of the parameters mean:
 * `batch_type tokens`, `normalization tokens`, `accum_count 4`: batch and normalize based on number of tokens and not sentences. Compute gradients based on four batches.
 - `label_smoothing 0.1`: use label smoothing loss.
 
-* `gpuid 0 1 2 3 accum_count 2`: This will use 4 GPU and accumulate over 2 batches before updating parameters, this will emulate using 8 GPUS.
-
-
-## Do you support multi-gpu?
-
-Yes !
+Multi-GPU settings:
 First you need to make sure you export CUDA_VISIBLE_DEVICES=0,1,2,3
-Then use -gpuid 0 1 2 3
 If you want to use GPU id 1 and 3 of your OS, you will need to export CUDA_VISIBLE_DEVICES=1,3
-then use -gpuid 0 1
+* `world_size 4 gpu_ranks 0 1 2 3`: This will use 4 GPUs on this node only.
+
+If you want to use 2 nodes with 2 GPUs each, you need to set -master_ip and -master_port, and
+* `world_size 4 gpu_ranks 0 1`: on the first node
+* `world_size 4 gpu_ranks 2 3`: on the second node
+* `accum_count 2`: This will accumulate over 2 batches before updating parameters.
+
+If you use a regular network card (1 Gbps), we suggest using a higher accum_count to minimize inter-node communication.
 
 ## How can I ensemble Models at inference?
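For illustration, the two-node setup described in the updated FAQ text could look roughly like the commands below. This is a sketch only: 10.0.0.1 and 10000 are placeholder values for the master node's address and port, each node is assumed to expose two GPUs, and the Transformer model options from the command above are omitted for brevity.

```bash
# Node 1: hosts the master process and runs ranks 0 and 1 on its two GPUs.
export CUDA_VISIBLE_DEVICES=0,1
python train.py -data /tmp/de2/data -save_model /tmp/extra \
    -world_size 4 -gpu_ranks 0 1 -accum_count 2 \
    -master_ip 10.0.0.1 -master_port 10000

# Node 2: runs ranks 2 and 3, pointing at the same master address and port.
export CUDA_VISIBLE_DEVICES=0,1
python train.py -data /tmp/de2/data -save_model /tmp/extra \
    -world_size 4 -gpu_ranks 2 3 -accum_count 2 \
    -master_ip 10.0.0.1 -master_port 10000
```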

docs/source/Summarization.md

Lines changed: 4 additions & 2 deletions
@@ -94,7 +94,8 @@ python train.py -save_model models/cnndm \
 -copy_loss_by_seqlength \
 -bridge \
 -seed 777 \
--gpuid X
+-world_size 2 \
+-gpu_ranks 0 1
 ```
 
 (2) CNNDM Transformer
@@ -129,7 +130,8 @@ python -u train.py -data data/cnndm/CNNDM \
 -share_embeddings \
 -copy_attn \
 -param_init_glorot \
--gpuid 3
+-world_size 2 \
+-gpu_ranks 0 1
 ```
 
 (3) Gigaword

docs/source/extended.md

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ python preprocess.py -train_src data/multi30k/train.en.atok -train_tgt data/mult
 Step 2. Train the model.
 
 ```bash
-python train.py -data data/multi30k.atok.low -save_model multi30k_model -gpuid 0
+python train.py -data data/multi30k.atok.low -save_model multi30k_model -gpu_ranks 0
 ```
 
 Step 3. Translate sentences.

docs/source/im2text.md

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ python preprocess.py -data_type img -src_dir data/im2text/images/ -train_src dat
 2) Train the model.
 
 ```
-python train.py -model_type img -data data/im2text/demo -save_model demo-model -gpuid 0 -batch_size 20 \
+python train.py -model_type img -data data/im2text/demo -save_model demo-model -gpu_ranks 0 -batch_size 20 \
 -max_grad_norm 20 -learning_rate 0.1 -word_vec_size 80 -encoder_type brnn
 ```

docs/source/options/train.md

Lines changed: 20 additions & 11 deletions
@@ -116,15 +116,18 @@ Path prefix to the ".train.pt" and ".valid.pt" file path from preprocess.py
 Model filename (the model will be saved as <save_model>_epochN_PPL.pt where PPL
 is the validation perplexity
 
-* **-gpuid []**
-Use CUDA on the listed devices.
+* **-world_size [1]**
+Total number of GPU processes across several nodes.
+
+* **-gpu_ranks []**
+Indices of this node's processes in the total number of processes across several nodes.
 
 * **-seed [-1]**
 Random seed used for the experiments reproducibility.
 
 ### **Initialization**:
-* **-start_epoch [1]**
-The epoch from which to start
+* **-train_steps [100000]**
+Number of iterations (parameter updates) for training
 
 * **-param_init [0.1]**
 Parameters are initialized over uniform distribution with support (-param_init,
@@ -169,13 +172,13 @@ batch_size * accum_count batches at once. Recommended for Transformer.
 * **-valid_batch_size [32]**
 Maximum batch size for validation
 
+* **-valid_steps [10000]**
+Run a validation every this many steps
+
 * **-max_generator_batches [32]**
 Maximum batches of words in a sequence to run the generator on in parallel.
 Higher is faster, but uses more memory.
 
-* **-epochs [13]**
-Number of training epochs
-
 * **-optim [sgd]**
 Optimization method.
 
@@ -222,11 +225,17 @@ Starting learning rate. Recommended settings: sgd = 1, adagrad = 0.1, adadelta =
 If update_learning_rate, decay learning rate by this much if (i) perplexity does
 not decrease on the validation set or (ii) epoch has gone past start_decay_at
 
-* **-start_decay_at [8]**
-Start decaying every epoch after and including this epoch
+* **-start_decay_steps [50000]**
+Start decaying the learning rate after this many steps
+
+* **-decay_steps [10000]**
+Decay every this many steps (after start_decay_steps)
+
+* **-save_checkpoint_steps [5000]**
+Save a checkpoint every this many steps
 
-* **-start_checkpoint_at []**
-Start checkpointing every epoch after and including this epoch
+* **-keep_checkpoint [-1]**
+Keep the N last checkpoints. -1 = keep all.
 
 * **-decay_method []**
 Use a custom decay rate.
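For reference, the step-based options documented above could be combined in a single command like this (a sketch using the documented default values and the demo data paths from the README, not a tuning recommendation):

```bash
# All step counts below are the documented defaults; -keep_checkpoint -1 keeps every checkpoint.
python train.py -data data/demo -save_model demo-model \
    -world_size 1 -gpu_ranks 0 \
    -train_steps 100000 -valid_steps 10000 \
    -start_decay_steps 50000 -decay_steps 10000 \
    -save_checkpoint_steps 5000 -keep_checkpoint -1
```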

docs/source/quickstart.md

Lines changed: 5 additions & 2 deletions
@@ -35,8 +35,11 @@ python train.py -data data/demo -save_model demo-model
 
 The main train command is quite simple. Minimally it takes a data file
 and a save file. This will run the default model, which consists of a
-2-layer LSTM with 500 hidden units on both the encoder/decoder. You
-can also add `-gpuid 1` to use (say) GPU 1.
+2-layer LSTM with 500 hidden units on both the encoder/decoder.
+If you want to train on a GPU, you need to set, for example:
+CUDA_VISIBLE_DEVICES=1,3 and
+`-world_size 2 -gpu_ranks 0 1` to use (say) GPUs 1 and 3 on this node only.
+To learn more about distributed training on single or multiple nodes, read the FAQ section.
 
 ### Step 3: Translate

docs/source/speech2text.md

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ python preprocess.py -data_type audio -src_dir data/speech/an4_dataset -train_sr
 2) Train the model.
 
 ```
-python train.py -model_type audio -data data/speech/demo -save_model demo-model -gpuid 0 -batch_size 16 -max_grad_norm 20 -learning_rate 0.1 -learning_rate_decay 0.98 -train_steps 100000
+python train.py -model_type audio -data data/speech/demo -save_model demo-model -gpu_ranks 0 -batch_size 16 -max_grad_norm 20 -learning_rate 0.1 -learning_rate_decay 0.98 -train_steps 100000
 ```
 
 3) Translate the speechs.

onmt/inputters/inputter.py

Lines changed: 1 addition & 2 deletions
@@ -488,8 +488,7 @@ def batch_size_fn(new, count, sofar):
             return max(src_elements, tgt_elements)
     else:
         batch_size_fn = None
-    # device = opt.device_id if opt.gpuid else -1
-    # breaking change torchtext 0.3
+
     if opt.gpu_ranks:
         device = "cuda"
     else: