Skip to content

Commit 46c95f9

Browse files
Jun Tianwangkuiyi
authored and committed
Adjust default value of max_queue for TF SummaryWriter
GitOrigin-RevId: f37f63f7ca2f685530b93fa93da888cf5fc671a6
1 parent 57e901b commit 46c95f9

156 files changed

Lines changed: 292 additions & 2 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

axlearn/common/summary_writer.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -232,14 +232,15 @@ class Config(BaseWriter.Config):
232232
If a type is not listed, `write_every_n_steps` is used as fallback. Each value must
233233
be a positive integer multiple of `write_every_n_steps`.
234234
max_queue: Configures maximum number of summaries before flush.
235-
If None, uses the `tf_summary` default (10).
235+
Defaults to 1000. The original `tf_summary` default is 10, which is too small
236+
and may cause frequent flushes to GCS, potentially blocking training.
236237
flush_ms: Largest interval between flushes in milliseconds.
237238
If None, uses the `tf_summary` default (120,000, i.e. 2 minutes).
238239
"""
239240

240241
write_every_n_steps: int = 1
241242
write_every_n_steps_map: Optional[dict[SummaryKind, int]] = None
242-
max_queue: Optional[int] = None
243+
max_queue: int = 1000
243244
flush_ms: Optional[float] = None
244245

245246
def __init__(self, cfg: BaseWriter.Config, *, parent: Optional[Module]):

axlearn/experiments/testdata/axlearn.experiments.audio.conformer.librispeech_trainer/conformer-l-rnnt.txt

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ evalers['eval_train'].metric_calculator.klass: 'axlearn.common.evaler.ModelSumma
3535
evalers['eval_train'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
3636
evalers['eval_train'].metric_calculator.model_method: 'forward'
3737
evalers['eval_train'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
38+
evalers['eval_train'].summary_writer.max_queue: 1000
3839
evalers['eval_train'].summary_writer.write_every_n_steps: 1
3940
evalers['eval_dev_clean'].eval_dtype: 'jax.numpy.float32'
4041
evalers['eval_dev_clean'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -63,6 +64,7 @@ evalers['eval_dev_clean'].metric_calculator.klass: 'axlearn.common.evaler.ModelS
6364
evalers['eval_dev_clean'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
6465
evalers['eval_dev_clean'].metric_calculator.model_method: 'forward'
6566
evalers['eval_dev_clean'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
67+
evalers['eval_dev_clean'].summary_writer.max_queue: 1000
6668
evalers['eval_dev_clean'].summary_writer.write_every_n_steps: 1
6769
evalers['eval_dev_other'].eval_dtype: 'jax.numpy.float32'
6870
evalers['eval_dev_other'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -91,6 +93,7 @@ evalers['eval_dev_other'].metric_calculator.klass: 'axlearn.common.evaler.ModelS
9193
evalers['eval_dev_other'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
9294
evalers['eval_dev_other'].metric_calculator.model_method: 'forward'
9395
evalers['eval_dev_other'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
96+
evalers['eval_dev_other'].summary_writer.max_queue: 1000
9497
evalers['eval_dev_other'].summary_writer.write_every_n_steps: 1
9598
evalers['decoder_dev_clean'].eval_dtype: 'jax.numpy.float32'
9699
evalers['decoder_dev_clean'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -128,6 +131,7 @@ evalers['decoder_dev_clean'].metric_calculator.vocab.reverse_extra_ids: True
128131
evalers['decoder_dev_clean'].metric_calculator.vocab.sentencepiece_model_file: '$DATA_DIR/tokenizers/sentencepiece/librispeech_bpe_1024.model'
129132
evalers['decoder_dev_clean'].metric_calculator.vocab.use_fast_tokenizer: False
130133
evalers['decoder_dev_clean'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
134+
evalers['decoder_dev_clean'].summary_writer.max_queue: 1000
131135
evalers['decoder_dev_clean'].summary_writer.write_every_n_steps: 1
132136
evalers['decoder_dev_other'].eval_dtype: 'jax.numpy.float32'
133137
evalers['decoder_dev_other'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -165,6 +169,7 @@ evalers['decoder_dev_other'].metric_calculator.vocab.reverse_extra_ids: True
165169
evalers['decoder_dev_other'].metric_calculator.vocab.sentencepiece_model_file: '$DATA_DIR/tokenizers/sentencepiece/librispeech_bpe_1024.model'
166170
evalers['decoder_dev_other'].metric_calculator.vocab.use_fast_tokenizer: False
167171
evalers['decoder_dev_other'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
172+
evalers['decoder_dev_other'].summary_writer.max_queue: 1000
168173
evalers['decoder_dev_other'].summary_writer.write_every_n_steps: 1
169174
evalers['decoder_test_clean'].eval_dtype: 'jax.numpy.float32'
170175
evalers['decoder_test_clean'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -202,6 +207,7 @@ evalers['decoder_test_clean'].metric_calculator.vocab.reverse_extra_ids: True
202207
evalers['decoder_test_clean'].metric_calculator.vocab.sentencepiece_model_file: '$DATA_DIR/tokenizers/sentencepiece/librispeech_bpe_1024.model'
203208
evalers['decoder_test_clean'].metric_calculator.vocab.use_fast_tokenizer: False
204209
evalers['decoder_test_clean'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
210+
evalers['decoder_test_clean'].summary_writer.max_queue: 1000
205211
evalers['decoder_test_clean'].summary_writer.write_every_n_steps: 1
206212
evalers['decoder_test_other'].eval_dtype: 'jax.numpy.float32'
207213
evalers['decoder_test_other'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -239,6 +245,7 @@ evalers['decoder_test_other'].metric_calculator.vocab.reverse_extra_ids: True
239245
evalers['decoder_test_other'].metric_calculator.vocab.sentencepiece_model_file: '$DATA_DIR/tokenizers/sentencepiece/librispeech_bpe_1024.model'
240246
evalers['decoder_test_other'].metric_calculator.vocab.use_fast_tokenizer: False
241247
evalers['decoder_test_other'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
248+
evalers['decoder_test_other'].summary_writer.max_queue: 1000
242249
evalers['decoder_test_other'].summary_writer.write_every_n_steps: 1
243250
evalers['decoder_train'].eval_dtype: 'jax.numpy.float32'
244251
evalers['decoder_train'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -276,6 +283,7 @@ evalers['decoder_train'].metric_calculator.vocab.reverse_extra_ids: True
276283
evalers['decoder_train'].metric_calculator.vocab.sentencepiece_model_file: '$DATA_DIR/tokenizers/sentencepiece/librispeech_bpe_1024.model'
277284
evalers['decoder_train'].metric_calculator.vocab.use_fast_tokenizer: False
278285
evalers['decoder_train'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
286+
evalers['decoder_train'].summary_writer.max_queue: 1000
279287
evalers['decoder_train'].summary_writer.write_every_n_steps: 1
280288
input.batcher.fn: 'axlearn.common.input_tf_data.batch'
281289
input.batcher.global_batch_size: 2048
@@ -561,4 +569,5 @@ prune_empty_state_updates: True
561569
save_input_iterator: False
562570
start_trace_process_indices[0]: 0
563571
summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
572+
summary_writer.max_queue: 1000
564573
summary_writer.write_every_n_steps: 200

axlearn/experiments/testdata/axlearn.experiments.audio.conformer.librispeech_trainer/conformer-test-ctc.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -250,4 +250,5 @@ prune_empty_state_updates: True
250250
save_input_iterator: False
251251
start_trace_process_indices[0]: 0
252252
summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
253+
summary_writer.max_queue: 1000
253254
summary_writer.write_every_n_steps: 200

axlearn/experiments/testdata/axlearn.experiments.logistic_regression.tutorial/LogisticRegression.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ evalers['eval'].metric_calculator.klass: 'axlearn.common.evaler.ModelSummaryAccu
1818
evalers['eval'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
1919
evalers['eval'].metric_calculator.model_method: 'forward'
2020
evalers['eval'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
21+
evalers['eval'].summary_writer.max_queue: 1000
2122
evalers['eval'].summary_writer.write_every_n_steps: 1
2223
input.klass: 'axlearn.common.input_grain.Input'
2324
input.source.fn: 'axlearn.experiments.logistic_regression.tutorial.build_source'
@@ -51,4 +52,5 @@ prune_empty_state_updates: True
5152
save_input_iterator: False
5253
start_trace_process_indices[0]: 0
5354
summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
55+
summary_writer.max_queue: 1000
5456
summary_writer.write_every_n_steps: 10

axlearn/experiments/testdata/axlearn.experiments.text.gpt.c4_trainer/envy-Switch-Base-single-host.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ evalers['train'].metric_calculator.klass: 'axlearn.common.evaler.ModelSummaryAcc
4141
evalers['train'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
4242
evalers['train'].metric_calculator.model_method: 'forward'
4343
evalers['train'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
44+
evalers['train'].summary_writer.max_queue: 1000
4445
evalers['train'].summary_writer.write_every_n_steps: 1
4546
evalers['validation'].eval_dtype: 'jax.numpy.bfloat16'
4647
evalers['validation'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -71,6 +72,7 @@ evalers['validation'].metric_calculator.klass: 'axlearn.common.evaler.ModelSumma
7172
evalers['validation'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
7273
evalers['validation'].metric_calculator.model_method: 'forward'
7374
evalers['validation'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
75+
evalers['validation'].summary_writer.max_queue: 1000
7476
evalers['validation'].summary_writer.write_every_n_steps: 1
7577
input.batcher.feed_batch_size: 8
7678
input.batcher.fn: 'axlearn.common.input_tf_data.per_feed_batch'

axlearn/experiments/testdata/axlearn.experiments.text.gpt.c4_trainer/envy-Switch-Base.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ evalers['train'].metric_calculator.klass: 'axlearn.common.evaler.ModelSummaryAcc
4040
evalers['train'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
4141
evalers['train'].metric_calculator.model_method: 'forward'
4242
evalers['train'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
43+
evalers['train'].summary_writer.max_queue: 1000
4344
evalers['train'].summary_writer.write_every_n_steps: 1
4445
evalers['validation'].eval_dtype: 'jax.numpy.bfloat16'
4546
evalers['validation'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -69,6 +70,7 @@ evalers['validation'].metric_calculator.klass: 'axlearn.common.evaler.ModelSumma
6970
evalers['validation'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
7071
evalers['validation'].metric_calculator.model_method: 'forward'
7172
evalers['validation'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
73+
evalers['validation'].summary_writer.max_queue: 1000
7274
evalers['validation'].summary_writer.write_every_n_steps: 1
7375
input.batcher.fn: 'axlearn.common.input_tf_data.per_feed_batch'
7476
input.batcher.pad_example_fn: 'axlearn.common.input_tf_data.default_pad_example_fn'

axlearn/experiments/testdata/axlearn.experiments.text.gpt.c4_trainer/envy-Switch-Large.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ evalers['train'].metric_calculator.klass: 'axlearn.common.evaler.ModelSummaryAcc
4040
evalers['train'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
4141
evalers['train'].metric_calculator.model_method: 'forward'
4242
evalers['train'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
43+
evalers['train'].summary_writer.max_queue: 1000
4344
evalers['train'].summary_writer.write_every_n_steps: 1
4445
evalers['validation'].eval_dtype: 'jax.numpy.bfloat16'
4546
evalers['validation'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -69,6 +70,7 @@ evalers['validation'].metric_calculator.klass: 'axlearn.common.evaler.ModelSumma
6970
evalers['validation'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
7071
evalers['validation'].metric_calculator.model_method: 'forward'
7172
evalers['validation'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
73+
evalers['validation'].summary_writer.max_queue: 1000
7274
evalers['validation'].summary_writer.write_every_n_steps: 1
7375
input.batcher.fn: 'axlearn.common.input_tf_data.per_feed_batch'
7476
input.batcher.pad_example_fn: 'axlearn.common.input_tf_data.default_pad_example_fn'

axlearn/experiments/testdata/axlearn.experiments.text.gpt.c4_trainer/envy-Switch-XXL.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ evalers['train'].metric_calculator.klass: 'axlearn.common.evaler.ModelSummaryAcc
4040
evalers['train'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
4141
evalers['train'].metric_calculator.model_method: 'forward'
4242
evalers['train'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
43+
evalers['train'].summary_writer.max_queue: 1000
4344
evalers['train'].summary_writer.write_every_n_steps: 1
4445
evalers['validation'].eval_dtype: 'jax.numpy.bfloat16'
4546
evalers['validation'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -69,6 +70,7 @@ evalers['validation'].metric_calculator.klass: 'axlearn.common.evaler.ModelSumma
6970
evalers['validation'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
7071
evalers['validation'].metric_calculator.model_method: 'forward'
7172
evalers['validation'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
73+
evalers['validation'].summary_writer.max_queue: 1000
7274
evalers['validation'].summary_writer.write_every_n_steps: 1
7375
input.batcher.fn: 'axlearn.common.input_tf_data.per_feed_batch'
7476
input.batcher.pad_example_fn: 'axlearn.common.input_tf_data.default_pad_example_fn'

axlearn/experiments/testdata/axlearn.experiments.text.gpt.c4_trainer/envy-test.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ evalers['train'].metric_calculator.klass: 'axlearn.common.evaler.ModelSummaryAcc
4040
evalers['train'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
4141
evalers['train'].metric_calculator.model_method: 'forward'
4242
evalers['train'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
43+
evalers['train'].summary_writer.max_queue: 1000
4344
evalers['train'].summary_writer.write_every_n_steps: 1
4445
evalers['validation'].eval_dtype: 'jax.numpy.bfloat16'
4546
evalers['validation'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -69,6 +70,7 @@ evalers['validation'].metric_calculator.klass: 'axlearn.common.evaler.ModelSumma
6970
evalers['validation'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
7071
evalers['validation'].metric_calculator.model_method: 'forward'
7172
evalers['validation'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
73+
evalers['validation'].summary_writer.max_queue: 1000
7274
evalers['validation'].summary_writer.write_every_n_steps: 1
7375
input.batcher.fn: 'axlearn.common.input_tf_data.per_feed_batch'
7476
input.batcher.pad_example_fn: 'axlearn.common.input_tf_data.default_pad_example_fn'

axlearn/experiments/testdata/axlearn.experiments.text.gpt.c4_trainer/fuji-1B-v3-flash-fp8-single-host.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ evalers['train'].metric_calculator.klass: 'axlearn.common.evaler.ModelSummaryAcc
4040
evalers['train'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
4141
evalers['train'].metric_calculator.model_method: 'forward'
4242
evalers['train'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
43+
evalers['train'].summary_writer.max_queue: 1000
4344
evalers['train'].summary_writer.write_every_n_steps: 1
4445
evalers['validation'].eval_dtype: 'jax.numpy.bfloat16'
4546
evalers['validation'].eval_policy.fn: 'axlearn.common.evaler.every_n_steps_policy'
@@ -69,6 +70,7 @@ evalers['validation'].metric_calculator.klass: 'axlearn.common.evaler.ModelSumma
6970
evalers['validation'].metric_calculator.metric_accumulator.klass: 'axlearn.common.metrics.MetricAccumulator'
7071
evalers['validation'].metric_calculator.model_method: 'forward'
7172
evalers['validation'].summary_writer.klass: 'axlearn.common.summary_writer.SummaryWriter'
73+
evalers['validation'].summary_writer.max_queue: 1000
7274
evalers['validation'].summary_writer.write_every_n_steps: 1
7375
input.batcher.fn: 'axlearn.common.input_tf_data.per_feed_batch'
7476
input.batcher.pad_example_fn: 'axlearn.common.input_tf_data.default_pad_example_fn'

0 commit comments

Comments
 (0)