Skip to content

Commit

Permalink
Merge pull request #805 from AE-Hertz/branch8
Browse files Browse the repository at this point in the history
 Feature request: Image Autoregressive Modeling #768
  • Loading branch information
UppuluriKalyani authored Nov 7, 2024
2 parents 903045d + af48798 commit 9483c5f
Show file tree
Hide file tree
Showing 58 changed files with 7,668 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# @package _group_

# Stage-1 training configuration: task `image_generation_stage1`, model `stlm`.
# NOTE(review): the nesting below is reconstructed. The flattened original had
# every setting at column 0, which leaves each section key null and repeats
# `_name` as a duplicate top-level key. Confirm against the upstream file.

common:
  fp16: true
  log_format: json
  log_interval: 100
  min_loss_scale: 1e-6
  user_dir: fairseq_user

checkpoint:
  save_interval: 1
  save_interval_updates: 0
  keep_last_epochs: 1
  keep_interval_updates: -1
  no_epoch_checkpoints: true

task:
  _name: image_generation_stage1
  input_size: 256
  source_vocab_size: -1
  # Matches the 8k k-means codebook referenced by model.kmeans_path (km_8k.npy).
  target_vocab_size: 8192
  augmentation: randcrop

dataset:
  train_subset: train
  valid_subset: val
  num_workers: 8
  batch_size: 256
  skip_invalid_size_inputs_valid_test: true
  required_batch_size_multiple: 1

criterion:
  _name: ce_loss_ls
  label_smoothing: 0.1

optimization:
  max_update: 125000
  lr: [3e-4]
  update_freq: [1]
  clip_norm: 1.0

optimizer:
  _name: adam
  adam_betas: [0.9, 0.98]
  weight_decay: 0.01

lr_scheduler:
  _name: inverse_sqrt
  # 6250 warmup updates = 5% of optimization.max_update (125000).
  warmup_updates: 6250
  warmup_init_lr: 1e-7

model:
  _name: stlm
  kmeans_path: dataset/ILSVRC2012/dino_short_224_l3/km_8k.npy
  # NOTE(review): assumed to group the `pretrained_enc_*` keys plus `layer`;
  # the original indentation was lost, so verify that `layer` belongs here
  # rather than directly under `model`.
  pretrained_enc_config:
    pretrained_enc_arch: dino_vit_base
    pretrained_enc_path: models/dinov2_vitb14_reg4_pretrain.pth
    pretrained_enc_proj_dim: 256
    pretrained_enc_withproj: true
    layer: 3
  decoder_embed_dim: 1024
  decoder_output_dim: 1024
  decoder_input_dim: 1024
  decoder_ffn_embed_dim: 4096
  decoder_layers: 16
  decoder_attention_heads: 16
  activation_fn: gelu
  decoder_learned_pos: true

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# @package _group_

# Stage-1 training configuration with class conditioning enabled:
# task `image_generation_stage1`, model `stlm`, `model.class_conditional: true`.
# NOTE(review): the nesting below is reconstructed. The flattened original had
# every setting at column 0, which leaves each section key null and repeats
# `_name` as a duplicate top-level key. Confirm against the upstream file.

common:
  fp16: true
  log_format: json
  log_interval: 100
  min_loss_scale: 1e-6
  user_dir: fairseq_user

checkpoint:
  save_interval: 1
  save_interval_updates: 0
  keep_last_epochs: 1
  keep_interval_updates: -1
  no_epoch_checkpoints: true

task:
  _name: image_generation_stage1
  input_size: 256
  source_vocab_size: -1
  # Matches the 8k k-means codebook referenced by model.kmeans_path (km_8k.npy).
  target_vocab_size: 8192
  augmentation: randcrop

dataset:
  train_subset: train
  valid_subset: val
  num_workers: 8
  batch_size: 256
  skip_invalid_size_inputs_valid_test: true
  required_batch_size_multiple: 1

criterion:
  _name: ce_loss_ls
  label_smoothing: 0.1

optimization:
  max_update: 125000
  lr: [3e-4]
  update_freq: [1]
  clip_norm: 1.0

optimizer:
  _name: adam
  adam_betas: [0.9, 0.98]
  weight_decay: 0.01

lr_scheduler:
  _name: inverse_sqrt
  # 6250 warmup updates = 5% of optimization.max_update (125000).
  warmup_updates: 6250
  warmup_init_lr: 1e-7

model:
  _name: stlm
  kmeans_path: dataset/ILSVRC2012/dino_short_224_l3/km_8k.npy
  # NOTE(review): assumed to group the `pretrained_enc_*` keys plus `layer`;
  # the original indentation was lost, so verify that `layer` belongs here
  # rather than directly under `model`.
  pretrained_enc_config:
    pretrained_enc_arch: dino_vit_base
    pretrained_enc_path: models/dinov2_vitb14_reg4_pretrain.pth
    pretrained_enc_proj_dim: 256
    pretrained_enc_withproj: true
    layer: 3
  decoder_embed_dim: 1024
  decoder_output_dim: 1024
  decoder_input_dim: 1024
  decoder_ffn_embed_dim: 4096
  decoder_layers: 16
  decoder_attention_heads: 16
  activation_fn: gelu
  decoder_learned_pos: true
  # NOTE(review): placed under `model` because it follows the decoder keys
  # with no blank line; confirm it is a model option.
  class_conditional: true

Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# @package _group_

# Linear-probe configuration: task `image_generation_stage1` with
# `linear_probe: true`, model `stlm_linearprobe`, SGD with a cosine schedule.
# NOTE(review): the nesting below is reconstructed. The flattened original had
# every setting at column 0, which leaves each section key null and repeats
# `_name` as a duplicate top-level key. Confirm against the upstream file.

common:
  fp16: true
  log_format: json
  log_interval: 100
  min_loss_scale: 1e-6
  user_dir: fairseq_user

checkpoint:
  save_interval: 1
  save_interval_updates: 0
  keep_last_epochs: 1
  keep_interval_updates: -1
  no_epoch_checkpoints: true

task:
  _name: image_generation_stage1
  input_size: 256
  source_vocab_size: -1
  target_vocab_size: 8192
  augmentation: randresizedcrop
  linear_probe: true

dataset:
  train_subset: train
  valid_subset: val
  num_workers: 4
  batch_size: 256
  skip_invalid_size_inputs_valid_test: true
  required_batch_size_multiple: 1

criterion:
  _name: ce_loss_ls
  # Label smoothing is disabled here; accuracy reporting is turned on instead.
  label_smoothing: 0
  report_accuracy: true

# No `lr` is set in this file.
# NOTE(review): presumably supplied via command-line override — confirm.
optimization:
  max_update: 100000
  update_freq: [1]

optimizer:
  _name: sgd_hydra
  momentum: 0.9

lr_scheduler:
  _name: cosine

model:
  _name: stlm_linearprobe
  kmeans_path: dataset/ILSVRC2012/dino_short_224_l3/km_8k.npy
  # NOTE(review): assumed to group the `pretrained_enc_*` keys plus `layer`;
  # the original indentation was lost, so verify that `layer` belongs here
  # rather than directly under `model`.
  pretrained_enc_config:
    pretrained_enc_arch: dino_vit_base
    pretrained_enc_path: models/dinov2_vitb14_reg4_pretrain.pth
    pretrained_enc_proj_dim: 256
    pretrained_enc_withproj: true
    layer: 3
  decoder_embed_dim: 1024
  decoder_output_dim: 1024
  decoder_input_dim: 1024
  decoder_ffn_embed_dim: 4096
  decoder_layers: 16
  decoder_attention_heads: 16
  activation_fn: gelu
  decoder_learned_pos: true
  # Path to a stage-1 checkpoint (`checkpoint_last.pt`).
  # NOTE(review): placed under `model` because it follows the decoder keys
  # with no blank line; confirm it is a model option.
  pretrained_decoder_path: outputs/dino_base_stage1/checkpoints/checkpoint_last.pt

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# @package _group_

# Stage-2 training configuration: task `image_generation_stage2`,
# model `st2vtlm`, with a VQGAN checkpoint and config referenced under `model`.
# NOTE(review): the nesting below is reconstructed. The flattened original had
# every setting at column 0, which leaves each section key null and repeats
# `_name` as a duplicate top-level key. Confirm against the upstream file.

common:
  fp16: true
  log_format: json
  log_interval: 100
  min_loss_scale: 1e-6
  user_dir: fairseq_user

checkpoint:
  save_interval: 1
  save_interval_updates: 0
  keep_last_epochs: 1
  keep_interval_updates: -1
  no_epoch_checkpoints: true

task:
  _name: image_generation_stage2
  input_size: 256
  # Source side matches the 8k k-means codebook (km_8k.npy) under model.kmeans_path.
  source_vocab_size: 8192
  target_vocab_size: 1024

dataset:
  train_subset: train
  valid_subset: val
  num_workers: 8
  batch_size: 128
  skip_invalid_size_inputs_valid_test: true
  required_batch_size_multiple: 1

criterion:
  _name: ce_loss_ls
  label_smoothing: 0.1

optimization:
  max_update: 250000
  lr: [5e-4]
  update_freq: [1]
  clip_norm: 1.0

optimizer:
  _name: adam
  adam_betas: [0.9, 0.98]
  weight_decay: 0.01

lr_scheduler:
  _name: inverse_sqrt
  # 6250 warmup updates = 2.5% of optimization.max_update (250000).
  warmup_updates: 6250
  warmup_init_lr: 1e-7

model:
  _name: st2vtlm
  vqgan_ckpt_path: models/vqgan_jax_strongaug.ckpt
  vqgan_config_path: config/vqgan/vqgan.yaml
  kmeans_path: dataset/ILSVRC2012/dino_short_224_l3/km_8k.npy
  # NOTE(review): assumed to group the `pretrained_enc_*` keys plus `layer`;
  # the original indentation was lost, so verify that `layer` belongs here
  # rather than directly under `model`.
  pretrained_enc_config:
    pretrained_enc_arch: dino_vit_base
    pretrained_enc_path: models/dinov2_vitb14_reg4_pretrain.pth
    pretrained_enc_proj_dim: 256
    pretrained_enc_withproj: true
    layer: 3
  decoder_embed_dim: 1024
  decoder_output_dim: 1024
  decoder_input_dim: 1024
  decoder_ffn_embed_dim: 4096
  decoder_layers: 16
  decoder_attention_heads: 16
  activation_fn: gelu
  decoder_learned_pos: true

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# @package _group_

# Stage-2 training configuration: task `image_generation_stage2`,
# model `st2vtlm_nar`, with a VQGAN checkpoint and config referenced under `model`.
# NOTE(review): the nesting below is reconstructed. The flattened original had
# every setting at column 0, which leaves each section key null and repeats
# `_name` as a duplicate top-level key. Confirm against the upstream file.

common:
  fp16: true
  log_format: json
  log_interval: 100
  min_loss_scale: 1e-6
  user_dir: fairseq_user

checkpoint:
  save_interval: 1
  save_interval_updates: 0
  keep_last_epochs: 1
  keep_interval_updates: -1
  no_epoch_checkpoints: true

task:
  _name: image_generation_stage2
  input_size: 256
  # Source side matches the 8k k-means codebook (km_8k.npy) under model.kmeans_path.
  source_vocab_size: 8192
  target_vocab_size: 1024

dataset:
  train_subset: train
  valid_subset: val
  num_workers: 8
  batch_size: 128
  skip_invalid_size_inputs_valid_test: true
  required_batch_size_multiple: 1

criterion:
  _name: ce_loss_ls
  label_smoothing: 0.1

optimization:
  max_update: 125000
  lr: [5e-4]
  update_freq: [2]
  clip_norm: 1.0

optimizer:
  _name: adam
  adam_betas: [0.9, 0.98]
  weight_decay: 0.01

lr_scheduler:
  _name: inverse_sqrt
  # 6250 warmup updates = 5% of optimization.max_update (125000).
  warmup_updates: 6250
  warmup_init_lr: 1e-7

model:
  _name: st2vtlm_nar
  vqgan_ckpt_path: models/vqgan_jax_strongaug.ckpt
  vqgan_config_path: config/vqgan/vqgan.yaml
  kmeans_path: dataset/ILSVRC2012/dino_short_224_l3/km_8k.npy
  # NOTE(review): assumed to group the `pretrained_enc_*` keys plus `layer`;
  # the original indentation was lost, so verify that `layer` belongs here
  # rather than directly under `model`.
  pretrained_enc_config:
    pretrained_enc_arch: dino_vit_base
    pretrained_enc_path: models/dinov2_vitb14_reg4_pretrain.pth
    pretrained_enc_proj_dim: 256
    pretrained_enc_withproj: true
    layer: 3
  decoder_embed_dim: 1024
  decoder_output_dim: 1024
  decoder_input_dim: 1024
  decoder_ffn_embed_dim: 4096
  decoder_layers: 16
  decoder_attention_heads: 16
  activation_fn: gelu
  decoder_learned_pos: true

Loading

0 comments on commit 9483c5f

Please sign in to comment.