---
# Training/demo configuration: experiment tracking, model checkpoints,
# dataset locations, optimizer, LR schedule and training-loop settings.
#
# NOTE(review): this file was reconstructed from a whitespace-collapsed copy
# in which every key sat on one of two physical lines. Nesting was inferred
# from key order; in particular, confirm against the consuming code that
#   - `gradient_checkpointing` belongs under `model` (not `model.mmada`),
#   - `preprocessing` belongs under `dataset` (not `dataset.params`),
#   - `mask_schedule` is a top-level section (not under `training`).

wandb:
  entity: null
  # run_id: askkz9i2
  resume: 'auto'

experiment:
  project: "demo"
  name: "mmada-demo"
  output_dir: "mmada-demo"

model:
  vq_model:
    type: "magvitv2"
    vq_model_name: "showlab/magvitv2"

  mmada:
    pretrained_model_path: "Gen-Verse/MMaDA-8B-Base"
    w_clip_vit: false
    # 134656 == llm_vocab_size + codebook_size (126464 + 8192).
    new_vocab_size: 134656
    llm_vocab_size: 126464
    codebook_size: 8192
    num_vq_tokens: 1024
    num_new_special_tokens: 0
    tie_word_embeddings: false

  gradient_checkpointing: true

dataset:
  gen_type: "imagenet1k"
  und_type: "captioning"
  combined_loader_mode: "max_size_cycle"

  params:
    train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
    train_mmu_shards_path_or_url:
      - "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar"
      - "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar"
      - "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..01209}.tar"
    train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
    add_caption_prompt: true
    external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
    external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
    external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
    external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
    validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
    shuffle_buffer_size: 1000
    num_workers: 32
    resolution: 512
    pin_memory: true
    persistent_workers: true

  preprocessing:
    max_seq_length: 512  # for text tokens
    resolution: 512
    center_crop: false
    random_flip: false

optimizer:
  name: adamw
  params:  # default adamw params
    # Exponent floats carry an explicit decimal point so that YAML 1.1
    # loaders (which require it) read them as floats rather than strings.
    learning_rate: 5.0e-5
    scale_lr: false  # scale learning rate by total batch size
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.01
    epsilon: 1.0e-8

lr_scheduler:
  scheduler: "cosine"
  params:
    # Interpolation: reuses the optimizer's learning rate so the two
    # values cannot drift apart.
    learning_rate: ${optimizer.params.learning_rate}
    warmup_steps: 8000

training:
  gradient_accumulation_steps: 4
  noise_type: "mask"
  batch_size_t2i: 5
  batch_size_lm: 1
  batch_size_mmu: 2
  mixed_precision: "bf16"
  enable_tf32: true
  seed: 10086
  max_train_steps: 500000
  overfit_one_batch: false
  cond_dropout_prob: 0.1
  min_masking_rate: 0.0
  label_smoothing: 0.0
  max_grad_norm: 1
  guidance_scale: 1.5
  generation_timesteps: 12
  t2i_coeff: 1.0
  lm_coeff: 0.1
  mmu_coeff: 1.0

mask_schedule:
  schedule: "cosine"