| accum_freq: 2 | |
| attn_activation: None | |
| attn_name: auto | |
| attn_seq_scalar: None | |
| attn_seq_scalar_alpha: None | |
| average: None | |
| average_coefficients: None | |
| beta1: 0.9 | |
| beta2: 0.95 | |
| checkpoint_path: logs/17288/rpj-d=1024_l=24_h=8-16.0/checkpoints | |
| copy_codebase: False | |
| data_key: txt | |
| dataset_manifest: None | |
| dataset_resampled: False | |
| dataset_type: auto | |
| ddp_static_graph: False | |
| debug: False | |
| delete_previous_checkpoint: True | |
| device: cuda:0 | |
| disable_buffer: False | |
| dist_backend: nccl | |
| dist_url: env:// | |
| distill_model: None | |
| distill_pretrained: None | |
| distributed: True | |
| epochs: 5 | |
| epochs_cooldown: None | |
| eps: 1e-08 | |
| experimental_meta_device: False | |
| ffn_type: swiglu | |
| force_distributed: False | |
| force_min_lr: 0.0 | |
| fsdp: False | |
| fsdp_amp: False | |
| fsdp_backward_prefetch: False | |
| fsdp_checkpoint: False | |
| fsdp_cpu_offload: False | |
| fsdp_hybrid: False | |
| fsdp_hybrid_o2: False | |
| fsdp_limit_all_gathers: False | |
| fsdp_pure_bf16: False | |
| fsdp_use_orig_params: False | |
| global_batch_size: 32 | |
| global_val_batch_size: 16 | |
| grad_checkpointing: False | |
| grad_clip_norm: 1.0 | |
| hf_fsdp_block: None | |
| hf_model: None | |
| hf_seq_len: None | |
| ignore_parse_errors: False | |
| load_pretrained_state: False | |
| local_rank: 0 | |
| log_every_n_steps: 20 | |
| log_level: 20 | |
| log_local: False | |
| log_logit_mean: False | |
| log_path: logs/17288/rpj-d=1024_l=24_h=8-16.0/out.log | |
| logs: logs/17288 | |
| lr: 0.003 | |
| lr_cooldown_end: 3e-05 | |
| lr_cooldown_power: 1.0 | |
| lr_scheduler: cosine | |
| model: d=1024_l=24_h=8 | |
| model_norm: gain_only_lp_layer_norm | |
| moe_capacity_factor: 1.25 | |
| moe_expert_model_parallelism: False | |
| moe_freq: 0 | |
| moe_loss_weight: 0.1 | |
| moe_num_experts: None | |
| moe_top_k: 2 | |
| moe_weight_parallelism: False | |
| multiple_data_passes: False | |
| name: rpj-d=1024_l=24_h=8-16.0 | |
| no_set_device_rank: False | |
| optimizer: adamw | |
| per_gpu_batch_size: 16 | |
| per_gpu_val_batch_size: 8 | |
| positional_embedding_type: rotary | |
| precision: amp_bfloat16 | |
| pretrained: None | |
| qk_norm: True | |
| rank: 0 | |
| remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 | |
| remote_sync_frequency: 300 | |
| remote_sync_protocol: s3 | |
| report_to: | |
| resume: s3://dcnlp-west/dcnlp_experiments_v3/rpj-d=1024_l=24_h=8-16.0/checkpoints/epoch_6.pt | |
| save_frequency: 1 | |
| save_most_recent: False | |
| seed: 124 | |
| seq_len: 2048 | |
| skip_scheduler: False | |
| squash_mask_left: True | |
| target_mask_individual: 50400 | |
| target_mask_left: 50300 | |
| tensorboard: False | |
| tensorboard_path: | |
| torchcompile: False | |
| torchscript: False | |
| trace: False | |
| train_data: None | |
| train_data_mix_weights: None | |
| train_data_upsampling_factors: None | |
| train_num_samples: None | |
| use_bn_sync: False | |
| use_bnb_linear: None | |
| val_data: ['training/eval_data/val_tok_mult/openlm/shard_00000000.tar', 'training/eval_data/c4_val/shard-{0000000..0000010}.tar', 'training/eval_data/val_tok_mult/paloma_4chan_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_100_domains/00000001.tar', 'training/eval_data/val_tok_mult/paloma_c4_en/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma-v1_5/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_programing_languages/00000001.tar', 'training/eval_data/val_tok_mult/paloma_dolma_100_subreddits/00000001.tar', 'training/eval_data/val_tok_mult/paloma_falcon-refinedweb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_gab/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_s2orc_unsplit_dedup/00000001.tar', 'training/eval_data/val_tok_mult/paloma_m2d2_wikipedia_unsplit/00000001.tar', 'training/eval_data/val_tok_mult/paloma_manosphere_meta_sep/00000001.tar', 'training/eval_data/val_tok_mult/paloma_mc4/00000001.tar', 'training/eval_data/val_tok_mult/paloma_ptb/00000001.tar', 'training/eval_data/val_tok_mult/paloma_redpajama/00000001.tar', 'training/eval_data/val_tok_mult/paloma_twitterAAE_HELM_fixed/00000001.tar', 'training/eval_data/val_tok_mult/paloma_wikitext_103/00000001.tar', '/admin/home-sy/dcnlp/training/eval_data/mmlu/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/hellaswag/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/jeopardy_all/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/triviaqa_sm_sub/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/gsm8k/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_math/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/aqua/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/svamp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_qa_wikidata/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_easy/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/arc_challenge/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_misconceptions/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/copa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/siqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/commonsense_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/piqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/openbook_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_novel_concepts/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strange_stories/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_strategy_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/lambada_openai/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winograd_wsc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogrande/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conlang_translation/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_language_identification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_conceptual_combinations/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_elementary_math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_dyck_languages/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_ar/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_cs_algorithms/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_logical_deduction/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_operators/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_repeat_copy_logic/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_nospaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/simple_arithmetic_withspaces/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/math_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/logi_qa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/pubmed_qa_labeled/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/squad/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_rc/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_lsat_lr/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/coqa/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bigbench_understanding_fables/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/boolq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/agi_eval_sat_en/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_female/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/winogender_mc_male/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/enterprise_pii_classification/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/bbq/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_complex/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval_return_simple/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.5/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.25/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval-0.75/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/human_eval/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_cpp/shard-0000000.tar', '/admin/home-sy/dcnlp/training/eval_data/processed_human_eval_js/shard-0000000.tar'] | |
| val_data_key: ['json', 'txt', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt', 'txt'] | |
| val_frequency: 5 | |
| val_iter_ci: 10000 | |
| val_max_pop_ci: 300000 | |
| val_num_samples: None | |
| val_seq_ci: True | |
| val_tok_ci: True | |
| vocab_size: 50432 | |
| wandb: False | |
| wandb_notes: | |
| wandb_project_name: open-lm | |
| warmup: 2000 | |
| wd: 0.033 | |
| workers: 2 | |
| world_size: 2 | |
| z_loss_coefficient: 0.0001 | |