| { | |
| "accumulate_gradients": 1, | |
| "ae_steps": [], | |
| "amp": 1, | |
| "architectures": [ | |
| "XLMModel" | |
| ], | |
| "asm": false, | |
| "attention_dropout": 0.1, | |
| "batch_size": 64, | |
| "beam_size": 1, | |
| "bos_index": 0, | |
| "bos_token_id": 0, | |
| "bptt": 256, | |
| "bt_src_langs": [], | |
| "bt_steps": [], | |
| "causal": false, | |
| "clip_grad_norm": 5, | |
| "clm_steps": [], | |
| "command": "python train.py --local_rank=0 --exp_name unihan_zh_ja --dump_path '/mnt/exp/ft_char' --data_path 'data/processed/xlm_zh_ja/new' --lgs 'zh-ja' --clm_steps '' --mlm_steps 'ja,zh' --emb_dim 1024 --n_layers 12 --n_heads 16 --dropout '0.1' --attention_dropout '0.1' --gelu_activation true --batch_size 64 --bptt 256 --optimizer 'adam_inverse_sqrt,lr=0.00005,warmup_updates=30000,beta1=0.9,beta2=0.999,weight_decay=0.01,eps=0.000001' --epoch_size 300000 --max_epoch 100000 --validation_metrics _valid_mlm_ppl --stopping_criterion '_valid_mlm_ppl,25' --fp16 true --amp 1 --exp_id epoch169 --reload_model '/mnt/exp/hard_pretrain/unihan_zh_ja/recycled/converted-best-valid_mlm_ppl.pth' --exp_id \"epoch169\"", | |
| "context_size": 0, | |
| "data_path": "data/processed/xlm_zh_ja/new", | |
| "debug": false, | |
| "debug_slurm": false, | |
| "debug_train": false, | |
| "dropout": 0.1, | |
| "dump_path": "/mnt/exp/ft_char/unihan_zh_ja/epoch169", | |
| "emb_dim": 1024, | |
| "embed_init_std": 0.02209708691207961, | |
| "encoder_only": true, | |
| "end_n_top": 5, | |
| "eos_index": 1, | |
| "epoch_size": 300000, | |
| "eval_bleu": false, | |
| "eval_only": false, | |
| "exp_id": "epoch169", | |
| "exp_name": "unihan_zh_ja", | |
| "fp16": true, | |
| "gelu_activation": true, | |
| "global_rank": 0, | |
| "group_by_size": true, | |
| "hyp_path": "/mnt/exp/ft_char/unihan_zh_ja/epoch169/hypotheses", | |
| "id2lang": { | |
| "0": "ja", | |
| "1": "zh" | |
| }, | |
| "init_std": 0.02, | |
| "is_encoder": true, | |
| "is_master": true, | |
| "is_slurm_job": false, | |
| "lambda_ae": 1.0, | |
| "lambda_ae_config": null, | |
| "lambda_bt": 1.0, | |
| "lambda_bt_config": null, | |
| "lambda_clm": 1.0, | |
| "lambda_clm_config": null, | |
| "lambda_mlm": 1.0, | |
| "lambda_mlm_config": null, | |
| "lambda_mt": 1.0, | |
| "lambda_mt_config": null, | |
| "lambda_pc": 1.0, | |
| "lambda_pc_config": null, | |
| "lang2id": { | |
| "ja": 0, | |
| "zh": 1 | |
| }, | |
| "lang_id": 0, | |
| "langs": [ | |
| "zh", | |
| "ja" | |
| ], | |
| "layer_norm_eps": 1e-12, | |
| "lg_sampling_factor": -1, | |
| "lgs": "zh-ja", | |
| "local_rank": 0, | |
| "mask_index": 5, | |
| "mask_token_id": 0, | |
| "master_port": -1, | |
| "max_batch_size": 0, | |
| "max_epoch": 100000, | |
| "max_len": 100, | |
| "max_position_embeddings": 512, | |
| "max_vocab": -1, | |
| "min_count": 0, | |
| "mlm_steps": [ | |
| [ | |
| "ja", | |
| null | |
| ], | |
| [ | |
| "zh", | |
| null | |
| ] | |
| ], | |
| "model_type": "xlm", | |
| "mono_dataset": { | |
| "ja": { | |
| "test": "data/processed/xlm_zh_ja/new/test.ja.pth", | |
| "train": "data/processed/xlm_zh_ja/new/train.ja.pth", | |
| "valid": "data/processed/xlm_zh_ja/new/valid.ja.pth" | |
| }, | |
| "zh": { | |
| "test": "data/processed/xlm_zh_ja/new/test.zh.pth", | |
| "train": "data/processed/xlm_zh_ja/new/train.zh.pth", | |
| "valid": "data/processed/xlm_zh_ja/new/valid.zh.pth" | |
| } | |
| }, | |
| "mt_steps": [], | |
| "multi_gpu": true, | |
| "multi_node": false, | |
| "n_gpu_per_node": 8, | |
| "n_heads": 16, | |
| "n_langs": 2, | |
| "n_layers": 12, | |
| "n_nodes": 1, | |
| "node_id": 0, | |
| "optimizer": "adam_inverse_sqrt,lr=0.00005,warmup_updates=30000,beta1=0.9,beta2=0.999,weight_decay=0.01,eps=0.000001", | |
| "pad_index": 2, | |
| "pad_token_id": 2, | |
| "para_dataset": {}, | |
| "pc_steps": [], | |
| "ref_paths": {}, | |
| "reload_checkpoint": "", | |
| "reload_emb": "", | |
| "reload_model": "/mnt/exp/hard_pretrain/unihan_zh_ja/recycled/converted-best-valid_mlm_ppl.pth", | |
| "sample_alpha": 0, | |
| "save_periodic": 0, | |
| "share_inout_emb": true, | |
| "sinusoidal_embeddings": false, | |
| "split_data": false, | |
| "start_n_top": 5, | |
| "stopping_criterion": "_valid_mlm_ppl,25", | |
| "summary_activation": null, | |
| "summary_first_dropout": 0.1, | |
| "summary_proj_to_labels": true, | |
| "summary_type": "first", | |
| "summary_use_proj": true, | |
| "tokens_per_batch": -1, | |
| "unk_index": 3, | |
| "use_lang_emb": true, | |
| "use_memory": false, | |
| "validation_metrics": "_valid_mlm_ppl", | |
| "vocab_size": 24044, | |
| "word_blank": 0, | |
| "word_dropout": 0, | |
| "word_keep": 0.1, | |
| "word_mask": 0.8, | |
| "word_mask_keep_rand": "0.8,0.1,0.1", | |
| "word_pred": 0.15, | |
| "word_rand": 0.1, | |
| "word_shuffle": 0, | |
| "world_size": 8 | |
| } | |