Commit e758a08
Parent(s): c44db9e

Create sbatch_4b284b84b20c4py.sh

sbatch_4b284b84b20c4py.sh    +164 -0    ADDED

@@ -0,0 +1,164 @@
#!/bin/bash
#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=40
#SBATCH --mem=256G
#SBATCH -p standard-g
#SBATCH -t 48:00:00
#SBATCH --gpus-per-node=mi250:8
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000119
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err

VARIANT=4b284b84b20c4py

# if run without sbatch, invoke here
if [ -z $SLURM_JOB_ID ]; then
    mkdir -p logs
    sbatch "$0"
    exit
fi

set -euo pipefail

# symlink logs/latest.out and logs/latest.err
ln -f -s $SLURM_JOB_ID.out logs/latest.out
ln -f -s $SLURM_JOB_ID.err logs/latest.err

KILL_SWITCH_PATH=kill-switch-$VARIANT
CHECKPOINT_PATH=checkpoints_$VARIANT
TENSORBOARD_PATH=tensorboard_$VARIANT

# Data
VOCAB_FILE="gpt2/vocab.json"
MERGE_FILE="gpt2/merges.txt"

TRAIN_DATA_PATH=train84b20c4py.txt
# "train: 0.2 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.8 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document"
VALID_DATA_PATH=valc4py.txt
# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1  /scratch/project_462000119/data/python/gpt2tok_python_content_document"


PP_SIZE=1
TP_SIZE=2

MICRO_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=1
WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES))
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS))

# Model parameters
source model_params.sh
MODEL_PARAM=("${PARAM_4516M[@]}")
NHIDDEN=${MODEL_PARAM[0]}
FFN_HIDDEN_SIZE=${MODEL_PARAM[1]}
KV_SIZE=${MODEL_PARAM[2]}
NHEADS=${MODEL_PARAM[3]}
NLAYERS=${MODEL_PARAM[4]}
SEQ_LEN=2048

echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS"

SAVE_INTERVAL=10000

# Tokens: 84_000_000_000
# -> Samples: 41_015_625.0
TRAIN_SAMPLES=41_015_625

OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 2e-4 \
    --min-lr 2e-5 \
    --lr-decay-style cosine \
    --lr-decay-samples $TRAIN_SAMPLES \
    --lr-warmup-samples 410_156 \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "

GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --kv-channels $KV_SIZE \
    --ffn-hidden-size $FFN_HIDDEN_SIZE \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-samples $TRAIN_SAMPLES \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --clip-grad 1.0 \
    --kill-switch-path $KILL_SWITCH_PATH \
    --bf16 \
    $OPTIMIZER_ARGS \
    "

OUTPUT_ARGS=" \
    --log-interval 10 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 5000 \
    --eval-iters 10 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "

ZERO_STAGE=0

mkdir -p ds_configs
DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json"

cat <<EOF > $DS_CONFIG_PATH
{
    "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
    "train_batch_size": $GLOBAL_BATCH_SIZE,
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": $ZERO_STAGE
    },
    "bf16": {
        "enabled": true
    },
    "steps_per_print": 2000,
    "wall_clock_breakdown": false
}
EOF

DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config $DS_CONFIG_PATH \
    --zero-stage $ZERO_STAGE \
    "

CMD=" \
    Megatron-DeepSpeed/pretrain_gpt.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $OUTPUT_ARGS \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths-path $TRAIN_DATA_PATH \
    --valid-weighted-split-paths-path $VALID_DATA_PATH \
    --data-impl mmap \
    $DEEPSPEED_ARGS \
    "

echo $CMD

echo "START $SLURM_JOBID: $(date)"

# bash launch_srun.sh $CMD
srun --label launch.sh $CMD

echo "END $SLURM_JOBID: $(date)"
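
For context (not part of the committed file): with the resources requested in the #SBATCH header, 32 nodes with 8 MI250 GPUs each, the batch-size arithmetic in the script works out as in the sketch below. It assumes the allocation really yields SLURM_GPUS_ON_NODE=8 and SLURM_JOB_NUM_NODES=32 at run time.

    #!/bin/bash
    # Standalone sanity check of the values computed in the script above,
    # assuming the requested 32 x 8 GPU allocation.
    NODES=32
    GPUS_PER_NODE=8
    MICRO_BATCH_SIZE=2
    GRADIENT_ACCUMULATION_STEPS=1
    TP_SIZE=2
    PP_SIZE=1
    SEQ_LEN=2048
    TOKENS=84000000000

    WORLD_SIZE=$((GPUS_PER_NODE * NODES))                                               # 256 GPUs in total
    GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE * WORLD_SIZE * GRADIENT_ACCUMULATION_STEPS))  # 512 sequences per step
    DP_SIZE=$((WORLD_SIZE / (TP_SIZE * PP_SIZE)))                                       # 128 data-parallel replicas
    TRAIN_SAMPLES=$((TOKENS / SEQ_LEN))                                                 # 41015625, matching TRAIN_SAMPLES above
    echo "world=$WORLD_SIZE global_batch=$GLOBAL_BATCH_SIZE dp=$DP_SIZE samples=$TRAIN_SAMPLES"

At SEQ_LEN=2048 a global batch of 512 corresponds to roughly 1M tokens per optimizer step.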

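The --train-weighted-split-paths-path and --valid-weighted-split-paths-path flags point at small text files rather than taking the weighted splits inline. Judging from the comments next to TRAIN_DATA_PATH and VALID_DATA_PATH, the two files would presumably contain something like the following sketch; the actual train84b20c4py.txt and valc4py.txt are not part of this commit.

    train84b20c4py.txt (20% C4, 80% Python, each over the full 0:1 range):
    "train: 0.2 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.8 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document"

    valc4py.txt (C4 validation set plus the last 5% of the Python data):
    "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document"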

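Because of the self-submission guard at the top (the if [ -z $SLURM_JOB_ID ] block), the script can either be run directly, in which case it creates logs/ and re-submits itself, or handed straight to sbatch. A minimal usage sketch, assuming it is invoked from the directory that contains it:

    bash sbatch_4b284b84b20c4py.sh     # creates logs/ and re-submits itself via sbatch
    sbatch sbatch_4b284b84b20c4py.sh   # submit directly; logs/ must already exist for the -o/-e paths
    tail -f logs/latest.out            # follow the most recent job through the latest.out symlink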