| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 1000, | |
| "global_step": 5493, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0018204988166757691, | |
| "grad_norm": 900.346923828125, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 20.2441, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0036409976333515383, | |
| "grad_norm": 939.3712158203125, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 19.6193, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.005461496450027308, | |
| "grad_norm": 562.291015625, | |
| "learning_rate": 6e-06, | |
| "loss": 18.7898, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.007281995266703077, | |
| "grad_norm": 447.43060302734375, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 17.2234, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.009102494083378846, | |
| "grad_norm": 361.34100341796875, | |
| "learning_rate": 1e-05, | |
| "loss": 15.4955, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.010922992900054615, | |
| "grad_norm": 320.7839660644531, | |
| "learning_rate": 1.2e-05, | |
| "loss": 13.6977, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.012743491716730384, | |
| "grad_norm": 131.0957794189453, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 11.4716, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.014563990533406153, | |
| "grad_norm": 75.75025177001953, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 9.5351, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.016384489350081924, | |
| "grad_norm": 42.2626953125, | |
| "learning_rate": 1.8e-05, | |
| "loss": 8.1668, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.018204988166757693, | |
| "grad_norm": 25.88639259338379, | |
| "learning_rate": 2e-05, | |
| "loss": 7.3242, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02002548698343346, | |
| "grad_norm": 16.043380737304688, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 6.6513, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.02184598580010923, | |
| "grad_norm": 15.031912803649902, | |
| "learning_rate": 2.4e-05, | |
| "loss": 6.1476, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.023666484616785, | |
| "grad_norm": 8.564423561096191, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 5.7499, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.02548698343346077, | |
| "grad_norm": 9.205732345581055, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 5.4031, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.027307482250136537, | |
| "grad_norm": 7.424269199371338, | |
| "learning_rate": 3e-05, | |
| "loss": 5.1054, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.029127981066812306, | |
| "grad_norm": 7.239726543426514, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 4.9637, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.030948479883488075, | |
| "grad_norm": 9.377843856811523, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 4.7583, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.03276897870016385, | |
| "grad_norm": 5.893341064453125, | |
| "learning_rate": 3.6e-05, | |
| "loss": 4.7093, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.03458947751683961, | |
| "grad_norm": 5.311996936798096, | |
| "learning_rate": 3.8e-05, | |
| "loss": 4.5147, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.036409976333515386, | |
| "grad_norm": 5.311359882354736, | |
| "learning_rate": 4e-05, | |
| "loss": 4.5004, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03823047515019115, | |
| "grad_norm": 8.67163372039795, | |
| "learning_rate": 4.2e-05, | |
| "loss": 4.3923, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.04005097396686692, | |
| "grad_norm": 5.317925930023193, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 4.3169, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.04187147278354269, | |
| "grad_norm": 10.292462348937988, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 4.2887, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.04369197160021846, | |
| "grad_norm": 6.532808303833008, | |
| "learning_rate": 4.8e-05, | |
| "loss": 4.2458, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.04551247041689423, | |
| "grad_norm": 5.974935531616211, | |
| "learning_rate": 5e-05, | |
| "loss": 4.1716, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04733296923357, | |
| "grad_norm": 10.58936595916748, | |
| "learning_rate": 5.2000000000000004e-05, | |
| "loss": 4.0606, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.049153468050245765, | |
| "grad_norm": 11.461891174316406, | |
| "learning_rate": 5.4000000000000005e-05, | |
| "loss": 4.1349, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.05097396686692154, | |
| "grad_norm": 8.389955520629883, | |
| "learning_rate": 5.6000000000000006e-05, | |
| "loss": 4.044, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0527944656835973, | |
| "grad_norm": 5.730175018310547, | |
| "learning_rate": 5.8e-05, | |
| "loss": 4.0169, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.054614964500273075, | |
| "grad_norm": 8.236620903015137, | |
| "learning_rate": 6e-05, | |
| "loss": 3.8992, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.05643546331694885, | |
| "grad_norm": 9.377148628234863, | |
| "learning_rate": 6.2e-05, | |
| "loss": 3.9041, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.05825596213362461, | |
| "grad_norm": 8.225547790527344, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 3.8243, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.060076460950300385, | |
| "grad_norm": 6.899202823638916, | |
| "learning_rate": 6.6e-05, | |
| "loss": 3.8613, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.06189695976697615, | |
| "grad_norm": 7.693598747253418, | |
| "learning_rate": 6.800000000000001e-05, | |
| "loss": 3.7624, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.06371745858365192, | |
| "grad_norm": 6.594521522521973, | |
| "learning_rate": 7e-05, | |
| "loss": 3.7709, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0655379574003277, | |
| "grad_norm": 7.71303129196167, | |
| "learning_rate": 7.2e-05, | |
| "loss": 3.699, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.06735845621700345, | |
| "grad_norm": 11.58485221862793, | |
| "learning_rate": 7.4e-05, | |
| "loss": 3.6711, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.06917895503367923, | |
| "grad_norm": 12.817239761352539, | |
| "learning_rate": 7.6e-05, | |
| "loss": 3.6647, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.070999453850355, | |
| "grad_norm": 11.794710159301758, | |
| "learning_rate": 7.800000000000001e-05, | |
| "loss": 3.6485, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.07281995266703077, | |
| "grad_norm": 7.5003509521484375, | |
| "learning_rate": 8e-05, | |
| "loss": 3.63, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.07464045148370653, | |
| "grad_norm": 7.507719993591309, | |
| "learning_rate": 8.2e-05, | |
| "loss": 3.5663, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0764609503003823, | |
| "grad_norm": 10.392959594726562, | |
| "learning_rate": 8.4e-05, | |
| "loss": 3.5462, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.07828144911705807, | |
| "grad_norm": 10.792546272277832, | |
| "learning_rate": 8.6e-05, | |
| "loss": 3.489, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.08010194793373385, | |
| "grad_norm": 8.886263847351074, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 3.507, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.0819224467504096, | |
| "grad_norm": 10.633005142211914, | |
| "learning_rate": 9e-05, | |
| "loss": 3.4786, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.08374294556708538, | |
| "grad_norm": 8.927319526672363, | |
| "learning_rate": 9.200000000000001e-05, | |
| "loss": 3.4799, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.08556344438376115, | |
| "grad_norm": 7.764442443847656, | |
| "learning_rate": 9.4e-05, | |
| "loss": 3.4424, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.08738394320043692, | |
| "grad_norm": 7.294579982757568, | |
| "learning_rate": 9.6e-05, | |
| "loss": 3.3888, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.0892044420171127, | |
| "grad_norm": 12.751729965209961, | |
| "learning_rate": 9.8e-05, | |
| "loss": 3.4342, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.09102494083378845, | |
| "grad_norm": 12.688404083251953, | |
| "learning_rate": 0.0001, | |
| "loss": 3.412, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09284543965046423, | |
| "grad_norm": 8.39454174041748, | |
| "learning_rate": 9.99990102735217e-05, | |
| "loss": 3.3787, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.09466593846714, | |
| "grad_norm": 8.661295890808105, | |
| "learning_rate": 9.999604113326911e-05, | |
| "loss": 3.3499, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.09648643728381577, | |
| "grad_norm": 11.745123863220215, | |
| "learning_rate": 9.999109269678773e-05, | |
| "loss": 3.3025, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.09830693610049153, | |
| "grad_norm": 11.857743263244629, | |
| "learning_rate": 9.998416515998146e-05, | |
| "loss": 3.3443, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.1001274349171673, | |
| "grad_norm": 10.025672912597656, | |
| "learning_rate": 9.997525879710501e-05, | |
| "loss": 3.2624, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.10194793373384307, | |
| "grad_norm": 11.870292663574219, | |
| "learning_rate": 9.996437396075289e-05, | |
| "loss": 3.3323, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.10376843255051885, | |
| "grad_norm": 11.637451171875, | |
| "learning_rate": 9.995151108184551e-05, | |
| "loss": 3.2349, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.1055889313671946, | |
| "grad_norm": 21.10885238647461, | |
| "learning_rate": 9.993667066961219e-05, | |
| "loss": 3.3025, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.10740943018387038, | |
| "grad_norm": 19.601778030395508, | |
| "learning_rate": 9.991985331157083e-05, | |
| "loss": 3.2509, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.10922992900054615, | |
| "grad_norm": 15.462264060974121, | |
| "learning_rate": 9.990105967350486e-05, | |
| "loss": 3.2197, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.11105042781722192, | |
| "grad_norm": 10.48941421508789, | |
| "learning_rate": 9.98802904994367e-05, | |
| "loss": 3.2523, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1128709266338977, | |
| "grad_norm": 11.569725036621094, | |
| "learning_rate": 9.985754661159844e-05, | |
| "loss": 3.192, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.11469142545057345, | |
| "grad_norm": 7.362033843994141, | |
| "learning_rate": 9.983282891039914e-05, | |
| "loss": 3.2174, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.11651192426724923, | |
| "grad_norm": 7.256021022796631, | |
| "learning_rate": 9.98061383743894e-05, | |
| "loss": 3.1671, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.118332423083925, | |
| "grad_norm": 8.411303520202637, | |
| "learning_rate": 9.97774760602224e-05, | |
| "loss": 3.1793, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.12015292190060077, | |
| "grad_norm": 9.6874361038208, | |
| "learning_rate": 9.97468431026122e-05, | |
| "loss": 3.167, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.12197342071727653, | |
| "grad_norm": 9.121397972106934, | |
| "learning_rate": 9.971424071428877e-05, | |
| "loss": 3.1107, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.1237939195339523, | |
| "grad_norm": 13.291868209838867, | |
| "learning_rate": 9.967967018594997e-05, | |
| "loss": 3.1678, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.12561441835062806, | |
| "grad_norm": 10.143365859985352, | |
| "learning_rate": 9.96431328862105e-05, | |
| "loss": 3.1591, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.12743491716730385, | |
| "grad_norm": 9.821294784545898, | |
| "learning_rate": 9.96046302615477e-05, | |
| "loss": 3.1315, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.1292554159839796, | |
| "grad_norm": 12.868693351745605, | |
| "learning_rate": 9.956416383624422e-05, | |
| "loss": 3.0713, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.1310759148006554, | |
| "grad_norm": 10.408199310302734, | |
| "learning_rate": 9.95217352123278e-05, | |
| "loss": 3.1292, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.13289641361733115, | |
| "grad_norm": 8.829959869384766, | |
| "learning_rate": 9.947734606950771e-05, | |
| "loss": 3.111, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.1347169124340069, | |
| "grad_norm": 9.13364315032959, | |
| "learning_rate": 9.943099816510836e-05, | |
| "loss": 3.1011, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.1365374112506827, | |
| "grad_norm": 8.674768447875977, | |
| "learning_rate": 9.93826933339997e-05, | |
| "loss": 3.0991, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.13835791006735845, | |
| "grad_norm": 8.487624168395996, | |
| "learning_rate": 9.933243348852451e-05, | |
| "loss": 3.0915, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.14017840888403424, | |
| "grad_norm": 7.808052062988281, | |
| "learning_rate": 9.928022061842282e-05, | |
| "loss": 3.0694, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.14199890770071, | |
| "grad_norm": 12.615427017211914, | |
| "learning_rate": 9.922605679075298e-05, | |
| "loss": 3.0524, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.14381940651738576, | |
| "grad_norm": 8.977923393249512, | |
| "learning_rate": 9.916994414981002e-05, | |
| "loss": 2.9989, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.14563990533406154, | |
| "grad_norm": 8.723967552185059, | |
| "learning_rate": 9.911188491704058e-05, | |
| "loss": 3.0123, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.1474604041507373, | |
| "grad_norm": 8.96397590637207, | |
| "learning_rate": 9.905188139095506e-05, | |
| "loss": 3.0031, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.14928090296741306, | |
| "grad_norm": 21.173276901245117, | |
| "learning_rate": 9.89899359470366e-05, | |
| "loss": 3.0045, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.15110140178408885, | |
| "grad_norm": 9.059152603149414, | |
| "learning_rate": 9.892605103764704e-05, | |
| "loss": 2.972, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.1529219006007646, | |
| "grad_norm": 7.885227203369141, | |
| "learning_rate": 9.886022919192985e-05, | |
| "loss": 2.9822, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.1547423994174404, | |
| "grad_norm": 7.191554069519043, | |
| "learning_rate": 9.879247301570995e-05, | |
| "loss": 3.0297, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.15656289823411615, | |
| "grad_norm": 10.712031364440918, | |
| "learning_rate": 9.872278519139062e-05, | |
| "loss": 3.0149, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.1583833970507919, | |
| "grad_norm": 10.305954933166504, | |
| "learning_rate": 9.865116847784726e-05, | |
| "loss": 3.0046, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.1602038958674677, | |
| "grad_norm": 11.114262580871582, | |
| "learning_rate": 9.857762571031818e-05, | |
| "loss": 2.9784, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.16202439468414345, | |
| "grad_norm": 10.611502647399902, | |
| "learning_rate": 9.850215980029234e-05, | |
| "loss": 2.9992, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.1638448935008192, | |
| "grad_norm": 8.901230812072754, | |
| "learning_rate": 9.842477373539412e-05, | |
| "loss": 2.9712, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.165665392317495, | |
| "grad_norm": 7.750337600708008, | |
| "learning_rate": 9.834547057926502e-05, | |
| "loss": 2.9586, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.16748589113417076, | |
| "grad_norm": 8.038575172424316, | |
| "learning_rate": 9.826425347144237e-05, | |
| "loss": 2.9487, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.16930638995084654, | |
| "grad_norm": 8.837507247924805, | |
| "learning_rate": 9.818112562723507e-05, | |
| "loss": 2.9682, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.1711268887675223, | |
| "grad_norm": 8.412049293518066, | |
| "learning_rate": 9.809609033759625e-05, | |
| "loss": 2.9253, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.17294738758419806, | |
| "grad_norm": 8.611617088317871, | |
| "learning_rate": 9.8009150968993e-05, | |
| "loss": 2.9494, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.17476788640087385, | |
| "grad_norm": 8.870803833007812, | |
| "learning_rate": 9.792031096327318e-05, | |
| "loss": 2.9347, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.1765883852175496, | |
| "grad_norm": 6.085882186889648, | |
| "learning_rate": 9.782957383752898e-05, | |
| "loss": 2.8954, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1784088840342254, | |
| "grad_norm": 53.74106979370117, | |
| "learning_rate": 9.773694318395786e-05, | |
| "loss": 2.9159, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.18022938285090115, | |
| "grad_norm": 8.432936668395996, | |
| "learning_rate": 9.764242266972021e-05, | |
| "loss": 2.9752, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.1820498816675769, | |
| "grad_norm": 6.479644298553467, | |
| "learning_rate": 9.75460160367943e-05, | |
| "loss": 2.938, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1820498816675769, | |
| "eval_loss": 2.901522159576416, | |
| "eval_runtime": 1012.3411, | |
| "eval_samples_per_second": 9.647, | |
| "eval_steps_per_second": 1.206, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1838703804842527, | |
| "grad_norm": 8.284567832946777, | |
| "learning_rate": 9.744772710182801e-05, | |
| "loss": 2.899, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.18569087930092845, | |
| "grad_norm": 6.623629570007324, | |
| "learning_rate": 9.734755975598777e-05, | |
| "loss": 2.9371, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.1875113781176042, | |
| "grad_norm": 8.541956901550293, | |
| "learning_rate": 9.724551796480459e-05, | |
| "loss": 2.8807, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.18933187693428, | |
| "grad_norm": 8.620600700378418, | |
| "learning_rate": 9.714160576801696e-05, | |
| "loss": 2.8888, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.19115237575095576, | |
| "grad_norm": 8.644622802734375, | |
| "learning_rate": 9.7035827279411e-05, | |
| "loss": 2.8747, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.19297287456763154, | |
| "grad_norm": 9.656100273132324, | |
| "learning_rate": 9.692818668665752e-05, | |
| "loss": 2.9203, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.1947933733843073, | |
| "grad_norm": 10.529635429382324, | |
| "learning_rate": 9.681868825114634e-05, | |
| "loss": 2.9257, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.19661387220098306, | |
| "grad_norm": 8.376754760742188, | |
| "learning_rate": 9.670733630781747e-05, | |
| "loss": 2.8864, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.19843437101765884, | |
| "grad_norm": 8.018802642822266, | |
| "learning_rate": 9.659413526498962e-05, | |
| "loss": 2.8672, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.2002548698343346, | |
| "grad_norm": 7.348598480224609, | |
| "learning_rate": 9.647908960418553e-05, | |
| "loss": 2.8528, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.2020753686510104, | |
| "grad_norm": 7.87021017074585, | |
| "learning_rate": 9.636220387995469e-05, | |
| "loss": 2.8713, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.20389586746768615, | |
| "grad_norm": 8.476405143737793, | |
| "learning_rate": 9.624348271969295e-05, | |
| "loss": 2.8667, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.2057163662843619, | |
| "grad_norm": 8.64283561706543, | |
| "learning_rate": 9.612293082345931e-05, | |
| "loss": 2.8523, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.2075368651010377, | |
| "grad_norm": 10.11330795288086, | |
| "learning_rate": 9.600055296378995e-05, | |
| "loss": 2.8375, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.20935736391771345, | |
| "grad_norm": 8.217743873596191, | |
| "learning_rate": 9.58763539855092e-05, | |
| "loss": 2.8685, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.2111778627343892, | |
| "grad_norm": 7.501378536224365, | |
| "learning_rate": 9.575033880553774e-05, | |
| "loss": 2.8349, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.212998361551065, | |
| "grad_norm": 8.812211036682129, | |
| "learning_rate": 9.562251241269798e-05, | |
| "loss": 2.8384, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.21481886036774075, | |
| "grad_norm": 7.964756011962891, | |
| "learning_rate": 9.549287986751655e-05, | |
| "loss": 2.8653, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.21663935918441654, | |
| "grad_norm": 7.216350555419922, | |
| "learning_rate": 9.536144630202395e-05, | |
| "loss": 2.8276, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.2184598580010923, | |
| "grad_norm": 7.890927314758301, | |
| "learning_rate": 9.522821691955135e-05, | |
| "loss": 2.7802, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.22028035681776806, | |
| "grad_norm": 8.259157180786133, | |
| "learning_rate": 9.509319699452469e-05, | |
| "loss": 2.8407, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.22210085563444384, | |
| "grad_norm": 7.810998916625977, | |
| "learning_rate": 9.495639187225575e-05, | |
| "loss": 2.8374, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.2239213544511196, | |
| "grad_norm": 6.905944347381592, | |
| "learning_rate": 9.481780696873059e-05, | |
| "loss": 2.8342, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.2257418532677954, | |
| "grad_norm": 8.832979202270508, | |
| "learning_rate": 9.467744777039517e-05, | |
| "loss": 2.7816, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.22756235208447115, | |
| "grad_norm": 6.949944972991943, | |
| "learning_rate": 9.453531983393809e-05, | |
| "loss": 2.8104, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.2293828509011469, | |
| "grad_norm": 11.183205604553223, | |
| "learning_rate": 9.439142878607061e-05, | |
| "loss": 2.8605, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.2312033497178227, | |
| "grad_norm": 8.672426223754883, | |
| "learning_rate": 9.424578032330398e-05, | |
| "loss": 2.7866, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.23302384853449845, | |
| "grad_norm": 8.570023536682129, | |
| "learning_rate": 9.409838021172375e-05, | |
| "loss": 2.7814, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.2348443473511742, | |
| "grad_norm": 17.605865478515625, | |
| "learning_rate": 9.394923428676168e-05, | |
| "loss": 2.8896, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.23666484616785, | |
| "grad_norm": 8.613877296447754, | |
| "learning_rate": 9.379834845296463e-05, | |
| "loss": 2.8474, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.23848534498452575, | |
| "grad_norm": 9.39710807800293, | |
| "learning_rate": 9.364572868376075e-05, | |
| "loss": 2.7771, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.24030584380120154, | |
| "grad_norm": 12.333969116210938, | |
| "learning_rate": 9.349138102122316e-05, | |
| "loss": 2.8079, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.2421263426178773, | |
| "grad_norm": 10.491060256958008, | |
| "learning_rate": 9.333531157583055e-05, | |
| "loss": 2.7536, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.24394684143455306, | |
| "grad_norm": 9.862618446350098, | |
| "learning_rate": 9.317752652622547e-05, | |
| "loss": 2.8011, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.24576734025122884, | |
| "grad_norm": 11.95722484588623, | |
| "learning_rate": 9.301803211896955e-05, | |
| "loss": 2.8058, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.2475878390679046, | |
| "grad_norm": 8.709095001220703, | |
| "learning_rate": 9.28568346682963e-05, | |
| "loss": 2.7922, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.2494083378845804, | |
| "grad_norm": 6.32808256149292, | |
| "learning_rate": 9.269394055586116e-05, | |
| "loss": 2.7246, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.2512288367012561, | |
| "grad_norm": 10.615900039672852, | |
| "learning_rate": 9.252935623048875e-05, | |
| "loss": 2.7993, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.2530493355179319, | |
| "grad_norm": 10.374322891235352, | |
| "learning_rate": 9.236308820791768e-05, | |
| "loss": 2.7583, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.2548698343346077, | |
| "grad_norm": 11.486263275146484, | |
| "learning_rate": 9.219514307054251e-05, | |
| "loss": 2.8258, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2566903331512835, | |
| "grad_norm": 9.840982437133789, | |
| "learning_rate": 9.202552746715322e-05, | |
| "loss": 2.8464, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.2585108319679592, | |
| "grad_norm": 15.894274711608887, | |
| "learning_rate": 9.185424811267199e-05, | |
| "loss": 2.8465, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.260331330784635, | |
| "grad_norm": 8.428662300109863, | |
| "learning_rate": 9.168131178788726e-05, | |
| "loss": 2.8095, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.2621518296013108, | |
| "grad_norm": 17.082258224487305, | |
| "learning_rate": 9.150672533918544e-05, | |
| "loss": 2.7782, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.2639723284179865, | |
| "grad_norm": 7.154361724853516, | |
| "learning_rate": 9.133049567827982e-05, | |
| "loss": 2.7773, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.2657928272346623, | |
| "grad_norm": 6.119648456573486, | |
| "learning_rate": 9.115262978193679e-05, | |
| "loss": 2.7788, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.2676133260513381, | |
| "grad_norm": 8.635058403015137, | |
| "learning_rate": 9.097313469169988e-05, | |
| "loss": 2.7703, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.2694338248680138, | |
| "grad_norm": 12.325600624084473, | |
| "learning_rate": 9.079201751361082e-05, | |
| "loss": 2.7313, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.2712543236846896, | |
| "grad_norm": 8.181892395019531, | |
| "learning_rate": 9.06092854179283e-05, | |
| "loss": 2.7795, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.2730748225013654, | |
| "grad_norm": 14.719033241271973, | |
| "learning_rate": 9.042494563884404e-05, | |
| "loss": 2.8108, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.2748953213180411, | |
| "grad_norm": 7.8658061027526855, | |
| "learning_rate": 9.023900547419646e-05, | |
| "loss": 2.7663, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.2767158201347169, | |
| "grad_norm": 15.445107460021973, | |
| "learning_rate": 9.005147228518174e-05, | |
| "loss": 2.7878, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.2785363189513927, | |
| "grad_norm": 12.650901794433594, | |
| "learning_rate": 8.986235349606238e-05, | |
| "loss": 2.8219, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.2803568177680685, | |
| "grad_norm": 10.226774215698242, | |
| "learning_rate": 8.967165659387331e-05, | |
| "loss": 2.742, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.2821773165847442, | |
| "grad_norm": 138.37210083007812, | |
| "learning_rate": 8.947938912812548e-05, | |
| "loss": 2.9524, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.28399781540142, | |
| "grad_norm": 16.11450958251953, | |
| "learning_rate": 8.928555871050693e-05, | |
| "loss": 2.7966, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.2858183142180958, | |
| "grad_norm": 10.201882362365723, | |
| "learning_rate": 8.909017301458156e-05, | |
| "loss": 2.8389, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.2876388130347715, | |
| "grad_norm": 10.96867847442627, | |
| "learning_rate": 8.889323977548521e-05, | |
| "loss": 2.7495, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.2894593118514473, | |
| "grad_norm": 10.814942359924316, | |
| "learning_rate": 8.869476678961954e-05, | |
| "loss": 2.7676, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2912798106681231, | |
| "grad_norm": 6.535337448120117, | |
| "learning_rate": 8.849476191434334e-05, | |
| "loss": 2.7589, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2931003094847988, | |
| "grad_norm": 7.036696434020996, | |
| "learning_rate": 8.829323306766142e-05, | |
| "loss": 2.7921, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2949208083014746, | |
| "grad_norm": 5.92086124420166, | |
| "learning_rate": 8.809018822791121e-05, | |
| "loss": 2.7267, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.2967413071181504, | |
| "grad_norm": 6.186739921569824, | |
| "learning_rate": 8.788563543344688e-05, | |
| "loss": 2.795, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.2985618059348261, | |
| "grad_norm": 8.154546737670898, | |
| "learning_rate": 8.767958278232112e-05, | |
| "loss": 2.7627, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.3003823047515019, | |
| "grad_norm": 7.674529075622559, | |
| "learning_rate": 8.74720384319645e-05, | |
| "loss": 2.7996, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.3022028035681777, | |
| "grad_norm": 6.348474025726318, | |
| "learning_rate": 8.726301059886259e-05, | |
| "loss": 2.7704, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.3040233023848534, | |
| "grad_norm": 10.496267318725586, | |
| "learning_rate": 8.705250755823064e-05, | |
| "loss": 2.7591, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.3058438012015292, | |
| "grad_norm": 102.05543518066406, | |
| "learning_rate": 8.684053764368598e-05, | |
| "loss": 2.8027, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.307664300018205, | |
| "grad_norm": 8.403404235839844, | |
| "learning_rate": 8.662710924691805e-05, | |
| "loss": 2.8801, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.3094847988348808, | |
| "grad_norm": 7.355569839477539, | |
| "learning_rate": 8.64122308173563e-05, | |
| "loss": 2.8346, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.3113052976515565, | |
| "grad_norm": 12.551121711730957, | |
| "learning_rate": 8.61959108618356e-05, | |
| "loss": 2.8381, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.3131257964682323, | |
| "grad_norm": 116.6989517211914, | |
| "learning_rate": 8.597815794425943e-05, | |
| "loss": 2.814, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.3149462952849081, | |
| "grad_norm": 21.63788604736328, | |
| "learning_rate": 8.575898068526093e-05, | |
| "loss": 2.8389, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.3167667941015838, | |
| "grad_norm": 143.42408752441406, | |
| "learning_rate": 8.553838776186158e-05, | |
| "loss": 2.8534, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.3185872929182596, | |
| "grad_norm": 9.04028034210205, | |
| "learning_rate": 8.531638790712765e-05, | |
| "loss": 2.8186, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.3204077917349354, | |
| "grad_norm": 11.659414291381836, | |
| "learning_rate": 8.509298990982453e-05, | |
| "loss": 2.8078, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.3222282905516111, | |
| "grad_norm": 7.934113502502441, | |
| "learning_rate": 8.486820261406873e-05, | |
| "loss": 2.792, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.3240487893682869, | |
| "grad_norm": 12.919567108154297, | |
| "learning_rate": 8.464203491897779e-05, | |
| "loss": 2.8111, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.3258692881849627, | |
| "grad_norm": 13.67540454864502, | |
| "learning_rate": 8.441449577831801e-05, | |
| "loss": 2.8085, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.3276897870016384, | |
| "grad_norm": 7.7655110359191895, | |
| "learning_rate": 8.418559420014984e-05, | |
| "loss": 2.7689, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.3295102858183142, | |
| "grad_norm": 8.168259620666504, | |
| "learning_rate": 8.395533924647141e-05, | |
| "loss": 2.7534, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.33133078463499, | |
| "grad_norm": 14.387748718261719, | |
| "learning_rate": 8.372374003285968e-05, | |
| "loss": 2.8353, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.3331512834516658, | |
| "grad_norm": 9.209723472595215, | |
| "learning_rate": 8.349080572810965e-05, | |
| "loss": 2.7837, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.3349717822683415, | |
| "grad_norm": 9.160303115844727, | |
| "learning_rate": 8.325654555387123e-05, | |
| "loss": 2.8186, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.3367922810850173, | |
| "grad_norm": 20.171415328979492, | |
| "learning_rate": 8.302096878428438e-05, | |
| "loss": 2.8011, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.3386127799016931, | |
| "grad_norm": 29.545217514038086, | |
| "learning_rate": 8.278408474561169e-05, | |
| "loss": 2.7971, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.3404332787183688, | |
| "grad_norm": 19.314136505126953, | |
| "learning_rate": 8.254590281586942e-05, | |
| "loss": 2.7983, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.3422537775350446, | |
| "grad_norm": 8.010175704956055, | |
| "learning_rate": 8.230643242445605e-05, | |
| "loss": 2.7921, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.3440742763517204, | |
| "grad_norm": 24.649381637573242, | |
| "learning_rate": 8.206568305177907e-05, | |
| "loss": 2.7962, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.3458947751683961, | |
| "grad_norm": 8.272650718688965, | |
| "learning_rate": 8.182366422887964e-05, | |
| "loss": 2.7439, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3477152739850719, | |
| "grad_norm": 7.553550720214844, | |
| "learning_rate": 8.158038553705524e-05, | |
| "loss": 2.7845, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.3495357728017477, | |
| "grad_norm": 8.573986053466797, | |
| "learning_rate": 8.13358566074804e-05, | |
| "loss": 2.7003, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.3513562716184234, | |
| "grad_norm": 10.316489219665527, | |
| "learning_rate": 8.109008712082538e-05, | |
| "loss": 2.7627, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.3531767704350992, | |
| "grad_norm": 8.462483406066895, | |
| "learning_rate": 8.084308680687287e-05, | |
| "loss": 2.7281, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.354997269251775, | |
| "grad_norm": 20.140274047851562, | |
| "learning_rate": 8.059486544413298e-05, | |
| "loss": 2.6906, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.3568177680684508, | |
| "grad_norm": 7.473912239074707, | |
| "learning_rate": 8.034543285945584e-05, | |
| "loss": 2.8117, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.3586382668851265, | |
| "grad_norm": 38.26898193359375, | |
| "learning_rate": 8.009479892764284e-05, | |
| "loss": 2.7456, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.3604587657018023, | |
| "grad_norm": 48.63120651245117, | |
| "learning_rate": 7.984297357105552e-05, | |
| "loss": 2.7224, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.3622792645184781, | |
| "grad_norm": 10.31283187866211, | |
| "learning_rate": 7.95899667592228e-05, | |
| "loss": 2.7108, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.3640997633351538, | |
| "grad_norm": 6.465616703033447, | |
| "learning_rate": 7.933578850844636e-05, | |
| "loss": 2.6901, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3640997633351538, | |
| "eval_loss": 2.735260248184204, | |
| "eval_runtime": 1011.986, | |
| "eval_samples_per_second": 9.65, | |
| "eval_steps_per_second": 1.207, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3659202621518296, | |
| "grad_norm": 7.348064422607422, | |
| "learning_rate": 7.908044888140394e-05, | |
| "loss": 2.7194, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.3677407609685054, | |
| "grad_norm": 10.493142127990723, | |
| "learning_rate": 7.882395798675115e-05, | |
| "loss": 2.7374, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.3695612597851811, | |
| "grad_norm": 9.447548866271973, | |
| "learning_rate": 7.856632597872122e-05, | |
| "loss": 2.7186, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.3713817586018569, | |
| "grad_norm": 6.926711559295654, | |
| "learning_rate": 7.83075630567229e-05, | |
| "loss": 2.7369, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.3732022574185327, | |
| "grad_norm": 19.662818908691406, | |
| "learning_rate": 7.804767946493685e-05, | |
| "loss": 2.71, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.3750227562352084, | |
| "grad_norm": 37.960330963134766, | |
| "learning_rate": 7.778668549190994e-05, | |
| "loss": 2.7497, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.3768432550518842, | |
| "grad_norm": 10.371471405029297, | |
| "learning_rate": 7.7524591470148e-05, | |
| "loss": 2.7329, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.37866375386856, | |
| "grad_norm": 7.78175163269043, | |
| "learning_rate": 7.726140777570675e-05, | |
| "loss": 2.6866, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.3804842526852358, | |
| "grad_norm": 63.65814971923828, | |
| "learning_rate": 7.699714482778104e-05, | |
| "loss": 2.6993, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.3823047515019115, | |
| "grad_norm": 20.166156768798828, | |
| "learning_rate": 7.673181308829233e-05, | |
| "loss": 2.7455, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3841252503185873, | |
| "grad_norm": 10.943995475769043, | |
| "learning_rate": 7.646542306147455e-05, | |
| "loss": 2.7369, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.3859457491352631, | |
| "grad_norm": 8.025435447692871, | |
| "learning_rate": 7.619798529345825e-05, | |
| "loss": 2.6558, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.3877662479519388, | |
| "grad_norm": 7.745648384094238, | |
| "learning_rate": 7.592951037185301e-05, | |
| "loss": 2.7071, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.3895867467686146, | |
| "grad_norm": 11.425312995910645, | |
| "learning_rate": 7.566000892532838e-05, | |
| "loss": 2.7322, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.3914072455852904, | |
| "grad_norm": 21.18678855895996, | |
| "learning_rate": 7.538949162319306e-05, | |
| "loss": 2.6649, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.3932277444019661, | |
| "grad_norm": 7.39724588394165, | |
| "learning_rate": 7.511796917497255e-05, | |
| "loss": 2.6632, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.3950482432186419, | |
| "grad_norm": 26.170703887939453, | |
| "learning_rate": 7.484545232998508e-05, | |
| "loss": 2.7239, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.3968687420353177, | |
| "grad_norm": 9.031233787536621, | |
| "learning_rate": 7.457195187691614e-05, | |
| "loss": 2.6995, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.3986892408519934, | |
| "grad_norm": 9.091829299926758, | |
| "learning_rate": 7.429747864339136e-05, | |
| "loss": 2.6826, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.4005097396686692, | |
| "grad_norm": 8.380509376525879, | |
| "learning_rate": 7.40220434955478e-05, | |
| "loss": 2.7026, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.402330238485345, | |
| "grad_norm": 5.83329439163208, | |
| "learning_rate": 7.374565733760379e-05, | |
| "loss": 2.6523, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.4041507373020208, | |
| "grad_norm": 6.2706475257873535, | |
| "learning_rate": 7.346833111142735e-05, | |
| "loss": 2.6854, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.4059712361186965, | |
| "grad_norm": 6.717202663421631, | |
| "learning_rate": 7.319007579610277e-05, | |
| "loss": 2.6949, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.4077917349353723, | |
| "grad_norm": 9.39269733428955, | |
| "learning_rate": 7.291090240749621e-05, | |
| "loss": 2.6903, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.4096122337520481, | |
| "grad_norm": 5.5886383056640625, | |
| "learning_rate": 7.263082199781949e-05, | |
| "loss": 2.6633, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.4114327325687238, | |
| "grad_norm": 7.369466304779053, | |
| "learning_rate": 7.234984565519247e-05, | |
| "loss": 2.6493, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.4132532313853996, | |
| "grad_norm": 11.548351287841797, | |
| "learning_rate": 7.206798450320422e-05, | |
| "loss": 2.6406, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.4150737302020754, | |
| "grad_norm": 9.547252655029297, | |
| "learning_rate": 7.178524970047253e-05, | |
| "loss": 2.7079, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.4168942290187511, | |
| "grad_norm": 7.008022785186768, | |
| "learning_rate": 7.150165244020224e-05, | |
| "loss": 2.6277, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.4187147278354269, | |
| "grad_norm": 7.342654705047607, | |
| "learning_rate": 7.121720394974206e-05, | |
| "loss": 2.6857, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.4205352266521027, | |
| "grad_norm": 6.655099391937256, | |
| "learning_rate": 7.093191549014007e-05, | |
| "loss": 2.654, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.4223557254687784, | |
| "grad_norm": 6.727902412414551, | |
| "learning_rate": 7.0645798355698e-05, | |
| "loss": 2.6778, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.4241762242854542, | |
| "grad_norm": 6.407891273498535, | |
| "learning_rate": 7.035886387352399e-05, | |
| "loss": 2.6798, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.42599672310213, | |
| "grad_norm": 6.782746315002441, | |
| "learning_rate": 7.007112340308423e-05, | |
| "loss": 2.6669, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.4278172219188058, | |
| "grad_norm": 6.325183868408203, | |
| "learning_rate": 6.97825883357532e-05, | |
| "loss": 2.6556, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.4296377207354815, | |
| "grad_norm": 5.9982008934021, | |
| "learning_rate": 6.949327009436278e-05, | |
| "loss": 2.6764, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.4314582195521573, | |
| "grad_norm": 6.148717403411865, | |
| "learning_rate": 6.920318013274988e-05, | |
| "loss": 2.6792, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.4332787183688331, | |
| "grad_norm": 6.28306245803833, | |
| "learning_rate": 6.891232993530319e-05, | |
| "loss": 2.6945, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.4350992171855088, | |
| "grad_norm": 6.82548713684082, | |
| "learning_rate": 6.862073101650837e-05, | |
| "loss": 2.6592, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.4369197160021846, | |
| "grad_norm": 6.251521587371826, | |
| "learning_rate": 6.832839492049225e-05, | |
| "loss": 2.6386, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.4387402148188604, | |
| "grad_norm": 6.3002800941467285, | |
| "learning_rate": 6.80353332205658e-05, | |
| "loss": 2.6491, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.4405607136355361, | |
| "grad_norm": 6.661396026611328, | |
| "learning_rate": 6.774155751876603e-05, | |
| "loss": 2.6538, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.4423812124522119, | |
| "grad_norm": 5.950438022613525, | |
| "learning_rate": 6.744707944539654e-05, | |
| "loss": 2.603, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.4442017112688877, | |
| "grad_norm": 9.665274620056152, | |
| "learning_rate": 6.715191065856721e-05, | |
| "loss": 2.6364, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.4460222100855634, | |
| "grad_norm": 7.168936252593994, | |
| "learning_rate": 6.685606284373258e-05, | |
| "loss": 2.6911, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.4478427089022392, | |
| "grad_norm": 6.2818145751953125, | |
| "learning_rate": 6.655954771322929e-05, | |
| "loss": 2.6304, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.449663207718915, | |
| "grad_norm": 6.3358964920043945, | |
| "learning_rate": 6.626237700581238e-05, | |
| "loss": 2.6124, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.4514837065355908, | |
| "grad_norm": 7.2275004386901855, | |
| "learning_rate": 6.596456248619054e-05, | |
| "loss": 2.644, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.4533042053522665, | |
| "grad_norm": 5.857280254364014, | |
| "learning_rate": 6.566611594456042e-05, | |
| "loss": 2.6175, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.4551247041689423, | |
| "grad_norm": 6.958190441131592, | |
| "learning_rate": 6.536704919613982e-05, | |
| "loss": 2.6372, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4569452029856181, | |
| "grad_norm": 6.084266662597656, | |
| "learning_rate": 6.506737408069988e-05, | |
| "loss": 2.5989, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.4587657018022938, | |
| "grad_norm": 6.761366367340088, | |
| "learning_rate": 6.476710246209649e-05, | |
| "loss": 2.6231, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4605862006189696, | |
| "grad_norm": 6.110794544219971, | |
| "learning_rate": 6.446624622780052e-05, | |
| "loss": 2.6294, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.4624066994356454, | |
| "grad_norm": 7.39132022857666, | |
| "learning_rate": 6.416481728842722e-05, | |
| "loss": 2.589, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.4642271982523211, | |
| "grad_norm": 5.385328769683838, | |
| "learning_rate": 6.386282757726467e-05, | |
| "loss": 2.5938, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.4660476970689969, | |
| "grad_norm": 6.137452602386475, | |
| "learning_rate": 6.356028904980145e-05, | |
| "loss": 2.611, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.4678681958856727, | |
| "grad_norm": 6.881803512573242, | |
| "learning_rate": 6.325721368325317e-05, | |
| "loss": 2.5577, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.4696886947023484, | |
| "grad_norm": 7.0561323165893555, | |
| "learning_rate": 6.295361347608846e-05, | |
| "loss": 2.5971, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.4715091935190242, | |
| "grad_norm": 8.178688049316406, | |
| "learning_rate": 6.264950044755387e-05, | |
| "loss": 2.6111, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.4733296923357, | |
| "grad_norm": 7.674656867980957, | |
| "learning_rate": 6.234488663719807e-05, | |
| "loss": 2.6211, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4751501911523758, | |
| "grad_norm": 6.952455043792725, | |
| "learning_rate": 6.203978410439519e-05, | |
| "loss": 2.5976, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.4769706899690515, | |
| "grad_norm": 7.033661842346191, | |
| "learning_rate": 6.173420492786747e-05, | |
| "loss": 2.5667, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.4787911887857273, | |
| "grad_norm": 7.665953636169434, | |
| "learning_rate": 6.142816120520699e-05, | |
| "loss": 2.5964, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.4806116876024031, | |
| "grad_norm": 6.789517402648926, | |
| "learning_rate": 6.112166505239679e-05, | |
| "loss": 2.5046, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.4824321864190788, | |
| "grad_norm": 6.597125053405762, | |
| "learning_rate": 6.0814728603331176e-05, | |
| "loss": 2.6044, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.4842526852357546, | |
| "grad_norm": 5.781830310821533, | |
| "learning_rate": 6.050736400933538e-05, | |
| "loss": 2.6041, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.4860731840524304, | |
| "grad_norm": 6.907136917114258, | |
| "learning_rate": 6.0199583438684495e-05, | |
| "loss": 2.587, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.4878936828691061, | |
| "grad_norm": 5.483163833618164, | |
| "learning_rate": 5.989139907612174e-05, | |
| "loss": 2.6136, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.4897141816857819, | |
| "grad_norm": 5.925361156463623, | |
| "learning_rate": 5.958282312237605e-05, | |
| "loss": 2.5801, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.4915346805024577, | |
| "grad_norm": 6.07294225692749, | |
| "learning_rate": 5.927386779367912e-05, | |
| "loss": 2.6287, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4933551793191334, | |
| "grad_norm": 5.8490424156188965, | |
| "learning_rate": 5.896454532128171e-05, | |
| "loss": 2.5894, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.4951756781358092, | |
| "grad_norm": 6.67198371887207, | |
| "learning_rate": 5.865486795096948e-05, | |
| "loss": 2.5684, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.496996176952485, | |
| "grad_norm": 6.779095649719238, | |
| "learning_rate": 5.8344847942578175e-05, | |
| "loss": 2.5916, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.4988166757691608, | |
| "grad_norm": 6.148252487182617, | |
| "learning_rate": 5.8034497569508206e-05, | |
| "loss": 2.579, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.5006371745858366, | |
| "grad_norm": 7.566195011138916, | |
| "learning_rate": 5.772382911823886e-05, | |
| "loss": 2.5639, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.5024576734025122, | |
| "grad_norm": 6.119766712188721, | |
| "learning_rate": 5.741285488784183e-05, | |
| "loss": 2.5728, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.504278172219188, | |
| "grad_norm": 6.376175880432129, | |
| "learning_rate": 5.710158718949431e-05, | |
| "loss": 2.5889, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.5060986710358638, | |
| "grad_norm": 7.798823833465576, | |
| "learning_rate": 5.67900383459916e-05, | |
| "loss": 2.5611, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.5079191698525396, | |
| "grad_norm": 5.470147132873535, | |
| "learning_rate": 5.64782206912593e-05, | |
| "loss": 2.6003, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.5097396686692154, | |
| "grad_norm": 6.034721374511719, | |
| "learning_rate": 5.6166146569864986e-05, | |
| "loss": 2.6126, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.5115601674858912, | |
| "grad_norm": 5.667842388153076, | |
| "learning_rate": 5.585382833652951e-05, | |
| "loss": 2.5349, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.513380666302567, | |
| "grad_norm": 5.957113742828369, | |
| "learning_rate": 5.554127835563784e-05, | |
| "loss": 2.5567, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.5152011651192426, | |
| "grad_norm": 8.104249000549316, | |
| "learning_rate": 5.5228509000749705e-05, | |
| "loss": 2.5654, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.5170216639359184, | |
| "grad_norm": 5.897515296936035, | |
| "learning_rate": 5.491553265410956e-05, | |
| "loss": 2.599, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.5188421627525942, | |
| "grad_norm": 7.213466167449951, | |
| "learning_rate": 5.46023617061565e-05, | |
| "loss": 2.5782, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.52066266156927, | |
| "grad_norm": 6.2212910652160645, | |
| "learning_rate": 5.4289008555033704e-05, | |
| "loss": 2.5869, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.5224831603859458, | |
| "grad_norm": 7.721590042114258, | |
| "learning_rate": 5.397548560609762e-05, | |
| "loss": 2.5546, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.5243036592026216, | |
| "grad_norm": 6.373271465301514, | |
| "learning_rate": 5.366180527142678e-05, | |
| "loss": 2.555, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.5261241580192972, | |
| "grad_norm": 7.320889949798584, | |
| "learning_rate": 5.334797996933052e-05, | |
| "loss": 2.5487, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.527944656835973, | |
| "grad_norm": 5.541944980621338, | |
| "learning_rate": 5.3034022123857285e-05, | |
| "loss": 2.6031, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.5297651556526488, | |
| "grad_norm": 6.056175708770752, | |
| "learning_rate": 5.271994416430278e-05, | |
| "loss": 2.5569, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.5315856544693246, | |
| "grad_norm": 6.227322578430176, | |
| "learning_rate": 5.240575852471791e-05, | |
| "loss": 2.5726, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.5334061532860004, | |
| "grad_norm": 5.931169509887695, | |
| "learning_rate": 5.2091477643416565e-05, | |
| "loss": 2.563, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.5352266521026762, | |
| "grad_norm": 5.819269180297852, | |
| "learning_rate": 5.17771139624831e-05, | |
| "loss": 2.5544, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.537047150919352, | |
| "grad_norm": 5.978631973266602, | |
| "learning_rate": 5.14626799272799e-05, | |
| "loss": 2.5477, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.5388676497360276, | |
| "grad_norm": 7.41937255859375, | |
| "learning_rate": 5.114818798595457e-05, | |
| "loss": 2.537, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.5406881485527034, | |
| "grad_norm": 6.325901985168457, | |
| "learning_rate": 5.083365058894717e-05, | |
| "loss": 2.5219, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.5425086473693792, | |
| "grad_norm": 5.593471050262451, | |
| "learning_rate": 5.051908018849729e-05, | |
| "loss": 2.5031, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.544329146186055, | |
| "grad_norm": 6.014733791351318, | |
| "learning_rate": 5.020448923815115e-05, | |
| "loss": 2.5559, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.5461496450027308, | |
| "grad_norm": 5.821467399597168, | |
| "learning_rate": 4.988989019226846e-05, | |
| "loss": 2.5543, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5461496450027308, | |
| "eval_loss": 2.542562246322632, | |
| "eval_runtime": 1012.6169, | |
| "eval_samples_per_second": 9.644, | |
| "eval_steps_per_second": 1.206, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5479701438194066, | |
| "grad_norm": 5.459403991699219, | |
| "learning_rate": 4.9575295505529476e-05, | |
| "loss": 2.5296, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.5497906426360822, | |
| "grad_norm": 5.517323970794678, | |
| "learning_rate": 4.926071763244182e-05, | |
| "loss": 2.5244, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.551611141452758, | |
| "grad_norm": 6.594572067260742, | |
| "learning_rate": 4.894616902684755e-05, | |
| "loss": 2.5305, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.5534316402694338, | |
| "grad_norm": 5.410883903503418, | |
| "learning_rate": 4.8631662141429965e-05, | |
| "loss": 2.52, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.5552521390861096, | |
| "grad_norm": 6.53268575668335, | |
| "learning_rate": 4.8317209427220776e-05, | |
| "loss": 2.552, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.5570726379027854, | |
| "grad_norm": 5.973785877227783, | |
| "learning_rate": 4.8002823333107094e-05, | |
| "loss": 2.5089, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.5588931367194612, | |
| "grad_norm": 5.9094038009643555, | |
| "learning_rate": 4.768851630533858e-05, | |
| "loss": 2.5641, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.560713635536137, | |
| "grad_norm": 6.263980865478516, | |
| "learning_rate": 4.737430078703473e-05, | |
| "loss": 2.5573, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.5625341343528126, | |
| "grad_norm": 5.653293609619141, | |
| "learning_rate": 4.706018921769229e-05, | |
| "loss": 2.5162, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.5643546331694884, | |
| "grad_norm": 5.543784141540527, | |
| "learning_rate": 4.674619403269275e-05, | |
| "loss": 2.5121, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5661751319861642, | |
| "grad_norm": 5.531651020050049, | |
| "learning_rate": 4.643232766281003e-05, | |
| "loss": 2.5214, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.56799563080284, | |
| "grad_norm": 6.792632579803467, | |
| "learning_rate": 4.6118602533718457e-05, | |
| "loss": 2.5064, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5698161296195158, | |
| "grad_norm": 5.586999416351318, | |
| "learning_rate": 4.580503106550069e-05, | |
| "loss": 2.5475, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.5716366284361916, | |
| "grad_norm": 5.922962188720703, | |
| "learning_rate": 4.549162567215612e-05, | |
| "loss": 2.4942, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.5734571272528672, | |
| "grad_norm": 6.408661842346191, | |
| "learning_rate": 4.517839876110942e-05, | |
| "loss": 2.5143, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.575277626069543, | |
| "grad_norm": 6.490014553070068, | |
| "learning_rate": 4.4865362732719266e-05, | |
| "loss": 2.524, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.5770981248862188, | |
| "grad_norm": 7.0241217613220215, | |
| "learning_rate": 4.45525299797875e-05, | |
| "loss": 2.5019, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.5789186237028946, | |
| "grad_norm": 5.701231002807617, | |
| "learning_rate": 4.423991288706851e-05, | |
| "loss": 2.5243, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.5807391225195704, | |
| "grad_norm": 5.697451591491699, | |
| "learning_rate": 4.392752383077883e-05, | |
| "loss": 2.5121, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.5825596213362462, | |
| "grad_norm": 6.233221530914307, | |
| "learning_rate": 4.3615375178107306e-05, | |
| "loss": 2.5233, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.584380120152922, | |
| "grad_norm": 5.627314567565918, | |
| "learning_rate": 4.330347928672538e-05, | |
| "loss": 2.4944, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.5862006189695976, | |
| "grad_norm": 5.6860737800598145, | |
| "learning_rate": 4.299184850429795e-05, | |
| "loss": 2.5099, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.5880211177862734, | |
| "grad_norm": 6.200436592102051, | |
| "learning_rate": 4.26804951679945e-05, | |
| "loss": 2.5251, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.5898416166029492, | |
| "grad_norm": 6.162016868591309, | |
| "learning_rate": 4.2369431604000654e-05, | |
| "loss": 2.5156, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.591662115419625, | |
| "grad_norm": 5.810561656951904, | |
| "learning_rate": 4.205867012703025e-05, | |
| "loss": 2.5343, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5934826142363008, | |
| "grad_norm": 5.519481182098389, | |
| "learning_rate": 4.174822303983779e-05, | |
| "loss": 2.4742, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.5953031130529766, | |
| "grad_norm": 5.590371131896973, | |
| "learning_rate": 4.1438102632731346e-05, | |
| "loss": 2.5091, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.5971236118696522, | |
| "grad_norm": 5.302878379821777, | |
| "learning_rate": 4.1128321183086065e-05, | |
| "loss": 2.5024, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.598944110686328, | |
| "grad_norm": 5.2954864501953125, | |
| "learning_rate": 4.081889095485806e-05, | |
| "loss": 2.5304, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.6007646095030038, | |
| "grad_norm": 6.41418981552124, | |
| "learning_rate": 4.050982419809895e-05, | |
| "loss": 2.482, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.6025851083196796, | |
| "grad_norm": 6.165164947509766, | |
| "learning_rate": 4.020113314847082e-05, | |
| "loss": 2.5042, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.6044056071363554, | |
| "grad_norm": 5.556238651275635, | |
| "learning_rate": 3.989283002676193e-05, | |
| "loss": 2.4748, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.6062261059530312, | |
| "grad_norm": 5.898430824279785, | |
| "learning_rate": 3.958492703840276e-05, | |
| "loss": 2.5103, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.6080466047697068, | |
| "grad_norm": 6.131360054016113, | |
| "learning_rate": 3.9277436372982945e-05, | |
| "loss": 2.5023, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.6098671035863826, | |
| "grad_norm": 6.646694660186768, | |
| "learning_rate": 3.8970370203768595e-05, | |
| "loss": 2.5278, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.6116876024030584, | |
| "grad_norm": 5.293123245239258, | |
| "learning_rate": 3.8663740687220466e-05, | |
| "loss": 2.5339, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.6135081012197342, | |
| "grad_norm": 5.64793586730957, | |
| "learning_rate": 3.835755996251261e-05, | |
| "loss": 2.4955, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.61532860003641, | |
| "grad_norm": 5.227383613586426, | |
| "learning_rate": 3.805184015105182e-05, | |
| "loss": 2.4942, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.6171490988530858, | |
| "grad_norm": 6.251855850219727, | |
| "learning_rate": 3.774659335599783e-05, | |
| "loss": 2.4941, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.6189695976697616, | |
| "grad_norm": 6.169816970825195, | |
| "learning_rate": 3.744183166178405e-05, | |
| "loss": 2.5091, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.6207900964864372, | |
| "grad_norm": 5.742128372192383, | |
| "learning_rate": 3.71375671336392e-05, | |
| "loss": 2.4775, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.622610595303113, | |
| "grad_norm": 5.501997470855713, | |
| "learning_rate": 3.683381181710969e-05, | |
| "loss": 2.5165, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.6244310941197888, | |
| "grad_norm": 5.634415149688721, | |
| "learning_rate": 3.653057773758268e-05, | |
| "loss": 2.5137, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.6262515929364646, | |
| "grad_norm": 6.159447193145752, | |
| "learning_rate": 3.622787689981009e-05, | |
| "loss": 2.5118, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.6280720917531404, | |
| "grad_norm": 5.9222307205200195, | |
| "learning_rate": 3.5925721287433304e-05, | |
| "loss": 2.5158, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.6298925905698162, | |
| "grad_norm": 5.72164249420166, | |
| "learning_rate": 3.5624122862508724e-05, | |
| "loss": 2.4573, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.6317130893864918, | |
| "grad_norm": 5.604609489440918, | |
| "learning_rate": 3.5323093565034213e-05, | |
| "loss": 2.4427, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.6335335882031676, | |
| "grad_norm": 6.2562336921691895, | |
| "learning_rate": 3.502264531247644e-05, | |
| "loss": 2.5276, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.6353540870198434, | |
| "grad_norm": 5.994789123535156, | |
| "learning_rate": 3.4722789999299034e-05, | |
| "loss": 2.5015, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.6371745858365192, | |
| "grad_norm": 7.051544189453125, | |
| "learning_rate": 3.442353949649173e-05, | |
| "loss": 2.4955, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.638995084653195, | |
| "grad_norm": 5.279881000518799, | |
| "learning_rate": 3.412490565110034e-05, | |
| "loss": 2.5066, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.6408155834698708, | |
| "grad_norm": 5.663167476654053, | |
| "learning_rate": 3.382690028575789e-05, | |
| "loss": 2.4843, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.6426360822865466, | |
| "grad_norm": 6.197967052459717, | |
| "learning_rate": 3.352953519821637e-05, | |
| "loss": 2.4439, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.6444565811032222, | |
| "grad_norm": 6.703118324279785, | |
| "learning_rate": 3.3232822160879825e-05, | |
| "loss": 2.4902, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.646277079919898, | |
| "grad_norm": 6.075878143310547, | |
| "learning_rate": 3.2936772920338244e-05, | |
| "loss": 2.4375, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.6480975787365738, | |
| "grad_norm": 5.913654327392578, | |
| "learning_rate": 3.2641399196902505e-05, | |
| "loss": 2.4575, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.6499180775532496, | |
| "grad_norm": 5.623748779296875, | |
| "learning_rate": 3.234671268414041e-05, | |
| "loss": 2.5078, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.6517385763699254, | |
| "grad_norm": 4.9896745681762695, | |
| "learning_rate": 3.2052725048413734e-05, | |
| "loss": 2.461, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.6535590751866012, | |
| "grad_norm": 5.503573417663574, | |
| "learning_rate": 3.175944792841639e-05, | |
| "loss": 2.4964, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.6553795740032768, | |
| "grad_norm": 5.610907077789307, | |
| "learning_rate": 3.146689293471362e-05, | |
| "loss": 2.496, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.6572000728199526, | |
| "grad_norm": 5.222755432128906, | |
| "learning_rate": 3.117507164928235e-05, | |
| "loss": 2.4587, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.6590205716366284, | |
| "grad_norm": 5.29688024520874, | |
| "learning_rate": 3.0883995625052735e-05, | |
| "loss": 2.4976, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.6608410704533042, | |
| "grad_norm": 5.991846084594727, | |
| "learning_rate": 3.059367638545069e-05, | |
| "loss": 2.4668, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.66266156926998, | |
| "grad_norm": 5.622383117675781, | |
| "learning_rate": 3.030412542394176e-05, | |
| "loss": 2.4624, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.6644820680866558, | |
| "grad_norm": 5.628267288208008, | |
| "learning_rate": 3.001535420357607e-05, | |
| "loss": 2.4497, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.6663025669033316, | |
| "grad_norm": 5.707132339477539, | |
| "learning_rate": 2.972737415653456e-05, | |
| "loss": 2.4921, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.6681230657200072, | |
| "grad_norm": 6.638173580169678, | |
| "learning_rate": 2.9440196683676337e-05, | |
| "loss": 2.4978, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.669943564536683, | |
| "grad_norm": 6.14837121963501, | |
| "learning_rate": 2.915383315408736e-05, | |
| "loss": 2.4597, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.6717640633533588, | |
| "grad_norm": 5.452149391174316, | |
| "learning_rate": 2.8868294904630333e-05, | |
| "loss": 2.4789, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.6735845621700346, | |
| "grad_norm": 5.278966903686523, | |
| "learning_rate": 2.8583593239495875e-05, | |
| "loss": 2.4435, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6754050609867104, | |
| "grad_norm": 5.96103048324585, | |
| "learning_rate": 2.8299739429755057e-05, | |
| "loss": 2.4103, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.6772255598033862, | |
| "grad_norm": 5.705805778503418, | |
| "learning_rate": 2.8016744712913164e-05, | |
| "loss": 2.494, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.6790460586200618, | |
| "grad_norm": 6.212954044342041, | |
| "learning_rate": 2.773462029246475e-05, | |
| "loss": 2.4419, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.6808665574367376, | |
| "grad_norm": 5.658579349517822, | |
| "learning_rate": 2.7453377337450182e-05, | |
| "loss": 2.4648, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.6826870562534134, | |
| "grad_norm": 6.050464630126953, | |
| "learning_rate": 2.7173026982013417e-05, | |
| "loss": 2.4964, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6845075550700892, | |
| "grad_norm": 5.388469219207764, | |
| "learning_rate": 2.689358032496129e-05, | |
| "loss": 2.4463, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.686328053886765, | |
| "grad_norm": 5.987942695617676, | |
| "learning_rate": 2.661504842932402e-05, | |
| "loss": 2.4777, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.6881485527034408, | |
| "grad_norm": 20.632400512695312, | |
| "learning_rate": 2.633744232191726e-05, | |
| "loss": 2.4645, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.6899690515201166, | |
| "grad_norm": 5.323004245758057, | |
| "learning_rate": 2.6060772992905647e-05, | |
| "loss": 2.4832, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.6917895503367922, | |
| "grad_norm": 5.565816402435303, | |
| "learning_rate": 2.578505139536762e-05, | |
| "loss": 2.4677, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.693610049153468, | |
| "grad_norm": 6.668233394622803, | |
| "learning_rate": 2.5510288444861784e-05, | |
| "loss": 2.4372, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.6954305479701438, | |
| "grad_norm": 6.057459354400635, | |
| "learning_rate": 2.5236495018994844e-05, | |
| "loss": 2.4625, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.6972510467868196, | |
| "grad_norm": 5.716314315795898, | |
| "learning_rate": 2.4963681956990896e-05, | |
| "loss": 2.4582, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.6990715456034954, | |
| "grad_norm": 5.098658561706543, | |
| "learning_rate": 2.4691860059262427e-05, | |
| "loss": 2.449, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.7008920444201712, | |
| "grad_norm": 6.102919578552246, | |
| "learning_rate": 2.4421040086982587e-05, | |
| "loss": 2.4678, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.7027125432368468, | |
| "grad_norm": 6.12929630279541, | |
| "learning_rate": 2.4151232761659305e-05, | |
| "loss": 2.4162, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.7045330420535226, | |
| "grad_norm": 5.601887226104736, | |
| "learning_rate": 2.3882448764710698e-05, | |
| "loss": 2.4455, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.7063535408701984, | |
| "grad_norm": 5.384401798248291, | |
| "learning_rate": 2.3614698737042355e-05, | |
| "loss": 2.4618, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.7081740396868742, | |
| "grad_norm": 5.636379718780518, | |
| "learning_rate": 2.3347993278625933e-05, | |
| "loss": 2.43, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.70999453850355, | |
| "grad_norm": 6.716041088104248, | |
| "learning_rate": 2.3082342948079606e-05, | |
| "loss": 2.4177, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.7118150373202258, | |
| "grad_norm": 4.779394149780273, | |
| "learning_rate": 2.2817758262249988e-05, | |
| "loss": 2.4502, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.7136355361369016, | |
| "grad_norm": 5.225165843963623, | |
| "learning_rate": 2.2554249695795878e-05, | |
| "loss": 2.4954, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.7154560349535772, | |
| "grad_norm": 5.962668418884277, | |
| "learning_rate": 2.2291827680773508e-05, | |
| "loss": 2.4367, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.717276533770253, | |
| "grad_norm": 5.683213233947754, | |
| "learning_rate": 2.2030502606223534e-05, | |
| "loss": 2.4969, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.7190970325869288, | |
| "grad_norm": 5.069901466369629, | |
| "learning_rate": 2.1770284817759767e-05, | |
| "loss": 2.4462, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.7209175314036046, | |
| "grad_norm": 5.872584819793701, | |
| "learning_rate": 2.1511184617159652e-05, | |
| "loss": 2.4353, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.7227380302202804, | |
| "grad_norm": 5.787328720092773, | |
| "learning_rate": 2.125321226195634e-05, | |
| "loss": 2.4174, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.7245585290369562, | |
| "grad_norm": 8.6191987991333, | |
| "learning_rate": 2.0996377965032638e-05, | |
| "loss": 2.4482, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.7263790278536318, | |
| "grad_norm": 5.412010192871094, | |
| "learning_rate": 2.074069189421673e-05, | |
| "loss": 2.4207, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.7281995266703076, | |
| "grad_norm": 5.450544834136963, | |
| "learning_rate": 2.0486164171879613e-05, | |
| "loss": 2.4624, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7281995266703076, | |
| "eval_loss": 2.449294328689575, | |
| "eval_runtime": 1012.1688, | |
| "eval_samples_per_second": 9.649, | |
| "eval_steps_per_second": 1.206, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7300200254869834, | |
| "grad_norm": 5.740320205688477, | |
| "learning_rate": 2.0232804874534313e-05, | |
| "loss": 2.4582, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.7318405243036592, | |
| "grad_norm": 5.987521171569824, | |
| "learning_rate": 1.998062403243704e-05, | |
| "loss": 2.4499, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.733661023120335, | |
| "grad_norm": 5.337474346160889, | |
| "learning_rate": 1.9729631629190042e-05, | |
| "loss": 2.4598, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.7354815219370108, | |
| "grad_norm": 5.655992031097412, | |
| "learning_rate": 1.9479837601346457e-05, | |
| "loss": 2.4601, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.7373020207536866, | |
| "grad_norm": 5.7331414222717285, | |
| "learning_rate": 1.923125183801678e-05, | |
| "loss": 2.4445, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.7391225195703622, | |
| "grad_norm": 5.471503257751465, | |
| "learning_rate": 1.898388418047753e-05, | |
| "loss": 2.4683, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.740943018387038, | |
| "grad_norm": 5.409184455871582, | |
| "learning_rate": 1.87377444217815e-05, | |
| "loss": 2.4358, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.7427635172037138, | |
| "grad_norm": 5.579779148101807, | |
| "learning_rate": 1.8492842306370182e-05, | |
| "loss": 2.4989, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.7445840160203896, | |
| "grad_norm": 5.366626262664795, | |
| "learning_rate": 1.8249187529687895e-05, | |
| "loss": 2.4102, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.7464045148370654, | |
| "grad_norm": 5.263418674468994, | |
| "learning_rate": 1.8006789737797984e-05, | |
| "loss": 2.4573, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.7482250136537412, | |
| "grad_norm": 5.129177570343018, | |
| "learning_rate": 1.7765658527000966e-05, | |
| "loss": 2.4792, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.7500455124704168, | |
| "grad_norm": 6.237401962280273, | |
| "learning_rate": 1.7525803443454615e-05, | |
| "loss": 2.479, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.7518660112870926, | |
| "grad_norm": 6.163425445556641, | |
| "learning_rate": 1.728723398279603e-05, | |
| "loss": 2.4222, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.7536865101037684, | |
| "grad_norm": 5.254932403564453, | |
| "learning_rate": 1.7049959589765686e-05, | |
| "loss": 2.4307, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.7555070089204442, | |
| "grad_norm": 6.144068717956543, | |
| "learning_rate": 1.6813989657833534e-05, | |
| "loss": 2.4923, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.75732750773712, | |
| "grad_norm": 5.038397789001465, | |
| "learning_rate": 1.6579333528827205e-05, | |
| "loss": 2.4732, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.7591480065537958, | |
| "grad_norm": 5.2848076820373535, | |
| "learning_rate": 1.634600049256204e-05, | |
| "loss": 2.4651, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.7609685053704716, | |
| "grad_norm": 5.274468898773193, | |
| "learning_rate": 1.611399978647342e-05, | |
| "loss": 2.4407, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.7627890041871472, | |
| "grad_norm": 5.039272308349609, | |
| "learning_rate": 1.588334059525099e-05, | |
| "loss": 2.4892, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.764609503003823, | |
| "grad_norm": 5.7419867515563965, | |
| "learning_rate": 1.5654032050475138e-05, | |
| "loss": 2.4456, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7664300018204988, | |
| "grad_norm": 5.30146598815918, | |
| "learning_rate": 1.5426083230255405e-05, | |
| "loss": 2.4515, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.7682505006371746, | |
| "grad_norm": 4.977199077606201, | |
| "learning_rate": 1.5199503158871115e-05, | |
| "loss": 2.4317, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.7700709994538504, | |
| "grad_norm": 5.318095684051514, | |
| "learning_rate": 1.4974300806414082e-05, | |
| "loss": 2.403, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.7718914982705262, | |
| "grad_norm": 5.638497352600098, | |
| "learning_rate": 1.4750485088433592e-05, | |
| "loss": 2.4327, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.7737119970872018, | |
| "grad_norm": 5.739340305328369, | |
| "learning_rate": 1.4528064865583301e-05, | |
| "loss": 2.4266, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.7755324959038776, | |
| "grad_norm": 4.749205112457275, | |
| "learning_rate": 1.4307048943270606e-05, | |
| "loss": 2.4136, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.7773529947205534, | |
| "grad_norm": 5.616302490234375, | |
| "learning_rate": 1.4087446071307903e-05, | |
| "loss": 2.4197, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.7791734935372292, | |
| "grad_norm": 5.402510643005371, | |
| "learning_rate": 1.3869264943566263e-05, | |
| "loss": 2.4194, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.780993992353905, | |
| "grad_norm": 5.278769493103027, | |
| "learning_rate": 1.3652514197631277e-05, | |
| "loss": 2.4351, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.7828144911705808, | |
| "grad_norm": 6.828596115112305, | |
| "learning_rate": 1.343720241446103e-05, | |
| "loss": 2.3813, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.7846349899872566, | |
| "grad_norm": 5.306332588195801, | |
| "learning_rate": 1.322333811804643e-05, | |
| "loss": 2.4133, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.7864554888039322, | |
| "grad_norm": 5.437227249145508, | |
| "learning_rate": 1.3010929775073765e-05, | |
| "loss": 2.4166, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.788275987620608, | |
| "grad_norm": 5.493254661560059, | |
| "learning_rate": 1.2799985794589497e-05, | |
| "loss": 2.3842, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.7900964864372838, | |
| "grad_norm": 5.259057521820068, | |
| "learning_rate": 1.2590514527667336e-05, | |
| "loss": 2.3783, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.7919169852539596, | |
| "grad_norm": 5.750987529754639, | |
| "learning_rate": 1.2382524267077645e-05, | |
| "loss": 2.4202, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.7937374840706354, | |
| "grad_norm": 4.952456951141357, | |
| "learning_rate": 1.2176023246959133e-05, | |
| "loss": 2.4393, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.7955579828873112, | |
| "grad_norm": 5.3008713722229, | |
| "learning_rate": 1.1971019642492942e-05, | |
| "loss": 2.375, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.7973784817039868, | |
| "grad_norm": 4.872366428375244, | |
| "learning_rate": 1.176752156957886e-05, | |
| "loss": 2.4257, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.7991989805206626, | |
| "grad_norm": 5.488797664642334, | |
| "learning_rate": 1.1565537084514123e-05, | |
| "loss": 2.4424, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.8010194793373384, | |
| "grad_norm": 5.145867824554443, | |
| "learning_rate": 1.1365074183674468e-05, | |
| "loss": 2.4806, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.8028399781540142, | |
| "grad_norm": 5.343238353729248, | |
| "learning_rate": 1.116614080319754e-05, | |
| "loss": 2.4321, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.80466047697069, | |
| "grad_norm": 5.240965366363525, | |
| "learning_rate": 1.0968744818668691e-05, | |
| "loss": 2.4358, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.8064809757873658, | |
| "grad_norm": 5.5220513343811035, | |
| "learning_rate": 1.0772894044809229e-05, | |
| "loss": 2.442, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.8083014746040416, | |
| "grad_norm": 4.8629045486450195, | |
| "learning_rate": 1.0578596235166998e-05, | |
| "loss": 2.4567, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.8101219734207172, | |
| "grad_norm": 5.297680854797363, | |
| "learning_rate": 1.0385859081809508e-05, | |
| "loss": 2.4544, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.811942472237393, | |
| "grad_norm": 5.134615898132324, | |
| "learning_rate": 1.0194690215019292e-05, | |
| "loss": 2.4656, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.8137629710540688, | |
| "grad_norm": 5.012113571166992, | |
| "learning_rate": 1.0005097202991948e-05, | |
| "loss": 2.382, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.8155834698707446, | |
| "grad_norm": 5.369142532348633, | |
| "learning_rate": 9.817087551536414e-06, | |
| "loss": 2.4584, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.8174039686874204, | |
| "grad_norm": 5.545107841491699, | |
| "learning_rate": 9.630668703777922e-06, | |
| "loss": 2.4013, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.8192244675040962, | |
| "grad_norm": 4.933434963226318, | |
| "learning_rate": 9.445848039863252e-06, | |
| "loss": 2.4516, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.8210449663207718, | |
| "grad_norm": 4.916785717010498, | |
| "learning_rate": 9.262632876668591e-06, | |
| "loss": 2.4555, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.8228654651374476, | |
| "grad_norm": 5.11759090423584, | |
| "learning_rate": 9.08103046750986e-06, | |
| "loss": 2.447, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.8246859639541234, | |
| "grad_norm": 5.081522464752197, | |
| "learning_rate": 8.901048001855583e-06, | |
| "loss": 2.4004, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.8265064627707992, | |
| "grad_norm": 5.203310489654541, | |
| "learning_rate": 8.722692605042248e-06, | |
| "loss": 2.4237, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.828326961587475, | |
| "grad_norm": 5.090500831604004, | |
| "learning_rate": 8.545971337992197e-06, | |
| "loss": 2.4342, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.8301474604041508, | |
| "grad_norm": 5.330081462860107, | |
| "learning_rate": 8.37089119693411e-06, | |
| "loss": 2.3922, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.8319679592208266, | |
| "grad_norm": 5.339773178100586, | |
| "learning_rate": 8.197459113126067e-06, | |
| "loss": 2.4342, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.8337884580375022, | |
| "grad_norm": 5.109127044677734, | |
| "learning_rate": 8.02568195258107e-06, | |
| "loss": 2.4207, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.835608956854178, | |
| "grad_norm": 5.223607540130615, | |
| "learning_rate": 7.855566515795282e-06, | |
| "loss": 2.383, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.8374294556708538, | |
| "grad_norm": 5.024397373199463, | |
| "learning_rate": 7.687119537478799e-06, | |
| "loss": 2.4197, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.8392499544875296, | |
| "grad_norm": 5.112728595733643, | |
| "learning_rate": 7.52034768628902e-06, | |
| "loss": 2.4399, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.8410704533042054, | |
| "grad_norm": 5.149270057678223, | |
| "learning_rate": 7.3552575645666036e-06, | |
| "loss": 2.417, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.8428909521208812, | |
| "grad_norm": 5.0890350341796875, | |
| "learning_rate": 7.191855708074152e-06, | |
| "loss": 2.4217, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.8447114509375568, | |
| "grad_norm": 5.196211338043213, | |
| "learning_rate": 7.030148585737406e-06, | |
| "loss": 2.4351, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.8465319497542326, | |
| "grad_norm": 5.102319717407227, | |
| "learning_rate": 6.870142599389217e-06, | |
| "loss": 2.4397, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.8483524485709084, | |
| "grad_norm": 5.2318220138549805, | |
| "learning_rate": 6.711844083516022e-06, | |
| "loss": 2.3708, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.8501729473875842, | |
| "grad_norm": 5.27686071395874, | |
| "learning_rate": 6.555259305007139e-06, | |
| "loss": 2.4018, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.85199344620426, | |
| "grad_norm": 5.138775825500488, | |
| "learning_rate": 6.400394462906612e-06, | |
| "loss": 2.4244, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.8538139450209358, | |
| "grad_norm": 4.929832935333252, | |
| "learning_rate": 6.247255688167852e-06, | |
| "loss": 2.3637, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.8556344438376116, | |
| "grad_norm": 5.328685283660889, | |
| "learning_rate": 6.09584904341085e-06, | |
| "loss": 2.4037, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.8574549426542872, | |
| "grad_norm": 4.969110012054443, | |
| "learning_rate": 5.946180522682227e-06, | |
| "loss": 2.4054, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.859275441470963, | |
| "grad_norm": 4.729069232940674, | |
| "learning_rate": 5.798256051217882e-06, | |
| "loss": 2.419, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.8610959402876388, | |
| "grad_norm": 4.847239971160889, | |
| "learning_rate": 5.652081485208482e-06, | |
| "loss": 2.4067, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.8629164391043146, | |
| "grad_norm": 4.862872123718262, | |
| "learning_rate": 5.507662611567565e-06, | |
| "loss": 2.4237, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.8647369379209904, | |
| "grad_norm": 4.765954971313477, | |
| "learning_rate": 5.365005147702462e-06, | |
| "loss": 2.3841, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.8665574367376662, | |
| "grad_norm": 5.191616535186768, | |
| "learning_rate": 5.224114741287922e-06, | |
| "loss": 2.4473, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.8683779355543418, | |
| "grad_norm": 5.6387619972229, | |
| "learning_rate": 5.084996970042599e-06, | |
| "loss": 2.3946, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.8701984343710176, | |
| "grad_norm": 4.992214202880859, | |
| "learning_rate": 4.947657341508166e-06, | |
| "loss": 2.4029, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.8720189331876934, | |
| "grad_norm": 4.994503021240234, | |
| "learning_rate": 4.812101292831283e-06, | |
| "loss": 2.4212, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.8738394320043692, | |
| "grad_norm": 5.3045220375061035, | |
| "learning_rate": 4.678334190548378e-06, | |
| "loss": 2.4025, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.875659930821045, | |
| "grad_norm": 5.20510196685791, | |
| "learning_rate": 4.546361330373178e-06, | |
| "loss": 2.4087, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.8774804296377208, | |
| "grad_norm": 5.151695728302002, | |
| "learning_rate": 4.41618793698706e-06, | |
| "loss": 2.3748, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.8793009284543966, | |
| "grad_norm": 5.292723178863525, | |
| "learning_rate": 4.287819163832179e-06, | |
| "loss": 2.4068, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.8811214272710722, | |
| "grad_norm": 5.057366847991943, | |
| "learning_rate": 4.161260092907476e-06, | |
| "loss": 2.4191, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.882941926087748, | |
| "grad_norm": 5.375776290893555, | |
| "learning_rate": 4.0365157345675255e-06, | |
| "loss": 2.4085, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.8847624249044238, | |
| "grad_norm": 5.110659599304199, | |
| "learning_rate": 3.91359102732407e-06, | |
| "loss": 2.3547, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.8865829237210996, | |
| "grad_norm": 5.154110908508301, | |
| "learning_rate": 3.792490837650642e-06, | |
| "loss": 2.3731, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.8884034225377754, | |
| "grad_norm": 4.55220365524292, | |
| "learning_rate": 3.673219959789803e-06, | |
| "loss": 2.3956, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.8902239213544512, | |
| "grad_norm": 5.095583915710449, | |
| "learning_rate": 3.5557831155633715e-06, | |
| "loss": 2.4077, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.8920444201711268, | |
| "grad_norm": 5.244335174560547, | |
| "learning_rate": 3.4401849541855493e-06, | |
| "loss": 2.4164, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.8938649189878026, | |
| "grad_norm": 5.595185279846191, | |
| "learning_rate": 3.3264300520787607e-06, | |
| "loss": 2.4055, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.8956854178044784, | |
| "grad_norm": 5.167870044708252, | |
| "learning_rate": 3.214522912692547e-06, | |
| "loss": 2.4064, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.8975059166211542, | |
| "grad_norm": 4.97356653213501, | |
| "learning_rate": 3.1044679663252807e-06, | |
| "loss": 2.4249, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.89932641543783, | |
| "grad_norm": 5.116016387939453, | |
| "learning_rate": 2.996269569948745e-06, | |
| "loss": 2.3617, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.9011469142545058, | |
| "grad_norm": 5.192502498626709, | |
| "learning_rate": 2.889932007035645e-06, | |
| "loss": 2.3889, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.9029674130711816, | |
| "grad_norm": 4.694880485534668, | |
| "learning_rate": 2.7854594873900463e-06, | |
| "loss": 2.4309, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.9047879118878572, | |
| "grad_norm": 4.840787887573242, | |
| "learning_rate": 2.6828561469807e-06, | |
| "loss": 2.4412, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.906608410704533, | |
| "grad_norm": 5.104063987731934, | |
| "learning_rate": 2.582126047777328e-06, | |
| "loss": 2.4295, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.9084289095212088, | |
| "grad_norm": 4.761752605438232, | |
| "learning_rate": 2.4832731775897844e-06, | |
| "loss": 2.4125, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.9102494083378846, | |
| "grad_norm": 4.580504894256592, | |
| "learning_rate": 2.3863014499101775e-06, | |
| "loss": 2.4762, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.9102494083378846, | |
| "eval_loss": 2.4089949131011963, | |
| "eval_runtime": 1012.4809, | |
| "eval_samples_per_second": 9.646, | |
| "eval_steps_per_second": 1.206, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.9120699071545604, | |
| "grad_norm": 4.849244117736816, | |
| "learning_rate": 2.291214703757982e-06, | |
| "loss": 2.3958, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.9138904059712362, | |
| "grad_norm": 4.8128204345703125, | |
| "learning_rate": 2.1980167035280163e-06, | |
| "loss": 2.4288, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.9157109047879118, | |
| "grad_norm": 5.573403835296631, | |
| "learning_rate": 2.1067111388414163e-06, | |
| "loss": 2.4134, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.9175314036045876, | |
| "grad_norm": 4.9487504959106445, | |
| "learning_rate": 2.0173016243995866e-06, | |
| "loss": 2.4095, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.9193519024212634, | |
| "grad_norm": 4.933927536010742, | |
| "learning_rate": 1.929791699841066e-06, | |
| "loss": 2.4014, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.9211724012379392, | |
| "grad_norm": 5.116062641143799, | |
| "learning_rate": 1.844184829601453e-06, | |
| "loss": 2.4196, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.922992900054615, | |
| "grad_norm": 4.888516902923584, | |
| "learning_rate": 1.7604844027761802e-06, | |
| "loss": 2.4418, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.9248133988712908, | |
| "grad_norm": 4.990447998046875, | |
| "learning_rate": 1.6786937329864027e-06, | |
| "loss": 2.4049, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.9266338976879666, | |
| "grad_norm": 4.672518253326416, | |
| "learning_rate": 1.5988160582477818e-06, | |
| "loss": 2.3873, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.9284543965046422, | |
| "grad_norm": 5.029353618621826, | |
| "learning_rate": 1.5208545408423092e-06, | |
| "loss": 2.4754, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.930274895321318, | |
| "grad_norm": 4.660059928894043, | |
| "learning_rate": 1.444812267193102e-06, | |
| "loss": 2.4081, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.9320953941379938, | |
| "grad_norm": 5.001034259796143, | |
| "learning_rate": 1.3706922477422336e-06, | |
| "loss": 2.4014, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.9339158929546696, | |
| "grad_norm": 5.1275858879089355, | |
| "learning_rate": 1.2984974168315234e-06, | |
| "loss": 2.4251, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.9357363917713454, | |
| "grad_norm": 4.893324375152588, | |
| "learning_rate": 1.2282306325864135e-06, | |
| "loss": 2.4196, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.9375568905880212, | |
| "grad_norm": 4.734968662261963, | |
| "learning_rate": 1.1598946768027863e-06, | |
| "loss": 2.401, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.9393773894046968, | |
| "grad_norm": 4.66255521774292, | |
| "learning_rate": 1.0934922548368254e-06, | |
| "loss": 2.3846, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.9411978882213726, | |
| "grad_norm": 4.771427631378174, | |
| "learning_rate": 1.0290259954979397e-06, | |
| "loss": 2.3953, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.9430183870380484, | |
| "grad_norm": 4.673166275024414, | |
| "learning_rate": 9.664984509446917e-07, | |
| "loss": 2.3694, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.9448388858547242, | |
| "grad_norm": 4.778134346008301, | |
| "learning_rate": 9.059120965837331e-07, | |
| "loss": 2.3948, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.9466593846714, | |
| "grad_norm": 4.706231594085693, | |
| "learning_rate": 8.472693309718283e-07, | |
| "loss": 2.4153, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.9484798834880758, | |
| "grad_norm": 4.645259380340576, | |
| "learning_rate": 7.905724757208965e-07, | |
| "loss": 2.3806, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.9503003823047516, | |
| "grad_norm": 5.04796838760376, | |
| "learning_rate": 7.358237754060915e-07, | |
| "loss": 2.454, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.9521208811214272, | |
| "grad_norm": 4.7881646156311035, | |
| "learning_rate": 6.830253974769496e-07, | |
| "loss": 2.4161, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.953941379938103, | |
| "grad_norm": 4.7254743576049805, | |
| "learning_rate": 6.321794321715757e-07, | |
| "loss": 2.4715, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.9557618787547788, | |
| "grad_norm": 5.13754415512085, | |
| "learning_rate": 5.832878924338869e-07, | |
| "loss": 2.4191, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.9575823775714546, | |
| "grad_norm": 4.781599998474121, | |
| "learning_rate": 5.363527138339597e-07, | |
| "loss": 2.4127, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.9594028763881304, | |
| "grad_norm": 4.541421413421631, | |
| "learning_rate": 4.913757544913355e-07, | |
| "loss": 2.3908, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.9612233752048062, | |
| "grad_norm": 5.078845500946045, | |
| "learning_rate": 4.4835879500153556e-07, | |
| "loss": 2.4303, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.9630438740214818, | |
| "grad_norm": 4.745322227478027, | |
| "learning_rate": 4.0730353836549993e-07, | |
| "loss": 2.4046, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.9648643728381576, | |
| "grad_norm": 4.688536643981934, | |
| "learning_rate": 3.6821160992221993e-07, | |
| "loss": 2.4456, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.9666848716548334, | |
| "grad_norm": 4.9088592529296875, | |
| "learning_rate": 3.310845572843557e-07, | |
| "loss": 2.3846, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.9685053704715092, | |
| "grad_norm": 5.126766681671143, | |
| "learning_rate": 2.959238502769912e-07, | |
| "loss": 2.4093, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.970325869288185, | |
| "grad_norm": 4.49152946472168, | |
| "learning_rate": 2.6273088087943597e-07, | |
| "loss": 2.3837, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.9721463681048608, | |
| "grad_norm": 4.944559097290039, | |
| "learning_rate": 2.315069631701139e-07, | |
| "loss": 2.3791, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.9739668669215366, | |
| "grad_norm": 4.91040563583374, | |
| "learning_rate": 2.022533332745602e-07, | |
| "loss": 2.4035, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.9757873657382122, | |
| "grad_norm": 4.91538143157959, | |
| "learning_rate": 1.7497114931644965e-07, | |
| "loss": 2.4057, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.977607864554888, | |
| "grad_norm": 5.63076114654541, | |
| "learning_rate": 1.496614913717831e-07, | |
| "loss": 2.3627, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.9794283633715638, | |
| "grad_norm": 4.944591045379639, | |
| "learning_rate": 1.2632536142609397e-07, | |
| "loss": 2.3662, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.9812488621882396, | |
| "grad_norm": 4.864638328552246, | |
| "learning_rate": 1.0496368333482442e-07, | |
| "loss": 2.3704, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.9830693610049154, | |
| "grad_norm": 4.991931438446045, | |
| "learning_rate": 8.557730278669906e-08, | |
| "loss": 2.3767, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.9848898598215912, | |
| "grad_norm": 4.382468223571777, | |
| "learning_rate": 6.816698727029614e-08, | |
| "loss": 2.4112, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.9867103586382668, | |
| "grad_norm": 44.841453552246094, | |
| "learning_rate": 5.273342604361631e-08, | |
| "loss": 2.4092, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.9885308574549426, | |
| "grad_norm": 4.815988063812256, | |
| "learning_rate": 3.9277230106832264e-08, | |
| "loss": 2.4256, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.9903513562716184, | |
| "grad_norm": 4.87392520904541, | |
| "learning_rate": 2.7798932178080274e-08, | |
| "loss": 2.3936, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.9921718550882942, | |
| "grad_norm": 5.1465559005737305, | |
| "learning_rate": 1.829898667237151e-08, | |
| "loss": 2.3805, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.99399235390497, | |
| "grad_norm": 4.486802101135254, | |
| "learning_rate": 1.0777769683617544e-08, | |
| "loss": 2.3492, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.9958128527216458, | |
| "grad_norm": 5.0049614906311035, | |
| "learning_rate": 5.2355789697144945e-09, | |
| "loss": 2.4414, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.9976333515383216, | |
| "grad_norm": 4.7070441246032715, | |
| "learning_rate": 1.6726339407857616e-09, | |
| "loss": 2.4294, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.9994538503549972, | |
| "grad_norm": 4.9832539558410645, | |
| "learning_rate": 8.907565046678557e-11, | |
| "loss": 2.3724, | |
| "step": 5490 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5493, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2843428615741768e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |