diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,10498 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.956126096847579,
+  "eval_steps": 500,
+  "global_step": 30500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025071034598027745,
+      "grad_norm": 0.09887482225894928,
+      "learning_rate": 3e-05,
+      "loss": 1.1694,
+      "step": 15
+    },
+    {
+      "epoch": 0.005014206919605549,
+      "grad_norm": 0.04257430136203766,
+      "learning_rate": 6e-05,
+      "loss": 1.0648,
+      "step": 30
+    },
+    {
+      "epoch": 0.007521310379408324,
+      "grad_norm": 0.025177787989377975,
+      "learning_rate": 9e-05,
+      "loss": 0.9901,
+      "step": 45
+    },
+    {
+      "epoch": 0.010028413839211098,
+      "grad_norm": 0.014405222609639168,
+      "learning_rate": 0.00012,
+      "loss": 0.9677,
+      "step": 60
+    },
+    {
+      "epoch": 0.012535517299013872,
+      "grad_norm": 0.012216474860906601,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.9271,
+      "step": 75
+    },
+    {
+      "epoch": 0.015042620758816648,
+      "grad_norm": 0.01270276214927435,
+      "learning_rate": 0.00018,
+      "loss": 0.9087,
+      "step": 90
+    },
+    {
+      "epoch": 0.01754972421861942,
+      "grad_norm": 0.014931446872651577,
+      "learning_rate": 0.00019996645983565321,
+      "loss": 0.8911,
+      "step": 105
+    },
+    {
+      "epoch": 0.020056827678422196,
+      "grad_norm": 0.013434696942567825,
+      "learning_rate": 0.00019986583934261277,
+      "loss": 0.9009,
+      "step": 120
+    },
+    {
+      "epoch": 0.022563931138224972,
+      "grad_norm": 0.014926938340067863,
+      "learning_rate": 0.00019976521884957238,
+      "loss": 0.8827,
+      "step": 135
+    },
+    {
+      "epoch": 0.025071034598027744,
+      "grad_norm": 0.018700918182730675,
+      "learning_rate": 0.00019966459835653194,
+      "loss": 0.8815,
+      "step": 150
+    },
+    {
+      "epoch": 0.02757813805783052,
+      "grad_norm": 0.01647135801613331,
+      "learning_rate": 0.00019956397786349156,
+      "loss": 0.8762,
+      "step": 165
+    },
+    {
+      "epoch": 0.030085241517633296,
+      "grad_norm": 0.016942940652370453,
+      "learning_rate": 0.00019946335737045111,
+      "loss": 0.8648,
+      "step": 180
+    },
+    {
+      "epoch": 0.03259234497743607,
+      "grad_norm": 0.019231606274843216,
+      "learning_rate": 0.00019936273687741073,
+      "loss": 0.8567,
+      "step": 195
+    },
+    {
+      "epoch": 0.03509944843723884,
+      "grad_norm": 0.019707536324858665,
+      "learning_rate": 0.00019926211638437028,
+      "loss": 0.8568,
+      "step": 210
+    },
+    {
+      "epoch": 0.03760655189704162,
+      "grad_norm": 0.023225486278533936,
+      "learning_rate": 0.0001991614958913299,
+      "loss": 0.8507,
+      "step": 225
+    },
+    {
+      "epoch": 0.04011365535684439,
+      "grad_norm": 0.019466817378997803,
+      "learning_rate": 0.00019906087539828946,
+      "loss": 0.8559,
+      "step": 240
+    },
+    {
+      "epoch": 0.04262075881664717,
+      "grad_norm": 0.020322684198617935,
+      "learning_rate": 0.00019896025490524907,
+      "loss": 0.8411,
+      "step": 255
+    },
+    {
+      "epoch": 0.045127862276449944,
+      "grad_norm": 0.018744077533483505,
+      "learning_rate": 0.00019885963441220863,
+      "loss": 0.8587,
+      "step": 270
+    },
+    {
+      "epoch": 0.04763496573625271,
+      "grad_norm": 0.018990306183695793,
+      "learning_rate": 0.0001987590139191682,
+      "loss": 0.8434,
+      "step": 285
+    },
+    {
+      "epoch": 0.05014206919605549,
+      "grad_norm": 0.018513506278395653,
+      "learning_rate": 0.0001986583934261278,
+      "loss": 0.8573,
+      "step": 300
+    },
+    {
+      "epoch": 0.052649172655858265,
+      "grad_norm": 0.019561799243092537,
+      "learning_rate": 0.00019855777293308738,
+      "loss": 0.8459,
+      "step": 315
+    },
+    {
+      "epoch": 0.05515627611566104,
+      "grad_norm": 0.019172094762325287,
+      "learning_rate": 0.00019845715244004697,
+      "loss": 0.8653,
+      "step": 330
+    },
+    {
+      "epoch": 0.057663379575463816,
+      "grad_norm": 0.018002351745963097,
+      "learning_rate": 0.00019835653194700655,
+      "loss": 0.837,
+      "step": 345
+    },
+    {
+      "epoch": 0.06017048303526659,
+      "grad_norm": 0.01977609097957611,
+      "learning_rate": 0.00019825591145396614,
+      "loss": 0.8352,
+      "step": 360
+    },
+    {
+      "epoch": 0.06267758649506937,
+      "grad_norm": 0.019932597875595093,
+      "learning_rate": 0.0001981552909609257,
+      "loss": 0.8309,
+      "step": 375
+    },
+    {
+      "epoch": 0.06518468995487214,
+      "grad_norm": 0.01805214211344719,
+      "learning_rate": 0.0001980546704678853,
+      "loss": 0.8427,
+      "step": 390
+    },
+    {
+      "epoch": 0.0676917934146749,
+      "grad_norm": 0.018298575654625893,
+      "learning_rate": 0.00019795404997484487,
+      "loss": 0.8286,
+      "step": 405
+    },
+    {
+      "epoch": 0.07019889687447768,
+      "grad_norm": 0.01844840496778488,
+      "learning_rate": 0.00019785342948180448,
+      "loss": 0.8455,
+      "step": 420
+    },
+    {
+      "epoch": 0.07270600033428046,
+      "grad_norm": 0.018125606700778008,
+      "learning_rate": 0.00019775280898876404,
+      "loss": 0.8376,
+      "step": 435
+    },
+    {
+      "epoch": 0.07521310379408323,
+      "grad_norm": 0.01847078464925289,
+      "learning_rate": 0.00019765218849572365,
+      "loss": 0.8387,
+      "step": 450
+    },
+    {
+      "epoch": 0.07772020725388601,
+      "grad_norm": 0.0198803897947073,
+      "learning_rate": 0.0001975515680026832,
+      "loss": 0.8123,
+      "step": 465
+    },
+    {
+      "epoch": 0.08022731071368878,
+      "grad_norm": 0.018840806558728218,
+      "learning_rate": 0.00019745094750964282,
+      "loss": 0.8182,
+      "step": 480
+    },
+    {
+      "epoch": 0.08273441417349156,
+      "grad_norm": 0.02007896639406681,
+      "learning_rate": 0.00019735032701660238,
+      "loss": 0.8417,
+      "step": 495
+    },
+    {
+      "epoch": 0.08524151763329434,
+      "grad_norm": 0.02028510719537735,
+      "learning_rate": 0.000197249706523562,
+      "loss": 0.8359,
+      "step": 510
+    },
+    {
+      "epoch": 0.08774862109309711,
+      "grad_norm": 0.019926371052861214,
+      "learning_rate": 0.00019714908603052155,
+      "loss": 0.8253,
+      "step": 525
+    },
+    {
+      "epoch": 0.09025572455289989,
+      "grad_norm": 0.02000526711344719,
+      "learning_rate": 0.00019704846553748117,
+      "loss": 0.8136,
+      "step": 540
+    },
+    {
+      "epoch": 0.09276282801270266,
+      "grad_norm": 0.019949857145547867,
+      "learning_rate": 0.00019694784504444072,
+      "loss": 0.814,
+      "step": 555
+    },
+    {
+      "epoch": 0.09526993147250543,
+      "grad_norm": 0.018684815615415573,
+      "learning_rate": 0.0001968472245514003,
+      "loss": 0.8251,
+      "step": 570
+    },
+    {
+      "epoch": 0.0977770349323082,
+      "grad_norm": 0.01959558017551899,
+      "learning_rate": 0.0001967466040583599,
+      "loss": 0.8176,
+      "step": 585
+    },
+    {
+      "epoch": 0.10028413839211098,
+      "grad_norm": 0.02020624279975891,
+      "learning_rate": 0.00019664598356531948,
+      "loss": 0.8181,
+      "step": 600
+    },
+    {
+      "epoch": 0.10279124185191375,
+      "grad_norm": 0.019187506288290024,
+      "learning_rate": 0.00019654536307227907,
+      "loss": 0.8269,
+      "step": 615
+    },
+    {
+      "epoch": 0.10529834531171653,
+      "grad_norm": 0.019997362047433853,
+      "learning_rate": 0.00019644474257923865,
+      "loss": 0.8208,
+      "step": 630
+    },
+    {
+      "epoch": 0.1078054487715193,
+      "grad_norm": 0.02035447023808956,
+      "learning_rate": 0.00019634412208619824,
+      "loss": 0.822,
+      "step": 645
+    },
+    {
+      "epoch": 0.11031255223132208,
+      "grad_norm": 0.019842060282826424,
+      "learning_rate": 0.0001962435015931578,
+      "loss": 0.8123,
+      "step": 660
+    },
+    {
+      "epoch": 0.11281965569112486,
+      "grad_norm": 0.0202711783349514,
+      "learning_rate": 0.0001961428811001174,
+      "loss": 0.8137,
+      "step": 675
+    },
+    {
+      "epoch": 0.11532675915092763,
+      "grad_norm": 0.020544525235891342,
+      "learning_rate": 0.00019604226060707697,
+      "loss": 0.807,
+      "step": 690
+    },
+    {
+      "epoch": 0.11783386261073041,
+      "grad_norm": 0.02084393985569477,
+      "learning_rate": 0.00019594164011403658,
+      "loss": 0.8136,
+      "step": 705
+    },
+    {
+      "epoch": 0.12034096607053318,
+      "grad_norm": 0.020337115973234177,
+      "learning_rate": 0.00019584101962099614,
+      "loss": 0.8162,
+      "step": 720
+    },
+    {
+      "epoch": 0.12284806953033595,
+      "grad_norm": 0.019300837069749832,
+      "learning_rate": 0.00019574039912795575,
+      "loss": 0.8116,
+      "step": 735
+    },
+    {
+      "epoch": 0.12535517299013874,
+      "grad_norm": 0.020422646775841713,
+      "learning_rate": 0.0001956397786349153,
+      "loss": 0.8208,
+      "step": 750
+    },
+    {
+      "epoch": 0.1278622764499415,
+      "grad_norm": 0.019620511680841446,
+      "learning_rate": 0.00019553915814187492,
+      "loss": 0.7984,
+      "step": 765
+    },
+    {
+      "epoch": 0.1303693799097443,
+      "grad_norm": 0.018732598051428795,
+      "learning_rate": 0.00019543853764883448,
+      "loss": 0.8121,
+      "step": 780
+    },
+    {
+      "epoch": 0.13287648336954705,
+      "grad_norm": 0.020731749013066292,
+      "learning_rate": 0.0001953379171557941,
+      "loss": 0.8106,
+      "step": 795
+    },
+    {
+      "epoch": 0.1353835868293498,
+      "grad_norm": 0.019481362774968147,
+      "learning_rate": 0.00019523729666275365,
+      "loss": 0.8098,
+      "step": 810
+    },
+    {
+      "epoch": 0.1378906902891526,
+      "grad_norm": 0.021802278235554695,
+      "learning_rate": 0.00019513667616971326,
+      "loss": 0.7904,
+      "step": 825
+    },
+    {
+      "epoch": 0.14039779374895536,
+      "grad_norm": 0.02061532624065876,
+      "learning_rate": 0.00019503605567667282,
+      "loss": 0.8084,
+      "step": 840
+    },
+    {
+      "epoch": 0.14290489720875815,
+      "grad_norm": 0.01921633817255497,
+      "learning_rate": 0.00019493543518363243,
+      "loss": 0.8099,
+      "step": 855
+    },
+    {
+      "epoch": 0.14541200066856091,
+      "grad_norm": 0.020100874826312065,
+      "learning_rate": 0.000194834814690592,
+      "loss": 0.7984,
+      "step": 870
+    },
+    {
+      "epoch": 0.1479191041283637,
+      "grad_norm": 0.019538206979632378,
+      "learning_rate": 0.00019473419419755158,
+      "loss": 0.8003,
+      "step": 885
+    },
+    {
+      "epoch": 0.15042620758816647,
+      "grad_norm": 0.021109605208039284,
+      "learning_rate": 0.00019463357370451116,
+      "loss": 0.8027,
+      "step": 900
+    },
+    {
+      "epoch": 0.15293331104796926,
+      "grad_norm": 0.023268043994903564,
+      "learning_rate": 0.00019453295321147075,
+      "loss": 0.8137,
+      "step": 915
+    },
+    {
+      "epoch": 0.15544041450777202,
+      "grad_norm": 0.020138578489422798,
+      "learning_rate": 0.00019443233271843033,
+      "loss": 0.8015,
+      "step": 930
+    },
+    {
+      "epoch": 0.1579475179675748,
+      "grad_norm": 0.02234073542058468,
+      "learning_rate": 0.00019433171222538992,
+      "loss": 0.812,
+      "step": 945
+    },
+    {
+      "epoch": 0.16045462142737757,
+      "grad_norm": 0.02045338600873947,
+      "learning_rate": 0.0001942310917323495,
+      "loss": 0.7988,
+      "step": 960
+    },
+    {
+      "epoch": 0.16296172488718033,
+      "grad_norm": 0.020514754578471184,
+      "learning_rate": 0.00019413047123930906,
+      "loss": 0.7954,
+      "step": 975
+    },
+    {
+      "epoch": 0.16546882834698312,
+      "grad_norm": 0.020174162462353706,
+      "learning_rate": 0.00019402985074626867,
+      "loss": 0.7998,
+      "step": 990
+    },
+    {
+      "epoch": 0.16797593180678588,
+      "grad_norm": 0.020632125437259674,
+      "learning_rate": 0.00019392923025322823,
+      "loss": 0.8049,
+      "step": 1005
+    },
+    {
+      "epoch": 0.17048303526658867,
+      "grad_norm": 0.02562854066491127,
+      "learning_rate": 0.00019382860976018785,
+      "loss": 0.8058,
+      "step": 1020
+    },
+    {
+      "epoch": 0.17299013872639143,
+      "grad_norm": 0.019526248797774315,
+      "learning_rate": 0.0001937279892671474,
+      "loss": 0.7936,
+      "step": 1035
+    },
+    {
+      "epoch": 0.17549724218619422,
+      "grad_norm": 0.020038483664393425,
+      "learning_rate": 0.00019362736877410702,
+      "loss": 0.8057,
+      "step": 1050
+    },
+    {
+      "epoch": 0.178004345645997,
+      "grad_norm": 0.022498290985822678,
+      "learning_rate": 0.00019352674828106658,
+      "loss": 0.801,
+      "step": 1065
+    },
+    {
+      "epoch": 0.18051144910579978,
+      "grad_norm": 0.020256614312529564,
+      "learning_rate": 0.0001934261277880262,
+      "loss": 0.7984,
+      "step": 1080
+    },
+    {
+      "epoch": 0.18301855256560254,
+      "grad_norm": 0.021500416100025177,
+      "learning_rate": 0.00019332550729498575,
+      "loss": 0.8026,
+      "step": 1095
+    },
+    {
+      "epoch": 0.18552565602540533,
+      "grad_norm": 0.02118818834424019,
+      "learning_rate": 0.00019322488680194536,
+      "loss": 0.805,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1880327594852081,
+      "grad_norm": 0.020503008738160133,
+      "learning_rate": 0.00019312426630890492,
+      "loss": 0.7991,
+      "step": 1125
+    },
+    {
+      "epoch": 0.19053986294501085,
+      "grad_norm": 0.02011336386203766,
+      "learning_rate": 0.00019302364581586453,
+      "loss": 0.7966,
+      "step": 1140
+    },
+    {
+      "epoch": 0.19304696640481364,
+      "grad_norm": 0.020229632034897804,
+      "learning_rate": 0.0001929230253228241,
+      "loss": 0.7925,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1955540698646164,
+      "grad_norm": 0.021130822598934174,
+      "learning_rate": 0.00019282240482978367,
+      "loss": 0.7942,
+      "step": 1170
+    },
+    {
+      "epoch": 0.1980611733244192,
+      "grad_norm": 0.02094241976737976,
+      "learning_rate": 0.00019272178433674326,
+      "loss": 0.7857,
+      "step": 1185
+    },
+    {
+      "epoch": 0.20056827678422195,
+      "grad_norm": 0.01990380696952343,
+      "learning_rate": 0.00019262116384370284,
+      "loss": 0.7821,
+      "step": 1200
+    },
+    {
+      "epoch": 0.20307538024402474,
+      "grad_norm": 0.020579956471920013,
+      "learning_rate": 0.00019252054335066243,
+      "loss": 0.8054,
+      "step": 1215
+    },
+    {
+      "epoch": 0.2055824837038275,
+      "grad_norm": 0.02037345990538597,
+      "learning_rate": 0.00019241992285762202,
+      "loss": 0.789,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2080895871636303,
+      "grad_norm": 0.02104773558676243,
+      "learning_rate": 0.0001923193023645816,
+      "loss": 0.7812,
+      "step": 1245
+    },
+    {
+      "epoch": 0.21059669062343306,
+      "grad_norm": 0.020640334114432335,
+      "learning_rate": 0.00019221868187154119,
+      "loss": 0.7976,
+      "step": 1260
+    },
+    {
+      "epoch": 0.21310379408323585,
+      "grad_norm": 0.02236183173954487,
+      "learning_rate": 0.00019211806137850077,
+      "loss": 0.7855,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2156108975430386,
+      "grad_norm": 0.021454576402902603,
+      "learning_rate": 0.00019201744088546033,
+      "loss": 0.7921,
+      "step": 1290
+    },
+    {
+      "epoch": 0.21811800100284137,
+      "grad_norm": 0.0205401424318552,
+      "learning_rate": 0.00019191682039241994,
+      "loss": 0.7919,
+      "step": 1305
+    },
+    {
+      "epoch": 0.22062510446264416,
+      "grad_norm": 0.020427586510777473,
+      "learning_rate": 0.0001918161998993795,
+      "loss": 0.7842,
+      "step": 1320
+    },
+    {
+      "epoch": 0.22313220792244692,
+      "grad_norm": 0.020643971860408783,
+      "learning_rate": 0.0001917155794063391,
+      "loss": 0.8012,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2256393113822497,
+      "grad_norm": 0.02139684371650219,
+      "learning_rate": 0.00019161495891329867,
+      "loss": 0.7853,
+      "step": 1350
+    },
+    {
+      "epoch": 0.22814641484205248,
+      "grad_norm": 0.020423240959644318,
+      "learning_rate": 0.00019151433842025828,
+      "loss": 0.7918,
+      "step": 1365
+    },
+    {
+      "epoch": 0.23065351830185526,
+      "grad_norm": 0.022509122267365456,
+      "learning_rate": 0.00019141371792721784,
+      "loss": 0.7867,
+      "step": 1380
+    },
+    {
+      "epoch": 0.23316062176165803,
+      "grad_norm": 0.021511022001504898,
+      "learning_rate": 0.00019131309743417746,
+      "loss": 0.7785,
+      "step": 1395
+    },
+    {
+      "epoch": 0.23566772522146082,
+      "grad_norm": 0.021113473922014236,
+      "learning_rate": 0.00019121247694113701,
+      "loss": 0.7835,
+      "step": 1410
+    },
+    {
+      "epoch": 0.23817482868126358,
+      "grad_norm": 0.02204412780702114,
+      "learning_rate": 0.00019111185644809663,
+      "loss": 0.7796,
+      "step": 1425
+    },
+    {
+      "epoch": 0.24068193214106637,
+      "grad_norm": 0.021546153351664543,
+      "learning_rate": 0.00019101123595505618,
+      "loss": 0.795,
+      "step": 1440
+    },
+    {
+      "epoch": 0.24318903560086913,
+      "grad_norm": 0.02130185067653656,
+      "learning_rate": 0.00019091061546201577,
+      "loss": 0.7743,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2456961390606719,
+      "grad_norm": 0.020676780492067337,
+      "learning_rate": 0.00019080999496897536,
+      "loss": 0.7827,
+      "step": 1470
+    },
+    {
+      "epoch": 0.24820324252047468,
+      "grad_norm": 0.02096562273800373,
+      "learning_rate": 0.00019070937447593494,
+      "loss": 0.8017,
+      "step": 1485
+    },
+    {
+      "epoch": 0.25071034598027747,
+      "grad_norm": 0.021092170849442482,
+      "learning_rate": 0.00019060875398289453,
+      "loss": 0.7752,
+      "step": 1500
+    },
+    {
+      "epoch": 0.25321744944008023,
+      "grad_norm": 0.02107168734073639,
+      "learning_rate": 0.0001905081334898541,
+      "loss": 0.7819,
+      "step": 1515
+    },
+    {
+      "epoch": 0.255724552899883,
+      "grad_norm": 0.021778512746095657,
+      "learning_rate": 0.0001904075129968137,
+      "loss": 0.7911,
+      "step": 1530
+    },
+    {
+      "epoch": 0.25823165635968576,
+      "grad_norm": 0.020381765440106392,
+      "learning_rate": 0.00019030689250377328,
+      "loss": 0.787,
+      "step": 1545
+    },
+    {
+      "epoch": 0.2607387598194886,
+      "grad_norm": 0.02274371311068535,
+      "learning_rate": 0.00019020627201073287,
+      "loss": 0.7828,
+      "step": 1560
+    },
+    {
+      "epoch": 0.26324586327929134,
+      "grad_norm": 0.02126036398112774,
+      "learning_rate": 0.00019010565151769243,
+      "loss": 0.7798,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2657529667390941,
+      "grad_norm": 0.02086903154850006,
+      "learning_rate": 0.00019000503102465204,
+      "loss": 0.7723,
+      "step": 1590
+    },
+    {
+      "epoch": 0.26826007019889686,
+      "grad_norm": 0.05506217107176781,
+      "learning_rate": 0.0001899044105316116,
+      "loss": 0.7798,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2707671736586996,
+      "grad_norm": 0.02119087241590023,
+      "learning_rate": 0.0001898037900385712,
+      "loss": 0.7809,
+      "step": 1620
+    },
+    {
+      "epoch": 0.27327427711850244,
+      "grad_norm": 0.02315429411828518,
+      "learning_rate": 0.00018970316954553077,
+      "loss": 0.7881,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2757813805783052,
+      "grad_norm": 0.021781641989946365,
+      "learning_rate": 0.00018960254905249038,
+      "loss": 0.7902,
+      "step": 1650
+    },
+    {
+      "epoch": 0.27828848403810796,
+      "grad_norm": 0.022906338796019554,
+      "learning_rate": 0.00018950192855944994,
+      "loss": 0.7766,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2807955874979107,
+      "grad_norm": 0.021640203893184662,
+      "learning_rate": 0.00018940130806640955,
+      "loss": 0.777,
+      "step": 1680
+    },
+    {
+      "epoch": 0.28330269095771354,
+      "grad_norm": 0.02225816249847412,
+      "learning_rate": 0.0001893006875733691,
+      "loss": 0.7777,
+      "step": 1695
+    },
+    {
+      "epoch": 0.2858097944175163,
+      "grad_norm": 0.021424556151032448,
+      "learning_rate": 0.00018920006708032872,
+      "loss": 0.7609,
+      "step": 1710
+    },
+    {
+      "epoch": 0.28831689787731907,
+      "grad_norm": 0.02180912159383297,
+      "learning_rate": 0.00018909944658728828,
+      "loss": 0.7691,
+      "step": 1725
+    },
+    {
+      "epoch": 0.29082400133712183,
+      "grad_norm": 0.021193066611886024,
+      "learning_rate": 0.00018899882609424787,
+      "loss": 0.7636,
+      "step": 1740
+    },
+    {
+      "epoch": 0.29333110479692465,
+      "grad_norm": 0.021105512976646423,
+      "learning_rate": 0.00018889820560120745,
+      "loss": 0.7756,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2958382082567274,
+      "grad_norm": 0.021696053445339203,
+      "learning_rate": 0.00018879758510816704,
+      "loss": 0.7814,
+      "step": 1770
+    },
+    {
+      "epoch": 0.29834531171653017,
+      "grad_norm": 0.021872224286198616,
+      "learning_rate": 0.00018869696461512662,
+      "loss": 0.7912,
+      "step": 1785
+    },
+    {
+      "epoch": 0.30085241517633293,
+      "grad_norm": 0.02113959938287735,
+      "learning_rate": 0.0001885963441220862,
+      "loss": 0.7775,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3033595186361357,
+      "grad_norm": 0.020779291167855263,
+      "learning_rate": 0.0001884957236290458,
+      "loss": 0.7752,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3058666220959385,
+      "grad_norm": 0.021366087719798088,
+      "learning_rate": 0.00018839510313600538,
+      "loss": 0.7748,
+      "step": 1830
+    },
+    {
+      "epoch": 0.3083737255557413,
+      "grad_norm": 0.02154374308884144,
+      "learning_rate": 0.00018829448264296497,
+      "loss": 0.7774,
+      "step": 1845
+    },
+    {
+      "epoch": 0.31088082901554404,
+      "grad_norm": 0.020630501210689545,
+      "learning_rate": 0.00018819386214992455,
+      "loss": 0.7832,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3133879324753468,
+      "grad_norm": 0.022217195481061935,
+      "learning_rate": 0.00018809324165688414,
+      "loss": 0.7742,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3158950359351496,
+      "grad_norm": 0.021622564643621445,
+      "learning_rate": 0.0001879926211638437,
+      "loss": 0.7782,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3184021393949524,
+      "grad_norm": 0.02158367820084095,
+      "learning_rate": 0.0001878920006708033,
+      "loss": 0.7753,
+      "step": 1905
+    },
+    {
+      "epoch": 0.32090924285475514,
+      "grad_norm": 0.021993108093738556,
+      "learning_rate": 0.00018779138017776287,
+      "loss": 0.7801,
+      "step": 1920
+    },
+    {
+      "epoch": 0.3234163463145579,
+      "grad_norm": 0.02169063873589039,
+      "learning_rate": 0.00018769075968472248,
+      "loss": 0.7937,
+      "step": 1935
+    },
+    {
+      "epoch": 0.32592344977436066,
+      "grad_norm": 0.023950908333063126,
+      "learning_rate": 0.00018759013919168204,
+      "loss": 0.7668,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3284305532341635,
+      "grad_norm": 0.02253536880016327,
+      "learning_rate": 0.00018748951869864165,
+      "loss": 0.7796,
+      "step": 1965
+    },
+    {
+      "epoch": 0.33093765669396624,
+      "grad_norm": 0.021693330258131027,
+      "learning_rate": 0.0001873888982056012,
+      "loss": 0.7634,
+      "step": 1980
+    },
+    {
+      "epoch": 0.333444760153769,
+      "grad_norm": 0.022510211914777756,
+      "learning_rate": 0.00018728827771256082,
+      "loss": 0.7664,
+      "step": 1995
+    },
+    {
+      "epoch": 0.33595186361357177,
+      "grad_norm": 0.021836843341588974,
+      "learning_rate": 0.00018718765721952038,
+      "loss": 0.7849,
+      "step": 2010
+    },
+    {
+      "epoch": 0.3384589670733746,
+      "grad_norm": 0.021421095356345177,
+      "learning_rate": 0.00018708703672647996,
+      "loss": 0.78,
+      "step": 2025
+    },
+    {
+      "epoch": 0.34096607053317735,
+      "grad_norm": 0.02277962490916252,
+      "learning_rate": 0.00018698641623343955,
+      "loss": 0.7935,
+      "step": 2040
+    },
+    {
+      "epoch": 0.3434731739929801,
+      "grad_norm": 0.022962411865592003,
+      "learning_rate": 0.00018688579574039913,
+      "loss": 0.7573,
+      "step": 2055
+    },
+    {
+      "epoch": 0.34598027745278287,
+      "grad_norm": 0.021092860028147697,
+      "learning_rate": 0.00018678517524735872,
+      "loss": 0.7725,
+      "step": 2070
+    },
+    {
+      "epoch": 0.3484873809125857,
+      "grad_norm": 0.0216389037668705,
+      "learning_rate": 0.0001866845547543183,
+      "loss": 0.7717,
+      "step": 2085
+    },
+    {
+      "epoch": 0.35099448437238845,
+      "grad_norm": 0.022193802520632744,
+      "learning_rate": 0.0001865839342612779,
+      "loss": 0.7671,
+      "step": 2100
+    },
+    {
+      "epoch": 0.3535015878321912,
+      "grad_norm": 0.021959876641631126,
+      "learning_rate": 0.00018648331376823748,
+      "loss": 0.7893,
+      "step": 2115
+    },
+    {
+      "epoch": 0.356008691291994,
+      "grad_norm": 0.022308630868792534,
+      "learning_rate": 0.00018638269327519706,
+      "loss": 0.7719,
+      "step": 2130
+    },
+    {
+      "epoch": 0.35851579475179673,
+      "grad_norm": 0.022814404219388962,
+      "learning_rate": 0.00018628207278215665,
+      "loss": 0.7687,
+      "step": 2145
+    },
+    {
+      "epoch": 0.36102289821159955,
+      "grad_norm": 0.021741073578596115,
+      "learning_rate": 0.00018618145228911623,
+      "loss": 0.7757,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3635300016714023,
+      "grad_norm": 0.022700047120451927,
+      "learning_rate": 0.00018608083179607582,
+      "loss": 0.7724,
+      "step": 2175
+    },
+    {
+      "epoch": 0.3660371051312051,
+      "grad_norm": 0.023608332499861717,
+      "learning_rate": 0.0001859802113030354,
+      "loss": 0.7697,
+      "step": 2190
+    },
+    {
+      "epoch": 0.36854420859100784,
+      "grad_norm": 0.02221842296421528,
+      "learning_rate": 0.00018587959080999496,
+      "loss": 0.7746,
+      "step": 2205
+    },
+    {
+      "epoch": 0.37105131205081066,
+      "grad_norm": 0.022841554135084152,
+      "learning_rate": 0.00018577897031695457,
+      "loss": 0.7798,
+      "step": 2220
+    },
+    {
+      "epoch": 0.3735584155106134,
+      "grad_norm": 0.021496908739209175,
+      "learning_rate": 0.00018567834982391413,
+      "loss": 0.7608,
+      "step": 2235
+    },
+    {
+      "epoch": 0.3760655189704162,
+      "grad_norm": 0.022609667852520943,
+      "learning_rate": 0.00018557772933087375,
+      "loss": 0.7582,
+      "step": 2250
+    },
+    {
+      "epoch": 0.37857262243021894,
+      "grad_norm": 0.022388063371181488,
+      "learning_rate": 0.0001854771088378333,
+      "loss": 0.7736,
+      "step": 2265
+    },
+    {
+      "epoch": 0.3810797258900217,
+      "grad_norm": 0.021875958889722824,
+      "learning_rate": 0.00018537648834479292,
+      "loss": 0.764,
+      "step": 2280
+    },
+    {
+      "epoch": 0.3835868293498245,
+      "grad_norm": 0.023109521716833115,
+      "learning_rate": 0.00018527586785175247,
+      "loss": 0.7646,
+      "step": 2295
+    },
+    {
+      "epoch": 0.3860939328096273,
+      "grad_norm": 0.02191918157041073,
+      "learning_rate": 0.00018517524735871206,
+      "loss": 0.7688,
+      "step": 2310
+    },
+    {
+      "epoch": 0.38860103626943004,
+      "grad_norm": 0.022137146443128586,
+      "learning_rate": 0.00018507462686567165,
+      "loss": 0.7708,
+      "step": 2325
+    },
+    {
+      "epoch": 0.3911081397292328,
+      "grad_norm": 0.023074300959706306,
+      "learning_rate": 0.00018497400637263123,
+      "loss": 0.77,
+      "step": 2340
+    },
+    {
+      "epoch": 0.3936152431890356,
+      "grad_norm": 0.023129386827349663,
+      "learning_rate": 0.00018487338587959082,
+      "loss": 0.7645,
+      "step": 2355
+    },
+    {
+      "epoch": 0.3961223466488384,
+      "grad_norm": 0.022260216996073723,
+      "learning_rate": 0.0001847727653865504,
+      "loss": 0.7739,
+      "step": 2370
+    },
+    {
+      "epoch": 0.39862945010864115,
+      "grad_norm": 0.022797416895627975,
+      "learning_rate": 0.00018467214489351,
+      "loss": 0.7631,
+      "step": 2385
+    },
+    {
+      "epoch": 0.4011365535684439,
+      "grad_norm": 0.02237161435186863,
+      "learning_rate": 0.00018457152440046957,
+      "loss": 0.7708,
+      "step": 2400
+    },
+    {
+      "epoch": 0.40364365702824667,
+      "grad_norm": 0.023264579474925995,
+      "learning_rate": 0.00018447090390742916,
+      "loss": 0.7702,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4061507604880495,
+      "grad_norm": 0.022827420383691788,
+      "learning_rate": 0.00018437028341438874,
+      "loss": 0.7696,
+      "step": 2430
+    },
+    {
+      "epoch": 0.40865786394785225,
+      "grad_norm": 0.022284789010882378,
+      "learning_rate": 0.00018426966292134833,
+      "loss": 0.7759,
+      "step": 2445
+    },
+    {
+      "epoch": 0.411164967407655,
+      "grad_norm": 0.0228969044983387,
+      "learning_rate": 0.00018416904242830791,
+      "loss": 0.7711,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4136720708674578,
+      "grad_norm": 0.02203362248837948,
+      "learning_rate": 0.0001840684219352675,
+      "loss": 0.7557,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4161791743272606,
+      "grad_norm": 0.022419359534978867,
+      "learning_rate": 0.00018396780144222706,
+      "loss": 0.7806,
+      "step": 2490
+    },
+    {
+      "epoch": 0.41868627778706335,
+      "grad_norm": 0.02259223349392414,
+      "learning_rate": 0.00018386718094918667,
+      "loss": 0.7616,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4211933812468661,
+      "grad_norm": 0.023276396095752716,
+      "learning_rate": 0.00018376656045614623,
+      "loss": 0.7597,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4237004847066689,
+      "grad_norm": 0.022209784016013145,
+      "learning_rate": 0.00018366593996310584,
+      "loss": 0.7554,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4262075881664717,
+      "grad_norm": 0.022717982530593872,
+      "learning_rate": 0.0001835653194700654,
+      "loss": 0.7723,
+      "step": 2550
+    },
+    {
+      "epoch": 0.42871469162627446,
+      "grad_norm": 0.022251484915614128,
+      "learning_rate": 0.000183464698977025,
+      "loss": 0.7716,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4312217950860772,
+      "grad_norm": 0.022392725571990013,
+      "learning_rate": 0.00018336407848398457,
+      "loss": 0.7654,
+      "step": 2580
+    },
+    {
+      "epoch": 0.43372889854588,
+      "grad_norm": 0.023053428158164024,
+      "learning_rate": 0.00018326345799094416,
+      "loss": 0.7579,
+      "step": 2595
+    },
+    {
+      "epoch": 0.43623600200568274,
+      "grad_norm": 0.02315950021147728,
+      "learning_rate": 0.00018316283749790374,
+      "loss": 0.7655,
+      "step": 2610
+    },
+    {
+      "epoch": 0.43874310546548556,
+      "grad_norm": 0.02267162874341011,
+      "learning_rate": 0.00018306221700486333,
+      "loss": 0.7503,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4412502089252883,
+      "grad_norm": 0.022932684049010277,
+      "learning_rate": 0.0001829615965118229,
+      "loss": 0.7592,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4437573123850911,
+      "grad_norm": 0.023299789056181908,
+      "learning_rate": 0.0001828609760187825,
+      "loss": 0.7805,
+      "step": 2655
+    },
+    {
+      "epoch": 0.44626441584489385,
+      "grad_norm": 0.022324666380882263,
+      "learning_rate": 0.00018276035552574208,
+      "loss": 0.7639,
+      "step": 2670
+    },
+    {
+      "epoch": 0.44877151930469666,
+      "grad_norm": 0.023942479863762856,
+      "learning_rate": 0.00018265973503270167,
+      "loss": 0.7506,
+      "step": 2685
+    },
+    {
+      "epoch": 0.4512786227644994,
+      "grad_norm": 0.022840656340122223,
+      "learning_rate": 0.00018255911453966126,
+      "loss": 0.7568,
+      "step": 2700
+    },
+    {
+      "epoch": 0.4537857262243022,
+      "grad_norm": 0.022889986634254456,
+      "learning_rate": 0.00018245849404662084,
+      "loss": 0.757,
+      "step": 2715
+    },
+    {
+      "epoch": 0.45629282968410495,
+      "grad_norm": 0.02276541106402874,
+      "learning_rate": 0.00018235787355358043,
+      "loss": 0.7702,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4587999331439077,
+      "grad_norm": 0.022805610671639442,
+      "learning_rate": 0.00018225725306054,
+      "loss": 0.7522,
+      "step": 2745
+    },
+    {
+      "epoch": 0.46130703660371053,
+      "grad_norm": 0.02356228232383728,
+      "learning_rate": 0.0001821566325674996,
+      "loss": 0.7584,
+      "step": 2760
+    },
+    {
+      "epoch": 0.4638141400635133,
+      "grad_norm": 0.02339334785938263,
+      "learning_rate": 0.00018205601207445918,
+      "loss": 0.7551,
+      "step": 2775
+    },
+    {
+      "epoch": 0.46632124352331605,
+      "grad_norm": 0.022267676889896393,
+      "learning_rate": 0.00018195539158141877,
+      "loss": 0.7551,
+      "step": 2790
+    },
+    {
+      "epoch": 0.4688283469831188,
+      "grad_norm": 0.02209157682955265,
+      "learning_rate": 0.00018185477108837833,
+      "loss": 0.7555,
+      "step": 2805
+    },
+    {
+      "epoch": 0.47133545044292163,
+      "grad_norm": 0.023798322305083275,
+      "learning_rate": 0.00018175415059533794,
+      "loss": 0.7672,
+      "step": 2820
+    },
+    {
+      "epoch": 0.4738425539027244,
+      "grad_norm": 0.02182634547352791,
+      "learning_rate": 0.0001816535301022975,
+      "loss": 0.7523,
+      "step": 2835
+    },
+    {
+      "epoch": 0.47634965736252716,
+      "grad_norm": 0.02280135080218315,
+      "learning_rate": 0.0001815529096092571,
+      "loss": 0.7523,
+      "step": 2850
+    },
+    {
+      "epoch": 0.4788567608223299,
+      "grad_norm": 0.022913530468940735,
+      "learning_rate": 0.00018145228911621667,
+      "loss": 0.7664,
+      "step": 2865
+    },
+    {
+      "epoch": 0.48136386428213274,
+      "grad_norm": 0.022897444665431976,
+      "learning_rate": 0.00018135166862317625,
+      "loss": 0.7626,
+      "step": 2880
+    },
+    {
+      "epoch": 0.4838709677419355,
+      "grad_norm": 0.022669149562716484,
+      "learning_rate": 0.00018125104813013584,
+      "loss": 0.7666,
+      "step": 2895
+    },
+    {
+      "epoch": 0.48637807120173826,
+      "grad_norm": 0.022428149357438087,
+      "learning_rate": 0.00018115042763709542,
+      "loss": 0.7574,
+      "step": 2910
+    },
+    {
+      "epoch": 0.488885174661541,
+      "grad_norm": 0.02266399934887886,
+      "learning_rate": 0.000181049807144055,
+      "loss": 0.7713,
+      "step": 2925
+    },
+    {
+      "epoch": 0.4913922781213438,
+      "grad_norm": 0.022166673094034195,
+      "learning_rate": 0.0001809491866510146,
+      "loss": 0.7526,
+      "step": 2940
+    },
+    {
+      "epoch": 0.4938993815811466,
+      "grad_norm": 0.022409655153751373,
+      "learning_rate": 0.00018084856615797418,
+      "loss": 0.7378,
+      "step": 2955
+    },
+    {
+      "epoch": 0.49640648504094936,
+      "grad_norm": 0.02232409082353115,
+      "learning_rate": 0.00018074794566493377,
+      "loss": 0.7632,
+      "step": 2970
+    },
+    {
+      "epoch": 0.4989135885007521,
+      "grad_norm": 0.022124771028757095,
+      "learning_rate": 0.00018064732517189335,
+      "loss": 0.7493,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5014206919605549,
+      "grad_norm": 0.023450786247849464,
+      "learning_rate": 0.00018054670467885294,
+      "loss": 0.751,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5039277954203577,
+      "grad_norm": 0.023552143946290016,
+      "learning_rate": 0.00018044608418581252,
+      "loss": 0.7489,
+      "step": 3015
+    },
+    {
+      "epoch": 0.5064348988801605,
+      "grad_norm": 0.022822733968496323,
+      "learning_rate": 0.0001803454636927721,
+      "loss": 0.7464,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5089420023399632,
+      "grad_norm": 0.02279839850962162,
+      "learning_rate": 0.0001802448431997317,
+      "loss": 0.7613,
+      "step": 3045
+    },
+    {
+      "epoch": 0.511449105799766,
+      "grad_norm": 0.023819871246814728,
+      "learning_rate": 0.00018014422270669128,
+      "loss": 0.7368,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5139562092595688,
+      "grad_norm": 0.02348748780786991,
+      "learning_rate": 0.00018004360221365086,
+      "loss": 0.7485,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5164633127193715,
+      "grad_norm": 0.02394930087029934,
+      "learning_rate": 0.00017994298172061045,
+      "loss": 0.7504,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5189704161791743,
+      "grad_norm": 0.023166505619883537,
+      "learning_rate": 0.00017984236122757004,
+      "loss": 0.7526,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5214775196389771,
+      "grad_norm": 0.023279821500182152,
+      "learning_rate": 0.0001797417407345296,
+      "loss": 0.7474,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5239846230987799,
+      "grad_norm": 0.022907249629497528,
+      "learning_rate": 0.0001796411202414892,
+      "loss": 0.7555,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5264917265585827,
+      "grad_norm": 0.023161666467785835,
+      "learning_rate": 0.00017954049974844877,
+      "loss": 0.7587,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5289988300183854,
+      "grad_norm": 0.02453703060746193,
+      "learning_rate": 0.00017943987925540835,
+      "loss": 0.7623,
+      "step": 3165
+    },
+    {
+      "epoch": 0.5315059334781882,
+      "grad_norm": 0.02323891967535019,
+      "learning_rate": 0.00017933925876236794,
+      "loss": 0.7546,
+      "step": 3180
+    },
+    {
+      "epoch": 0.534013036937991,
+      "grad_norm": 0.022658636793494225,
+      "learning_rate": 0.00017923863826932752,
+      "loss": 0.7544,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5365201403977937,
+      "grad_norm": 0.023256490007042885,
+      "learning_rate": 0.0001791380177762871,
+      "loss": 0.7631,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5390272438575965,
+      "grad_norm": 0.02328312210738659,
+      "learning_rate": 0.0001790373972832467,
+      "loss": 0.7576,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5415343473173992,
+      "grad_norm": 0.023502754047513008,
+      "learning_rate": 0.00017893677679020628,
+      "loss": 0.7558,
+      "step": 3240
+    },
+    {
+      "epoch": 0.5440414507772021,
+      "grad_norm": 0.02397795580327511,
+      "learning_rate": 0.00017883615629716586,
+      "loss": 0.7465,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5465485542370049,
+      "grad_norm": 0.023231035098433495,
+      "learning_rate": 0.00017873553580412545,
+      "loss": 0.7596,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5490556576968076,
+      "grad_norm": 0.023429760709404945,
+      "learning_rate": 0.00017863491531108503,
+      "loss": 0.7631,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5515627611566104,
+      "grad_norm": 0.02327948808670044,
+      "learning_rate": 0.00017853429481804462,
+      "loss": 0.7493,
+      "step": 3300
+    },
+    {
+      "epoch": 0.5540698646164132,
+      "grad_norm": 0.023450564593076706,
+      "learning_rate": 0.0001784336743250042,
+      "loss": 0.7489,
+      "step": 3315
+    },
+    {
+      "epoch": 0.5565769680762159,
+      "grad_norm": 0.02356708236038685,
+      "learning_rate": 0.0001783330538319638,
+      "loss": 0.7541,
+      "step": 3330
+    },
+    {
+      "epoch": 0.5590840715360187,
+      "grad_norm": 0.024269040673971176,
+      "learning_rate": 0.00017823243333892338,
+      "loss": 0.7717,
+      "step": 3345
+    },
+    {
+      "epoch": 0.5615911749958215,
+      "grad_norm": 0.02358848787844181,
+      "learning_rate": 0.00017813181284588296,
+      "loss": 0.7553,
+      "step": 3360
+    },
+    {
+      "epoch": 0.5640982784556242,
+      "grad_norm": 0.02385580912232399,
+      "learning_rate": 0.00017803119235284255,
+      "loss": 0.7484,
+      "step": 3375
+    },
+    {
+      "epoch": 0.5666053819154271,
+      "grad_norm": 0.023820120841264725,
+      "learning_rate": 0.00017793057185980213,
+      "loss": 0.7529,
+      "step": 3390
+    },
+    {
+      "epoch": 0.5691124853752298,
+      "grad_norm": 0.023704256862401962,
+      "learning_rate": 0.00017782995136676172,
+      "loss": 0.763,
+      "step": 3405
+    },
+    {
+      "epoch": 0.5716195888350326,
+      "grad_norm": 0.02363293431699276,
+      "learning_rate": 0.0001777293308737213,
+      "loss": 0.7552,
+      "step": 3420
+    },
+    {
+      "epoch": 0.5741266922948354,
+      "grad_norm": 0.023471953347325325,
+      "learning_rate": 0.00017762871038068086,
+      "loss": 0.7516,
+      "step": 3435
+    },
+    {
+      "epoch": 0.5766337957546381,
+      "grad_norm": 0.023572325706481934,
+      "learning_rate": 0.00017752808988764045,
+      "loss": 0.7635,
+      "step": 3450
+    },
+    {
+      "epoch": 0.5791408992144409,
+      "grad_norm": 0.023114044219255447,
+      "learning_rate": 0.00017742746939460003,
+      "loss": 0.7376,
+      "step": 3465
+    },
+    {
+      "epoch": 0.5816480026742437,
+      "grad_norm": 0.022982290014624596,
+      "learning_rate": 0.00017732684890155962,
+      "loss": 0.7548,
+      "step": 3480
+    },
+    {
+      "epoch": 0.5841551061340464,
+      "grad_norm": 0.024818824604153633,
+      "learning_rate": 0.0001772262284085192,
+      "loss": 0.7555,
+      "step": 3495
+    },
+    {
+      "epoch": 0.5866622095938493,
+      "grad_norm": 0.024532759562134743,
+      "learning_rate": 0.0001771256079154788,
+      "loss": 0.7543,
+      "step": 3510
+    },
+    {
+      "epoch": 0.589169313053652,
+      "grad_norm": 0.023687878623604774,
+      "learning_rate": 0.00017702498742243837,
+      "loss": 0.7574,
+      "step": 3525
+    },
+    {
+      "epoch": 0.5916764165134548,
+      "grad_norm": 0.023244835436344147,
+      "learning_rate": 0.00017692436692939796,
+      "loss": 0.738,
+      "step": 3540
+    },
+    {
+      "epoch": 0.5941835199732576,
+      "grad_norm": 0.023271916434168816,
+      "learning_rate": 0.00017682374643635755,
+      "loss": 0.7472,
+      "step": 3555
+    },
+    {
+      "epoch": 0.5966906234330603,
+      "grad_norm": 0.023334383964538574,
+      "learning_rate": 0.00017672312594331713,
+      "loss": 0.7547,
+      "step": 3570
+    },
+    {
+      "epoch": 0.5991977268928631,
+      "grad_norm": 0.024023573845624924,
+      "learning_rate": 0.00017662250545027672,
+      "loss": 0.7516,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6017048303526659,
+      "grad_norm": 0.023526392877101898,
+      "learning_rate": 0.0001765218849572363,
+      "loss": 0.7484,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6042119338124686,
+      "grad_norm": 0.023420479148626328,
+      "learning_rate": 0.0001764212644641959,
+      "loss": 0.7368,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6067190372722714,
+      "grad_norm": 0.024068370461463928,
+      "learning_rate": 0.00017632064397115547,
+      "loss": 0.7448,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6092261407320743,
+      "grad_norm": 0.024318361654877663,
+      "learning_rate": 0.00017622002347811506,
+      "loss": 0.7544,
+      "step": 3645
+    },
+    {
+      "epoch": 0.611733244191877,
+      "grad_norm": 0.023683857172727585,
+      "learning_rate": 0.00017611940298507464,
+      "loss": 0.7583,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6142403476516798,
+      "grad_norm": 0.023911328986287117,
+      "learning_rate": 0.00017601878249203423,
+      "loss": 0.7482,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6167474511114825,
+      "grad_norm": 0.023844299837946892,
+      "learning_rate": 0.00017591816199899381,
+      "loss": 0.7406,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6192545545712853,
+      "grad_norm": 0.023253358900547028,
+      "learning_rate": 0.0001758175415059534,
+      "loss": 0.7476,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6217616580310881,
+      "grad_norm": 0.022935032844543457,
+      "learning_rate": 0.00017571692101291296,
+      "loss": 0.7563,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6242687614908908,
+      "grad_norm": 0.02410741336643696,
+      "learning_rate": 0.00017561630051987254,
+      "loss": 0.7553,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6267758649506936,
+      "grad_norm": 0.023733945563435555,
+      "learning_rate": 0.00017551568002683213,
+      "loss": 0.7395,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6292829684104964,
+      "grad_norm": 0.024090424180030823,
+      "learning_rate": 0.00017541505953379171,
+      "loss": 0.7615,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6317900718702992,
+      "grad_norm": 0.023794986307621002,
+      "learning_rate": 0.0001753144390407513,
+      "loss": 0.7527,
+      "step": 3780
+    },
+    {
+      "epoch": 0.634297175330102,
+      "grad_norm": 0.02363026887178421,
+      "learning_rate": 0.00017521381854771089,
+      "loss": 0.748,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6368042787899048,
+      "grad_norm": 0.024967040866613388,
+      "learning_rate": 0.00017511319805467047,
+      "loss": 0.7501,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6393113822497075,
+      "grad_norm": 0.02417265996336937,
+      "learning_rate": 0.00017501257756163006,
+      "loss": 0.7453,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6418184857095103,
+      "grad_norm": 0.024464495480060577,
+      "learning_rate": 0.00017491195706858964,
+      "loss": 0.758,
+      "step": 3840
+    },
+    {
+      "epoch": 0.644325589169313,
+      "grad_norm": 0.023871179670095444,
+      "learning_rate": 0.00017481133657554923,
+      "loss": 0.7616,
+      "step": 3855
+    },
+    {
+      "epoch": 0.6468326926291158,
+      "grad_norm": 0.023780934512615204,
+      "learning_rate": 0.0001747107160825088,
+      "loss": 0.7453,
+      "step": 3870
+    },
+    {
+      "epoch": 0.6493397960889186,
+      "grad_norm": 0.02408822439610958,
+      "learning_rate": 0.0001746100955894684,
+      "loss": 0.7471,
+      "step": 3885
+    },
+    {
+      "epoch": 0.6518468995487213,
+      "grad_norm": 0.024668745696544647,
+      "learning_rate": 0.00017450947509642798,
+      "loss": 0.7333,
+      "step": 3900
+    },
+    {
+      "epoch": 0.6543540030085242,
+      "grad_norm": 0.023561371490359306,
+      "learning_rate": 0.00017440885460338757,
+      "loss": 0.7454,
+      "step": 3915
+    },
+    {
+      "epoch": 0.656861106468327,
+      "grad_norm": 0.02355646714568138,
+      "learning_rate": 0.00017430823411034716,
+      "loss": 0.7505,
+      "step": 3930
+    },
+    {
+      "epoch": 0.6593682099281297,
+      "grad_norm": 0.02338649332523346,
+      "learning_rate": 0.00017420761361730674,
+      "loss": 0.7615,
+      "step": 3945
+    },
+    {
+      "epoch": 0.6618753133879325,
+      "grad_norm": 0.024536214768886566,
+      "learning_rate": 0.00017410699312426633,
+      "loss": 0.7497,
+      "step": 3960
+    },
+    {
+      "epoch": 0.6643824168477352,
+      "grad_norm": 0.023618606850504875,
+      "learning_rate": 0.0001740063726312259,
+      "loss": 0.741,
+      "step": 3975
+    },
+    {
+      "epoch": 0.666889520307538,
+      "grad_norm": 0.023363051936030388,
+      "learning_rate": 0.0001739057521381855,
+      "loss": 0.7498,
+      "step": 3990
+    },
+    {
+      "epoch": 0.6693966237673408,
+      "grad_norm": 0.023151425644755363,
+      "learning_rate": 0.00017380513164514508,
+      "loss": 0.7436,
+      "step": 4005
+    },
+    {
+      "epoch": 0.6719037272271435,
+      "grad_norm": 0.024613911285996437,
+      "learning_rate": 0.00017370451115210464,
+      "loss": 0.7484,
+      "step": 4020
+    },
+    {
+      "epoch": 0.6744108306869463,
+      "grad_norm": 0.023703262209892273,
+      "learning_rate": 0.00017360389065906423,
+      "loss": 0.7401,
+      "step": 4035
+    },
+    {
+      "epoch": 0.6769179341467492,
+      "grad_norm": 0.02323344349861145,
+      "learning_rate": 0.0001735032701660238,
+      "loss": 0.7372,
+      "step": 4050
+    },
+    {
+      "epoch": 0.6794250376065519,
+      "grad_norm": 0.023779282346367836,
+      "learning_rate": 0.0001734026496729834,
+      "loss": 0.7474,
+      "step": 4065
+    },
+    {
+      "epoch": 0.6819321410663547,
+      "grad_norm": 0.024744119495153427,
+      "learning_rate": 0.00017330202917994298,
+      "loss": 0.7337,
+      "step": 4080
+    },
+    {
+      "epoch": 0.6844392445261575,
+      "grad_norm": 0.02366352453827858,
+      "learning_rate": 0.00017320140868690257,
+      "loss": 0.7755,
+      "step": 4095
+    },
+    {
+      "epoch": 0.6869463479859602,
+      "grad_norm": 0.02404959499835968,
+      "learning_rate": 0.00017310078819386215,
+      "loss": 0.7412,
+      "step": 4110
+    },
+    {
+      "epoch": 0.689453451445763,
+      "grad_norm": 0.024871889501810074,
+      "learning_rate": 0.00017300016770082174,
+      "loss": 0.7521,
+      "step": 4125
+    },
+    {
+      "epoch": 0.6919605549055657,
+      "grad_norm": 0.02386365458369255,
+      "learning_rate": 0.00017289954720778132,
+      "loss": 0.7431,
+      "step": 4140
+    },
+    {
+      "epoch": 0.6944676583653685,
+      "grad_norm": 0.025385569781064987,
+      "learning_rate": 0.0001727989267147409,
+      "loss": 0.73,
+      "step": 4155
+    },
+    {
+      "epoch": 0.6969747618251714,
+      "grad_norm": 0.024604368954896927,
+      "learning_rate": 0.0001726983062217005,
+      "loss": 0.7474,
+      "step": 4170
+    },
+    {
+      "epoch": 0.6994818652849741,
+      "grad_norm": 0.025954630225896835,
+      "learning_rate": 0.00017259768572866008,
+      "loss": 0.7473,
+      "step": 4185
+    },
+    {
+      "epoch": 0.7019889687447769,
+      "grad_norm": 0.02412698231637478,
+      "learning_rate": 0.00017249706523561967,
+      "loss": 0.7498,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7044960722045797,
+      "grad_norm": 0.02433890663087368,
+      "learning_rate": 0.00017239644474257925,
+      "loss": 0.7474,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7070031756643824,
+      "grad_norm": 0.02414149045944214,
+      "learning_rate": 0.00017229582424953884,
+      "loss": 0.7416,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7095102791241852,
+      "grad_norm": 0.03919633850455284,
+      "learning_rate": 0.00017219520375649842,
+      "loss": 0.7502,
+      "step": 4245
+    },
+    {
+      "epoch": 0.712017382583988,
+      "grad_norm": 0.02408537268638611,
+      "learning_rate": 0.000172094583263458,
+      "loss": 0.7456,
+      "step": 4260
+    },
+    {
+      "epoch": 0.7145244860437907,
+      "grad_norm": 0.02505289390683174,
+      "learning_rate": 0.0001719939627704176,
+      "loss": 0.7412,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7170315895035935,
+      "grad_norm": 0.02388434298336506,
+      "learning_rate": 0.00017189334227737718,
+      "loss": 0.739,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7195386929633963,
+      "grad_norm": 0.02636132948100567,
+      "learning_rate": 0.00017179272178433674,
+      "loss": 0.7405,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7220457964231991,
+      "grad_norm": 0.02557826228439808,
+      "learning_rate": 0.00017169210129129635,
+      "loss": 0.7424,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7245528998830019,
+      "grad_norm": 0.02385845221579075,
+      "learning_rate": 0.0001715914807982559,
+      "loss": 0.7388,
+      "step": 4335
+    },
+    {
+      "epoch": 0.7270600033428046,
+      "grad_norm": 0.02581110969185829,
+      "learning_rate": 0.0001714908603052155,
+      "loss": 0.7414,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7295671068026074,
+      "grad_norm": 0.025572916492819786,
+      "learning_rate": 0.00017139023981217508,
+      "loss": 0.7526,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7320742102624102,
+      "grad_norm": 0.024279674515128136,
+      "learning_rate": 0.00017128961931913466,
+      "loss": 0.738,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7345813137222129,
+      "grad_norm": 0.02414841763675213,
+      "learning_rate": 0.00017118899882609425,
+      "loss": 0.7387,
+      "step": 4395
+    },
+    {
+      "epoch": 0.7370884171820157,
+      "grad_norm": 0.024131467565894127,
+      "learning_rate": 0.00017108837833305384,
+      "loss": 0.743,
+      "step": 4410
+    },
+    {
+      "epoch": 0.7395955206418184,
+      "grad_norm": 0.024498678743839264,
+      "learning_rate": 0.00017098775784001342,
+      "loss": 0.7531,
+      "step": 4425
+    },
+    {
+      "epoch": 0.7421026241016213,
+      "grad_norm": 0.024572541937232018,
+      "learning_rate": 0.000170887137346973,
+      "loss": 0.7489,
+      "step": 4440
+    },
+    {
+      "epoch": 0.7446097275614241,
+      "grad_norm": 0.02463640458881855,
+      "learning_rate": 0.0001707865168539326,
+      "loss": 0.7379,
+      "step": 4455
+    },
+    {
+      "epoch": 0.7471168310212268,
+      "grad_norm": 0.024474984034895897,
+      "learning_rate": 0.00017068589636089218,
+      "loss": 0.7532,
+      "step": 4470
+    },
+    {
+      "epoch": 0.7496239344810296,
+      "grad_norm": 0.023911593481898308,
+      "learning_rate": 0.00017058527586785176,
+      "loss": 0.7346,
+      "step": 4485
+    },
+    {
+      "epoch": 0.7521310379408324,
+      "grad_norm": 0.024990247562527657,
+      "learning_rate": 0.00017048465537481135,
+      "loss": 0.727,
+      "step": 4500
+    },
+    {
+      "epoch": 0.7546381414006351,
+      "grad_norm": 0.024192336946725845,
+      "learning_rate": 0.00017038403488177093,
+      "loss": 0.7462,
+      "step": 4515
+    },
+    {
+      "epoch": 0.7571452448604379,
+      "grad_norm": 0.02413538470864296,
+      "learning_rate": 0.00017028341438873052,
+      "loss": 0.7364,
+      "step": 4530
+    },
+    {
+      "epoch": 0.7596523483202406,
+      "grad_norm": 0.02461206167936325,
+      "learning_rate": 0.0001701827938956901,
+      "loss": 0.7321,
+      "step": 4545
+    },
+    {
+      "epoch": 0.7621594517800434,
+      "grad_norm": 0.024669578298926353,
+      "learning_rate": 0.0001700821734026497,
+      "loss": 0.7503,
+      "step": 4560
+    },
+    {
+      "epoch": 0.7646665552398463,
+      "grad_norm": 0.02436312846839428,
+      "learning_rate": 0.00016998155290960928,
+      "loss": 0.7346,
+      "step": 4575
+    },
+    {
+      "epoch": 0.767173658699649,
+      "grad_norm": 0.025169432163238525,
+      "learning_rate": 0.00016988093241656883,
+      "loss": 0.7219,
+      "step": 4590
+    },
+    {
+      "epoch": 0.7696807621594518,
+      "grad_norm": 0.025311505421996117,
+      "learning_rate": 0.00016978031192352845,
+      "loss": 0.742,
+      "step": 4605
+    },
+    {
+      "epoch": 0.7721878656192546,
+      "grad_norm": 0.024896448478102684,
+      "learning_rate": 0.000169679691430488,
+      "loss": 0.746,
+      "step": 4620
+    },
+    {
+      "epoch": 0.7746949690790573,
+      "grad_norm": 0.025063227862119675,
+      "learning_rate": 0.0001695790709374476,
+      "loss": 0.7399,
+      "step": 4635
+    },
+    {
+      "epoch": 0.7772020725388601,
+      "grad_norm": 0.024744588881731033,
+      "learning_rate": 0.00016947845044440718,
+      "loss": 0.742,
+      "step": 4650
+    },
+    {
+      "epoch": 0.7797091759986629,
+      "grad_norm": 0.025170577690005302,
+      "learning_rate": 0.00016937782995136676,
+      "loss": 0.7288,
+      "step": 4665
+    },
+    {
+      "epoch": 0.7822162794584656,
+      "grad_norm": 0.024757632985711098,
+      "learning_rate": 0.00016927720945832635,
+      "loss": 0.7407,
+      "step": 4680
+    },
+    {
+      "epoch": 0.7847233829182684,
+      "grad_norm": 0.025282783433794975,
+      "learning_rate": 0.00016917658896528593,
+      "loss": 0.7294,
+      "step": 4695
+    },
+    {
+      "epoch": 0.7872304863780712,
+      "grad_norm": 0.025306588038802147,
+      "learning_rate": 0.00016907596847224552,
+      "loss": 0.7414,
+      "step": 4710
+    },
+    {
+      "epoch": 0.789737589837874,
+      "grad_norm": 0.024476177990436554,
+      "learning_rate": 0.0001689753479792051,
+      "loss": 0.7377,
+      "step": 4725
+    },
+    {
+      "epoch": 0.7922446932976768,
+      "grad_norm": 0.025107109919190407,
+      "learning_rate": 0.0001688747274861647,
+      "loss": 0.7378,
+      "step": 4740
+    },
+    {
+      "epoch": 0.7947517967574795,
+      "grad_norm": 0.024397587403655052,
+      "learning_rate": 0.00016877410699312427,
+      "loss": 0.7308,
+      "step": 4755
+    },
+    {
+      "epoch": 0.7972589002172823,
+      "grad_norm": 0.02418595738708973,
+      "learning_rate": 0.00016867348650008386,
+      "loss": 0.7437,
+      "step": 4770
+    },
+    {
+      "epoch": 0.7997660036770851,
+      "grad_norm": 0.025148652493953705,
+      "learning_rate": 0.00016857286600704345,
+      "loss": 0.7365,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8022731071368878,
+      "grad_norm": 0.025669820606708527,
+      "learning_rate": 0.00016847224551400303,
+      "loss": 0.7392,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8047802105966906,
+      "grad_norm": 0.02602335438132286,
+      "learning_rate": 0.00016837162502096262,
+      "loss": 0.7377,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8072873140564933,
+      "grad_norm": 0.02492678537964821,
+      "learning_rate": 0.0001682710045279222,
+      "loss": 0.7435,
+      "step": 4830
+    },
+    {
+      "epoch": 0.8097944175162962,
+      "grad_norm": 0.02486814185976982,
+      "learning_rate": 0.0001681703840348818,
+      "loss": 0.7392,
+      "step": 4845
+    },
+    {
+      "epoch": 0.812301520976099,
+      "grad_norm": 0.026057204231619835,
+      "learning_rate": 0.00016806976354184137,
+      "loss": 0.7371,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8148086244359017,
+      "grad_norm": 0.025231441482901573,
+      "learning_rate": 0.00016796914304880093,
+      "loss": 0.7429,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8173157278957045,
+      "grad_norm": 0.025132806971669197,
+      "learning_rate": 0.00016786852255576054,
+      "loss": 0.7398,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8198228313555073,
+      "grad_norm": 0.02506762556731701,
+      "learning_rate": 0.0001677679020627201,
+      "loss": 0.7463,
+      "step": 4905
+    },
+    {
+      "epoch": 0.82232993481531,
+      "grad_norm": 0.02398357354104519,
+      "learning_rate": 0.00016766728156967971,
+      "loss": 0.7309,
+      "step": 4920
+    },
+    {
+      "epoch": 0.8248370382751128,
+      "grad_norm": 0.025060344487428665,
+      "learning_rate": 0.00016756666107663927,
+      "loss": 0.7287,
+      "step": 4935
+    },
+    {
+      "epoch": 0.8273441417349156,
+      "grad_norm": 0.024265987798571587,
+      "learning_rate": 0.00016746604058359886,
+      "loss": 0.7439,
+      "step": 4950
+    },
+    {
+      "epoch": 0.8298512451947184,
+      "grad_norm": 0.025207631289958954,
+      "learning_rate": 0.00016736542009055844,
+      "loss": 0.7332,
+      "step": 4965
+    },
+    {
+      "epoch": 0.8323583486545212,
+      "grad_norm": 0.025070613250136375,
+      "learning_rate": 0.00016726479959751803,
+      "loss": 0.7356,
+      "step": 4980
+    },
+    {
+      "epoch": 0.834865452114324,
+      "grad_norm": 0.025521699339151382,
+      "learning_rate": 0.00016716417910447761,
+      "loss": 0.7345,
+      "step": 4995
+    },
+    {
+      "epoch": 0.8373725555741267,
+      "grad_norm": 0.025154948234558105,
+      "learning_rate": 0.0001670635586114372,
+      "loss": 0.7362,
+      "step": 5010
+    },
+    {
+      "epoch": 0.8398796590339295,
+      "grad_norm": 0.025558389723300934,
+      "learning_rate": 0.00016696293811839679,
+      "loss": 0.7294,
+      "step": 5025
+    },
+    {
+      "epoch": 0.8423867624937322,
+      "grad_norm": 0.026137180626392365,
+      "learning_rate": 0.00016686231762535637,
+      "loss": 0.7331,
+      "step": 5040
+    },
+    {
+      "epoch": 0.844893865953535,
+      "grad_norm": 0.024644847959280014,
+      "learning_rate": 0.00016676169713231596,
+      "loss": 0.7382,
+      "step": 5055
+    },
+    {
+      "epoch": 0.8474009694133378,
+      "grad_norm": 0.024775272235274315,
+      "learning_rate": 0.00016666107663927554,
+      "loss": 0.7242,
+      "step": 5070
+    },
+    {
+      "epoch": 0.8499080728731405,
+      "grad_norm": 0.025577571243047714,
+      "learning_rate": 0.00016656045614623513,
+      "loss": 0.7192,
+      "step": 5085
+    },
+    {
+      "epoch": 0.8524151763329434,
+      "grad_norm": 0.024751491844654083,
+      "learning_rate": 0.0001664598356531947,
+      "loss": 0.7219,
+      "step": 5100
+    },
+    {
+      "epoch": 0.8549222797927462,
+      "grad_norm": 0.025324271991848946,
+      "learning_rate": 0.0001663592151601543,
+      "loss": 0.7412,
+      "step": 5115
+    },
+    {
+      "epoch": 0.8574293832525489,
+      "grad_norm": 0.02564609982073307,
+      "learning_rate": 0.00016625859466711388,
+      "loss": 0.7366,
+      "step": 5130
+    },
+    {
+      "epoch": 0.8599364867123517,
+      "grad_norm": 0.02468453161418438,
+      "learning_rate": 0.00016615797417407347,
+      "loss": 0.7387,
+      "step": 5145
+    },
+    {
+      "epoch": 0.8624435901721544,
+      "grad_norm": 0.025196226313710213,
+      "learning_rate": 0.00016605735368103303,
+      "loss": 0.7299,
+      "step": 5160
+    },
+    {
+      "epoch": 0.8649506936319572,
+      "grad_norm": 0.02621576189994812,
+      "learning_rate": 0.00016595673318799264,
+      "loss": 0.7495,
+      "step": 5175
+    },
+    {
+      "epoch": 0.86745779709176,
+      "grad_norm": 0.025252273306250572,
+      "learning_rate": 0.0001658561126949522,
+      "loss": 0.7322,
+      "step": 5190
+    },
+    {
+      "epoch": 0.8699649005515627,
+      "grad_norm": 0.025535358116030693,
+      "learning_rate": 0.0001657554922019118,
+      "loss": 0.7281,
+      "step": 5205
+    },
+    {
+      "epoch": 0.8724720040113655,
+      "grad_norm": 0.024804269894957542,
+      "learning_rate": 0.00016565487170887137,
+      "loss": 0.7505,
+      "step": 5220
+    },
+    {
+      "epoch": 0.8749791074711684,
+      "grad_norm": 0.02469950169324875,
+      "learning_rate": 0.00016555425121583098,
+      "loss": 0.7265,
+      "step": 5235
+    },
+    {
+      "epoch": 0.8774862109309711,
+      "grad_norm": 0.02518155239522457,
+      "learning_rate": 0.00016545363072279054,
+      "loss": 0.7288,
+      "step": 5250
+    },
+    {
+      "epoch": 0.8799933143907739,
+      "grad_norm": 0.024804813787341118,
+      "learning_rate": 0.00016535301022975013,
+      "loss": 0.7382,
+      "step": 5265
+    },
+    {
+      "epoch": 0.8825004178505766,
+      "grad_norm": 0.024241533130407333,
+      "learning_rate": 0.0001652523897367097,
+      "loss": 0.7408,
+      "step": 5280
+    },
+    {
+      "epoch": 0.8850075213103794,
+      "grad_norm": 0.025099163874983788,
+      "learning_rate": 0.0001651517692436693,
+      "loss": 0.7324,
+      "step": 5295
+    },
+    {
+      "epoch": 0.8875146247701822,
+      "grad_norm": 0.025935839861631393,
+      "learning_rate": 0.00016505114875062888,
+      "loss": 0.7353,
+      "step": 5310
+    },
+    {
+      "epoch": 0.8900217282299849,
+      "grad_norm": 0.024958360940217972,
+      "learning_rate": 0.00016495052825758847,
+      "loss": 0.724,
+      "step": 5325
+    },
+    {
+      "epoch": 0.8925288316897877,
+      "grad_norm": 0.024382906034588814,
+      "learning_rate": 0.00016484990776454805,
+      "loss": 0.7372,
+      "step": 5340
+    },
+    {
+      "epoch": 0.8950359351495905,
+      "grad_norm": 0.02473212592303753,
+      "learning_rate": 0.00016474928727150764,
+      "loss": 0.7531,
+      "step": 5355
+    },
+    {
+      "epoch": 0.8975430386093933,
+      "grad_norm": 0.024407681077718735,
+      "learning_rate": 0.00016464866677846722,
+      "loss": 0.7489,
+      "step": 5370
+    },
+    {
+      "epoch": 0.9000501420691961,
+      "grad_norm": 0.02625984139740467,
+      "learning_rate": 0.0001645480462854268,
+      "loss": 0.7399,
+      "step": 5385
+    },
+    {
+      "epoch": 0.9025572455289989,
+      "grad_norm": 0.026630889624357224,
+      "learning_rate": 0.0001644474257923864,
+      "loss": 0.7322,
+      "step": 5400
+    },
+    {
+      "epoch": 0.9050643489888016,
+      "grad_norm": 0.025531059131026268,
+      "learning_rate": 0.00016434680529934598,
+      "loss": 0.7499,
+      "step": 5415
+    },
+    {
+      "epoch": 0.9075714524486044,
+      "grad_norm": 0.025482535362243652,
+      "learning_rate": 0.00016424618480630557,
+      "loss": 0.7287,
+      "step": 5430
+    },
+    {
+      "epoch": 0.9100785559084071,
+      "grad_norm": 0.026173191145062447,
+      "learning_rate": 0.00016414556431326512,
+      "loss": 0.7363,
+      "step": 5445
+    },
+    {
+      "epoch": 0.9125856593682099,
+      "grad_norm": 0.024878835305571556,
+      "learning_rate": 0.00016404494382022474,
+      "loss": 0.7592,
+      "step": 5460
+    },
+    {
+      "epoch": 0.9150927628280127,
+      "grad_norm": 0.025847023352980614,
+      "learning_rate": 0.0001639443233271843,
+      "loss": 0.7138,
+      "step": 5475
+    },
+    {
+      "epoch": 0.9175998662878154,
+      "grad_norm": 0.026683717966079712,
+      "learning_rate": 0.0001638437028341439,
+      "loss": 0.7266,
+      "step": 5490
+    },
+    {
+      "epoch": 0.9201069697476183,
+      "grad_norm": 0.02581162378191948,
+      "learning_rate": 0.00016374308234110347,
+      "loss": 0.7175,
+      "step": 5505
+    },
+    {
+      "epoch": 0.9226140732074211,
+      "grad_norm": 0.02513813227415085,
+      "learning_rate": 0.00016364246184806308,
+      "loss": 0.74,
+      "step": 5520
+    },
+    {
+      "epoch": 0.9251211766672238,
+      "grad_norm": 0.024819128215312958,
+      "learning_rate": 0.00016354184135502264,
+      "loss": 0.742,
+      "step": 5535
+    },
+    {
+      "epoch": 0.9276282801270266,
+      "grad_norm": 0.024832414463162422,
+      "learning_rate": 0.00016344122086198222,
+      "loss": 0.7465,
+      "step": 5550
+    },
+    {
+      "epoch": 0.9301353835868293,
+      "grad_norm": 0.02581876330077648,
+      "learning_rate": 0.0001633406003689418,
+      "loss": 0.7383,
+      "step": 5565
+    },
+    {
+      "epoch": 0.9326424870466321,
+      "grad_norm": 0.024939673021435738,
+      "learning_rate": 0.0001632399798759014,
+      "loss": 0.7479,
+      "step": 5580
+    },
+    {
+      "epoch": 0.9351495905064349,
+      "grad_norm": 0.025533072650432587,
+      "learning_rate": 0.00016313935938286098,
+      "loss": 0.7259,
+      "step": 5595
+    },
+    {
+      "epoch": 0.9376566939662376,
+      "grad_norm": 0.02547396905720234,
+      "learning_rate": 0.00016303873888982056,
+      "loss": 0.7258,
+      "step": 5610
+    },
+    {
+      "epoch": 0.9401637974260404,
+      "grad_norm": 0.025361550971865654,
+      "learning_rate": 0.00016293811839678015,
+      "loss": 0.7302,
+      "step": 5625
+    },
+    {
+      "epoch": 0.9426709008858433,
+      "grad_norm": 0.02566991187632084,
+      "learning_rate": 0.00016283749790373974,
+      "loss": 0.7319,
+      "step": 5640
+    },
+    {
+      "epoch": 0.945178004345646,
+      "grad_norm": 0.026383578777313232,
+      "learning_rate": 0.00016273687741069932,
+      "loss": 0.7523,
+      "step": 5655
+    },
+    {
+      "epoch": 0.9476851078054488,
+      "grad_norm": 0.025949161499738693,
+      "learning_rate": 0.0001626362569176589,
+      "loss": 0.7116,
+      "step": 5670
+    },
+    {
+      "epoch": 0.9501922112652516,
+      "grad_norm": 0.02509259060025215,
+      "learning_rate": 0.0001625356364246185,
+      "loss": 0.7259,
+      "step": 5685
+    },
+    {
+      "epoch": 0.9526993147250543,
+      "grad_norm": 0.025692781433463097,
+      "learning_rate": 0.00016243501593157808,
+      "loss": 0.7263,
+      "step": 5700
+    },
+    {
+      "epoch": 0.9552064181848571,
+      "grad_norm": 0.025238677859306335,
+      "learning_rate": 0.00016233439543853766,
+      "loss": 0.7315,
+      "step": 5715
+    },
+    {
+      "epoch": 0.9577135216446598,
+      "grad_norm": 0.025801653042435646,
+      "learning_rate": 0.00016223377494549722,
+      "loss": 0.7329,
+      "step": 5730
+    },
+    {
+      "epoch": 0.9602206251044626,
+      "grad_norm": 0.025331363081932068,
+      "learning_rate": 0.00016213315445245683,
+      "loss": 0.7245,
+      "step": 5745
+    },
+    {
+      "epoch": 0.9627277285642655,
+      "grad_norm": 0.025975272059440613,
+      "learning_rate": 0.0001620325339594164,
+      "loss": 0.7119,
+      "step": 5760
+    },
+    {
+      "epoch": 0.9652348320240682,
+      "grad_norm": 0.025318987667560577,
+      "learning_rate": 0.000161931913466376,
+      "loss": 0.7481,
+      "step": 5775
+    },
+    {
+      "epoch": 0.967741935483871,
+      "grad_norm": 0.02570466138422489,
+      "learning_rate": 0.00016183129297333556,
+      "loss": 0.7506,
+      "step": 5790
+    },
+    {
+      "epoch": 0.9702490389436738,
+      "grad_norm": 0.024957410991191864,
+      "learning_rate": 0.00016173067248029518,
+      "loss": 0.7277,
+      "step": 5805
+    },
+    {
+      "epoch": 0.9727561424034765,
+      "grad_norm": 0.026068007573485374,
+      "learning_rate": 0.00016163005198725473,
+      "loss": 0.7305,
+      "step": 5820
+    },
+    {
+      "epoch": 0.9752632458632793,
+      "grad_norm": 0.026244519278407097,
+      "learning_rate": 0.00016152943149421435,
+      "loss": 0.7184,
+      "step": 5835
+    },
+    {
+      "epoch": 0.977770349323082,
+      "grad_norm": 0.025324849411845207,
+      "learning_rate": 0.0001614288110011739,
+      "loss": 0.7264,
+      "step": 5850
+    },
+    {
+      "epoch": 0.9802774527828848,
+      "grad_norm": 0.025065554305911064,
+      "learning_rate": 0.0001613281905081335,
+      "loss": 0.7294,
+      "step": 5865
+    },
+    {
+      "epoch": 0.9827845562426876,
+      "grad_norm": 0.025444064289331436,
+      "learning_rate": 0.00016122757001509308,
+      "loss": 0.728,
+      "step": 5880
+    },
+    {
+      "epoch": 0.9852916597024904,
+      "grad_norm": 0.026068173348903656,
+      "learning_rate": 0.00016112694952205266,
+      "loss": 0.741,
+      "step": 5895
+    },
+    {
+      "epoch": 0.9877987631622932,
+      "grad_norm": 0.024954237043857574,
+      "learning_rate": 0.00016102632902901225,
+      "loss": 0.7375,
+      "step": 5910
+    },
+    {
+      "epoch": 0.990305866622096,
+      "grad_norm": 0.0247243270277977,
+      "learning_rate": 0.00016092570853597183,
+      "loss": 0.7375,
+      "step": 5925
+    },
+    {
+      "epoch": 0.9928129700818987,
+      "grad_norm": 0.025755500420928,
+      "learning_rate": 0.00016082508804293142,
+      "loss": 0.7368,
+      "step": 5940
+    },
+    {
+      "epoch": 0.9953200735417015,
+      "grad_norm": 0.026517482474446297,
+      "learning_rate": 0.000160724467549891,
+      "loss": 0.7203,
+      "step": 5955
+    },
+    {
+      "epoch": 0.9978271770015043,
+      "grad_norm": 0.025983400642871857,
+      "learning_rate": 0.0001606238470568506,
+      "loss": 0.7142,
+      "step": 5970
+    },
+    {
+      "epoch": 1.0003342804613071,
+      "grad_norm": 0.024920133873820305,
+      "learning_rate": 0.00016052322656381017,
+      "loss": 0.7166,
+      "step": 5985
+    },
+    {
+      "epoch": 1.0028413839211099,
+      "grad_norm": 0.02642948552966118,
+      "learning_rate": 0.00016042260607076976,
+      "loss": 0.7074,
+      "step": 6000
+    },
+    {
+      "epoch": 1.0053484873809126,
+      "grad_norm": 0.026105554774403572,
+      "learning_rate": 0.00016032198557772932,
+      "loss": 0.7139,
+      "step": 6015
+    },
+    {
+      "epoch": 1.0078555908407154,
+      "grad_norm": 0.0251301322132349,
+      "learning_rate": 0.00016022136508468893,
+      "loss": 0.7071,
+      "step": 6030
+    },
+    {
+      "epoch": 1.0103626943005182,
+      "grad_norm": 0.025702379643917084,
+      "learning_rate": 0.0001601207445916485,
+      "loss": 0.6975,
+      "step": 6045
+    },
+    {
+      "epoch": 1.012869797760321,
+      "grad_norm": 0.02600419521331787,
+      "learning_rate": 0.0001600201240986081,
+      "loss": 0.718,
+      "step": 6060
+    },
+    {
+      "epoch": 1.0153769012201237,
+      "grad_norm": 0.026151692494750023,
+      "learning_rate": 0.00015991950360556766,
+      "loss": 0.7069,
+      "step": 6075
+    },
+    {
+      "epoch": 1.0178840046799265,
+      "grad_norm": 0.025088109076023102,
+      "learning_rate": 0.00015981888311252727,
+      "loss": 0.708,
+      "step": 6090
+    },
+    {
+      "epoch": 1.0203911081397292,
+      "grad_norm": 0.026014501228928566,
+      "learning_rate": 0.00015971826261948683,
+      "loss": 0.7133,
+      "step": 6105
+    },
+    {
+      "epoch": 1.022898211599532,
+      "grad_norm": 0.02501731365919113,
+      "learning_rate": 0.00015961764212644644,
+      "loss": 0.7191,
+      "step": 6120
+    },
+    {
+      "epoch": 1.0254053150593347,
+      "grad_norm": 0.025291357189416885,
+      "learning_rate": 0.000159517021633406,
+      "loss": 0.7164,
+      "step": 6135
+    },
+    {
+      "epoch": 1.0279124185191375,
+      "grad_norm": 0.026282720267772675,
+      "learning_rate": 0.00015941640114036561,
+      "loss": 0.72,
+      "step": 6150
+    },
+    {
+      "epoch": 1.0304195219789403,
+      "grad_norm": 0.026496944949030876,
+      "learning_rate": 0.00015931578064732517,
+      "loss": 0.7212,
+      "step": 6165
+    },
+    {
+      "epoch": 1.032926625438743,
+      "grad_norm": 0.026129065081477165,
+      "learning_rate": 0.00015921516015428476,
+      "loss": 0.7121,
+      "step": 6180
+    },
+    {
+      "epoch": 1.0354337288985458,
+      "grad_norm": 0.027691906318068504,
+      "learning_rate": 0.00015911453966124434,
+      "loss": 0.7238,
+      "step": 6195
+    },
+    {
+      "epoch": 1.0379408323583486,
+      "grad_norm": 0.025952916592359543,
+      "learning_rate": 0.00015901391916820393,
+      "loss": 0.7141,
+      "step": 6210
+    },
+    {
+      "epoch": 1.0404479358181513,
+      "grad_norm": 0.0261197779327631,
+      "learning_rate": 0.00015891329867516351,
+      "loss": 0.7116,
+      "step": 6225
+    },
+    {
+      "epoch": 1.0429550392779543,
+      "grad_norm": 0.02602444589138031,
+      "learning_rate": 0.0001588126781821231,
+      "loss": 0.7212,
+      "step": 6240
+    },
+    {
+      "epoch": 1.045462142737757,
+      "grad_norm": 0.027341393753886223,
+      "learning_rate": 0.00015871205768908269,
+      "loss": 0.7065,
+      "step": 6255
+    },
+    {
+      "epoch": 1.0479692461975598,
+      "grad_norm": 0.026516225188970566,
+      "learning_rate": 0.00015861143719604227,
+      "loss": 0.7137,
+      "step": 6270
+    },
+    {
+      "epoch": 1.0504763496573626,
+      "grad_norm": 0.025233183056116104,
+      "learning_rate": 0.00015851081670300186,
+      "loss": 0.7286,
+      "step": 6285
+    },
+    {
+      "epoch": 1.0529834531171653,
+      "grad_norm": 0.025705158710479736,
+      "learning_rate": 0.00015841019620996141,
+      "loss": 0.7252,
+      "step": 6300
+    },
+    {
+      "epoch": 1.055490556576968,
+      "grad_norm": 0.025452638044953346,
+      "learning_rate": 0.00015830957571692103,
+      "loss": 0.7207,
+      "step": 6315
+    },
+    {
+      "epoch": 1.0579976600367709,
+      "grad_norm": 0.027089523151516914,
+      "learning_rate": 0.00015820895522388059,
+      "loss": 0.7123,
+      "step": 6330
+    },
+    {
+      "epoch": 1.0605047634965736,
+      "grad_norm": 0.02557321824133396,
+      "learning_rate": 0.0001581083347308402,
+      "loss": 0.7153,
+      "step": 6345
+    },
+    {
+      "epoch": 1.0630118669563764,
+      "grad_norm": 0.026297248899936676,
+      "learning_rate": 0.00015800771423779976,
+      "loss": 0.7183,
+      "step": 6360
+    },
+    {
+      "epoch": 1.0655189704161792,
+      "grad_norm": 0.026958812028169632,
+      "learning_rate": 0.00015790709374475937,
+      "loss": 0.7117,
+      "step": 6375
+    },
+    {
+      "epoch": 1.068026073875982,
+      "grad_norm": 0.026555512100458145,
+      "learning_rate": 0.00015780647325171893,
+      "loss": 0.7025,
+      "step": 6390
+    },
+    {
+      "epoch": 1.0705331773357847,
+      "grad_norm": 0.026713771745562553,
+      "learning_rate": 0.00015770585275867854,
+      "loss": 0.7158,
+      "step": 6405
+    },
+    {
+      "epoch": 1.0730402807955874,
+      "grad_norm": 0.02662680670619011,
+      "learning_rate": 0.0001576052322656381,
+      "loss": 0.7104,
+      "step": 6420
+    },
+    {
+      "epoch": 1.0755473842553902,
+      "grad_norm": 0.02612622268497944,
+      "learning_rate": 0.0001575046117725977,
+      "loss": 0.7186,
+      "step": 6435
+    },
+    {
+      "epoch": 1.078054487715193,
+      "grad_norm": 0.02652982994914055,
+      "learning_rate": 0.00015740399127955727,
+      "loss": 0.7097,
+      "step": 6450
+    },
+    {
+      "epoch": 1.0805615911749957,
+      "grad_norm": 0.026232892647385597,
+      "learning_rate": 0.00015730337078651685,
+      "loss": 0.7203,
+      "step": 6465
+    },
+    {
+      "epoch": 1.0830686946347985,
+      "grad_norm": 0.02632397972047329,
+      "learning_rate": 0.00015720275029347644,
+      "loss": 0.7209,
+      "step": 6480
+    },
+    {
+      "epoch": 1.0855757980946015,
+      "grad_norm": 0.02648136578500271,
+      "learning_rate": 0.00015710212980043603,
+      "loss": 0.7182,
+      "step": 6495
+    },
+    {
+      "epoch": 1.0880829015544042,
+      "grad_norm": 0.025636956095695496,
+      "learning_rate": 0.0001570015093073956,
+      "loss": 0.7077,
+      "step": 6510
+    },
+    {
+      "epoch": 1.090590005014207,
+      "grad_norm": 0.026664093136787415,
+      "learning_rate": 0.0001569008888143552,
+      "loss": 0.7216,
+      "step": 6525
+    },
+    {
+      "epoch": 1.0930971084740098,
+      "grad_norm": 0.02704274095594883,
+      "learning_rate": 0.00015680026832131478,
+      "loss": 0.714,
+      "step": 6540
+    },
+    {
+      "epoch": 1.0956042119338125,
+      "grad_norm": 0.026222985237836838,
+      "learning_rate": 0.00015669964782827437,
+      "loss": 0.7134,
+      "step": 6555
+    },
+    {
+      "epoch": 1.0981113153936153,
+      "grad_norm": 0.02940414845943451,
+      "learning_rate": 0.00015659902733523395,
+      "loss": 0.6986,
+      "step": 6570
+    },
+    {
+      "epoch": 1.100618418853418,
+      "grad_norm": 0.025812886655330658,
+      "learning_rate": 0.0001564984068421935,
+      "loss": 0.7163,
+      "step": 6585
+    },
+    {
+      "epoch": 1.1031255223132208,
+      "grad_norm": 0.026331394910812378,
+      "learning_rate": 0.00015639778634915312,
+      "loss": 0.7097,
+      "step": 6600
+    },
+    {
+      "epoch": 1.1056326257730236,
+      "grad_norm": 0.027025267481803894,
+      "learning_rate": 0.00015629716585611268,
+      "loss": 0.6983,
+      "step": 6615
+    },
+    {
+      "epoch": 1.1081397292328263,
+      "grad_norm": 0.02628287486732006,
+      "learning_rate": 0.0001561965453630723,
+      "loss": 0.7113,
+      "step": 6630
+    },
+    {
+      "epoch": 1.110646832692629,
+      "grad_norm": 0.0271297600120306,
+      "learning_rate": 0.00015609592487003185,
+      "loss": 0.7163,
+      "step": 6645
+    },
+    {
+      "epoch": 1.1131539361524319,
+      "grad_norm": 0.027640245854854584,
+      "learning_rate": 0.00015599530437699147,
+      "loss": 0.6974,
+      "step": 6660
+    },
+    {
+      "epoch": 1.1156610396122346,
+      "grad_norm": 0.026571575552225113,
+      "learning_rate": 0.00015589468388395102,
+      "loss": 0.7032,
+      "step": 6675
+    },
+    {
+      "epoch": 1.1181681430720374,
+      "grad_norm": 0.02639468014240265,
+      "learning_rate": 0.00015579406339091064,
+      "loss": 0.7139,
+      "step": 6690
+    },
+    {
+      "epoch": 1.1206752465318401,
+      "grad_norm": 0.026831267401576042,
+      "learning_rate": 0.0001556934428978702,
+      "loss": 0.7209,
+      "step": 6705
+    },
+    {
+      "epoch": 1.123182349991643,
+      "grad_norm": 0.028162870556116104,
+      "learning_rate": 0.0001555928224048298,
+      "loss": 0.7066,
+      "step": 6720
+    },
+    {
+      "epoch": 1.1256894534514457,
+      "grad_norm": 0.02714131958782673,
+      "learning_rate": 0.00015549220191178937,
+      "loss": 0.721,
+      "step": 6735
+    },
+    {
+      "epoch": 1.1281965569112486,
+      "grad_norm": 0.0281366016715765,
+      "learning_rate": 0.00015539158141874898,
+      "loss": 0.7024,
+      "step": 6750
+    },
+    {
+      "epoch": 1.1307036603710512,
+      "grad_norm": 0.027032790705561638,
+      "learning_rate": 0.00015529096092570854,
+      "loss": 0.7283,
+      "step": 6765
+    },
+    {
+      "epoch": 1.1332107638308542,
+      "grad_norm": 0.026658054441213608,
+      "learning_rate": 0.00015519034043266812,
+      "loss": 0.7111,
+      "step": 6780
+    },
+    {
+      "epoch": 1.135717867290657,
+      "grad_norm": 0.026945605874061584,
+      "learning_rate": 0.0001550897199396277,
+      "loss": 0.7298,
+      "step": 6795
+    },
+    {
+      "epoch": 1.1382249707504597,
+      "grad_norm": 0.02765739895403385,
+      "learning_rate": 0.0001549890994465873,
+      "loss": 0.7081,
+      "step": 6810
+    },
+    {
+      "epoch": 1.1407320742102625,
+      "grad_norm": 0.02612920291721821,
+      "learning_rate": 0.00015488847895354688,
+      "loss": 0.709,
+      "step": 6825
+    },
+    {
+      "epoch": 1.1432391776700652,
+      "grad_norm": 0.026704227551817894,
+      "learning_rate": 0.00015478785846050646,
+      "loss": 0.7088,
+      "step": 6840
+    },
+    {
+      "epoch": 1.145746281129868,
+      "grad_norm": 0.027153639122843742,
+      "learning_rate": 0.00015468723796746605,
+      "loss": 0.7166,
+      "step": 6855
+    },
+    {
+      "epoch": 1.1482533845896707,
+      "grad_norm": 0.02730732038617134,
+      "learning_rate": 0.0001545866174744256,
+      "loss": 0.7101,
+      "step": 6870
+    },
+    {
+      "epoch": 1.1507604880494735,
+      "grad_norm": 0.027596892789006233,
+      "learning_rate": 0.00015448599698138522,
+      "loss": 0.7122,
+      "step": 6885
+    },
+    {
+      "epoch": 1.1532675915092763,
+      "grad_norm": 0.02678474597632885,
+      "learning_rate": 0.00015438537648834478,
+      "loss": 0.7226,
+      "step": 6900
+    },
+    {
+      "epoch": 1.155774694969079,
+      "grad_norm": 0.02596975676715374,
+      "learning_rate": 0.0001542847559953044,
+      "loss": 0.7119,
+      "step": 6915
+    },
+    {
+      "epoch": 1.1582817984288818,
+      "grad_norm": 0.026990054175257683,
+      "learning_rate": 0.00015418413550226395,
+      "loss": 0.7186,
+      "step": 6930
+    },
+    {
+      "epoch": 1.1607889018886846,
+      "grad_norm": 0.026957310736179352,
+      "learning_rate": 0.00015408351500922356,
+      "loss": 0.7201,
+      "step": 6945
+    },
+    {
+      "epoch": 1.1632960053484873,
+      "grad_norm": 0.02676299959421158,
+      "learning_rate": 0.00015398289451618312,
+      "loss": 0.7116,
+      "step": 6960
+    },
+    {
+      "epoch": 1.16580310880829,
+      "grad_norm": 0.026614701375365257,
+      "learning_rate": 0.00015388227402314273,
+      "loss": 0.7181,
+      "step": 6975
+    },
+    {
+      "epoch": 1.1683102122680928,
+      "grad_norm": 0.02804492600262165,
+      "learning_rate": 0.0001537816535301023,
+      "loss": 0.7062,
+      "step": 6990
+    },
+    {
+      "epoch": 1.1708173157278958,
+      "grad_norm": 0.027462385594844818,
+      "learning_rate": 0.0001536810330370619,
+      "loss": 0.7274,
+      "step": 7005
+    },
+    {
+      "epoch": 1.1733244191876984,
+      "grad_norm": 0.026805778965353966,
+      "learning_rate": 0.00015358041254402146,
+      "loss": 0.7176,
+      "step": 7020
+    },
+    {
+      "epoch": 1.1758315226475013,
+      "grad_norm": 0.027235226705670357,
+      "learning_rate": 0.00015347979205098108,
+      "loss": 0.6944,
+      "step": 7035
+    },
+    {
+      "epoch": 1.178338626107304,
+      "grad_norm": 0.02651335299015045,
+      "learning_rate": 0.00015337917155794063,
+      "loss": 0.7084,
+      "step": 7050
+    },
+    {
+      "epoch": 1.1808457295671069,
+      "grad_norm": 0.027431068941950798,
+      "learning_rate": 0.00015327855106490025,
+      "loss": 0.7261,
+      "step": 7065
+    },
+    {
+      "epoch": 1.1833528330269096,
+      "grad_norm": 0.027069034054875374,
+      "learning_rate": 0.0001531779305718598,
+      "loss": 0.7269,
+      "step": 7080
+    },
+    {
+      "epoch": 1.1858599364867124,
+      "grad_norm": 0.026999959722161293,
+      "learning_rate": 0.0001530773100788194,
+      "loss": 0.713,
+      "step": 7095
+    },
+    {
+      "epoch": 1.1883670399465152,
+      "grad_norm": 0.027173152193427086,
+      "learning_rate": 0.00015297668958577898,
+      "loss": 0.7099,
+      "step": 7110
+    },
+    {
+      "epoch": 1.190874143406318,
+      "grad_norm": 0.026728777214884758,
+      "learning_rate": 0.00015287606909273856,
+      "loss": 0.7006,
+      "step": 7125
+    },
+    {
+      "epoch": 1.1933812468661207,
+      "grad_norm": 0.02722666971385479,
+      "learning_rate": 0.00015277544859969815,
+      "loss": 0.711,
+      "step": 7140
+    },
+    {
+      "epoch": 1.1958883503259234,
+      "grad_norm": 0.027167314663529396,
+      "learning_rate": 0.00015267482810665773,
+      "loss": 0.708,
+      "step": 7155
+    },
+    {
+      "epoch": 1.1983954537857262,
+      "grad_norm": 0.027100099250674248,
+      "learning_rate": 0.00015257420761361732,
+      "loss": 0.6944,
+      "step": 7170
+    },
+    {
+      "epoch": 1.200902557245529,
+      "grad_norm": 0.026492077857255936,
+      "learning_rate": 0.00015247358712057688,
+      "loss": 0.7122,
+      "step": 7185
+    },
+    {
+      "epoch": 1.2034096607053317,
+      "grad_norm": 0.027062034234404564,
+      "learning_rate": 0.0001523729666275365,
+      "loss": 0.7333,
+      "step": 7200
+    },
+    {
+      "epoch": 1.2059167641651345,
+      "grad_norm": 0.026957035064697266,
+      "learning_rate": 0.00015227234613449605,
+      "loss": 0.706,
+      "step": 7215
+    },
+    {
+      "epoch": 1.2084238676249373,
+      "grad_norm": 0.027580831199884415,
+      "learning_rate": 0.00015217172564145566,
+      "loss": 0.7113,
+      "step": 7230
+    },
+    {
+      "epoch": 1.21093097108474,
+      "grad_norm": 0.02672952227294445,
+      "learning_rate": 0.00015207110514841522,
+      "loss": 0.6934,
+      "step": 7245
+    },
+    {
+      "epoch": 1.2134380745445428,
+      "grad_norm": 0.026900822296738625,
+      "learning_rate": 0.00015197048465537483,
+      "loss": 0.7015,
+      "step": 7260
+    },
+    {
+      "epoch": 1.2159451780043455,
+      "grad_norm": 0.028098303824663162,
+      "learning_rate": 0.0001518698641623344,
+      "loss": 0.7143,
+      "step": 7275
+    },
+    {
+      "epoch": 1.2184522814641485,
+      "grad_norm": 0.026865461841225624,
+      "learning_rate": 0.000151769243669294,
+      "loss": 0.7253,
+      "step": 7290
+    },
+    {
+      "epoch": 1.2209593849239513,
+      "grad_norm": 0.02781241200864315,
+      "learning_rate": 0.00015166862317625356,
+      "loss": 0.7099,
+      "step": 7305
+    },
+    {
+      "epoch": 1.223466488383754,
+      "grad_norm": 0.027126578614115715,
+      "learning_rate": 0.00015156800268321317,
+      "loss": 0.6956,
+      "step": 7320
+    },
+    {
+      "epoch": 1.2259735918435568,
+      "grad_norm": 0.02705315686762333,
+      "learning_rate": 0.00015146738219017273,
+      "loss": 0.7037,
+      "step": 7335
+    },
+    {
+      "epoch": 1.2284806953033596,
+      "grad_norm": 0.027233878150582314,
+      "learning_rate": 0.00015136676169713234,
+      "loss": 0.7137,
+      "step": 7350
+    },
+    {
+      "epoch": 1.2309877987631623,
+      "grad_norm": 0.028538642451167107,
+      "learning_rate": 0.0001512661412040919,
+      "loss": 0.7123,
+      "step": 7365
+    },
+    {
+      "epoch": 1.233494902222965,
+      "grad_norm": 0.027490422129631042,
+      "learning_rate": 0.0001511655207110515,
+      "loss": 0.7208,
+      "step": 7380
+    },
+    {
+      "epoch": 1.2360020056827679,
+      "grad_norm": 0.02747008576989174,
+      "learning_rate": 0.00015106490021801107,
+      "loss": 0.7196,
+      "step": 7395
+    },
+    {
+      "epoch": 1.2385091091425706,
+      "grad_norm": 0.026851654052734375,
+      "learning_rate": 0.00015096427972497066,
+      "loss": 0.7149,
+      "step": 7410
+    },
+    {
+      "epoch": 1.2410162126023734,
+      "grad_norm": 0.02743196301162243,
+      "learning_rate": 0.00015086365923193024,
+      "loss": 0.7175,
+      "step": 7425
+    },
+    {
+      "epoch": 1.2435233160621761,
+      "grad_norm": 0.028329750522971153,
+      "learning_rate": 0.00015076303873888983,
+      "loss": 0.698,
+      "step": 7440
+    },
+    {
+      "epoch": 1.246030419521979,
+      "grad_norm": 0.026834193617105484,
+      "learning_rate": 0.00015066241824584941,
+      "loss": 0.7063,
+      "step": 7455
+    },
+    {
+      "epoch": 1.2485375229817817,
+      "grad_norm": 0.028689688071608543,
+      "learning_rate": 0.000150561797752809,
+      "loss": 0.704,
+      "step": 7470
+    },
+    {
+      "epoch": 1.2510446264415844,
+      "grad_norm": 0.02716403640806675,
+      "learning_rate": 0.00015046117725976859,
+      "loss": 0.7095,
+      "step": 7485
+    },
+    {
+      "epoch": 1.2535517299013872,
+      "grad_norm": 0.027952060103416443,
+      "learning_rate": 0.00015036055676672814,
+      "loss": 0.7341,
+      "step": 7500
+    },
+    {
+      "epoch": 1.2560588333611902,
+      "grad_norm": 0.028136277571320534,
+      "learning_rate": 0.00015025993627368776,
+      "loss": 0.6967,
+      "step": 7515
+    },
+    {
+      "epoch": 1.2585659368209927,
+      "grad_norm": 0.027513163164258003,
+      "learning_rate": 0.00015015931578064731,
+      "loss": 0.7053,
+      "step": 7530
+    },
+    {
+      "epoch": 1.2610730402807957,
+      "grad_norm": 0.027584819123148918,
+      "learning_rate": 0.00015005869528760693,
+      "loss": 0.7162,
+      "step": 7545
+    },
+    {
+      "epoch": 1.2635801437405982,
+      "grad_norm": 0.02737903967499733,
+      "learning_rate": 0.00014995807479456649,
+      "loss": 0.7211,
+      "step": 7560
+    },
+    {
+      "epoch": 1.2660872472004012,
+      "grad_norm": 0.028384409844875336,
+      "learning_rate": 0.0001498574543015261,
+      "loss": 0.7059,
+      "step": 7575
+    },
+    {
+      "epoch": 1.268594350660204,
+      "grad_norm": 0.027213079854846,
+      "learning_rate": 0.00014975683380848566,
+      "loss": 0.7064,
+      "step": 7590
+    },
+    {
+      "epoch": 1.2711014541200067,
+      "grad_norm": 0.02736794948577881,
+      "learning_rate": 0.00014965621331544527,
+      "loss": 0.712,
+      "step": 7605
+    },
+    {
+      "epoch": 1.2736085575798095,
+      "grad_norm": 0.026495933532714844,
+      "learning_rate": 0.00014955559282240483,
+      "loss": 0.7115,
+      "step": 7620
+    },
+    {
+      "epoch": 1.2761156610396123,
+      "grad_norm": 0.02718982845544815,
+      "learning_rate": 0.00014945497232936444,
+      "loss": 0.7039,
+      "step": 7635
+    },
+    {
+      "epoch": 1.278622764499415,
+      "grad_norm": 0.027888623997569084,
+      "learning_rate": 0.000149354351836324,
+      "loss": 0.6947,
+      "step": 7650
+    },
+    {
+      "epoch": 1.2811298679592178,
+      "grad_norm": 0.027887005358934402,
+      "learning_rate": 0.0001492537313432836,
+      "loss": 0.7092,
+      "step": 7665
+    },
+    {
+      "epoch": 1.2836369714190206,
+      "grad_norm": 0.02832951210439205,
+      "learning_rate": 0.00014915311085024317,
+      "loss": 0.7253,
+      "step": 7680
+    },
+    {
+      "epoch": 1.2861440748788233,
+      "grad_norm": 0.027755776420235634,
+      "learning_rate": 0.00014905249035720275,
+      "loss": 0.7051,
+      "step": 7695
+    },
+    {
+      "epoch": 1.288651178338626,
+      "grad_norm": 0.027755258604884148,
+      "learning_rate": 0.00014895186986416234,
+      "loss": 0.7131,
+      "step": 7710
+    },
+    {
+      "epoch": 1.2911582817984288,
+      "grad_norm": 0.027515331283211708,
+      "learning_rate": 0.00014885124937112193,
+      "loss": 0.6972,
+      "step": 7725
+    },
+    {
+      "epoch": 1.2936653852582316,
+      "grad_norm": 0.02867818996310234,
+      "learning_rate": 0.0001487506288780815,
+      "loss": 0.6909,
+      "step": 7740
+    },
+    {
+      "epoch": 1.2961724887180344,
+      "grad_norm": 0.027417359873652458,
+      "learning_rate": 0.0001486500083850411,
+      "loss": 0.7112,
+      "step": 7755
+    },
+    {
+      "epoch": 1.2986795921778371,
+      "grad_norm": 0.02725161798298359,
+      "learning_rate": 0.00014854938789200068,
+      "loss": 0.7172,
+      "step": 7770
+    },
+    {
+      "epoch": 1.30118669563764,
+      "grad_norm": 0.027100631967186928,
+      "learning_rate": 0.00014844876739896024,
+      "loss": 0.7079,
+      "step": 7785
+    },
+    {
+      "epoch": 1.3036937990974429,
+      "grad_norm": 0.026735814288258553,
+      "learning_rate": 0.00014834814690591985,
+      "loss": 0.7134,
+      "step": 7800
+    },
+    {
+      "epoch": 1.3062009025572454,
+      "grad_norm": 0.02827010303735733,
+      "learning_rate": 0.0001482475264128794,
+      "loss": 0.7077,
+      "step": 7815
+    },
+    {
+      "epoch": 1.3087080060170484,
+      "grad_norm": 0.02705741673707962,
+      "learning_rate": 0.00014814690591983902,
+      "loss": 0.7172,
+      "step": 7830
+    },
+    {
+      "epoch": 1.3112151094768512,
+      "grad_norm": 0.02796081081032753,
+      "learning_rate": 0.00014804628542679858,
+      "loss": 0.7232,
+      "step": 7845
+    },
+    {
+      "epoch": 1.313722212936654,
+      "grad_norm": 0.027841266244649887,
+      "learning_rate": 0.0001479456649337582,
+      "loss": 0.7113,
+      "step": 7860
+    },
+    {
+      "epoch": 1.3162293163964567,
+      "grad_norm": 0.030358731746673584,
+      "learning_rate": 0.00014784504444071775,
+      "loss": 0.7014,
+      "step": 7875
+    },
+    {
+      "epoch": 1.3187364198562594,
+      "grad_norm": 0.02849227376282215,
+      "learning_rate": 0.00014774442394767737,
+      "loss": 0.7233,
+      "step": 7890
+    },
+    {
+      "epoch": 1.3212435233160622,
+      "grad_norm": 0.02644391544163227,
+      "learning_rate": 0.00014764380345463692,
+      "loss": 0.729,
+      "step": 7905
+    },
+    {
+      "epoch": 1.323750626775865,
+      "grad_norm": 0.027298742905259132,
+      "learning_rate": 0.00014754318296159654,
+      "loss": 0.722,
+      "step": 7920
+    },
+    {
+      "epoch": 1.3262577302356677,
+      "grad_norm": 0.027199968695640564,
+      "learning_rate": 0.0001474425624685561,
+      "loss": 0.7041,
+      "step": 7935
+    },
+    {
+      "epoch": 1.3287648336954705,
+      "grad_norm": 0.027822501957416534,
+      "learning_rate": 0.0001473419419755157,
+      "loss": 0.7044,
+      "step": 7950
+    },
+    {
+      "epoch": 1.3312719371552733,
+      "grad_norm": 0.027914773672819138,
+      "learning_rate": 0.00014724132148247527,
+      "loss": 0.7074,
+      "step": 7965
+    },
+    {
+      "epoch": 1.333779040615076,
+      "grad_norm": 0.028190581128001213,
+      "learning_rate": 0.00014714070098943488,
+      "loss": 0.7014,
+      "step": 7980
+    },
+    {
+      "epoch": 1.3362861440748788,
+      "grad_norm": 0.027638264000415802,
+      "learning_rate": 0.00014704008049639444,
+      "loss": 0.6973,
+      "step": 7995
+    },
+    {
+      "epoch": 1.3387932475346815,
+      "grad_norm": 0.028353575617074966,
+      "learning_rate": 0.00014693946000335402,
+      "loss": 0.7191,
+      "step": 8010
+    },
+    {
+      "epoch": 1.3413003509944843,
+      "grad_norm": 0.027547866106033325,
+      "learning_rate": 0.0001468388395103136,
+      "loss": 0.7089,
+      "step": 8025
+    },
+    {
+      "epoch": 1.343807454454287,
+      "grad_norm": 0.02667342871427536,
+      "learning_rate": 0.0001467382190172732,
+      "loss": 0.7075,
+      "step": 8040
+    },
+    {
+      "epoch": 1.34631455791409,
+      "grad_norm": 0.028818530961871147,
+      "learning_rate": 0.00014663759852423278,
+      "loss": 0.7008,
+      "step": 8055
+    },
+    {
+      "epoch": 1.3488216613738926,
+      "grad_norm": 0.02606160379946232,
+      "learning_rate": 0.00014653697803119236,
+      "loss": 0.7207,
+      "step": 8070
+    },
+    {
+      "epoch": 1.3513287648336956,
+      "grad_norm": 0.028475910425186157,
+      "learning_rate": 0.00014643635753815195,
+      "loss": 0.6993,
+      "step": 8085
+    },
+    {
+      "epoch": 1.3538358682934981,
+      "grad_norm": 0.02790878899395466,
+      "learning_rate": 0.0001463357370451115,
+      "loss": 0.7058,
+      "step": 8100
+    },
+    {
+      "epoch": 1.356342971753301,
+      "grad_norm": 0.028986552730202675,
+      "learning_rate": 0.00014623511655207112,
+      "loss": 0.7056,
+      "step": 8115
+    },
+    {
+      "epoch": 1.3588500752131039,
+      "grad_norm": 0.02837732620537281,
+      "learning_rate": 0.00014613449605903068,
+      "loss": 0.708,
+      "step": 8130
+    },
+    {
+      "epoch": 1.3613571786729066,
+      "grad_norm": 0.027905132621526718,
+      "learning_rate": 0.0001460338755659903,
+      "loss": 0.7024,
+      "step": 8145
+    },
+    {
+      "epoch": 1.3638642821327094,
+      "grad_norm": 0.027892014011740685,
+      "learning_rate": 0.00014593325507294985,
+      "loss": 0.7011,
+      "step": 8160
+    },
+    {
+      "epoch": 1.3663713855925121,
+      "grad_norm": 0.02732338011264801,
+      "learning_rate": 0.00014583263457990946,
+      "loss": 0.6972,
+      "step": 8175
+    },
+    {
+      "epoch": 1.368878489052315,
+      "grad_norm": 0.028097622096538544,
+      "learning_rate": 0.00014573201408686902,
+      "loss": 0.6951,
+      "step": 8190
+    },
+    {
+      "epoch": 1.3713855925121177,
+      "grad_norm": 0.028016911819577217,
+      "learning_rate": 0.00014563139359382863,
+      "loss": 0.7156,
+      "step": 8205
+    },
+    {
+      "epoch": 1.3738926959719204,
+      "grad_norm": 0.02762255072593689,
+      "learning_rate": 0.0001455307731007882,
+      "loss": 0.7021,
+      "step": 8220
+    },
+    {
+      "epoch": 1.3763997994317232,
+      "grad_norm": 0.027654899284243584,
+      "learning_rate": 0.0001454301526077478,
+      "loss": 0.7074,
+      "step": 8235
+    },
+    {
+      "epoch": 1.378906902891526,
+      "grad_norm": 0.027378590777516365,
+      "learning_rate": 0.00014532953211470736,
+      "loss": 0.706,
+      "step": 8250
+    },
+    {
+      "epoch": 1.3814140063513287,
+      "grad_norm": 0.026956256479024887,
+      "learning_rate": 0.00014522891162166698,
+      "loss": 0.7183,
+      "step": 8265
+    },
+    {
+      "epoch": 1.3839211098111315,
+      "grad_norm": 0.027121366932988167,
+      "learning_rate": 0.00014512829112862653,
+      "loss": 0.7027,
+      "step": 8280
+    },
+    {
+      "epoch": 1.3864282132709342,
+      "grad_norm": 0.02765464223921299,
+      "learning_rate": 0.00014502767063558612,
+      "loss": 0.7132,
+      "step": 8295
+    },
+    {
+      "epoch": 1.3889353167307372,
+      "grad_norm": 0.02817637287080288,
+      "learning_rate": 0.0001449270501425457,
+      "loss": 0.6864,
+      "step": 8310
+    },
+    {
+      "epoch": 1.3914424201905398,
+      "grad_norm": 0.02854936383664608,
+      "learning_rate": 0.0001448264296495053,
+      "loss": 0.7248,
+      "step": 8325
+    },
+    {
+      "epoch": 1.3939495236503427,
+      "grad_norm": 0.028685523197054863,
+      "learning_rate": 0.00014472580915646488,
+      "loss": 0.7036,
+      "step": 8340
+    },
+    {
+      "epoch": 1.3964566271101453,
+      "grad_norm": 0.028023192659020424,
+      "learning_rate": 0.00014462518866342446,
+      "loss": 0.706,
+      "step": 8355
+    },
+    {
+      "epoch": 1.3989637305699483,
+      "grad_norm": 0.027805542573332787,
+      "learning_rate": 0.00014452456817038405,
+      "loss": 0.7126,
+      "step": 8370
+    },
+    {
+      "epoch": 1.401470834029751,
+      "grad_norm": 0.027909213677048683,
+      "learning_rate": 0.00014442394767734363,
+      "loss": 0.7133,
+      "step": 8385
+    },
+    {
+      "epoch": 1.4039779374895538,
+      "grad_norm": 0.02798452228307724,
+      "learning_rate": 0.00014432332718430322,
+      "loss": 0.7052,
+      "step": 8400
+    },
+    {
+      "epoch": 1.4064850409493566,
+      "grad_norm": 0.02735227160155773,
+      "learning_rate": 0.00014422270669126278,
+      "loss": 0.7096,
+      "step": 8415
+    },
+    {
+      "epoch": 1.4089921444091593,
+      "grad_norm": 0.027850987389683723,
+      "learning_rate": 0.0001441220861982224,
+      "loss": 0.718,
+      "step": 8430
+    },
+    {
+      "epoch": 1.411499247868962,
+      "grad_norm": 0.028347337618470192,
+      "learning_rate": 0.00014402146570518195,
+      "loss": 0.6989,
+      "step": 8445
+    },
+    {
+      "epoch": 1.4140063513287648,
+      "grad_norm": 0.028133846819400787,
+      "learning_rate": 0.00014392084521214156,
+      "loss": 0.7073,
+      "step": 8460
+    },
+    {
+      "epoch": 1.4165134547885676,
+      "grad_norm": 0.02889505960047245,
+      "learning_rate": 0.00014382022471910112,
+      "loss": 0.7157,
+      "step": 8475
+    },
+    {
+      "epoch": 1.4190205582483704,
+      "grad_norm": 0.02751564234495163,
+      "learning_rate": 0.00014371960422606073,
+      "loss": 0.7115,
+      "step": 8490
+    },
+    {
+      "epoch": 1.4215276617081731,
+      "grad_norm": 0.027201758697628975,
+      "learning_rate": 0.0001436189837330203,
+      "loss": 0.7121,
+      "step": 8505
+    },
+    {
+      "epoch": 1.424034765167976,
+      "grad_norm": 0.047122806310653687,
+      "learning_rate": 0.0001435183632399799,
+      "loss": 0.7103,
+      "step": 8520
+    },
+    {
+      "epoch": 1.4265418686277787,
+      "grad_norm": 0.028580831363797188,
+      "learning_rate": 0.00014341774274693946,
+      "loss": 0.6933,
+      "step": 8535
+    },
+    {
+      "epoch": 1.4290489720875814,
+      "grad_norm": 0.028754740953445435,
+      "learning_rate": 0.00014331712225389907,
+      "loss": 0.7155,
+      "step": 8550
+    },
+    {
+      "epoch": 1.4315560755473842,
+      "grad_norm": 0.028142735362052917,
+      "learning_rate": 0.00014321650176085863,
+      "loss": 0.7076,
+      "step": 8565
+    },
+    {
+      "epoch": 1.434063179007187,
+      "grad_norm": 0.02792290225625038,
+      "learning_rate": 0.00014311588126781822,
+      "loss": 0.7202,
+      "step": 8580
+    },
+    {
+      "epoch": 1.43657028246699,
+      "grad_norm": 0.027254393324255943,
+      "learning_rate": 0.0001430152607747778,
+      "loss": 0.7116,
+      "step": 8595
+    },
+    {
+      "epoch": 1.4390773859267925,
+      "grad_norm": 0.027158159762620926,
+      "learning_rate": 0.0001429146402817374,
+      "loss": 0.7034,
+      "step": 8610
+    },
+    {
+      "epoch": 1.4415844893865954,
+      "grad_norm": 0.028217531740665436,
+      "learning_rate": 0.00014281401978869697,
+      "loss": 0.7136,
+      "step": 8625
+    },
+    {
+      "epoch": 1.4440915928463982,
+      "grad_norm": 0.028678081929683685,
+      "learning_rate": 0.00014271339929565656,
+      "loss": 0.7053,
+      "step": 8640
+    },
+    {
+      "epoch": 1.446598696306201,
+      "grad_norm": 0.028371306136250496,
+      "learning_rate": 0.00014261277880261614,
+      "loss": 0.7115,
+      "step": 8655
+    },
+    {
+      "epoch": 1.4491057997660037,
+      "grad_norm": 0.027796892449259758,
+      "learning_rate": 0.00014251215830957573,
+      "loss": 0.7138,
+      "step": 8670
+    },
+    {
+      "epoch": 1.4516129032258065,
+      "grad_norm": 0.027524475008249283,
+      "learning_rate": 0.00014241153781653531,
+      "loss": 0.7048,
+      "step": 8685
+    },
+    {
+      "epoch": 1.4541200066856093,
+      "grad_norm": 0.02704106830060482,
+      "learning_rate": 0.00014231091732349487,
+      "loss": 0.6967,
+      "step": 8700
+    },
+    {
+      "epoch": 1.456627110145412,
+      "grad_norm": 0.028332151472568512,
+      "learning_rate": 0.00014221029683045448,
+      "loss": 0.7015,
+      "step": 8715
+    },
+    {
+      "epoch": 1.4591342136052148,
+      "grad_norm": 0.028455249965190887,
+      "learning_rate": 0.00014210967633741404,
+      "loss": 0.6973,
+      "step": 8730
+    },
+    {
+      "epoch": 1.4616413170650175,
+      "grad_norm": 0.028323177248239517,
+      "learning_rate": 0.00014200905584437366,
+      "loss": 0.7119,
+      "step": 8745
+    },
+    {
+      "epoch": 1.4641484205248203,
+      "grad_norm": 0.028827426955103874,
+      "learning_rate": 0.00014190843535133321,
+      "loss": 0.6973,
+      "step": 8760
+    },
+    {
+      "epoch": 1.466655523984623,
+      "grad_norm": 0.029024334624409676,
+      "learning_rate": 0.00014180781485829283,
+      "loss": 0.7109,
+      "step": 8775
+    },
+    {
+      "epoch": 1.4691626274444258,
+      "grad_norm": 0.02851213701069355,
+      "learning_rate": 0.00014170719436525239,
+      "loss": 0.7038,
+      "step": 8790
+    },
+    {
+      "epoch": 1.4716697309042286,
+      "grad_norm": 0.027595283463597298,
+      "learning_rate": 0.000141606573872212,
+      "loss": 0.7119,
+      "step": 8805
+    },
+    {
+      "epoch": 1.4741768343640314,
+      "grad_norm": 0.02817492000758648,
+      "learning_rate": 0.00014150595337917156,
+      "loss": 0.7007,
+      "step": 8820
+    },
+    {
+      "epoch": 1.4766839378238341,
+      "grad_norm": 0.028595896437764168,
+      "learning_rate": 0.00014140533288613117,
+      "loss": 0.7031,
+      "step": 8835
+    },
+    {
+      "epoch": 1.479191041283637,
+      "grad_norm": 0.028396232053637505,
+      "learning_rate": 0.00014130471239309073,
+      "loss": 0.6944,
+      "step": 8850
+    },
+    {
+      "epoch": 1.4816981447434396,
+      "grad_norm": 0.02777491882443428,
+      "learning_rate": 0.0001412040919000503,
+      "loss": 0.7069,
+      "step": 8865
+    },
+    {
+      "epoch": 1.4842052482032426,
+      "grad_norm": 0.02780229039490223,
+      "learning_rate": 0.0001411034714070099,
+      "loss": 0.6955,
+      "step": 8880
+    },
+    {
+      "epoch": 1.4867123516630452,
+      "grad_norm": 0.02779022604227066,
+      "learning_rate": 0.00014100285091396948,
+      "loss": 0.7101,
+      "step": 8895
+    },
+    {
+      "epoch": 1.4892194551228481,
+      "grad_norm": 0.029339686036109924,
+      "learning_rate": 0.00014090223042092907,
+      "loss": 0.7074,
+      "step": 8910
+    },
+    {
+      "epoch": 1.491726558582651,
+      "grad_norm": 0.0277661494910717,
+      "learning_rate": 0.00014080160992788865,
+      "loss": 0.7007,
+      "step": 8925
+    },
+    {
+      "epoch": 1.4942336620424537,
+      "grad_norm": 0.028384177014231682,
+      "learning_rate": 0.00014070098943484824,
+      "loss": 0.7006,
+      "step": 8940
+    },
+    {
+      "epoch": 1.4967407655022564,
+      "grad_norm": 0.027173573151230812,
+      "learning_rate": 0.00014060036894180783,
+      "loss": 0.7126,
+      "step": 8955
+    },
+    {
+      "epoch": 1.4992478689620592,
+      "grad_norm": 0.029250754043459892,
+      "learning_rate": 0.0001404997484487674,
+      "loss": 0.7053,
+      "step": 8970
+    },
+    {
+      "epoch": 1.501754972421862,
+      "grad_norm": 0.02840678207576275,
+      "learning_rate": 0.000140399127955727,
+      "loss": 0.691,
+      "step": 8985
+    },
+    {
+      "epoch": 1.5042620758816647,
+      "grad_norm": 0.029002662748098373,
+      "learning_rate": 0.00014029850746268658,
+      "loss": 0.6969,
+      "step": 9000
+    },
+    {
+      "epoch": 1.5067691793414675,
+      "grad_norm": 0.028643961995840073,
+      "learning_rate": 0.00014019788696964614,
+      "loss": 0.7145,
+      "step": 9015
+    },
+    {
+      "epoch": 1.5092762828012702,
+      "grad_norm": 0.027849212288856506,
+      "learning_rate": 0.00014009726647660575,
+      "loss": 0.7056,
+      "step": 9030
+    },
+    {
+      "epoch": 1.511783386261073,
+      "grad_norm": 0.02838641032576561,
+      "learning_rate": 0.0001399966459835653,
+      "loss": 0.7171,
+      "step": 9045
+    },
+    {
+      "epoch": 1.5142904897208758,
+      "grad_norm": 0.028329892084002495,
+      "learning_rate": 0.00013989602549052492,
+      "loss": 0.7021,
+      "step": 9060
+    },
+    {
+      "epoch": 1.5167975931806787,
+      "grad_norm": 0.0278428103774786,
+      "learning_rate": 0.00013979540499748448,
+      "loss": 0.7008,
+      "step": 9075
+    },
+    {
+      "epoch": 1.5193046966404813,
+      "grad_norm": 0.029085583984851837,
+      "learning_rate": 0.0001396947845044441,
+      "loss": 0.7014,
+      "step": 9090
+    },
+    {
+      "epoch": 1.5218118001002843,
+      "grad_norm": 0.028230642899870872,
+      "learning_rate": 0.00013959416401140365,
+      "loss": 0.7028,
+      "step": 9105
+    },
+    {
+      "epoch": 1.5243189035600868,
+      "grad_norm": 0.02829892747104168,
+      "learning_rate": 0.00013949354351836327,
+      "loss": 0.7052,
+      "step": 9120
+    },
+    {
+      "epoch": 1.5268260070198898,
+      "grad_norm": 0.02769339270889759,
+      "learning_rate": 0.00013939292302532282,
+      "loss": 0.705,
+      "step": 9135
+    },
+    {
+      "epoch": 1.5293331104796923,
+      "grad_norm": 0.02728847600519657,
+      "learning_rate": 0.0001392923025322824,
+      "loss": 0.7129,
+      "step": 9150
+    },
+    {
+      "epoch": 1.5318402139394953,
+      "grad_norm": 0.029400475323200226,
+      "learning_rate": 0.000139191682039242,
+      "loss": 0.7076,
+      "step": 9165
+    },
+    {
+      "epoch": 1.5343473173992979,
+      "grad_norm": 0.02829390950500965,
+      "learning_rate": 0.00013909106154620158,
+      "loss": 0.7032,
+      "step": 9180
+    },
+    {
+      "epoch": 1.5368544208591008,
+      "grad_norm": 0.028629042208194733,
+      "learning_rate": 0.00013899044105316117,
+      "loss": 0.6992,
+      "step": 9195
+    },
+    {
+      "epoch": 1.5393615243189036,
+      "grad_norm": 0.028124259784817696,
+      "learning_rate": 0.00013888982056012075,
+      "loss": 0.6928,
+      "step": 9210
+    },
+    {
+      "epoch": 1.5418686277787064,
+      "grad_norm": 0.027618682011961937,
+      "learning_rate": 0.00013878920006708034,
+      "loss": 0.6988,
+      "step": 9225
+    },
+    {
+      "epoch": 1.5443757312385091,
+      "grad_norm": 0.028371086344122887,
+      "learning_rate": 0.00013868857957403992,
+      "loss": 0.7068,
+      "step": 9240
+    },
+    {
+      "epoch": 1.546882834698312,
+      "grad_norm": 0.02925163321197033,
+      "learning_rate": 0.0001385879590809995,
+      "loss": 0.7044,
+      "step": 9255
+    },
+    {
+      "epoch": 1.5493899381581147,
+      "grad_norm": 0.027992991730570793,
+      "learning_rate": 0.0001384873385879591,
+      "loss": 0.7147,
+      "step": 9270
+    },
+    {
+      "epoch": 1.5518970416179174,
+      "grad_norm": 0.02831142581999302,
+      "learning_rate": 0.00013838671809491868,
+      "loss": 0.711,
+      "step": 9285
+    },
+    {
+      "epoch": 1.5544041450777202,
+      "grad_norm": 0.027344243600964546,
+      "learning_rate": 0.00013828609760187826,
+      "loss": 0.7043,
+      "step": 9300
+    },
+    {
+      "epoch": 1.556911248537523,
+      "grad_norm": 0.027959240600466728,
+      "learning_rate": 0.00013818547710883785,
+      "loss": 0.715,
+      "step": 9315
+    },
+    {
+      "epoch": 1.5594183519973257,
+      "grad_norm": 0.0285944901406765,
+      "learning_rate": 0.0001380848566157974,
+      "loss": 0.7104,
+      "step": 9330
+    },
+    {
+      "epoch": 1.5619254554571285,
+      "grad_norm": 0.02860502153635025,
+      "learning_rate": 0.00013798423612275702,
+      "loss": 0.7053,
+      "step": 9345
+    },
+    {
+      "epoch": 1.5644325589169314,
+      "grad_norm": 0.028087912127375603,
+      "learning_rate": 0.00013788361562971658,
+      "loss": 0.715,
+      "step": 9360
+    },
+    {
+      "epoch": 1.566939662376734,
+      "grad_norm": 0.028339073061943054,
+      "learning_rate": 0.0001377829951366762,
+      "loss": 0.7035,
+      "step": 9375
+    },
+    {
+      "epoch": 1.569446765836537,
+      "grad_norm": 0.027878131717443466,
+      "learning_rate": 0.00013768237464363575,
+      "loss": 0.7072,
+      "step": 9390
+    },
+    {
+      "epoch": 1.5719538692963395,
+      "grad_norm": 0.028305955231189728,
+      "learning_rate": 0.00013758175415059536,
+      "loss": 0.6994,
+      "step": 9405
+    },
+    {
+      "epoch": 1.5744609727561425,
+      "grad_norm": 0.028195269405841827,
+      "learning_rate": 0.00013748113365755492,
+      "loss": 0.7172,
+      "step": 9420
+    },
+    {
+      "epoch": 1.576968076215945,
+      "grad_norm": 0.028301289305090904,
+      "learning_rate": 0.00013738051316451453,
+      "loss": 0.6958,
+      "step": 9435
+    },
+    {
+      "epoch": 1.579475179675748,
+      "grad_norm": 0.029125042259693146,
+      "learning_rate": 0.0001372798926714741,
+      "loss": 0.7004,
+      "step": 9450
+    },
+    {
+      "epoch": 1.5819822831355508,
+      "grad_norm": 0.02798408642411232,
+      "learning_rate": 0.00013717927217843368,
+      "loss": 0.6995,
+      "step": 9465
+    },
+    {
+      "epoch": 1.5844893865953535,
+      "grad_norm": 0.029614899307489395,
+      "learning_rate": 0.00013707865168539326,
+      "loss": 0.7057,
+      "step": 9480
+    },
+    {
+      "epoch": 1.5869964900551563,
+      "grad_norm": 0.0279951523989439,
+      "learning_rate": 0.00013697803119235285,
+      "loss": 0.6949,
+      "step": 9495
+    },
+    {
+      "epoch": 1.589503593514959,
+      "grad_norm": 0.028490344062447548,
+      "learning_rate": 0.00013687741069931243,
+      "loss": 0.7003,
+      "step": 9510
+    },
+    {
+      "epoch": 1.5920106969747618,
+      "grad_norm": 0.028360631316900253,
+      "learning_rate": 0.00013677679020627202,
+      "loss": 0.7008,
+      "step": 9525
+    },
+    {
+      "epoch": 1.5945178004345646,
+      "grad_norm": 0.029337970539927483,
+      "learning_rate": 0.0001366761697132316,
+      "loss": 0.7143,
+      "step": 9540
+    },
+    {
+      "epoch": 1.5970249038943674,
+      "grad_norm": 0.02845313400030136,
+      "learning_rate": 0.0001365755492201912,
+      "loss": 0.7006,
+      "step": 9555
+    },
+    {
+      "epoch": 1.5995320073541701,
+      "grad_norm": 0.027560876682400703,
+      "learning_rate": 0.00013647492872715078,
+      "loss": 0.7102,
+      "step": 9570
+    },
+    {
+      "epoch": 1.6020391108139729,
+      "grad_norm": 0.028155362233519554,
+      "learning_rate": 0.00013637430823411036,
+      "loss": 0.6979,
+      "step": 9585
+    },
+    {
+      "epoch": 1.6045462142737756,
+      "grad_norm": 0.029344851151108742,
+      "learning_rate": 0.00013627368774106995,
+      "loss": 0.704,
+      "step": 9600
+    },
+    {
+      "epoch": 1.6070533177335786,
+      "grad_norm": 0.02839244157075882,
+      "learning_rate": 0.0001361730672480295,
+      "loss": 0.6977,
+      "step": 9615
+    },
+    {
+      "epoch": 1.6095604211933812,
+      "grad_norm": 0.027915630489587784,
+      "learning_rate": 0.00013607244675498912,
+      "loss": 0.7086,
+      "step": 9630
+    },
+    {
+      "epoch": 1.6120675246531841,
+      "grad_norm": 0.02826772816479206,
+      "learning_rate": 0.00013597182626194868,
+      "loss": 0.6951,
+      "step": 9645
+    },
+    {
+      "epoch": 1.6145746281129867,
+      "grad_norm": 0.02916094847023487,
+      "learning_rate": 0.0001358712057689083,
+      "loss": 0.7103,
+      "step": 9660
+    },
+    {
+      "epoch": 1.6170817315727897,
+      "grad_norm": 0.02921309880912304,
+      "learning_rate": 0.00013577058527586785,
+      "loss": 0.6987,
+      "step": 9675
+    },
+    {
+      "epoch": 1.6195888350325922,
+      "grad_norm": 0.028561830520629883,
+      "learning_rate": 0.00013566996478282746,
+      "loss": 0.7119,
+      "step": 9690
+    },
+    {
+      "epoch": 1.6220959384923952,
+      "grad_norm": 0.028445105999708176,
+      "learning_rate": 0.00013556934428978702,
+      "loss": 0.7022,
+      "step": 9705
+    },
+    {
+      "epoch": 1.6246030419521977,
+      "grad_norm": 0.029156696051359177,
+      "learning_rate": 0.00013546872379674663,
+      "loss": 0.6946,
+      "step": 9720
+    },
+    {
+      "epoch": 1.6271101454120007,
+      "grad_norm": 0.029195377603173256,
+      "learning_rate": 0.0001353681033037062,
+      "loss": 0.6919,
+      "step": 9735
+    },
+    {
+      "epoch": 1.6296172488718035,
+      "grad_norm": 0.028340883553028107,
+      "learning_rate": 0.00013526748281066577,
+      "loss": 0.6949,
+      "step": 9750
+    },
+    {
+      "epoch": 1.6321243523316062,
+      "grad_norm": 0.028798367828130722,
+      "learning_rate": 0.00013516686231762536,
+      "loss": 0.6939,
+      "step": 9765
+    },
+    {
+      "epoch": 1.634631455791409,
+      "grad_norm": 0.028108691796660423,
+      "learning_rate": 0.00013506624182458494,
+      "loss": 0.6877,
+      "step": 9780
+    },
+    {
+      "epoch": 1.6371385592512118,
+      "grad_norm": 0.029803916811943054,
+      "learning_rate": 0.00013496562133154453,
+      "loss": 0.7063,
+      "step": 9795
+    },
+    {
+      "epoch": 1.6396456627110145,
+      "grad_norm": 0.02933133766055107,
+      "learning_rate": 0.00013486500083850412,
+      "loss": 0.7105,
+      "step": 9810
+    },
+    {
+      "epoch": 1.6421527661708173,
+      "grad_norm": 0.02795150876045227,
+      "learning_rate": 0.0001347643803454637,
+      "loss": 0.7127,
+      "step": 9825
+    },
+    {
+      "epoch": 1.64465986963062,
+      "grad_norm": 0.028160467743873596,
+      "learning_rate": 0.00013466375985242329,
+      "loss": 0.6962,
+      "step": 9840
+    },
+    {
+      "epoch": 1.6471669730904228,
+      "grad_norm": 0.028696995228528976,
+      "learning_rate": 0.00013456313935938287,
+      "loss": 0.7104,
+      "step": 9855
+    },
+    {
+      "epoch": 1.6496740765502258,
+      "grad_norm": 0.028448186814785004,
+      "learning_rate": 0.00013446251886634246,
+      "loss": 0.7164,
+      "step": 9870
+    },
+    {
+      "epoch": 1.6521811800100283,
+      "grad_norm": 0.028285130858421326,
+      "learning_rate": 0.00013436189837330204,
+      "loss": 0.6969,
+      "step": 9885
+    },
+    {
+      "epoch": 1.6546882834698313,
+      "grad_norm": 0.02930794097483158,
+      "learning_rate": 0.00013426127788026163,
+      "loss": 0.6933,
+      "step": 9900
+    },
+    {
+      "epoch": 1.6571953869296339,
+      "grad_norm": 0.028923654928803444,
+      "learning_rate": 0.00013416065738722121,
+      "loss": 0.7023,
+      "step": 9915
+    },
+    {
+      "epoch": 1.6597024903894368,
+      "grad_norm": 0.029697788879275322,
+      "learning_rate": 0.00013406003689418077,
+      "loss": 0.7149,
+      "step": 9930
+    },
+    {
+      "epoch": 1.6622095938492394,
+      "grad_norm": 0.02780589461326599,
+      "learning_rate": 0.00013395941640114038,
+      "loss": 0.7018,
+      "step": 9945
+    },
+    {
+      "epoch": 1.6647166973090424,
+      "grad_norm": 0.028592998161911964,
+      "learning_rate": 0.00013385879590809994,
+      "loss": 0.6999,
+      "step": 9960
+    },
+    {
+      "epoch": 1.667223800768845,
+      "grad_norm": 0.028748946264386177,
+      "learning_rate": 0.00013375817541505956,
+      "loss": 0.7108,
+      "step": 9975
+    },
+    {
+      "epoch": 1.669730904228648,
+      "grad_norm": 0.02883664146065712,
+      "learning_rate": 0.00013365755492201911,
+      "loss": 0.7014,
+      "step": 9990
+    },
+    {
+      "epoch": 1.6722380076884507,
+      "grad_norm": 0.027728645130991936,
+      "learning_rate": 0.00013355693442897873,
+      "loss": 0.7098,
+      "step": 10005
+    },
+    {
+      "epoch": 1.6747451111482534,
+      "grad_norm": 0.028445927426218987,
+      "learning_rate": 0.00013345631393593828,
+      "loss": 0.7016,
+      "step": 10020
+    },
+    {
+      "epoch": 1.6772522146080562,
+      "grad_norm": 0.029764369130134583,
+      "learning_rate": 0.00013335569344289787,
+      "loss": 0.6983,
+      "step": 10035
+    },
+    {
+      "epoch": 1.679759318067859,
+      "grad_norm": 0.029188336804509163,
+      "learning_rate": 0.00013325507294985746,
+      "loss": 0.7158,
+      "step": 10050
+    },
+    {
+      "epoch": 1.6822664215276617,
+      "grad_norm": 0.028241556137800217,
+      "learning_rate": 0.00013315445245681704,
+      "loss": 0.6923,
+      "step": 10065
+    },
+    {
+      "epoch": 1.6847735249874645,
+      "grad_norm": 0.02920147404074669,
+      "learning_rate": 0.00013305383196377663,
+      "loss": 0.7157,
+      "step": 10080
+    },
+    {
+      "epoch": 1.6872806284472672,
+      "grad_norm": 0.027919236570596695,
+      "learning_rate": 0.0001329532114707362,
+      "loss": 0.6984,
+      "step": 10095
+    },
+    {
+      "epoch": 1.68978773190707,
+      "grad_norm": 0.0279484074562788,
+      "learning_rate": 0.0001328525909776958,
+      "loss": 0.688,
+      "step": 10110
+    },
+    {
+      "epoch": 1.6922948353668728,
+      "grad_norm": 0.02801922895014286,
+      "learning_rate": 0.00013275197048465538,
+      "loss": 0.7077,
+      "step": 10125
+    },
+    {
+      "epoch": 1.6948019388266755,
+      "grad_norm": 0.02875382825732231,
+      "learning_rate": 0.00013265134999161497,
+      "loss": 0.7097,
+      "step": 10140
+    },
+    {
+      "epoch": 1.6973090422864785,
+      "grad_norm": 0.028978591784834862,
+      "learning_rate": 0.00013255072949857455,
+      "loss": 0.6921,
+      "step": 10155
+    },
+    {
+      "epoch": 1.699816145746281,
+      "grad_norm": 0.028557538986206055,
+      "learning_rate": 0.00013245010900553414,
+      "loss": 0.7124,
+      "step": 10170
+    },
+    {
+      "epoch": 1.702323249206084,
+      "grad_norm": 0.02763993851840496,
+      "learning_rate": 0.00013234948851249372,
+      "loss": 0.6998,
+      "step": 10185
+    },
+    {
+      "epoch": 1.7048303526658866,
+      "grad_norm": 0.029599042609333992,
+      "learning_rate": 0.0001322488680194533,
+      "loss": 0.7116,
+      "step": 10200
+    },
+    {
+      "epoch": 1.7073374561256895,
+      "grad_norm": 0.028568753972649574,
+      "learning_rate": 0.0001321482475264129,
+      "loss": 0.6927,
+      "step": 10215
+    },
+    {
+      "epoch": 1.709844559585492,
+      "grad_norm": 0.028803616762161255,
+      "learning_rate": 0.00013204762703337248,
+      "loss": 0.7,
+      "step": 10230
+    },
+    {
+      "epoch": 1.712351663045295,
+      "grad_norm": 0.028020154684782028,
+      "learning_rate": 0.00013194700654033204,
+      "loss": 0.7024,
+      "step": 10245
+    },
+    {
+      "epoch": 1.7148587665050978,
+      "grad_norm": 0.029931314289569855,
+      "learning_rate": 0.00013184638604729165,
+      "loss": 0.6996,
+      "step": 10260
+    },
+    {
+      "epoch": 1.7173658699649006,
+      "grad_norm": 0.028297219425439835,
+      "learning_rate": 0.0001317457655542512,
+      "loss": 0.7055,
+      "step": 10275
+    },
+    {
+      "epoch": 1.7198729734247034,
+      "grad_norm": 0.02956199459731579,
+      "learning_rate": 0.00013164514506121082,
+      "loss": 0.6975,
+      "step": 10290
+    },
+    {
+      "epoch": 1.7223800768845061,
+      "grad_norm": 0.027763094753026962,
+      "learning_rate": 0.00013154452456817038,
+      "loss": 0.7072,
+      "step": 10305
+    },
+    {
+      "epoch": 1.7248871803443089,
+      "grad_norm": 0.027571503072977066,
+      "learning_rate": 0.00013144390407512997,
+      "loss": 0.7001,
+      "step": 10320
+    },
+    {
+      "epoch": 1.7273942838041116,
+      "grad_norm": 0.028334425762295723,
+      "learning_rate": 0.00013134328358208955,
+      "loss": 0.7059,
+      "step": 10335
+    },
+    {
+      "epoch": 1.7299013872639144,
+      "grad_norm": 0.027847876772284508,
+      "learning_rate": 0.00013124266308904914,
+      "loss": 0.6956,
+      "step": 10350
+    },
+    {
+      "epoch": 1.7324084907237172,
+      "grad_norm": 0.027983665466308594,
+      "learning_rate": 0.00013114204259600872,
+      "loss": 0.716,
+      "step": 10365
+    },
+    {
+      "epoch": 1.73491559418352,
+      "grad_norm": 0.028772972524166107,
+      "learning_rate": 0.0001310414221029683,
+      "loss": 0.7052,
+      "step": 10380
+    },
+    {
+      "epoch": 1.7374226976433227,
+      "grad_norm": 0.028679322451353073,
+      "learning_rate": 0.0001309408016099279,
+      "loss": 0.6948,
+      "step": 10395
+    },
+    {
+      "epoch": 1.7399298011031257,
+      "grad_norm": 0.02946317568421364,
+      "learning_rate": 0.00013084018111688748,
+      "loss": 0.7048,
+      "step": 10410
+    },
+    {
+      "epoch": 1.7424369045629282,
+      "grad_norm": 0.0287346001714468,
+      "learning_rate": 0.00013073956062384707,
+      "loss": 0.7047,
+      "step": 10425
+    },
+    {
+      "epoch": 1.7449440080227312,
+      "grad_norm": 0.02862308919429779,
+      "learning_rate": 0.00013063894013080665,
+      "loss": 0.6886,
+      "step": 10440
+    },
+    {
+      "epoch": 1.7474511114825337,
+      "grad_norm": 0.0288804080337286,
+      "learning_rate": 0.00013053831963776624,
+      "loss": 0.706,
+      "step": 10455
+    },
+    {
+      "epoch": 1.7499582149423367,
+      "grad_norm": 0.0278554018586874,
+      "learning_rate": 0.00013043769914472582,
+      "loss": 0.7169,
+      "step": 10470
+    },
+    {
+      "epoch": 1.7524653184021393,
+      "grad_norm": 0.02842450514435768,
+      "learning_rate": 0.0001303370786516854,
+      "loss": 0.7028,
+      "step": 10485
+    },
+    {
+      "epoch": 1.7549724218619422,
+      "grad_norm": 0.02780633233487606,
+      "learning_rate": 0.000130236458158645,
+      "loss": 0.6974,
+      "step": 10500
+    },
+    {
+      "epoch": 1.7574795253217448,
+      "grad_norm": 0.028826531022787094,
+      "learning_rate": 0.00013013583766560458,
+      "loss": 0.7024,
+      "step": 10515
+    },
+    {
+      "epoch": 1.7599866287815478,
+      "grad_norm": 0.028399532660841942,
+      "learning_rate": 0.00013003521717256414,
+      "loss": 0.7029,
+      "step": 10530
+    },
+    {
+      "epoch": 1.7624937322413505,
+      "grad_norm": 0.029726563021540642,
+      "learning_rate": 0.00012993459667952375,
+      "loss": 0.7033,
+      "step": 10545
+    },
+    {
+      "epoch": 1.7650008357011533,
+      "grad_norm": 0.028318284079432487,
+      "learning_rate": 0.0001298339761864833,
+      "loss": 0.7102,
+      "step": 10560
+    },
+    {
+      "epoch": 1.767507939160956,
+      "grad_norm": 0.02865464985370636,
+      "learning_rate": 0.00012973335569344292,
+      "loss": 0.7079,
+      "step": 10575
+    },
+    {
+      "epoch": 1.7700150426207588,
+      "grad_norm": 0.029711904004216194,
+      "learning_rate": 0.00012963273520040248,
+      "loss": 0.7003,
+      "step": 10590
+    },
+    {
+      "epoch": 1.7725221460805616,
+      "grad_norm": 0.02868981659412384,
+      "learning_rate": 0.00012953211470736206,
+      "loss": 0.7071,
+      "step": 10605
+    },
+    {
+      "epoch": 1.7750292495403643,
+      "grad_norm": 0.03023667074739933,
+      "learning_rate": 0.00012943149421432165,
+      "loss": 0.6988,
+      "step": 10620
+    },
+    {
+      "epoch": 1.777536353000167,
+      "grad_norm": 0.02855963073670864,
+      "learning_rate": 0.00012933087372128123,
+      "loss": 0.7001,
+      "step": 10635
+    },
+    {
+      "epoch": 1.7800434564599699,
+      "grad_norm": 0.02811777964234352,
+      "learning_rate": 0.00012923025322824082,
+      "loss": 0.6982,
+      "step": 10650
+    },
+    {
+      "epoch": 1.7825505599197728,
+      "grad_norm": 0.029220616444945335,
+      "learning_rate": 0.0001291296327352004,
+      "loss": 0.7123,
+      "step": 10665
+    },
+    {
+      "epoch": 1.7850576633795754,
+      "grad_norm": 0.02945820614695549,
+      "learning_rate": 0.00012902901224216,
+      "loss": 0.702,
+      "step": 10680
+    },
+    {
+      "epoch": 1.7875647668393784,
+      "grad_norm": 0.02915896289050579,
+      "learning_rate": 0.00012892839174911958,
+      "loss": 0.6996,
+      "step": 10695
+    },
+    {
+      "epoch": 1.790071870299181,
+      "grad_norm": 0.028102731332182884,
+      "learning_rate": 0.00012882777125607916,
+      "loss": 0.6931,
+      "step": 10710
+    },
+    {
+      "epoch": 1.792578973758984,
+      "grad_norm": 0.028598302975296974,
+      "learning_rate": 0.00012872715076303875,
+      "loss": 0.7049,
+      "step": 10725
+    },
+    {
+      "epoch": 1.7950860772187864,
+      "grad_norm": 0.02882864698767662,
+      "learning_rate": 0.00012862653026999833,
+      "loss": 0.6894,
+      "step": 10740
+    },
+    {
+      "epoch": 1.7975931806785894,
+      "grad_norm": 0.02864612452685833,
+      "learning_rate": 0.00012852590977695792,
+      "loss": 0.6959,
+      "step": 10755
+    },
+    {
+      "epoch": 1.800100284138392,
+      "grad_norm": 0.02791963331401348,
+      "learning_rate": 0.0001284252892839175,
+      "loss": 0.699,
+      "step": 10770
+    },
+    {
+      "epoch": 1.802607387598195,
+      "grad_norm": 0.029228495433926582,
+      "learning_rate": 0.0001283246687908771,
+      "loss": 0.7014,
+      "step": 10785
+    },
+    {
+      "epoch": 1.8051144910579977,
+      "grad_norm": 0.028694583103060722,
+      "learning_rate": 0.00012822404829783667,
+      "loss": 0.6981,
+      "step": 10800
+    },
+    {
+      "epoch": 1.8076215945178005,
+      "grad_norm": 0.028723234310746193,
+      "learning_rate": 0.00012812342780479626,
+      "loss": 0.6935,
+      "step": 10815
+    },
+    {
+      "epoch": 1.8101286979776032,
+      "grad_norm": 0.02791297808289528,
+      "learning_rate": 0.00012802280731175585,
+      "loss": 0.7023,
+      "step": 10830
+    },
+    {
+      "epoch": 1.812635801437406,
+      "grad_norm": 0.028474239632487297,
+      "learning_rate": 0.0001279221868187154,
+      "loss": 0.6966,
+      "step": 10845
+    },
+    {
+      "epoch": 1.8151429048972088,
+      "grad_norm": 0.028216082602739334,
+      "learning_rate": 0.00012782156632567502,
+      "loss": 0.7146,
+      "step": 10860
+    },
+    {
+      "epoch": 1.8176500083570115,
+      "grad_norm": 0.02868053875863552,
+      "learning_rate": 0.00012772094583263458,
+      "loss": 0.7018,
+      "step": 10875
+    },
+    {
+      "epoch": 1.8201571118168143,
+      "grad_norm": 0.029623722657561302,
+      "learning_rate": 0.00012762032533959416,
+      "loss": 0.7024,
+      "step": 10890
+    },
+    {
+      "epoch": 1.822664215276617,
+      "grad_norm": 0.029195398092269897,
+      "learning_rate": 0.00012751970484655375,
+      "loss": 0.7056,
+      "step": 10905
+    },
+    {
+      "epoch": 1.8251713187364198,
+      "grad_norm": 0.02803465723991394,
+      "learning_rate": 0.00012741908435351333,
+      "loss": 0.7013,
+      "step": 10920
+    },
+    {
+      "epoch": 1.8276784221962226,
+      "grad_norm": 0.02818216383457184,
+      "learning_rate": 0.00012731846386047292,
+      "loss": 0.7052,
+      "step": 10935
+    },
+    {
+      "epoch": 1.8301855256560255,
+      "grad_norm": 0.029034661129117012,
+      "learning_rate": 0.0001272178433674325,
+      "loss": 0.6918,
+      "step": 10950
+    },
+    {
+      "epoch": 1.832692629115828,
+      "grad_norm": 0.028653794899582863,
+      "learning_rate": 0.0001271172228743921,
+      "loss": 0.708,
+      "step": 10965
+    },
+    {
+      "epoch": 1.835199732575631,
+      "grad_norm": 0.02844145894050598,
+      "learning_rate": 0.00012701660238135167,
+      "loss": 0.7065,
+      "step": 10980
+    },
+    {
+      "epoch": 1.8377068360354336,
+      "grad_norm": 0.02880460023880005,
+      "learning_rate": 0.00012691598188831126,
+      "loss": 0.6931,
+      "step": 10995
+    },
+    {
+      "epoch": 1.8402139394952366,
+      "grad_norm": 0.02845979668200016,
+      "learning_rate": 0.00012681536139527084,
+      "loss": 0.6924,
+      "step": 11010
+    },
+    {
+      "epoch": 1.8427210429550391,
+      "grad_norm": 0.02805483527481556,
+      "learning_rate": 0.00012671474090223043,
+      "loss": 0.7064,
+      "step": 11025
+    },
+    {
+      "epoch": 1.8452281464148421,
+      "grad_norm": 0.029036138206720352,
+      "learning_rate": 0.00012661412040919002,
+      "loss": 0.7061,
+      "step": 11040
+    },
+    {
+      "epoch": 1.8477352498746449,
+      "grad_norm": 0.028865808621048927,
+      "learning_rate": 0.0001265134999161496,
+      "loss": 0.7088,
+      "step": 11055
+    },
+    {
+      "epoch": 1.8502423533344476,
+      "grad_norm": 0.028568295761942863,
+      "learning_rate": 0.00012641287942310919,
+      "loss": 0.7032,
+      "step": 11070
+    },
+    {
+      "epoch": 1.8527494567942504,
+      "grad_norm": 0.02971578575670719,
+      "learning_rate": 0.00012631225893006877,
+      "loss": 0.703,
+      "step": 11085
+    },
+    {
+      "epoch": 1.8552565602540532,
+      "grad_norm": 0.029128948226571083,
+      "learning_rate": 0.00012621163843702836,
+      "loss": 0.6964,
+      "step": 11100
+    },
+    {
+      "epoch": 1.857763663713856,
+      "grad_norm": 0.028951995074748993,
+      "learning_rate": 0.00012611101794398794,
+      "loss": 0.6998,
+      "step": 11115
+    },
+    {
+      "epoch": 1.8602707671736587,
+      "grad_norm": 0.029678482562303543,
+      "learning_rate": 0.00012601039745094753,
+      "loss": 0.6778,
+      "step": 11130
+    },
+    {
+      "epoch": 1.8627778706334615,
+      "grad_norm": 0.029598036780953407,
+      "learning_rate": 0.0001259097769579071,
+      "loss": 0.694,
+      "step": 11145
+    },
+    {
+      "epoch": 1.8652849740932642,
+      "grad_norm": 0.02879234589636326,
+      "learning_rate": 0.00012580915646486667,
+      "loss": 0.7085,
+      "step": 11160
+    },
+    {
+      "epoch": 1.867792077553067,
+      "grad_norm": 0.029246920719742775,
+      "learning_rate": 0.00012570853597182626,
+      "loss": 0.6932,
+      "step": 11175
+    },
+    {
+      "epoch": 1.8702991810128697,
+      "grad_norm": 0.030359363183379173,
+      "learning_rate": 0.00012560791547878584,
+      "loss": 0.7027,
+      "step": 11190
+    },
+    {
+      "epoch": 1.8728062844726727,
+      "grad_norm": 0.02991410344839096,
+      "learning_rate": 0.00012550729498574543,
+      "loss": 0.6842,
+      "step": 11205
+    },
+    {
+      "epoch": 1.8753133879324753,
+      "grad_norm": 0.028199173510074615,
+      "learning_rate": 0.00012540667449270501,
+      "loss": 0.6998,
+      "step": 11220
+    },
+    {
+      "epoch": 1.8778204913922782,
+      "grad_norm": 0.028087392449378967,
+      "learning_rate": 0.0001253060539996646,
+      "loss": 0.697,
+      "step": 11235
+    },
+    {
+      "epoch": 1.8803275948520808,
+      "grad_norm": 0.02853637933731079,
+      "learning_rate": 0.00012520543350662418,
+      "loss": 0.6874,
+      "step": 11250
+    },
+    {
+      "epoch": 1.8828346983118838,
+      "grad_norm": 0.028400765731930733,
+      "learning_rate": 0.00012510481301358377,
+      "loss": 0.6881,
+      "step": 11265
+    },
+    {
+      "epoch": 1.8853418017716863,
+      "grad_norm": 0.02928781695663929,
+      "learning_rate": 0.00012500419252054336,
+      "loss": 0.6951,
+      "step": 11280
+    },
+    {
+      "epoch": 1.8878489052314893,
+      "grad_norm": 0.028838330879807472,
+      "learning_rate": 0.00012490357202750294,
+      "loss": 0.6857,
+      "step": 11295
+    },
+    {
+      "epoch": 1.8903560086912918,
+      "grad_norm": 0.0293565783649683,
+      "learning_rate": 0.00012480295153446253,
+      "loss": 0.693,
+      "step": 11310
+    },
+    {
+      "epoch": 1.8928631121510948,
+      "grad_norm": 0.02845110557973385,
+      "learning_rate": 0.0001247023310414221,
+      "loss": 0.6999,
+      "step": 11325
+    },
+    {
+      "epoch": 1.8953702156108976,
+      "grad_norm": 0.029096076264977455,
+      "learning_rate": 0.0001246017105483817,
+      "loss": 0.6841,
+      "step": 11340
+    },
+    {
+      "epoch": 1.8978773190707003,
+      "grad_norm": 0.029120532795786858,
+      "learning_rate": 0.00012450109005534128,
+      "loss": 0.7009,
+      "step": 11355
+    },
+    {
+      "epoch": 1.900384422530503,
+      "grad_norm": 0.027919389307498932,
+      "learning_rate": 0.00012440046956230087,
+      "loss": 0.708,
+      "step": 11370
+    },
+    {
+      "epoch": 1.9028915259903059,
+      "grad_norm": 0.02887488156557083,
+      "learning_rate": 0.00012429984906926045,
+      "loss": 0.7048,
+      "step": 11385
+    },
+    {
+      "epoch": 1.9053986294501086,
+      "grad_norm": 0.028664030134677887,
+      "learning_rate": 0.00012419922857622004,
+      "loss": 0.7003,
+      "step": 11400
+    },
+    {
+      "epoch": 1.9079057329099114,
+      "grad_norm": 0.028661739081144333,
+      "learning_rate": 0.00012409860808317962,
+      "loss": 0.7009,
+      "step": 11415
+    },
+    {
+      "epoch": 1.9104128363697142,
+      "grad_norm": 0.02937045879662037,
+      "learning_rate": 0.0001239979875901392,
+      "loss": 0.6935,
+      "step": 11430
+    },
+    {
+      "epoch": 1.912919939829517,
+      "grad_norm": 0.030395416542887688,
+      "learning_rate": 0.0001238973670970988,
+      "loss": 0.6808,
+      "step": 11445
+    },
+    {
+      "epoch": 1.91542704328932,
+      "grad_norm": 0.030018294230103493,
+      "learning_rate": 0.00012379674660405835,
+      "loss": 0.6931,
+      "step": 11460
+    },
+    {
+      "epoch": 1.9179341467491224,
+      "grad_norm": 0.029583923518657684,
+      "learning_rate": 0.00012369612611101794,
+      "loss": 0.6844,
+      "step": 11475
+    },
+    {
+      "epoch": 1.9204412502089254,
+      "grad_norm": 0.028469126671552658,
+      "learning_rate": 0.00012359550561797752,
+      "loss": 0.7,
+      "step": 11490
+    },
+    {
+      "epoch": 1.922948353668728,
+      "grad_norm": 0.029069840908050537,
+      "learning_rate": 0.0001234948851249371,
+      "loss": 0.6769,
+      "step": 11505
+    },
+    {
+      "epoch": 1.925455457128531,
+      "grad_norm": 0.03039330244064331,
+      "learning_rate": 0.0001233942646318967,
+      "loss": 0.701,
+      "step": 11520
+    },
+    {
+      "epoch": 1.9279625605883335,
+      "grad_norm": 0.029704933986067772,
+      "learning_rate": 0.00012329364413885628,
+      "loss": 0.6955,
+      "step": 11535
+    },
+    {
+      "epoch": 1.9304696640481365,
+      "grad_norm": 0.02861003205180168,
+      "learning_rate": 0.00012319302364581587,
+      "loss": 0.7029,
+      "step": 11550
+    },
+    {
+      "epoch": 1.932976767507939,
+      "grad_norm": 0.028516478836536407,
+      "learning_rate": 0.00012309240315277545,
+      "loss": 0.685,
+      "step": 11565
+    },
+    {
+      "epoch": 1.935483870967742,
+      "grad_norm": 0.02939150668680668,
+      "learning_rate": 0.00012299178265973504,
+      "loss": 0.6862,
+      "step": 11580
+    },
+    {
+      "epoch": 1.9379909744275448,
+      "grad_norm": 0.029078399762511253,
+      "learning_rate": 0.00012289116216669462,
+      "loss": 0.6915,
+      "step": 11595
+    },
+    {
+      "epoch": 1.9404980778873475,
+      "grad_norm": 0.02967904321849346,
+      "learning_rate": 0.0001227905416736542,
+      "loss": 0.7078,
+      "step": 11610
+    },
+    {
+      "epoch": 1.9430051813471503,
+      "grad_norm": 0.02986898459494114,
+      "learning_rate": 0.0001226899211806138,
+      "loss": 0.6962,
+      "step": 11625
+    },
+    {
+      "epoch": 1.945512284806953,
+      "grad_norm": 0.029141373932361603,
+      "learning_rate": 0.00012258930068757338,
+      "loss": 0.6898,
+      "step": 11640
+    },
+    {
+      "epoch": 1.9480193882667558,
+      "grad_norm": 0.02856113389134407,
+      "learning_rate": 0.00012248868019453296,
+      "loss": 0.6932,
+      "step": 11655
+    },
+    {
+      "epoch": 1.9505264917265586,
+      "grad_norm": 0.02906043641269207,
+      "learning_rate": 0.00012238805970149255,
+      "loss": 0.6947,
+      "step": 11670
+    },
+    {
+      "epoch": 1.9530335951863613,
+      "grad_norm": 0.028559362515807152,
+      "learning_rate": 0.00012228743920845214,
+      "loss": 0.6959,
+      "step": 11685
+    },
+    {
+      "epoch": 1.955540698646164,
+      "grad_norm": 0.029632238671183586,
+      "learning_rate": 0.00012218681871541172,
+      "loss": 0.7044,
+      "step": 11700
+    },
+    {
+      "epoch": 1.958047802105967,
+      "grad_norm": 0.028845706954598427,
+      "learning_rate": 0.0001220861982223713,
+      "loss": 0.6845,
+      "step": 11715
+    },
+    {
+      "epoch": 1.9605549055657696,
+      "grad_norm": 0.029171636328101158,
+      "learning_rate": 0.00012198557772933088,
+      "loss": 0.7044,
+      "step": 11730
+    },
+    {
+      "epoch": 1.9630620090255726,
+      "grad_norm": 0.030526766553521156,
+      "learning_rate": 0.00012188495723629045,
+      "loss": 0.6881,
+      "step": 11745
+    },
+    {
+      "epoch": 1.9655691124853751,
+      "grad_norm": 0.029202323406934738,
+      "learning_rate": 0.00012178433674325005,
+      "loss": 0.6853,
+      "step": 11760
+    },
+    {
+      "epoch": 1.9680762159451781,
+      "grad_norm": 0.028741231188178062,
+      "learning_rate": 0.00012168371625020962,
+      "loss": 0.7085,
+      "step": 11775
+    },
+    {
+      "epoch": 1.9705833194049807,
+      "grad_norm": 0.029565809294581413,
+      "learning_rate": 0.00012158309575716922,
+      "loss": 0.6951,
+      "step": 11790
+    },
+    {
+      "epoch": 1.9730904228647836,
+      "grad_norm": 0.029546387493610382,
+      "learning_rate": 0.00012148247526412879,
+      "loss": 0.6961,
+      "step": 11805
+    },
+    {
+      "epoch": 1.9755975263245862,
+      "grad_norm": 0.029062774032354355,
+      "learning_rate": 0.00012138185477108839,
+      "loss": 0.6933,
+      "step": 11820
+    },
+    {
+      "epoch": 1.9781046297843892,
+      "grad_norm": 0.028955336660146713,
+      "learning_rate": 0.00012128123427804796,
+      "loss": 0.6898,
+      "step": 11835
+    },
+    {
+      "epoch": 1.980611733244192,
+      "grad_norm": 0.031218407675623894,
+      "learning_rate": 0.00012118061378500756,
+      "loss": 0.689,
+      "step": 11850
+    },
+    {
+      "epoch": 1.9831188367039947,
+      "grad_norm": 0.030403736978769302,
+      "learning_rate": 0.00012107999329196713,
+      "loss": 0.6981,
+      "step": 11865
+    },
+    {
+      "epoch": 1.9856259401637975,
+      "grad_norm": 0.030305424705147743,
+      "learning_rate": 0.00012097937279892673,
+      "loss": 0.6987,
+      "step": 11880
+    },
+    {
+      "epoch": 1.9881330436236002,
+      "grad_norm": 0.029590345919132233,
+      "learning_rate": 0.0001208787523058863,
+      "loss": 0.6984,
+      "step": 11895
+    },
+    {
+      "epoch": 1.990640147083403,
+      "grad_norm": 0.04296644404530525,
+      "learning_rate": 0.00012077813181284589,
+      "loss": 0.7018,
+      "step": 11910
+    },
+    {
+      "epoch": 1.9931472505432057,
+      "grad_norm": 0.029970306903123856,
+      "learning_rate": 0.00012067751131980548,
+      "loss": 0.6887,
+      "step": 11925
+    },
+    {
+      "epoch": 1.9956543540030085,
+      "grad_norm": 0.02884749509394169,
+      "learning_rate": 0.00012057689082676506,
+      "loss": 0.7004,
+      "step": 11940
+    },
+    {
+      "epoch": 1.9981614574628113,
+      "grad_norm": 0.030533695593476295,
+      "learning_rate": 0.00012047627033372463,
+      "loss": 0.6883,
+      "step": 11955
+    },
+    {
+      "epoch": 2.0006685609226142,
+      "grad_norm": 0.029126284644007683,
+      "learning_rate": 0.00012037564984068423,
+      "loss": 0.6984,
+      "step": 11970
+    },
+    {
+      "epoch": 2.003175664382417,
+      "grad_norm": 0.029292147606611252,
+      "learning_rate": 0.0001202750293476438,
+      "loss": 0.6894,
+      "step": 11985
+    },
+    {
+      "epoch": 2.0056827678422198,
+      "grad_norm": 0.029509389773011208,
+      "learning_rate": 0.0001201744088546034,
+      "loss": 0.6823,
+      "step": 12000
+    },
+    {
+      "epoch": 2.0081898713020223,
+      "grad_norm": 0.02902618609368801,
+      "learning_rate": 0.00012007378836156298,
+      "loss": 0.6763,
+      "step": 12015
+    },
+    {
+      "epoch": 2.0106969747618253,
+      "grad_norm": 0.028685985133051872,
+      "learning_rate": 0.00011997316786852255,
+      "loss": 0.6903,
+      "step": 12030
+    },
+    {
+      "epoch": 2.013204078221628,
+      "grad_norm": 0.029849760234355927,
+      "learning_rate": 0.00011987254737548215,
+      "loss": 0.6886,
+      "step": 12045
+    },
+    {
+      "epoch": 2.015711181681431,
+      "grad_norm": 0.030097436159849167,
+      "learning_rate": 0.00011977192688244172,
+      "loss": 0.6868,
+      "step": 12060
+    },
+    {
+      "epoch": 2.0182182851412334,
+      "grad_norm": 0.02963315322995186,
+      "learning_rate": 0.00011967130638940132,
+      "loss": 0.6856,
+      "step": 12075
+    },
+    {
+      "epoch": 2.0207253886010363,
+      "grad_norm": 0.030087383463978767,
+      "learning_rate": 0.00011957068589636089,
+      "loss": 0.6886,
+      "step": 12090
+    },
+    {
+      "epoch": 2.023232492060839,
+      "grad_norm": 0.029318705201148987,
+      "learning_rate": 0.00011947006540332049,
+      "loss": 0.6734,
+      "step": 12105
+    },
+    {
+      "epoch": 2.025739595520642,
+      "grad_norm": 0.029196394607424736,
+      "learning_rate": 0.00011936944491028006,
+      "loss": 0.674,
+      "step": 12120
+    },
+    {
+      "epoch": 2.0282466989804444,
+      "grad_norm": 0.029127739369869232,
+      "learning_rate": 0.00011926882441723966,
+      "loss": 0.6875,
+      "step": 12135
+    },
+    {
+      "epoch": 2.0307538024402474,
+      "grad_norm": 0.029445838183164597,
+      "learning_rate": 0.00011916820392419923,
+      "loss": 0.6869,
+      "step": 12150
+    },
+    {
+      "epoch": 2.03326090590005,
+      "grad_norm": 0.029497170820832253,
+      "learning_rate": 0.00011906758343115883,
+      "loss": 0.6717,
+      "step": 12165
+    },
+    {
+      "epoch": 2.035768009359853,
+      "grad_norm": 0.028793711215257645,
+      "learning_rate": 0.0001189669629381184,
+      "loss": 0.682,
+      "step": 12180
+    },
+    {
+      "epoch": 2.038275112819656,
+      "grad_norm": 0.029894977807998657,
+      "learning_rate": 0.00011886634244507799,
+      "loss": 0.6821,
+      "step": 12195
+    },
+    {
+      "epoch": 2.0407822162794584,
+      "grad_norm": 0.028813883662223816,
+      "learning_rate": 0.00011876572195203757,
+      "loss": 0.6678,
+      "step": 12210
+    },
+    {
+      "epoch": 2.0432893197392614,
+      "grad_norm": 0.029816757887601852,
+      "learning_rate": 0.00011866510145899716,
+      "loss": 0.693,
+      "step": 12225
+    },
+    {
+      "epoch": 2.045796423199064,
+      "grad_norm": 0.03083239123225212,
+      "learning_rate": 0.00011856448096595673,
+      "loss": 0.681,
+      "step": 12240
+    },
+    {
+      "epoch": 2.048303526658867,
+      "grad_norm": 0.029679182916879654,
+      "learning_rate": 0.00011846386047291633,
+      "loss": 0.6742,
+      "step": 12255
+    },
+    {
+      "epoch": 2.0508106301186695,
+      "grad_norm": 0.03096550703048706,
+      "learning_rate": 0.0001183632399798759,
+      "loss": 0.6836,
+      "step": 12270
+    },
+    {
+      "epoch": 2.0533177335784725,
+      "grad_norm": 0.030012456700205803,
+      "learning_rate": 0.0001182626194868355,
+      "loss": 0.6819,
+      "step": 12285
+    },
+    {
+      "epoch": 2.055824837038275,
+      "grad_norm": 0.029759397730231285,
+      "learning_rate": 0.00011816199899379507,
+      "loss": 0.6781,
+      "step": 12300
+    },
+    {
+      "epoch": 2.058331940498078,
+      "grad_norm": 0.030046438798308372,
+      "learning_rate": 0.00011806137850075464,
+      "loss": 0.6787,
+      "step": 12315
+    },
+    {
+      "epoch": 2.0608390439578805,
+      "grad_norm": 0.02959163673222065,
+      "learning_rate": 0.00011796075800771424,
+      "loss": 0.6828,
+      "step": 12330
+    },
+    {
+      "epoch": 2.0633461474176835,
+      "grad_norm": 0.02911483868956566,
+      "learning_rate": 0.00011786013751467382,
+      "loss": 0.682,
+      "step": 12345
+    },
+    {
+      "epoch": 2.065853250877486,
+      "grad_norm": 0.04046880826354027,
+      "learning_rate": 0.00011775951702163341,
+      "loss": 0.6852,
+      "step": 12360
+    },
+    {
+      "epoch": 2.068360354337289,
+      "grad_norm": 0.030412757769227028,
+      "learning_rate": 0.00011765889652859299,
+      "loss": 0.6783,
+      "step": 12375
+    },
+    {
+      "epoch": 2.0708674577970916,
+      "grad_norm": 0.029883218929171562,
+      "learning_rate": 0.00011755827603555259,
+      "loss": 0.6774,
+      "step": 12390
+    },
+    {
+      "epoch": 2.0733745612568946,
+      "grad_norm": 0.029417937621474266,
+      "learning_rate": 0.00011745765554251216,
+      "loss": 0.691,
+      "step": 12405
+    },
+    {
+      "epoch": 2.075881664716697,
+      "grad_norm": 0.03051302768290043,
+      "learning_rate": 0.00011735703504947176,
+      "loss": 0.6871,
+      "step": 12420
+    },
+    {
+      "epoch": 2.0783887681765,
+      "grad_norm": 0.030459176748991013,
+      "learning_rate": 0.00011725641455643133,
+      "loss": 0.6892,
+      "step": 12435
+    },
+    {
+      "epoch": 2.0808958716363026,
+      "grad_norm": 0.030476195737719536,
+      "learning_rate": 0.00011715579406339093,
+      "loss": 0.6875,
+      "step": 12450
+    },
+    {
+      "epoch": 2.0834029750961056,
+      "grad_norm": 0.02982410229742527,
+      "learning_rate": 0.0001170551735703505,
+      "loss": 0.6623,
+      "step": 12465
+    },
+    {
+      "epoch": 2.0859100785559086,
+      "grad_norm": 0.030465099960565567,
+      "learning_rate": 0.0001169545530773101,
+      "loss": 0.6841,
+      "step": 12480
+    },
+    {
+      "epoch": 2.088417182015711,
+      "grad_norm": 0.029227489605545998,
+      "learning_rate": 0.00011685393258426967,
+      "loss": 0.6777,
+      "step": 12495
+    },
+    {
+      "epoch": 2.090924285475514,
+      "grad_norm": 0.029344556853175163,
+      "learning_rate": 0.00011675331209122926,
+      "loss": 0.6823,
+      "step": 12510
+    },
+    {
+      "epoch": 2.0934313889353167,
+      "grad_norm": 0.030551349744200706,
+      "learning_rate": 0.00011665269159818884,
+      "loss": 0.6872,
+      "step": 12525
+    },
+    {
+      "epoch": 2.0959384923951196,
+      "grad_norm": 0.03063136897981167,
+      "learning_rate": 0.00011655207110514843,
+      "loss": 0.6767,
+      "step": 12540
+    },
+    {
+      "epoch": 2.098445595854922,
+      "grad_norm": 0.02986333705484867,
+      "learning_rate": 0.000116451450612108,
+      "loss": 0.6941,
+      "step": 12555
+    },
+    {
+      "epoch": 2.100952699314725,
+      "grad_norm": 0.030152348801493645,
+      "learning_rate": 0.0001163508301190676,
+      "loss": 0.6832,
+      "step": 12570
+    },
+    {
+      "epoch": 2.1034598027745277,
+      "grad_norm": 0.029383687302470207,
+      "learning_rate": 0.00011625020962602717,
+      "loss": 0.6676,
+      "step": 12585
+    },
+    {
+      "epoch": 2.1059669062343307,
+      "grad_norm": 0.03019135817885399,
+      "learning_rate": 0.00011614958913298674,
+      "loss": 0.6735,
+      "step": 12600
+    },
+    {
+      "epoch": 2.1084740096941332,
+      "grad_norm": 0.030429605394601822,
+      "learning_rate": 0.00011604896863994634,
+      "loss": 0.6837,
+      "step": 12615
+    },
+    {
+      "epoch": 2.110981113153936,
+      "grad_norm": 0.031370870769023895,
+      "learning_rate": 0.00011594834814690591,
+      "loss": 0.6753,
+      "step": 12630
+    },
+    {
+      "epoch": 2.1134882166137388,
+      "grad_norm": 0.030195990577340126,
+      "learning_rate": 0.00011584772765386551,
+      "loss": 0.669,
+      "step": 12645
+    },
+    {
+      "epoch": 2.1159953200735417,
+      "grad_norm": 0.03015013597905636,
+      "learning_rate": 0.00011574710716082508,
+      "loss": 0.6868,
+      "step": 12660
+    },
+    {
+      "epoch": 2.1185024235333443,
+      "grad_norm": 0.030749835073947906,
+      "learning_rate": 0.00011564648666778468,
+      "loss": 0.6664,
+      "step": 12675
+    },
+    {
+      "epoch": 2.1210095269931473,
+      "grad_norm": 0.03003542125225067,
+      "learning_rate": 0.00011554586617474425,
+      "loss": 0.6884,
+      "step": 12690
+    },
+    {
+      "epoch": 2.12351663045295,
+      "grad_norm": 0.02948312647640705,
+      "learning_rate": 0.00011544524568170385,
+      "loss": 0.686,
+      "step": 12705
+    },
+    {
+      "epoch": 2.126023733912753,
+      "grad_norm": 0.03116905875504017,
+      "learning_rate": 0.00011534462518866342,
+      "loss": 0.6917,
+      "step": 12720
+    },
+    {
+      "epoch": 2.1285308373725558,
+      "grad_norm": 0.03057217039167881,
+      "learning_rate": 0.00011524400469562302,
+      "loss": 0.6893,
+      "step": 12735
+    },
+    {
+      "epoch": 2.1310379408323583,
+      "grad_norm": 0.03055824153125286,
+      "learning_rate": 0.0001151433842025826,
+      "loss": 0.6749,
+      "step": 12750
+    },
+    {
+      "epoch": 2.1335450442921613,
+      "grad_norm": 0.030194489285349846,
+      "learning_rate": 0.0001150427637095422,
+      "loss": 0.6841,
+      "step": 12765
+    },
+    {
+      "epoch": 2.136052147751964,
+      "grad_norm": 0.030030904337763786,
+      "learning_rate": 0.00011494214321650177,
+      "loss": 0.6755,
+      "step": 12780
+    },
+    {
+      "epoch": 2.138559251211767,
+      "grad_norm": 0.030531438067555428,
+      "learning_rate": 0.00011484152272346137,
+      "loss": 0.6885,
+      "step": 12795
+    },
+    {
+      "epoch": 2.1410663546715694,
+      "grad_norm": 0.031014693900942802,
+      "learning_rate": 0.00011474090223042094,
+      "loss": 0.6872,
+      "step": 12810
+    },
+    {
+      "epoch": 2.1435734581313723,
+      "grad_norm": 0.03255138173699379,
+      "learning_rate": 0.00011464028173738052,
+      "loss": 0.6811,
+      "step": 12825
+    },
+    {
+      "epoch": 2.146080561591175,
+      "grad_norm": 0.02984030731022358,
+      "learning_rate": 0.00011453966124434011,
+      "loss": 0.684,
+      "step": 12840
+    },
+    {
+      "epoch": 2.148587665050978,
+      "grad_norm": 0.03038971871137619,
+      "learning_rate": 0.0001144390407512997,
+      "loss": 0.6901,
+      "step": 12855
+    },
+    {
+      "epoch": 2.1510947685107804,
+      "grad_norm": 0.03030613623559475,
+      "learning_rate": 0.00011433842025825927,
+      "loss": 0.6815,
+      "step": 12870
+    },
+    {
+      "epoch": 2.1536018719705834,
+      "grad_norm": 0.03107587993144989,
+      "learning_rate": 0.00011423779976521885,
+      "loss": 0.6773,
+      "step": 12885
+    },
+    {
+      "epoch": 2.156108975430386,
+      "grad_norm": 0.030311092734336853,
+      "learning_rate": 0.00011413717927217844,
+      "loss": 0.6859,
+      "step": 12900
+    },
+    {
+      "epoch": 2.158616078890189,
+      "grad_norm": 0.03004043735563755,
+      "learning_rate": 0.00011403655877913801,
+      "loss": 0.677,
+      "step": 12915
+    },
+    {
+      "epoch": 2.1611231823499915,
+      "grad_norm": 0.02978183701634407,
+      "learning_rate": 0.00011393593828609761,
+      "loss": 0.686,
+      "step": 12930
+    },
+    {
+      "epoch": 2.1636302858097944,
+      "grad_norm": 0.030549898743629456,
+      "learning_rate": 0.00011383531779305718,
+      "loss": 0.6875,
+      "step": 12945
+    },
+    {
+      "epoch": 2.166137389269597,
+      "grad_norm": 0.030601589009165764,
+      "learning_rate": 0.00011373469730001678,
+      "loss": 0.6817,
+      "step": 12960
+    },
+    {
+      "epoch": 2.1686444927294,
+      "grad_norm": 0.030580811202526093,
+      "learning_rate": 0.00011363407680697635,
+      "loss": 0.6887,
+      "step": 12975
+    },
+    {
+      "epoch": 2.171151596189203,
+      "grad_norm": 0.030157998204231262,
+      "learning_rate": 0.00011353345631393595,
+      "loss": 0.693,
+      "step": 12990
+    },
+    {
+      "epoch": 2.1736586996490055,
+      "grad_norm": 0.03086373209953308,
+      "learning_rate": 0.00011343283582089552,
+      "loss": 0.6962,
+      "step": 13005
+    },
+    {
+      "epoch": 2.1761658031088085,
+      "grad_norm": 0.02979792095720768,
+      "learning_rate": 0.00011333221532785512,
+      "loss": 0.6756,
+      "step": 13020
+    },
+    {
+      "epoch": 2.178672906568611,
+      "grad_norm": 0.03019995242357254,
+      "learning_rate": 0.00011323159483481469,
+      "loss": 0.6951,
+      "step": 13035
+    },
+    {
+      "epoch": 2.181180010028414,
+      "grad_norm": 0.030209194868803024,
+      "learning_rate": 0.00011313097434177429,
+      "loss": 0.6904,
+      "step": 13050
+    },
+    {
+      "epoch": 2.1836871134882165,
+      "grad_norm": 0.030985839664936066,
+      "learning_rate": 0.00011303035384873386,
+      "loss": 0.6808,
+      "step": 13065
+    },
+    {
+      "epoch": 2.1861942169480195,
+      "grad_norm": 0.03027096390724182,
+      "learning_rate": 0.00011292973335569346,
+      "loss": 0.6892,
+      "step": 13080
+    },
+    {
+      "epoch": 2.188701320407822,
+      "grad_norm": 0.03128921985626221,
+      "learning_rate": 0.00011282911286265303,
+      "loss": 0.6841,
+      "step": 13095
+    },
+    {
+      "epoch": 2.191208423867625,
+      "grad_norm": 0.030639823526144028,
+      "learning_rate": 0.00011272849236961262,
+      "loss": 0.6708,
+      "step": 13110
+    },
+    {
+      "epoch": 2.1937155273274276,
+      "grad_norm": 0.030816158279776573,
+      "learning_rate": 0.0001126278718765722,
+      "loss": 0.6792,
+      "step": 13125
+    },
+    {
+      "epoch": 2.1962226307872306,
+      "grad_norm": 0.03019116260111332,
+      "learning_rate": 0.00011252725138353179,
+      "loss": 0.6777,
+      "step": 13140
+    },
+    {
+      "epoch": 2.198729734247033,
+      "grad_norm": 0.030292050912976265,
+      "learning_rate": 0.00011242663089049136,
+      "loss": 0.6999,
+      "step": 13155
+    },
+    {
+      "epoch": 2.201236837706836,
+      "grad_norm": 0.029916753992438316,
+      "learning_rate": 0.00011232601039745095,
+      "loss": 0.6784,
+      "step": 13170
+    },
+    {
+      "epoch": 2.2037439411666386,
+      "grad_norm": 0.029692910611629486,
+      "learning_rate": 0.00011222538990441053,
+      "loss": 0.6921,
+      "step": 13185
+    },
+    {
+      "epoch": 2.2062510446264416,
+      "grad_norm": 0.030788224190473557,
+      "learning_rate": 0.0001121247694113701,
+      "loss": 0.6714,
+      "step": 13200
+    },
+    {
+      "epoch": 2.208758148086244,
+      "grad_norm": 0.031961727887392044,
+      "learning_rate": 0.0001120241489183297,
+      "loss": 0.6818,
+      "step": 13215
+    },
+    {
+      "epoch": 2.211265251546047,
+      "grad_norm": 0.030589012429118156,
+      "learning_rate": 0.00011192352842528928,
+      "loss": 0.6863,
+      "step": 13230
+    },
+    {
+      "epoch": 2.21377235500585,
+      "grad_norm": 0.03072304092347622,
+      "learning_rate": 0.00011182290793224888,
+      "loss": 0.6854,
+      "step": 13245
+    },
+    {
+      "epoch": 2.2162794584656527,
+      "grad_norm": 0.030577028170228004,
+      "learning_rate": 0.00011172228743920845,
+      "loss": 0.6781,
+      "step": 13260
+    },
+    {
+      "epoch": 2.2187865619254556,
+      "grad_norm": 0.030161473900079727,
+      "learning_rate": 0.00011162166694616805,
+      "loss": 0.6824,
+      "step": 13275
+    },
+    {
+      "epoch": 2.221293665385258,
+      "grad_norm": 0.030237851664423943,
+      "learning_rate": 0.00011152104645312762,
+      "loss": 0.6808,
+      "step": 13290
+    },
+    {
+      "epoch": 2.223800768845061,
+      "grad_norm": 0.030910607427358627,
+      "learning_rate": 0.00011142042596008722,
+      "loss": 0.6819,
+      "step": 13305
+    },
+    {
+      "epoch": 2.2263078723048637,
+      "grad_norm": 0.03041113168001175,
+      "learning_rate": 0.00011131980546704679,
+      "loss": 0.6784,
+      "step": 13320
+    },
+    {
+      "epoch": 2.2288149757646667,
+      "grad_norm": 0.0322742834687233,
+      "learning_rate": 0.00011121918497400639,
+      "loss": 0.6695,
+      "step": 13335
+    },
+    {
+      "epoch": 2.2313220792244692,
+      "grad_norm": 0.03125980496406555,
+      "learning_rate": 0.00011111856448096596,
+      "loss": 0.681,
+      "step": 13350
+    },
+    {
+      "epoch": 2.233829182684272,
+      "grad_norm": 0.030773991718888283,
+      "learning_rate": 0.00011101794398792556,
+      "loss": 0.6867,
+      "step": 13365
+    },
+    {
+      "epoch": 2.2363362861440748,
+      "grad_norm": 0.03200787305831909,
+      "learning_rate": 0.00011091732349488513,
+      "loss": 0.691,
+      "step": 13380
+    },
+    {
+      "epoch": 2.2388433896038777,
+      "grad_norm": 0.03116571344435215,
+      "learning_rate": 0.00011081670300184473,
+      "loss": 0.671,
+      "step": 13395
+    },
+    {
+      "epoch": 2.2413504930636803,
+      "grad_norm": 0.031088994815945625,
+      "learning_rate": 0.0001107160825088043,
+      "loss": 0.6726,
+      "step": 13410
+    },
+    {
+      "epoch": 2.2438575965234833,
+      "grad_norm": 0.03130762279033661,
+      "learning_rate": 0.00011061546201576389,
+      "loss": 0.6948,
+      "step": 13425
+    },
+    {
+      "epoch": 2.246364699983286,
+      "grad_norm": 0.03147103264927864,
+      "learning_rate": 0.00011051484152272347,
+      "loss": 0.6778,
+      "step": 13440
+    },
+    {
+      "epoch": 2.248871803443089,
+      "grad_norm": 0.02998683787882328,
+      "learning_rate": 0.00011041422102968304,
+      "loss": 0.6996,
+      "step": 13455
+    },
+    {
+      "epoch": 2.2513789069028913,
+      "grad_norm": 0.03249230980873108,
+      "learning_rate": 0.00011031360053664263,
+      "loss": 0.6949,
+      "step": 13470
+    },
+    {
+      "epoch": 2.2538860103626943,
+      "grad_norm": 0.030694512650370598,
+      "learning_rate": 0.00011021298004360222,
+      "loss": 0.6806,
+      "step": 13485
+    },
+    {
+      "epoch": 2.2563931138224973,
+      "grad_norm": 0.0317358560860157,
+      "learning_rate": 0.0001101123595505618,
+      "loss": 0.6844,
+      "step": 13500
+    },
+    {
+      "epoch": 2.2589002172823,
+      "grad_norm": 0.029508093371987343,
+      "learning_rate": 0.00011001173905752137,
+      "loss": 0.6723,
+      "step": 13515
+    },
+    {
+      "epoch": 2.2614073207421024,
+      "grad_norm": 0.03101976215839386,
+      "learning_rate": 0.00010991111856448097,
+      "loss": 0.6689,
+      "step": 13530
+    },
+    {
+      "epoch": 2.2639144242019054,
+      "grad_norm": 0.030808012932538986,
+      "learning_rate": 0.00010981049807144054,
+      "loss": 0.6701,
+      "step": 13545
+    },
+    {
+      "epoch": 2.2664215276617083,
+      "grad_norm": 0.03057938627898693,
+      "learning_rate": 0.00010970987757840014,
+      "loss": 0.684,
+      "step": 13560
+    },
+    {
+      "epoch": 2.268928631121511,
+      "grad_norm": 0.03127751499414444,
+      "learning_rate": 0.00010960925708535971,
+      "loss": 0.6776,
+      "step": 13575
+    },
+    {
+      "epoch": 2.271435734581314,
+      "grad_norm": 0.02989344857633114,
+      "learning_rate": 0.00010950863659231931,
+      "loss": 0.6889,
+      "step": 13590
+    },
+    {
+      "epoch": 2.2739428380411164,
+      "grad_norm": 0.03043249435722828,
+      "learning_rate": 0.00010940801609927889,
+      "loss": 0.6794,
+      "step": 13605
+    },
+    {
+      "epoch": 2.2764499415009194,
+      "grad_norm": 0.030408738180994987,
+      "learning_rate": 0.00010930739560623848,
+      "loss": 0.6815,
+      "step": 13620
+    },
+    {
+      "epoch": 2.278957044960722,
+      "grad_norm": 0.030735976994037628,
+      "learning_rate": 0.00010920677511319806,
+      "loss": 0.6809,
+      "step": 13635
+    },
+    {
+      "epoch": 2.281464148420525,
+      "grad_norm": 0.0312831737101078,
+      "learning_rate": 0.00010910615462015766,
+      "loss": 0.6788,
+      "step": 13650
+    },
+    {
+      "epoch": 2.2839712518803275,
+      "grad_norm": 0.030336899682879448,
+      "learning_rate": 0.00010900553412711723,
+      "loss": 0.6737,
+      "step": 13665
+    },
+    {
+      "epoch": 2.2864783553401304,
+      "grad_norm": 0.030938081443309784,
+      "learning_rate": 0.00010890491363407683,
+      "loss": 0.6803,
+      "step": 13680
+    },
+    {
+      "epoch": 2.288985458799933,
+      "grad_norm": 0.02994300052523613,
+      "learning_rate": 0.0001088042931410364,
+      "loss": 0.6714,
+      "step": 13695
+    },
+    {
+      "epoch": 2.291492562259736,
+      "grad_norm": 0.03124346025288105,
+      "learning_rate": 0.000108703672647996,
+      "loss": 0.678,
+      "step": 13710
+    },
+    {
+      "epoch": 2.2939996657195385,
+      "grad_norm": 0.030526146292686462,
+      "learning_rate": 0.00010860305215495557,
+      "loss": 0.6815,
+      "step": 13725
+    },
+    {
+      "epoch": 2.2965067691793415,
+      "grad_norm": 0.03184838965535164,
+      "learning_rate": 0.00010850243166191514,
+      "loss": 0.6768,
+      "step": 13740
+    },
+    {
+      "epoch": 2.2990138726391445,
+      "grad_norm": 0.03009560890495777,
+      "learning_rate": 0.00010840181116887474,
+      "loss": 0.6768,
+      "step": 13755
+    },
+    {
+      "epoch": 2.301520976098947,
+      "grad_norm": 0.029740184545516968,
+      "learning_rate": 0.00010830119067583431,
+      "loss": 0.681,
+      "step": 13770
+    },
+    {
+      "epoch": 2.3040280795587496,
+      "grad_norm": 0.030534571036696434,
+      "learning_rate": 0.0001082005701827939,
+      "loss": 0.6739,
+      "step": 13785
+    },
+    {
+      "epoch": 2.3065351830185525,
+      "grad_norm": 0.030200140550732613,
+      "learning_rate": 0.00010809994968975348,
+      "loss": 0.6695,
+      "step": 13800
+    },
+    {
+      "epoch": 2.3090422864783555,
+      "grad_norm": 0.031782638281583786,
+      "learning_rate": 0.00010799932919671307,
+      "loss": 0.6866,
+      "step": 13815
+    },
+    {
+      "epoch": 2.311549389938158,
+      "grad_norm": 0.03087507374584675,
+      "learning_rate": 0.00010789870870367264,
+      "loss": 0.6717,
+      "step": 13830
+    },
+    {
+      "epoch": 2.314056493397961,
+      "grad_norm": 0.030710799619555473,
+      "learning_rate": 0.00010779808821063224,
+      "loss": 0.6882,
+      "step": 13845
+    },
+    {
+      "epoch": 2.3165635968577636,
+      "grad_norm": 0.030561743304133415,
+      "learning_rate": 0.00010769746771759181,
+      "loss": 0.6814,
+      "step": 13860
+    },
+    {
+      "epoch": 2.3190707003175666,
+      "grad_norm": 0.030251817777752876,
+      "learning_rate": 0.00010759684722455141,
+      "loss": 0.6747,
+      "step": 13875
+    },
+    {
+      "epoch": 2.321577803777369,
+      "grad_norm": 0.030898461118340492,
+      "learning_rate": 0.00010749622673151098,
+      "loss": 0.6678,
+      "step": 13890
+    },
+    {
+      "epoch": 2.324084907237172,
+      "grad_norm": 0.031910572201013565,
+      "learning_rate": 0.00010739560623847058,
+      "loss": 0.6873,
+      "step": 13905
+    },
+    {
+      "epoch": 2.3265920106969746,
+      "grad_norm": 0.031096691265702248,
+      "learning_rate": 0.00010729498574543015,
+      "loss": 0.6761,
+      "step": 13920
+    },
+    {
+      "epoch": 2.3290991141567776,
+      "grad_norm": 0.030930999666452408,
+      "learning_rate": 0.00010719436525238975,
+      "loss": 0.6842,
+      "step": 13935
+    },
+    {
+      "epoch": 2.33160621761658,
+      "grad_norm": 0.030477695167064667,
+      "learning_rate": 0.00010709374475934932,
+      "loss": 0.6784,
+      "step": 13950
+    },
+    {
+      "epoch": 2.334113321076383,
+      "grad_norm": 0.03102184645831585,
+      "learning_rate": 0.00010699312426630892,
+      "loss": 0.679,
+      "step": 13965
+    },
+    {
+      "epoch": 2.3366204245361857,
+      "grad_norm": 0.02999734878540039,
+      "learning_rate": 0.0001068925037732685,
+      "loss": 0.6786,
+      "step": 13980
+    },
+    {
+      "epoch": 2.3391275279959887,
+      "grad_norm": 0.030323563143610954,
+      "learning_rate": 0.0001067918832802281,
+      "loss": 0.6825,
+      "step": 13995
+    },
+    {
+      "epoch": 2.3416346314557916,
+      "grad_norm": 0.030984263867139816,
+      "learning_rate": 0.00010669126278718767,
+      "loss": 0.6798,
+      "step": 14010
+    },
+    {
+      "epoch": 2.344141734915594,
+      "grad_norm": 0.03151758387684822,
+      "learning_rate": 0.00010659064229414724,
+      "loss": 0.6821,
+      "step": 14025
+    },
+    {
+      "epoch": 2.3466488383753967,
+      "grad_norm": 0.03008199669420719,
+      "learning_rate": 0.00010649002180110684,
+      "loss": 0.6778,
+      "step": 14040
+    },
+    {
+      "epoch": 2.3491559418351997,
+      "grad_norm": 0.030592739582061768,
+      "learning_rate": 0.00010638940130806641,
+      "loss": 0.6629,
+      "step": 14055
+    },
+    {
+      "epoch": 2.3516630452950027,
+      "grad_norm": 0.030223028734326363,
+      "learning_rate": 0.000106288780815026,
+      "loss": 0.6807,
+      "step": 14070
+    },
+    {
+      "epoch": 2.3541701487548052,
+      "grad_norm": 0.03019655868411064,
+      "learning_rate": 0.00010618816032198558,
+      "loss": 0.6875,
+      "step": 14085
+    },
+    {
+      "epoch": 2.356677252214608,
+      "grad_norm": 0.03179163858294487,
+      "learning_rate": 0.00010608753982894517,
+      "loss": 0.6744,
+      "step": 14100
+    },
+    {
+      "epoch": 2.3591843556744108,
+      "grad_norm": 0.030132126063108444,
+      "learning_rate": 0.00010598691933590474,
+      "loss": 0.6768,
+      "step": 14115
+    },
+    {
+      "epoch": 2.3616914591342137,
+      "grad_norm": 0.03125820681452751,
+      "learning_rate": 0.00010588629884286434,
+      "loss": 0.6734,
+      "step": 14130
+    },
+    {
+      "epoch": 2.3641985625940163,
+      "grad_norm": 0.03128393739461899,
+      "learning_rate": 0.00010578567834982391,
+      "loss": 0.6643,
+      "step": 14145
+    },
+    {
+      "epoch": 2.3667056660538193,
+      "grad_norm": 0.031101234257221222,
+      "learning_rate": 0.00010568505785678351,
+      "loss": 0.6937,
+      "step": 14160
+    },
+    {
+      "epoch": 2.369212769513622,
+      "grad_norm": 0.03127965331077576,
+      "learning_rate": 0.00010558443736374308,
+      "loss": 0.6837,
+      "step": 14175
+    },
+    {
+      "epoch": 2.371719872973425,
+      "grad_norm": 0.03142804279923439,
+      "learning_rate": 0.00010548381687070268,
+      "loss": 0.6643,
+      "step": 14190
+    },
+    {
+      "epoch": 2.3742269764332273,
+      "grad_norm": 0.03196566551923752,
+      "learning_rate": 0.00010538319637766225,
+      "loss": 0.6737,
+      "step": 14205
+    },
+    {
+      "epoch": 2.3767340798930303,
+      "grad_norm": 0.03105044923722744,
+      "learning_rate": 0.00010528257588462185,
+      "loss": 0.6733,
+      "step": 14220
+    },
+    {
+      "epoch": 2.379241183352833,
+      "grad_norm": 0.030758565291762352,
+      "learning_rate": 0.00010518195539158142,
+      "loss": 0.6959,
+      "step": 14235
+    },
+    {
+      "epoch": 2.381748286812636,
+      "grad_norm": 0.03046661615371704,
+      "learning_rate": 0.00010508133489854102,
+      "loss": 0.6958,
+      "step": 14250
+    },
+    {
+      "epoch": 2.384255390272439,
+      "grad_norm": 0.03125166893005371,
+      "learning_rate": 0.00010498071440550059,
+      "loss": 0.6763,
+      "step": 14265
+    },
+    {
+      "epoch": 2.3867624937322414,
+      "grad_norm": 0.031636305153369904,
+      "learning_rate": 0.00010488009391246019,
+      "loss": 0.6794,
+      "step": 14280
+    },
+    {
+      "epoch": 2.389269597192044,
+      "grad_norm": 0.030563022941350937,
+      "learning_rate": 0.00010477947341941976,
+      "loss": 0.6874,
+      "step": 14295
+    },
+    {
+      "epoch": 2.391776700651847,
+      "grad_norm": 0.03061690181493759,
+      "learning_rate": 0.00010467885292637933,
+      "loss": 0.6782,
+      "step": 14310
+    },
+    {
+      "epoch": 2.39428380411165,
+      "grad_norm": 0.0308393444865942,
+      "learning_rate": 0.00010457823243333893,
+      "loss": 0.6777,
+      "step": 14325
+    },
+    {
+      "epoch": 2.3967909075714524,
+      "grad_norm": 0.030834507197141647,
+      "learning_rate": 0.0001044776119402985,
+      "loss": 0.6854,
+      "step": 14340
+    },
+    {
+      "epoch": 2.3992980110312554,
+      "grad_norm": 0.031078575178980827,
+      "learning_rate": 0.0001043769914472581,
+      "loss": 0.6844,
+      "step": 14355
+    },
+    {
+      "epoch": 2.401805114491058,
+      "grad_norm": 0.030426884070038795,
+      "learning_rate": 0.00010427637095421768,
+      "loss": 0.6701,
+      "step": 14370
+    },
+    {
+      "epoch": 2.404312217950861,
+      "grad_norm": 0.03103550709784031,
+      "learning_rate": 0.00010417575046117726,
+      "loss": 0.685,
+      "step": 14385
+    },
+    {
+      "epoch": 2.4068193214106635,
+      "grad_norm": 0.030895834788680077,
+      "learning_rate": 0.00010407512996813685,
+      "loss": 0.6798,
+      "step": 14400
+    },
+    {
+      "epoch": 2.4093264248704664,
+      "grad_norm": 0.029942205175757408,
+      "learning_rate": 0.00010397450947509643,
+      "loss": 0.6848,
+      "step": 14415
+    },
+    {
+      "epoch": 2.411833528330269,
+      "grad_norm": 0.03145187348127365,
+      "learning_rate": 0.000103873888982056,
+      "loss": 0.6745,
+      "step": 14430
+    },
+    {
+      "epoch": 2.414340631790072,
+      "grad_norm": 0.03102920390665531,
+      "learning_rate": 0.0001037732684890156,
+      "loss": 0.6963,
+      "step": 14445
+    },
+    {
+      "epoch": 2.4168477352498745,
+      "grad_norm": 0.030479585751891136,
+      "learning_rate": 0.00010367264799597518,
+      "loss": 0.6779,
+      "step": 14460
+    },
+    {
+      "epoch": 2.4193548387096775,
+      "grad_norm": 0.0313333161175251,
+      "learning_rate": 0.00010357202750293477,
+      "loss": 0.6675,
+      "step": 14475
+    },
+    {
+      "epoch": 2.42186194216948,
+      "grad_norm": 0.031193213537335396,
+      "learning_rate": 0.00010347140700989435,
+      "loss": 0.6709,
+      "step": 14490
+    },
+    {
+      "epoch": 2.424369045629283,
+      "grad_norm": 0.031854551285505295,
+      "learning_rate": 0.00010337078651685395,
+      "loss": 0.6832,
+      "step": 14505
+    },
+    {
+      "epoch": 2.4268761490890856,
+      "grad_norm": 0.03131631389260292,
+      "learning_rate": 0.00010327016602381352,
+      "loss": 0.6831,
+      "step": 14520
+    },
+    {
+      "epoch": 2.4293832525488885,
+      "grad_norm": 0.030897963792085648,
+      "learning_rate": 0.00010316954553077312,
+      "loss": 0.6779,
+      "step": 14535
+    },
+    {
+      "epoch": 2.431890356008691,
+      "grad_norm": 0.030229298397898674,
+      "learning_rate": 0.00010306892503773269,
+      "loss": 0.6928,
+      "step": 14550
+    },
+    {
+      "epoch": 2.434397459468494,
+      "grad_norm": 0.03158511593937874,
+      "learning_rate": 0.00010296830454469229,
+      "loss": 0.6812,
+      "step": 14565
+    },
+    {
+      "epoch": 2.436904562928297,
+      "grad_norm": 0.03185586631298065,
+      "learning_rate": 0.00010286768405165186,
+      "loss": 0.6707,
+      "step": 14580
+    },
+    {
+      "epoch": 2.4394116663880996,
+      "grad_norm": 0.03139151632785797,
+      "learning_rate": 0.00010276706355861143,
+      "loss": 0.6814,
+      "step": 14595
+    },
+    {
+      "epoch": 2.4419187698479026,
+      "grad_norm": 0.03182042017579079,
+      "learning_rate": 0.00010266644306557103,
+      "loss": 0.6663,
+      "step": 14610
+    },
+    {
+      "epoch": 2.444425873307705,
+      "grad_norm": 0.030850499868392944,
+      "learning_rate": 0.0001025658225725306,
+      "loss": 0.6937,
+      "step": 14625
+    },
+    {
+      "epoch": 2.446932976767508,
+      "grad_norm": 0.032495591789484024,
+      "learning_rate": 0.0001024652020794902,
+      "loss": 0.6588,
+      "step": 14640
+    },
+    {
+      "epoch": 2.4494400802273106,
+      "grad_norm": 0.03162992000579834,
+      "learning_rate": 0.00010236458158644977,
+      "loss": 0.6848,
+      "step": 14655
+    },
+    {
+      "epoch": 2.4519471836871136,
+      "grad_norm": 0.031871598213911057,
+      "learning_rate": 0.00010226396109340937,
+      "loss": 0.6743,
+      "step": 14670
+    },
+    {
+      "epoch": 2.454454287146916,
+      "grad_norm": 0.031383831053972244,
+      "learning_rate": 0.00010216334060036894,
+      "loss": 0.6861,
+      "step": 14685
+    },
+    {
+      "epoch": 2.456961390606719,
+      "grad_norm": 0.03176445513963699,
+      "learning_rate": 0.00010206272010732853,
+      "loss": 0.6702,
+      "step": 14700
+    },
+    {
+      "epoch": 2.4594684940665217,
+      "grad_norm": 0.03109871782362461,
+      "learning_rate": 0.00010196209961428812,
+      "loss": 0.6776,
+      "step": 14715
+    },
+    {
+      "epoch": 2.4619755975263247,
+      "grad_norm": 0.031003376469016075,
+      "learning_rate": 0.0001018614791212477,
+      "loss": 0.688,
+      "step": 14730
+    },
+    {
+      "epoch": 2.464482700986127,
+      "grad_norm": 0.031020162627100945,
+      "learning_rate": 0.00010176085862820727,
+      "loss": 0.6713,
+      "step": 14745
+    },
+    {
+      "epoch": 2.46698980444593,
+      "grad_norm": 0.031086094677448273,
+      "learning_rate": 0.00010166023813516687,
+      "loss": 0.6769,
+      "step": 14760
+    },
+    {
+      "epoch": 2.4694969079057327,
+      "grad_norm": 0.03022875264286995,
+      "learning_rate": 0.00010155961764212644,
+      "loss": 0.6807,
+      "step": 14775
+    },
+    {
+      "epoch": 2.4720040113655357,
+      "grad_norm": 0.030896877869963646,
+      "learning_rate": 0.00010145899714908604,
+      "loss": 0.6927,
+      "step": 14790
+    },
+    {
+      "epoch": 2.4745111148253383,
+      "grad_norm": 0.031297486275434494,
+      "learning_rate": 0.00010135837665604561,
+      "loss": 0.6827,
+      "step": 14805
+    },
+    {
+      "epoch": 2.4770182182851412,
+      "grad_norm": 0.03127811476588249,
+      "learning_rate": 0.00010125775616300521,
+      "loss": 0.6962,
+      "step": 14820
+    },
+    {
+      "epoch": 2.479525321744944,
+      "grad_norm": 0.030049098655581474,
+      "learning_rate": 0.00010115713566996479,
+      "loss": 0.6666,
+      "step": 14835
+    },
+    {
+      "epoch": 2.4820324252047468,
+      "grad_norm": 0.031142529100179672,
+      "learning_rate": 0.00010105651517692438,
+      "loss": 0.6787,
+      "step": 14850
+    },
+    {
+      "epoch": 2.4845395286645497,
+      "grad_norm": 0.031707145273685455,
+      "learning_rate": 0.00010095589468388396,
+      "loss": 0.6741,
+      "step": 14865
+    },
+    {
+      "epoch": 2.4870466321243523,
+      "grad_norm": 0.03133350983262062,
+      "learning_rate": 0.00010085527419084353,
+      "loss": 0.6695,
+      "step": 14880
+    },
+    {
+      "epoch": 2.4895537355841553,
+      "grad_norm": 0.031642328947782516,
+      "learning_rate": 0.00010075465369780313,
+      "loss": 0.6853,
+      "step": 14895
+    },
+    {
+      "epoch": 2.492060839043958,
+      "grad_norm": 0.03161296248435974,
+      "learning_rate": 0.0001006540332047627,
+      "loss": 0.673,
+      "step": 14910
+    },
+    {
+      "epoch": 2.494567942503761,
+      "grad_norm": 0.03102605603635311,
+      "learning_rate": 0.0001005534127117223,
+      "loss": 0.6819,
+      "step": 14925
+    },
+    {
+      "epoch": 2.4970750459635633,
+      "grad_norm": 0.031027935445308685,
+      "learning_rate": 0.00010045279221868187,
+      "loss": 0.6835,
+      "step": 14940
+    },
+    {
+      "epoch": 2.4995821494233663,
+      "grad_norm": 0.031037239357829094,
+      "learning_rate": 0.00010035217172564147,
+      "loss": 0.6695,
+      "step": 14955
+    },
+    {
+      "epoch": 2.502089252883169,
+      "grad_norm": 0.030962081626057625,
+      "learning_rate": 0.00010025155123260104,
+      "loss": 0.6808,
+      "step": 14970
+    },
+    {
+      "epoch": 2.504596356342972,
+      "grad_norm": 0.030871711671352386,
+      "learning_rate": 0.00010015093073956063,
+      "loss": 0.6799,
+      "step": 14985
+    },
+    {
+      "epoch": 2.5071034598027744,
+      "grad_norm": 0.03209908306598663,
+      "learning_rate": 0.00010005031024652021,
+      "loss": 0.6785,
+      "step": 15000
+    },
+    {
+      "epoch": 2.5096105632625774,
+      "grad_norm": 0.031665463000535965,
+      "learning_rate": 9.99496897534798e-05,
+      "loss": 0.6871,
+      "step": 15015
+    },
+    {
+      "epoch": 2.5121176667223803,
+      "grad_norm": 0.031626634299755096,
+      "learning_rate": 9.984906926043938e-05,
+      "loss": 0.6706,
+      "step": 15030
+    },
+    {
+      "epoch": 2.514624770182183,
+      "grad_norm": 0.03143932297825813,
+      "learning_rate": 9.974844876739895e-05,
+      "loss": 0.6776,
+      "step": 15045
+    },
+    {
+      "epoch": 2.5171318736419854,
+      "grad_norm": 0.03138510882854462,
+      "learning_rate": 9.964782827435854e-05,
+      "loss": 0.6832,
+      "step": 15060
+    },
+    {
+      "epoch": 2.5196389771017884,
+      "grad_norm": 0.030731745064258575,
+      "learning_rate": 9.954720778131813e-05,
+      "loss": 0.6728,
+      "step": 15075
+    },
+    {
+      "epoch": 2.5221460805615914,
+      "grad_norm": 0.03058742918074131,
+      "learning_rate": 9.944658728827771e-05,
+      "loss": 0.6916,
+      "step": 15090
+    },
+    {
+      "epoch": 2.524653184021394,
+      "grad_norm": 0.030874596908688545,
+      "learning_rate": 9.93459667952373e-05,
+      "loss": 0.6692,
+      "step": 15105
+    },
+    {
+      "epoch": 2.5271602874811965,
+      "grad_norm": 0.03069966472685337,
+      "learning_rate": 9.924534630219688e-05,
+      "loss": 0.6836,
+      "step": 15120
+    },
+    {
+      "epoch": 2.5296673909409995,
+      "grad_norm": 0.031031129881739616,
+      "learning_rate": 9.914472580915647e-05,
+      "loss": 0.6806,
+      "step": 15135
+    },
+    {
+      "epoch": 2.5321744944008024,
+      "grad_norm": 0.03190414234995842,
+      "learning_rate": 9.904410531611605e-05,
+      "loss": 0.677,
+      "step": 15150
+    },
+    {
+      "epoch": 2.534681597860605,
+      "grad_norm": 0.03230069950222969,
+      "learning_rate": 9.894348482307564e-05,
+      "loss": 0.6761,
+      "step": 15165
+    },
+    {
+      "epoch": 2.537188701320408,
+      "grad_norm": 0.03053051233291626,
+      "learning_rate": 9.884286433003522e-05,
+      "loss": 0.6805,
+      "step": 15180
+    },
+    {
+      "epoch": 2.5396958047802105,
+      "grad_norm": 0.03064662776887417,
+      "learning_rate": 9.874224383699481e-05,
+      "loss": 0.6854,
+      "step": 15195
+    },
+    {
+      "epoch": 2.5422029082400135,
+      "grad_norm": 0.03142537549138069,
+      "learning_rate": 9.86416233439544e-05,
+      "loss": 0.6693,
+      "step": 15210
+    },
+    {
+      "epoch": 2.544710011699816,
+      "grad_norm": 0.031185530126094818,
+      "learning_rate": 9.854100285091398e-05,
+      "loss": 0.6626,
+      "step": 15225
+    },
+    {
+      "epoch": 2.547217115159619,
+      "grad_norm": 0.03198733925819397,
+      "learning_rate": 9.844038235787357e-05,
+      "loss": 0.6847,
+      "step": 15240
+    },
+    {
+      "epoch": 2.5497242186194216,
+      "grad_norm": 0.03293673321604729,
+      "learning_rate": 9.833976186483315e-05,
+      "loss": 0.6792,
+      "step": 15255
+    },
+    {
+      "epoch": 2.5522313220792245,
+      "grad_norm": 0.03125865384936333,
+      "learning_rate": 9.823914137179274e-05,
+      "loss": 0.6728,
+      "step": 15270
+    },
+    {
+      "epoch": 2.554738425539027,
+      "grad_norm": 0.0312894769012928,
+      "learning_rate": 9.813852087875232e-05,
+      "loss": 0.6748,
+      "step": 15285
+    },
+    {
+      "epoch": 2.55724552899883,
+      "grad_norm": 0.03170843422412872,
+      "learning_rate": 9.80379003857119e-05,
+      "loss": 0.6674,
+      "step": 15300
+    },
+    {
+      "epoch": 2.5597526324586326,
+      "grad_norm": 0.031321533024311066,
+      "learning_rate": 9.793727989267148e-05,
+      "loss": 0.6797,
+      "step": 15315
+    },
+    {
+      "epoch": 2.5622597359184356,
+      "grad_norm": 0.031243357807397842,
+      "learning_rate": 9.783665939963107e-05,
+      "loss": 0.6752,
+      "step": 15330
+    },
+    {
+      "epoch": 2.5647668393782386,
+      "grad_norm": 0.03241657465696335,
+      "learning_rate": 9.773603890659064e-05,
+      "loss": 0.6851,
+      "step": 15345
+    },
+    {
+      "epoch": 2.567273942838041,
+      "grad_norm": 0.032917000353336334,
+      "learning_rate": 9.763541841355022e-05,
+      "loss": 0.6858,
+      "step": 15360
+    },
+    {
+      "epoch": 2.5697810462978437,
+      "grad_norm": 0.03208984062075615,
+      "learning_rate": 9.753479792050981e-05,
+      "loss": 0.6684,
+      "step": 15375
+    },
+    {
+      "epoch": 2.5722881497576466,
+      "grad_norm": 0.03123905509710312,
+      "learning_rate": 9.74341774274694e-05,
+      "loss": 0.6789,
+      "step": 15390
+    },
+    {
+      "epoch": 2.5747952532174496,
+      "grad_norm": 0.030513722449541092,
+      "learning_rate": 9.733355693442898e-05,
+      "loss": 0.6875,
+      "step": 15405
+    },
+    {
+      "epoch": 2.577302356677252,
+      "grad_norm": 0.03204507753252983,
+      "learning_rate": 9.723293644138856e-05,
+      "loss": 0.6742,
+      "step": 15420
+    },
+    {
+      "epoch": 2.579809460137055,
+      "grad_norm": 0.031124508008360863,
+      "learning_rate": 9.713231594834815e-05,
+      "loss": 0.6839,
+      "step": 15435
+    },
+    {
+      "epoch": 2.5823165635968577,
+      "grad_norm": 0.03063870221376419,
+      "learning_rate": 9.703169545530774e-05,
+      "loss": 0.6736,
+      "step": 15450
+    },
+    {
+      "epoch": 2.5848236670566607,
+      "grad_norm": 0.030677396804094315,
+      "learning_rate": 9.693107496226732e-05,
+      "loss": 0.6844,
+      "step": 15465
+    },
+    {
+      "epoch": 2.587330770516463,
+      "grad_norm": 0.03137551248073578,
+      "learning_rate": 9.68304544692269e-05,
+      "loss": 0.6763,
+      "step": 15480
+    },
+    {
+      "epoch": 2.589837873976266,
+      "grad_norm": 0.030652204528450966,
+      "learning_rate": 9.672983397618649e-05,
+      "loss": 0.6676,
+      "step": 15495
+    },
+    {
+      "epoch": 2.5923449774360687,
+      "grad_norm": 0.03098338656127453,
+      "learning_rate": 9.662921348314608e-05,
+      "loss": 0.681,
+      "step": 15510
+    },
+    {
+      "epoch": 2.5948520808958717,
+      "grad_norm": 0.030911816284060478,
+      "learning_rate": 9.652859299010566e-05,
+      "loss": 0.675,
+      "step": 15525
+    },
+    {
+      "epoch": 2.5973591843556743,
+      "grad_norm": 0.03055042400956154,
+      "learning_rate": 9.642797249706525e-05,
+      "loss": 0.6789,
+      "step": 15540
+    },
+    {
+      "epoch": 2.5998662878154772,
+      "grad_norm": 0.03084755130112171,
+      "learning_rate": 9.632735200402483e-05,
+      "loss": 0.6728,
+      "step": 15555
+    },
+    {
+      "epoch": 2.60237339127528,
+      "grad_norm": 0.03066328726708889,
+      "learning_rate": 9.622673151098442e-05,
+      "loss": 0.693,
+      "step": 15570
+    },
+    {
+      "epoch": 2.6048804947350828,
+      "grad_norm": 0.03215918317437172,
+      "learning_rate": 9.612611101794399e-05,
+      "loss": 0.6832,
+      "step": 15585
+    },
+    {
+      "epoch": 2.6073875981948857,
+      "grad_norm": 0.03187975287437439,
+      "learning_rate": 9.602549052490358e-05,
+      "loss": 0.6799,
+      "step": 15600
+    },
+    {
+      "epoch": 2.6098947016546883,
+      "grad_norm": 0.03179864585399628,
+      "learning_rate": 9.592487003186316e-05,
+      "loss": 0.6865,
+      "step": 15615
+    },
+    {
+      "epoch": 2.612401805114491,
+      "grad_norm": 0.032180171459913254,
+      "learning_rate": 9.582424953882275e-05,
+      "loss": 0.664,
+      "step": 15630
+    },
+    {
+      "epoch": 2.614908908574294,
+      "grad_norm": 0.03252346068620682,
+      "learning_rate": 9.572362904578232e-05,
+      "loss": 0.6686,
+      "step": 15645
+    },
+    {
+      "epoch": 2.617416012034097,
+      "grad_norm": 0.03194168955087662,
+      "learning_rate": 9.56230085527419e-05,
+      "loss": 0.6711,
+      "step": 15660
+    },
+    {
+      "epoch": 2.6199231154938993,
+      "grad_norm": 0.03153575584292412,
+      "learning_rate": 9.552238805970149e-05,
+      "loss": 0.6787,
+      "step": 15675
+    },
+    {
+      "epoch": 2.6224302189537023,
+      "grad_norm": 0.03099830634891987,
+      "learning_rate": 9.542176756666108e-05,
+      "loss": 0.6638,
+      "step": 15690
+    },
+    {
+      "epoch": 2.624937322413505,
+      "grad_norm": 0.032073475420475006,
+      "learning_rate": 9.532114707362066e-05,
+      "loss": 0.6867,
+      "step": 15705
+    },
+    {
+      "epoch": 2.627444425873308,
+      "grad_norm": 0.03117840364575386,
+      "learning_rate": 9.522052658058025e-05,
+      "loss": 0.6751,
+      "step": 15720
+    },
+    {
+      "epoch": 2.6299515293331104,
+      "grad_norm": 0.031706538051366806,
+      "learning_rate": 9.511990608753983e-05,
+      "loss": 0.6751,
+      "step": 15735
+    },
+    {
+      "epoch": 2.6324586327929134,
+      "grad_norm": 0.0310919638723135,
+      "learning_rate": 9.501928559449942e-05,
+      "loss": 0.6818,
+      "step": 15750
+    },
+    {
+      "epoch": 2.634965736252716,
+      "grad_norm": 0.032505493611097336,
+      "learning_rate": 9.4918665101459e-05,
+      "loss": 0.6762,
+      "step": 15765
+    },
+    {
+      "epoch": 2.637472839712519,
+      "grad_norm": 0.03129402920603752,
+      "learning_rate": 9.481804460841859e-05,
+      "loss": 0.6774,
+      "step": 15780
+    },
+    {
+      "epoch": 2.6399799431723214,
+      "grad_norm": 0.030791781842708588,
+      "learning_rate": 9.471742411537817e-05,
+      "loss": 0.6791,
+      "step": 15795
+    },
+    {
+      "epoch": 2.6424870466321244,
+      "grad_norm": 0.030598165467381477,
+      "learning_rate": 9.461680362233776e-05,
+      "loss": 0.6751,
+      "step": 15810
+    },
+    {
+      "epoch": 2.644994150091927,
+      "grad_norm": 0.03157910704612732,
+      "learning_rate": 9.451618312929734e-05,
+      "loss": 0.6661,
+      "step": 15825
+    },
+    {
+      "epoch": 2.64750125355173,
+      "grad_norm": 0.031462252140045166,
+      "learning_rate": 9.441556263625693e-05,
+      "loss": 0.6783,
+      "step": 15840
+    },
+    {
+      "epoch": 2.650008357011533,
+      "grad_norm": 0.031676456332206726,
+      "learning_rate": 9.431494214321652e-05,
+      "loss": 0.6838,
+      "step": 15855
+    },
+    {
+      "epoch": 2.6525154604713355,
+      "grad_norm": 0.031083036214113235,
+      "learning_rate": 9.421432165017609e-05,
+      "loss": 0.678,
+      "step": 15870
+    },
+    {
+      "epoch": 2.655022563931138,
+      "grad_norm": 0.03105340152978897,
+      "learning_rate": 9.411370115713567e-05,
+      "loss": 0.6786,
+      "step": 15885
+    },
+    {
+      "epoch": 2.657529667390941,
+      "grad_norm": 0.03212074562907219,
+      "learning_rate": 9.401308066409526e-05,
+      "loss": 0.6724,
+      "step": 15900
+    },
+    {
+      "epoch": 2.660036770850744,
+      "grad_norm": 0.03203478455543518,
+      "learning_rate": 9.391246017105484e-05,
+      "loss": 0.6664,
+      "step": 15915
+    },
+    {
+      "epoch": 2.6625438743105465,
+      "grad_norm": 0.03217902034521103,
+      "learning_rate": 9.381183967801443e-05,
+      "loss": 0.6668,
+      "step": 15930
+    },
+    {
+      "epoch": 2.665050977770349,
+      "grad_norm": 0.032049164175987244,
+      "learning_rate": 9.371121918497402e-05,
+      "loss": 0.6633,
+      "step": 15945
+    },
+    {
+      "epoch": 2.667558081230152,
+      "grad_norm": 0.03231196105480194,
+      "learning_rate": 9.361059869193359e-05,
+      "loss": 0.6759,
+      "step": 15960
+    },
+    {
+      "epoch": 2.670065184689955,
+      "grad_norm": 0.03290446102619171,
+      "learning_rate": 9.350997819889317e-05,
+      "loss": 0.6686,
+      "step": 15975
+    },
+    {
+      "epoch": 2.6725722881497576,
+      "grad_norm": 0.03090088628232479,
+      "learning_rate": 9.340935770585276e-05,
+      "loss": 0.6841,
+      "step": 15990
+    },
+    {
+      "epoch": 2.6750793916095605,
+      "grad_norm": 0.031320635229349136,
+      "learning_rate": 9.330873721281234e-05,
+      "loss": 0.6697,
+      "step": 16005
+    },
+    {
+      "epoch": 2.677586495069363,
+      "grad_norm": 0.03119390271604061,
+      "learning_rate": 9.320811671977193e-05,
+      "loss": 0.679,
+      "step": 16020
+    },
+    {
+      "epoch": 2.680093598529166,
+      "grad_norm": 0.031817544251680374,
+      "learning_rate": 9.310749622673151e-05,
+      "loss": 0.68,
+      "step": 16035
+    },
+    {
+      "epoch": 2.6826007019889686,
+      "grad_norm": 0.030589740723371506,
+      "learning_rate": 9.30068757336911e-05,
+      "loss": 0.6881,
+      "step": 16050
+    },
+    {
+      "epoch": 2.6851078054487716,
+      "grad_norm": 0.031363166868686676,
+      "learning_rate": 9.290625524065069e-05,
+      "loss": 0.6755,
+      "step": 16065
+    },
+    {
+      "epoch": 2.687614908908574,
+      "grad_norm": 0.03159747272729874,
+      "learning_rate": 9.280563474761027e-05,
+      "loss": 0.682,
+      "step": 16080
+    },
+    {
+      "epoch": 2.690122012368377,
+      "grad_norm": 0.03237079828977585,
+      "learning_rate": 9.270501425456986e-05,
+      "loss": 0.6808,
+      "step": 16095
+    },
+    {
+      "epoch": 2.69262911582818,
+      "grad_norm": 0.031845077872276306,
+      "learning_rate": 9.260439376152944e-05,
+      "loss": 0.6764,
+      "step": 16110
+    },
+    {
+      "epoch": 2.6951362192879826,
+      "grad_norm": 0.031239351257681847,
+      "learning_rate": 9.250377326848903e-05,
+      "loss": 0.6649,
+      "step": 16125
+    },
+    {
+      "epoch": 2.697643322747785,
+      "grad_norm": 0.031146762892603874,
+      "learning_rate": 9.240315277544861e-05,
+      "loss": 0.6953,
+      "step": 16140
+    },
+    {
+      "epoch": 2.700150426207588,
+      "grad_norm": 0.0323052816092968,
+      "learning_rate": 9.230253228240818e-05,
+      "loss": 0.672,
+      "step": 16155
+    },
+    {
+      "epoch": 2.702657529667391,
+      "grad_norm": 0.031430117785930634,
+      "learning_rate": 9.220191178936777e-05,
+      "loss": 0.6796,
+      "step": 16170
+    },
+    {
+      "epoch": 2.7051646331271937,
+      "grad_norm": 0.03176365792751312,
+      "learning_rate": 9.210129129632736e-05,
+      "loss": 0.6716,
+      "step": 16185
+    },
+    {
+      "epoch": 2.7076717365869962,
+      "grad_norm": 0.031570978462696075,
+      "learning_rate": 9.200067080328694e-05,
+      "loss": 0.6779,
+      "step": 16200
+    },
+    {
+      "epoch": 2.710178840046799,
+      "grad_norm": 0.031726591289043427,
+      "learning_rate": 9.190005031024653e-05,
+      "loss": 0.6739,
+      "step": 16215
+    },
+    {
+      "epoch": 2.712685943506602,
+      "grad_norm": 0.03140697255730629,
+      "learning_rate": 9.179942981720611e-05,
+      "loss": 0.6795,
+      "step": 16230
+    },
+    {
+      "epoch": 2.7151930469664047,
+      "grad_norm": 0.03162944316864014,
+      "learning_rate": 9.16988093241657e-05,
+      "loss": 0.6772,
+      "step": 16245
+    },
+    {
+      "epoch": 2.7177001504262077,
+      "grad_norm": 0.03275005519390106,
+      "learning_rate": 9.159818883112527e-05,
+      "loss": 0.6726,
+      "step": 16260
+    },
+    {
+      "epoch": 2.7202072538860103,
+      "grad_norm": 0.0312725305557251,
+      "learning_rate": 9.149756833808485e-05,
+      "loss": 0.6795,
+      "step": 16275
+    },
+    {
+      "epoch": 2.7227143573458132,
+      "grad_norm": 0.03128618001937866,
+      "learning_rate": 9.139694784504444e-05,
+      "loss": 0.6867,
+      "step": 16290
+    },
+    {
+      "epoch": 2.725221460805616,
+      "grad_norm": 0.03184065595269203,
+      "learning_rate": 9.129632735200403e-05,
+      "loss": 0.6737,
+      "step": 16305
+    },
+    {
+      "epoch": 2.7277285642654188,
+      "grad_norm": 0.03144819289445877,
+      "learning_rate": 9.119570685896361e-05,
+      "loss": 0.6684,
+      "step": 16320
+    },
+    {
+      "epoch": 2.7302356677252213,
+      "grad_norm": 0.03168636932969093,
+      "learning_rate": 9.10950863659232e-05,
+      "loss": 0.6759,
+      "step": 16335
+    },
+    {
+      "epoch": 2.7327427711850243,
+      "grad_norm": 0.03160136938095093,
+      "learning_rate": 9.099446587288278e-05,
+      "loss": 0.67,
+      "step": 16350
+    },
+    {
+      "epoch": 2.7352498746448273,
+      "grad_norm": 0.032716233283281326,
+      "learning_rate": 9.089384537984237e-05,
+      "loss": 0.6775,
+      "step": 16365
+    },
+    {
+      "epoch": 2.73775697810463,
+      "grad_norm": 0.033191412687301636,
+      "learning_rate": 9.079322488680195e-05,
+      "loss": 0.6882,
+      "step": 16380
+    },
+    {
+      "epoch": 2.7402640815644324,
+      "grad_norm": 0.03207962587475777,
+      "learning_rate": 9.069260439376154e-05,
+      "loss": 0.6784,
+      "step": 16395
+    },
+    {
+      "epoch": 2.7427711850242353,
+      "grad_norm": 0.031515009701251984,
+      "learning_rate": 9.059198390072112e-05,
+      "loss": 0.6741,
+      "step": 16410
+    },
+    {
+      "epoch": 2.7452782884840383,
+      "grad_norm": 0.03187147155404091,
+      "learning_rate": 9.049136340768071e-05,
+      "loss": 0.6662,
+      "step": 16425
+    },
+    {
+      "epoch": 2.747785391943841,
+      "grad_norm": 0.03254789486527443,
+      "learning_rate": 9.039074291464028e-05,
+      "loss": 0.6879,
+      "step": 16440
+    },
+    {
+      "epoch": 2.7502924954036434,
+      "grad_norm": 0.03185366839170456,
+      "learning_rate": 9.029012242159987e-05,
+      "loss": 0.6781,
+      "step": 16455
+    },
+    {
+      "epoch": 2.7527995988634464,
+      "grad_norm": 0.03274752199649811,
+      "learning_rate": 9.018950192855945e-05,
+      "loss": 0.6779,
+      "step": 16470
+    },
+    {
+      "epoch": 2.7553067023232494,
+      "grad_norm": 0.030197665095329285,
+      "learning_rate": 9.008888143551904e-05,
+      "loss": 0.6732,
+      "step": 16485
+    },
+    {
+      "epoch": 2.757813805783052,
+      "grad_norm": 0.03070506826043129,
+      "learning_rate": 8.998826094247862e-05,
+      "loss": 0.6713,
+      "step": 16500
+    },
+    {
+      "epoch": 2.760320909242855,
+      "grad_norm": 0.03231901675462723,
+      "learning_rate": 8.988764044943821e-05,
+      "loss": 0.6743,
+      "step": 16515
+    },
+    {
+      "epoch": 2.7628280127026574,
+      "grad_norm": 0.031823549419641495,
+      "learning_rate": 8.97870199563978e-05,
+      "loss": 0.6738,
+      "step": 16530
+    },
+    {
+      "epoch": 2.7653351161624604,
+      "grad_norm": 0.03237045556306839,
+      "learning_rate": 8.968639946335738e-05,
+      "loss": 0.6844,
+      "step": 16545
+    },
+    {
+      "epoch": 2.767842219622263,
+      "grad_norm": 0.033365171402692795,
+      "learning_rate": 8.958577897031695e-05,
+      "loss": 0.6778,
+      "step": 16560
+    },
+    {
+      "epoch": 2.770349323082066,
+      "grad_norm": 0.03203690052032471,
+      "learning_rate": 8.948515847727654e-05,
+      "loss": 0.6781,
+      "step": 16575
+    },
+    {
+      "epoch": 2.7728564265418685,
+      "grad_norm": 0.0312359556555748,
+      "learning_rate": 8.938453798423612e-05,
+      "loss": 0.6863,
+      "step": 16590
+    },
+    {
+      "epoch": 2.7753635300016715,
+      "grad_norm": 0.03242425248026848,
+      "learning_rate": 8.928391749119571e-05,
+      "loss": 0.6754,
+      "step": 16605
+    },
+    {
+      "epoch": 2.7778706334614744,
+      "grad_norm": 0.03151217848062515,
+      "learning_rate": 8.91832969981553e-05,
+      "loss": 0.6799,
+      "step": 16620
+    },
+    {
+      "epoch": 2.780377736921277,
+      "grad_norm": 0.032228607684373856,
+      "learning_rate": 8.908267650511488e-05,
+      "loss": 0.6709,
+      "step": 16635
+    },
+    {
+      "epoch": 2.7828848403810795,
+      "grad_norm": 0.03263266757130623,
+      "learning_rate": 8.898205601207446e-05,
+      "loss": 0.6833,
+      "step": 16650
+    },
+    {
+      "epoch": 2.7853919438408825,
+      "grad_norm": 0.030848173424601555,
+      "learning_rate": 8.888143551903405e-05,
+      "loss": 0.6802,
+      "step": 16665
+    },
+    {
+      "epoch": 2.7878990473006855,
+      "grad_norm": 0.03234275057911873,
+      "learning_rate": 8.878081502599364e-05,
+      "loss": 0.6716,
+      "step": 16680
+    },
+    {
+      "epoch": 2.790406150760488,
+      "grad_norm": 0.03131961077451706,
+      "learning_rate": 8.868019453295322e-05,
+      "loss": 0.6813,
+      "step": 16695
+    },
+    {
+      "epoch": 2.7929132542202906,
+      "grad_norm": 0.03362729772925377,
+      "learning_rate": 8.85795740399128e-05,
+      "loss": 0.677,
+      "step": 16710
+    },
+    {
+      "epoch": 2.7954203576800936,
+      "grad_norm": 0.03228291869163513,
+      "learning_rate": 8.847895354687238e-05,
+      "loss": 0.6737,
+      "step": 16725
+    },
+    {
+      "epoch": 2.7979274611398965,
+      "grad_norm": 0.031786005944013596,
+      "learning_rate": 8.837833305383196e-05,
+      "loss": 0.6859,
+      "step": 16740
+    },
+    {
+      "epoch": 2.800434564599699,
+      "grad_norm": 0.03170496225357056,
+      "learning_rate": 8.827771256079155e-05,
+      "loss": 0.6756,
+      "step": 16755
+    },
+    {
+      "epoch": 2.802941668059502,
+      "grad_norm": 0.033506058156490326,
+      "learning_rate": 8.817709206775113e-05,
+      "loss": 0.6758,
+      "step": 16770
+    },
+    {
+      "epoch": 2.8054487715193046,
+      "grad_norm": 0.032467689365148544,
+      "learning_rate": 8.807647157471072e-05,
+      "loss": 0.6745,
+      "step": 16785
+    },
+    {
+      "epoch": 2.8079558749791076,
+      "grad_norm": 0.033489979803562164,
+      "learning_rate": 8.79758510816703e-05,
+      "loss": 0.6676,
+      "step": 16800
+    },
+    {
+      "epoch": 2.81046297843891,
+      "grad_norm": 0.032214514911174774,
+      "learning_rate": 8.787523058862989e-05,
+      "loss": 0.6675,
+      "step": 16815
+    },
+    {
+      "epoch": 2.812970081898713,
+      "grad_norm": 0.03187641128897667,
+      "learning_rate": 8.777461009558948e-05,
+      "loss": 0.6863,
+      "step": 16830
+    },
+    {
+      "epoch": 2.8154771853585157,
+      "grad_norm": 0.031782276928424835,
+      "learning_rate": 8.767398960254906e-05,
+      "loss": 0.6782,
+      "step": 16845
+    },
+    {
+      "epoch": 2.8179842888183186,
+      "grad_norm": 0.031185677275061607,
+      "learning_rate": 8.757336910950865e-05,
+      "loss": 0.6769,
+      "step": 16860
+    },
+    {
+      "epoch": 2.820491392278121,
+      "grad_norm": 0.03163639456033707,
+      "learning_rate": 8.747274861646822e-05,
+      "loss": 0.6744,
+      "step": 16875
+    },
+    {
+      "epoch": 2.822998495737924,
+      "grad_norm": 0.031712062656879425,
+      "learning_rate": 8.73721281234278e-05,
+      "loss": 0.6741,
+      "step": 16890
+    },
+    {
+      "epoch": 2.8255055991977267,
+      "grad_norm": 0.03253958374261856,
+      "learning_rate": 8.727150763038739e-05,
+      "loss": 0.6844,
+      "step": 16905
+    },
+    {
+      "epoch": 2.8280127026575297,
+      "grad_norm": 0.03280916064977646,
+      "learning_rate": 8.717088713734698e-05,
+      "loss": 0.6782,
+      "step": 16920
+    },
+    {
+      "epoch": 2.8305198061173327,
+      "grad_norm": 0.03310822695493698,
+      "learning_rate": 8.707026664430656e-05,
+      "loss": 0.6704,
+      "step": 16935
+    },
+    {
+      "epoch": 2.833026909577135,
+      "grad_norm": 0.031092172488570213,
+      "learning_rate": 8.696964615126615e-05,
+      "loss": 0.676,
+      "step": 16950
+    },
+    {
+      "epoch": 2.8355340130369378,
+      "grad_norm": 0.0315091609954834,
+      "learning_rate": 8.686902565822573e-05,
+      "loss": 0.6658,
+      "step": 16965
+    },
+    {
+      "epoch": 2.8380411164967407,
+      "grad_norm": 0.030993249267339706,
+      "learning_rate": 8.676840516518532e-05,
+      "loss": 0.6771,
+      "step": 16980
+    },
+    {
+      "epoch": 2.8405482199565437,
+      "grad_norm": 0.03143613040447235,
+      "learning_rate": 8.66677846721449e-05,
+      "loss": 0.6887,
+      "step": 16995
+    },
+    {
+      "epoch": 2.8430553234163463,
+      "grad_norm": 0.03253776207566261,
+      "learning_rate": 8.656716417910447e-05,
+      "loss": 0.6684,
+      "step": 17010
+    },
+    {
+      "epoch": 2.8455624268761492,
+      "grad_norm": 0.03285781666636467,
+      "learning_rate": 8.646654368606406e-05,
+      "loss": 0.6827,
+      "step": 17025
+    },
+    {
+      "epoch": 2.848069530335952,
+      "grad_norm": 0.03159667178988457,
+      "learning_rate": 8.636592319302365e-05,
+      "loss": 0.6703,
+      "step": 17040
+    },
+    {
+      "epoch": 2.8505766337957548,
+      "grad_norm": 0.0313105471432209,
+      "learning_rate": 8.626530269998323e-05,
+      "loss": 0.6754,
+      "step": 17055
+    },
+    {
+      "epoch": 2.8530837372555573,
+      "grad_norm": 0.0318642258644104,
+      "learning_rate": 8.616468220694282e-05,
+      "loss": 0.6697,
+      "step": 17070
+    },
+    {
+      "epoch": 2.8555908407153603,
+      "grad_norm": 0.03128768131136894,
+      "learning_rate": 8.60640617139024e-05,
+      "loss": 0.6681,
+      "step": 17085
+    },
+    {
+      "epoch": 2.858097944175163,
+      "grad_norm": 0.03178677707910538,
+      "learning_rate": 8.596344122086199e-05,
+      "loss": 0.6903,
+      "step": 17100
+    },
+    {
+      "epoch": 2.860605047634966,
+      "grad_norm": 0.03252077102661133,
+      "learning_rate": 8.586282072782157e-05,
+      "loss": 0.6779,
+      "step": 17115
+    },
+    {
+      "epoch": 2.8631121510947684,
+      "grad_norm": 0.032303210347890854,
+      "learning_rate": 8.576220023478116e-05,
+      "loss": 0.6683,
+      "step": 17130
+    },
+    {
+      "epoch": 2.8656192545545713,
+      "grad_norm": 0.031926706433296204,
+      "learning_rate": 8.566157974174074e-05,
+      "loss": 0.6769,
+      "step": 17145
+    },
+    {
+      "epoch": 2.868126358014374,
+      "grad_norm": 0.032100748270750046,
+      "learning_rate": 8.556095924870033e-05,
+      "loss": 0.6829,
+      "step": 17160
+    },
+    {
+      "epoch": 2.870633461474177,
+      "grad_norm": 0.032134201377630234,
+      "learning_rate": 8.54603387556599e-05,
+      "loss": 0.6838,
+      "step": 17175
+    },
+    {
+      "epoch": 2.87314056493398,
+      "grad_norm": 0.03254568204283714,
+      "learning_rate": 8.535971826261949e-05,
+      "loss": 0.6905,
+      "step": 17190
+    },
+    {
+      "epoch": 2.8756476683937824,
+      "grad_norm": 0.03155257925391197,
+      "learning_rate": 8.525909776957907e-05,
+      "loss": 0.6799,
+      "step": 17205
+    },
+    {
+      "epoch": 2.878154771853585,
+      "grad_norm": 0.03186199814081192,
+      "learning_rate": 8.515847727653866e-05,
+      "loss": 0.6889,
+      "step": 17220
+    },
+    {
+      "epoch": 2.880661875313388,
+      "grad_norm": 0.03130493685603142,
+      "learning_rate": 8.505785678349824e-05,
+      "loss": 0.6713,
+      "step": 17235
+    },
+    {
+      "epoch": 2.883168978773191,
+      "grad_norm": 0.032139863818883896,
+      "learning_rate": 8.495723629045783e-05,
+      "loss": 0.6695,
+      "step": 17250
+    },
+    {
+      "epoch": 2.8856760822329934,
+      "grad_norm": 0.03158256411552429,
+      "learning_rate": 8.485661579741741e-05,
+      "loss": 0.6696,
+      "step": 17265
+    },
+    {
+      "epoch": 2.8881831856927964,
+      "grad_norm": 0.03188103437423706,
+      "learning_rate": 8.4755995304377e-05,
+      "loss": 0.6631,
+      "step": 17280
+    },
+    {
+      "epoch": 2.890690289152599,
+      "grad_norm": 0.032000407576560974,
+      "learning_rate": 8.465537481133657e-05,
+      "loss": 0.6736,
+      "step": 17295
+    },
+    {
+      "epoch": 2.893197392612402,
+      "grad_norm": 0.0315023809671402,
+      "learning_rate": 8.455475431829616e-05,
+      "loss": 0.6765,
+      "step": 17310
+    },
+    {
+      "epoch": 2.8957044960722045,
+      "grad_norm": 0.031286027282476425,
+      "learning_rate": 8.445413382525574e-05,
+      "loss": 0.6718,
+      "step": 17325
+    },
+    {
+      "epoch": 2.8982115995320075,
+      "grad_norm": 0.031370267271995544,
+      "learning_rate": 8.435351333221533e-05,
+      "loss": 0.6742,
+      "step": 17340
+    },
+    {
+      "epoch": 2.90071870299181,
+      "grad_norm": 0.032615795731544495,
+      "learning_rate": 8.425289283917491e-05,
+      "loss": 0.6756,
+      "step": 17355
+    },
+    {
+      "epoch": 2.903225806451613,
+      "grad_norm": 0.03155631199479103,
+      "learning_rate": 8.41522723461345e-05,
+      "loss": 0.6872,
+      "step": 17370
+    },
+    {
+      "epoch": 2.9057329099114155,
+      "grad_norm": 0.03252144530415535,
+      "learning_rate": 8.405165185309408e-05,
+      "loss": 0.6744,
+      "step": 17385
+    },
+    {
+      "epoch": 2.9082400133712185,
+      "grad_norm": 0.034432001411914825,
+      "learning_rate": 8.395103136005367e-05,
+      "loss": 0.6824,
+      "step": 17400
+    },
+    {
+      "epoch": 2.910747116831021,
+      "grad_norm": 0.033232349902391434,
+      "learning_rate": 8.385041086701326e-05,
+      "loss": 0.6837,
+      "step": 17415
+    },
+    {
+      "epoch": 2.913254220290824,
+      "grad_norm": 0.03268026188015938,
+      "learning_rate": 8.374979037397284e-05,
+      "loss": 0.6858,
+      "step": 17430
+    },
+    {
+      "epoch": 2.915761323750627,
+      "grad_norm": 0.03303677961230278,
+      "learning_rate": 8.364916988093243e-05,
+      "loss": 0.6793,
+      "step": 17445
+    },
+    {
+      "epoch": 2.9182684272104296,
+      "grad_norm": 0.03122582472860813,
+      "learning_rate": 8.354854938789201e-05,
+      "loss": 0.6728,
+      "step": 17460
+    },
+    {
+      "epoch": 2.920775530670232,
+      "grad_norm": 0.03166414424777031,
+      "learning_rate": 8.34479288948516e-05,
+      "loss": 0.6747,
+      "step": 17475
+    },
+    {
+      "epoch": 2.923282634130035,
+      "grad_norm": 0.031359609216451645,
+      "learning_rate": 8.334730840181117e-05,
+      "loss": 0.6644,
+      "step": 17490
+    },
+    {
+      "epoch": 2.925789737589838,
+      "grad_norm": 0.032412488013505936,
+      "learning_rate": 8.324668790877075e-05,
+      "loss": 0.6654,
+      "step": 17505
+    },
+    {
+      "epoch": 2.9282968410496406,
+      "grad_norm": 0.030932830646634102,
+      "learning_rate": 8.314606741573034e-05,
+      "loss": 0.6783,
+      "step": 17520
+    },
+    {
+      "epoch": 2.930803944509443,
+      "grad_norm": 0.03237373009324074,
+      "learning_rate": 8.304544692268993e-05,
+      "loss": 0.6835,
+      "step": 17535
+    },
+    {
+      "epoch": 2.933311047969246,
+      "grad_norm": 0.030989525839686394,
+      "learning_rate": 8.294482642964951e-05,
+      "loss": 0.6774,
+      "step": 17550
+    },
+    {
+      "epoch": 2.935818151429049,
+      "grad_norm": 0.031636081635951996,
+      "learning_rate": 8.28442059366091e-05,
+      "loss": 0.6845,
+      "step": 17565
+    },
+    {
+      "epoch": 2.9383252548888517,
+      "grad_norm": 0.032494526356458664,
+      "learning_rate": 8.274358544356867e-05,
+      "loss": 0.6761,
+      "step": 17580
+    },
+    {
+      "epoch": 2.9408323583486546,
+      "grad_norm": 0.03214458376169205,
+      "learning_rate": 8.264296495052825e-05,
+      "loss": 0.6837,
+      "step": 17595
+    },
+    {
+      "epoch": 2.943339461808457,
+      "grad_norm": 0.03136483207345009,
+      "learning_rate": 8.254234445748784e-05,
+      "loss": 0.6642,
+      "step": 17610
+    },
+    {
+      "epoch": 2.94584656526826,
+      "grad_norm": 0.032171837985515594,
+      "learning_rate": 8.244172396444742e-05,
+      "loss": 0.6667,
+      "step": 17625
+    },
+    {
+      "epoch": 2.9483536687280627,
+      "grad_norm": 0.032955460250377655,
+      "learning_rate": 8.234110347140701e-05,
+      "loss": 0.6789,
+      "step": 17640
+    },
+    {
+      "epoch": 2.9508607721878657,
+      "grad_norm": 0.03368501737713814,
+      "learning_rate": 8.22404829783666e-05,
+      "loss": 0.6711,
+      "step": 17655
+    },
+    {
+      "epoch": 2.9533678756476682,
+      "grad_norm": 0.03181430697441101,
+      "learning_rate": 8.213986248532618e-05,
+      "loss": 0.6751,
+      "step": 17670
+    },
+    {
+      "epoch": 2.955874979107471,
+      "grad_norm": 0.03176905959844589,
+      "learning_rate": 8.203924199228577e-05,
+      "loss": 0.6713,
+      "step": 17685
+    },
+    {
+      "epoch": 2.958382082567274,
+      "grad_norm": 0.03155883401632309,
+      "learning_rate": 8.193862149924535e-05,
+      "loss": 0.6721,
+      "step": 17700
+    },
+    {
+      "epoch": 2.9608891860270767,
+      "grad_norm": 0.03286443278193474,
+      "learning_rate": 8.183800100620494e-05,
+      "loss": 0.6669,
+      "step": 17715
+    },
+    {
+      "epoch": 2.9633962894868793,
+      "grad_norm": 0.03203749656677246,
+      "learning_rate": 8.173738051316452e-05,
+      "loss": 0.6821,
+      "step": 17730
+    },
+    {
+      "epoch": 2.9659033929466823,
+      "grad_norm": 0.03260009363293648,
+      "learning_rate": 8.163676002012411e-05,
+      "loss": 0.6785,
+      "step": 17745
+    },
+    {
+      "epoch": 2.9684104964064852,
+      "grad_norm": 0.034267883747816086,
+      "learning_rate": 8.15361395270837e-05,
+      "loss": 0.6699,
+      "step": 17760
+    },
+    {
+      "epoch": 2.970917599866288,
+      "grad_norm": 0.03159933537244797,
+      "learning_rate": 8.143551903404328e-05,
+      "loss": 0.6736,
+      "step": 17775
+    },
+    {
+      "epoch": 2.9734247033260903,
+      "grad_norm": 0.03262174129486084,
+      "learning_rate": 8.133489854100285e-05,
+      "loss": 0.6775,
+      "step": 17790
+    },
+    {
+      "epoch": 2.9759318067858933,
+      "grad_norm": 0.03155225142836571,
+      "learning_rate": 8.123427804796244e-05,
+      "loss": 0.6767,
+      "step": 17805
+    },
+    {
+      "epoch": 2.9784389102456963,
+      "grad_norm": 0.03178201615810394,
+      "learning_rate": 8.113365755492202e-05,
+      "loss": 0.6741,
+      "step": 17820
+    },
+    {
+      "epoch": 2.980946013705499,
+      "grad_norm": 0.03577803075313568,
+      "learning_rate": 8.103303706188161e-05,
+      "loss": 0.6763,
+      "step": 17835
+    },
+    {
+      "epoch": 2.983453117165302,
+      "grad_norm": 0.03188909962773323,
+      "learning_rate": 8.093241656884119e-05,
+      "loss": 0.6685,
+      "step": 17850
+    },
+    {
+      "epoch": 2.9859602206251044,
+      "grad_norm": 0.03267040103673935,
+      "learning_rate": 8.083179607580078e-05,
+      "loss": 0.6716,
+      "step": 17865
+    },
+    {
+      "epoch": 2.9884673240849073,
+      "grad_norm": 0.03385018929839134,
+      "learning_rate": 8.073117558276035e-05,
+      "loss": 0.679,
+      "step": 17880
+    },
+    {
+      "epoch": 2.99097442754471,
+      "grad_norm": 0.030928703024983406,
+      "learning_rate": 8.063055508971994e-05,
+      "loss": 0.681,
+      "step": 17895
+    },
+    {
+      "epoch": 2.993481531004513,
+      "grad_norm": 0.032874446362257004,
+      "learning_rate": 8.052993459667952e-05,
+      "loss": 0.6637,
+      "step": 17910
+    },
+    {
+      "epoch": 2.9959886344643154,
+      "grad_norm": 0.033117108047008514,
+      "learning_rate": 8.04293141036391e-05,
+      "loss": 0.663,
+      "step": 17925
+    },
+    {
+      "epoch": 2.9984957379241184,
+      "grad_norm": 0.03306007385253906,
+      "learning_rate": 8.032869361059869e-05,
+      "loss": 0.6693,
+      "step": 17940
+    },
+    {
+      "epoch": 3.001002841383921,
+      "grad_norm": 0.0331704318523407,
+      "learning_rate": 8.022807311755828e-05,
+      "loss": 0.6718,
+      "step": 17955
+    },
+    {
+      "epoch": 3.003509944843724,
+      "grad_norm": 0.03156450018286705,
+      "learning_rate": 8.012745262451786e-05,
+      "loss": 0.6553,
+      "step": 17970
+    },
+    {
+      "epoch": 3.0060170483035265,
+      "grad_norm": 0.033207476139068604,
+      "learning_rate": 8.002683213147745e-05,
+      "loss": 0.6632,
+      "step": 17985
+    },
+    {
+      "epoch": 3.0085241517633294,
+      "grad_norm": 0.03195161744952202,
+      "learning_rate": 7.992621163843703e-05,
+      "loss": 0.6705,
+      "step": 18000
+    },
+    {
+      "epoch": 3.0110312552231324,
+      "grad_norm": 0.03295569121837616,
+      "learning_rate": 7.982559114539662e-05,
+      "loss": 0.6648,
+      "step": 18015
+    },
+    {
+      "epoch": 3.013538358682935,
+      "grad_norm": 0.031562697142362595,
+      "learning_rate": 7.97249706523562e-05,
+      "loss": 0.6576,
+      "step": 18030
+    },
+    {
+      "epoch": 3.016045462142738,
+      "grad_norm": 0.031858429312705994,
+      "learning_rate": 7.962435015931579e-05,
+      "loss": 0.6528,
+      "step": 18045
+    },
+    {
+      "epoch": 3.0185525656025405,
+      "grad_norm": 0.03421582654118538,
+      "learning_rate": 7.952372966627538e-05,
+      "loss": 0.6714,
+      "step": 18060
+    },
+    {
+      "epoch": 3.0210596690623435,
+      "grad_norm": 0.03287555277347565,
+      "learning_rate": 7.942310917323496e-05,
+      "loss": 0.6788,
+      "step": 18075
+    },
+    {
+      "epoch": 3.023566772522146,
+      "grad_norm": 0.03256657347083092,
+      "learning_rate": 7.932248868019453e-05,
+      "loss": 0.6485,
+      "step": 18090
+    },
+    {
+      "epoch": 3.026073875981949,
+      "grad_norm": 0.03333086147904396,
+      "learning_rate": 7.922186818715412e-05,
+      "loss": 0.6678,
+      "step": 18105
+    },
+    {
+      "epoch": 3.0285809794417515,
+      "grad_norm": 0.031958311796188354,
+      "learning_rate": 7.91212476941137e-05,
+      "loss": 0.666,
+      "step": 18120
+    },
+    {
+      "epoch": 3.0310880829015545,
+      "grad_norm": 0.033307287842035294,
+      "learning_rate": 7.902062720107329e-05,
+      "loss": 0.6738,
+      "step": 18135
+    },
+    {
+      "epoch": 3.033595186361357,
+      "grad_norm": 0.031850751489400864,
+      "learning_rate": 7.892000670803288e-05,
+      "loss": 0.6615,
+      "step": 18150
+    },
+    {
+      "epoch": 3.03610228982116,
+      "grad_norm": 0.031476084142923355,
+      "learning_rate": 7.881938621499245e-05,
+      "loss": 0.6597,
+      "step": 18165
+    },
+    {
+      "epoch": 3.0386093932809626,
+      "grad_norm": 0.032822057604789734,
+      "learning_rate": 7.871876572195203e-05,
+      "loss": 0.649,
+      "step": 18180
+    },
+    {
+      "epoch": 3.0411164967407656,
+      "grad_norm": 0.032040949910879135,
+      "learning_rate": 7.861814522891162e-05,
+      "loss": 0.6708,
+      "step": 18195
+    },
+    {
+      "epoch": 3.043623600200568,
+      "grad_norm": 0.03377379849553108,
+      "learning_rate": 7.85175247358712e-05,
+      "loss": 0.6705,
+      "step": 18210
+    },
+    {
+      "epoch": 3.046130703660371,
+      "grad_norm": 0.03224708139896393,
+      "learning_rate": 7.841690424283079e-05,
+      "loss": 0.6685,
+      "step": 18225
+    },
+    {
+      "epoch": 3.0486378071201736,
+      "grad_norm": 0.03286907821893692,
+      "learning_rate": 7.831628374979037e-05,
+      "loss": 0.6785,
+      "step": 18240
+    },
+    {
+      "epoch": 3.0511449105799766,
+      "grad_norm": 0.033475641161203384,
+      "learning_rate": 7.821566325674996e-05,
+      "loss": 0.6581,
+      "step": 18255
+    },
+    {
+      "epoch": 3.0536520140397796,
+      "grad_norm": 0.03226190432906151,
+      "learning_rate": 7.811504276370955e-05,
+      "loss": 0.6679,
+      "step": 18270
+    },
+    {
+      "epoch": 3.056159117499582,
+      "grad_norm": 0.03244561329483986,
+      "learning_rate": 7.801442227066913e-05,
+      "loss": 0.6616,
+      "step": 18285
+    },
+    {
+      "epoch": 3.058666220959385,
+      "grad_norm": 0.03348153084516525,
+      "learning_rate": 7.791380177762872e-05,
+      "loss": 0.667,
+      "step": 18300
+    },
+    {
+      "epoch": 3.0611733244191877,
+      "grad_norm": 0.0332234688103199,
+      "learning_rate": 7.78131812845883e-05,
+      "loss": 0.6632,
+      "step": 18315
+    },
+    {
+      "epoch": 3.0636804278789906,
+      "grad_norm": 0.0322151854634285,
+      "learning_rate": 7.771256079154789e-05,
+      "loss": 0.6657,
+      "step": 18330
+    },
+    {
+      "epoch": 3.066187531338793,
+      "grad_norm": 0.033220209181308746,
+      "learning_rate": 7.761194029850747e-05,
+      "loss": 0.6694,
+      "step": 18345
+    },
+    {
+      "epoch": 3.068694634798596,
+      "grad_norm": 0.03340331092476845,
+      "learning_rate": 7.751131980546706e-05,
+      "loss": 0.6557,
+      "step": 18360
+    },
+    {
+      "epoch": 3.0712017382583987,
+      "grad_norm": 0.03259367495775223,
+      "learning_rate": 7.741069931242664e-05,
+      "loss": 0.6595,
+      "step": 18375
+    },
+    {
+      "epoch": 3.0737088417182017,
+      "grad_norm": 0.03241262957453728,
+      "learning_rate": 7.731007881938623e-05,
+      "loss": 0.6579,
+      "step": 18390
+    },
+    {
+      "epoch": 3.0762159451780042,
+      "grad_norm": 0.0317726731300354,
+      "learning_rate": 7.72094583263458e-05,
+      "loss": 0.6595,
+      "step": 18405
+    },
+    {
+      "epoch": 3.078723048637807,
+      "grad_norm": 0.0336771234869957,
+      "learning_rate": 7.710883783330539e-05,
+      "loss": 0.6563,
+      "step": 18420
+    },
+    {
+      "epoch": 3.0812301520976098,
+      "grad_norm": 0.033014651387929916,
+      "learning_rate": 7.700821734026497e-05,
+      "loss": 0.6689,
+      "step": 18435
+    },
+    {
+      "epoch": 3.0837372555574127,
+      "grad_norm": 0.03290229290723801,
+      "learning_rate": 7.690759684722454e-05,
+      "loss": 0.6482,
+      "step": 18450
+    },
+    {
+      "epoch": 3.0862443590172153,
+      "grad_norm": 0.033278971910476685,
+      "learning_rate": 7.680697635418413e-05,
+      "loss": 0.6532,
+      "step": 18465
+    },
+    {
+      "epoch": 3.0887514624770183,
+      "grad_norm": 0.03247194364666939,
+      "learning_rate": 7.670635586114371e-05,
+      "loss": 0.6455,
+      "step": 18480
+    },
+    {
+      "epoch": 3.091258565936821,
+      "grad_norm": 0.03357178345322609,
+      "learning_rate": 7.66057353681033e-05,
+      "loss": 0.6735,
+      "step": 18495
+    },
+    {
+      "epoch": 3.093765669396624,
+      "grad_norm": 0.033263131976127625,
+      "learning_rate": 7.650511487506289e-05,
+      "loss": 0.661,
+      "step": 18510
+    },
+    {
+      "epoch": 3.0962727728564268,
+      "grad_norm": 0.03202125430107117,
+      "learning_rate": 7.640449438202247e-05,
+      "loss": 0.6573,
+      "step": 18525
+    },
+    {
+      "epoch": 3.0987798763162293,
+      "grad_norm": 0.033367644995450974,
+      "learning_rate": 7.630387388898206e-05,
+      "loss": 0.6536,
+      "step": 18540
+    },
+    {
+      "epoch": 3.1012869797760323,
+      "grad_norm": 0.03215374797582626,
+      "learning_rate": 7.620325339594164e-05,
+      "loss": 0.6677,
+      "step": 18555
+    },
+    {
+      "epoch": 3.103794083235835,
+      "grad_norm": 0.03222740814089775,
+      "learning_rate": 7.610263290290123e-05,
+      "loss": 0.6451,
+      "step": 18570
+    },
+    {
+      "epoch": 3.106301186695638,
+      "grad_norm": 0.032317865639925,
+      "learning_rate": 7.600201240986081e-05,
+      "loss": 0.6477,
+      "step": 18585
+    },
+    {
+      "epoch": 3.1088082901554404,
+      "grad_norm": 0.03358441963791847,
+      "learning_rate": 7.59013919168204e-05,
+      "loss": 0.6586,
+      "step": 18600
+    },
+    {
+      "epoch": 3.1113153936152433,
+      "grad_norm": 0.03255462273955345,
+      "learning_rate": 7.580077142377998e-05,
+      "loss": 0.6626,
+      "step": 18615
+    },
+    {
+      "epoch": 3.113822497075046,
+      "grad_norm": 0.032852429896593094,
+      "learning_rate": 7.570015093073957e-05,
+      "loss": 0.6663,
+      "step": 18630
+    },
+    {
+      "epoch": 3.116329600534849,
+      "grad_norm": 0.033199895173311234,
+      "learning_rate": 7.559953043769915e-05,
+      "loss": 0.6647,
+      "step": 18645
+    },
+    {
+      "epoch": 3.1188367039946514,
+      "grad_norm": 0.03370612487196922,
+      "learning_rate": 7.549890994465874e-05,
+      "loss": 0.664,
+      "step": 18660
+    },
+    {
+      "epoch": 3.1213438074544544,
+      "grad_norm": 0.03296181559562683,
+      "learning_rate": 7.539828945161833e-05,
+      "loss": 0.6509,
+      "step": 18675
+    },
+    {
+      "epoch": 3.123850910914257,
+      "grad_norm": 0.03318094462156296,
+      "learning_rate": 7.529766895857791e-05,
+      "loss": 0.6782,
+      "step": 18690
+    },
+    {
+      "epoch": 3.12635801437406,
+      "grad_norm": 0.03345433250069618,
+      "learning_rate": 7.519704846553748e-05,
+      "loss": 0.6615,
+      "step": 18705
+    },
+    {
+      "epoch": 3.1288651178338625,
+      "grad_norm": 0.03299123793840408,
+      "learning_rate": 7.509642797249707e-05,
+      "loss": 0.6567,
+      "step": 18720
+    },
+    {
+      "epoch": 3.1313722212936654,
+      "grad_norm": 0.033207185566425323,
+      "learning_rate": 7.499580747945665e-05,
+      "loss": 0.6657,
+      "step": 18735
+    },
+    {
+      "epoch": 3.133879324753468,
+      "grad_norm": 0.03186638280749321,
+      "learning_rate": 7.489518698641623e-05,
+      "loss": 0.6651,
+      "step": 18750
+    },
+    {
+      "epoch": 3.136386428213271,
+      "grad_norm": 0.03253698721528053,
+      "learning_rate": 7.479456649337581e-05,
+      "loss": 0.6419,
+      "step": 18765
+    },
+    {
+      "epoch": 3.138893531673074,
+      "grad_norm": 0.03357692062854767,
+      "learning_rate": 7.46939460003354e-05,
+      "loss": 0.6591,
+      "step": 18780
+    },
+    {
+      "epoch": 3.1414006351328765,
+      "grad_norm": 0.03315422683954239,
+      "learning_rate": 7.459332550729498e-05,
+      "loss": 0.6542,
+      "step": 18795
+    },
+    {
+      "epoch": 3.143907738592679,
+      "grad_norm": 0.03280475363135338,
+      "learning_rate": 7.449270501425457e-05,
+      "loss": 0.6488,
+      "step": 18810
+    },
+    {
+      "epoch": 3.146414842052482,
+      "grad_norm": 0.032748933881521225,
+      "learning_rate": 7.439208452121415e-05,
+      "loss": 0.6448,
+      "step": 18825
+    },
+    {
+      "epoch": 3.148921945512285,
+      "grad_norm": 0.03311785310506821,
+      "learning_rate": 7.429146402817374e-05,
+      "loss": 0.6798,
+      "step": 18840
+    },
+    {
+      "epoch": 3.1514290489720875,
+      "grad_norm": 0.034732621163129807,
+      "learning_rate": 7.419084353513332e-05,
+      "loss": 0.6582,
+      "step": 18855
+    },
+    {
+      "epoch": 3.1539361524318905,
+      "grad_norm": 0.03301689773797989,
+      "learning_rate": 7.409022304209291e-05,
+      "loss": 0.6545,
+      "step": 18870
+    },
+    {
+      "epoch": 3.156443255891693,
+      "grad_norm": 0.03281566500663757,
+      "learning_rate": 7.39896025490525e-05,
+      "loss": 0.6566,
+      "step": 18885
+    },
+    {
+      "epoch": 3.158950359351496,
+      "grad_norm": 0.034080591052770615,
+      "learning_rate": 7.388898205601208e-05,
+      "loss": 0.668,
+      "step": 18900
+    },
+    {
+      "epoch": 3.1614574628112986,
+      "grad_norm": 0.032824501395225525,
+      "learning_rate": 7.378836156297167e-05,
+      "loss": 0.6654,
+      "step": 18915
+    },
+    {
+      "epoch": 3.1639645662711016,
+      "grad_norm": 0.03173629939556122,
+      "learning_rate": 7.368774106993125e-05,
+      "loss": 0.6692,
+      "step": 18930
+    },
+    {
+      "epoch": 3.166471669730904,
+      "grad_norm": 0.03352899104356766,
+      "learning_rate": 7.358712057689084e-05,
+      "loss": 0.6511,
+      "step": 18945
+    },
+    {
+      "epoch": 3.168978773190707,
+      "grad_norm": 0.033694177865982056,
+      "learning_rate": 7.348650008385042e-05,
+      "loss": 0.6541,
+      "step": 18960
+    },
+    {
+      "epoch": 3.1714858766505096,
+      "grad_norm": 0.033731088042259216,
+      "learning_rate": 7.338587959081001e-05,
+      "loss": 0.6731,
+      "step": 18975
+    },
+    {
+      "epoch": 3.1739929801103126,
+      "grad_norm": 0.03405210003256798,
+      "learning_rate": 7.32852590977696e-05,
+      "loss": 0.6664,
+      "step": 18990
+    },
+    {
+      "epoch": 3.176500083570115,
+      "grad_norm": 0.03264220058917999,
+      "learning_rate": 7.318463860472917e-05,
+      "loss": 0.6638,
+      "step": 19005
+    },
+    {
+      "epoch": 3.179007187029918,
+      "grad_norm": 0.03331288322806358,
+      "learning_rate": 7.308401811168875e-05,
+      "loss": 0.6601,
+      "step": 19020
+    },
+    {
+      "epoch": 3.1815142904897207,
+      "grad_norm": 0.032871656119823456,
+      "learning_rate": 7.298339761864834e-05,
+      "loss": 0.6582,
+      "step": 19035
+    },
+    {
+      "epoch": 3.1840213939495237,
+      "grad_norm": 0.033308811485767365,
+      "learning_rate": 7.288277712560792e-05,
+      "loss": 0.6526,
+      "step": 19050
+    },
+    {
+      "epoch": 3.186528497409326,
+      "grad_norm": 0.034691642969846725,
+      "learning_rate": 7.27821566325675e-05,
+      "loss": 0.6562,
+      "step": 19065
+    },
+    {
+      "epoch": 3.189035600869129,
+      "grad_norm": 0.03305482491850853,
+      "learning_rate": 7.268153613952708e-05,
+      "loss": 0.6647,
+      "step": 19080
+    },
+    {
+      "epoch": 3.191542704328932,
+      "grad_norm": 0.03342653810977936,
+      "learning_rate": 7.258091564648666e-05,
+      "loss": 0.6547,
+      "step": 19095
+    },
+    {
+      "epoch": 3.1940498077887347,
+      "grad_norm": 0.032857514917850494,
+      "learning_rate": 7.248029515344625e-05,
+      "loss": 0.6675,
+      "step": 19110
+    },
+    {
+      "epoch": 3.1965569112485377,
+      "grad_norm": 0.033686548471450806,
+      "learning_rate": 7.237967466040584e-05,
+      "loss": 0.665,
+      "step": 19125
+    },
+    {
+      "epoch": 3.1990640147083402,
+      "grad_norm": 0.0347943976521492,
+      "learning_rate": 7.227905416736542e-05,
+      "loss": 0.6663,
+      "step": 19140
+    },
+    {
+      "epoch": 3.201571118168143,
+      "grad_norm": 0.0345291905105114,
+      "learning_rate": 7.2178433674325e-05,
+      "loss": 0.6712,
+      "step": 19155
+    },
+    {
+      "epoch": 3.2040782216279458,
+      "grad_norm": 0.03408714756369591,
+      "learning_rate": 7.207781318128459e-05,
+      "loss": 0.675,
+      "step": 19170
+    },
+    {
+      "epoch": 3.2065853250877487,
+      "grad_norm": 0.03278841823339462,
+      "learning_rate": 7.197719268824418e-05,
+      "loss": 0.6694,
+      "step": 19185
+    },
+    {
+      "epoch": 3.2090924285475513,
+      "grad_norm": 0.032465532422065735,
+      "learning_rate": 7.187657219520376e-05,
+      "loss": 0.6606,
+      "step": 19200
+    },
+    {
+      "epoch": 3.2115995320073543,
+      "grad_norm": 0.03292189538478851,
+      "learning_rate": 7.177595170216335e-05,
+      "loss": 0.6602,
+      "step": 19215
+    },
+    {
+      "epoch": 3.214106635467157,
+      "grad_norm": 0.03233910724520683,
+      "learning_rate": 7.167533120912293e-05,
+      "loss": 0.6736,
+      "step": 19230
+    },
+    {
+      "epoch": 3.21661373892696,
+      "grad_norm": 0.03283598646521568,
+      "learning_rate": 7.157471071608252e-05,
+      "loss": 0.6561,
+      "step": 19245
+    },
+    {
+      "epoch": 3.2191208423867623,
+      "grad_norm": 0.03332465514540672,
+      "learning_rate": 7.14740902230421e-05,
+      "loss": 0.6666,
+      "step": 19260
+    },
+    {
+      "epoch": 3.2216279458465653,
+      "grad_norm": 0.033038314431905746,
+      "learning_rate": 7.137346973000169e-05,
+      "loss": 0.6615,
+      "step": 19275
+    },
+    {
+      "epoch": 3.224135049306368,
+      "grad_norm": 0.033269450068473816,
+      "learning_rate": 7.127284923696128e-05,
+      "loss": 0.6429,
+      "step": 19290
+    },
+    {
+      "epoch": 3.226642152766171,
+      "grad_norm": 0.03383258357644081,
+      "learning_rate": 7.117222874392085e-05,
+      "loss": 0.6564,
+      "step": 19305
+    },
+    {
+      "epoch": 3.2291492562259734,
+      "grad_norm": 0.03422423452138901,
+      "learning_rate": 7.107160825088043e-05,
+      "loss": 0.6626,
+      "step": 19320
+    },
+    {
+      "epoch": 3.2316563596857764,
+      "grad_norm": 0.03328223526477814,
+      "learning_rate": 7.097098775784002e-05,
+      "loss": 0.6662,
+      "step": 19335
+    },
+    {
+      "epoch": 3.2341634631455793,
+      "grad_norm": 0.06695165485143661,
+      "learning_rate": 7.08703672647996e-05,
+      "loss": 0.6638,
+      "step": 19350
+    },
+    {
+      "epoch": 3.236670566605382,
+      "grad_norm": 0.034399405121803284,
+      "learning_rate": 7.076974677175918e-05,
+      "loss": 0.6696,
+      "step": 19365
+    },
+    {
+      "epoch": 3.239177670065185,
+      "grad_norm": 0.03373480588197708,
+      "learning_rate": 7.066912627871876e-05,
+      "loss": 0.6567,
+      "step": 19380
+    },
+    {
+      "epoch": 3.2416847735249874,
+      "grad_norm": 0.03468296676874161,
+      "learning_rate": 7.056850578567835e-05,
+      "loss": 0.6693,
+      "step": 19395
+    },
+    {
+      "epoch": 3.2441918769847904,
+      "grad_norm": 0.03372135013341904,
+      "learning_rate": 7.046788529263793e-05,
+      "loss": 0.6598,
+      "step": 19410
+    },
+    {
+      "epoch": 3.246698980444593,
+      "grad_norm": 0.033345624804496765,
+      "learning_rate": 7.036726479959752e-05,
+      "loss": 0.6618,
+      "step": 19425
+    },
+    {
+      "epoch": 3.249206083904396,
+      "grad_norm": 0.033639825880527496,
+      "learning_rate": 7.02666443065571e-05,
+      "loss": 0.675,
+      "step": 19440
+    },
+    {
+      "epoch": 3.2517131873641985,
+      "grad_norm": 0.032892145216464996,
+      "learning_rate": 7.016602381351669e-05,
+      "loss": 0.6725,
+      "step": 19455
+    },
+    {
+      "epoch": 3.2542202908240014,
+      "grad_norm": 0.03500252589583397,
+      "learning_rate": 7.006540332047627e-05,
+      "loss": 0.6648,
+      "step": 19470
+    },
+    {
+      "epoch": 3.256727394283804,
+      "grad_norm": 0.033187173306941986,
+      "learning_rate": 6.996478282743586e-05,
+      "loss": 0.6574,
+      "step": 19485
+    },
+    {
+      "epoch": 3.259234497743607,
+      "grad_norm": 0.033664412796497345,
+      "learning_rate": 6.986416233439545e-05,
+      "loss": 0.6555,
+      "step": 19500
+    },
+    {
+      "epoch": 3.2617416012034095,
+      "grad_norm": 0.031998805701732635,
+      "learning_rate": 6.976354184135503e-05,
+      "loss": 0.6621,
+      "step": 19515
+    },
+    {
+      "epoch": 3.2642487046632125,
+      "grad_norm": 0.03370664268732071,
+      "learning_rate": 6.966292134831462e-05,
+      "loss": 0.6575,
+      "step": 19530
+    },
+    {
+      "epoch": 3.266755808123015,
+      "grad_norm": 0.03247015178203583,
+      "learning_rate": 6.95623008552742e-05,
+      "loss": 0.6704,
+      "step": 19545
+    },
+    {
+      "epoch": 3.269262911582818,
+      "grad_norm": 0.03311055153608322,
+      "learning_rate": 6.946168036223379e-05,
+      "loss": 0.6616,
+      "step": 19560
+    },
+    {
+      "epoch": 3.2717700150426205,
+      "grad_norm": 0.0337141677737236,
+      "learning_rate": 6.936105986919337e-05,
+      "loss": 0.6755,
+      "step": 19575
+    },
+    {
+      "epoch": 3.2742771185024235,
+      "grad_norm": 0.03368309885263443,
+      "learning_rate": 6.926043937615294e-05,
+      "loss": 0.6641,
+      "step": 19590
+    },
+    {
+      "epoch": 3.2767842219622265,
+      "grad_norm": 0.035854946821928024,
+      "learning_rate": 6.915981888311253e-05,
+      "loss": 0.6417,
+      "step": 19605
+    },
+    {
+      "epoch": 3.279291325422029,
+      "grad_norm": 0.03423641249537468,
+      "learning_rate": 6.905919839007212e-05,
+      "loss": 0.6617,
+      "step": 19620
+    },
+    {
+      "epoch": 3.281798428881832,
+      "grad_norm": 0.033775344491004944,
+      "learning_rate": 6.89585778970317e-05,
+      "loss": 0.6636,
+      "step": 19635
+    },
+    {
+      "epoch": 3.2843055323416346,
+      "grad_norm": 0.03380570188164711,
+      "learning_rate": 6.885795740399129e-05,
+      "loss": 0.6803,
+      "step": 19650
+    },
+    {
+      "epoch": 3.2868126358014376,
+      "grad_norm": 0.03310622647404671,
+      "learning_rate": 6.875733691095086e-05,
+      "loss": 0.6565,
+      "step": 19665
+    },
+    {
+      "epoch": 3.28931973926124,
+      "grad_norm": 0.033229805529117584,
+      "learning_rate": 6.865671641791044e-05,
+      "loss": 0.6745,
+      "step": 19680
+    },
+    {
+      "epoch": 3.291826842721043,
+      "grad_norm": 0.032646749168634415,
+      "learning_rate": 6.855609592487003e-05,
+      "loss": 0.6621,
+      "step": 19695
+    },
+    {
+      "epoch": 3.2943339461808456,
+      "grad_norm": 0.03353268280625343,
+      "learning_rate": 6.845547543182961e-05,
+      "loss": 0.6591,
+      "step": 19710
+    },
+    {
+      "epoch": 3.2968410496406486,
+      "grad_norm": 0.03348655626177788,
+      "learning_rate": 6.83548549387892e-05,
+      "loss": 0.6595,
+      "step": 19725
+    },
+    {
+      "epoch": 3.299348153100451,
+      "grad_norm": 0.034753601998090744,
+      "learning_rate": 6.825423444574879e-05,
+      "loss": 0.6577,
+      "step": 19740
+    },
+    {
+      "epoch": 3.301855256560254,
+      "grad_norm": 0.03206339105963707,
+      "learning_rate": 6.815361395270837e-05,
+      "loss": 0.6532,
+      "step": 19755
+    },
+    {
+      "epoch": 3.3043623600200567,
+      "grad_norm": 0.03383897244930267,
+      "learning_rate": 6.805299345966796e-05,
+      "loss": 0.6582,
+      "step": 19770
+    },
+    {
+      "epoch": 3.3068694634798597,
+      "grad_norm": 0.03341525420546532,
+      "learning_rate": 6.795237296662754e-05,
+      "loss": 0.655,
+      "step": 19785
+    },
+    {
+      "epoch": 3.309376566939662,
+      "grad_norm": 0.033895961940288544,
+      "learning_rate": 6.785175247358713e-05,
+      "loss": 0.6635,
+      "step": 19800
+    },
+    {
+      "epoch": 3.311883670399465,
+      "grad_norm": 1.8668147325515747,
+      "learning_rate": 6.775113198054671e-05,
+      "loss": 0.6677,
+      "step": 19815
+    },
+    {
+      "epoch": 3.3143907738592677,
+      "grad_norm": 0.03255158290266991,
+      "learning_rate": 6.76505114875063e-05,
+      "loss": 0.6563,
+      "step": 19830
+    },
+    {
+      "epoch": 3.3168978773190707,
+      "grad_norm": 0.03239602968096733,
+      "learning_rate": 6.754989099446588e-05,
+      "loss": 0.6564,
+      "step": 19845
+    },
+    {
+      "epoch": 3.3194049807788737,
+      "grad_norm": 0.032781895250082016,
+      "learning_rate": 6.744927050142547e-05,
+      "loss": 0.6686,
+      "step": 19860
+    },
+    {
+      "epoch": 3.3219120842386762,
+      "grad_norm": 0.03344454988837242,
+      "learning_rate": 6.734865000838504e-05,
+      "loss": 0.6584,
+      "step": 19875
+    },
+    {
+      "epoch": 3.324419187698479,
+      "grad_norm": 0.0336369127035141,
+      "learning_rate": 6.724802951534463e-05,
+      "loss": 0.6561,
+      "step": 19890
+    },
+    {
+      "epoch": 3.3269262911582818,
+      "grad_norm": 0.0323907732963562,
+      "learning_rate": 6.714740902230421e-05,
+      "loss": 0.6587,
+      "step": 19905
+    },
+    {
+      "epoch": 3.3294333946180847,
+      "grad_norm": 0.03296393156051636,
+      "learning_rate": 6.70467885292638e-05,
+      "loss": 0.6666,
+      "step": 19920
+    },
+    {
+      "epoch": 3.3319404980778873,
+      "grad_norm": 0.032257601618766785,
+      "learning_rate": 6.694616803622338e-05,
+      "loss": 0.6488,
+      "step": 19935
+    },
+    {
+      "epoch": 3.3344476015376903,
+      "grad_norm": 0.03307221084833145,
+      "learning_rate": 6.684554754318297e-05,
+      "loss": 0.6546,
+      "step": 19950
+    },
+    {
+      "epoch": 3.336954704997493,
+      "grad_norm": 0.03298574313521385,
+      "learning_rate": 6.674492705014255e-05,
+      "loss": 0.6795,
+      "step": 19965
+    },
+    {
+      "epoch": 3.339461808457296,
+      "grad_norm": 0.0329146534204483,
+      "learning_rate": 6.664430655710213e-05,
+      "loss": 0.6571,
+      "step": 19980
+    },
+    {
+      "epoch": 3.3419689119170983,
+      "grad_norm": 0.03403447940945625,
+      "learning_rate": 6.654368606406171e-05,
+      "loss": 0.6778,
+      "step": 19995
+    },
+    {
+      "epoch": 3.3444760153769013,
+      "grad_norm": 0.03246279060840607,
+      "learning_rate": 6.64430655710213e-05,
+      "loss": 0.6612,
+      "step": 20010
+    },
+    {
+      "epoch": 3.346983118836704,
+      "grad_norm": 0.03411612659692764,
+      "learning_rate": 6.634244507798088e-05,
+      "loss": 0.6478,
+      "step": 20025
+    },
+    {
+      "epoch": 3.349490222296507,
+      "grad_norm": 0.033288851380348206,
+      "learning_rate": 6.624182458494047e-05,
+      "loss": 0.6551,
+      "step": 20040
+    },
+    {
+      "epoch": 3.3519973257563094,
+      "grad_norm": 0.03258313983678818,
+      "learning_rate": 6.614120409190005e-05,
+      "loss": 0.6761,
+      "step": 20055
+    },
+    {
+      "epoch": 3.3545044292161124,
+      "grad_norm": 0.033909596502780914,
+      "learning_rate": 6.604058359885964e-05,
+      "loss": 0.6707,
+      "step": 20070
+    },
+    {
+      "epoch": 3.357011532675915,
+      "grad_norm": 0.033043161034584045,
+      "learning_rate": 6.593996310581922e-05,
+      "loss": 0.6549,
+      "step": 20085
+    },
+    {
+      "epoch": 3.359518636135718,
+      "grad_norm": 0.03325843811035156,
+      "learning_rate": 6.583934261277881e-05,
+      "loss": 0.6655,
+      "step": 20100
+    },
+    {
+      "epoch": 3.362025739595521,
+      "grad_norm": 0.033593397587537766,
+      "learning_rate": 6.57387221197384e-05,
+      "loss": 0.659,
+      "step": 20115
+    },
+    {
+      "epoch": 3.3645328430553234,
+      "grad_norm": 0.032497063279151917,
+      "learning_rate": 6.563810162669798e-05,
+      "loss": 0.6728,
+      "step": 20130
+    },
+    {
+      "epoch": 3.367039946515126,
+      "grad_norm": 0.03400912135839462,
+      "learning_rate": 6.553748113365757e-05,
+      "loss": 0.6701,
+      "step": 20145
+    },
+    {
+      "epoch": 3.369547049974929,
+      "grad_norm": 0.033511847257614136,
+      "learning_rate": 6.543686064061714e-05,
+      "loss": 0.6522,
+      "step": 20160
+    },
+    {
+      "epoch": 3.372054153434732,
+      "grad_norm": 0.034270040690898895,
+      "learning_rate": 6.533624014757672e-05,
+      "loss": 0.6509,
+      "step": 20175
+    },
+    {
+      "epoch": 3.3745612568945345,
+      "grad_norm": 0.03471988067030907,
+      "learning_rate": 6.523561965453631e-05,
+      "loss": 0.657,
+      "step": 20190
+    },
+    {
+      "epoch": 3.3770683603543374,
+      "grad_norm": 0.033193349838256836,
+      "learning_rate": 6.51349991614959e-05,
+      "loss": 0.6602,
+      "step": 20205
+    },
+    {
+      "epoch": 3.37957546381414,
+      "grad_norm": 0.03341618552803993,
+      "learning_rate": 6.503437866845548e-05,
+      "loss": 0.6594,
+      "step": 20220
+    },
+    {
+      "epoch": 3.382082567273943,
+      "grad_norm": 0.03393018990755081,
+      "learning_rate": 6.493375817541507e-05,
+      "loss": 0.6692,
+      "step": 20235
+    },
+    {
+      "epoch": 3.3845896707337455,
+      "grad_norm": 0.032490186393260956,
+      "learning_rate": 6.483313768237465e-05,
+      "loss": 0.645,
+      "step": 20250
+    },
+    {
+      "epoch": 3.3870967741935485,
+      "grad_norm": 0.03293507918715477,
+      "learning_rate": 6.473251718933424e-05,
+      "loss": 0.6414,
+      "step": 20265
+    },
+    {
+      "epoch": 3.389603877653351,
+      "grad_norm": 0.033456623554229736,
+      "learning_rate": 6.463189669629381e-05,
+      "loss": 0.6547,
+      "step": 20280
+    },
+    {
+      "epoch": 3.392110981113154,
+      "grad_norm": 0.03404277190566063,
+      "learning_rate": 6.45312762032534e-05,
+      "loss": 0.6605,
+      "step": 20295
+    },
+    {
+      "epoch": 3.3946180845729566,
+      "grad_norm": 0.03427689149975777,
+      "learning_rate": 6.443065571021298e-05,
+      "loss": 0.6594,
+      "step": 20310
+    },
+    {
+      "epoch": 3.3971251880327595,
+      "grad_norm": 0.033457666635513306,
+      "learning_rate": 6.433003521717256e-05,
+      "loss": 0.6778,
+      "step": 20325
+    },
+    {
+      "epoch": 3.399632291492562,
+      "grad_norm": 0.034244317561388016,
+      "learning_rate": 6.422941472413215e-05,
+      "loss": 0.6742,
+      "step": 20340
+    },
+    {
+      "epoch": 3.402139394952365,
+      "grad_norm": 0.03270183503627777,
+      "learning_rate": 6.412879423109174e-05,
+      "loss": 0.6435,
+      "step": 20355
+    },
+    {
+      "epoch": 3.404646498412168,
+      "grad_norm": 0.034196462482213974,
+      "learning_rate": 6.402817373805132e-05,
+      "loss": 0.648,
+      "step": 20370
+    },
+    {
+      "epoch": 3.4071536018719706,
+      "grad_norm": 0.032824840396642685,
+      "learning_rate": 6.39275532450109e-05,
+      "loss": 0.6767,
+      "step": 20385
+    },
+    {
+      "epoch": 3.409660705331773,
+      "grad_norm": 0.03537704795598984,
+      "learning_rate": 6.382693275197049e-05,
+      "loss": 0.6721,
+      "step": 20400
+    },
+    {
+      "epoch": 3.412167808791576,
+      "grad_norm": 0.03542947396636009,
+      "learning_rate": 6.372631225893008e-05,
+      "loss": 0.6612,
+      "step": 20415
+    },
+    {
+      "epoch": 3.414674912251379,
+      "grad_norm": 0.033524878323078156,
+      "learning_rate": 6.362569176588966e-05,
+      "loss": 0.6709,
+      "step": 20430
+    },
+    {
+      "epoch": 3.4171820157111816,
+      "grad_norm": 0.03377537056803703,
+      "learning_rate": 6.352507127284923e-05,
+      "loss": 0.6711,
+      "step": 20445
+    },
+    {
+      "epoch": 3.4196891191709846,
+      "grad_norm": 0.033485304564237595,
+      "learning_rate": 6.342445077980882e-05,
+      "loss": 0.6622,
+      "step": 20460
+    },
+    {
+      "epoch": 3.422196222630787,
+      "grad_norm": 0.033119190484285355,
+      "learning_rate": 6.33238302867684e-05,
+      "loss": 0.66,
+      "step": 20475
+    },
+    {
+      "epoch": 3.42470332609059,
+      "grad_norm": 0.0347219854593277,
+      "learning_rate": 6.322320979372799e-05,
+      "loss": 0.6538,
+      "step": 20490
+    },
+    {
+      "epoch": 3.4272104295503927,
+      "grad_norm": 0.033375516533851624,
+      "learning_rate": 6.312258930068758e-05,
+      "loss": 0.6721,
+      "step": 20505
+    },
+    {
+      "epoch": 3.4297175330101957,
+      "grad_norm": 0.03268874064087868,
+      "learning_rate": 6.302196880764716e-05,
+      "loss": 0.6608,
+      "step": 20520
+    },
+    {
+      "epoch": 3.432224636469998,
+      "grad_norm": 0.03364133462309837,
+      "learning_rate": 6.292134831460675e-05,
+      "loss": 0.6673,
+      "step": 20535
+    },
+    {
+      "epoch": 3.434731739929801,
+      "grad_norm": 0.03426344320178032,
+      "learning_rate": 6.282072782156633e-05,
+      "loss": 0.6637,
+      "step": 20550
+    },
+    {
+      "epoch": 3.4372388433896037,
+      "grad_norm": 0.03355192393064499,
+      "learning_rate": 6.272010732852592e-05,
+      "loss": 0.6781,
+      "step": 20565
+    },
+    {
+      "epoch": 3.4397459468494067,
+      "grad_norm": 0.03330032154917717,
+      "learning_rate": 6.261948683548549e-05,
+      "loss": 0.6693,
+      "step": 20580
+    },
+    {
+      "epoch": 3.4422530503092093,
+      "grad_norm": 0.03315750136971474,
+      "learning_rate": 6.251886634244508e-05,
+      "loss": 0.66,
+      "step": 20595
+    },
+    {
+      "epoch": 3.4447601537690122,
+      "grad_norm": 0.033580485731363297,
+      "learning_rate": 6.241824584940466e-05,
+      "loss": 0.6749,
+      "step": 20610
+    },
+    {
+      "epoch": 3.447267257228815,
+      "grad_norm": 0.03357269614934921,
+      "learning_rate": 6.231762535636425e-05,
+      "loss": 0.6563,
+      "step": 20625
+    },
+    {
+      "epoch": 3.4497743606886178,
+      "grad_norm": 0.03285966068506241,
+      "learning_rate": 6.221700486332383e-05,
+      "loss": 0.6615,
+      "step": 20640
+    },
+    {
+      "epoch": 3.4522814641484203,
+      "grad_norm": 0.034152496606111526,
+      "learning_rate": 6.211638437028342e-05,
+      "loss": 0.666,
+      "step": 20655
+    },
+    {
+      "epoch": 3.4547885676082233,
+      "grad_norm": 0.03442816436290741,
+      "learning_rate": 6.2015763877243e-05,
+      "loss": 0.6637,
+      "step": 20670
+    },
+    {
+      "epoch": 3.4572956710680263,
+      "grad_norm": 0.03422121703624725,
+      "learning_rate": 6.191514338420259e-05,
+      "loss": 0.661,
+      "step": 20685
+    },
+    {
+      "epoch": 3.459802774527829,
+      "grad_norm": 0.035803407430648804,
+      "learning_rate": 6.181452289116217e-05,
+      "loss": 0.6659,
+      "step": 20700
+    },
+    {
+      "epoch": 3.462309877987632,
+      "grad_norm": 0.03371883183717728,
+      "learning_rate": 6.171390239812176e-05,
+      "loss": 0.6659,
+      "step": 20715
+    },
+    {
+      "epoch": 3.4648169814474343,
+      "grad_norm": 0.034014519304037094,
+      "learning_rate": 6.161328190508133e-05,
+      "loss": 0.662,
+      "step": 20730
+    },
+    {
+      "epoch": 3.4673240849072373,
+      "grad_norm": 0.032825078815221786,
+      "learning_rate": 6.151266141204092e-05,
+      "loss": 0.6645,
+      "step": 20745
+    },
+    {
+      "epoch": 3.46983118836704,
+      "grad_norm": 0.033502623438835144,
+      "learning_rate": 6.14120409190005e-05,
+      "loss": 0.6611,
+      "step": 20760
+    },
+    {
+      "epoch": 3.472338291826843,
+      "grad_norm": 0.03605775162577629,
+      "learning_rate": 6.131142042596009e-05,
+      "loss": 0.6781,
+      "step": 20775
+    },
+    {
+      "epoch": 3.4748453952866454,
+      "grad_norm": 0.03367803990840912,
+      "learning_rate": 6.121079993291967e-05,
+      "loss": 0.6547,
+      "step": 20790
+    },
+    {
+      "epoch": 3.4773524987464484,
+      "grad_norm": 0.034281060099601746,
+      "learning_rate": 6.111017943987926e-05,
+      "loss": 0.6607,
+      "step": 20805
+    },
+    {
+      "epoch": 3.479859602206251,
+      "grad_norm": 0.032581839710474014,
+      "learning_rate": 6.100955894683884e-05,
+      "loss": 0.6753,
+      "step": 20820
+    },
+    {
+      "epoch": 3.482366705666054,
+      "grad_norm": 0.03385984152555466,
+      "learning_rate": 6.090893845379842e-05,
+      "loss": 0.6582,
+      "step": 20835
+    },
+    {
+      "epoch": 3.4848738091258564,
+      "grad_norm": 0.03409432992339134,
+      "learning_rate": 6.080831796075801e-05,
+      "loss": 0.6541,
+      "step": 20850
+    },
+    {
+      "epoch": 3.4873809125856594,
+      "grad_norm": 0.032626084983348846,
+      "learning_rate": 6.0707697467717594e-05,
+      "loss": 0.6625,
+      "step": 20865
+    },
+    {
+      "epoch": 3.489888016045462,
+      "grad_norm": 0.034788578748703,
+      "learning_rate": 6.060707697467718e-05,
+      "loss": 0.6539,
+      "step": 20880
+    },
+    {
+      "epoch": 3.492395119505265,
+      "grad_norm": 0.034189485013484955,
+      "learning_rate": 6.0506456481636765e-05,
+      "loss": 0.6595,
+      "step": 20895
+    },
+    {
+      "epoch": 3.4949022229650675,
+      "grad_norm": 0.03473382443189621,
+      "learning_rate": 6.040583598859635e-05,
+      "loss": 0.6565,
+      "step": 20910
+    },
+    {
+      "epoch": 3.4974093264248705,
+      "grad_norm": 0.0336168147623539,
+      "learning_rate": 6.0305215495555936e-05,
+      "loss": 0.6584,
+      "step": 20925
+    },
+    {
+      "epoch": 3.4999164298846734,
+      "grad_norm": 0.0335552953183651,
+      "learning_rate": 6.020459500251552e-05,
+      "loss": 0.6557,
+      "step": 20940
+    },
+    {
+      "epoch": 3.502423533344476,
+      "grad_norm": 0.03281300142407417,
+      "learning_rate": 6.01039745094751e-05,
+      "loss": 0.668,
+      "step": 20955
+    },
+    {
+      "epoch": 3.5049306368042785,
+      "grad_norm": 0.03366611897945404,
+      "learning_rate": 6.0003354016434685e-05,
+      "loss": 0.6621,
+      "step": 20970
+    },
+    {
+      "epoch": 3.5074377402640815,
+      "grad_norm": 0.034198787063360214,
+      "learning_rate": 5.990273352339427e-05,
+      "loss": 0.6619,
+      "step": 20985
+    },
+    {
+      "epoch": 3.5099448437238845,
+      "grad_norm": 0.03419259935617447,
+      "learning_rate": 5.9802113030353856e-05,
+      "loss": 0.6737,
+      "step": 21000
+    },
+    {
+      "epoch": 3.5266588667892362,
+      "grad_norm": 0.03425095975399017,
+      "learning_rate": 5.913130974341775e-05,
+      "loss": 0.665,
+      "step": 21100
+    },
+    {
+      "epoch": 3.543372889854588,
+      "grad_norm": 0.033783555030822754,
+      "learning_rate": 5.846050645648163e-05,
+      "loss": 0.6577,
+      "step": 21200
+    },
+    {
+      "epoch": 3.5600869129199397,
+      "grad_norm": 0.03537527099251747,
+      "learning_rate": 5.778970316954553e-05,
+      "loss": 0.6635,
+      "step": 21300
+    },
+    {
+      "epoch": 3.5768009359852915,
+      "grad_norm": 0.03478403761982918,
+      "learning_rate": 5.7118899882609426e-05,
+      "loss": 0.6584,
+      "step": 21400
+    },
+    {
+      "epoch": 3.593514959050643,
+      "grad_norm": 0.034150175750255585,
+      "learning_rate": 5.644809659567332e-05,
+      "loss": 0.6596,
+      "step": 21500
+    },
+    {
+      "epoch": 3.6102289821159954,
+      "grad_norm": 0.034265898168087006,
+      "learning_rate": 5.5777293308737214e-05,
+      "loss": 0.6574,
+      "step": 21600
+    },
+    {
+      "epoch": 3.626943005181347,
+      "grad_norm": 0.03564199060201645,
+      "learning_rate": 5.510649002180111e-05,
+      "loss": 0.6549,
+      "step": 21700
+    },
+    {
+      "epoch": 3.643657028246699,
+      "grad_norm": 0.034864045679569244,
+      "learning_rate": 5.4435686734865e-05,
+      "loss": 0.6656,
+      "step": 21800
+    },
+    {
+      "epoch": 3.6603710513120506,
+      "grad_norm": 0.03352364897727966,
+      "learning_rate": 5.3764883447928896e-05,
+      "loss": 0.6622,
+      "step": 21900
+    },
+    {
+      "epoch": 3.677085074377403,
+      "grad_norm": 0.034029681235551834,
+      "learning_rate": 5.309408016099279e-05,
+      "loss": 0.6542,
+      "step": 22000
+    },
+    {
+      "epoch": 3.6937990974427546,
+      "grad_norm": 0.03352760896086693,
+      "learning_rate": 5.2423276874056684e-05,
+      "loss": 0.6664,
+      "step": 22100
+    },
+    {
+      "epoch": 3.7105131205081063,
+      "grad_norm": 0.03489440679550171,
+      "learning_rate": 5.175247358712058e-05,
+      "loss": 0.6629,
+      "step": 22200
+    },
+    {
+      "epoch": 3.727227143573458,
+      "grad_norm": 0.03430229052901268,
+      "learning_rate": 5.108167030018447e-05,
+      "loss": 0.6574,
+      "step": 22300
+    },
+    {
+      "epoch": 3.74394116663881,
+      "grad_norm": 0.033648181706666946,
+      "learning_rate": 5.0410867013248366e-05,
+      "loss": 0.6568,
+      "step": 22400
+    },
+    {
+      "epoch": 3.7606551897041616,
+      "grad_norm": 0.033668212592601776,
+      "learning_rate": 4.974006372631227e-05,
+      "loss": 0.6564,
+      "step": 22500
+    },
+    {
+      "epoch": 3.7773692127695138,
+      "grad_norm": 0.03772876039147377,
+      "learning_rate": 4.906926043937616e-05,
+      "loss": 0.6619,
+      "step": 22600
+    },
+    {
+      "epoch": 3.7940832358348655,
+      "grad_norm": 0.03430061787366867,
+      "learning_rate": 4.839845715244005e-05,
+      "loss": 0.6603,
+      "step": 22700
+    },
+    {
+      "epoch": 3.8107972589002173,
+      "grad_norm": 0.035929903388023376,
+      "learning_rate": 4.772765386550394e-05,
+      "loss": 0.6559,
+      "step": 22800
+    },
+    {
+      "epoch": 3.827511281965569,
+      "grad_norm": 0.03500952944159508,
+      "learning_rate": 4.7056850578567837e-05,
+      "loss": 0.6579,
+      "step": 22900
+    },
+    {
+      "epoch": 3.844225305030921,
+      "grad_norm": 0.03469489514827728,
+      "learning_rate": 4.638604729163173e-05,
+      "loss": 0.6558,
+      "step": 23000
+    },
+    {
+      "epoch": 3.860939328096273,
+      "grad_norm": 0.03354435786604881,
+      "learning_rate": 4.5715244004695625e-05,
+      "loss": 0.6623,
+      "step": 23100
+    },
+    {
+      "epoch": 3.8776533511616247,
+      "grad_norm": 0.03471764177083969,
+      "learning_rate": 4.504444071775952e-05,
+      "loss": 0.6648,
+      "step": 23200
+    },
+    {
+      "epoch": 3.8943673742269764,
+      "grad_norm": 0.03438182920217514,
+      "learning_rate": 4.437363743082341e-05,
+      "loss": 0.6606,
+      "step": 23300
+    },
+    {
+      "epoch": 3.911081397292328,
+      "grad_norm": 0.03417756408452988,
+      "learning_rate": 4.370283414388731e-05,
+      "loss": 0.6626,
+      "step": 23400
+    },
+    {
+      "epoch": 3.92779542035768,
+      "grad_norm": 0.03406790643930435,
+      "learning_rate": 4.30320308569512e-05,
+      "loss": 0.6609,
+      "step": 23500
+    },
+    {
+      "epoch": 3.9445094434230317,
+      "grad_norm": 0.035032719373703,
+      "learning_rate": 4.2361227570015095e-05,
+      "loss": 0.6579,
+      "step": 23600
+    },
+    {
+      "epoch": 3.961223466488384,
+      "grad_norm": 0.03397015482187271,
+      "learning_rate": 4.169042428307899e-05,
+      "loss": 0.6557,
+      "step": 23700
+    },
+    {
+      "epoch": 3.9779374895537356,
+      "grad_norm": 0.03334665298461914,
+      "learning_rate": 4.101962099614288e-05,
+      "loss": 0.6606,
+      "step": 23800
+    },
+    {
+      "epoch": 3.9946515126190874,
+      "grad_norm": 0.033896464854478836,
+      "learning_rate": 4.034881770920678e-05,
+      "loss": 0.6584,
+      "step": 23900
+    },
+    {
+      "epoch": 4.0113655356844395,
+      "grad_norm": 0.03485192731022835,
+      "learning_rate": 3.967801442227067e-05,
+      "loss": 0.6492,
+      "step": 24000
+    },
+    {
+      "epoch": 4.028079558749791,
+      "grad_norm": 0.0341389924287796,
+      "learning_rate": 3.9007211135334565e-05,
+      "loss": 0.6461,
+      "step": 24100
+    },
+    {
+      "epoch": 4.044793581815143,
+      "grad_norm": 0.03344714641571045,
+      "learning_rate": 3.833640784839846e-05,
+      "loss": 0.6475,
+      "step": 24200
+    },
+    {
+      "epoch": 4.061507604880495,
+      "grad_norm": 0.03415651619434357,
+      "learning_rate": 3.7665604561462354e-05,
+      "loss": 0.6533,
+      "step": 24300
+    },
+    {
+      "epoch": 4.0782216279458465,
+      "grad_norm": 0.03357802331447601,
+      "learning_rate": 3.699480127452625e-05,
+      "loss": 0.6513,
+      "step": 24400
+    },
+    {
+      "epoch": 4.094935651011198,
+      "grad_norm": 0.03374486416578293,
+      "learning_rate": 3.632399798759014e-05,
+      "loss": 0.6456,
+      "step": 24500
+    },
+    {
+      "epoch": 4.11164967407655,
+      "grad_norm": 0.03407549113035202,
+      "learning_rate": 3.5653194700654036e-05,
+      "loss": 0.651,
+      "step": 24600
+    },
+    {
+      "epoch": 4.128363697141902,
+      "grad_norm": 0.03402148187160492,
+      "learning_rate": 3.498239141371793e-05,
+      "loss": 0.6461,
+      "step": 24700
+    },
+    {
+      "epoch": 4.1450777202072535,
+      "grad_norm": 0.03708890080451965,
+      "learning_rate": 3.4311588126781824e-05,
+      "loss": 0.652,
+      "step": 24800
+    },
+    {
+      "epoch": 4.161791743272605,
+      "grad_norm": 0.034347113221883774,
+      "learning_rate": 3.364078483984572e-05,
+      "loss": 0.6509,
+      "step": 24900
+    },
+    {
+      "epoch": 4.178505766337958,
+      "grad_norm": 0.0340665765106678,
+      "learning_rate": 3.296998155290961e-05,
+      "loss": 0.6451,
+      "step": 25000
+    },
+    {
+      "epoch": 4.19521978940331,
+      "grad_norm": 0.035704102367162704,
+      "learning_rate": 3.2299178265973506e-05,
+      "loss": 0.6507,
+      "step": 25100
+    },
+    {
+      "epoch": 4.211933812468661,
+      "grad_norm": 0.03486304730176926,
+      "learning_rate": 3.16283749790374e-05,
+      "loss": 0.6547,
+      "step": 25200
+    },
+    {
+      "epoch": 4.228647835534013,
+      "grad_norm": 0.03282959759235382,
+      "learning_rate": 3.0957571692101294e-05,
+      "loss": 0.6523,
+      "step": 25300
+    },
+    {
+      "epoch": 4.245361858599365,
+      "grad_norm": 0.035729847848415375,
+      "learning_rate": 3.0286768405165188e-05,
+      "loss": 0.6503,
+      "step": 25400
+    },
+    {
+      "epoch": 4.262075881664717,
+      "grad_norm": 0.03634531795978546,
+      "learning_rate": 2.9615965118229082e-05,
+      "loss": 0.6493,
+      "step": 25500
+    },
+    {
+      "epoch": 4.278789904730068,
+      "grad_norm": 0.033321358263492584,
+      "learning_rate": 2.8945161831292976e-05,
+      "loss": 0.6492,
+      "step": 25600
+    },
+    {
+      "epoch": 4.29550392779542,
+      "grad_norm": 0.03541552275419235,
+      "learning_rate": 2.8274358544356867e-05,
+      "loss": 0.6466,
+      "step": 25700
+    },
+    {
+      "epoch": 4.312217950860772,
+      "grad_norm": 0.03528020158410072,
+      "learning_rate": 2.760355525742076e-05,
+      "loss": 0.6549,
+      "step": 25800
+    },
+    {
+      "epoch": 4.328931973926124,
+      "grad_norm": 0.03419233486056328,
+      "learning_rate": 2.6932751970484655e-05,
+      "loss": 0.6502,
+      "step": 25900
+    },
+    {
+      "epoch": 4.345645996991476,
+      "grad_norm": 0.03410422429442406,
+      "learning_rate": 2.626194868354855e-05,
+      "loss": 0.656,
+      "step": 26000
+    },
+    {
+      "epoch": 4.362360020056828,
+      "grad_norm": 0.033916935324668884,
+      "learning_rate": 2.5591145396612443e-05,
+      "loss": 0.643,
+      "step": 26100
+    },
+    {
+      "epoch": 4.37907404312218,
+      "grad_norm": 0.034409794956445694,
+      "learning_rate": 2.492034210967634e-05,
+      "loss": 0.6445,
+      "step": 26200
+    },
+    {
+      "epoch": 4.3957880661875315,
+      "grad_norm": 0.03435683995485306,
+      "learning_rate": 2.4249538822740235e-05,
+      "loss": 0.6493,
+      "step": 26300
+    },
+    {
+      "epoch": 4.412502089252883,
+      "grad_norm": 0.03483356907963753,
+      "learning_rate": 2.357873553580413e-05,
+      "loss": 0.649,
+      "step": 26400
+    },
+    {
+      "epoch": 4.429216112318235,
+      "grad_norm": 0.0342116504907608,
+      "learning_rate": 2.290793224886802e-05,
+      "loss": 0.6516,
+      "step": 26500
+    },
+    {
+      "epoch": 4.445930135383587,
+      "grad_norm": 0.035094503313302994,
+      "learning_rate": 2.2237128961931914e-05,
+      "loss": 0.6509,
+      "step": 26600
+    },
+    {
+      "epoch": 4.4626441584489385,
+      "grad_norm": 0.035515137016773224,
+      "learning_rate": 2.1566325674995808e-05,
+      "loss": 0.6491,
+      "step": 26700
+    },
+    {
+      "epoch": 4.47935818151429,
+      "grad_norm": 0.033778801560401917,
+      "learning_rate": 2.0895522388059702e-05,
+      "loss": 0.6516,
+      "step": 26800
+    },
+    {
+      "epoch": 4.496072204579642,
+      "grad_norm": 0.03408665210008621,
+      "learning_rate": 2.0224719101123596e-05,
+      "loss": 0.6482,
+      "step": 26900
+    },
+    {
+      "epoch": 4.512786227644995,
+      "grad_norm": 0.03422163799405098,
+      "learning_rate": 1.955391581418749e-05,
+      "loss": 0.6488,
+      "step": 27000
+    },
+    {
+      "epoch": 4.529500250710346,
+      "grad_norm": 0.03455764427781105,
+      "learning_rate": 1.8883112527251384e-05,
+      "loss": 0.6473,
+      "step": 27100
+    },
+    {
+      "epoch": 4.546214273775698,
+      "grad_norm": 0.033868152648210526,
+      "learning_rate": 1.8212309240315278e-05,
+      "loss": 0.6508,
+      "step": 27200
+    },
+    {
+      "epoch": 4.56292829684105,
+      "grad_norm": 0.03507550060749054,
+      "learning_rate": 1.7541505953379172e-05,
+      "loss": 0.6468,
+      "step": 27300
+    },
+    {
+      "epoch": 4.579642319906402,
+      "grad_norm": 0.03554074466228485,
+      "learning_rate": 1.6870702666443066e-05,
+      "loss": 0.6456,
+      "step": 27400
+    },
+    {
+      "epoch": 4.596356342971753,
+      "grad_norm": 0.03559200465679169,
+      "learning_rate": 1.619989937950696e-05,
+      "loss": 0.6544,
+      "step": 27500
+    },
+    {
+      "epoch": 4.613070366037105,
+      "grad_norm": 0.03546106070280075,
+      "learning_rate": 1.5529096092570854e-05,
+      "loss": 0.646,
+      "step": 27600
+    },
+    {
+      "epoch": 4.629784389102457,
+      "grad_norm": 0.03442246466875076,
+      "learning_rate": 1.4858292805634748e-05,
+      "loss": 0.6523,
+      "step": 27700
+    },
+    {
+      "epoch": 4.646498412167809,
+      "grad_norm": 0.03322317451238632,
+      "learning_rate": 1.4187489518698644e-05,
+      "loss": 0.6483,
+      "step": 27800
+    },
+    {
+      "epoch": 4.66321243523316,
+      "grad_norm": 0.0362270288169384,
+      "learning_rate": 1.3516686231762538e-05,
+      "loss": 0.649,
+      "step": 27900
+    },
+    {
+      "epoch": 4.679926458298512,
+      "grad_norm": 0.03510970249772072,
+      "learning_rate": 1.2845882944826429e-05,
+      "loss": 0.6461,
+      "step": 28000
+    },
+    {
+      "epoch": 4.696640481363865,
+      "grad_norm": 0.03399231657385826,
+      "learning_rate": 1.2175079657890325e-05,
+      "loss": 0.6491,
+      "step": 28100
+    },
+    {
+      "epoch": 4.713354504429216,
+      "grad_norm": 0.03436035290360451,
+      "learning_rate": 1.1504276370954219e-05,
+      "loss": 0.6499,
+      "step": 28200
+    },
+    {
+      "epoch": 4.730068527494568,
+      "grad_norm": 0.034751422703266144,
+      "learning_rate": 1.0833473084018113e-05,
+      "loss": 0.6476,
+      "step": 28300
+    },
+    {
+      "epoch": 4.74678255055992,
+      "grad_norm": 0.034067828208208084,
+      "learning_rate": 1.0162669797082005e-05,
+      "loss": 0.6463,
+      "step": 28400
+    },
+    {
+      "epoch": 4.763496573625272,
+      "grad_norm": 0.03397444635629654,
+      "learning_rate": 9.4918665101459e-06,
+      "loss": 0.6487,
+      "step": 28500
+    },
+    {
+      "epoch": 4.780210596690623,
+      "grad_norm": 0.03437269851565361,
+      "learning_rate": 8.821063223209793e-06,
+      "loss": 0.6477,
+      "step": 28600
+    },
+    {
+      "epoch": 4.796924619755975,
+      "grad_norm": 0.034697502851486206,
+      "learning_rate": 8.150259936273687e-06,
+      "loss": 0.6488,
+      "step": 28700
+    },
+    {
+      "epoch": 4.813638642821327,
+      "grad_norm": 0.03559542074799538,
+      "learning_rate": 7.479456649337582e-06,
+      "loss": 0.6492,
+      "step": 28800
+    },
+    {
+      "epoch": 4.830352665886679,
+      "grad_norm": 0.03439110890030861,
+      "learning_rate": 6.808653362401476e-06,
+      "loss": 0.6462,
+      "step": 28900
+    },
+    {
+      "epoch": 4.84706668895203,
+      "grad_norm": 0.03447462245821953,
+      "learning_rate": 6.13785007546537e-06,
+      "loss": 0.6499,
+      "step": 29000
+    },
+    {
+      "epoch": 4.863780712017382,
+      "grad_norm": 0.03418246656656265,
+      "learning_rate": 5.467046788529264e-06,
+      "loss": 0.6492,
+      "step": 29100
+    },
+    {
+      "epoch": 4.880494735082735,
+      "grad_norm": 0.03533853963017464,
+      "learning_rate": 4.796243501593159e-06,
+      "loss": 0.6513,
+      "step": 29200
+    },
+    {
+      "epoch": 4.8972087581480865,
+      "grad_norm": 0.03379116207361221,
+      "learning_rate": 4.125440214657052e-06,
+      "loss": 0.6491,
+      "step": 29300
+    },
+    {
+      "epoch": 4.913922781213438,
+      "grad_norm": 0.03501541167497635,
+      "learning_rate": 3.454636927720946e-06,
+      "loss": 0.6444,
+      "step": 29400
+    },
+    {
+      "epoch": 4.93063680427879,
+      "grad_norm": 0.03520382195711136,
+      "learning_rate": 2.78383364078484e-06,
+      "loss": 0.6499,
+      "step": 29500
+    },
+    {
+      "epoch": 4.809879753006175,
+      "grad_norm": 0.03432910144329071,
+      "learning_rate": 7.629605477665471e-06,
+      "loss": 0.6493,
+      "step": 29600
+    },
+    {
+      "epoch": 4.826129346766331,
+      "grad_norm": 0.03554558381438255,
+      "learning_rate": 6.9775024453863715e-06,
+      "loss": 0.6423,
+      "step": 29700
+    },
+    {
+      "epoch": 4.842378940526487,
+      "grad_norm": 0.03414788842201233,
+      "learning_rate": 6.3253994131072716e-06,
+      "loss": 0.6475,
+      "step": 29800
+    },
+    {
+      "epoch": 4.858628534286643,
+      "grad_norm": 0.03513456508517265,
+      "learning_rate": 5.673296380828172e-06,
+      "loss": 0.6474,
+      "step": 29900
+    },
+    {
+      "epoch": 4.874878128046799,
+      "grad_norm": 0.03595611825585365,
+      "learning_rate": 5.021193348549072e-06,
+      "loss": 0.6485,
+      "step": 30000
+    },
+    {
+      "epoch": 4.891127721806955,
+      "grad_norm": 0.035549987107515335,
+      "learning_rate": 4.369090316269971e-06,
+      "loss": 0.649,
+      "step": 30100
+    },
+    {
+      "epoch": 4.9073773155671105,
+      "grad_norm": 0.03475033864378929,
+      "learning_rate": 3.7169872839908704e-06,
+      "loss": 0.6468,
+      "step": 30200
+    },
+    {
+      "epoch": 4.923626909327266,
+      "grad_norm": 0.03467612341046333,
+      "learning_rate": 3.0648842517117705e-06,
+      "loss": 0.6404,
+      "step": 30300
+    },
+    {
+      "epoch": 4.939876503087422,
+      "grad_norm": 0.03498971089720726,
+      "learning_rate": 2.4127812194326705e-06,
+      "loss": 0.6544,
+      "step": 30400
+    },
+    {
+      "epoch": 4.956126096847579,
+      "grad_norm": 0.03452787175774574,
+      "learning_rate": 1.7606781871535704e-06,
+      "loss": 0.6491,
+      "step": 30500
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 30770,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.120160458945331e+19,
+  "train_batch_size": 5,
+  "trial_name": null,
+  "trial_params": null
+}