{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9997069597069597,
  "eval_steps": 100,
  "global_step": 1706,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005860805860805861,
      "grad_norm": 2.521568536758423,
      "learning_rate": 0.00019941383352872216,
      "loss": 2.3132,
      "step": 5
    },
    {
      "epoch": 0.011721611721611722,
      "grad_norm": 2.104935884475708,
      "learning_rate": 0.00019882766705744433,
      "loss": 0.9444,
      "step": 10
    },
    {
      "epoch": 0.017582417582417582,
      "grad_norm": 1.2950881719589233,
      "learning_rate": 0.00019824150058616647,
      "loss": 0.357,
      "step": 15
    },
    {
      "epoch": 0.023443223443223443,
      "grad_norm": 1.1862170696258545,
      "learning_rate": 0.00019765533411488865,
      "loss": 0.2105,
      "step": 20
    },
    {
      "epoch": 0.029304029304029304,
      "grad_norm": 0.5247148871421814,
      "learning_rate": 0.0001970691676436108,
      "loss": 0.1087,
      "step": 25
    },
    {
      "epoch": 0.035164835164835165,
      "grad_norm": 1.2535285949707031,
      "learning_rate": 0.00019648300117233296,
      "loss": 0.1185,
      "step": 30
    },
    {
      "epoch": 0.041025641025641026,
      "grad_norm": 0.5407606959342957,
      "learning_rate": 0.0001958968347010551,
      "loss": 0.0775,
      "step": 35
    },
    {
      "epoch": 0.046886446886446886,
      "grad_norm": 1.137266993522644,
      "learning_rate": 0.00019531066822977726,
      "loss": 0.0728,
      "step": 40
    },
    {
      "epoch": 0.05274725274725275,
      "grad_norm": 0.5244052410125732,
      "learning_rate": 0.00019472450175849943,
      "loss": 0.056,
      "step": 45
    },
    {
      "epoch": 0.05860805860805861,
      "grad_norm": 0.4511496126651764,
      "learning_rate": 0.00019413833528722157,
      "loss": 0.0648,
      "step": 50
    },
    {
      "epoch": 0.06446886446886448,
      "grad_norm": 0.33913975954055786,
      "learning_rate": 0.00019355216881594375,
      "loss": 0.0479,
      "step": 55
    },
    {
      "epoch": 0.07032967032967033,
      "grad_norm": 0.354777991771698,
      "learning_rate": 0.0001929660023446659,
      "loss": 0.0758,
      "step": 60
    },
    {
      "epoch": 0.0761904761904762,
      "grad_norm": 0.3968910276889801,
      "learning_rate": 0.00019237983587338807,
      "loss": 0.0776,
      "step": 65
    },
    {
      "epoch": 0.08205128205128205,
      "grad_norm": 0.5061652660369873,
      "learning_rate": 0.0001917936694021102,
      "loss": 0.0349,
      "step": 70
    },
    {
      "epoch": 0.08791208791208792,
      "grad_norm": 0.24455250799655914,
      "learning_rate": 0.00019120750293083236,
      "loss": 0.0369,
      "step": 75
    },
    {
      "epoch": 0.09377289377289377,
      "grad_norm": 0.3959537446498871,
      "learning_rate": 0.00019062133645955453,
      "loss": 0.0513,
      "step": 80
    },
    {
      "epoch": 0.09963369963369964,
      "grad_norm": 0.4534469544887543,
      "learning_rate": 0.00019003516998827668,
      "loss": 0.0459,
      "step": 85
    },
    {
      "epoch": 0.1054945054945055,
      "grad_norm": 0.30694451928138733,
      "learning_rate": 0.00018944900351699885,
      "loss": 0.0377,
      "step": 90
    },
    {
      "epoch": 0.11135531135531136,
      "grad_norm": 0.15078052878379822,
      "learning_rate": 0.000188862837045721,
      "loss": 0.0297,
      "step": 95
    },
    {
      "epoch": 0.11721611721611722,
      "grad_norm": 0.3153330981731415,
      "learning_rate": 0.00018827667057444317,
      "loss": 0.0301,
      "step": 100
    },
    {
      "epoch": 0.11721611721611722,
      "eval_loss": 0.028654273599386215,
      "eval_runtime": 4.8016,
      "eval_samples_per_second": 5.831,
      "eval_steps_per_second": 2.916,
      "step": 100
    },
    {
      "epoch": 0.12307692307692308,
      "grad_norm": 0.5777396559715271,
      "learning_rate": 0.0001876905041031653,
      "loss": 0.0515,
      "step": 105
    },
    {
      "epoch": 0.12893772893772895,
      "grad_norm": 0.3677718937397003,
      "learning_rate": 0.00018710433763188746,
      "loss": 0.0315,
      "step": 110
    },
    {
      "epoch": 0.1347985347985348,
      "grad_norm": 0.30658507347106934,
      "learning_rate": 0.0001865181711606096,
      "loss": 0.0248,
      "step": 115
    },
    {
      "epoch": 0.14065934065934066,
      "grad_norm": 0.21649648249149323,
      "learning_rate": 0.00018593200468933178,
      "loss": 0.0352,
      "step": 120
    },
    {
      "epoch": 0.14652014652014653,
      "grad_norm": 0.2877885103225708,
      "learning_rate": 0.00018534583821805395,
      "loss": 0.0456,
      "step": 125
    },
    {
      "epoch": 0.1523809523809524,
      "grad_norm": 0.1782904863357544,
      "learning_rate": 0.0001847596717467761,
      "loss": 0.0257,
      "step": 130
    },
    {
      "epoch": 0.15824175824175823,
      "grad_norm": 0.1211300641298294,
      "learning_rate": 0.00018417350527549827,
      "loss": 0.0437,
      "step": 135
    },
    {
      "epoch": 0.1641025641025641,
      "grad_norm": 0.298053503036499,
      "learning_rate": 0.0001835873388042204,
      "loss": 0.036,
      "step": 140
    },
    {
      "epoch": 0.16996336996336997,
      "grad_norm": 0.2280658483505249,
      "learning_rate": 0.00018300117233294256,
      "loss": 0.0331,
      "step": 145
    },
    {
      "epoch": 0.17582417582417584,
      "grad_norm": 0.14896267652511597,
      "learning_rate": 0.0001824150058616647,
      "loss": 0.0239,
      "step": 150
    },
    {
      "epoch": 0.18168498168498168,
      "grad_norm": 0.1769961267709732,
      "learning_rate": 0.00018182883939038688,
      "loss": 0.0375,
      "step": 155
    },
    {
      "epoch": 0.18754578754578755,
      "grad_norm": 0.19849297404289246,
      "learning_rate": 0.00018124267291910902,
      "loss": 0.0357,
      "step": 160
    },
    {
      "epoch": 0.1934065934065934,
      "grad_norm": 0.2294420450925827,
      "learning_rate": 0.0001806565064478312,
      "loss": 0.0504,
      "step": 165
    },
    {
      "epoch": 0.19926739926739928,
      "grad_norm": 0.10744224488735199,
      "learning_rate": 0.00018007033997655337,
      "loss": 0.0209,
      "step": 170
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 0.06066066771745682,
      "learning_rate": 0.00017948417350527551,
      "loss": 0.0175,
      "step": 175
    },
    {
      "epoch": 0.210989010989011,
      "grad_norm": 0.5421485304832458,
      "learning_rate": 0.00017889800703399766,
      "loss": 0.0398,
      "step": 180
    },
    {
      "epoch": 0.21684981684981686,
      "grad_norm": 0.1306767612695694,
      "learning_rate": 0.0001783118405627198,
      "loss": 0.0258,
      "step": 185
    },
    {
      "epoch": 0.22271062271062272,
      "grad_norm": 0.11510124802589417,
      "learning_rate": 0.00017772567409144198,
      "loss": 0.0253,
      "step": 190
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 0.20577751100063324,
      "learning_rate": 0.00017713950762016412,
      "loss": 0.0277,
      "step": 195
    },
    {
      "epoch": 0.23443223443223443,
      "grad_norm": 0.20588932931423187,
      "learning_rate": 0.0001765533411488863,
      "loss": 0.0296,
      "step": 200
    },
    {
      "epoch": 0.23443223443223443,
      "eval_loss": 0.019306689500808716,
      "eval_runtime": 4.7344,
      "eval_samples_per_second": 5.914,
      "eval_steps_per_second": 2.957,
      "step": 200
    },
    {
      "epoch": 0.2402930402930403,
      "grad_norm": 0.13713975250720978,
      "learning_rate": 0.00017596717467760847,
      "loss": 0.0372,
      "step": 205
    },
    {
      "epoch": 0.24615384615384617,
      "grad_norm": 0.14788508415222168,
      "learning_rate": 0.00017538100820633061,
      "loss": 0.033,
      "step": 210
    },
    {
      "epoch": 0.252014652014652,
      "grad_norm": 0.2372630536556244,
      "learning_rate": 0.00017479484173505276,
      "loss": 0.0281,
      "step": 215
    },
    {
      "epoch": 0.2578754578754579,
      "grad_norm": 0.2357954978942871,
      "learning_rate": 0.0001742086752637749,
      "loss": 0.0295,
      "step": 220
    },
    {
      "epoch": 0.26373626373626374,
      "grad_norm": 0.2537606358528137,
      "learning_rate": 0.00017362250879249708,
      "loss": 0.036,
      "step": 225
    },
    {
      "epoch": 0.2695970695970696,
      "grad_norm": 0.2222289741039276,
      "learning_rate": 0.00017303634232121922,
      "loss": 0.0402,
      "step": 230
    },
    {
      "epoch": 0.2754578754578755,
      "grad_norm": 0.19696177542209625,
      "learning_rate": 0.0001724501758499414,
      "loss": 0.025,
      "step": 235
    },
    {
      "epoch": 0.2813186813186813,
      "grad_norm": 0.08915109932422638,
      "learning_rate": 0.00017186400937866357,
      "loss": 0.0292,
      "step": 240
    },
    {
      "epoch": 0.28717948717948716,
      "grad_norm": 0.39625948667526245,
      "learning_rate": 0.00017127784290738572,
      "loss": 0.0324,
      "step": 245
    },
    {
      "epoch": 0.29304029304029305,
      "grad_norm": 0.05654177442193031,
      "learning_rate": 0.00017069167643610786,
      "loss": 0.0384,
      "step": 250
    },
    {
      "epoch": 0.2989010989010989,
      "grad_norm": 0.23707075417041779,
      "learning_rate": 0.00017010550996483,
      "loss": 0.0311,
      "step": 255
    },
    {
      "epoch": 0.3047619047619048,
      "grad_norm": 0.2619571387767792,
      "learning_rate": 0.00016951934349355218,
      "loss": 0.0275,
      "step": 260
    },
    {
      "epoch": 0.31062271062271063,
      "grad_norm": 0.11028550565242767,
      "learning_rate": 0.00016893317702227432,
      "loss": 0.0194,
      "step": 265
    },
    {
      "epoch": 0.31648351648351647,
      "grad_norm": 0.26710912585258484,
      "learning_rate": 0.0001683470105509965,
      "loss": 0.0318,
      "step": 270
    },
    {
      "epoch": 0.32234432234432236,
      "grad_norm": 0.20064710080623627,
      "learning_rate": 0.00016776084407971864,
      "loss": 0.0517,
      "step": 275
    },
    {
      "epoch": 0.3282051282051282,
      "grad_norm": 0.06760745495557785,
      "learning_rate": 0.00016717467760844082,
      "loss": 0.0223,
      "step": 280
    },
    {
      "epoch": 0.33406593406593404,
      "grad_norm": 0.14518442749977112,
      "learning_rate": 0.00016658851113716296,
      "loss": 0.0216,
      "step": 285
    },
    {
      "epoch": 0.33992673992673994,
      "grad_norm": 0.35427016019821167,
      "learning_rate": 0.0001660023446658851,
      "loss": 0.0268,
      "step": 290
    },
    {
      "epoch": 0.3457875457875458,
      "grad_norm": 0.14693213999271393,
      "learning_rate": 0.00016541617819460728,
      "loss": 0.0246,
      "step": 295
    },
    {
      "epoch": 0.3516483516483517,
      "grad_norm": 0.2351713478565216,
      "learning_rate": 0.00016483001172332943,
      "loss": 0.0399,
      "step": 300
    },
    {
      "epoch": 0.3516483516483517,
      "eval_loss": 0.02264154888689518,
      "eval_runtime": 4.8013,
      "eval_samples_per_second": 5.832,
      "eval_steps_per_second": 2.916,
      "step": 300
    },
    {
      "epoch": 0.3575091575091575,
      "grad_norm": 0.16985514760017395,
      "learning_rate": 0.0001642438452520516,
      "loss": 0.0243,
      "step": 305
    },
    {
      "epoch": 0.36336996336996336,
      "grad_norm": 0.199642613530159,
      "learning_rate": 0.00016365767878077374,
      "loss": 0.0329,
      "step": 310
    },
    {
      "epoch": 0.36923076923076925,
      "grad_norm": 0.05538804084062576,
      "learning_rate": 0.00016307151230949592,
      "loss": 0.0474,
      "step": 315
    },
    {
      "epoch": 0.3750915750915751,
      "grad_norm": 0.1542443037033081,
      "learning_rate": 0.00016248534583821806,
      "loss": 0.0232,
      "step": 320
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.16808335483074188,
      "learning_rate": 0.0001618991793669402,
      "loss": 0.0272,
      "step": 325
    },
    {
      "epoch": 0.3868131868131868,
      "grad_norm": 0.20816853642463684,
      "learning_rate": 0.00016131301289566238,
      "loss": 0.0375,
      "step": 330
    },
    {
      "epoch": 0.39267399267399267,
      "grad_norm": 0.10338038206100464,
      "learning_rate": 0.00016072684642438453,
      "loss": 0.0233,
      "step": 335
    },
    {
      "epoch": 0.39853479853479856,
      "grad_norm": 0.06522126495838165,
      "learning_rate": 0.0001601406799531067,
      "loss": 0.0288,
      "step": 340
    },
    {
      "epoch": 0.4043956043956044,
      "grad_norm": 0.11830403655767441,
      "learning_rate": 0.00015955451348182884,
      "loss": 0.0164,
      "step": 345
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.2777006924152374,
      "learning_rate": 0.00015896834701055102,
      "loss": 0.0345,
      "step": 350
    },
    {
      "epoch": 0.41611721611721614,
      "grad_norm": 0.1277918815612793,
      "learning_rate": 0.00015838218053927316,
      "loss": 0.0229,
      "step": 355
    },
    {
      "epoch": 0.421978021978022,
      "grad_norm": 0.09861145913600922,
      "learning_rate": 0.0001577960140679953,
      "loss": 0.0181,
      "step": 360
    },
    {
      "epoch": 0.4278388278388278,
      "grad_norm": 0.08698171377182007,
      "learning_rate": 0.00015720984759671748,
      "loss": 0.0365,
      "step": 365
    },
    {
      "epoch": 0.4336996336996337,
      "grad_norm": 0.23488883674144745,
      "learning_rate": 0.00015662368112543963,
      "loss": 0.0352,
      "step": 370
    },
    {
      "epoch": 0.43956043956043955,
      "grad_norm": 0.05140375718474388,
      "learning_rate": 0.0001560375146541618,
      "loss": 0.0235,
      "step": 375
    },
    {
      "epoch": 0.44542124542124545,
      "grad_norm": 0.15759135782718658,
      "learning_rate": 0.00015545134818288395,
      "loss": 0.0236,
      "step": 380
    },
    {
      "epoch": 0.4512820512820513,
      "grad_norm": 0.07523085922002792,
      "learning_rate": 0.00015486518171160612,
      "loss": 0.0218,
      "step": 385
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 0.1892630159854889,
      "learning_rate": 0.00015427901524032826,
      "loss": 0.0264,
      "step": 390
    },
    {
      "epoch": 0.463003663003663,
      "grad_norm": 0.12749487161636353,
      "learning_rate": 0.0001536928487690504,
      "loss": 0.0414,
      "step": 395
    },
    {
      "epoch": 0.46886446886446886,
      "grad_norm": 0.11519593745470047,
      "learning_rate": 0.00015310668229777258,
      "loss": 0.0173,
      "step": 400
    },
    {
      "epoch": 0.46886446886446886,
      "eval_loss": 0.01765686459839344,
      "eval_runtime": 4.8182,
      "eval_samples_per_second": 5.811,
      "eval_steps_per_second": 2.906,
      "step": 400
    },
    {
      "epoch": 0.4747252747252747,
      "grad_norm": 0.14240577816963196,
      "learning_rate": 0.00015252051582649473,
      "loss": 0.0214,
      "step": 405
    },
    {
      "epoch": 0.4805860805860806,
      "grad_norm": 0.11283282935619354,
      "learning_rate": 0.0001519343493552169,
      "loss": 0.0266,
      "step": 410
    },
    {
      "epoch": 0.48644688644688644,
      "grad_norm": 0.16279707849025726,
      "learning_rate": 0.00015134818288393905,
      "loss": 0.0274,
      "step": 415
    },
    {
      "epoch": 0.49230769230769234,
      "grad_norm": 0.07319923490285873,
      "learning_rate": 0.00015076201641266122,
      "loss": 0.0216,
      "step": 420
    },
    {
      "epoch": 0.4981684981684982,
      "grad_norm": 0.13532328605651855,
      "learning_rate": 0.00015017584994138336,
      "loss": 0.0308,
      "step": 425
    },
    {
      "epoch": 0.504029304029304,
      "grad_norm": 0.12896127998828888,
      "learning_rate": 0.0001495896834701055,
      "loss": 0.0129,
      "step": 430
    },
    {
      "epoch": 0.5098901098901099,
      "grad_norm": 0.03718201071023941,
      "learning_rate": 0.00014900351699882766,
      "loss": 0.0181,
      "step": 435
    },
    {
      "epoch": 0.5157509157509158,
      "grad_norm": 0.05926808714866638,
      "learning_rate": 0.00014841735052754983,
      "loss": 0.0146,
      "step": 440
    },
    {
      "epoch": 0.5216117216117216,
      "grad_norm": 0.09976931661367416,
      "learning_rate": 0.00014783118405627197,
      "loss": 0.0231,
      "step": 445
    },
    {
      "epoch": 0.5274725274725275,
      "grad_norm": 0.18460391461849213,
      "learning_rate": 0.00014724501758499415,
      "loss": 0.02,
      "step": 450
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.07457377016544342,
      "learning_rate": 0.00014665885111371632,
      "loss": 0.0209,
      "step": 455
    },
    {
      "epoch": 0.5391941391941392,
      "grad_norm": 0.03992030769586563,
      "learning_rate": 0.00014607268464243847,
      "loss": 0.0251,
      "step": 460
    },
    {
      "epoch": 0.545054945054945,
      "grad_norm": 0.196414053440094,
      "learning_rate": 0.0001454865181711606,
      "loss": 0.0344,
      "step": 465
    },
    {
      "epoch": 0.550915750915751,
      "grad_norm": 0.19978556036949158,
      "learning_rate": 0.00014490035169988276,
      "loss": 0.0159,
      "step": 470
    },
    {
      "epoch": 0.5567765567765568,
      "grad_norm": 0.11389517784118652,
      "learning_rate": 0.00014431418522860493,
      "loss": 0.0197,
      "step": 475
    },
    {
      "epoch": 0.5626373626373626,
      "grad_norm": 0.07047716528177261,
      "learning_rate": 0.00014372801875732708,
      "loss": 0.0138,
      "step": 480
    },
    {
      "epoch": 0.5684981684981685,
      "grad_norm": 0.10614708811044693,
      "learning_rate": 0.00014314185228604925,
      "loss": 0.0208,
      "step": 485
    },
    {
      "epoch": 0.5743589743589743,
      "grad_norm": 0.20569799840450287,
      "learning_rate": 0.00014255568581477142,
      "loss": 0.0203,
      "step": 490
    },
    {
      "epoch": 0.5802197802197803,
      "grad_norm": 0.19309553503990173,
      "learning_rate": 0.00014196951934349357,
      "loss": 0.0282,
      "step": 495
    },
    {
      "epoch": 0.5860805860805861,
      "grad_norm": 0.07542768865823746,
      "learning_rate": 0.0001413833528722157,
      "loss": 0.0173,
      "step": 500
    },
    {
      "epoch": 0.5860805860805861,
      "eval_loss": 0.022126102820038795,
      "eval_runtime": 4.8755,
      "eval_samples_per_second": 5.743,
      "eval_steps_per_second": 2.872,
      "step": 500
    },
    {
      "epoch": 0.591941391941392,
      "grad_norm": 0.1205630674958229,
      "learning_rate": 0.00014079718640093786,
      "loss": 0.0322,
      "step": 505
    },
    {
      "epoch": 0.5978021978021978,
      "grad_norm": 0.13761042058467865,
      "learning_rate": 0.00014021101992966003,
      "loss": 0.0203,
      "step": 510
    },
    {
      "epoch": 0.6036630036630036,
      "grad_norm": 0.08595598489046097,
      "learning_rate": 0.00013962485345838218,
      "loss": 0.0145,
      "step": 515
    },
    {
      "epoch": 0.6095238095238096,
      "grad_norm": 0.11087319999933243,
      "learning_rate": 0.00013903868698710435,
      "loss": 0.0218,
      "step": 520
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.14962054789066315,
      "learning_rate": 0.00013845252051582652,
      "loss": 0.0322,
      "step": 525
    },
    {
      "epoch": 0.6212454212454213,
      "grad_norm": 0.07573894411325455,
      "learning_rate": 0.00013786635404454867,
      "loss": 0.0275,
      "step": 530
    },
    {
      "epoch": 0.6271062271062271,
      "grad_norm": 0.069780133664608,
      "learning_rate": 0.0001372801875732708,
      "loss": 0.0235,
      "step": 535
    },
    {
      "epoch": 0.6329670329670329,
      "grad_norm": 0.07833613455295563,
      "learning_rate": 0.00013669402110199296,
      "loss": 0.0344,
      "step": 540
    },
    {
      "epoch": 0.6388278388278388,
      "grad_norm": 0.07331829518079758,
      "learning_rate": 0.00013610785463071513,
      "loss": 0.0135,
      "step": 545
    },
    {
      "epoch": 0.6446886446886447,
      "grad_norm": 0.22369089722633362,
      "learning_rate": 0.00013552168815943728,
      "loss": 0.0222,
      "step": 550
    },
    {
      "epoch": 0.6505494505494506,
      "grad_norm": 0.1465146392583847,
      "learning_rate": 0.00013493552168815945,
      "loss": 0.0307,
      "step": 555
    },
    {
      "epoch": 0.6564102564102564,
      "grad_norm": 0.06348715722560883,
      "learning_rate": 0.00013434935521688162,
      "loss": 0.0358,
      "step": 560
    },
    {
      "epoch": 0.6622710622710622,
      "grad_norm": 0.09298256784677505,
      "learning_rate": 0.00013376318874560377,
      "loss": 0.0224,
      "step": 565
    },
    {
      "epoch": 0.6681318681318681,
      "grad_norm": 0.18280836939811707,
      "learning_rate": 0.00013317702227432591,
      "loss": 0.0263,
      "step": 570
    },
    {
      "epoch": 0.673992673992674,
      "grad_norm": 0.07080171257257462,
      "learning_rate": 0.00013259085580304806,
      "loss": 0.0192,
      "step": 575
    },
    {
      "epoch": 0.6798534798534799,
      "grad_norm": 0.11019092798233032,
      "learning_rate": 0.00013200468933177023,
      "loss": 0.0211,
      "step": 580
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 0.13162659108638763,
      "learning_rate": 0.00013141852286049238,
      "loss": 0.0284,
      "step": 585
    },
    {
      "epoch": 0.6915750915750916,
      "grad_norm": 0.19205588102340698,
      "learning_rate": 0.00013083235638921455,
      "loss": 0.0245,
      "step": 590
    },
    {
      "epoch": 0.6974358974358974,
      "grad_norm": 0.0971173569560051,
      "learning_rate": 0.0001302461899179367,
      "loss": 0.0216,
      "step": 595
    },
    {
      "epoch": 0.7032967032967034,
      "grad_norm": 0.2179749310016632,
      "learning_rate": 0.00012966002344665887,
      "loss": 0.0268,
      "step": 600
    },
    {
      "epoch": 0.7032967032967034,
      "eval_loss": 0.01850169710814953,
      "eval_runtime": 4.6688,
      "eval_samples_per_second": 5.997,
      "eval_steps_per_second": 2.999,
      "step": 600
    },
    {
      "epoch": 0.7091575091575092,
      "grad_norm": 0.23079490661621094,
      "learning_rate": 0.00012907385697538101,
      "loss": 0.0269,
      "step": 605
    },
    {
      "epoch": 0.715018315018315,
      "grad_norm": 0.08325810730457306,
      "learning_rate": 0.00012848769050410316,
      "loss": 0.0211,
      "step": 610
    },
    {
      "epoch": 0.7208791208791209,
      "grad_norm": 0.05983910337090492,
      "learning_rate": 0.00012790152403282533,
      "loss": 0.0217,
      "step": 615
    },
    {
      "epoch": 0.7267399267399267,
      "grad_norm": 0.13831888139247894,
      "learning_rate": 0.00012731535756154748,
      "loss": 0.0142,
      "step": 620
    },
    {
      "epoch": 0.7326007326007326,
      "grad_norm": 0.12296965718269348,
      "learning_rate": 0.00012672919109026965,
      "loss": 0.0253,
      "step": 625
    },
    {
      "epoch": 0.7384615384615385,
      "grad_norm": 0.13777951896190643,
      "learning_rate": 0.0001261430246189918,
      "loss": 0.0214,
      "step": 630
    },
    {
      "epoch": 0.7443223443223443,
      "grad_norm": 0.12136834859848022,
      "learning_rate": 0.00012555685814771397,
      "loss": 0.0244,
      "step": 635
    },
    {
      "epoch": 0.7501831501831502,
      "grad_norm": 0.050576552748680115,
      "learning_rate": 0.00012497069167643612,
      "loss": 0.0137,
      "step": 640
    },
    {
      "epoch": 0.756043956043956,
      "grad_norm": 0.22222141921520233,
      "learning_rate": 0.00012438452520515826,
      "loss": 0.0254,
      "step": 645
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.06815624237060547,
      "learning_rate": 0.00012379835873388043,
      "loss": 0.0231,
      "step": 650
    },
    {
      "epoch": 0.7677655677655678,
      "grad_norm": 0.19518345594406128,
      "learning_rate": 0.00012321219226260258,
      "loss": 0.0218,
      "step": 655
    },
    {
      "epoch": 0.7736263736263737,
      "grad_norm": 0.06349798291921616,
      "learning_rate": 0.00012262602579132475,
      "loss": 0.0265,
      "step": 660
    },
    {
      "epoch": 0.7794871794871795,
      "grad_norm": 0.09031341969966888,
      "learning_rate": 0.00012203985932004688,
      "loss": 0.0299,
      "step": 665
    },
    {
      "epoch": 0.7853479853479853,
      "grad_norm": 0.060232892632484436,
      "learning_rate": 0.00012145369284876906,
      "loss": 0.0227,
      "step": 670
    },
    {
      "epoch": 0.7912087912087912,
      "grad_norm": 0.23972396552562714,
      "learning_rate": 0.00012086752637749122,
      "loss": 0.0233,
      "step": 675
    },
    {
      "epoch": 0.7970695970695971,
      "grad_norm": 0.06141636520624161,
      "learning_rate": 0.00012028135990621336,
      "loss": 0.017,
      "step": 680
    },
    {
      "epoch": 0.802930402930403,
      "grad_norm": 0.05603253096342087,
      "learning_rate": 0.00011969519343493553,
      "loss": 0.025,
      "step": 685
    },
    {
      "epoch": 0.8087912087912088,
      "grad_norm": 0.06840907782316208,
      "learning_rate": 0.0001191090269636577,
      "loss": 0.0164,
      "step": 690
    },
    {
      "epoch": 0.8146520146520146,
      "grad_norm": 0.1270790845155716,
      "learning_rate": 0.00011852286049237984,
      "loss": 0.0237,
      "step": 695
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.03222518041729927,
      "learning_rate": 0.00011793669402110198,
      "loss": 0.0169,
      "step": 700
    },
    {
      "epoch": 0.8205128205128205,
      "eval_loss": 0.026108432561159134,
      "eval_runtime": 4.6865,
      "eval_samples_per_second": 5.975,
      "eval_steps_per_second": 2.987,
      "step": 700
    },
    {
      "epoch": 0.8263736263736263,
      "grad_norm": 0.06273896247148514,
      "learning_rate": 0.00011735052754982416,
      "loss": 0.0245,
      "step": 705
    },
    {
      "epoch": 0.8322344322344323,
      "grad_norm": 0.1422451138496399,
      "learning_rate": 0.00011676436107854632,
      "loss": 0.0218,
      "step": 710
    },
    {
      "epoch": 0.8380952380952381,
      "grad_norm": 0.07166247069835663,
      "learning_rate": 0.00011617819460726846,
      "loss": 0.0259,
      "step": 715
    },
    {
      "epoch": 0.843956043956044,
      "grad_norm": 0.13188450038433075,
      "learning_rate": 0.00011559202813599064,
      "loss": 0.0185,
      "step": 720
    },
    {
      "epoch": 0.8498168498168498,
      "grad_norm": 0.11839079111814499,
      "learning_rate": 0.0001150058616647128,
      "loss": 0.0196,
      "step": 725
    },
    {
      "epoch": 0.8556776556776556,
      "grad_norm": 0.09421879053115845,
      "learning_rate": 0.00011441969519343494,
      "loss": 0.0207,
      "step": 730
    },
    {
      "epoch": 0.8615384615384616,
      "grad_norm": 0.11722107976675034,
      "learning_rate": 0.00011383352872215709,
      "loss": 0.0286,
      "step": 735
    },
    {
      "epoch": 0.8673992673992674,
      "grad_norm": 0.07790110260248184,
      "learning_rate": 0.00011324736225087926,
      "loss": 0.0157,
      "step": 740
    },
    {
      "epoch": 0.8732600732600733,
      "grad_norm": 0.11153840273618698,
      "learning_rate": 0.00011266119577960142,
      "loss": 0.0184,
      "step": 745
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 0.07105362415313721,
      "learning_rate": 0.00011207502930832356,
      "loss": 0.0193,
      "step": 750
    },
    {
      "epoch": 0.884981684981685,
      "grad_norm": 0.11616308242082596,
      "learning_rate": 0.00011148886283704571,
      "loss": 0.0219,
      "step": 755
    },
    {
      "epoch": 0.8908424908424909,
      "grad_norm": 0.10045047104358673,
      "learning_rate": 0.0001109026963657679,
      "loss": 0.0177,
      "step": 760
    },
    {
      "epoch": 0.8967032967032967,
      "grad_norm": 0.07033990323543549,
      "learning_rate": 0.00011031652989449004,
      "loss": 0.0227,
      "step": 765
    },
    {
      "epoch": 0.9025641025641026,
      "grad_norm": 0.07648850232362747,
      "learning_rate": 0.00010973036342321219,
      "loss": 0.023,
      "step": 770
    },
    {
      "epoch": 0.9084249084249084,
      "grad_norm": 0.05392804369330406,
      "learning_rate": 0.00010914419695193436,
      "loss": 0.0136,
      "step": 775
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 0.17311276495456696,
      "learning_rate": 0.00010855803048065652,
      "loss": 0.0257,
      "step": 780
    },
    {
      "epoch": 0.9201465201465201,
      "grad_norm": 0.07022574543952942,
      "learning_rate": 0.00010797186400937866,
      "loss": 0.0282,
      "step": 785
    },
    {
      "epoch": 0.926007326007326,
      "grad_norm": 0.15858297049999237,
      "learning_rate": 0.00010738569753810081,
      "loss": 0.0219,
      "step": 790
    },
    {
      "epoch": 0.9318681318681319,
      "grad_norm": 0.06796769052743912,
      "learning_rate": 0.00010679953106682298,
      "loss": 0.0288,
      "step": 795
    },
    {
      "epoch": 0.9377289377289377,
      "grad_norm": 0.11868051439523697,
      "learning_rate": 0.00010621336459554514,
      "loss": 0.0248,
      "step": 800
    },
    {
      "epoch": 0.9377289377289377,
      "eval_loss": 0.019950957968831062,
      "eval_runtime": 4.8291,
      "eval_samples_per_second": 5.798,
      "eval_steps_per_second": 2.899,
      "step": 800
    },
    {
      "epoch": 0.9435897435897436,
      "grad_norm": 0.19787561893463135,
      "learning_rate": 0.00010562719812426729,
      "loss": 0.0249,
      "step": 805
    },
    {
      "epoch": 0.9494505494505494,
      "grad_norm": 0.06437662243843079,
      "learning_rate": 0.00010504103165298946,
      "loss": 0.0141,
      "step": 810
    },
    {
      "epoch": 0.9553113553113554,
      "grad_norm": 0.09178975969552994,
      "learning_rate": 0.00010445486518171162,
      "loss": 0.0218,
      "step": 815
    },
    {
      "epoch": 0.9611721611721612,
      "grad_norm": 0.09567834436893463,
      "learning_rate": 0.00010386869871043376,
      "loss": 0.0229,
      "step": 820
    },
    {
      "epoch": 0.967032967032967,
      "grad_norm": 0.039594829082489014,
      "learning_rate": 0.00010328253223915591,
      "loss": 0.0186,
      "step": 825
    },
    {
      "epoch": 0.9728937728937729,
      "grad_norm": 0.18495650589466095,
      "learning_rate": 0.00010269636576787808,
      "loss": 0.0237,
      "step": 830
    },
    {
      "epoch": 0.9787545787545787,
      "grad_norm": 0.1861388385295868,
      "learning_rate": 0.00010211019929660024,
      "loss": 0.0367,
      "step": 835
    },
    {
      "epoch": 0.9846153846153847,
      "grad_norm": 0.05491223558783531,
      "learning_rate": 0.00010152403282532239,
      "loss": 0.015,
      "step": 840
    },
    {
      "epoch": 0.9904761904761905,
      "grad_norm": 0.04110349714756012,
      "learning_rate": 0.00010093786635404456,
      "loss": 0.0258,
      "step": 845
    },
    {
      "epoch": 0.9963369963369964,
      "grad_norm": 0.07649147510528564,
      "learning_rate": 0.00010035169988276672,
      "loss": 0.0235,
      "step": 850
    },
    {
      "epoch": 1.0021978021978022,
      "grad_norm": 0.11078579723834991,
      "learning_rate": 9.976553341148887e-05,
      "loss": 0.0204,
      "step": 855
    },
    {
      "epoch": 1.008058608058608,
      "grad_norm": 0.08302613347768784,
      "learning_rate": 9.917936694021102e-05,
      "loss": 0.0188,
      "step": 860
    },
    {
      "epoch": 1.0139194139194139,
      "grad_norm": 0.19045108556747437,
      "learning_rate": 9.859320046893318e-05,
      "loss": 0.0226,
      "step": 865
    },
    {
      "epoch": 1.0197802197802197,
      "grad_norm": 0.04657626897096634,
      "learning_rate": 9.800703399765534e-05,
      "loss": 0.0205,
      "step": 870
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.10237232595682144,
      "learning_rate": 9.742086752637749e-05,
      "loss": 0.0259,
      "step": 875
    },
    {
      "epoch": 1.0315018315018314,
      "grad_norm": 0.1746947020292282,
      "learning_rate": 9.683470105509965e-05,
      "loss": 0.0229,
      "step": 880
    },
    {
      "epoch": 1.0373626373626375,
      "grad_norm": 0.1032433807849884,
      "learning_rate": 9.624853458382182e-05,
      "loss": 0.0277,
      "step": 885
    },
    {
      "epoch": 1.0432234432234433,
      "grad_norm": 0.08428288996219635,
      "learning_rate": 9.566236811254397e-05,
      "loss": 0.0161,
      "step": 890
    },
    {
      "epoch": 1.0490842490842491,
      "grad_norm": 0.16661523282527924,
      "learning_rate": 9.507620164126613e-05,
      "loss": 0.0212,
      "step": 895
    },
    {
      "epoch": 1.054945054945055,
      "grad_norm": 0.08473166078329086,
      "learning_rate": 9.449003516998827e-05,
      "loss": 0.0285,
      "step": 900
    },
    {
      "epoch": 1.054945054945055,
      "eval_loss": 0.018958253785967827,
      "eval_runtime": 4.8179,
      "eval_samples_per_second": 5.812,
      "eval_steps_per_second": 2.906,
      "step": 900
    },
    {
      "epoch": 1.0608058608058608,
      "grad_norm": 0.04149739816784859,
      "learning_rate": 9.390386869871044e-05,
      "loss": 0.0138,
      "step": 905
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 0.05616866052150726,
      "learning_rate": 9.331770222743259e-05,
      "loss": 0.0194,
      "step": 910
    },
    {
      "epoch": 1.0725274725274725,
      "grad_norm": 0.16730394959449768,
      "learning_rate": 9.273153575615475e-05,
      "loss": 0.0359,
      "step": 915
    },
    {
      "epoch": 1.0783882783882783,
      "grad_norm": 0.06336849927902222,
      "learning_rate": 9.214536928487691e-05,
      "loss": 0.0147,
      "step": 920
    },
    {
      "epoch": 1.0842490842490842,
      "grad_norm": 0.0882687047123909,
      "learning_rate": 9.155920281359907e-05,
      "loss": 0.015,
      "step": 925
    },
    {
      "epoch": 1.09010989010989,
      "grad_norm": 0.05476200208067894,
      "learning_rate": 9.097303634232123e-05,
      "loss": 0.019,
      "step": 930
    },
    {
      "epoch": 1.095970695970696,
      "grad_norm": 0.05358652025461197,
      "learning_rate": 9.038686987104337e-05,
      "loss": 0.0174,
      "step": 935
    },
    {
      "epoch": 1.101831501831502,
      "grad_norm": 0.24095569550991058,
      "learning_rate": 8.980070339976554e-05,
      "loss": 0.0293,
      "step": 940
    },
    {
      "epoch": 1.1076923076923078,
      "grad_norm": 0.06653840094804764,
      "learning_rate": 8.921453692848769e-05,
      "loss": 0.0133,
      "step": 945
    },
    {
      "epoch": 1.1135531135531136,
      "grad_norm": 0.08839567750692368,
      "learning_rate": 8.862837045720985e-05,
      "loss": 0.0208,
      "step": 950
    },
    {
      "epoch": 1.1194139194139194,
      "grad_norm": 0.032916922122240067,
      "learning_rate": 8.804220398593201e-05,
      "loss": 0.0267,
      "step": 955
    },
    {
      "epoch": 1.1252747252747253,
      "grad_norm": 0.11819420754909515,
      "learning_rate": 8.745603751465417e-05,
      "loss": 0.0384,
      "step": 960
    },
    {
      "epoch": 1.1311355311355311,
      "grad_norm": 0.06757565587759018,
      "learning_rate": 8.686987104337633e-05,
      "loss": 0.0135,
      "step": 965
    },
    {
      "epoch": 1.136996336996337,
      "grad_norm": 0.0970228835940361,
      "learning_rate": 8.628370457209847e-05,
      "loss": 0.0166,
      "step": 970
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.13436350226402283,
      "learning_rate": 8.569753810082065e-05,
      "loss": 0.016,
      "step": 975
    },
    {
      "epoch": 1.1487179487179486,
      "grad_norm": 0.10584839433431625,
      "learning_rate": 8.511137162954279e-05,
      "loss": 0.0177,
      "step": 980
    },
    {
      "epoch": 1.1545787545787545,
      "grad_norm": 0.21206024289131165,
      "learning_rate": 8.452520515826495e-05,
      "loss": 0.0325,
      "step": 985
    },
    {
      "epoch": 1.1604395604395605,
      "grad_norm": 0.04815613850951195,
      "learning_rate": 8.393903868698711e-05,
      "loss": 0.0137,
      "step": 990
    },
    {
      "epoch": 1.1663003663003664,
      "grad_norm": 0.07466138154268265,
      "learning_rate": 8.335287221570927e-05,
      "loss": 0.0173,
      "step": 995
    },
    {
      "epoch": 1.1721611721611722,
      "grad_norm": 0.09366811066865921,
      "learning_rate": 8.276670574443143e-05,
      "loss": 0.025,
      "step": 1000
    },
    {
      "epoch": 1.1721611721611722,
      "eval_loss": 0.019907595589756966,
      "eval_runtime": 4.819,
      "eval_samples_per_second": 5.81,
      "eval_steps_per_second": 2.905,
      "step": 1000
    },
    {
      "epoch": 1.178021978021978,
      "grad_norm": 0.08527784794569016,
      "learning_rate": 8.218053927315357e-05,
      "loss": 0.0208,
      "step": 1005
    },
    {
      "epoch": 1.183882783882784,
      "grad_norm": 0.08328138291835785,
      "learning_rate": 8.159437280187575e-05,
      "loss": 0.0216,
      "step": 1010
    },
    {
      "epoch": 1.1897435897435897,
      "grad_norm": 0.05000188946723938,
      "learning_rate": 8.100820633059789e-05,
      "loss": 0.0211,
      "step": 1015
    },
    {
      "epoch": 1.1956043956043956,
      "grad_norm": 0.028807902708649635,
      "learning_rate": 8.042203985932005e-05,
      "loss": 0.0096,
      "step": 1020
    },
    {
      "epoch": 1.2014652014652014,
      "grad_norm": 0.20507606863975525,
      "learning_rate": 7.983587338804221e-05,
      "loss": 0.0222,
      "step": 1025
    },
    {
      "epoch": 1.2073260073260073,
      "grad_norm": 0.04885656014084816,
      "learning_rate": 7.924970691676437e-05,
      "loss": 0.0215,
      "step": 1030
    },
    {
      "epoch": 1.213186813186813,
      "grad_norm": 0.047489382326602936,
      "learning_rate": 7.866354044548652e-05,
      "loss": 0.0178,
      "step": 1035
    },
    {
      "epoch": 1.2190476190476192,
      "grad_norm": 0.05971779301762581,
      "learning_rate": 7.807737397420867e-05,
      "loss": 0.0176,
      "step": 1040
    },
    {
      "epoch": 1.224908424908425,
      "grad_norm": 0.04695736616849899,
      "learning_rate": 7.749120750293083e-05,
      "loss": 0.0148,
      "step": 1045
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 0.08131909370422363,
      "learning_rate": 7.690504103165299e-05,
      "loss": 0.0201,
      "step": 1050
    },
    {
      "epoch": 1.2366300366300367,
      "grad_norm": 0.06954577565193176,
      "learning_rate": 7.631887456037515e-05,
      "loss": 0.0149,
      "step": 1055
    },
    {
      "epoch": 1.2424908424908425,
      "grad_norm": 0.054430391639471054,
      "learning_rate": 7.57327080890973e-05,
      "loss": 0.0078,
      "step": 1060
    },
    {
      "epoch": 1.2483516483516484,
      "grad_norm": 0.12231959402561188,
      "learning_rate": 7.514654161781947e-05,
      "loss": 0.0258,
      "step": 1065
    },
    {
      "epoch": 1.2542124542124542,
      "grad_norm": 0.04983118548989296,
      "learning_rate": 7.456037514654162e-05,
      "loss": 0.0189,
      "step": 1070
    },
    {
      "epoch": 1.26007326007326,
      "grad_norm": 0.11981873214244843,
      "learning_rate": 7.397420867526378e-05,
      "loss": 0.0156,
      "step": 1075
    },
    {
      "epoch": 1.2659340659340659,
      "grad_norm": 0.03829724341630936,
      "learning_rate": 7.338804220398593e-05,
      "loss": 0.0162,
      "step": 1080
    },
    {
      "epoch": 1.2717948717948717,
      "grad_norm": 0.1572490632534027,
      "learning_rate": 7.28018757327081e-05,
      "loss": 0.0188,
      "step": 1085
    },
    {
      "epoch": 1.2776556776556776,
      "grad_norm": 0.122514508664608,
      "learning_rate": 7.221570926143025e-05,
      "loss": 0.0229,
      "step": 1090
    },
    {
      "epoch": 1.2835164835164834,
      "grad_norm": 0.06537042558193207,
      "learning_rate": 7.16295427901524e-05,
      "loss": 0.0222,
      "step": 1095
    },
    {
      "epoch": 1.2893772893772895,
      "grad_norm": 0.1269371509552002,
      "learning_rate": 7.104337631887457e-05,
      "loss": 0.0272,
      "step": 1100
    },
    {
      "epoch": 1.2893772893772895,
      "eval_loss": 0.015397748909890652,
      "eval_runtime": 4.7526,
      "eval_samples_per_second": 5.892,
      "eval_steps_per_second": 2.946,
      "step": 1100
    },
    {
      "epoch": 1.2952380952380953,
      "grad_norm": 0.07507819682359695,
      "learning_rate": 7.045720984759672e-05,
      "loss": 0.0256,
      "step": 1105
    },
    {
      "epoch": 1.3010989010989011,
      "grad_norm": 0.04198193550109863,
      "learning_rate": 6.987104337631888e-05,
      "loss": 0.0102,
      "step": 1110
    },
    {
      "epoch": 1.306959706959707,
      "grad_norm": 0.053751297295093536,
      "learning_rate": 6.928487690504104e-05,
      "loss": 0.0141,
      "step": 1115
    },
    {
      "epoch": 1.3128205128205128,
      "grad_norm": 0.12076237797737122,
      "learning_rate": 6.86987104337632e-05,
      "loss": 0.0165,
      "step": 1120
    },
    {
      "epoch": 1.3186813186813187,
      "grad_norm": 0.0769004300236702,
      "learning_rate": 6.811254396248535e-05,
      "loss": 0.0191,
      "step": 1125
    },
    {
      "epoch": 1.3245421245421245,
      "grad_norm": 0.08021704852581024,
      "learning_rate": 6.75263774912075e-05,
      "loss": 0.0253,
      "step": 1130
    },
    {
      "epoch": 1.3304029304029303,
      "grad_norm": 0.09786754846572876,
      "learning_rate": 6.694021101992967e-05,
      "loss": 0.0191,
      "step": 1135
    },
    {
      "epoch": 1.3362637362637364,
      "grad_norm": 0.06878714263439178,
      "learning_rate": 6.635404454865182e-05,
      "loss": 0.0326,
      "step": 1140
    },
    {
      "epoch": 1.3421245421245422,
      "grad_norm": 0.11297193914651871,
      "learning_rate": 6.576787807737398e-05,
      "loss": 0.0185,
      "step": 1145
    },
    {
      "epoch": 1.347985347985348,
      "grad_norm": 0.10731342434883118,
      "learning_rate": 6.518171160609614e-05,
      "loss": 0.0168,
      "step": 1150
    },
    {
      "epoch": 1.353846153846154,
      "grad_norm": 0.08888328820466995,
      "learning_rate": 6.45955451348183e-05,
      "loss": 0.0182,
      "step": 1155
    },
    {
      "epoch": 1.3597069597069598,
      "grad_norm": 0.1666301190853119,
      "learning_rate": 6.400937866354045e-05,
      "loss": 0.0254,
      "step": 1160
    },
    {
      "epoch": 1.3655677655677656,
      "grad_norm": 0.1334419697523117,
      "learning_rate": 6.34232121922626e-05,
      "loss": 0.0215,
      "step": 1165
    },
    {
      "epoch": 1.3714285714285714,
      "grad_norm": 0.05616243556141853,
      "learning_rate": 6.283704572098477e-05,
      "loss": 0.0156,
      "step": 1170
    },
    {
      "epoch": 1.3772893772893773,
      "grad_norm": 0.1660885214805603,
      "learning_rate": 6.225087924970692e-05,
      "loss": 0.0241,
      "step": 1175
    },
    {
      "epoch": 1.3831501831501831,
      "grad_norm": 0.09245380759239197,
      "learning_rate": 6.166471277842908e-05,
      "loss": 0.0222,
      "step": 1180
    },
    {
      "epoch": 1.389010989010989,
      "grad_norm": 0.08635041117668152,
      "learning_rate": 6.107854630715122e-05,
      "loss": 0.0203,
      "step": 1185
    },
    {
      "epoch": 1.3948717948717948,
      "grad_norm": 0.07752135396003723,
      "learning_rate": 6.049237983587339e-05,
      "loss": 0.0216,
      "step": 1190
    },
    {
      "epoch": 1.4007326007326006,
      "grad_norm": 0.10603225976228714,
      "learning_rate": 5.990621336459554e-05,
      "loss": 0.0205,
      "step": 1195
    },
    {
      "epoch": 1.4065934065934065,
      "grad_norm": 0.04343140870332718,
      "learning_rate": 5.932004689331771e-05,
      "loss": 0.0105,
      "step": 1200
    },
    {
      "epoch": 1.4065934065934065,
      "eval_loss": 0.015952473506331444,
      "eval_runtime": 4.7686,
      "eval_samples_per_second": 5.872,
      "eval_steps_per_second": 2.936,
      "step": 1200
    },
    {
      "epoch": 1.4124542124542123,
      "grad_norm": 0.12541887164115906,
      "learning_rate": 5.873388042203987e-05,
      "loss": 0.0217,
      "step": 1205
    },
    {
      "epoch": 1.4183150183150184,
      "grad_norm": 0.05417335778474808,
      "learning_rate": 5.814771395076202e-05,
      "loss": 0.0205,
      "step": 1210
    },
    {
      "epoch": 1.4241758241758242,
      "grad_norm": 0.04461506009101868,
      "learning_rate": 5.756154747948418e-05,
      "loss": 0.0211,
      "step": 1215
    },
    {
      "epoch": 1.43003663003663,
      "grad_norm": 0.09423286467790604,
      "learning_rate": 5.697538100820633e-05,
      "loss": 0.0238,
      "step": 1220
    },
    {
      "epoch": 1.435897435897436,
      "grad_norm": 0.050094570964574814,
      "learning_rate": 5.638921453692849e-05,
      "loss": 0.0163,
      "step": 1225
    },
    {
      "epoch": 1.4417582417582417,
      "grad_norm": 0.13104532659053802,
      "learning_rate": 5.580304806565064e-05,
      "loss": 0.0219,
      "step": 1230
    },
    {
      "epoch": 1.4476190476190476,
      "grad_norm": 0.08238503336906433,
      "learning_rate": 5.52168815943728e-05,
      "loss": 0.0126,
      "step": 1235
    },
    {
      "epoch": 1.4534798534798534,
      "grad_norm": 0.1029452383518219,
      "learning_rate": 5.463071512309497e-05,
      "loss": 0.0247,
      "step": 1240
    },
    {
      "epoch": 1.4593406593406593,
      "grad_norm": 0.05564792454242706,
      "learning_rate": 5.404454865181712e-05,
      "loss": 0.0212,
      "step": 1245
    },
    {
      "epoch": 1.4652014652014653,
      "grad_norm": 0.08589282631874084,
      "learning_rate": 5.345838218053928e-05,
      "loss": 0.0184,
      "step": 1250
    },
    {
      "epoch": 1.4710622710622712,
      "grad_norm": 0.15644195675849915,
      "learning_rate": 5.287221570926143e-05,
      "loss": 0.0165,
      "step": 1255
    },
    {
      "epoch": 1.476923076923077,
      "grad_norm": 0.11301274597644806,
      "learning_rate": 5.228604923798359e-05,
      "loss": 0.0322,
      "step": 1260
    },
    {
      "epoch": 1.4827838827838828,
      "grad_norm": 0.044553741812705994,
      "learning_rate": 5.1699882766705743e-05,
      "loss": 0.0183,
      "step": 1265
    },
    {
      "epoch": 1.4886446886446887,
      "grad_norm": 0.06141185760498047,
      "learning_rate": 5.11137162954279e-05,
      "loss": 0.0102,
      "step": 1270
    },
    {
      "epoch": 1.4945054945054945,
      "grad_norm": 0.08107537031173706,
      "learning_rate": 5.0527549824150055e-05,
      "loss": 0.0215,
      "step": 1275
    },
    {
      "epoch": 1.5003663003663004,
      "grad_norm": 0.06017793342471123,
      "learning_rate": 4.9941383352872214e-05,
      "loss": 0.0176,
      "step": 1280
    },
    {
      "epoch": 1.5062271062271062,
      "grad_norm": 0.051033902913331985,
      "learning_rate": 4.9355216881594373e-05,
      "loss": 0.0149,
      "step": 1285
    },
    {
      "epoch": 1.512087912087912,
      "grad_norm": 0.124452143907547,
      "learning_rate": 4.876905041031653e-05,
      "loss": 0.0209,
      "step": 1290
    },
    {
      "epoch": 1.5179487179487179,
      "grad_norm": 0.1616523712873459,
      "learning_rate": 4.8182883939038685e-05,
      "loss": 0.0192,
      "step": 1295
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.07067764550447464,
      "learning_rate": 4.759671746776085e-05,
      "loss": 0.0279,
      "step": 1300
    },
    {
      "epoch": 1.5238095238095237,
      "eval_loss": 0.017229218035936356,
      "eval_runtime": 4.6702,
      "eval_samples_per_second": 5.996,
      "eval_steps_per_second": 2.998,
      "step": 1300
    },
    {
      "epoch": 1.5296703296703296,
      "grad_norm": 0.06332267820835114,
      "learning_rate": 4.7010550996483003e-05,
      "loss": 0.0253,
      "step": 1305
    },
    {
      "epoch": 1.5355311355311354,
      "grad_norm": 0.08032066375017166,
      "learning_rate": 4.642438452520516e-05,
      "loss": 0.0128,
      "step": 1310
    },
    {
      "epoch": 1.5413919413919412,
      "grad_norm": 0.11456907540559769,
      "learning_rate": 4.5838218053927315e-05,
      "loss": 0.0127,
      "step": 1315
    },
    {
      "epoch": 1.5472527472527473,
      "grad_norm": 0.19653138518333435,
      "learning_rate": 4.5252051582649474e-05,
      "loss": 0.0236,
      "step": 1320
    },
    {
      "epoch": 1.5531135531135531,
      "grad_norm": 0.08195839822292328,
      "learning_rate": 4.4665885111371633e-05,
      "loss": 0.0173,
      "step": 1325
    },
    {
      "epoch": 1.558974358974359,
      "grad_norm": 0.11376089602708817,
      "learning_rate": 4.4079718640093786e-05,
      "loss": 0.0115,
      "step": 1330
    },
    {
      "epoch": 1.5648351648351648,
      "grad_norm": 0.055264201015233994,
      "learning_rate": 4.3493552168815945e-05,
      "loss": 0.0198,
      "step": 1335
    },
    {
      "epoch": 1.5706959706959707,
      "grad_norm": 0.13038881123065948,
      "learning_rate": 4.2907385697538104e-05,
      "loss": 0.0231,
      "step": 1340
    },
    {
      "epoch": 1.5765567765567765,
      "grad_norm": 0.0317939892411232,
      "learning_rate": 4.2321219226260263e-05,
      "loss": 0.0083,
      "step": 1345
    },
    {
      "epoch": 1.5824175824175826,
      "grad_norm": 0.151336207985878,
      "learning_rate": 4.1735052754982416e-05,
      "loss": 0.0225,
      "step": 1350
    },
    {
      "epoch": 1.5882783882783884,
      "grad_norm": 0.07817093282938004,
      "learning_rate": 4.1148886283704575e-05,
      "loss": 0.0226,
      "step": 1355
    },
    {
      "epoch": 1.5941391941391942,
      "grad_norm": 0.1341279149055481,
      "learning_rate": 4.056271981242673e-05,
      "loss": 0.0263,
      "step": 1360
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.06353727728128433,
      "learning_rate": 3.997655334114889e-05,
      "loss": 0.0198,
      "step": 1365
    },
    {
      "epoch": 1.605860805860806,
      "grad_norm": 0.11177901178598404,
      "learning_rate": 3.9390386869871046e-05,
      "loss": 0.0172,
      "step": 1370
    },
    {
      "epoch": 1.6117216117216118,
      "grad_norm": 0.047024596482515335,
      "learning_rate": 3.88042203985932e-05,
      "loss": 0.0207,
      "step": 1375
    },
    {
      "epoch": 1.6175824175824176,
      "grad_norm": 0.04343528300523758,
      "learning_rate": 3.8218053927315364e-05,
      "loss": 0.0214,
      "step": 1380
    },
    {
      "epoch": 1.6234432234432234,
      "grad_norm": 0.08330193161964417,
      "learning_rate": 3.763188745603752e-05,
      "loss": 0.0286,
      "step": 1385
    },
    {
      "epoch": 1.6293040293040293,
      "grad_norm": 0.0811009556055069,
      "learning_rate": 3.7045720984759676e-05,
      "loss": 0.0148,
      "step": 1390
    },
    {
      "epoch": 1.6351648351648351,
      "grad_norm": 0.1049441322684288,
      "learning_rate": 3.645955451348183e-05,
      "loss": 0.0184,
      "step": 1395
    },
    {
      "epoch": 1.641025641025641,
      "grad_norm": 0.11944428086280823,
      "learning_rate": 3.587338804220399e-05,
      "loss": 0.0122,
      "step": 1400
    },
    {
      "epoch": 1.641025641025641,
      "eval_loss": 0.017561230808496475,
      "eval_runtime": 4.685,
      "eval_samples_per_second": 5.977,
      "eval_steps_per_second": 2.988,
      "step": 1400
    },
    {
      "epoch": 1.6468864468864468,
      "grad_norm": 0.14023366570472717,
      "learning_rate": 3.528722157092615e-05,
      "loss": 0.0178,
      "step": 1405
    },
    {
      "epoch": 1.6527472527472526,
      "grad_norm": 0.14057691395282745,
      "learning_rate": 3.47010550996483e-05,
      "loss": 0.0268,
      "step": 1410
    },
    {
      "epoch": 1.6586080586080585,
      "grad_norm": 0.1253061443567276,
      "learning_rate": 3.411488862837046e-05,
      "loss": 0.0266,
      "step": 1415
    },
    {
      "epoch": 1.6644688644688643,
      "grad_norm": 0.03431854769587517,
      "learning_rate": 3.352872215709262e-05,
      "loss": 0.02,
      "step": 1420
    },
    {
      "epoch": 1.6703296703296702,
      "grad_norm": 0.13929079473018646,
      "learning_rate": 3.294255568581478e-05,
      "loss": 0.0226,
      "step": 1425
    },
    {
      "epoch": 1.6761904761904762,
      "grad_norm": 0.06429693102836609,
      "learning_rate": 3.235638921453693e-05,
      "loss": 0.0225,
      "step": 1430
    },
    {
      "epoch": 1.682051282051282,
      "grad_norm": 0.029311953112483025,
      "learning_rate": 3.177022274325909e-05,
      "loss": 0.0161,
      "step": 1435
    },
    {
      "epoch": 1.687912087912088,
      "grad_norm": 0.04346455633640289,
      "learning_rate": 3.118405627198124e-05,
      "loss": 0.0155,
      "step": 1440
    },
    {
      "epoch": 1.6937728937728938,
      "grad_norm": 0.09009824693202972,
      "learning_rate": 3.05978898007034e-05,
      "loss": 0.0153,
      "step": 1445
    },
    {
      "epoch": 1.6996336996336996,
      "grad_norm": 0.071926549077034,
      "learning_rate": 3.0011723329425556e-05,
      "loss": 0.0136,
      "step": 1450
    },
    {
      "epoch": 1.7054945054945057,
      "grad_norm": 0.06461833417415619,
      "learning_rate": 2.9425556858147718e-05,
      "loss": 0.0237,
      "step": 1455
    },
    {
      "epoch": 1.7113553113553115,
      "grad_norm": 0.039929524064064026,
      "learning_rate": 2.8839390386869874e-05,
      "loss": 0.0187,
      "step": 1460
    },
    {
      "epoch": 1.7172161172161173,
      "grad_norm": 0.0534372515976429,
      "learning_rate": 2.825322391559203e-05,
      "loss": 0.0192,
      "step": 1465
    },
    {
      "epoch": 1.7230769230769232,
      "grad_norm": 0.1467376947402954,
      "learning_rate": 2.7667057444314186e-05,
      "loss": 0.0203,
      "step": 1470
    },
    {
      "epoch": 1.728937728937729,
      "grad_norm": 0.0830003172159195,
      "learning_rate": 2.7080890973036345e-05,
      "loss": 0.0188,
      "step": 1475
    },
    {
      "epoch": 1.7347985347985349,
      "grad_norm": 0.07220768928527832,
      "learning_rate": 2.64947245017585e-05,
      "loss": 0.0118,
      "step": 1480
    },
    {
      "epoch": 1.7406593406593407,
      "grad_norm": 0.0751115009188652,
      "learning_rate": 2.5908558030480656e-05,
      "loss": 0.0156,
      "step": 1485
    },
    {
      "epoch": 1.7465201465201465,
      "grad_norm": 0.07690921425819397,
      "learning_rate": 2.5322391559202812e-05,
      "loss": 0.0347,
      "step": 1490
    },
    {
      "epoch": 1.7523809523809524,
      "grad_norm": 0.05416159704327583,
      "learning_rate": 2.473622508792497e-05,
      "loss": 0.0167,
      "step": 1495
    },
    {
      "epoch": 1.7582417582417582,
      "grad_norm": 0.0676250010728836,
      "learning_rate": 2.4150058616647127e-05,
      "loss": 0.0205,
      "step": 1500
    },
    {
      "epoch": 1.7582417582417582,
      "eval_loss": 0.019522378221154213,
      "eval_runtime": 4.6942,
      "eval_samples_per_second": 5.965,
      "eval_steps_per_second": 2.982,
      "step": 1500
    },
    {
      "epoch": 1.764102564102564,
      "grad_norm": 0.08909754455089569,
      "learning_rate": 2.3563892145369286e-05,
      "loss": 0.0249,
      "step": 1505
    },
    {
      "epoch": 1.76996336996337,
      "grad_norm": 0.042161233723163605,
      "learning_rate": 2.2977725674091442e-05,
      "loss": 0.0117,
      "step": 1510
    },
    {
      "epoch": 1.7758241758241757,
      "grad_norm": 0.07136218249797821,
      "learning_rate": 2.23915592028136e-05,
      "loss": 0.0212,
      "step": 1515
    },
    {
      "epoch": 1.7816849816849816,
      "grad_norm": 0.14128735661506653,
      "learning_rate": 2.1805392731535757e-05,
      "loss": 0.0189,
      "step": 1520
    },
    {
      "epoch": 1.7875457875457874,
      "grad_norm": 0.05959760770201683,
      "learning_rate": 2.1219226260257916e-05,
      "loss": 0.0119,
      "step": 1525
    },
    {
      "epoch": 1.7934065934065933,
      "grad_norm": 0.038479190319776535,
      "learning_rate": 2.0633059788980072e-05,
      "loss": 0.013,
      "step": 1530
    },
    {
      "epoch": 1.7992673992673993,
      "grad_norm": 0.09512809664011002,
      "learning_rate": 2.0046893317702228e-05,
      "loss": 0.0148,
      "step": 1535
    },
    {
      "epoch": 1.8051282051282052,
      "grad_norm": 0.14848454296588898,
      "learning_rate": 1.9460726846424384e-05,
      "loss": 0.019,
      "step": 1540
    },
    {
      "epoch": 1.810989010989011,
      "grad_norm": 0.10240516811609268,
      "learning_rate": 1.8874560375146543e-05,
      "loss": 0.017,
      "step": 1545
    },
    {
      "epoch": 1.8168498168498168,
      "grad_norm": 0.09345954656600952,
      "learning_rate": 1.82883939038687e-05,
      "loss": 0.0237,
      "step": 1550
    },
    {
      "epoch": 1.8227106227106227,
      "grad_norm": 0.03815275430679321,
      "learning_rate": 1.7702227432590858e-05,
      "loss": 0.0188,
      "step": 1555
    },
    {
      "epoch": 1.8285714285714287,
      "grad_norm": 0.027827398851513863,
      "learning_rate": 1.7116060961313014e-05,
      "loss": 0.0183,
      "step": 1560
    },
    {
      "epoch": 1.8344322344322346,
      "grad_norm": 0.08811303228139877,
      "learning_rate": 1.6529894490035173e-05,
      "loss": 0.0191,
      "step": 1565
    },
    {
      "epoch": 1.8402930402930404,
      "grad_norm": 0.03119056299328804,
      "learning_rate": 1.594372801875733e-05,
      "loss": 0.0183,
      "step": 1570
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 0.09752997010946274,
      "learning_rate": 1.5357561547479485e-05,
      "loss": 0.0161,
      "step": 1575
    },
    {
      "epoch": 1.852014652014652,
      "grad_norm": 0.0855243131518364,
      "learning_rate": 1.477139507620164e-05,
      "loss": 0.015,
      "step": 1580
    },
    {
      "epoch": 1.857875457875458,
      "grad_norm": 0.08388842642307281,
      "learning_rate": 1.41852286049238e-05,
      "loss": 0.0148,
      "step": 1585
    },
    {
      "epoch": 1.8637362637362638,
      "grad_norm": 0.10147551447153091,
      "learning_rate": 1.3599062133645957e-05,
      "loss": 0.0154,
      "step": 1590
    },
    {
      "epoch": 1.8695970695970696,
      "grad_norm": 0.0457012839615345,
      "learning_rate": 1.3012895662368113e-05,
| "loss": 0.0186, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 1.8754578754578755, | |
| "grad_norm": 0.03654688224196434, | |
| "learning_rate": 1.242672919109027e-05, | |
| "loss": 0.0321, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.8754578754578755, | |
| "eval_loss": 0.017443044111132622, | |
| "eval_runtime": 4.7108, | |
| "eval_samples_per_second": 5.944, | |
| "eval_steps_per_second": 2.972, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.8813186813186813, | |
| "grad_norm": 0.07887323200702667, | |
| "learning_rate": 1.1840562719812428e-05, | |
| "loss": 0.0142, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 1.8871794871794871, | |
| "grad_norm": 0.11328335106372833, | |
| "learning_rate": 1.1254396248534585e-05, | |
| "loss": 0.025, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.893040293040293, | |
| "grad_norm": 0.09318089485168457, | |
| "learning_rate": 1.0668229777256741e-05, | |
| "loss": 0.0204, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 1.8989010989010988, | |
| "grad_norm": 0.06992164254188538, | |
| "learning_rate": 1.0082063305978899e-05, | |
| "loss": 0.0135, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.9047619047619047, | |
| "grad_norm": 0.14927181601524353, | |
| "learning_rate": 9.495896834701056e-06, | |
| "loss": 0.0245, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 1.9106227106227105, | |
| "grad_norm": 0.11408836394548416, | |
| "learning_rate": 8.909730363423214e-06, | |
| "loss": 0.0161, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.9164835164835163, | |
| "grad_norm": 0.06911155581474304, | |
| "learning_rate": 8.32356389214537e-06, | |
| "loss": 0.0154, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 1.9223443223443224, | |
| "grad_norm": 0.11621779948472977, | |
| "learning_rate": 7.737397420867527e-06, | |
| "loss": 0.0144, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.9282051282051282, | |
| "grad_norm": 0.046058397740125656, | |
| "learning_rate": 7.151230949589684e-06, | |
| "loss": 0.0093, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 1.934065934065934, | |
| "grad_norm": 0.11228576302528381, | |
| "learning_rate": 6.565064478311841e-06, | |
| "loss": 0.022, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.93992673992674, | |
| "grad_norm": 0.1315338909626007, | |
| "learning_rate": 5.978898007033998e-06, | |
| "loss": 0.0193, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 1.9457875457875458, | |
| "grad_norm": 0.040056392550468445, | |
| "learning_rate": 5.3927315357561546e-06, | |
| "loss": 0.0132, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.9516483516483516, | |
| "grad_norm": 0.10741738229990005, | |
| "learning_rate": 4.806565064478312e-06, | |
| "loss": 0.0352, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 1.9575091575091577, | |
| "grad_norm": 0.059029560536146164, | |
| "learning_rate": 4.220398593200469e-06, | |
| "loss": 0.019, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.9633699633699635, | |
| "grad_norm": 0.06789711117744446, | |
| "learning_rate": 3.6342321219226262e-06, | |
| "loss": 0.0189, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 1.9692307692307693, | |
| "grad_norm": 0.02918117679655552, | |
| "learning_rate": 3.0480656506447833e-06, | |
| "loss": 0.0259, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.9750915750915752, | |
| "grad_norm": 0.08073403686285019, | |
| "learning_rate": 2.4618991793669404e-06, | |
| "loss": 0.0286, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 1.980952380952381, | |
| "grad_norm": 0.1617717295885086, | |
| "learning_rate": 1.8757327080890972e-06, | |
| "loss": 0.0191, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.9868131868131869, | |
| "grad_norm": 0.06613462418317795, | |
| "learning_rate": 1.2895662368112545e-06, | |
| "loss": 0.0128, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 1.9926739926739927, | |
| "grad_norm": 0.08398256450891495, | |
| "learning_rate": 7.033997655334116e-07, | |
| "loss": 0.0118, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.9926739926739927, | |
| "eval_loss": 0.018166696652770042, | |
| "eval_runtime": 4.6959, | |
| "eval_samples_per_second": 5.963, | |
| "eval_steps_per_second": 2.981, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.9985347985347985, | |
| "grad_norm": 0.0773661658167839, | |
| "learning_rate": 1.1723329425556858e-07, | |
| "loss": 0.0146, | |
| "step": 1705 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1706, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.764499308335456e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
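
The object above appears to be the standard `trainer_state.json` that the Hugging Face `Trainer` saves alongside its checkpoints: `log_history` holds one record per logging step (`loss`, `grad_norm`, `learning_rate`, `step`) plus one record per evaluation pass (`eval_loss`, `eval_runtime`, `eval_samples_per_second`, `eval_steps_per_second`), and the trailing fields record the run configuration (`logging_steps: 5`, `max_steps: 1706`, `num_train_epochs: 2`, `save_steps: 1000`). Below is a minimal sketch of how such a file can be inspected, assuming the state is saved as plain JSON under the hypothetical name `trainer_state.json` (with the table wrapping removed) and that matplotlib is available; the field names come from the records above, while the file name and plotting choices are assumptions, not part of the original run.

```python
# Minimal sketch: split log_history into train/eval records and plot the curves.
# Assumes the state above is saved as plain JSON in "trainer_state.json"
# (hypothetical file name) and that matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss" instead.
train = [e for e in state["log_history"] if "loss" in e]
evals = [e for e in state["log_history"] if "eval_loss" in e]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))

ax_loss.plot([e["step"] for e in train], [e["loss"] for e in train], label="train loss")
ax_loss.plot([e["step"] for e in evals], [e["eval_loss"] for e in evals], "o-", label="eval loss")
ax_loss.set_xlabel("step")
ax_loss.set_yscale("log")  # losses span ~2.3 down to ~0.01, so a log scale reads better
ax_loss.legend()

ax_lr.plot([e["step"] for e in train], [e["learning_rate"] for e in train])
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")  # decays linearly toward 0 over max_steps

fig.tight_layout()
fig.savefig("training_curves.png")
```

On this particular log, the eval-loss series would show the model settling around 0.017–0.020 between steps 1500 and 1700, matching the three `eval_loss` records above.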