{
  "best_global_step": 1800,
  "best_metric": 0.07772836834192276,
  "best_model_checkpoint": "./outputs/powershell-production/checkpoint-1800",
  "epoch": 1.5481573242489937,
  "eval_steps": 100,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.012387736141220192, "grad_norm": 296.0, "learning_rate": 7.054455445544555e-07, "loss": 4.6719, "step": 20 },
    { "epoch": 0.024775472282440383, "grad_norm": 149.0, "learning_rate": 1.448019801980198e-06, "loss": 4.032, "step": 40 },
    { "epoch": 0.037163208423660575, "grad_norm": 76.0, "learning_rate": 2.1905940594059405e-06, "loss": 3.7462, "step": 60 },
    { "epoch": 0.04955094456488077, "grad_norm": 82.0, "learning_rate": 2.9331683168316834e-06, "loss": 2.6629, "step": 80 },
    { "epoch": 0.06193868070610096, "grad_norm": 67.5, "learning_rate": 3.675742574257426e-06, "loss": 1.4688, "step": 100 },
    { "epoch": 0.06193868070610096, "eval_accuracy": 0.8779996238543429, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9087364573635998, "eval_f1_unsafe": 0.8224576644206463, "eval_loss": 0.8046298027038574, "eval_macro_f1": 0.5770647072614153, "eval_runtime": 1995.9982, "eval_samples_per_second": 5.875, "eval_steps_per_second": 1.469, "step": 100 },
    { "epoch": 0.07432641684732115, "grad_norm": 14.6875, "learning_rate": 4.418316831683168e-06, "loss": 0.6992, "step": 120 },
    { "epoch": 0.08671415298854135, "grad_norm": 19.75, "learning_rate": 5.160891089108911e-06, "loss": 0.5021, "step": 140 },
    { "epoch": 0.09910188912976153, "grad_norm": 85.0, "learning_rate": 5.903465346534654e-06, "loss": 0.4075, "step": 160 },
    { "epoch": 0.11148962527098173, "grad_norm": 78.0, "learning_rate": 6.646039603960397e-06, "loss": 0.2943, "step": 180 },
    { "epoch": 0.12387736141220192, "grad_norm": 8.0625, "learning_rate": 7.388613861386139e-06, "loss": 0.3197, "step": 200 },
    { "epoch": 0.12387736141220192, "eval_accuracy": 0.950703116857914, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9602635885015949, "eval_f1_unsafe": 0.9357709157041263, "eval_loss": 0.21172067523002625, "eval_macro_f1": 0.6320115014019071, "eval_runtime": 1997.1894, "eval_samples_per_second": 5.872, "eval_steps_per_second": 1.468, "step": 200 },
    { "epoch": 0.13626509755342212, "grad_norm": 32.0, "learning_rate": 8.131188118811882e-06, "loss": 0.3174, "step": 220 },
    { "epoch": 0.1486528336946423, "grad_norm": 102.0, "learning_rate": 8.873762376237623e-06, "loss": 0.2517, "step": 240 },
    { "epoch": 0.16104056983586248, "grad_norm": 61.5, "learning_rate": 9.616336633663367e-06, "loss": 0.2532, "step": 260 },
    { "epoch": 0.1734283059770827, "grad_norm": 52.75, "learning_rate": 1.0358910891089109e-05, "loss": 0.1548, "step": 280 },
    { "epoch": 0.18581604211830288, "grad_norm": 58.25, "learning_rate": 1.1101485148514851e-05, "loss": 0.1953, "step": 300 },
    { "epoch": 0.18581604211830288, "eval_accuracy": 0.9621405323299588, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9684133256371634, "eval_f1_unsafe": 0.9531137279206326, "eval_loss": 0.1852019727230072, "eval_macro_f1": 0.6405090178525987, "eval_runtime": 1997.4497, "eval_samples_per_second": 5.871, "eval_steps_per_second": 1.468, "step": 300 },
    { "epoch": 0.19820377825952307, "grad_norm": 13.25, "learning_rate": 1.1844059405940594e-05, "loss": 0.183, "step": 320 },
    { "epoch": 0.21059151440074325, "grad_norm": 8.8125, "learning_rate": 1.2586633663366337e-05, "loss": 0.2541, "step": 340 },
    { "epoch": 0.22297925054196346, "grad_norm": 17.0, "learning_rate": 1.332920792079208e-05, "loss": 0.1851, "step": 360 },
    { "epoch": 0.23536698668318365, "grad_norm": 62.25, "learning_rate": 1.4071782178217821e-05, "loss": 0.2287, "step": 380 },
    { "epoch": 0.24775472282440383, "grad_norm": 8.125, "learning_rate": 1.4814356435643564e-05, "loss": 0.1806, "step": 400 },
    { "epoch": 0.24775472282440383, "eval_accuracy": 0.9719879297973246, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9767870924112939, "eval_f1_unsafe": 0.9646897646101082, "eval_loss": 0.13077302277088165, "eval_macro_f1": 0.6471589523404674, "eval_runtime": 1996.7162, "eval_samples_per_second": 5.873, "eval_steps_per_second": 1.468, "step": 400 },
    { "epoch": 0.26014245896562405, "grad_norm": 0.75390625, "learning_rate": 1.499985848313474e-05, "loss": 0.2029, "step": 420 },
    { "epoch": 0.27253019510684423, "grad_norm": 21.625, "learning_rate": 1.4999229530058107e-05, "loss": 0.228, "step": 440 },
    { "epoch": 0.2849179312480644, "grad_norm": 88.5, "learning_rate": 1.4998097458826036e-05, "loss": 0.1709, "step": 460 },
    { "epoch": 0.2973056673892846, "grad_norm": 1.4375, "learning_rate": 1.4996462345388408e-05, "loss": 0.1309, "step": 480 },
    { "epoch": 0.3096934035305048, "grad_norm": 73.5, "learning_rate": 1.499432429944386e-05, "loss": 0.1862, "step": 500 },
    { "epoch": 0.3096934035305048, "eval_accuracy": 0.9698197016470148, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9749329791096684, "eval_f1_unsafe": 0.9620869563961173, "eval_loss": 0.16210031509399414, "eval_macro_f1": 0.6456733118352619, "eval_runtime": 1996.9786, "eval_samples_per_second": 5.872, "eval_steps_per_second": 1.468, "step": 500 },
    { "epoch": 0.32208113967172497, "grad_norm": 1.21875, "learning_rate": 1.4991683464432428e-05, "loss": 0.0715, "step": 520 },
    { "epoch": 0.3344688758129452, "grad_norm": 2.625, "learning_rate": 1.4988540017525911e-05, "loss": 0.2116, "step": 540 },
    { "epoch": 0.3468566119541654, "grad_norm": 124.0, "learning_rate": 1.4984894169616006e-05, "loss": 0.1838, "step": 560 },
    { "epoch": 0.3592443480953856, "grad_norm": 3.703125, "learning_rate": 1.4980746165300146e-05, "loss": 0.1586, "step": 580 },
    { "epoch": 0.37163208423660576, "grad_norm": 14.5, "learning_rate": 1.4976096282865085e-05, "loss": 0.1157, "step": 600 },
    { "epoch": 0.37163208423660576, "eval_accuracy": 0.9606404538664539, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9680893048977398, "eval_f1_unsafe": 0.948655118870566, "eval_loss": 0.24218252301216125, "eval_macro_f1": 0.6389148079227686, "eval_runtime": 1997.0786, "eval_samples_per_second": 5.872, "eval_steps_per_second": 1.468, "step": 600 },
    { "epoch": 0.38401982037782595, "grad_norm": 3.640625, "learning_rate": 1.4970944834268245e-05, "loss": 0.1796, "step": 620 },
    { "epoch": 0.39640755651904613, "grad_norm": 1.0546875, "learning_rate": 1.4965292165116766e-05, "loss": 0.1243, "step": 640 },
    { "epoch": 0.4087952926602663, "grad_norm": 0.44921875, "learning_rate": 1.495913865464434e-05, "loss": 0.1263, "step": 660 },
    { "epoch": 0.4211830288014865, "grad_norm": 1.3671875, "learning_rate": 1.4952484715685758e-05, "loss": 0.1291, "step": 680 },
    { "epoch": 0.43357076494270674, "grad_norm": 6.8125, "learning_rate": 1.4945330794649209e-05, "loss": 0.1866, "step": 700 },
    { "epoch": 0.43357076494270674, "eval_accuracy": 0.9692282665075834, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9746824060081962, "eval_f1_unsafe": 0.9607789140941978, "eval_loss": 0.15051080286502838, "eval_macro_f1": 0.6451537733674647, "eval_runtime": 1996.287, "eval_samples_per_second": 5.874, "eval_steps_per_second": 1.469, "step": 700 },
    { "epoch": 0.44595850108392693, "grad_norm": 19.375, "learning_rate": 1.493767737148634e-05, "loss": 0.1021, "step": 720 },
    { "epoch": 0.4583462372251471, "grad_norm": 1.2265625, "learning_rate": 1.492952495966005e-05, "loss": 0.1652, "step": 740 },
    { "epoch": 0.4707339733663673, "grad_norm": 67.5, "learning_rate": 1.4920874106110049e-05, "loss": 0.0815, "step": 760 },
    { "epoch": 0.4831217095075875, "grad_norm": 7.65625, "learning_rate": 1.4911725391216151e-05, "loss": 0.1006, "step": 780 },
    { "epoch": 0.49550944564880767, "grad_norm": 2.6875, "learning_rate": 1.4902079428759355e-05, "loss": 0.1625, "step": 800 },
    { "epoch": 0.49550944564880767, "eval_accuracy": 0.9683618856636994, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9743468963029092, "eval_f1_unsafe": 0.958734391511112, "eval_loss": 0.17920953035354614, "eval_macro_f1": 0.6443604292713404, "eval_runtime": 1996.9022, "eval_samples_per_second": 5.873, "eval_steps_per_second": 1.468, "step": 800 },
    { "epoch": 0.5078971817900279, "grad_norm": 1.4453125, "learning_rate": 1.4891936865880652e-05, "loss": 0.2079, "step": 820 },
    { "epoch": 0.5202849179312481, "grad_norm": 205.0, "learning_rate": 1.4881298383037618e-05, "loss": 0.0709, "step": 840 },
    { "epoch": 0.5326726540724682, "grad_norm": 0.71875, "learning_rate": 1.4870164693958752e-05, "loss": 0.1639, "step": 860 },
    { "epoch": 0.5450603902136885, "grad_norm": 4.84375, "learning_rate": 1.4858536545595602e-05, "loss": 0.0897, "step": 880 },
    { "epoch": 0.5574481263549086, "grad_norm": 64.5, "learning_rate": 1.4846414718072656e-05, "loss": 0.2061, "step": 900 },
    { "epoch": 0.5574481263549086, "eval_accuracy": 0.9738056476952693, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9784293070754638, "eval_f1_unsafe": 0.9666591344542607, "eval_loss": 0.12458275258541107, "eval_macro_f1": 0.6483628138432415, "eval_runtime": 1995.9769, "eval_samples_per_second": 5.875, "eval_steps_per_second": 1.469, "step": 900 },
    { "epoch": 0.5698358624961288, "grad_norm": 0.314453125, "learning_rate": 1.4833800024634986e-05, "loss": 0.1909, "step": 920 },
    { "epoch": 0.5822235986373491, "grad_norm": 0.25390625, "learning_rate": 1.4820693311593708e-05, "loss": 0.1375, "step": 940 },
    { "epoch": 0.5946113347785692, "grad_norm": 2.828125, "learning_rate": 1.4807095458269194e-05, "loss": 0.0788, "step": 960 },
    { "epoch": 0.6069990709197894, "grad_norm": 10.4375, "learning_rate": 1.4793007376932077e-05, "loss": 0.0933, "step": 980 },
    { "epoch": 0.6193868070610096, "grad_norm": 42.25, "learning_rate": 1.4778430012742053e-05, "loss": 0.2083, "step": 1000 },
    { "epoch": 0.6193868070610096, "eval_accuracy": 0.9669330113334843, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9732207557904943, "eval_f1_unsafe": 0.9567865224376395, "eval_loss": 0.1653885543346405, "eval_macro_f1": 0.6433357594093779, "eval_runtime": 1996.6331, "eval_samples_per_second": 5.873, "eval_steps_per_second": 1.468, "step": 1000 },
    { "epoch": 0.6317745432022298, "grad_norm": 36.75, "learning_rate": 1.4763364343684464e-05, "loss": 0.0689, "step": 1020 },
    { "epoch": 0.6441622793434499, "grad_norm": 4.09375, "learning_rate": 1.4747811380504698e-05, "loss": 0.0984, "step": 1040 },
    { "epoch": 0.6565500154846702, "grad_norm": 1.7890625, "learning_rate": 1.4731772166640363e-05, "loss": 0.0894, "step": 1060 },
    { "epoch": 0.6689377516258904, "grad_norm": 0.50390625, "learning_rate": 1.4715247778151297e-05, "loss": 0.0969, "step": 1080 },
    { "epoch": 0.6813254877671105, "grad_norm": 6.1875, "learning_rate": 1.4698239323647365e-05, "loss": 0.2052, "step": 1100 },
    { "epoch": 0.6813254877671105, "eval_accuracy": 0.9829721975651158, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9859135534089479, "eval_f1_unsafe": 0.9784783062783209, "eval_loss": 0.08367573469877243, "eval_macro_f1": 0.654797286562423, "eval_runtime": 1996.2626, "eval_samples_per_second": 5.874, "eval_steps_per_second": 1.469, "step": 1100 },
    { "epoch": 0.6937132239083308, "grad_norm": 0.3125, "learning_rate": 1.4680747944214093e-05, "loss": 0.0508, "step": 1120 },
    { "epoch": 0.7061009600495509, "grad_norm": 0.34375, "learning_rate": 1.4662774813336105e-05, "loss": 0.0915, "step": 1140 },
    { "epoch": 0.7184886961907712, "grad_norm": 6.34375, "learning_rate": 1.4644321136818402e-05, "loss": 0.2418, "step": 1160 },
    { "epoch": 0.7308764323319913, "grad_norm": 134.0, "learning_rate": 1.4625388152705457e-05, "loss": 0.094, "step": 1180 },
    { "epoch": 0.7432641684732115, "grad_norm": 1.34375, "learning_rate": 1.4605977131198166e-05, "loss": 0.1194, "step": 1200 },
    { "epoch": 0.7432641684732115, "eval_accuracy": 0.979379790915723, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9830450283559968, "eval_f1_unsafe": 0.9736928478058482, "eval_loss": 0.11758451163768768, "eval_macro_f1": 0.652245958720615, "eval_runtime": 1996.2276, "eval_samples_per_second": 5.875, "eval_steps_per_second": 1.469, "step": 1200 },
    { "epoch": 0.7556519046144317, "grad_norm": 2.84375, "learning_rate": 1.4586089374568616e-05, "loss": 0.2312, "step": 1220 },
    { "epoch": 0.7680396407556519, "grad_norm": 0.2021484375, "learning_rate": 1.4565726217072738e-05, "loss": 0.1048, "step": 1240 },
    { "epoch": 0.7804273768968721, "grad_norm": 1.6484375, "learning_rate": 1.454488902486077e-05, "loss": 0.0491, "step": 1260 },
    { "epoch": 0.7928151130380923, "grad_norm": 3.109375, "learning_rate": 1.452357919588562e-05, "loss": 0.2089, "step": 1280 },
    { "epoch": 0.8052028491793125, "grad_norm": 15.0, "learning_rate": 1.4501798159809068e-05, "loss": 0.1261, "step": 1300 },
    { "epoch": 0.8052028491793125, "eval_accuracy": 0.9768035525400014, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.981035749707623, "eval_f1_unsafe": 0.9701397197118314, "eval_loss": 0.13235369324684143, "eval_macro_f1": 0.6503918231398181, "eval_runtime": 1996.4333, "eval_samples_per_second": 5.874, "eval_steps_per_second": 1.469, "step": 1300 },
    { "epoch": 0.8175905853205326, "grad_norm": 0.373046875, "learning_rate": 1.4479547377905856e-05, "loss": 0.1104, "step": 1320 },
    { "epoch": 0.8299783214617529, "grad_norm": 1.890625, "learning_rate": 1.445682834296565e-05, "loss": 0.1023, "step": 1340 },
    { "epoch": 0.842366057602973, "grad_norm": 0.10888671875, "learning_rate": 1.4433642579192891e-05, "loss": 0.1085, "step": 1360 },
    { "epoch": 0.8547537937441932, "grad_norm": 2.984375, "learning_rate": 1.4409991642104537e-05, "loss": 0.1881, "step": 1380 },
    { "epoch": 0.8671415298854135, "grad_norm": 37.5, "learning_rate": 1.4385877118425702e-05, "loss": 0.1471, "step": 1400 },
    { "epoch": 0.8671415298854135, "eval_accuracy": 0.9815110753333501, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9848392442662509, "eval_f1_unsafe": 0.9763106685487681, "eval_loss": 0.1091982051730156, "eval_macro_f1": 0.6537166376050063, "eval_runtime": 1995.848, "eval_samples_per_second": 5.876, "eval_steps_per_second": 1.469, "step": 1400 },
    { "epoch": 0.8795292660266336, "grad_norm": 1.078125, "learning_rate": 1.436130062598321e-05, "loss": 0.1376, "step": 1420 },
    { "epoch": 0.8919170021678539, "grad_norm": 30.875, "learning_rate": 1.4336263813597044e-05, "loss": 0.2345, "step": 1440 },
    { "epoch": 0.904304738309074, "grad_norm": 0.40234375, "learning_rate": 1.4310768360969748e-05, "loss": 0.0666, "step": 1460 },
    { "epoch": 0.9166924744502942, "grad_norm": 58.5, "learning_rate": 1.4284815978573712e-05, "loss": 0.1151, "step": 1480 },
    { "epoch": 0.9290802105915144, "grad_norm": 20.75, "learning_rate": 1.4258408407536437e-05, "loss": 0.1628, "step": 1500 },
    { "epoch": 0.9290802105915144, "eval_accuracy": 0.9807349694471498, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.984214613072056, "eval_f1_unsafe": 0.9752874772399701, "eval_loss": 0.11199858784675598, "eval_macro_f1": 0.653167363437342, "eval_runtime": 1994.6293, "eval_samples_per_second": 5.879, "eval_steps_per_second": 1.47, "step": 1500 },
    { "epoch": 0.9414679467327346, "grad_norm": 76.5, "learning_rate": 1.4231547419523716e-05, "loss": 0.073, "step": 1520 },
    { "epoch": 0.9538556828739548, "grad_norm": 57.0, "learning_rate": 1.4204234816620775e-05, "loss": 0.1174, "step": 1540 },
    { "epoch": 0.966243419015175, "grad_norm": 1.0234375, "learning_rate": 1.4176472431211372e-05, "loss": 0.0915, "step": 1560 },
    { "epoch": 0.9786311551563952, "grad_norm": 9.125, "learning_rate": 1.4148262125854865e-05, "loss": 0.1316, "step": 1580 },
    { "epoch": 0.9910188912976153, "grad_norm": 24.0, "learning_rate": 1.4119605793161252e-05, "loss": 0.1827, "step": 1600 },
    { "epoch": 0.9910188912976153, "eval_accuracy": 0.9785007600777688, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9823662269149198, "eval_f1_unsafe": 0.972464825268733, "eval_loss": 0.11720670759677887, "eval_macro_f1": 0.6516103507278843, "eval_runtime": 1994.2961, "eval_samples_per_second": 5.88, "eval_steps_per_second": 1.47, "step": 1600 },
    { "epoch": 1.003096934035305, "grad_norm": 9.4375, "learning_rate": 1.4090505355664204e-05, "loss": 0.0551, "step": 1620 },
    { "epoch": 1.0154846701765252, "grad_norm": 70.0, "learning_rate": 1.4060962765692071e-05, "loss": 0.1282, "step": 1640 },
    { "epoch": 1.0278724063177453, "grad_norm": 5.28125, "learning_rate": 1.4030980005236909e-05, "loss": 0.1651, "step": 1660 },
    { "epoch": 1.0402601424589657, "grad_norm": 0.8515625, "learning_rate": 1.4000559085821516e-05, "loss": 0.0613, "step": 1680 },
    { "epoch": 1.0526478786001858, "grad_norm": 41.0, "learning_rate": 1.3969702048364466e-05, "loss": 0.1932, "step": 1700 },
    { "epoch": 1.0526478786001858, "eval_accuracy": 0.9818333147656337, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9850234980707332, "eval_f1_unsafe": 0.9769161682170211, "eval_loss": 0.09109389036893845, "eval_macro_f1": 0.6539798887625848, "eval_runtime": 1994.2161, "eval_samples_per_second": 5.881, "eval_steps_per_second": 1.47, "step": 1700 },
    { "epoch": 1.065035614741406, "grad_norm": 16.625, "learning_rate": 1.39384109630432e-05, "loss": 0.0363, "step": 1720 },
    { "epoch": 1.077423350882626, "grad_norm": 6.25, "learning_rate": 1.3906687929155126e-05, "loss": 0.1233, "step": 1740 },
    { "epoch": 1.0898110870238464, "grad_norm": 1.65625, "learning_rate": 1.3874535074976783e-05, "loss": 0.0671, "step": 1760 },
    { "epoch": 1.1021988231650666, "grad_norm": 1.1171875, "learning_rate": 1.3841954557621064e-05, "loss": 0.1144, "step": 1780 },
    { "epoch": 1.1145865593062867, "grad_norm": 0.70703125, "learning_rate": 1.380894856289249e-05, "loss": 0.1096, "step": 1800 },
    { "epoch": 1.1145865593062867, "eval_accuracy": 0.9877523140744361, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9898415868750905, "eval_f1_unsafe": 0.9845811265566877, "eval_loss": 0.07772836834192276, "eval_macro_f1": 0.6581409044772594, "eval_runtime": 1994.4681, "eval_samples_per_second": 5.88, "eval_steps_per_second": 1.47, "step": 1800 },
    { "epoch": 1.126974295447507, "grad_norm": 0.10107421875, "learning_rate": 1.3775519305140562e-05, "loss": 0.0635, "step": 1820 },
    { "epoch": 1.1393620315887272, "grad_norm": 3.4375, "learning_rate": 1.3741669027111208e-05, "loss": 0.1231, "step": 1840 },
    { "epoch": 1.1517497677299473, "grad_norm": 63.0, "learning_rate": 1.370739999979632e-05, "loss": 0.03, "step": 1860 },
    { "epoch": 1.1641375038711677, "grad_norm": 21.25, "learning_rate": 1.3672714522281388e-05, "loss": 0.0977, "step": 1880 },
    { "epoch": 1.1765252400123878, "grad_norm": 5.96875, "learning_rate": 1.3637614921591264e-05, "loss": 0.1558, "step": 1900 },
    { "epoch": 1.1765252400123878, "eval_accuracy": 0.9833260140659309, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9862329815239024, "eval_f1_unsafe": 0.9788627998117704, "eval_loss": 0.07807961851358414, "eval_macro_f1": 0.655031927111891, "eval_runtime": 1993.7901, "eval_samples_per_second": 5.882, "eval_steps_per_second": 1.471, "step": 1900 },
    { "epoch": 1.188912976153608, "grad_norm": 0.134765625, "learning_rate": 1.3602103552534031e-05, "loss": 0.0994, "step": 1920 },
    { "epoch": 1.201300712294828, "grad_norm": 1.46875, "learning_rate": 1.3566182797543043e-05, "loss": 0.0687, "step": 1940 },
    { "epoch": 1.2136884484360484, "grad_norm": 1.59375, "learning_rate": 1.352985506651706e-05, "loss": 0.0575, "step": 1960 },
    { "epoch": 1.2260761845772685, "grad_norm": 0.515625, "learning_rate": 1.3493122796658592e-05, "loss": 0.0911, "step": 1980 },
    { "epoch": 1.2384639207184887, "grad_norm": 1.484375, "learning_rate": 1.345598845231038e-05, "loss": 0.0229, "step": 2000 },
    { "epoch": 1.2384639207184887, "eval_accuracy": 0.9824398915646342, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9854820774139722, "eval_f1_unsafe": 0.9777847557227568, "eval_loss": 0.08629076927900314, "eval_macro_f1": 0.6544222777122429, "eval_runtime": 1994.2838, "eval_samples_per_second": 5.88, "eval_steps_per_second": 1.47, "step": 2000 },
    { "epoch": 1.2508516568597088, "grad_norm": 12.1875, "learning_rate": 1.3418454524790067e-05, "loss": 0.0455, "step": 2020 },
    { "epoch": 1.2632393930009291, "grad_norm": 3.546875, "learning_rate": 1.3380523532223054e-05, "loss": 0.0651, "step": 2040 },
    { "epoch": 1.2756271291421493, "grad_norm": 5.8125, "learning_rate": 1.3342198019373568e-05, "loss": 0.0416, "step": 2060 },
    { "epoch": 1.2880148652833694, "grad_norm": 19.375, "learning_rate": 1.3303480557473925e-05, "loss": 0.0529, "step": 2080 },
    { "epoch": 1.3004026014245897, "grad_norm": 0.625, "learning_rate": 1.326437374405204e-05, "loss": 0.1341, "step": 2100 },
    { "epoch": 1.3004026014245897, "eval_accuracy": 0.9761251966079808, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9805451901064516, "eval_f1_unsafe": 0.9691063895273341, "eval_loss": 0.14960430562496185, "eval_macro_f1": 0.6498838598779285, "eval_runtime": 1993.9644, "eval_samples_per_second": 5.881, "eval_steps_per_second": 1.47, "step": 2100 },
    { "epoch": 1.3127903375658099, "grad_norm": 3.75, "learning_rate": 1.3224880202757141e-05, "loss": 0.0826, "step": 2120 },
    { "epoch": 1.32517807370703, "grad_norm": 0.279296875, "learning_rate": 1.318500258318378e-05, "loss": 0.1449, "step": 2140 },
    { "epoch": 1.3375658098482504, "grad_norm": 2.140625, "learning_rate": 1.3144743560694046e-05, "loss": 0.1547, "step": 2160 },
    { "epoch": 1.3499535459894705, "grad_norm": 0.93359375, "learning_rate": 1.3104105836238093e-05, "loss": 0.1091, "step": 2180 },
    { "epoch": 1.3623412821306906, "grad_norm": 19.25, "learning_rate": 1.3063092136172923e-05, "loss": 0.0411, "step": 2200 },
    { "epoch": 1.3623412821306906, "eval_accuracy": 0.9798046198832511, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.983508466217983, "eval_f1_unsafe": 0.9739551975438099, "eval_loss": 0.12951034307479858, "eval_macro_f1": 0.6524878879205976, "eval_runtime": 1994.3539, "eval_samples_per_second": 5.88, "eval_steps_per_second": 1.47, "step": 2200 },
    { "epoch": 1.3747290182719107, "grad_norm": 1.2734375, "learning_rate": 1.3021705212079489e-05, "loss": 0.0346, "step": 2220 },
    { "epoch": 1.3871167544131309, "grad_norm": 4.84375, "learning_rate": 1.2979947840578088e-05, "loss": 0.05, "step": 2240 },
    { "epoch": 1.3995044905543512, "grad_norm": 2.53125, "learning_rate": 1.2937822823142075e-05, "loss": 0.0295, "step": 2260 },
    { "epoch": 1.4118922266955714, "grad_norm": 4.125, "learning_rate": 1.2895332985909917e-05, "loss": 0.0247, "step": 2280 },
    { "epoch": 1.4242799628367915, "grad_norm": 0.859375, "learning_rate": 1.2852481179495598e-05, "loss": 0.0995, "step": 2300 },
    { "epoch": 1.4242799628367915, "eval_accuracy": 0.9826931310292615, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9857166974218224, "eval_f1_unsafe": 0.9780457337441255, "eval_loss": 0.08260737359523773, "eval_macro_f1": 0.654587477055316, "eval_runtime": 1994.3821, "eval_samples_per_second": 5.88, "eval_steps_per_second": 1.47, "step": 2300 },
    { "epoch": 1.4366676989780118, "grad_norm": 0.98046875, "learning_rate": 1.2809270278797362e-05, "loss": 0.0237, "step": 2320 },
    { "epoch": 1.449055435119232, "grad_norm": 0.1689453125, "learning_rate": 1.2765703182804838e-05, "loss": 0.1853, "step": 2340 },
    { "epoch": 1.461443171260452, "grad_norm": 0.37890625, "learning_rate": 1.2721782814404554e-05, "loss": 0.0685, "step": 2360 },
    { "epoch": 1.4738309074016724, "grad_norm": 1.59375, "learning_rate": 1.2677512120183843e-05, "loss": 0.098, "step": 2380 },
    { "epoch": 1.4862186435428926, "grad_norm": 0.66796875, "learning_rate": 1.2632894070233157e-05, "loss": 0.0969, "step": 2400 },
    { "epoch": 1.4862186435428926, "eval_accuracy": 0.9844363385454663, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.98716291618377, "eval_f1_unsafe": 0.9802392592072087, "eval_loss": 0.07929345965385437, "eval_macro_f1": 0.6558007251303262, "eval_runtime": 1993.9159, "eval_samples_per_second": 5.881, "eval_steps_per_second": 1.47, "step": 2400 },
    { "epoch": 1.4986063796841127, "grad_norm": 0.376953125, "learning_rate": 1.2587931657946806e-05, "loss": 0.0766, "step": 2420 },
    { "epoch": 1.510994115825333, "grad_norm": 4.9375, "learning_rate": 1.2542627899822127e-05, "loss": 0.0969, "step": 2440 },
    { "epoch": 1.523381851966553, "grad_norm": 4.59375, "learning_rate": 1.249698583525712e-05, "loss": 0.0476, "step": 2460 },
    { "epoch": 1.5357695881077733, "grad_norm": 0.3359375, "learning_rate": 1.245100852634653e-05, "loss": 0.1034, "step": 2480 },
    { "epoch": 1.5481573242489937, "grad_norm": 0.484375, "learning_rate": 1.2404699057676415e-05, "loss": 0.0748, "step": 2500 },
    { "epoch": 1.5481573242489937, "eval_accuracy": 0.9836527097461256, "eval_f1_controversial": 0.0, "eval_f1_safe": 0.9865615120011499, "eval_f1_unsafe": 0.9791368071911146, "eval_loss": 0.0919911116361618, "eval_macro_f1": 0.6552327730640882, "eval_runtime": 1995.1921, "eval_samples_per_second": 5.878, "eval_steps_per_second": 1.47, "step": 2500 }
  ],
  "logging_steps": 20,
  "max_steps": 8075,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8442537274070835e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}