codeguard-ps-stream-0.6b / trainer_state.json
marksverdhei's picture
Upload folder using huggingface_hub
dfd5819 verified
{
"best_global_step": 1800,
"best_metric": 0.07772836834192276,
"best_model_checkpoint": "./outputs/powershell-production/checkpoint-1800",
"epoch": 1.5481573242489937,
"eval_steps": 100,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012387736141220192,
"grad_norm": 296.0,
"learning_rate": 7.054455445544555e-07,
"loss": 4.6719,
"step": 20
},
{
"epoch": 0.024775472282440383,
"grad_norm": 149.0,
"learning_rate": 1.448019801980198e-06,
"loss": 4.032,
"step": 40
},
{
"epoch": 0.037163208423660575,
"grad_norm": 76.0,
"learning_rate": 2.1905940594059405e-06,
"loss": 3.7462,
"step": 60
},
{
"epoch": 0.04955094456488077,
"grad_norm": 82.0,
"learning_rate": 2.9331683168316834e-06,
"loss": 2.6629,
"step": 80
},
{
"epoch": 0.06193868070610096,
"grad_norm": 67.5,
"learning_rate": 3.675742574257426e-06,
"loss": 1.4688,
"step": 100
},
{
"epoch": 0.06193868070610096,
"eval_accuracy": 0.8779996238543429,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9087364573635998,
"eval_f1_unsafe": 0.8224576644206463,
"eval_loss": 0.8046298027038574,
"eval_macro_f1": 0.5770647072614153,
"eval_runtime": 1995.9982,
"eval_samples_per_second": 5.875,
"eval_steps_per_second": 1.469,
"step": 100
},
{
"epoch": 0.07432641684732115,
"grad_norm": 14.6875,
"learning_rate": 4.418316831683168e-06,
"loss": 0.6992,
"step": 120
},
{
"epoch": 0.08671415298854135,
"grad_norm": 19.75,
"learning_rate": 5.160891089108911e-06,
"loss": 0.5021,
"step": 140
},
{
"epoch": 0.09910188912976153,
"grad_norm": 85.0,
"learning_rate": 5.903465346534654e-06,
"loss": 0.4075,
"step": 160
},
{
"epoch": 0.11148962527098173,
"grad_norm": 78.0,
"learning_rate": 6.646039603960397e-06,
"loss": 0.2943,
"step": 180
},
{
"epoch": 0.12387736141220192,
"grad_norm": 8.0625,
"learning_rate": 7.388613861386139e-06,
"loss": 0.3197,
"step": 200
},
{
"epoch": 0.12387736141220192,
"eval_accuracy": 0.950703116857914,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9602635885015949,
"eval_f1_unsafe": 0.9357709157041263,
"eval_loss": 0.21172067523002625,
"eval_macro_f1": 0.6320115014019071,
"eval_runtime": 1997.1894,
"eval_samples_per_second": 5.872,
"eval_steps_per_second": 1.468,
"step": 200
},
{
"epoch": 0.13626509755342212,
"grad_norm": 32.0,
"learning_rate": 8.131188118811882e-06,
"loss": 0.3174,
"step": 220
},
{
"epoch": 0.1486528336946423,
"grad_norm": 102.0,
"learning_rate": 8.873762376237623e-06,
"loss": 0.2517,
"step": 240
},
{
"epoch": 0.16104056983586248,
"grad_norm": 61.5,
"learning_rate": 9.616336633663367e-06,
"loss": 0.2532,
"step": 260
},
{
"epoch": 0.1734283059770827,
"grad_norm": 52.75,
"learning_rate": 1.0358910891089109e-05,
"loss": 0.1548,
"step": 280
},
{
"epoch": 0.18581604211830288,
"grad_norm": 58.25,
"learning_rate": 1.1101485148514851e-05,
"loss": 0.1953,
"step": 300
},
{
"epoch": 0.18581604211830288,
"eval_accuracy": 0.9621405323299588,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9684133256371634,
"eval_f1_unsafe": 0.9531137279206326,
"eval_loss": 0.1852019727230072,
"eval_macro_f1": 0.6405090178525987,
"eval_runtime": 1997.4497,
"eval_samples_per_second": 5.871,
"eval_steps_per_second": 1.468,
"step": 300
},
{
"epoch": 0.19820377825952307,
"grad_norm": 13.25,
"learning_rate": 1.1844059405940594e-05,
"loss": 0.183,
"step": 320
},
{
"epoch": 0.21059151440074325,
"grad_norm": 8.8125,
"learning_rate": 1.2586633663366337e-05,
"loss": 0.2541,
"step": 340
},
{
"epoch": 0.22297925054196346,
"grad_norm": 17.0,
"learning_rate": 1.332920792079208e-05,
"loss": 0.1851,
"step": 360
},
{
"epoch": 0.23536698668318365,
"grad_norm": 62.25,
"learning_rate": 1.4071782178217821e-05,
"loss": 0.2287,
"step": 380
},
{
"epoch": 0.24775472282440383,
"grad_norm": 8.125,
"learning_rate": 1.4814356435643564e-05,
"loss": 0.1806,
"step": 400
},
{
"epoch": 0.24775472282440383,
"eval_accuracy": 0.9719879297973246,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9767870924112939,
"eval_f1_unsafe": 0.9646897646101082,
"eval_loss": 0.13077302277088165,
"eval_macro_f1": 0.6471589523404674,
"eval_runtime": 1996.7162,
"eval_samples_per_second": 5.873,
"eval_steps_per_second": 1.468,
"step": 400
},
{
"epoch": 0.26014245896562405,
"grad_norm": 0.75390625,
"learning_rate": 1.499985848313474e-05,
"loss": 0.2029,
"step": 420
},
{
"epoch": 0.27253019510684423,
"grad_norm": 21.625,
"learning_rate": 1.4999229530058107e-05,
"loss": 0.228,
"step": 440
},
{
"epoch": 0.2849179312480644,
"grad_norm": 88.5,
"learning_rate": 1.4998097458826036e-05,
"loss": 0.1709,
"step": 460
},
{
"epoch": 0.2973056673892846,
"grad_norm": 1.4375,
"learning_rate": 1.4996462345388408e-05,
"loss": 0.1309,
"step": 480
},
{
"epoch": 0.3096934035305048,
"grad_norm": 73.5,
"learning_rate": 1.499432429944386e-05,
"loss": 0.1862,
"step": 500
},
{
"epoch": 0.3096934035305048,
"eval_accuracy": 0.9698197016470148,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9749329791096684,
"eval_f1_unsafe": 0.9620869563961173,
"eval_loss": 0.16210031509399414,
"eval_macro_f1": 0.6456733118352619,
"eval_runtime": 1996.9786,
"eval_samples_per_second": 5.872,
"eval_steps_per_second": 1.468,
"step": 500
},
{
"epoch": 0.32208113967172497,
"grad_norm": 1.21875,
"learning_rate": 1.4991683464432428e-05,
"loss": 0.0715,
"step": 520
},
{
"epoch": 0.3344688758129452,
"grad_norm": 2.625,
"learning_rate": 1.4988540017525911e-05,
"loss": 0.2116,
"step": 540
},
{
"epoch": 0.3468566119541654,
"grad_norm": 124.0,
"learning_rate": 1.4984894169616006e-05,
"loss": 0.1838,
"step": 560
},
{
"epoch": 0.3592443480953856,
"grad_norm": 3.703125,
"learning_rate": 1.4980746165300146e-05,
"loss": 0.1586,
"step": 580
},
{
"epoch": 0.37163208423660576,
"grad_norm": 14.5,
"learning_rate": 1.4976096282865085e-05,
"loss": 0.1157,
"step": 600
},
{
"epoch": 0.37163208423660576,
"eval_accuracy": 0.9606404538664539,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9680893048977398,
"eval_f1_unsafe": 0.948655118870566,
"eval_loss": 0.24218252301216125,
"eval_macro_f1": 0.6389148079227686,
"eval_runtime": 1997.0786,
"eval_samples_per_second": 5.872,
"eval_steps_per_second": 1.468,
"step": 600
},
{
"epoch": 0.38401982037782595,
"grad_norm": 3.640625,
"learning_rate": 1.4970944834268245e-05,
"loss": 0.1796,
"step": 620
},
{
"epoch": 0.39640755651904613,
"grad_norm": 1.0546875,
"learning_rate": 1.4965292165116766e-05,
"loss": 0.1243,
"step": 640
},
{
"epoch": 0.4087952926602663,
"grad_norm": 0.44921875,
"learning_rate": 1.495913865464434e-05,
"loss": 0.1263,
"step": 660
},
{
"epoch": 0.4211830288014865,
"grad_norm": 1.3671875,
"learning_rate": 1.4952484715685758e-05,
"loss": 0.1291,
"step": 680
},
{
"epoch": 0.43357076494270674,
"grad_norm": 6.8125,
"learning_rate": 1.4945330794649209e-05,
"loss": 0.1866,
"step": 700
},
{
"epoch": 0.43357076494270674,
"eval_accuracy": 0.9692282665075834,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9746824060081962,
"eval_f1_unsafe": 0.9607789140941978,
"eval_loss": 0.15051080286502838,
"eval_macro_f1": 0.6451537733674647,
"eval_runtime": 1996.287,
"eval_samples_per_second": 5.874,
"eval_steps_per_second": 1.469,
"step": 700
},
{
"epoch": 0.44595850108392693,
"grad_norm": 19.375,
"learning_rate": 1.493767737148634e-05,
"loss": 0.1021,
"step": 720
},
{
"epoch": 0.4583462372251471,
"grad_norm": 1.2265625,
"learning_rate": 1.492952495966005e-05,
"loss": 0.1652,
"step": 740
},
{
"epoch": 0.4707339733663673,
"grad_norm": 67.5,
"learning_rate": 1.4920874106110049e-05,
"loss": 0.0815,
"step": 760
},
{
"epoch": 0.4831217095075875,
"grad_norm": 7.65625,
"learning_rate": 1.4911725391216151e-05,
"loss": 0.1006,
"step": 780
},
{
"epoch": 0.49550944564880767,
"grad_norm": 2.6875,
"learning_rate": 1.4902079428759355e-05,
"loss": 0.1625,
"step": 800
},
{
"epoch": 0.49550944564880767,
"eval_accuracy": 0.9683618856636994,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9743468963029092,
"eval_f1_unsafe": 0.958734391511112,
"eval_loss": 0.17920953035354614,
"eval_macro_f1": 0.6443604292713404,
"eval_runtime": 1996.9022,
"eval_samples_per_second": 5.873,
"eval_steps_per_second": 1.468,
"step": 800
},
{
"epoch": 0.5078971817900279,
"grad_norm": 1.4453125,
"learning_rate": 1.4891936865880652e-05,
"loss": 0.2079,
"step": 820
},
{
"epoch": 0.5202849179312481,
"grad_norm": 205.0,
"learning_rate": 1.4881298383037618e-05,
"loss": 0.0709,
"step": 840
},
{
"epoch": 0.5326726540724682,
"grad_norm": 0.71875,
"learning_rate": 1.4870164693958752e-05,
"loss": 0.1639,
"step": 860
},
{
"epoch": 0.5450603902136885,
"grad_norm": 4.84375,
"learning_rate": 1.4858536545595602e-05,
"loss": 0.0897,
"step": 880
},
{
"epoch": 0.5574481263549086,
"grad_norm": 64.5,
"learning_rate": 1.4846414718072656e-05,
"loss": 0.2061,
"step": 900
},
{
"epoch": 0.5574481263549086,
"eval_accuracy": 0.9738056476952693,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9784293070754638,
"eval_f1_unsafe": 0.9666591344542607,
"eval_loss": 0.12458275258541107,
"eval_macro_f1": 0.6483628138432415,
"eval_runtime": 1995.9769,
"eval_samples_per_second": 5.875,
"eval_steps_per_second": 1.469,
"step": 900
},
{
"epoch": 0.5698358624961288,
"grad_norm": 0.314453125,
"learning_rate": 1.4833800024634986e-05,
"loss": 0.1909,
"step": 920
},
{
"epoch": 0.5822235986373491,
"grad_norm": 0.25390625,
"learning_rate": 1.4820693311593708e-05,
"loss": 0.1375,
"step": 940
},
{
"epoch": 0.5946113347785692,
"grad_norm": 2.828125,
"learning_rate": 1.4807095458269194e-05,
"loss": 0.0788,
"step": 960
},
{
"epoch": 0.6069990709197894,
"grad_norm": 10.4375,
"learning_rate": 1.4793007376932077e-05,
"loss": 0.0933,
"step": 980
},
{
"epoch": 0.6193868070610096,
"grad_norm": 42.25,
"learning_rate": 1.4778430012742053e-05,
"loss": 0.2083,
"step": 1000
},
{
"epoch": 0.6193868070610096,
"eval_accuracy": 0.9669330113334843,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9732207557904943,
"eval_f1_unsafe": 0.9567865224376395,
"eval_loss": 0.1653885543346405,
"eval_macro_f1": 0.6433357594093779,
"eval_runtime": 1996.6331,
"eval_samples_per_second": 5.873,
"eval_steps_per_second": 1.468,
"step": 1000
},
{
"epoch": 0.6317745432022298,
"grad_norm": 36.75,
"learning_rate": 1.4763364343684464e-05,
"loss": 0.0689,
"step": 1020
},
{
"epoch": 0.6441622793434499,
"grad_norm": 4.09375,
"learning_rate": 1.4747811380504698e-05,
"loss": 0.0984,
"step": 1040
},
{
"epoch": 0.6565500154846702,
"grad_norm": 1.7890625,
"learning_rate": 1.4731772166640363e-05,
"loss": 0.0894,
"step": 1060
},
{
"epoch": 0.6689377516258904,
"grad_norm": 0.50390625,
"learning_rate": 1.4715247778151297e-05,
"loss": 0.0969,
"step": 1080
},
{
"epoch": 0.6813254877671105,
"grad_norm": 6.1875,
"learning_rate": 1.4698239323647365e-05,
"loss": 0.2052,
"step": 1100
},
{
"epoch": 0.6813254877671105,
"eval_accuracy": 0.9829721975651158,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9859135534089479,
"eval_f1_unsafe": 0.9784783062783209,
"eval_loss": 0.08367573469877243,
"eval_macro_f1": 0.654797286562423,
"eval_runtime": 1996.2626,
"eval_samples_per_second": 5.874,
"eval_steps_per_second": 1.469,
"step": 1100
},
{
"epoch": 0.6937132239083308,
"grad_norm": 0.3125,
"learning_rate": 1.4680747944214093e-05,
"loss": 0.0508,
"step": 1120
},
{
"epoch": 0.7061009600495509,
"grad_norm": 0.34375,
"learning_rate": 1.4662774813336105e-05,
"loss": 0.0915,
"step": 1140
},
{
"epoch": 0.7184886961907712,
"grad_norm": 6.34375,
"learning_rate": 1.4644321136818402e-05,
"loss": 0.2418,
"step": 1160
},
{
"epoch": 0.7308764323319913,
"grad_norm": 134.0,
"learning_rate": 1.4625388152705457e-05,
"loss": 0.094,
"step": 1180
},
{
"epoch": 0.7432641684732115,
"grad_norm": 1.34375,
"learning_rate": 1.4605977131198166e-05,
"loss": 0.1194,
"step": 1200
},
{
"epoch": 0.7432641684732115,
"eval_accuracy": 0.979379790915723,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9830450283559968,
"eval_f1_unsafe": 0.9736928478058482,
"eval_loss": 0.11758451163768768,
"eval_macro_f1": 0.652245958720615,
"eval_runtime": 1996.2276,
"eval_samples_per_second": 5.875,
"eval_steps_per_second": 1.469,
"step": 1200
},
{
"epoch": 0.7556519046144317,
"grad_norm": 2.84375,
"learning_rate": 1.4586089374568616e-05,
"loss": 0.2312,
"step": 1220
},
{
"epoch": 0.7680396407556519,
"grad_norm": 0.2021484375,
"learning_rate": 1.4565726217072738e-05,
"loss": 0.1048,
"step": 1240
},
{
"epoch": 0.7804273768968721,
"grad_norm": 1.6484375,
"learning_rate": 1.454488902486077e-05,
"loss": 0.0491,
"step": 1260
},
{
"epoch": 0.7928151130380923,
"grad_norm": 3.109375,
"learning_rate": 1.452357919588562e-05,
"loss": 0.2089,
"step": 1280
},
{
"epoch": 0.8052028491793125,
"grad_norm": 15.0,
"learning_rate": 1.4501798159809068e-05,
"loss": 0.1261,
"step": 1300
},
{
"epoch": 0.8052028491793125,
"eval_accuracy": 0.9768035525400014,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.981035749707623,
"eval_f1_unsafe": 0.9701397197118314,
"eval_loss": 0.13235369324684143,
"eval_macro_f1": 0.6503918231398181,
"eval_runtime": 1996.4333,
"eval_samples_per_second": 5.874,
"eval_steps_per_second": 1.469,
"step": 1300
},
{
"epoch": 0.8175905853205326,
"grad_norm": 0.373046875,
"learning_rate": 1.4479547377905856e-05,
"loss": 0.1104,
"step": 1320
},
{
"epoch": 0.8299783214617529,
"grad_norm": 1.890625,
"learning_rate": 1.445682834296565e-05,
"loss": 0.1023,
"step": 1340
},
{
"epoch": 0.842366057602973,
"grad_norm": 0.10888671875,
"learning_rate": 1.4433642579192891e-05,
"loss": 0.1085,
"step": 1360
},
{
"epoch": 0.8547537937441932,
"grad_norm": 2.984375,
"learning_rate": 1.4409991642104537e-05,
"loss": 0.1881,
"step": 1380
},
{
"epoch": 0.8671415298854135,
"grad_norm": 37.5,
"learning_rate": 1.4385877118425702e-05,
"loss": 0.1471,
"step": 1400
},
{
"epoch": 0.8671415298854135,
"eval_accuracy": 0.9815110753333501,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9848392442662509,
"eval_f1_unsafe": 0.9763106685487681,
"eval_loss": 0.1091982051730156,
"eval_macro_f1": 0.6537166376050063,
"eval_runtime": 1995.848,
"eval_samples_per_second": 5.876,
"eval_steps_per_second": 1.469,
"step": 1400
},
{
"epoch": 0.8795292660266336,
"grad_norm": 1.078125,
"learning_rate": 1.436130062598321e-05,
"loss": 0.1376,
"step": 1420
},
{
"epoch": 0.8919170021678539,
"grad_norm": 30.875,
"learning_rate": 1.4336263813597044e-05,
"loss": 0.2345,
"step": 1440
},
{
"epoch": 0.904304738309074,
"grad_norm": 0.40234375,
"learning_rate": 1.4310768360969748e-05,
"loss": 0.0666,
"step": 1460
},
{
"epoch": 0.9166924744502942,
"grad_norm": 58.5,
"learning_rate": 1.4284815978573712e-05,
"loss": 0.1151,
"step": 1480
},
{
"epoch": 0.9290802105915144,
"grad_norm": 20.75,
"learning_rate": 1.4258408407536437e-05,
"loss": 0.1628,
"step": 1500
},
{
"epoch": 0.9290802105915144,
"eval_accuracy": 0.9807349694471498,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.984214613072056,
"eval_f1_unsafe": 0.9752874772399701,
"eval_loss": 0.11199858784675598,
"eval_macro_f1": 0.653167363437342,
"eval_runtime": 1994.6293,
"eval_samples_per_second": 5.879,
"eval_steps_per_second": 1.47,
"step": 1500
},
{
"epoch": 0.9414679467327346,
"grad_norm": 76.5,
"learning_rate": 1.4231547419523716e-05,
"loss": 0.073,
"step": 1520
},
{
"epoch": 0.9538556828739548,
"grad_norm": 57.0,
"learning_rate": 1.4204234816620775e-05,
"loss": 0.1174,
"step": 1540
},
{
"epoch": 0.966243419015175,
"grad_norm": 1.0234375,
"learning_rate": 1.4176472431211372e-05,
"loss": 0.0915,
"step": 1560
},
{
"epoch": 0.9786311551563952,
"grad_norm": 9.125,
"learning_rate": 1.4148262125854865e-05,
"loss": 0.1316,
"step": 1580
},
{
"epoch": 0.9910188912976153,
"grad_norm": 24.0,
"learning_rate": 1.4119605793161252e-05,
"loss": 0.1827,
"step": 1600
},
{
"epoch": 0.9910188912976153,
"eval_accuracy": 0.9785007600777688,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9823662269149198,
"eval_f1_unsafe": 0.972464825268733,
"eval_loss": 0.11720670759677887,
"eval_macro_f1": 0.6516103507278843,
"eval_runtime": 1994.2961,
"eval_samples_per_second": 5.88,
"eval_steps_per_second": 1.47,
"step": 1600
},
{
"epoch": 1.003096934035305,
"grad_norm": 9.4375,
"learning_rate": 1.4090505355664204e-05,
"loss": 0.0551,
"step": 1620
},
{
"epoch": 1.0154846701765252,
"grad_norm": 70.0,
"learning_rate": 1.4060962765692071e-05,
"loss": 0.1282,
"step": 1640
},
{
"epoch": 1.0278724063177453,
"grad_norm": 5.28125,
"learning_rate": 1.4030980005236909e-05,
"loss": 0.1651,
"step": 1660
},
{
"epoch": 1.0402601424589657,
"grad_norm": 0.8515625,
"learning_rate": 1.4000559085821516e-05,
"loss": 0.0613,
"step": 1680
},
{
"epoch": 1.0526478786001858,
"grad_norm": 41.0,
"learning_rate": 1.3969702048364466e-05,
"loss": 0.1932,
"step": 1700
},
{
"epoch": 1.0526478786001858,
"eval_accuracy": 0.9818333147656337,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9850234980707332,
"eval_f1_unsafe": 0.9769161682170211,
"eval_loss": 0.09109389036893845,
"eval_macro_f1": 0.6539798887625848,
"eval_runtime": 1994.2161,
"eval_samples_per_second": 5.881,
"eval_steps_per_second": 1.47,
"step": 1700
},
{
"epoch": 1.065035614741406,
"grad_norm": 16.625,
"learning_rate": 1.39384109630432e-05,
"loss": 0.0363,
"step": 1720
},
{
"epoch": 1.077423350882626,
"grad_norm": 6.25,
"learning_rate": 1.3906687929155126e-05,
"loss": 0.1233,
"step": 1740
},
{
"epoch": 1.0898110870238464,
"grad_norm": 1.65625,
"learning_rate": 1.3874535074976783e-05,
"loss": 0.0671,
"step": 1760
},
{
"epoch": 1.1021988231650666,
"grad_norm": 1.1171875,
"learning_rate": 1.3841954557621064e-05,
"loss": 0.1144,
"step": 1780
},
{
"epoch": 1.1145865593062867,
"grad_norm": 0.70703125,
"learning_rate": 1.380894856289249e-05,
"loss": 0.1096,
"step": 1800
},
{
"epoch": 1.1145865593062867,
"eval_accuracy": 0.9877523140744361,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9898415868750905,
"eval_f1_unsafe": 0.9845811265566877,
"eval_loss": 0.07772836834192276,
"eval_macro_f1": 0.6581409044772594,
"eval_runtime": 1994.4681,
"eval_samples_per_second": 5.88,
"eval_steps_per_second": 1.47,
"step": 1800
},
{
"epoch": 1.126974295447507,
"grad_norm": 0.10107421875,
"learning_rate": 1.3775519305140562e-05,
"loss": 0.0635,
"step": 1820
},
{
"epoch": 1.1393620315887272,
"grad_norm": 3.4375,
"learning_rate": 1.3741669027111208e-05,
"loss": 0.1231,
"step": 1840
},
{
"epoch": 1.1517497677299473,
"grad_norm": 63.0,
"learning_rate": 1.370739999979632e-05,
"loss": 0.03,
"step": 1860
},
{
"epoch": 1.1641375038711677,
"grad_norm": 21.25,
"learning_rate": 1.3672714522281388e-05,
"loss": 0.0977,
"step": 1880
},
{
"epoch": 1.1765252400123878,
"grad_norm": 5.96875,
"learning_rate": 1.3637614921591264e-05,
"loss": 0.1558,
"step": 1900
},
{
"epoch": 1.1765252400123878,
"eval_accuracy": 0.9833260140659309,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9862329815239024,
"eval_f1_unsafe": 0.9788627998117704,
"eval_loss": 0.07807961851358414,
"eval_macro_f1": 0.655031927111891,
"eval_runtime": 1993.7901,
"eval_samples_per_second": 5.882,
"eval_steps_per_second": 1.471,
"step": 1900
},
{
"epoch": 1.188912976153608,
"grad_norm": 0.134765625,
"learning_rate": 1.3602103552534031e-05,
"loss": 0.0994,
"step": 1920
},
{
"epoch": 1.201300712294828,
"grad_norm": 1.46875,
"learning_rate": 1.3566182797543043e-05,
"loss": 0.0687,
"step": 1940
},
{
"epoch": 1.2136884484360484,
"grad_norm": 1.59375,
"learning_rate": 1.352985506651706e-05,
"loss": 0.0575,
"step": 1960
},
{
"epoch": 1.2260761845772685,
"grad_norm": 0.515625,
"learning_rate": 1.3493122796658592e-05,
"loss": 0.0911,
"step": 1980
},
{
"epoch": 1.2384639207184887,
"grad_norm": 1.484375,
"learning_rate": 1.345598845231038e-05,
"loss": 0.0229,
"step": 2000
},
{
"epoch": 1.2384639207184887,
"eval_accuracy": 0.9824398915646342,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9854820774139722,
"eval_f1_unsafe": 0.9777847557227568,
"eval_loss": 0.08629076927900314,
"eval_macro_f1": 0.6544222777122429,
"eval_runtime": 1994.2838,
"eval_samples_per_second": 5.88,
"eval_steps_per_second": 1.47,
"step": 2000
},
{
"epoch": 1.2508516568597088,
"grad_norm": 12.1875,
"learning_rate": 1.3418454524790067e-05,
"loss": 0.0455,
"step": 2020
},
{
"epoch": 1.2632393930009291,
"grad_norm": 3.546875,
"learning_rate": 1.3380523532223054e-05,
"loss": 0.0651,
"step": 2040
},
{
"epoch": 1.2756271291421493,
"grad_norm": 5.8125,
"learning_rate": 1.3342198019373568e-05,
"loss": 0.0416,
"step": 2060
},
{
"epoch": 1.2880148652833694,
"grad_norm": 19.375,
"learning_rate": 1.3303480557473925e-05,
"loss": 0.0529,
"step": 2080
},
{
"epoch": 1.3004026014245897,
"grad_norm": 0.625,
"learning_rate": 1.326437374405204e-05,
"loss": 0.1341,
"step": 2100
},
{
"epoch": 1.3004026014245897,
"eval_accuracy": 0.9761251966079808,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9805451901064516,
"eval_f1_unsafe": 0.9691063895273341,
"eval_loss": 0.14960430562496185,
"eval_macro_f1": 0.6498838598779285,
"eval_runtime": 1993.9644,
"eval_samples_per_second": 5.881,
"eval_steps_per_second": 1.47,
"step": 2100
},
{
"epoch": 1.3127903375658099,
"grad_norm": 3.75,
"learning_rate": 1.3224880202757141e-05,
"loss": 0.0826,
"step": 2120
},
{
"epoch": 1.32517807370703,
"grad_norm": 0.279296875,
"learning_rate": 1.318500258318378e-05,
"loss": 0.1449,
"step": 2140
},
{
"epoch": 1.3375658098482504,
"grad_norm": 2.140625,
"learning_rate": 1.3144743560694046e-05,
"loss": 0.1547,
"step": 2160
},
{
"epoch": 1.3499535459894705,
"grad_norm": 0.93359375,
"learning_rate": 1.3104105836238093e-05,
"loss": 0.1091,
"step": 2180
},
{
"epoch": 1.3623412821306906,
"grad_norm": 19.25,
"learning_rate": 1.3063092136172923e-05,
"loss": 0.0411,
"step": 2200
},
{
"epoch": 1.3623412821306906,
"eval_accuracy": 0.9798046198832511,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.983508466217983,
"eval_f1_unsafe": 0.9739551975438099,
"eval_loss": 0.12951034307479858,
"eval_macro_f1": 0.6524878879205976,
"eval_runtime": 1994.3539,
"eval_samples_per_second": 5.88,
"eval_steps_per_second": 1.47,
"step": 2200
},
{
"epoch": 1.3747290182719107,
"grad_norm": 1.2734375,
"learning_rate": 1.3021705212079489e-05,
"loss": 0.0346,
"step": 2220
},
{
"epoch": 1.3871167544131309,
"grad_norm": 4.84375,
"learning_rate": 1.2979947840578088e-05,
"loss": 0.05,
"step": 2240
},
{
"epoch": 1.3995044905543512,
"grad_norm": 2.53125,
"learning_rate": 1.2937822823142075e-05,
"loss": 0.0295,
"step": 2260
},
{
"epoch": 1.4118922266955714,
"grad_norm": 4.125,
"learning_rate": 1.2895332985909917e-05,
"loss": 0.0247,
"step": 2280
},
{
"epoch": 1.4242799628367915,
"grad_norm": 0.859375,
"learning_rate": 1.2852481179495598e-05,
"loss": 0.0995,
"step": 2300
},
{
"epoch": 1.4242799628367915,
"eval_accuracy": 0.9826931310292615,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9857166974218224,
"eval_f1_unsafe": 0.9780457337441255,
"eval_loss": 0.08260737359523773,
"eval_macro_f1": 0.654587477055316,
"eval_runtime": 1994.3821,
"eval_samples_per_second": 5.88,
"eval_steps_per_second": 1.47,
"step": 2300
},
{
"epoch": 1.4366676989780118,
"grad_norm": 0.98046875,
"learning_rate": 1.2809270278797362e-05,
"loss": 0.0237,
"step": 2320
},
{
"epoch": 1.449055435119232,
"grad_norm": 0.1689453125,
"learning_rate": 1.2765703182804838e-05,
"loss": 0.1853,
"step": 2340
},
{
"epoch": 1.461443171260452,
"grad_norm": 0.37890625,
"learning_rate": 1.2721782814404554e-05,
"loss": 0.0685,
"step": 2360
},
{
"epoch": 1.4738309074016724,
"grad_norm": 1.59375,
"learning_rate": 1.2677512120183843e-05,
"loss": 0.098,
"step": 2380
},
{
"epoch": 1.4862186435428926,
"grad_norm": 0.66796875,
"learning_rate": 1.2632894070233157e-05,
"loss": 0.0969,
"step": 2400
},
{
"epoch": 1.4862186435428926,
"eval_accuracy": 0.9844363385454663,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.98716291618377,
"eval_f1_unsafe": 0.9802392592072087,
"eval_loss": 0.07929345965385437,
"eval_macro_f1": 0.6558007251303262,
"eval_runtime": 1993.9159,
"eval_samples_per_second": 5.881,
"eval_steps_per_second": 1.47,
"step": 2400
},
{
"epoch": 1.4986063796841127,
"grad_norm": 0.376953125,
"learning_rate": 1.2587931657946806e-05,
"loss": 0.0766,
"step": 2420
},
{
"epoch": 1.510994115825333,
"grad_norm": 4.9375,
"learning_rate": 1.2542627899822127e-05,
"loss": 0.0969,
"step": 2440
},
{
"epoch": 1.523381851966553,
"grad_norm": 4.59375,
"learning_rate": 1.249698583525712e-05,
"loss": 0.0476,
"step": 2460
},
{
"epoch": 1.5357695881077733,
"grad_norm": 0.3359375,
"learning_rate": 1.245100852634653e-05,
"loss": 0.1034,
"step": 2480
},
{
"epoch": 1.5481573242489937,
"grad_norm": 0.484375,
"learning_rate": 1.2404699057676415e-05,
"loss": 0.0748,
"step": 2500
},
{
"epoch": 1.5481573242489937,
"eval_accuracy": 0.9836527097461256,
"eval_f1_controversial": 0.0,
"eval_f1_safe": 0.9865615120011499,
"eval_f1_unsafe": 0.9791368071911146,
"eval_loss": 0.0919911116361618,
"eval_macro_f1": 0.6552327730640882,
"eval_runtime": 1995.1921,
"eval_samples_per_second": 5.878,
"eval_steps_per_second": 1.47,
"step": 2500
}
],
"logging_steps": 20,
"max_steps": 8075,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.8442537274070835e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}