{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996363636363636, "eval_steps": 500, "global_step": 1236, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.6635121252609261, "importance_ratio": 1.0, "learning_rate": 0.0, "loss": 0.0211, "ppo_loss": 1.0, "sft_loss": 0.06775619834661484, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.705460294248276, "importance_ratio": 1.0, "kl_div": 0.0002822639944497496, "kl_div_neg": 0.0002822639944497496, "learning_rate": 6.020599913279622e-07, "loss": 0.0824, "ppo_loss": 1.0002822875976562, "step": 2 }, { "epoch": 0.01, "grad_norm": 0.7112764661319769, "kl_div": 0.0009204513626173139, "kl_div_sft": 0.0009204513626173139, "learning_rate": 9.542425094393247e-07, "loss": -0.1509, "sft_loss": 0.04132953658699989, "step": 3 }, { "epoch": 0.01, "grad_norm": 0.7529159524854754, "kl_div": -0.00046564757940359414, "kl_div_sft": -0.00046564757940359414, "learning_rate": 1.2041199826559244e-06, "loss": -0.0736, "sft_loss": 0.05602573603391647, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.7418723792629985, "importance_ratio": 1.0, "kl_div": -0.0001351796672679484, "kl_div_neg": 0.0006867063930258155, "kl_div_sft": -0.0009570657275617123, "learning_rate": 1.3979400086720373e-06, "loss": 0.0196, "ppo_loss": 1.0006868839263916, "sft_loss": 0.04666399583220482, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.669249194100899, "importance_ratio": 1.0, "kl_div": -0.00030084114405326545, "kl_div_pos": -0.00030084114405326545, "learning_rate": 1.556302500767287e-06, "loss": 0.1155, "ppo_loss": -0.9996995329856873, "step": 6 }, { "epoch": 0.02, "grad_norm": 0.7068742609399631, "importance_ratio": 1.0, "kl_div": -0.0005346988327801228, "kl_div_neg": -0.0011013677576556802, "kl_div_sft": 3.19700593536254e-05, "learning_rate": 1.6901960800285134e-06, "loss": 0.0055, "ppo_loss": 0.9988992810249329, "sft_loss": 0.1200389564037323, "step": 7 }, { "epoch": 0.02, "grad_norm": 0.6876259168276546, "importance_ratio": 1.0, "kl_div": 8.872683974914253e-05, "kl_div_neg": -0.0002661606704350561, "kl_div_sft": 0.00044361434993334115, "learning_rate": 1.8061799739838866e-06, "loss": 0.1156, "ppo_loss": 0.9997339248657227, "sft_loss": 0.08696060627698898, "step": 8 }, { "epoch": 0.02, "grad_norm": 0.7939305603233358, "importance_ratio": 1.0, "kl_div": 0.00036535965045914054, "kl_div_pos": -0.0009317900985479355, "kl_div_sft": 0.0016625093994662166, "learning_rate": 1.9084850188786494e-06, "loss": -0.1666, "ppo_loss": -0.9990686774253845, "sft_loss": 0.06919308006763458, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.623453396281409, "importance_ratio": 1.0, "kl_div": 0.0022256490774452686, "kl_div_neg": 0.0023987418971955776, "kl_div_pos": 0.0020525562576949596, "learning_rate": 1.9999999999999995e-06, "loss": -0.0104, "ppo_loss": 0.0001735091209411621, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.6390810247968097, "importance_ratio": 1.0, "kl_div": 0.0018754223128780723, "kl_div_neg": 0.0018754223128780723, "learning_rate": 2e-06, "loss": 0.0211, "ppo_loss": 1.0018773078918457, "step": 11 }, { "epoch": 0.03, "grad_norm": 0.7721549294865053, "importance_ratio": 1.0, "kl_div": -0.0011717455927282572, "kl_div_neg": -0.002727470127865672, "kl_div_pos": 0.00038397900061681867, "learning_rate": 1.99836867862969e-06, "loss": -0.027, "ppo_loss": -0.0015539228916168213, "step": 12 }, { "epoch": 0.03, "grad_norm": 0.7730312243665663, "importance_ratio": 1.0, "kl_div": 0.0003774297656491399, "kl_div_neg": -0.002771079307422042, "kl_div_pos": 0.0035259388387203217, "learning_rate": 1.99673735725938e-06, "loss": 0.0825, "ppo_loss": -0.003149688243865967, "step": 13 }, { "epoch": 0.03, "grad_norm": 0.8498004917610201, "kl_div": 0.0019221242982894182, "kl_div_sft": 0.0019221242982894182, "learning_rate": 1.99510603588907e-06, "loss": 0.2234, "sft_loss": 0.05455465987324715, "step": 14 }, { "epoch": 0.04, "grad_norm": 0.7375702032523053, "importance_ratio": 1.0, "kl_div": 0.0008038842352107167, "kl_div_neg": 0.001211852766573429, "kl_div_pos": 0.00039591570384800434, "learning_rate": 1.99347471451876e-06, "loss": -0.0567, "ppo_loss": 0.0004082918167114258, "step": 15 }, { "epoch": 0.04, "grad_norm": 0.7252607682305425, "importance_ratio": 1.0, "kl_div": 0.0012020182330161333, "kl_div_neg": 0.0013668726896867156, "kl_div_sft": 0.0010371638927608728, "learning_rate": 1.99184339314845e-06, "loss": 0.0046, "ppo_loss": 1.0013678073883057, "sft_loss": 0.12026583403348923, "step": 16 }, { "epoch": 0.04, "grad_norm": 0.7092052777631312, "importance_ratio": 0.99609375, "kl_div": 0.00025449926033616066, "kl_div_pos": -0.002506878226995468, "kl_div_sft": 0.0030158767476677895, "learning_rate": 1.9902120717781402e-06, "loss": 0.0364, "ppo_loss": -0.9974962472915649, "sft_loss": 0.05157789587974548, "step": 17 }, { "epoch": 0.04, "grad_norm": 0.668932570732893, "kl_div": 0.0012163774808868766, "kl_div_sft": 0.0012163774808868766, "learning_rate": 1.9885807504078304e-06, "loss": 0.0053, "sft_loss": 0.08282782137393951, "step": 18 }, { "epoch": 0.05, "grad_norm": 0.8751659917059365, "importance_ratio": 0.98828125, "kl_div": -0.004169079475104809, "kl_div_pos": -0.010013559833168983, "kl_div_sft": 0.0016754004172980785, "learning_rate": 1.98694942903752e-06, "loss": -0.1204, "ppo_loss": -0.9900364279747009, "sft_loss": 0.0878886952996254, "step": 19 }, { "epoch": 0.05, "grad_norm": 0.7782780385712201, "kl_div": 0.002132371999323368, "kl_div_sft": 0.002132371999323368, "learning_rate": 1.9853181076672104e-06, "loss": -0.0268, "sft_loss": 0.10623150318861008, "step": 20 }, { "epoch": 0.05, "grad_norm": 0.9007209618700114, "importance_ratio": 0.984375, "kl_div": -0.007818278856575489, "kl_div_pos": -0.015718363225460052, "kl_div_sft": 8.180640725186095e-05, "learning_rate": 1.9836867862969006e-06, "loss": -0.1371, "ppo_loss": -0.9844045042991638, "sft_loss": 0.06074066832661629, "step": 21 }, { "epoch": 0.05, "grad_norm": 0.8245334837888496, "importance_ratio": 0.98046875, "kl_div": -0.011028180830180645, "kl_div_pos": -0.01786625385284424, "kl_div_sft": -0.004190108273178339, "learning_rate": 1.9820554649265904e-06, "loss": -0.1048, "ppo_loss": -0.9822924137115479, "sft_loss": 0.05008915439248085, "step": 22 }, { "epoch": 0.06, "grad_norm": 0.8880969524854001, "importance_ratio": 1.0078125, "kl_div": -0.0005527837201952934, "kl_div_neg": 0.006105577107518911, "kl_div_sft": -0.007211144547909498, "learning_rate": 1.9804241435562806e-06, "loss": 0.1279, "ppo_loss": 1.0061242580413818, "sft_loss": 0.043001554906368256, "step": 23 }, { "epoch": 0.06, "grad_norm": 0.8045722360370814, "importance_ratio": 1.0, "kl_div": 0.0022286863531917334, "kl_div_pos": 0.0017837247578427196, "kl_div_sft": 0.0026736478321254253, "learning_rate": 1.9787928221859708e-06, "loss": -0.0913, "ppo_loss": -1.0017852783203125, "sft_loss": 0.034286826848983765, "step": 24 }, { "epoch": 0.06, "grad_norm": 0.7689353843398787, "importance_ratio": 0.99609375, "kl_div": -0.001848046900704503, "kl_div_neg": -0.0025919671170413494, "kl_div_sft": -0.0011041266843676567, "learning_rate": 1.9771615008156605e-06, "loss": 0.0209, "ppo_loss": 0.9974113702774048, "sft_loss": 0.05627015605568886, "step": 25 }, { "epoch": 0.06, "grad_norm": 0.9286371596882123, "importance_ratio": 0.9375, "kl_div": -0.030886108055710793, "kl_div_neg": -0.06649031490087509, "kl_div_sft": 0.004718099255114794, "learning_rate": 1.9755301794453507e-06, "loss": -0.0009, "ppo_loss": 0.9356719255447388, "sft_loss": 0.030333051458001137, "step": 26 }, { "epoch": 0.07, "grad_norm": 0.8046988699396788, "importance_ratio": 0.9296875, "kl_div": -0.03120781108736992, "kl_div_neg": -0.07134287804365158, "kl_div_sft": 0.008927257731556892, "learning_rate": 1.9738988580750405e-06, "loss": -0.1101, "ppo_loss": 0.9311425685882568, "sft_loss": 0.0429232157766819, "step": 27 }, { "epoch": 0.07, "grad_norm": 0.8819900327365835, "kl_div": 0.011365109123289585, "kl_div_sft": 0.011365109123289585, "learning_rate": 1.9722675367047307e-06, "loss": 0.1108, "sft_loss": 0.03654914349317551, "step": 28 }, { "epoch": 0.07, "grad_norm": 0.9544759150664117, "importance_ratio": 1.0, "kl_div": 0.005260104313492775, "kl_div_pos": 0.003754216246306896, "kl_div_sft": 0.006765992846339941, "learning_rate": 1.970636215334421e-06, "loss": 0.0282, "ppo_loss": -1.0037612915039062, "sft_loss": 0.041900016367435455, "step": 29 }, { "epoch": 0.07, "grad_norm": 1.0208257298608072, "importance_ratio": 0.9921875, "kl_div": -0.003449210897088051, "kl_div_neg": -0.006457801442593336, "kl_div_sft": -0.0004406205553095788, "learning_rate": 1.9690048939641107e-06, "loss": -0.0117, "ppo_loss": 0.9935629963874817, "sft_loss": 0.10450047254562378, "step": 30 }, { "epoch": 0.08, "grad_norm": 0.9936690195544808, "importance_ratio": 1.0078125, "kl_div": 0.0018349961610510945, "kl_div_pos": 0.005279692821204662, "kl_div_sft": -0.0016097004991024733, "learning_rate": 1.967373572593801e-06, "loss": 0.1026, "ppo_loss": -1.005293607711792, "sft_loss": 0.07605157792568207, "step": 31 }, { "epoch": 0.08, "grad_norm": 0.7723476939377518, "kl_div": -0.005411309655755758, "kl_div_sft": -0.005411309655755758, "learning_rate": 1.965742251223491e-06, "loss": -0.0887, "sft_loss": 0.118146151304245, "step": 32 }, { "epoch": 0.08, "grad_norm": 1.2419891199013167, "importance_ratio": 0.953125, "kl_div": -0.02059026248753071, "kl_div_neg": -0.04986540973186493, "kl_div_sft": 0.008684884756803513, "learning_rate": 1.9641109298531813e-06, "loss": -0.1473, "ppo_loss": 0.9513574242591858, "sft_loss": 0.026145216077566147, "step": 33 }, { "epoch": 0.08, "grad_norm": 1.1872208919784464, "importance_ratio": 0.984375, "kl_div": -0.014594344422221184, "kl_div_neg": -0.011061317287385464, "kl_div_pos": -0.01812737062573433, "learning_rate": 1.962479608482871e-06, "loss": 0.0378, "ppo_loss": 0.003481835126876831, "step": 34 }, { "epoch": 0.08, "grad_norm": 1.1060544676311477, "importance_ratio": 0.99609375, "kl_div": -0.007654288783669472, "kl_div_pos": -0.0056701661087572575, "kl_div_sft": -0.009638411924242973, "learning_rate": 1.9608482871125612e-06, "loss": -0.1361, "ppo_loss": -0.9943458437919617, "sft_loss": 0.08755666762590408, "step": 35 }, { "epoch": 0.09, "grad_norm": 1.2161479462544658, "kl_div": -0.014355514198541641, "kl_div_sft": -0.014355514198541641, "learning_rate": 1.959216965742251e-06, "loss": 0.0437, "sft_loss": 0.08722537755966187, "step": 36 }, { "epoch": 0.09, "grad_norm": 1.1803055716733668, "kl_div": 0.002251553349196911, "kl_div_sft": 0.002251553349196911, "learning_rate": 1.957585644371941e-06, "loss": 0.0125, "sft_loss": 0.09785296022891998, "step": 37 }, { "epoch": 0.09, "grad_norm": 1.189250208391352, "importance_ratio": 0.984375, "kl_div": -0.012131206691265106, "kl_div_neg": -0.012131206691265106, "learning_rate": 1.9559543230016314e-06, "loss": 0.0831, "ppo_loss": 0.9880673885345459, "step": 38 }, { "epoch": 0.09, "grad_norm": 1.158483873277291, "importance_ratio": 0.84375, "kl_div": -0.10153215378522873, "kl_div_pos": -0.16852955520153046, "kl_div_sft": -0.0345347560942173, "learning_rate": 1.954323001631321e-06, "loss": 0.1808, "ppo_loss": -0.8449063301086426, "sft_loss": 0.23241208493709564, "step": 39 }, { "epoch": 0.1, "grad_norm": 1.4674984083540699, "importance_ratio": 1.0, "kl_div": -0.016898073256015778, "kl_div_pos": 0.002774033695459366, "kl_div_sft": -0.03657018020749092, "learning_rate": 1.9526916802610114e-06, "loss": 0.0697, "ppo_loss": -1.0027778148651123, "sft_loss": 0.11947453022003174, "step": 40 }, { "epoch": 0.1, "grad_norm": 1.4236500318629106, "importance_ratio": 0.984375, "kl_div": -0.01073143258690834, "kl_div_neg": -0.015295865945518017, "kl_div_sft": -0.006167000159621239, "learning_rate": 1.951060358890701e-06, "loss": 0.1136, "ppo_loss": 0.9848204851150513, "sft_loss": 0.07722340524196625, "step": 41 }, { "epoch": 0.1, "grad_norm": 1.0650865262660743, "importance_ratio": 0.9140625, "kl_div": -0.08983860164880753, "kl_div_neg": -0.13955451548099518, "kl_div_pos": -0.040122684091329575, "learning_rate": 1.9494290375203913e-06, "loss": -0.0056, "ppo_loss": -0.045462965965270996, "step": 42 }, { "epoch": 0.1, "grad_norm": 1.490474016712344, "importance_ratio": 0.98046875, "kl_div": -0.009037816897034645, "kl_div_pos": -0.019423315301537514, "kl_div_sft": 0.001347680576145649, "learning_rate": 1.9477977161500815e-06, "loss": -0.028, "ppo_loss": -0.9807640910148621, "sft_loss": 0.0452117882668972, "step": 43 }, { "epoch": 0.11, "grad_norm": 1.0889998692984455, "importance_ratio": 0.88671875, "kl_div": -0.125685915350914, "kl_div_neg": -0.125685915350914, "learning_rate": 1.9461663947797717e-06, "loss": 0.0699, "ppo_loss": 0.8866076469421387, "step": 44 }, { "epoch": 0.11, "grad_norm": 1.3315241231448502, "importance_ratio": 0.984375, "kl_div": -0.0671415627002716, "kl_div_pos": -0.013838584534823895, "kl_div_sft": -0.12044453620910645, "learning_rate": 1.9445350734094615e-06, "loss": 0.0841, "ppo_loss": -0.9862567186355591, "sft_loss": 0.2546923756599426, "step": 45 }, { "epoch": 0.11, "grad_norm": 1.2332457193408217, "importance_ratio": 0.9375, "kl_div": -0.06261230260133743, "kl_div_neg": -0.06261230260133743, "learning_rate": 1.9429037520391517e-06, "loss": -0.1038, "ppo_loss": 0.9403876066207886, "step": 46 }, { "epoch": 0.11, "grad_norm": 1.3700016378128397, "kl_div": -0.005976326763629913, "kl_div_sft": -0.005976326763629913, "learning_rate": 1.941272430668842e-06, "loss": -0.0048, "sft_loss": 0.05619629845023155, "step": 47 }, { "epoch": 0.12, "grad_norm": 1.1374039829517788, "importance_ratio": 0.91796875, "kl_div": -0.03426515311002731, "kl_div_neg": -0.08605366200208664, "kl_div_sft": 0.01752335950732231, "learning_rate": 1.9396411092985316e-06, "loss": 0.0432, "ppo_loss": 0.9175450205802917, "sft_loss": 0.044655341655015945, "step": 48 }, { "epoch": 0.12, "grad_norm": 1.1122507780573518, "kl_div": 0.007389682345092297, "kl_div_sft": 0.007389682345092297, "learning_rate": 1.938009787928222e-06, "loss": 0.01, "sft_loss": 0.03936357796192169, "step": 49 }, { "epoch": 0.12, "grad_norm": 1.7067137538603148, "importance_ratio": 0.9453125, "kl_div": -0.0594576857984066, "kl_div_pos": -0.0594576857984066, "learning_rate": 1.936378466557912e-06, "loss": -0.103, "ppo_loss": -0.9448889493942261, "step": 50 }, { "epoch": 0.12, "grad_norm": 1.1785624615216657, "importance_ratio": 0.90234375, "kl_div": -0.10802098363637924, "kl_div_neg": -0.1956670731306076, "kl_div_pos": -0.02037489227950573, "learning_rate": 1.934747145187602e-06, "loss": -0.0801, "ppo_loss": -0.0787726640701294, "step": 51 }, { "epoch": 0.13, "grad_norm": 1.1189453593763627, "importance_ratio": 0.9375, "kl_div": -0.06822185963392258, "kl_div_neg": -0.02376522310078144, "kl_div_pos": -0.11267849802970886, "learning_rate": 1.933115823817292e-06, "loss": 0.0258, "ppo_loss": 0.041538506746292114, "step": 52 }, { "epoch": 0.13, "grad_norm": 1.2671464323075408, "importance_ratio": 0.921875, "kl_div": -0.0395657904446125, "kl_div_pos": -0.08192670345306396, "kl_div_sft": 0.0027951220981776714, "learning_rate": 1.9314845024469818e-06, "loss": -0.0388, "ppo_loss": -0.9213394522666931, "sft_loss": 0.06512558460235596, "step": 53 }, { "epoch": 0.13, "grad_norm": 1.3067314447107663, "importance_ratio": 0.9453125, "kl_div": -0.05756595730781555, "kl_div_neg": -0.05756595730781555, "learning_rate": 1.929853181076672e-06, "loss": 0.0434, "ppo_loss": 0.9440711736679077, "step": 54 }, { "epoch": 0.13, "grad_norm": 1.4856568272976414, "importance_ratio": 0.8046875, "kl_div": -0.1118774339556694, "kl_div_neg": -0.21787752211093903, "kl_div_sft": -0.005877349525690079, "learning_rate": 1.928221859706362e-06, "loss": 0.1075, "ppo_loss": 0.8042239546775818, "sft_loss": 0.07803851366043091, "step": 55 }, { "epoch": 0.14, "grad_norm": 1.2536513405163698, "importance_ratio": 0.94140625, "kl_div": -0.060897182673215866, "kl_div_neg": -0.060897182673215866, "learning_rate": 1.926590538336052e-06, "loss": 0.2201, "ppo_loss": 0.9409350156784058, "step": 56 }, { "epoch": 0.14, "grad_norm": 1.8340930520582641, "kl_div": 0.006499993149191141, "kl_div_sft": 0.006499993149191141, "learning_rate": 1.924959216965742e-06, "loss": -0.0637, "sft_loss": 0.08112891763448715, "step": 57 }, { "epoch": 0.14, "grad_norm": 2.7852510112462383, "importance_ratio": 0.9609375, "kl_div": -0.034732721745967865, "kl_div_pos": -0.0408448651432991, "kl_div_sft": -0.028620580211281776, "learning_rate": 1.9233278955954323e-06, "loss": 0.1799, "ppo_loss": -0.9599780440330505, "sft_loss": 0.10139136761426926, "step": 58 }, { "epoch": 0.14, "grad_norm": 1.448854302904934, "importance_ratio": 0.9609375, "kl_div": -0.021149268373847008, "kl_div_neg": -0.04163122922182083, "kl_div_sft": -0.0006673082825727761, "learning_rate": 1.9216965742251225e-06, "loss": -0.0926, "ppo_loss": 0.9592234492301941, "sft_loss": 0.03335588797926903, "step": 59 }, { "epoch": 0.15, "grad_norm": 1.4108705692743857, "importance_ratio": 0.8984375, "kl_div": -0.11035354435443878, "kl_div_neg": -0.16702036559581757, "kl_div_pos": -0.0536867156624794, "learning_rate": 1.9200652528548123e-06, "loss": 0.0644, "ppo_loss": -0.0507732629776001, "step": 60 }, { "epoch": 0.15, "grad_norm": 1.3722931053131302, "kl_div": -0.012220574542880058, "kl_div_sft": -0.012220574542880058, "learning_rate": 1.9184339314845025e-06, "loss": 0.0142, "sft_loss": 0.12431100010871887, "step": 61 }, { "epoch": 0.15, "grad_norm": 1.401147358629085, "importance_ratio": 0.921875, "kl_div": -0.07922801375389099, "kl_div_neg": -0.055370330810546875, "kl_div_pos": -0.10308569669723511, "learning_rate": 1.9168026101141923e-06, "loss": -0.1723, "ppo_loss": 0.022042512893676758, "step": 62 }, { "epoch": 0.15, "grad_norm": 2.6114266376273556, "importance_ratio": 0.94140625, "kl_div": -0.02723909728229046, "kl_div_pos": -0.06206502392888069, "kl_div_sft": 0.007586829364299774, "learning_rate": 1.9151712887438825e-06, "loss": -0.0016, "ppo_loss": -0.9398217797279358, "sft_loss": 0.08087282627820969, "step": 63 }, { "epoch": 0.16, "grad_norm": 1.5447770351026526, "kl_div": -0.002401808276772499, "kl_div_sft": -0.002401808276772499, "learning_rate": 1.9135399673735727e-06, "loss": -0.0636, "sft_loss": 0.08615696430206299, "step": 64 }, { "epoch": 0.16, "grad_norm": 1.4003828785361496, "kl_div": 0.0021121015306562185, "kl_div_sft": 0.0021121015306562185, "learning_rate": 1.9119086460032624e-06, "loss": 0.0677, "sft_loss": 0.06753754615783691, "step": 65 }, { "epoch": 0.16, "grad_norm": 1.3365684045710704, "importance_ratio": 0.984375, "kl_div": -0.017900364473462105, "kl_div_pos": -0.017900364473462105, "learning_rate": 1.9102773246329526e-06, "loss": -0.0189, "ppo_loss": -0.9823896884918213, "step": 66 }, { "epoch": 0.16, "grad_norm": 1.4343510843913625, "importance_ratio": 0.79296875, "kl_div": -0.1421518474817276, "kl_div_neg": -0.2302280217409134, "kl_div_sft": -0.054075662046670914, "learning_rate": 1.9086460032626424e-06, "loss": -0.0862, "ppo_loss": 0.800000011920929, "sft_loss": 0.1580251157283783, "step": 67 }, { "epoch": 0.16, "grad_norm": 1.8832227331223539, "importance_ratio": 0.8125, "kl_div": -0.20741838216781616, "kl_div_neg": -0.2551022469997406, "kl_div_pos": -0.15973453223705292, "learning_rate": 1.9070146818923328e-06, "loss": 0.0184, "ppo_loss": -0.02618500590324402, "step": 68 }, { "epoch": 0.17, "grad_norm": 1.6859843193673505, "importance_ratio": 0.96875, "kl_div": -0.00505702942609787, "kl_div_pos": -0.029736729338765144, "kl_div_sft": 0.019622670486569405, "learning_rate": 1.9053833605220226e-06, "loss": 0.002, "ppo_loss": -0.9707010388374329, "sft_loss": 0.1003195270895958, "step": 69 }, { "epoch": 0.17, "grad_norm": 1.2920918328920994, "importance_ratio": 0.94140625, "kl_div": -0.026884760707616806, "kl_div_pos": -0.06142554432153702, "kl_div_sft": 0.007656024768948555, "learning_rate": 1.9037520391517128e-06, "loss": -0.0516, "ppo_loss": -0.9404229521751404, "sft_loss": 0.05460807681083679, "step": 70 }, { "epoch": 0.17, "grad_norm": 1.6652357952873371, "kl_div": -0.01144311111420393, "kl_div_sft": -0.01144311111420393, "learning_rate": 1.902120717781403e-06, "loss": 0.1146, "sft_loss": 0.05612169951200485, "step": 71 }, { "epoch": 0.17, "grad_norm": 1.4722564721272102, "importance_ratio": 0.93359375, "kl_div": -0.03432208672165871, "kl_div_pos": -0.06942012161016464, "kl_div_sft": 0.0007759497966617346, "learning_rate": 1.9004893964110927e-06, "loss": 0.0532, "ppo_loss": -0.9329346418380737, "sft_loss": 0.04070064797997475, "step": 72 }, { "epoch": 0.18, "grad_norm": 1.096017503431569, "importance_ratio": 0.98046875, "kl_div": -0.0019319439306855202, "kl_div_pos": -0.019157661125063896, "kl_div_sft": 0.015293773263692856, "learning_rate": 1.898858075040783e-06, "loss": -0.0507, "ppo_loss": -0.9810246825218201, "sft_loss": 0.07127617299556732, "step": 73 }, { "epoch": 0.18, "grad_norm": 2.8056491064037496, "importance_ratio": 0.96484375, "kl_div": -0.03702807426452637, "kl_div_pos": -0.03702807426452637, "learning_rate": 1.897226753670473e-06, "loss": -0.0432, "ppo_loss": -0.9645748138427734, "step": 74 }, { "epoch": 0.18, "grad_norm": 1.6776358205783002, "kl_div": 0.01326986588537693, "kl_div_sft": 0.01326986588537693, "learning_rate": 1.8955954323001631e-06, "loss": -0.0721, "sft_loss": 0.03585825115442276, "step": 75 }, { "epoch": 0.18, "grad_norm": 1.3208584531366925, "importance_ratio": 0.890625, "kl_div": -0.10156667232513428, "kl_div_neg": -0.11789321154356003, "kl_div_sft": -0.08524013310670853, "learning_rate": 1.8939641109298531e-06, "loss": 0.0193, "ppo_loss": 0.8887909650802612, "sft_loss": 0.1250036656856537, "step": 76 }, { "epoch": 0.19, "grad_norm": 1.234519805140591, "importance_ratio": 0.9765625, "kl_div": -0.03858345001935959, "kl_div_neg": -0.02181975729763508, "kl_div_sft": -0.05534714460372925, "learning_rate": 1.892332789559543e-06, "loss": 0.0532, "ppo_loss": 0.9784166216850281, "sft_loss": 0.10421111434698105, "step": 77 }, { "epoch": 0.19, "grad_norm": 1.4403118331023692, "importance_ratio": 0.95703125, "kl_div": -0.01925833150744438, "kl_div_pos": -0.04326550289988518, "kl_div_sft": 0.0047488403506577015, "learning_rate": 1.8907014681892333e-06, "loss": -0.0328, "ppo_loss": -0.9576570987701416, "sft_loss": 0.025720076635479927, "step": 78 }, { "epoch": 0.19, "grad_norm": 3.560108787717491, "importance_ratio": 0.98046875, "kl_div": -0.019890496507287025, "kl_div_neg": -0.036975789815187454, "kl_div_pos": -0.0028052027337253094, "learning_rate": 1.8890701468189233e-06, "loss": 0.0458, "ppo_loss": -0.016749650239944458, "step": 79 }, { "epoch": 0.19, "grad_norm": 1.9435713609254912, "kl_div": -0.005440596025437117, "kl_div_sft": -0.005440596025437117, "learning_rate": 1.8874388254486133e-06, "loss": -0.1137, "sft_loss": 0.07730162888765335, "step": 80 }, { "epoch": 0.2, "grad_norm": 4.148207835936149, "kl_div": -0.011881545186042786, "kl_div_sft": -0.011881545186042786, "learning_rate": 1.8858075040783032e-06, "loss": 0.0925, "sft_loss": 0.06615802645683289, "step": 81 }, { "epoch": 0.2, "grad_norm": 16.686075328347325, "importance_ratio": 0.8828125, "kl_div": -0.12083092331886292, "kl_div_neg": -0.10571560263633728, "kl_div_pos": -0.13594624400138855, "learning_rate": 1.8841761827079934e-06, "loss": -0.0205, "ppo_loss": 0.013395458459854126, "step": 82 }, { "epoch": 0.2, "grad_norm": 1.3354719365783643, "kl_div": -0.0251028873026371, "kl_div_sft": -0.0251028873026371, "learning_rate": 1.8825448613376836e-06, "loss": 0.009, "sft_loss": 0.09680324792861938, "step": 83 }, { "epoch": 0.2, "grad_norm": 3.219100044253, "importance_ratio": 0.953125, "kl_div": -0.02121526561677456, "kl_div_pos": -0.04778638482093811, "kl_div_sft": 0.00535585219040513, "learning_rate": 1.8809135399673734e-06, "loss": 0.0199, "ppo_loss": -0.9533373713493347, "sft_loss": 0.05247313529253006, "step": 84 }, { "epoch": 0.21, "grad_norm": 9.536574780544736, "importance_ratio": 0.94921875, "kl_div": -0.05309019237756729, "kl_div_pos": -0.05309019237756729, "learning_rate": 1.8792822185970636e-06, "loss": -0.1631, "ppo_loss": -0.9483247399330139, "step": 85 }, { "epoch": 0.21, "grad_norm": 1.109040115745377, "importance_ratio": 0.875, "kl_div": -0.14318667352199554, "kl_div_pos": -0.14318667352199554, "learning_rate": 1.8776508972267536e-06, "loss": -0.0027, "ppo_loss": -0.8744655251502991, "step": 86 }, { "epoch": 0.21, "grad_norm": 3.7104905672476503, "importance_ratio": 0.8515625, "kl_div": -0.17779727280139923, "kl_div_neg": -0.352699339389801, "kl_div_pos": -0.002895209938287735, "learning_rate": 1.8760195758564436e-06, "loss": 0.0602, "ppo_loss": -0.09855446219444275, "step": 87 }, { "epoch": 0.21, "grad_norm": 14.546277092304194, "kl_div": 0.004458991345018148, "kl_div_sft": 0.004458991345018148, "learning_rate": 1.8743882544861336e-06, "loss": 0.0783, "sft_loss": 0.034892488270998, "step": 88 }, { "epoch": 0.22, "grad_norm": 10.241466784925795, "importance_ratio": 0.94140625, "kl_div": -0.059860531240701675, "kl_div_pos": -0.059860531240701675, "learning_rate": 1.8727569331158237e-06, "loss": 0.0203, "ppo_loss": -0.9426825046539307, "step": 89 }, { "epoch": 0.22, "grad_norm": 1.1149219937257968, "importance_ratio": 0.96875, "kl_div": -0.032899901270866394, "kl_div_neg": -0.01882491260766983, "kl_div_pos": -0.04697488993406296, "learning_rate": 1.871125611745514e-06, "loss": 0.0003, "ppo_loss": 0.013619929552078247, "step": 90 }, { "epoch": 0.22, "grad_norm": 2.920325536479396, "importance_ratio": 0.9765625, "kl_div": -0.016603615134954453, "kl_div_pos": -0.022303353995084763, "kl_div_sft": -0.010903875343501568, "learning_rate": 1.8694942903752037e-06, "loss": -0.0031, "ppo_loss": -0.9779435396194458, "sft_loss": 0.06654062867164612, "step": 91 }, { "epoch": 0.22, "grad_norm": 1.1270961307121106, "importance_ratio": 0.84375, "kl_div": -0.08473112434148788, "kl_div_neg": -0.1700011044740677, "kl_div_sft": 0.0005388528225012124, "learning_rate": 1.867862969004894e-06, "loss": -0.0543, "ppo_loss": 0.8436638712882996, "sft_loss": 0.05119462311267853, "step": 92 }, { "epoch": 0.23, "grad_norm": 5.17656096721227, "importance_ratio": 0.8203125, "kl_div": -0.1019870936870575, "kl_div_neg": -0.19734208285808563, "kl_div_sft": -0.006632108241319656, "learning_rate": 1.866231647634584e-06, "loss": -0.086, "ppo_loss": 0.8209097981452942, "sft_loss": 0.06681513786315918, "step": 93 }, { "epoch": 0.23, "grad_norm": 4.32968981717673, "importance_ratio": 0.953125, "kl_div": -0.05050528049468994, "kl_div_neg": -0.04812584072351456, "kl_div_pos": -0.052884723991155624, "learning_rate": 1.864600326264274e-06, "loss": 0.0194, "ppo_loss": 0.0022622644901275635, "step": 94 }, { "epoch": 0.23, "grad_norm": 1.1813542799359134, "kl_div": 0.00850912369787693, "kl_div_sft": 0.00850912369787693, "learning_rate": 1.862969004893964e-06, "loss": -0.1158, "sft_loss": 0.05716530978679657, "step": 95 }, { "epoch": 0.23, "grad_norm": 1.0067802880019683, "kl_div": -0.0733400210738182, "kl_div_sft": -0.0733400210738182, "learning_rate": 1.861337683523654e-06, "loss": -0.1477, "sft_loss": 0.12137281149625778, "step": 96 }, { "epoch": 0.24, "grad_norm": 2.1790317656541816, "importance_ratio": 0.8125, "kl_div": -0.20977582037448883, "kl_div_neg": -0.20977582037448883, "learning_rate": 1.8597063621533443e-06, "loss": 0.0704, "ppo_loss": 0.8167802095413208, "step": 97 }, { "epoch": 0.24, "grad_norm": 1.5017800894920748, "kl_div": 0.0028979559428989887, "kl_div_sft": 0.0028979559428989887, "learning_rate": 1.858075040783034e-06, "loss": 0.0027, "sft_loss": 0.04274223372340202, "step": 98 }, { "epoch": 0.24, "grad_norm": 3.253236186447958, "kl_div": -0.0038328543305397034, "kl_div_sft": -0.0038328543305397034, "learning_rate": 1.8564437194127242e-06, "loss": -0.0125, "sft_loss": 0.06729351729154587, "step": 99 }, { "epoch": 0.24, "grad_norm": 1.1924399472356528, "importance_ratio": 0.9921875, "kl_div": -0.017330992966890335, "kl_div_pos": -0.006222095340490341, "kl_div_sft": -0.028439892455935478, "learning_rate": 1.8548123980424142e-06, "loss": -0.111, "ppo_loss": -0.993797242641449, "sft_loss": 0.14494113624095917, "step": 100 }, { "epoch": 0.24, "grad_norm": 4.0589767436892945, "importance_ratio": 1.0, "kl_div": 0.003675918560475111, "kl_div_pos": 0.002127651358023286, "kl_div_sft": 0.0052241855300962925, "learning_rate": 1.8531810766721044e-06, "loss": -0.0991, "ppo_loss": -1.0021299123764038, "sft_loss": 0.04997418820858002, "step": 101 }, { "epoch": 0.25, "grad_norm": 3.0079827594871733, "importance_ratio": 0.8515625, "kl_div": -0.15740655362606049, "kl_div_neg": -0.1272001564502716, "kl_div_pos": -0.18761295080184937, "learning_rate": 1.8515497553017944e-06, "loss": -0.106, "ppo_loss": 0.025810927152633667, "step": 102 }, { "epoch": 0.25, "grad_norm": 6.991458586308189, "kl_div": -0.018888656049966812, "kl_div_sft": -0.018888656049966812, "learning_rate": 1.8499184339314844e-06, "loss": -0.0726, "sft_loss": 0.12873780727386475, "step": 103 }, { "epoch": 0.25, "grad_norm": 2.0316232191681327, "importance_ratio": 0.984375, "kl_div": -0.018464138731360435, "kl_div_neg": -0.03176502138376236, "kl_div_pos": -0.005163257010281086, "learning_rate": 1.8482871125611746e-06, "loss": 0.0248, "ppo_loss": -0.013057917356491089, "step": 104 }, { "epoch": 0.25, "grad_norm": 19.860864758982913, "importance_ratio": 1.0, "kl_div": -0.0046781618148088455, "kl_div_neg": 0.002820936730131507, "kl_div_sft": -0.012177260592579842, "learning_rate": 1.8466557911908646e-06, "loss": -0.0997, "ppo_loss": 1.0028249025344849, "sft_loss": 0.1462474763393402, "step": 105 }, { "epoch": 0.26, "grad_norm": 1.94309540302439, "importance_ratio": 0.984375, "kl_div": -0.02095239982008934, "kl_div_pos": -0.016821056604385376, "kl_div_sft": -0.025083741173148155, "learning_rate": 1.8450244698205545e-06, "loss": 0.0472, "ppo_loss": -0.9833196401596069, "sft_loss": 0.09222033619880676, "step": 106 }, { "epoch": 0.26, "grad_norm": 8.829977681703108, "kl_div": 0.006691737566143274, "kl_div_sft": 0.006691737566143274, "learning_rate": 1.8433931484502445e-06, "loss": -0.0236, "sft_loss": 0.03618942201137543, "step": 107 }, { "epoch": 0.26, "grad_norm": 7.877428921518379, "importance_ratio": 0.859375, "kl_div": -0.1552104353904724, "kl_div_neg": -0.21654579043388367, "kl_div_pos": -0.09387508779764175, "learning_rate": 1.8417618270799347e-06, "loss": 0.0563, "ppo_loss": -0.05255037546157837, "step": 108 }, { "epoch": 0.26, "grad_norm": 1.4240924186987443, "importance_ratio": 0.984375, "kl_div": -0.012207446619868279, "kl_div_pos": -0.01420664507895708, "kl_div_sft": -0.01020824909210205, "learning_rate": 1.8401305057096247e-06, "loss": 0.0522, "ppo_loss": -0.9858937859535217, "sft_loss": 0.0927015170454979, "step": 109 }, { "epoch": 0.27, "grad_norm": 1.156184478140483, "importance_ratio": 0.8984375, "kl_div": -0.11182701587677002, "kl_div_neg": -0.11182701587677002, "learning_rate": 1.8384991843393147e-06, "loss": -0.1422, "ppo_loss": 0.8957895636558533, "step": 110 }, { "epoch": 0.27, "grad_norm": 6.277832012261074, "importance_ratio": 0.9140625, "kl_div": -0.09246876835823059, "kl_div_neg": -0.16997133195400238, "kl_div_pos": -0.014966201968491077, "learning_rate": 1.8368678629690049e-06, "loss": 0.0615, "ppo_loss": -0.07072815299034119, "step": 111 }, { "epoch": 0.27, "grad_norm": 5.03266081369713, "importance_ratio": 0.984375, "kl_div": -0.015262942761182785, "kl_div_pos": -0.015262942761182785, "learning_rate": 1.8352365415986949e-06, "loss": -0.0423, "ppo_loss": -0.9848529696464539, "step": 112 }, { "epoch": 0.27, "grad_norm": 1.023171193659411, "importance_ratio": 0.78515625, "kl_div": -0.12196735292673111, "kl_div_pos": -0.2421884834766388, "kl_div_sft": -0.0017462180694565177, "learning_rate": 1.8336052202283848e-06, "loss": -0.0038, "ppo_loss": -0.7849081754684448, "sft_loss": 0.05288619548082352, "step": 113 }, { "epoch": 0.28, "grad_norm": 2.7586984913230066, "importance_ratio": 0.98828125, "kl_div": 0.003044874407351017, "kl_div_pos": -0.013100683689117432, "kl_div_sft": 0.019190432503819466, "learning_rate": 1.8319738988580748e-06, "loss": 0.0134, "ppo_loss": -0.9869847893714905, "sft_loss": 0.05788184702396393, "step": 114 }, { "epoch": 0.28, "grad_norm": 1.3708875886950078, "importance_ratio": 0.98828125, "kl_div": -0.012799538671970367, "kl_div_neg": -0.012748450972139835, "kl_div_sft": -0.012850625440478325, "learning_rate": 1.830342577487765e-06, "loss": 0.0322, "ppo_loss": 0.9873324632644653, "sft_loss": 0.08000502735376358, "step": 115 }, { "epoch": 0.28, "grad_norm": 1.1851864915885835, "importance_ratio": 1.0, "kl_div": -0.0011242360342293978, "kl_div_neg": -0.00048666924703866243, "kl_div_pos": -0.0017618027050048113, "learning_rate": 1.8287112561174552e-06, "loss": -0.0125, "ppo_loss": 0.0006368458271026611, "step": 116 }, { "epoch": 0.28, "grad_norm": 4.152021049240353, "importance_ratio": 0.984375, "kl_div": -0.009348939172923565, "kl_div_pos": -0.015161776915192604, "kl_div_sft": -0.003536101896315813, "learning_rate": 1.827079934747145e-06, "loss": 0.1734, "ppo_loss": -0.9849525690078735, "sft_loss": 0.11104224622249603, "step": 117 }, { "epoch": 0.29, "grad_norm": 17.122627038204538, "kl_div": -0.0030991281382739544, "kl_div_sft": -0.0030991281382739544, "learning_rate": 1.8254486133768352e-06, "loss": 0.0684, "sft_loss": 0.10727023333311081, "step": 118 }, { "epoch": 0.29, "grad_norm": 4.130081745659531, "importance_ratio": 0.75390625, "kl_div": -0.14108410477638245, "kl_div_neg": -0.28084099292755127, "kl_div_sft": -0.0013272189535200596, "learning_rate": 1.8238172920065252e-06, "loss": -0.0169, "ppo_loss": 0.800000011920929, "sft_loss": 0.059665389358997345, "step": 119 }, { "epoch": 0.29, "grad_norm": 2.738881784016763, "importance_ratio": 0.890625, "kl_div": -0.12150909006595612, "kl_div_neg": -0.06636663526296616, "kl_div_pos": -0.17665155231952667, "learning_rate": 1.8221859706362152e-06, "loss": -0.0808, "ppo_loss": 0.04885795712471008, "step": 120 }, { "epoch": 0.29, "grad_norm": 1.8803545471430183, "importance_ratio": 0.77734375, "kl_div": -0.14616063237190247, "kl_div_pos": -0.2531307339668274, "kl_div_sft": -0.039190515875816345, "learning_rate": 1.8205546492659054e-06, "loss": 0.024, "ppo_loss": -0.7763664722442627, "sft_loss": 0.10987972468137741, "step": 121 }, { "epoch": 0.3, "grad_norm": 1.1585058424583765, "importance_ratio": 0.98046875, "kl_div": -0.01550198346376419, "kl_div_pos": -0.0198336411267519, "kl_div_sft": -0.011170326732099056, "learning_rate": 1.8189233278955953e-06, "loss": 0.0286, "ppo_loss": -0.9803617596626282, "sft_loss": 0.06033976003527641, "step": 122 }, { "epoch": 0.3, "grad_norm": 16.79258302723445, "importance_ratio": 0.8828125, "kl_div": -0.12821130454540253, "kl_div_neg": -0.12821130454540253, "learning_rate": 1.8172920065252855e-06, "loss": -0.0829, "ppo_loss": 0.8835910558700562, "step": 123 }, { "epoch": 0.3, "grad_norm": 2.7903950390360417, "importance_ratio": 0.70703125, "kl_div": -0.17138634622097015, "kl_div_neg": -0.3454788625240326, "kl_div_sft": 0.002706160070374608, "learning_rate": 1.8156606851549753e-06, "loss": 0.0432, "ppo_loss": 0.800000011920929, "sft_loss": 0.029093803837895393, "step": 124 }, { "epoch": 0.3, "grad_norm": 11.587931280741508, "importance_ratio": 1.0078125, "kl_div": 0.007452365942299366, "kl_div_pos": 0.005459305830299854, "kl_div_sft": 0.009445426054298878, "learning_rate": 1.8140293637846655e-06, "loss": -0.0357, "ppo_loss": -1.0054742097854614, "sft_loss": 0.08399739861488342, "step": 125 }, { "epoch": 0.31, "grad_norm": 4.793488043002528, "importance_ratio": 0.92578125, "kl_div": -0.07985176146030426, "kl_div_neg": -0.07985176146030426, "learning_rate": 1.8123980424143555e-06, "loss": 0.112, "ppo_loss": 0.9262062907218933, "step": 126 }, { "epoch": 0.31, "grad_norm": 4.294557036360072, "kl_div": 0.005065533332526684, "kl_div_sft": 0.005065533332526684, "learning_rate": 1.8107667210440457e-06, "loss": 0.1447, "sft_loss": 0.04570557177066803, "step": 127 }, { "epoch": 0.31, "grad_norm": 7.758244221056378, "kl_div": -0.09475219994783401, "kl_div_sft": -0.09475219994783401, "learning_rate": 1.8091353996737357e-06, "loss": -0.1566, "sft_loss": 0.14630523324012756, "step": 128 }, { "epoch": 0.31, "grad_norm": 1.2952268767730881, "kl_div": -0.024958381429314613, "kl_div_sft": -0.024958381429314613, "learning_rate": 1.8075040783034257e-06, "loss": 0.2292, "sft_loss": 0.13158275187015533, "step": 129 }, { "epoch": 0.32, "grad_norm": 2.1450926110253348, "importance_ratio": 1.0, "kl_div": 0.0020835120230913162, "kl_div_pos": 0.0011660216841846704, "kl_div_sft": 0.0030010021291673183, "learning_rate": 1.8058727569331158e-06, "loss": -0.0424, "ppo_loss": -1.0011667013168335, "sft_loss": 0.06473907083272934, "step": 130 }, { "epoch": 0.32, "grad_norm": 10.508363208733325, "importance_ratio": 0.890625, "kl_div": -0.12170784175395966, "kl_div_pos": -0.12170784175395966, "learning_rate": 1.8042414355628056e-06, "loss": -0.0534, "ppo_loss": -0.8911383152008057, "step": 131 }, { "epoch": 0.32, "grad_norm": 1.205441115749259, "importance_ratio": 0.98046875, "kl_div": -0.008304815739393234, "kl_div_pos": -0.018010282889008522, "kl_div_sft": 0.0014006514102220535, "learning_rate": 1.8026101141924958e-06, "loss": -0.1574, "ppo_loss": -0.9821509718894958, "sft_loss": 0.036297757178545, "step": 132 }, { "epoch": 0.32, "grad_norm": 1.275700716932992, "importance_ratio": 0.86328125, "kl_div": -0.1585293859243393, "kl_div_neg": -0.3000960350036621, "kl_div_pos": -0.016962744295597076, "learning_rate": 1.8009787928221858e-06, "loss": -0.1364, "ppo_loss": -0.09159013628959656, "step": 133 }, { "epoch": 0.32, "grad_norm": 0.9975014706172953, "importance_ratio": 0.89453125, "kl_div": -0.11716045439243317, "kl_div_neg": -0.2166241854429245, "kl_div_pos": -0.01769673079252243, "learning_rate": 1.799347471451876e-06, "loss": 0.0028, "ppo_loss": -0.08861318230628967, "step": 134 }, { "epoch": 0.33, "grad_norm": 1.1355681478727442, "importance_ratio": 0.72265625, "kl_div": -0.15676546096801758, "kl_div_pos": -0.32453951239585876, "kl_div_sft": 0.011008601635694504, "learning_rate": 1.797716150081566e-06, "loss": -0.1041, "ppo_loss": -0.7228600978851318, "sft_loss": 0.046805497258901596, "step": 135 }, { "epoch": 0.33, "grad_norm": 3.1212162762440574, "importance_ratio": 0.9375, "kl_div": -0.06198743358254433, "kl_div_pos": -0.06198743358254433, "learning_rate": 1.796084828711256e-06, "loss": 0.0549, "ppo_loss": -0.9419088959693909, "step": 136 }, { "epoch": 0.33, "grad_norm": 1.5661286752630128, "importance_ratio": 0.84375, "kl_div": -0.17229585349559784, "kl_div_neg": -0.17083480954170227, "kl_div_pos": -0.1737568974494934, "learning_rate": 1.7944535073409462e-06, "loss": -0.0202, "ppo_loss": 0.0012297630310058594, "step": 137 }, { "epoch": 0.33, "grad_norm": 1.9203922165169252, "importance_ratio": 0.80078125, "kl_div": -0.19221439957618713, "kl_div_neg": -0.2237863838672638, "kl_div_sft": -0.16064240038394928, "learning_rate": 1.7928221859706361e-06, "loss": -0.0806, "ppo_loss": 0.800000011920929, "sft_loss": 0.19409435987472534, "step": 138 }, { "epoch": 0.34, "grad_norm": 1.159094739081435, "kl_div": -0.06490548700094223, "kl_div_sft": -0.06490548700094223, "learning_rate": 1.7911908646003261e-06, "loss": 0.0725, "sft_loss": 0.1461164951324463, "step": 139 }, { "epoch": 0.34, "grad_norm": 0.89539811555495, "importance_ratio": 0.6953125, "kl_div": -0.1836492419242859, "kl_div_neg": -0.3647206723690033, "kl_div_sft": -0.002577810548245907, "learning_rate": 1.7895595432300161e-06, "loss": -0.005, "ppo_loss": 0.800000011920929, "sft_loss": 0.040640685707330704, "step": 140 }, { "epoch": 0.34, "grad_norm": 0.9498610658443112, "importance_ratio": 1.015625, "kl_div": 0.008194430731236935, "kl_div_pos": 0.01923866756260395, "kl_div_sft": -0.0028498058672994375, "learning_rate": 1.7879282218597063e-06, "loss": -0.0633, "ppo_loss": -1.0194249153137207, "sft_loss": 0.14637592434883118, "step": 141 }, { "epoch": 0.34, "grad_norm": 0.9800355721849093, "kl_div": -0.014961409382522106, "kl_div_sft": -0.014961409382522106, "learning_rate": 1.7862969004893965e-06, "loss": -0.0662, "sft_loss": 0.08139221370220184, "step": 142 }, { "epoch": 0.35, "grad_norm": 2.483579496262967, "importance_ratio": 0.87109375, "kl_div": -0.0703229084610939, "kl_div_neg": -0.14011670649051666, "kl_div_sft": -0.0005291154375299811, "learning_rate": 1.7846655791190863e-06, "loss": -0.1198, "ppo_loss": 0.8692567944526672, "sft_loss": 0.04155682399868965, "step": 143 }, { "epoch": 0.35, "grad_norm": 1.1562214409677711, "kl_div": 0.002795197768136859, "kl_div_sft": 0.002795197768136859, "learning_rate": 1.7830342577487765e-06, "loss": 0.1252, "sft_loss": 0.0712403953075409, "step": 144 }, { "epoch": 0.35, "grad_norm": 5.743535263812712, "importance_ratio": 0.84765625, "kl_div": -0.1651565581560135, "kl_div_neg": -0.1651565581560135, "learning_rate": 1.7814029363784665e-06, "loss": 0.0082, "ppo_loss": 0.8477612733840942, "step": 145 }, { "epoch": 0.35, "grad_norm": 3.1622814298948647, "kl_div": 0.0015609723050147295, "kl_div_sft": 0.0015609723050147295, "learning_rate": 1.7797716150081564e-06, "loss": 0.0026, "sft_loss": 0.046822816133499146, "step": 146 }, { "epoch": 0.36, "grad_norm": 1.2081844906829902, "kl_div": -0.019221793860197067, "kl_div_sft": -0.019221793860197067, "learning_rate": 1.7781402936378466e-06, "loss": -0.0855, "sft_loss": 0.09771668165922165, "step": 147 }, { "epoch": 0.36, "grad_norm": 1.20771538071317, "importance_ratio": 0.890625, "kl_div": -0.061765387654304504, "kl_div_neg": -0.11600951105356216, "kl_div_sft": -0.007521265652030706, "learning_rate": 1.7765089722675366e-06, "loss": 0.0313, "ppo_loss": 0.8904667496681213, "sft_loss": 0.05933324247598648, "step": 148 }, { "epoch": 0.36, "grad_norm": 1.6411811249365569, "kl_div": 0.013755328953266144, "kl_div_sft": 0.013755328953266144, "learning_rate": 1.7748776508972268e-06, "loss": -0.1119, "sft_loss": 0.07278871536254883, "step": 149 }, { "epoch": 0.36, "grad_norm": 3.13531211771659, "importance_ratio": 0.93359375, "kl_div": -0.03333902359008789, "kl_div_neg": -0.0697193294763565, "kl_div_sft": 0.003041281597688794, "learning_rate": 1.7732463295269166e-06, "loss": 0.1347, "ppo_loss": 0.9326555132865906, "sft_loss": 0.05647405609488487, "step": 150 }, { "epoch": 0.37, "grad_norm": 0.9084137703047128, "kl_div": 0.006213179789483547, "kl_div_sft": 0.006213179789483547, "learning_rate": 1.7716150081566068e-06, "loss": 0.0022, "sft_loss": 0.073430135846138, "step": 151 }, { "epoch": 0.37, "grad_norm": 29.33578873963614, "importance_ratio": 0.8671875, "kl_div": -0.06711047887802124, "kl_div_pos": -0.1426703929901123, "kl_div_sft": 0.008449428714811802, "learning_rate": 1.7699836867862968e-06, "loss": 0.0391, "ppo_loss": -0.8670398592948914, "sft_loss": 0.025338491424918175, "step": 152 }, { "epoch": 0.37, "grad_norm": 3.212689369646534, "importance_ratio": 0.84375, "kl_div": -0.17463237047195435, "kl_div_neg": -0.29152414202690125, "kl_div_pos": -0.05774059146642685, "learning_rate": 1.768352365415987e-06, "loss": -0.0389, "ppo_loss": -0.0719473659992218, "step": 153 }, { "epoch": 0.37, "grad_norm": 1.5889537672803584, "importance_ratio": 0.8125, "kl_div": -0.10110513865947723, "kl_div_neg": -0.2069135308265686, "kl_div_sft": 0.0047032469883561134, "learning_rate": 1.766721044045677e-06, "loss": 0.0353, "ppo_loss": 0.8130899667739868, "sft_loss": 0.08691839873790741, "step": 154 }, { "epoch": 0.38, "grad_norm": 1.4739593487883216, "importance_ratio": 0.96875, "kl_div": -0.03229506313800812, "kl_div_neg": -0.01941351592540741, "kl_div_pos": -0.04517660662531853, "learning_rate": 1.765089722675367e-06, "loss": -0.0083, "ppo_loss": 0.012472540140151978, "step": 155 }, { "epoch": 0.38, "grad_norm": 1.7783165585490306, "importance_ratio": 0.859375, "kl_div": -0.0714777484536171, "kl_div_neg": -0.15110665559768677, "kl_div_sft": 0.008151160553097725, "learning_rate": 1.7634584013050571e-06, "loss": 0.0427, "ppo_loss": 0.8597559928894043, "sft_loss": 0.048410769551992416, "step": 156 }, { "epoch": 0.38, "grad_norm": 1.2124782973244714, "importance_ratio": 0.9375, "kl_div": -0.07026248425245285, "kl_div_neg": -0.07026248425245285, "learning_rate": 1.761827079934747e-06, "loss": 0.1084, "ppo_loss": 0.9341276288032532, "step": 157 }, { "epoch": 0.38, "grad_norm": 3.2926519022016683, "importance_ratio": 0.90234375, "kl_div": -0.10565493255853653, "kl_div_neg": -0.18504543602466583, "kl_div_pos": -0.026264430955052376, "learning_rate": 1.760195758564437e-06, "loss": -0.0399, "ppo_loss": -0.07150548696517944, "step": 158 }, { "epoch": 0.39, "grad_norm": 1.6225790182659727, "importance_ratio": 0.9609375, "kl_div": -0.04186892881989479, "kl_div_neg": -0.04186892881989479, "learning_rate": 1.758564437194127e-06, "loss": -0.2452, "ppo_loss": 0.959962010383606, "step": 159 }, { "epoch": 0.39, "grad_norm": 0.9470105643846278, "kl_div": -0.11340246349573135, "kl_div_sft": -0.11340246349573135, "learning_rate": 1.7569331158238173e-06, "loss": -0.063, "sft_loss": 0.21769794821739197, "step": 160 }, { "epoch": 0.39, "grad_norm": 3.348421060625913, "importance_ratio": 0.76171875, "kl_div": -0.1341472864151001, "kl_div_neg": -0.27457693219184875, "kl_div_sft": 0.006282369140535593, "learning_rate": 1.7553017944535073e-06, "loss": -0.0256, "ppo_loss": 0.800000011920929, "sft_loss": 0.033535126596689224, "step": 161 }, { "epoch": 0.39, "grad_norm": 7.443811639615435, "importance_ratio": 0.9453125, "kl_div": -0.06028685346245766, "kl_div_neg": -0.06028685346245766, "learning_rate": 1.7536704730831972e-06, "loss": 0.1934, "ppo_loss": 0.9425788521766663, "step": 162 }, { "epoch": 0.4, "grad_norm": 3.5188537690765913, "importance_ratio": 0.90234375, "kl_div": -0.04902525618672371, "kl_div_pos": -0.10362156480550766, "kl_div_sft": 0.005571051966398954, "learning_rate": 1.7520391517128874e-06, "loss": -0.0139, "ppo_loss": -0.9015664458274841, "sft_loss": 0.07571996748447418, "step": 163 }, { "epoch": 0.4, "grad_norm": 3.0153035205000087, "kl_div": 0.014165805652737617, "kl_div_sft": 0.014165805652737617, "learning_rate": 1.7504078303425774e-06, "loss": -0.104, "sft_loss": 0.040061578154563904, "step": 164 }, { "epoch": 0.4, "grad_norm": 10.233874406292543, "importance_ratio": 0.90625, "kl_div": -0.10461732745170593, "kl_div_neg": -0.2085922807455063, "kl_div_pos": -0.0006423748563975096, "learning_rate": 1.7487765089722674e-06, "loss": 0.0406, "ppo_loss": -0.0938158929347992, "step": 165 }, { "epoch": 0.4, "grad_norm": 4.848785736038836, "importance_ratio": 0.9609375, "kl_div": -0.01604663021862507, "kl_div_neg": -0.03924839571118355, "kl_div_sft": 0.007155134342610836, "learning_rate": 1.7471451876019576e-06, "loss": 0.0303, "ppo_loss": 0.9615119099617004, "sft_loss": 0.02133319526910782, "step": 166 }, { "epoch": 0.4, "grad_norm": 2.66964325082396, "importance_ratio": 0.85546875, "kl_div": -0.07945609837770462, "kl_div_pos": -0.1554151475429535, "kl_div_sft": -0.003497056197375059, "learning_rate": 1.7455138662316476e-06, "loss": -0.0582, "ppo_loss": -0.856059730052948, "sft_loss": 0.14876271784305573, "step": 167 }, { "epoch": 0.41, "grad_norm": 8.793669799352205, "importance_ratio": 0.8203125, "kl_div": -0.21741493046283722, "kl_div_pos": -0.21741493046283722, "learning_rate": 1.7438825448613378e-06, "loss": 0.1567, "ppo_loss": -0.820621907711029, "step": 168 }, { "epoch": 0.41, "grad_norm": 1.0459297239608907, "importance_ratio": 0.74609375, "kl_div": -0.14068058133125305, "kl_div_neg": -0.2933712899684906, "kl_div_sft": 0.012010131031274796, "learning_rate": 1.7422512234910276e-06, "loss": 0.073, "ppo_loss": 0.800000011920929, "sft_loss": 0.04012210667133331, "step": 169 }, { "epoch": 0.41, "grad_norm": 29.005732528782133, "kl_div": -0.2703365087509155, "kl_div_sft": -0.2703365087509155, "learning_rate": 1.7406199021207178e-06, "loss": -0.1284, "sft_loss": 0.33477458357810974, "step": 170 }, { "epoch": 0.41, "grad_norm": 1.2141234786382378, "importance_ratio": 0.8046875, "kl_div": -0.2189944088459015, "kl_div_neg": -0.20113199949264526, "kl_div_pos": -0.23685681819915771, "learning_rate": 1.7389885807504077e-06, "loss": 0.0753, "ppo_loss": 0.014350086450576782, "step": 171 }, { "epoch": 0.42, "grad_norm": 5.335803095854867, "importance_ratio": 0.875, "kl_div": -0.1390659213066101, "kl_div_neg": -0.2621629536151886, "kl_div_pos": -0.015968896448612213, "learning_rate": 1.7373572593800977e-06, "loss": -0.1297, "ppo_loss": -0.09207895398139954, "step": 172 }, { "epoch": 0.42, "grad_norm": 4.069993370856808, "importance_ratio": 0.84765625, "kl_div": -0.08496677875518799, "kl_div_pos": -0.16756807267665863, "kl_div_sft": -0.002365480177104473, "learning_rate": 1.735725938009788e-06, "loss": 0.0029, "ppo_loss": -0.845719039440155, "sft_loss": 0.07888627797365189, "step": 173 }, { "epoch": 0.42, "grad_norm": 2.3106871019829285, "importance_ratio": 0.87890625, "kl_div": -0.12879464030265808, "kl_div_pos": -0.12879464030265808, "learning_rate": 1.734094616639478e-06, "loss": -0.1655, "ppo_loss": -0.8791550993919373, "step": 174 }, { "epoch": 0.42, "grad_norm": 1.373166075045685, "importance_ratio": 0.78515625, "kl_div": -0.11861623823642731, "kl_div_neg": -0.243907630443573, "kl_div_sft": 0.006675161421298981, "learning_rate": 1.732463295269168e-06, "loss": 0.0095, "ppo_loss": 0.800000011920929, "sft_loss": 0.0354793556034565, "step": 175 }, { "epoch": 0.43, "grad_norm": 1.2951290833639222, "importance_ratio": 0.9921875, "kl_div": -0.008589332923293114, "kl_div_pos": -0.008589332923293114, "learning_rate": 1.7308319738988579e-06, "loss": -0.0586, "ppo_loss": -0.9914485216140747, "step": 176 }, { "epoch": 0.43, "grad_norm": 1.0655123698654576, "importance_ratio": 0.70703125, "kl_div": -0.17774005234241486, "kl_div_neg": -0.3439978063106537, "kl_div_sft": -0.011482291854918003, "learning_rate": 1.729200652528548e-06, "loss": -0.0304, "ppo_loss": 0.800000011920929, "sft_loss": 0.0535435788333416, "step": 177 }, { "epoch": 0.43, "grad_norm": 4.031029983549054, "importance_ratio": 0.859375, "kl_div": -0.15722407400608063, "kl_div_neg": -0.03494102135300636, "kl_div_pos": -0.2795071303844452, "learning_rate": 1.727569331158238e-06, "loss": 0.025, "ppo_loss": 0.1047530472278595, "step": 178 }, { "epoch": 0.43, "grad_norm": 7.814317659645009, "importance_ratio": 0.890625, "kl_div": -0.07359719276428223, "kl_div_neg": -0.11783038079738617, "kl_div_sft": -0.029364006593823433, "learning_rate": 1.7259380097879282e-06, "loss": -0.0495, "ppo_loss": 0.8888468742370605, "sft_loss": 0.14391446113586426, "step": 179 }, { "epoch": 0.44, "grad_norm": 7.462387375484931, "importance_ratio": 0.9453125, "kl_div": -0.06070178002119064, "kl_div_neg": -0.06070178002119064, "learning_rate": 1.7243066884176182e-06, "loss": -0.0258, "ppo_loss": 0.9421994686126709, "step": 180 }, { "epoch": 0.44, "grad_norm": 2.2803669095537016, "importance_ratio": 0.8046875, "kl_div": -0.17116063833236694, "kl_div_neg": -0.21698014438152313, "kl_div_sft": -0.12534114718437195, "learning_rate": 1.7226753670473082e-06, "loss": -0.006, "ppo_loss": 0.8049459457397461, "sft_loss": 0.2503657639026642, "step": 181 }, { "epoch": 0.44, "grad_norm": 0.8677791261282602, "kl_div": -0.007281461730599403, "kl_div_sft": -0.007281461730599403, "learning_rate": 1.7210440456769984e-06, "loss": -0.111, "sft_loss": 0.06887984275817871, "step": 182 }, { "epoch": 0.44, "grad_norm": 3.1503894141349194, "importance_ratio": 1.0234375, "kl_div": 0.006265717558562756, "kl_div_neg": 0.0197683684527874, "kl_div_sft": -0.007236933335661888, "learning_rate": 1.7194127243066882e-06, "loss": 0.0205, "ppo_loss": 1.0199650526046753, "sft_loss": 0.061343319714069366, "step": 183 }, { "epoch": 0.45, "grad_norm": 6.572701218384443, "importance_ratio": 0.984375, "kl_div": -0.019500788301229477, "kl_div_pos": -0.019500788301229477, "learning_rate": 1.7177814029363784e-06, "loss": -0.1272, "ppo_loss": -0.9808427095413208, "step": 184 }, { "epoch": 0.45, "grad_norm": 2.2136950869779586, "importance_ratio": 0.83203125, "kl_div": -0.19927285611629486, "kl_div_neg": -0.19927285611629486, "learning_rate": 1.7161500815660684e-06, "loss": 0.1022, "ppo_loss": 0.8878340721130371, "step": 185 }, { "epoch": 0.45, "grad_norm": 1.0749154345939593, "importance_ratio": 0.9765625, "kl_div": -0.007401918526738882, "kl_div_pos": -0.023486772552132607, "kl_div_sft": 0.008682935498654842, "learning_rate": 1.7145187601957586e-06, "loss": -0.0568, "ppo_loss": -0.9767869114875793, "sft_loss": 0.038777194917201996, "step": 186 }, { "epoch": 0.45, "grad_norm": 3.193871506621026, "importance_ratio": 0.8203125, "kl_div": -0.2192690223455429, "kl_div_neg": -0.44032028317451477, "kl_div_pos": 0.0017822531517595053, "learning_rate": 1.7128874388254485e-06, "loss": 0.0145, "ppo_loss": -0.10089191794395447, "step": 187 }, { "epoch": 0.46, "grad_norm": 9.937608250442247, "importance_ratio": 0.984375, "kl_div": -0.01901225373148918, "kl_div_neg": -0.009077527560293674, "kl_div_pos": -0.028946978971362114, "learning_rate": 1.7112561174551385e-06, "loss": 0.0223, "ppo_loss": 0.009747833013534546, "step": 188 }, { "epoch": 0.46, "grad_norm": 2.626869262718091, "importance_ratio": 0.98046875, "kl_div": -0.015202803537249565, "kl_div_pos": -0.02008308283984661, "kl_div_sft": -0.01032252423465252, "learning_rate": 1.7096247960848287e-06, "loss": 0.0366, "ppo_loss": -0.9801172018051147, "sft_loss": 0.08730190247297287, "step": 189 }, { "epoch": 0.46, "grad_norm": 1.7644146171659656, "importance_ratio": 0.95703125, "kl_div": -0.01875142753124237, "kl_div_pos": -0.045739587396383286, "kl_div_sft": 0.008236734196543694, "learning_rate": 1.7079934747145187e-06, "loss": 0.0074, "ppo_loss": -0.9552907347679138, "sft_loss": 0.03919677063822746, "step": 190 }, { "epoch": 0.46, "grad_norm": 1.3433554757725241, "importance_ratio": 0.80078125, "kl_div": -0.24683430790901184, "kl_div_neg": -0.47321048378944397, "kl_div_pos": -0.020458126440644264, "learning_rate": 1.7063621533442087e-06, "loss": 0.0831, "ppo_loss": -0.08987483382225037, "step": 191 }, { "epoch": 0.47, "grad_norm": 12.54506768194302, "kl_div": -0.018011918291449547, "kl_div_sft": -0.018011918291449547, "learning_rate": 1.7047308319738989e-06, "loss": 0.1565, "sft_loss": 0.16000191867351532, "step": 192 }, { "epoch": 0.47, "grad_norm": 1.832165288190318, "importance_ratio": 0.73828125, "kl_div": -0.1693369597196579, "kl_div_pos": -0.3051265478134155, "kl_div_sft": -0.033547379076480865, "learning_rate": 1.7030995106035889e-06, "loss": -0.0637, "ppo_loss": -0.7370301485061646, "sft_loss": 0.10590871423482895, "step": 193 }, { "epoch": 0.47, "grad_norm": 1.2331892184952706, "importance_ratio": 0.99609375, "kl_div": 0.0009276217315346003, "kl_div_neg": -0.003930038772523403, "kl_div_sft": 0.005785282235592604, "learning_rate": 1.7014681892332789e-06, "loss": -0.0689, "ppo_loss": 0.9960777163505554, "sft_loss": 0.038523055613040924, "step": 194 }, { "epoch": 0.47, "grad_norm": 10.673199639834243, "kl_div": 0.007861132733523846, "kl_div_sft": 0.007861132733523846, "learning_rate": 1.6998368678629688e-06, "loss": 0.1179, "sft_loss": 0.0692097395658493, "step": 195 }, { "epoch": 0.48, "grad_norm": 1.1172814496424563, "importance_ratio": 0.96484375, "kl_div": -0.022414401173591614, "kl_div_pos": -0.034341100603342056, "kl_div_sft": -0.010487699881196022, "learning_rate": 1.698205546492659e-06, "loss": -0.1272, "ppo_loss": -0.9662418961524963, "sft_loss": 0.0791834369301796, "step": 196 }, { "epoch": 0.48, "grad_norm": 1.7431361604909785, "importance_ratio": 0.9140625, "kl_div": -0.09347224235534668, "kl_div_neg": -0.026140188798308372, "kl_div_pos": -0.16080430150032043, "learning_rate": 1.696574225122349e-06, "loss": -0.0508, "ppo_loss": 0.06136992573738098, "step": 197 }, { "epoch": 0.48, "grad_norm": 5.576029921025871, "importance_ratio": 0.953125, "kl_div": -0.023265577852725983, "kl_div_pos": -0.04682733118534088, "kl_div_sft": 0.00029617652762681246, "learning_rate": 1.694942903752039e-06, "loss": 0.0361, "ppo_loss": -0.9542521834373474, "sft_loss": 0.09867709875106812, "step": 198 }, { "epoch": 0.48, "grad_norm": 0.9557656152134608, "kl_div": -0.019151723012328148, "kl_div_sft": -0.019151723012328148, "learning_rate": 1.6933115823817292e-06, "loss": 0.0791, "sft_loss": 0.0674813762307167, "step": 199 }, { "epoch": 0.48, "grad_norm": 4.801397620814088, "importance_ratio": 1.0, "kl_div": -0.057284507900476456, "kl_div_neg": 0.0016344115138053894, "kl_div_sft": -0.1162034273147583, "learning_rate": 1.6916802610114192e-06, "loss": 0.1107, "ppo_loss": 1.0016357898712158, "sft_loss": 0.16367655992507935, "step": 200 }, { "epoch": 0.49, "grad_norm": 5.25441102506617, "importance_ratio": 0.9921875, "kl_div": -0.006039372645318508, "kl_div_pos": -0.009510613977909088, "kl_div_sft": -0.002568131545558572, "learning_rate": 1.6900489396411094e-06, "loss": 0.0823, "ppo_loss": -0.9905344247817993, "sft_loss": 0.0743524581193924, "step": 201 }, { "epoch": 0.49, "grad_norm": 2.6389299177583028, "importance_ratio": 0.98046875, "kl_div": -0.009905396960675716, "kl_div_pos": -0.019572056829929352, "kl_div_sft": -0.00023873659665696323, "learning_rate": 1.6884176182707991e-06, "loss": -0.0591, "ppo_loss": -0.9806182384490967, "sft_loss": 0.09485552459955215, "step": 202 }, { "epoch": 0.49, "grad_norm": 3.628003816116833, "importance_ratio": 0.98046875, "kl_div": -0.01536363735795021, "kl_div_neg": -0.02144819125533104, "kl_div_sft": -0.009279083460569382, "learning_rate": 1.6867862969004893e-06, "loss": -0.0365, "ppo_loss": 0.9787802696228027, "sft_loss": 0.06808764487504959, "step": 203 }, { "epoch": 0.49, "grad_norm": 10.045426758225995, "importance_ratio": 0.90625, "kl_div": -0.09950728714466095, "kl_div_neg": -0.128764808177948, "kl_div_pos": -0.0702497735619545, "learning_rate": 1.6851549755301793e-06, "loss": -0.0343, "ppo_loss": -0.026490122079849243, "step": 204 }, { "epoch": 0.5, "grad_norm": 1.1874430793371316, "importance_ratio": 0.99609375, "kl_div": -0.0002015829086303711, "kl_div_pos": -0.00569371972233057, "kl_div_sft": 0.005290553905069828, "learning_rate": 1.6835236541598693e-06, "loss": -0.0687, "ppo_loss": -0.9943224191665649, "sft_loss": 0.11273713409900665, "step": 205 }, { "epoch": 0.5, "grad_norm": 1.8097189251913681, "kl_div": -0.0048973290249705315, "kl_div_sft": -0.0048973290249705315, "learning_rate": 1.6818923327895595e-06, "loss": 0.0452, "sft_loss": 0.09427434206008911, "step": 206 }, { "epoch": 0.5, "grad_norm": 1.4893219280512007, "importance_ratio": 0.859375, "kl_div": -0.08186168968677521, "kl_div_pos": -0.15327724814414978, "kl_div_sft": -0.010446123778820038, "learning_rate": 1.6802610114192495e-06, "loss": 0.0751, "ppo_loss": -0.8578917980194092, "sft_loss": 0.1798616498708725, "step": 207 }, { "epoch": 0.5, "grad_norm": 4.306058634144844, "importance_ratio": 0.94140625, "kl_div": -0.06163910776376724, "kl_div_pos": -0.06163910776376724, "learning_rate": 1.6786296900489397e-06, "loss": -0.1196, "ppo_loss": -0.9414124488830566, "step": 208 }, { "epoch": 0.51, "grad_norm": 0.9217491710587724, "importance_ratio": 0.94921875, "kl_div": -0.026861974969506264, "kl_div_pos": -0.05260562524199486, "kl_div_sft": -0.001118326443247497, "learning_rate": 1.6769983686786295e-06, "loss": 0.0148, "ppo_loss": -0.9487541317939758, "sft_loss": 0.05310088023543358, "step": 209 }, { "epoch": 0.51, "grad_norm": 1.013299539205397, "importance_ratio": 0.9765625, "kl_div": -0.01243782788515091, "kl_div_pos": -0.023766396567225456, "kl_div_sft": -0.001109258970245719, "learning_rate": 1.6753670473083197e-06, "loss": -0.0859, "ppo_loss": -0.9765138030052185, "sft_loss": 0.0629742220044136, "step": 210 }, { "epoch": 0.51, "grad_norm": 1.3299034777013468, "importance_ratio": 0.9921875, "kl_div": -0.004810581915080547, "kl_div_neg": -0.0062462324276566505, "kl_div_sft": -0.003374930936843157, "learning_rate": 1.6737357259380096e-06, "loss": 0.0774, "ppo_loss": 0.9937732815742493, "sft_loss": 0.1101992055773735, "step": 211 }, { "epoch": 0.51, "grad_norm": 3.5440785143057942, "importance_ratio": 0.79296875, "kl_div": -0.11117896437644958, "kl_div_pos": -0.2340003401041031, "kl_div_sft": 0.011642408557236195, "learning_rate": 1.6721044045676998e-06, "loss": -0.0015, "ppo_loss": -0.7913615703582764, "sft_loss": 0.030798746272921562, "step": 212 }, { "epoch": 0.52, "grad_norm": 1.2810043006336405, "importance_ratio": 0.97265625, "kl_div": -0.018537085503339767, "kl_div_neg": -0.02736750990152359, "kl_div_sft": -0.009706659242510796, "learning_rate": 1.6704730831973898e-06, "loss": -0.075, "ppo_loss": 0.973003625869751, "sft_loss": 0.10553260892629623, "step": 213 }, { "epoch": 0.52, "grad_norm": 1.7161154235399265, "importance_ratio": 0.98828125, "kl_div": 0.005109624471515417, "kl_div_pos": -0.012915403582155704, "kl_div_sft": 0.02313465252518654, "learning_rate": 1.6688417618270798e-06, "loss": -0.0869, "ppo_loss": -0.9871676564216614, "sft_loss": 0.08311924338340759, "step": 214 }, { "epoch": 0.52, "grad_norm": 0.9455897657608686, "importance_ratio": 0.73828125, "kl_div": -0.15399585664272308, "kl_div_neg": -0.30571627616882324, "kl_div_sft": -0.002275428967550397, "learning_rate": 1.66721044045677e-06, "loss": 0.0262, "ppo_loss": 0.800000011920929, "sft_loss": 0.06958441436290741, "step": 215 }, { "epoch": 0.52, "grad_norm": 0.8223109289451344, "importance_ratio": 0.6875, "kl_div": -0.3769010901451111, "kl_div_neg": -0.3769010901451111, "learning_rate": 1.6655791190864598e-06, "loss": -0.0299, "ppo_loss": 0.800000011920929, "step": 216 }, { "epoch": 0.53, "grad_norm": 4.10171339938189, "importance_ratio": 0.9765625, "kl_div": -0.025581523776054382, "kl_div_neg": -0.045762915164232254, "kl_div_pos": -0.005400133319199085, "learning_rate": 1.66394779771615e-06, "loss": 0.1311, "ppo_loss": -0.01967298984527588, "step": 217 }, { "epoch": 0.53, "grad_norm": 1.3111783139513318, "importance_ratio": 0.859375, "kl_div": -0.15934057533740997, "kl_div_neg": -0.2929997742176056, "kl_div_pos": -0.025681370869278908, "learning_rate": 1.6623164763458402e-06, "loss": -0.1031, "ppo_loss": -0.08732280135154724, "step": 218 }, { "epoch": 0.53, "grad_norm": 2.429477891129353, "importance_ratio": 0.8046875, "kl_div": -0.10484195500612259, "kl_div_neg": -0.21933768689632416, "kl_div_sft": 0.009653773158788681, "learning_rate": 1.6606851549755301e-06, "loss": -0.0789, "ppo_loss": 0.8030505180358887, "sft_loss": 0.024198254570364952, "step": 219 }, { "epoch": 0.53, "grad_norm": 15.890667250435474, "importance_ratio": 0.98828125, "kl_div": -0.0036477274261415005, "kl_div_pos": -0.012399217113852501, "kl_div_sft": 0.0051037622615695, "learning_rate": 1.6590538336052201e-06, "loss": -0.0181, "ppo_loss": -0.9876773357391357, "sft_loss": 0.0732378140091896, "step": 220 }, { "epoch": 0.54, "grad_norm": 1.4782304173757412, "importance_ratio": 0.87109375, "kl_div": -0.07580201327800751, "kl_div_neg": -0.1377091109752655, "kl_div_sft": -0.013894918374717236, "learning_rate": 1.6574225122349101e-06, "loss": 0.0889, "ppo_loss": 0.8713521361351013, "sft_loss": 0.09440121054649353, "step": 221 }, { "epoch": 0.54, "grad_norm": 1.1372404179600206, "kl_div": -0.0017525558359920979, "kl_div_sft": -0.0017525558359920979, "learning_rate": 1.6557911908646003e-06, "loss": -0.0755, "sft_loss": 0.04836359620094299, "step": 222 }, { "epoch": 0.54, "grad_norm": 1.1127947898940231, "importance_ratio": 0.890625, "kl_div": -0.11645431816577911, "kl_div_neg": -0.11645431816577911, "learning_rate": 1.6541598694942903e-06, "loss": -0.0162, "ppo_loss": 0.8909889459609985, "step": 223 }, { "epoch": 0.54, "grad_norm": 1.2737183524440732, "kl_div": -0.024832893162965775, "kl_div_sft": -0.024832893162965775, "learning_rate": 1.6525285481239803e-06, "loss": 0.0049, "sft_loss": 0.09953819215297699, "step": 224 }, { "epoch": 0.55, "grad_norm": 1.3164385783133985, "kl_div": 0.0015126760117709637, "kl_div_sft": 0.0015126760117709637, "learning_rate": 1.6508972267536705e-06, "loss": 0.123, "sft_loss": 0.06987418234348297, "step": 225 }, { "epoch": 0.55, "grad_norm": 3.5919973328884223, "importance_ratio": 0.734375, "kl_div": -0.14923806488513947, "kl_div_neg": -0.3067997097969055, "kl_div_sft": 0.00832358468323946, "learning_rate": 1.6492659053833605e-06, "loss": 0.0632, "ppo_loss": 0.800000011920929, "sft_loss": 0.04831942170858383, "step": 226 }, { "epoch": 0.55, "grad_norm": 1.0691139583028615, "kl_div": -0.00811840407550335, "kl_div_sft": -0.00811840407550335, "learning_rate": 1.6476345840130507e-06, "loss": -0.151, "sft_loss": 0.06757162511348724, "step": 227 }, { "epoch": 0.55, "grad_norm": 3.1649456534156646, "importance_ratio": 0.9921875, "kl_div": -0.009984659031033516, "kl_div_neg": -0.007218531798571348, "kl_div_sft": -0.012750785797834396, "learning_rate": 1.6460032626427404e-06, "loss": -0.0265, "ppo_loss": 0.9928075075149536, "sft_loss": 0.10508273541927338, "step": 228 }, { "epoch": 0.56, "grad_norm": 2.9554347133869787, "importance_ratio": 0.80078125, "kl_div": -0.23050877451896667, "kl_div_neg": -0.34867945313453674, "kl_div_pos": -0.1123380959033966, "learning_rate": 1.6443719412724306e-06, "loss": -0.0503, "ppo_loss": -0.04687100648880005, "step": 229 }, { "epoch": 0.56, "grad_norm": 1.91213082134934, "kl_div": -0.00412414874881506, "kl_div_sft": -0.00412414874881506, "learning_rate": 1.6427406199021206e-06, "loss": -0.0354, "sft_loss": 0.0650189146399498, "step": 230 }, { "epoch": 0.56, "grad_norm": 1.5237829354711199, "importance_ratio": 0.7578125, "kl_div": -0.27641308307647705, "kl_div_neg": -0.27641308307647705, "learning_rate": 1.6411092985318106e-06, "loss": 0.1837, "ppo_loss": 0.800000011920929, "step": 231 }, { "epoch": 0.56, "grad_norm": 1.1111890977405692, "importance_ratio": 0.91015625, "kl_div": -0.0970386490225792, "kl_div_neg": -0.1912834793329239, "kl_div_pos": -0.0027938156854361296, "learning_rate": 1.6394779771615008e-06, "loss": -0.1116, "ppo_loss": -0.0856558084487915, "step": 232 }, { "epoch": 0.56, "grad_norm": 1.1335844369475514, "kl_div": -0.020297205075621605, "kl_div_sft": -0.020297205075621605, "learning_rate": 1.6378466557911908e-06, "loss": 0.038, "sft_loss": 0.12747663259506226, "step": 233 }, { "epoch": 0.57, "grad_norm": 4.644416672919157, "kl_div": -0.0027559231966733932, "kl_div_sft": -0.0027559231966733932, "learning_rate": 1.636215334420881e-06, "loss": 0.031, "sft_loss": 0.12215368449687958, "step": 234 }, { "epoch": 0.57, "grad_norm": 1.0598840486338408, "kl_div": -0.002642288338392973, "kl_div_sft": -0.002642288338392973, "learning_rate": 1.6345840130505707e-06, "loss": 0.0129, "sft_loss": 0.0570257268846035, "step": 235 }, { "epoch": 0.57, "grad_norm": 1.0173545782221605, "importance_ratio": 0.99609375, "kl_div": -0.013152987696230412, "kl_div_pos": -0.0037110077682882547, "kl_div_sft": -0.022594967857003212, "learning_rate": 1.632952691680261e-06, "loss": -0.0591, "ppo_loss": -0.9962958693504333, "sft_loss": 0.0943819060921669, "step": 236 }, { "epoch": 0.57, "grad_norm": 1.0482286839391024, "importance_ratio": 0.921875, "kl_div": -0.08936250954866409, "kl_div_neg": -0.08936250954866409, "learning_rate": 1.6313213703099511e-06, "loss": 0.0602, "ppo_loss": 0.9185183644294739, "step": 237 }, { "epoch": 0.58, "grad_norm": 12.773073150384684, "kl_div": 0.004495954606682062, "kl_div_sft": 0.004495954606682062, "learning_rate": 1.6296900489396411e-06, "loss": -0.0781, "sft_loss": 0.060407549142837524, "step": 238 }, { "epoch": 0.58, "grad_norm": 1.1713831059923707, "importance_ratio": 1.0, "kl_div": -0.004435502924025059, "kl_div_pos": -7.828343950677663e-05, "kl_div_sft": -0.008792722597718239, "learning_rate": 1.628058727569331e-06, "loss": 0.0049, "ppo_loss": -0.9999216794967651, "sft_loss": 0.10335825383663177, "step": 239 }, { "epoch": 0.58, "grad_norm": 3.5700552616063033, "importance_ratio": 0.71875, "kl_div": -0.3352140784263611, "kl_div_neg": -0.25275135040283203, "kl_div_pos": -0.41767677664756775, "learning_rate": 1.626427406199021e-06, "loss": -0.0569, "ppo_loss": 0.07071244716644287, "step": 240 }, { "epoch": 0.58, "grad_norm": 0.9741228070160373, "kl_div": 0.005845887586474419, "kl_div_sft": 0.005845887586474419, "learning_rate": 1.6247960848287113e-06, "loss": -0.0022, "sft_loss": 0.07129596918821335, "step": 241 }, { "epoch": 0.59, "grad_norm": 2.8899753149753704, "kl_div": -0.012755146250128746, "kl_div_sft": -0.012755146250128746, "learning_rate": 1.623164763458401e-06, "loss": 0.0202, "sft_loss": 0.12924881279468536, "step": 242 }, { "epoch": 0.59, "grad_norm": 1.1576247419532693, "importance_ratio": 0.9453125, "kl_div": -0.01969856396317482, "kl_div_pos": -0.054807450622320175, "kl_div_sft": 0.015410324558615685, "learning_rate": 1.6215334420880912e-06, "loss": 0.0741, "ppo_loss": -0.9466674327850342, "sft_loss": 0.12107433378696442, "step": 243 }, { "epoch": 0.59, "grad_norm": 1.158357915457366, "kl_div": -0.023490093648433685, "kl_div_sft": -0.023490093648433685, "learning_rate": 1.6199021207177814e-06, "loss": -0.0097, "sft_loss": 0.09046634286642075, "step": 244 }, { "epoch": 0.59, "grad_norm": 1.1750598608149345, "importance_ratio": 0.9765625, "kl_div": -0.021390119567513466, "kl_div_neg": -0.00024339275842066854, "kl_div_pos": -0.04253684729337692, "learning_rate": 1.6182707993474714e-06, "loss": -0.0526, "ppo_loss": 0.02070075273513794, "step": 245 }, { "epoch": 0.6, "grad_norm": 6.0861800687779395, "importance_ratio": 0.75390625, "kl_div": -0.1369992047548294, "kl_div_neg": -0.2839204967021942, "kl_div_sft": 0.009922093711793423, "learning_rate": 1.6166394779771614e-06, "loss": -0.1549, "ppo_loss": 0.800000011920929, "sft_loss": 0.041812244802713394, "step": 246 }, { "epoch": 0.6, "grad_norm": 4.37485394915259, "importance_ratio": 0.88671875, "kl_div": -0.12540145218372345, "kl_div_pos": -0.12540145218372345, "learning_rate": 1.6150081566068514e-06, "loss": -0.1772, "ppo_loss": -0.8872189521789551, "step": 247 }, { "epoch": 0.6, "grad_norm": 2.8037534555727666, "kl_div": 0.006541554816067219, "kl_div_sft": 0.006541554816067219, "learning_rate": 1.6133768352365416e-06, "loss": 0.0927, "sft_loss": 0.06823631376028061, "step": 248 }, { "epoch": 0.6, "grad_norm": 1.2486541655522805, "kl_div": -0.010986318811774254, "kl_div_sft": -0.010986318811774254, "learning_rate": 1.6117455138662316e-06, "loss": 0.0299, "sft_loss": 0.10466299951076508, "step": 249 }, { "epoch": 0.61, "grad_norm": 2.5603761943435734, "importance_ratio": 1.0, "kl_div": -0.004413792863488197, "kl_div_pos": 0.0007278404664248228, "kl_div_sft": -0.009555426426231861, "learning_rate": 1.6101141924959216e-06, "loss": 0.0637, "ppo_loss": -1.0007281303405762, "sft_loss": 0.07314762473106384, "step": 250 }, { "epoch": 0.61, "grad_norm": 5.240721723634095, "importance_ratio": 0.984375, "kl_div": -0.016420193016529083, "kl_div_neg": -0.030372580513358116, "kl_div_pos": -0.002467805054038763, "learning_rate": 1.6084828711256118e-06, "loss": 0.0825, "ppo_loss": -0.013725578784942627, "step": 251 }, { "epoch": 0.61, "grad_norm": 1.057452047176148, "importance_ratio": 0.7734375, "kl_div": -0.13504436612129211, "kl_div_pos": -0.25911781191825867, "kl_div_sft": -0.01097092591226101, "learning_rate": 1.6068515497553017e-06, "loss": 0.0304, "ppo_loss": -0.7717321515083313, "sft_loss": 0.08440785109996796, "step": 252 }, { "epoch": 0.61, "grad_norm": 0.9980315862331917, "importance_ratio": 1.015625, "kl_div": 0.006387812085449696, "kl_div_pos": 0.013380873948335648, "kl_div_sft": -0.00060524936998263, "learning_rate": 1.6052202283849917e-06, "loss": 0.0603, "ppo_loss": -1.0134707689285278, "sft_loss": 0.07214315980672836, "step": 253 }, { "epoch": 0.62, "grad_norm": 1.3425978223228665, "kl_div": 0.004152823239564896, "kl_div_sft": 0.004152823239564896, "learning_rate": 1.6035889070146817e-06, "loss": 0.0237, "sft_loss": 0.04243698716163635, "step": 254 }, { "epoch": 0.62, "grad_norm": 1.0893081388019052, "importance_ratio": 0.98828125, "kl_div": -0.012175730429589748, "kl_div_pos": -0.011445739306509495, "kl_div_sft": -0.012905721552670002, "learning_rate": 1.601957585644372e-06, "loss": 0.1381, "ppo_loss": -0.9886195063591003, "sft_loss": 0.05715664103627205, "step": 255 }, { "epoch": 0.62, "grad_norm": 1.2376164237078648, "importance_ratio": 0.82421875, "kl_div": -0.20184333622455597, "kl_div_neg": -0.3390652537345886, "kl_div_pos": -0.06462141871452332, "learning_rate": 1.6003262642740619e-06, "loss": -0.1227, "ppo_loss": -0.06871113181114197, "step": 256 }, { "epoch": 0.62, "grad_norm": 1.0333806770512999, "importance_ratio": 0.98828125, "kl_div": -0.08744671195745468, "kl_div_pos": -0.012920528650283813, "kl_div_sft": -0.16197289526462555, "learning_rate": 1.5986949429037519e-06, "loss": -0.0855, "ppo_loss": -0.9871625900268555, "sft_loss": 0.21870480477809906, "step": 257 }, { "epoch": 0.63, "grad_norm": 2.033737068533743, "importance_ratio": 0.99609375, "kl_div": -0.009044161066412926, "kl_div_pos": -0.004775775596499443, "kl_div_sft": -0.013312545605003834, "learning_rate": 1.597063621533442e-06, "loss": 0.0067, "ppo_loss": -0.9952355623245239, "sft_loss": 0.08667251467704773, "step": 258 }, { "epoch": 0.63, "grad_norm": 2.17227103375518, "kl_div": -0.02424740232527256, "kl_div_sft": -0.02424740232527256, "learning_rate": 1.595432300163132e-06, "loss": 0.0051, "sft_loss": 0.1608126163482666, "step": 259 }, { "epoch": 0.63, "grad_norm": 3.214808448792363, "kl_div": -0.010200893506407738, "kl_div_sft": -0.010200893506407738, "learning_rate": 1.5938009787928222e-06, "loss": -0.0841, "sft_loss": 0.059631407260894775, "step": 260 }, { "epoch": 0.63, "grad_norm": 3.708067995215448, "kl_div": -0.012492822483181953, "kl_div_sft": -0.012492822483181953, "learning_rate": 1.592169657422512e-06, "loss": -0.1127, "sft_loss": 0.05107155442237854, "step": 261 }, { "epoch": 0.64, "grad_norm": 3.4311475839252066, "importance_ratio": 0.7734375, "kl_div": -0.2557826042175293, "kl_div_neg": -0.28364330530166626, "kl_div_pos": -0.22792188823223114, "learning_rate": 1.5905383360522022e-06, "loss": 0.0694, "ppo_loss": 0.0019067823886871338, "step": 262 }, { "epoch": 0.64, "grad_norm": 0.8276769037534126, "importance_ratio": 0.7890625, "kl_div": -0.11510230600833893, "kl_div_neg": -0.23486950993537903, "kl_div_sft": 0.0046649049036204815, "learning_rate": 1.5889070146818924e-06, "loss": 0.0593, "ppo_loss": 0.800000011920929, "sft_loss": 0.03703388571739197, "step": 263 }, { "epoch": 0.64, "grad_norm": 1.0369255060975295, "kl_div": -0.0018341443501412868, "kl_div_sft": -0.0018341443501412868, "learning_rate": 1.5872756933115822e-06, "loss": 0.0103, "sft_loss": 0.07939282059669495, "step": 264 }, { "epoch": 0.64, "grad_norm": 0.8583886121029157, "importance_ratio": 0.9296875, "kl_div": -0.040981777012348175, "kl_div_pos": -0.07225144654512405, "kl_div_sft": -0.00971211027354002, "learning_rate": 1.5856443719412724e-06, "loss": -0.0687, "ppo_loss": -0.9302969574928284, "sft_loss": 0.11106070131063461, "step": 265 }, { "epoch": 0.64, "grad_norm": 9.116483711857038, "importance_ratio": 0.828125, "kl_div": -0.0933116152882576, "kl_div_neg": -0.19036899507045746, "kl_div_sft": 0.003745768219232559, "learning_rate": 1.5840130505709624e-06, "loss": -0.0504, "ppo_loss": 0.8266540765762329, "sft_loss": 0.09498132020235062, "step": 266 }, { "epoch": 0.65, "grad_norm": 1.8274568820359849, "importance_ratio": 1.0078125, "kl_div": -0.008119492791593075, "kl_div_pos": 0.004711403977125883, "kl_div_sft": -0.02095039002597332, "learning_rate": 1.5823817292006526e-06, "loss": -0.0648, "ppo_loss": -1.0047225952148438, "sft_loss": 0.08279009163379669, "step": 267 }, { "epoch": 0.65, "grad_norm": 9.420829550466632, "importance_ratio": 0.6953125, "kl_div": -0.17865419387817383, "kl_div_neg": -0.36194297671318054, "kl_div_sft": 0.004634591285139322, "learning_rate": 1.5807504078303423e-06, "loss": 0.1689, "ppo_loss": 0.800000011920929, "sft_loss": 0.05000005289912224, "step": 268 }, { "epoch": 0.65, "grad_norm": 0.9602763685138183, "importance_ratio": 0.81640625, "kl_div": -0.10194914042949677, "kl_div_neg": -0.20226718485355377, "kl_div_sft": -0.0016310999635607004, "learning_rate": 1.5791190864600325e-06, "loss": 0.026, "ppo_loss": 0.8168765902519226, "sft_loss": 0.03999049589037895, "step": 269 }, { "epoch": 0.65, "grad_norm": 0.9590098266953325, "kl_div": -0.12230391055345535, "kl_div_sft": -0.12230391055345535, "learning_rate": 1.5774877650897227e-06, "loss": -0.1017, "sft_loss": 0.16727250814437866, "step": 270 }, { "epoch": 0.66, "grad_norm": 0.8379303182523353, "importance_ratio": 0.9921875, "kl_div": -0.00032729655504226685, "kl_div_neg": -0.007788390852510929, "kl_div_sft": 0.007133797742426395, "learning_rate": 1.5758564437194127e-06, "loss": 0.0727, "ppo_loss": 0.9922418594360352, "sft_loss": 0.05602758377790451, "step": 271 }, { "epoch": 0.66, "grad_norm": 9.583772422572244, "importance_ratio": 0.8359375, "kl_div": -0.19486042857170105, "kl_div_neg": -0.3538120687007904, "kl_div_pos": -0.0359087735414505, "learning_rate": 1.5742251223491027e-06, "loss": -0.0354, "ppo_loss": -0.08236417174339294, "step": 272 }, { "epoch": 0.66, "grad_norm": 2.3751083148550722, "importance_ratio": 0.9921875, "kl_div": -0.0045613134279847145, "kl_div_pos": -0.008577365428209305, "kl_div_sft": -0.0005452617770060897, "learning_rate": 1.5725938009787927e-06, "loss": -0.1139, "ppo_loss": -0.991459310054779, "sft_loss": 0.038445647805929184, "step": 273 }, { "epoch": 0.66, "grad_norm": 2.4028228530134856, "importance_ratio": 0.98046875, "kl_div": -0.009789850562810898, "kl_div_pos": -0.020217105746269226, "kl_div_sft": 0.0006374044460244477, "learning_rate": 1.5709624796084829e-06, "loss": 0.0563, "ppo_loss": -0.9799858927726746, "sft_loss": 0.0919925719499588, "step": 274 }, { "epoch": 0.67, "grad_norm": 1.0608938643593155, "kl_div": -0.013894759118556976, "kl_div_sft": -0.013894759118556976, "learning_rate": 1.5693311582381726e-06, "loss": -0.1247, "sft_loss": 0.09050247073173523, "step": 275 }, { "epoch": 0.67, "grad_norm": 3.589018651140568, "importance_ratio": 0.82421875, "kl_div": -0.09773320704698563, "kl_div_neg": -0.19317105412483215, "kl_div_sft": -0.0022953650914132595, "learning_rate": 1.5676998368678628e-06, "loss": -0.0331, "ppo_loss": 0.8243409991264343, "sft_loss": 0.07633786648511887, "step": 276 }, { "epoch": 0.67, "grad_norm": 8.14206347505569, "kl_div": 1.1440544767538086e-05, "kl_div_sft": 1.1440544767538086e-05, "learning_rate": 1.566068515497553e-06, "loss": 0.0461, "sft_loss": 0.10166864097118378, "step": 277 }, { "epoch": 0.67, "grad_norm": 4.270036700624787, "importance_ratio": 0.9140625, "kl_div": -0.09008216857910156, "kl_div_neg": -0.09008216857910156, "learning_rate": 1.564437194127243e-06, "loss": 0.0274, "ppo_loss": 0.9167443513870239, "step": 278 }, { "epoch": 0.68, "grad_norm": 5.246673620107099, "kl_div": -0.060400474816560745, "kl_div_sft": -0.060400474816560745, "learning_rate": 1.562805872756933e-06, "loss": -0.0454, "sft_loss": 0.12175711244344711, "step": 279 }, { "epoch": 0.68, "grad_norm": 3.953524482656069, "importance_ratio": 0.859375, "kl_div": -0.1578972488641739, "kl_div_pos": -0.1578972488641739, "learning_rate": 1.561174551386623e-06, "loss": -0.1069, "ppo_loss": -0.8590763807296753, "step": 280 }, { "epoch": 0.68, "grad_norm": 3.9506407157731407, "importance_ratio": 0.9765625, "kl_div": -0.0060625518672168255, "kl_div_pos": -0.02237924560904503, "kl_div_sft": 0.010254141874611378, "learning_rate": 1.5595432300163132e-06, "loss": -0.0587, "ppo_loss": -0.9778693318367004, "sft_loss": 0.05958448350429535, "step": 281 }, { "epoch": 0.68, "grad_norm": 3.7460476710376396, "importance_ratio": 0.8359375, "kl_div": -0.11085577309131622, "kl_div_neg": -0.17810475826263428, "kl_div_sft": -0.04360678046941757, "learning_rate": 1.5579119086460032e-06, "loss": -0.0386, "ppo_loss": 0.8368546962738037, "sft_loss": 0.12636341154575348, "step": 282 }, { "epoch": 0.69, "grad_norm": 4.745895620221447, "importance_ratio": 0.96484375, "kl_div": -0.029229873791337013, "kl_div_pos": -0.0365082323551178, "kl_div_sft": -0.02195151522755623, "learning_rate": 1.5562805872756932e-06, "loss": -0.0572, "ppo_loss": -0.9641501903533936, "sft_loss": 0.05703162029385567, "step": 283 }, { "epoch": 0.69, "grad_norm": 5.993764498218747, "kl_div": -0.007489209994673729, "kl_div_sft": -0.007489209994673729, "learning_rate": 1.5546492659053833e-06, "loss": -0.0634, "sft_loss": 0.07352593541145325, "step": 284 }, { "epoch": 0.69, "grad_norm": 0.9322771820704878, "kl_div": -0.03013739548623562, "kl_div_sft": -0.03013739548623562, "learning_rate": 1.5530179445350733e-06, "loss": 0.056, "sft_loss": 0.08940079063177109, "step": 285 }, { "epoch": 0.69, "grad_norm": 0.992517671692001, "kl_div": 0.0077654337510466576, "kl_div_sft": 0.0077654337510466576, "learning_rate": 1.5513866231647635e-06, "loss": -0.0487, "sft_loss": 0.030625823885202408, "step": 286 }, { "epoch": 0.7, "grad_norm": 3.708157881683071, "importance_ratio": 0.9921875, "kl_div": -0.11875781416893005, "kl_div_neg": -0.007041546981781721, "kl_div_sft": -0.2304740846157074, "learning_rate": 1.5497553017944533e-06, "loss": 0.1021, "ppo_loss": 0.9929831027984619, "sft_loss": 0.27379873394966125, "step": 287 }, { "epoch": 0.7, "grad_norm": 0.9189327616143602, "kl_div": -0.010802363976836205, "kl_div_sft": -0.010802363976836205, "learning_rate": 1.5481239804241435e-06, "loss": -0.1124, "sft_loss": 0.052867598831653595, "step": 288 }, { "epoch": 0.7, "grad_norm": 3.388275598227071, "kl_div": -0.012758041732013226, "kl_div_sft": -0.012758041732013226, "learning_rate": 1.5464926590538337e-06, "loss": -0.2157, "sft_loss": 0.07837103307247162, "step": 289 }, { "epoch": 0.7, "grad_norm": 3.671059765509262, "kl_div": -0.009600481018424034, "kl_div_sft": -0.009600481018424034, "learning_rate": 1.5448613376835235e-06, "loss": -0.0148, "sft_loss": 0.08664091676473618, "step": 290 }, { "epoch": 0.71, "grad_norm": 1.0037582822769144, "kl_div": -0.021426748484373093, "kl_div_sft": -0.021426748484373093, "learning_rate": 1.5432300163132137e-06, "loss": 0.1021, "sft_loss": 0.10049542784690857, "step": 291 }, { "epoch": 0.71, "grad_norm": 3.9444987751497913, "importance_ratio": 0.90625, "kl_div": -0.1015133410692215, "kl_div_neg": -0.15428906679153442, "kl_div_pos": -0.04873760789632797, "learning_rate": 1.5415986949429036e-06, "loss": 0.0774, "ppo_loss": -0.04770335555076599, "step": 292 }, { "epoch": 0.71, "grad_norm": 1.0185512934012841, "importance_ratio": 0.90625, "kl_div": -0.050773173570632935, "kl_div_pos": -0.09809917956590652, "kl_div_sft": -0.003447168506681919, "learning_rate": 1.5399673735725938e-06, "loss": 0.0851, "ppo_loss": -0.9065589904785156, "sft_loss": 0.07342778891324997, "step": 293 }, { "epoch": 0.71, "grad_norm": 1.24960759679412, "importance_ratio": 0.98046875, "kl_div": -0.01254759170114994, "kl_div_pos": -0.018518824130296707, "kl_div_sft": -0.0065763588063418865, "learning_rate": 1.5383360522022836e-06, "loss": 0.0704, "ppo_loss": -0.9816515445709229, "sft_loss": 0.09209147095680237, "step": 294 }, { "epoch": 0.72, "grad_norm": 3.208030257008623, "kl_div": 0.003485637716948986, "kl_div_sft": 0.003485637716948986, "learning_rate": 1.5367047308319738e-06, "loss": 0.028, "sft_loss": 0.06576119363307953, "step": 295 }, { "epoch": 0.72, "grad_norm": 0.7614542575280052, "kl_div": 0.007014347240328789, "kl_div_sft": 0.007014347240328789, "learning_rate": 1.535073409461664e-06, "loss": 0.1344, "sft_loss": 0.06254887580871582, "step": 296 }, { "epoch": 0.72, "grad_norm": 9.809169325505872, "kl_div": -0.0015400054398924112, "kl_div_sft": -0.0015400054398924112, "learning_rate": 1.533442088091354e-06, "loss": -0.0076, "sft_loss": 0.08064014464616776, "step": 297 }, { "epoch": 0.72, "grad_norm": 3.459325824170929, "kl_div": -0.007169000804424286, "kl_div_sft": -0.007169000804424286, "learning_rate": 1.531810766721044e-06, "loss": -0.0143, "sft_loss": 0.10261943191289902, "step": 298 }, { "epoch": 0.72, "grad_norm": 0.7764140564788423, "kl_div": 0.0003441378939896822, "kl_div_sft": 0.0003441378939896822, "learning_rate": 1.530179445350734e-06, "loss": -0.0727, "sft_loss": 0.08510269224643707, "step": 299 }, { "epoch": 0.73, "grad_norm": 3.553790773740822, "importance_ratio": 0.70703125, "kl_div": -0.17280974984169006, "kl_div_neg": -0.349417120218277, "kl_div_sft": 0.003797624260187149, "learning_rate": 1.5285481239804242e-06, "loss": 0.0813, "ppo_loss": 0.800000011920929, "sft_loss": 0.07960819453001022, "step": 300 }, { "epoch": 0.73, "grad_norm": 0.9986867924766581, "kl_div": -0.00027870782651007175, "kl_div_sft": -0.00027870782651007175, "learning_rate": 1.526916802610114e-06, "loss": -0.0956, "sft_loss": 0.07223731279373169, "step": 301 }, { "epoch": 0.73, "grad_norm": 1.182324928566422, "importance_ratio": 0.83203125, "kl_div": -0.19484283030033112, "kl_div_neg": -0.34265056252479553, "kl_div_pos": -0.047035101801157, "learning_rate": 1.5252854812398041e-06, "loss": 0.0757, "ppo_loss": -0.07702693343162537, "step": 302 }, { "epoch": 0.73, "grad_norm": 11.752123052463089, "importance_ratio": 0.6796875, "kl_div": -0.18974079191684723, "kl_div_neg": -0.3876355290412903, "kl_div_sft": 0.008153931237757206, "learning_rate": 1.5236541598694943e-06, "loss": -0.0932, "ppo_loss": 0.800000011920929, "sft_loss": 0.07056049257516861, "step": 303 }, { "epoch": 0.74, "grad_norm": 4.471725433642123, "importance_ratio": 0.98828125, "kl_div": -0.004336100071668625, "kl_div_neg": -0.013022080063819885, "kl_div_sft": 0.004349880386143923, "learning_rate": 1.5220228384991843e-06, "loss": -0.0391, "ppo_loss": 0.9870623350143433, "sft_loss": 0.09904748946428299, "step": 304 }, { "epoch": 0.74, "grad_norm": 11.537474236919367, "importance_ratio": 0.99609375, "kl_div": -0.005024380516260862, "kl_div_pos": -0.0030147444922477007, "kl_div_sft": -0.007034016773104668, "learning_rate": 1.5203915171288743e-06, "loss": -0.1319, "ppo_loss": -0.9969898462295532, "sft_loss": 0.04072725400328636, "step": 305 }, { "epoch": 0.74, "grad_norm": 8.069143943713918, "importance_ratio": 0.9765625, "kl_div": -0.011580190621316433, "kl_div_pos": -0.024436360225081444, "kl_div_sft": 0.001275978283956647, "learning_rate": 1.5187601957585643e-06, "loss": -0.1803, "ppo_loss": -0.9758597612380981, "sft_loss": 0.06408393383026123, "step": 306 }, { "epoch": 0.74, "grad_norm": 1.0572363113920726, "importance_ratio": 0.96484375, "kl_div": -0.015708668157458305, "kl_div_pos": -0.035194650292396545, "kl_div_sft": 0.00377731304615736, "learning_rate": 1.5171288743882545e-06, "loss": -0.0675, "ppo_loss": -0.9654175043106079, "sft_loss": 0.06628946959972382, "step": 307 }, { "epoch": 0.75, "grad_norm": 13.488066874020417, "importance_ratio": 0.96484375, "kl_div": -0.032234422862529755, "kl_div_pos": -0.03625834360718727, "kl_div_sft": -0.028210503980517387, "learning_rate": 1.5154975530179447e-06, "loss": 0.077, "ppo_loss": -0.9643911719322205, "sft_loss": 0.06895402818918228, "step": 308 }, { "epoch": 0.75, "grad_norm": 1.5976955716766723, "importance_ratio": 1.0078125, "kl_div": 0.014641315676271915, "kl_div_pos": 0.005524564068764448, "kl_div_sft": 0.023758066818118095, "learning_rate": 1.5138662316476344e-06, "loss": -0.0026, "ppo_loss": -1.005539894104004, "sft_loss": 0.07187476009130478, "step": 309 }, { "epoch": 0.75, "grad_norm": 1.0015244784703676, "kl_div": 0.00711329560726881, "kl_div_sft": 0.00711329560726881, "learning_rate": 1.5122349102773246e-06, "loss": -0.1007, "sft_loss": 0.0625566840171814, "step": 310 }, { "epoch": 0.75, "grad_norm": 3.2210326757754393, "importance_ratio": 0.921875, "kl_div": -0.04271788150072098, "kl_div_pos": -0.0804443210363388, "kl_div_sft": -0.004991442896425724, "learning_rate": 1.5106035889070146e-06, "loss": -0.1387, "ppo_loss": -0.9227062463760376, "sft_loss": 0.03155931085348129, "step": 311 }, { "epoch": 0.76, "grad_norm": 5.240578690234974, "importance_ratio": 0.984375, "kl_div": 0.00046649202704429626, "kl_div_pos": -0.016343850642442703, "kl_div_sft": 0.017276834696531296, "learning_rate": 1.5089722675367046e-06, "loss": -0.0052, "ppo_loss": -0.9837889671325684, "sft_loss": 0.043509677052497864, "step": 312 }, { "epoch": 0.76, "grad_norm": 0.8025776879565545, "importance_ratio": 0.9921875, "kl_div": -0.002305206609889865, "kl_div_pos": -0.007442329078912735, "kl_div_sft": 0.002831915859133005, "learning_rate": 1.5073409461663946e-06, "loss": 0.1136, "ppo_loss": -0.992585301399231, "sft_loss": 0.06029798090457916, "step": 313 }, { "epoch": 0.76, "grad_norm": 9.853050851588272, "importance_ratio": 0.60546875, "kl_div": -0.24869604408740997, "kl_div_pos": -0.5010824799537659, "kl_div_sft": 0.0036904041189700365, "learning_rate": 1.5057096247960848e-06, "loss": 0.0182, "ppo_loss": -0.6058744788169861, "sft_loss": 0.06293935328722, "step": 314 }, { "epoch": 0.76, "grad_norm": 1.02099609375, "importance_ratio": 0.98828125, "kl_div": -0.012375455349683762, "kl_div_pos": -0.012375455349683762, "learning_rate": 1.504078303425775e-06, "loss": -0.0341, "ppo_loss": -0.9878675937652588, "step": 315 }, { "epoch": 0.77, "grad_norm": 1.6083232813440314, "kl_div": -0.019710950553417206, "kl_div_sft": -0.019710950553417206, "learning_rate": 1.5024469820554647e-06, "loss": -0.1431, "sft_loss": 0.0729701817035675, "step": 316 }, { "epoch": 0.77, "grad_norm": 1.2108278286400305, "importance_ratio": 0.96875, "kl_div": -0.03259526938199997, "kl_div_pos": -0.03259526938199997, "learning_rate": 1.500815660685155e-06, "loss": -0.0835, "ppo_loss": -0.967930793762207, "step": 317 }, { "epoch": 0.77, "grad_norm": 1.3044009322260497, "importance_ratio": 0.96484375, "kl_div": -0.016362616792321205, "kl_div_pos": -0.03751760348677635, "kl_div_sft": 0.004792371299117804, "learning_rate": 1.499184339314845e-06, "loss": -0.0453, "ppo_loss": -0.963177502155304, "sft_loss": 0.032457318156957626, "step": 318 }, { "epoch": 0.77, "grad_norm": 4.55266629580963, "importance_ratio": 0.7265625, "kl_div": -0.3182659149169922, "kl_div_neg": -0.3182659149169922, "learning_rate": 1.4975530179445351e-06, "loss": 0.0939, "ppo_loss": 0.800000011920929, "step": 319 }, { "epoch": 0.78, "grad_norm": 1.2009390256397048, "importance_ratio": 0.97265625, "kl_div": -0.012672297656536102, "kl_div_neg": -0.027501724660396576, "kl_div_sft": 0.002157128881663084, "learning_rate": 1.4959216965742249e-06, "loss": 0.233, "ppo_loss": 0.9728729724884033, "sft_loss": 0.050059814006090164, "step": 320 }, { "epoch": 0.78, "grad_norm": 1.0228750998655343, "importance_ratio": 0.9453125, "kl_div": -0.022379038855433464, "kl_div_neg": -0.054841797798871994, "kl_div_sft": 0.010083720088005066, "learning_rate": 1.494290375203915e-06, "loss": -0.0796, "ppo_loss": 0.9466348886489868, "sft_loss": 0.018363839015364647, "step": 321 }, { "epoch": 0.78, "grad_norm": 1.4901011796174906, "importance_ratio": 0.890625, "kl_div": -0.12282276898622513, "kl_div_neg": -4.3708219891414046e-05, "kl_div_pos": -0.2456018328666687, "learning_rate": 1.4926590538336053e-06, "loss": 0.0191, "ppo_loss": 0.10886135697364807, "step": 322 }, { "epoch": 0.78, "grad_norm": 5.4440468147725865, "importance_ratio": 0.77734375, "kl_div": -0.13044001162052155, "kl_div_neg": -0.2507326006889343, "kl_div_sft": -0.010147427208721638, "learning_rate": 1.4910277324632953e-06, "loss": 0.0005, "ppo_loss": 0.800000011920929, "sft_loss": 0.10479097068309784, "step": 323 }, { "epoch": 0.79, "grad_norm": 1.1518768916577657, "importance_ratio": 0.90234375, "kl_div": -0.05070945993065834, "kl_div_pos": -0.1028522476553917, "kl_div_sft": 0.0014333281433209777, "learning_rate": 1.4893964110929853e-06, "loss": 0.0619, "ppo_loss": -0.9022603034973145, "sft_loss": 0.0915645956993103, "step": 324 }, { "epoch": 0.79, "grad_norm": 1.8697849227514465, "kl_div": 0.005619871895760298, "kl_div_sft": 0.005619871895760298, "learning_rate": 1.4877650897226752e-06, "loss": -0.026, "sft_loss": 0.05023251101374626, "step": 325 }, { "epoch": 0.79, "grad_norm": 2.931214771181061, "importance_ratio": 0.92578125, "kl_div": -0.03833425045013428, "kl_div_neg": -0.07714598625898361, "kl_div_sft": 0.0004774852131959051, "learning_rate": 1.4861337683523654e-06, "loss": 0.1182, "ppo_loss": 0.9257546663284302, "sft_loss": 0.046782199293375015, "step": 326 }, { "epoch": 0.79, "grad_norm": 4.7159573204691245, "kl_div": -0.0009512719698250294, "kl_div_sft": -0.0009512719698250294, "learning_rate": 1.4845024469820552e-06, "loss": -0.0133, "sft_loss": 0.0551016591489315, "step": 327 }, { "epoch": 0.8, "grad_norm": 4.158387287123689, "importance_ratio": 0.8203125, "kl_div": -0.10169047117233276, "kl_div_neg": -0.19945180416107178, "kl_div_sft": -0.003929144237190485, "learning_rate": 1.4828711256117454e-06, "loss": 0.0485, "ppo_loss": 0.8191797137260437, "sft_loss": 0.08998027443885803, "step": 328 }, { "epoch": 0.8, "grad_norm": 2.8205447959865824, "kl_div": 0.008606771007180214, "kl_div_sft": 0.008606771007180214, "learning_rate": 1.4812398042414356e-06, "loss": -0.0295, "sft_loss": 0.046088241040706635, "step": 329 }, { "epoch": 0.8, "grad_norm": 1.2197406361778134, "importance_ratio": 0.9296875, "kl_div": -0.033290740102529526, "kl_div_pos": -0.07345406711101532, "kl_div_sft": 0.006872584577649832, "learning_rate": 1.4796084828711256e-06, "loss": -0.0098, "ppo_loss": -0.9291788339614868, "sft_loss": 0.03272410109639168, "step": 330 }, { "epoch": 0.8, "grad_norm": 2.049644867444505, "importance_ratio": 0.96875, "kl_div": -0.03301801532506943, "kl_div_pos": -0.03301801532506943, "learning_rate": 1.4779771615008156e-06, "loss": -0.0545, "ppo_loss": -0.9675254225730896, "step": 331 }, { "epoch": 0.8, "grad_norm": 1.7787836634867733, "importance_ratio": 0.9296875, "kl_div": -0.036390505731105804, "kl_div_neg": -0.07250487059354782, "kl_div_sft": -0.0002761399664450437, "learning_rate": 1.4763458401305055e-06, "loss": -0.0752, "ppo_loss": 0.9300612211227417, "sft_loss": 0.05477307736873627, "step": 332 }, { "epoch": 0.81, "grad_norm": 1.7759055554391399, "importance_ratio": 0.94140625, "kl_div": -0.06303277611732483, "kl_div_neg": -0.11272674053907394, "kl_div_pos": -0.013338807038962841, "learning_rate": 1.4747145187601957e-06, "loss": -0.1584, "ppo_loss": -0.04667750000953674, "step": 333 }, { "epoch": 0.81, "grad_norm": 2.3862452060657304, "importance_ratio": 0.9921875, "kl_div": -0.006657070480287075, "kl_div_pos": -0.006657070480287075, "learning_rate": 1.473083197389886e-06, "loss": -0.2039, "ppo_loss": -0.993366003036499, "step": 334 }, { "epoch": 0.81, "grad_norm": 4.780790419088667, "importance_ratio": 0.953125, "kl_div": -0.0286087803542614, "kl_div_pos": -0.04897100850939751, "kl_div_sft": -0.00824655406177044, "learning_rate": 1.4714518760195757e-06, "loss": -0.0568, "ppo_loss": -0.9522087574005127, "sft_loss": 0.097450390458107, "step": 335 }, { "epoch": 0.81, "grad_norm": 2.2356525316854676, "importance_ratio": 0.9765625, "kl_div": -0.011972763575613499, "kl_div_neg": -0.02487398311495781, "kl_div_sft": 0.000928456720430404, "learning_rate": 1.469820554649266e-06, "loss": 0.0395, "ppo_loss": 0.975432813167572, "sft_loss": 0.07037409394979477, "step": 336 }, { "epoch": 0.82, "grad_norm": 3.4639200486698103, "importance_ratio": 0.9921875, "kl_div": -0.00725951325148344, "kl_div_neg": -0.008314115926623344, "kl_div_sft": -0.006204910576343536, "learning_rate": 1.4681892332789559e-06, "loss": 0.0526, "ppo_loss": 0.9917203783988953, "sft_loss": 0.06223254278302193, "step": 337 }, { "epoch": 0.82, "grad_norm": 2.7775114843095055, "importance_ratio": 1.0, "kl_div": -0.0025152729358524084, "kl_div_pos": 0.003554165828973055, "kl_div_sft": -0.008584711700677872, "learning_rate": 1.4665579119086459e-06, "loss": 0.0289, "ppo_loss": -1.0035605430603027, "sft_loss": 0.0701230838894844, "step": 338 }, { "epoch": 0.82, "grad_norm": 1.4601875869221272, "kl_div": 0.003021697048097849, "kl_div_sft": 0.003021697048097849, "learning_rate": 1.4649265905383359e-06, "loss": -0.172, "sft_loss": 0.07517191767692566, "step": 339 }, { "epoch": 0.82, "grad_norm": 1.8553365881180026, "importance_ratio": 0.79296875, "kl_div": -0.23275145888328552, "kl_div_neg": -0.26393523812294006, "kl_div_pos": -0.20156769454479218, "learning_rate": 1.463295269168026e-06, "loss": -0.1994, "ppo_loss": -0.008724093437194824, "step": 340 }, { "epoch": 0.83, "grad_norm": 1.9243318314649143, "importance_ratio": 0.9296875, "kl_div": -0.07310383766889572, "kl_div_neg": -0.07310383766889572, "learning_rate": 1.4616639477977163e-06, "loss": -0.0659, "ppo_loss": 0.9309701323509216, "step": 341 }, { "epoch": 0.83, "grad_norm": 1.956693517358818, "importance_ratio": 0.93359375, "kl_div": -0.036307819187641144, "kl_div_pos": -0.06998840719461441, "kl_div_sft": -0.002627227921038866, "learning_rate": 1.460032626427406e-06, "loss": -0.0614, "ppo_loss": -0.932404637336731, "sft_loss": 0.07751280814409256, "step": 342 }, { "epoch": 0.83, "grad_norm": 3.135826582531772, "importance_ratio": 0.96875, "kl_div": -0.03073815256357193, "kl_div_pos": -0.03073815256357193, "learning_rate": 1.4584013050570962e-06, "loss": -0.0142, "ppo_loss": -0.969732403755188, "step": 343 }, { "epoch": 0.83, "grad_norm": 5.727697824552616, "importance_ratio": 0.83203125, "kl_div": -0.08306290209293365, "kl_div_neg": -0.18256793916225433, "kl_div_sft": 0.016442136839032173, "learning_rate": 1.4567699836867862e-06, "loss": 0.108, "ppo_loss": 0.8331279754638672, "sft_loss": 0.02431655116379261, "step": 344 }, { "epoch": 0.84, "grad_norm": 1.160169402282351, "importance_ratio": 0.8359375, "kl_div": -0.08823750913143158, "kl_div_neg": -0.1810176968574524, "kl_div_sft": 0.004542672540992498, "learning_rate": 1.4551386623164764e-06, "loss": 0.1544, "ppo_loss": 0.8344205617904663, "sft_loss": 0.043373603373765945, "step": 345 }, { "epoch": 0.84, "grad_norm": 1.4447440189278034, "importance_ratio": 0.96484375, "kl_div": -0.015825355425477028, "kl_div_neg": -0.03718387335538864, "kl_div_sft": 0.005533162504434586, "learning_rate": 1.4535073409461662e-06, "loss": -0.0611, "ppo_loss": 0.9634990096092224, "sft_loss": 0.0831359475851059, "step": 346 }, { "epoch": 0.84, "grad_norm": 3.0823128188476296, "importance_ratio": 0.98046875, "kl_div": -0.014321206137537956, "kl_div_neg": -0.02046828903257847, "kl_div_sft": -0.008174123242497444, "learning_rate": 1.4518760195758564e-06, "loss": 0.0207, "ppo_loss": 0.9797397255897522, "sft_loss": 0.12043432146310806, "step": 347 }, { "epoch": 0.84, "grad_norm": 6.68119215052569, "importance_ratio": 0.8671875, "kl_div": -0.15497435629367828, "kl_div_neg": -0.30931636691093445, "kl_div_pos": -0.0006323597626760602, "learning_rate": 1.4502446982055466e-06, "loss": -0.0633, "ppo_loss": -0.09968394041061401, "step": 348 }, { "epoch": 0.85, "grad_norm": 1.6288848802367726, "importance_ratio": 0.9609375, "kl_div": -0.019876418635249138, "kl_div_pos": -0.039128370583057404, "kl_div_sft": -0.0006244677351787686, "learning_rate": 1.4486133768352363e-06, "loss": -0.11, "ppo_loss": -0.9616272449493408, "sft_loss": 0.03771361708641052, "step": 349 }, { "epoch": 0.85, "grad_norm": 1.7350734644570496, "importance_ratio": 1.0078125, "kl_div": 0.006748631596565247, "kl_div_pos": 0.009035097435116768, "kl_div_sft": 0.004462166223675013, "learning_rate": 1.4469820554649265e-06, "loss": -0.1506, "ppo_loss": -1.0090761184692383, "sft_loss": 0.041371047496795654, "step": 350 }, { "epoch": 0.85, "grad_norm": 1.6053564027705443, "kl_div": -0.07098940759897232, "kl_div_sft": -0.07098940759897232, "learning_rate": 1.4453507340946165e-06, "loss": -0.1262, "sft_loss": 0.13393214344978333, "step": 351 }, { "epoch": 0.85, "grad_norm": 0.9634725116278806, "kl_div": 0.0022934931330382824, "kl_div_sft": 0.0022934931330382824, "learning_rate": 1.4437194127243067e-06, "loss": 0.0054, "sft_loss": 0.0617811381816864, "step": 352 }, { "epoch": 0.86, "grad_norm": 0.9781766445188108, "importance_ratio": 0.97265625, "kl_div": -0.05350446328520775, "kl_div_pos": -0.026010213419795036, "kl_div_sft": -0.08099871128797531, "learning_rate": 1.4420880913539967e-06, "loss": 0.0701, "ppo_loss": -0.9743251204490662, "sft_loss": 0.16814443469047546, "step": 353 }, { "epoch": 0.86, "grad_norm": 0.9834217724119791, "importance_ratio": 0.97265625, "kl_div": -0.01595260016620159, "kl_div_pos": -0.02615179680287838, "kl_div_sft": -0.005753403063863516, "learning_rate": 1.4404567699836867e-06, "loss": 0.0254, "ppo_loss": -0.9741871356964111, "sft_loss": 0.06098558008670807, "step": 354 }, { "epoch": 0.86, "grad_norm": 2.190498367981171, "importance_ratio": 0.8359375, "kl_div": -0.089511938393116, "kl_div_neg": -0.176958367228508, "kl_div_sft": -0.0020655107218772173, "learning_rate": 1.4388254486133769e-06, "loss": -0.0161, "ppo_loss": 0.8378146290779114, "sft_loss": 0.12363357841968536, "step": 355 }, { "epoch": 0.86, "grad_norm": 0.853924616191224, "kl_div": 0.008525339886546135, "kl_div_sft": 0.008525339886546135, "learning_rate": 1.4371941272430669e-06, "loss": 0.0255, "sft_loss": 0.08300769329071045, "step": 356 }, { "epoch": 0.87, "grad_norm": 1.416110415738903, "importance_ratio": 0.984375, "kl_div": -0.008234689943492413, "kl_div_neg": -0.01620335318148136, "kl_div_sft": -0.0002660272584762424, "learning_rate": 1.4355628058727568e-06, "loss": 0.0244, "ppo_loss": 0.9839272499084473, "sft_loss": 0.05065668746829033, "step": 357 }, { "epoch": 0.87, "grad_norm": 1.1595674298523522, "importance_ratio": 0.7734375, "kl_div": -0.13369768857955933, "kl_div_neg": -0.25916311144828796, "kl_div_sft": -0.00823226198554039, "learning_rate": 1.4339314845024468e-06, "loss": -0.0392, "ppo_loss": 0.800000011920929, "sft_loss": 0.08650946617126465, "step": 358 }, { "epoch": 0.87, "grad_norm": 1.7837342456317047, "kl_div": 0.008224120363593102, "kl_div_sft": 0.008224120363593102, "learning_rate": 1.432300163132137e-06, "loss": 0.0372, "sft_loss": 0.09266399592161179, "step": 359 }, { "epoch": 0.87, "grad_norm": 0.949493870679729, "importance_ratio": 1.0078125, "kl_div": -0.0019446432124823332, "kl_div_pos": 0.006772224325686693, "kl_div_sft": -0.01066151075065136, "learning_rate": 1.4306688417618272e-06, "loss": 0.0486, "ppo_loss": -1.0067951679229736, "sft_loss": 0.10154891014099121, "step": 360 }, { "epoch": 0.88, "grad_norm": 1.4625758811632306, "importance_ratio": 0.9921875, "kl_div": -0.010378789156675339, "kl_div_pos": -0.008084448054432869, "kl_div_sft": -0.012673130258917809, "learning_rate": 1.429037520391517e-06, "loss": -0.0158, "ppo_loss": -0.991948127746582, "sft_loss": 0.08620632439851761, "step": 361 }, { "epoch": 0.88, "grad_norm": 1.2196710969455766, "importance_ratio": 0.796875, "kl_div": -0.22868932783603668, "kl_div_neg": -0.22868932783603668, "learning_rate": 1.4274061990212072e-06, "loss": -0.0005, "ppo_loss": 0.8147095441818237, "step": 362 }, { "epoch": 0.88, "grad_norm": 4.837035810885735, "importance_ratio": 0.96875, "kl_div": -0.005435936152935028, "kl_div_pos": -0.03251469507813454, "kl_div_sft": 0.02164282277226448, "learning_rate": 1.4257748776508972e-06, "loss": -0.2686, "ppo_loss": -0.9680081605911255, "sft_loss": 0.076421357691288, "step": 363 }, { "epoch": 0.88, "grad_norm": 1.3897824252978406, "importance_ratio": 0.74609375, "kl_div": -0.29730668663978577, "kl_div_neg": -0.29730668663978577, "learning_rate": 1.4241435562805872e-06, "loss": 0.011, "ppo_loss": 0.800000011920929, "step": 364 }, { "epoch": 0.88, "grad_norm": 1.8646535877937016, "importance_ratio": 0.984375, "kl_div": -0.015370756387710571, "kl_div_pos": -0.015370756387710571, "learning_rate": 1.4225122349102771e-06, "loss": 0.0536, "ppo_loss": -0.9847500324249268, "step": 365 }, { "epoch": 0.89, "grad_norm": 2.2570258753505583, "importance_ratio": 0.80078125, "kl_div": -0.23754560947418213, "kl_div_pos": -0.22059480845928192, "kl_div_sft": -0.25449639558792114, "learning_rate": 1.4208809135399673e-06, "loss": 0.2512, "ppo_loss": -0.8020416498184204, "sft_loss": 0.29686230421066284, "step": 366 }, { "epoch": 0.89, "grad_norm": 3.6046846638615846, "importance_ratio": 0.94921875, "kl_div": -0.01577794924378395, "kl_div_pos": -0.05074576660990715, "kl_div_sft": 0.019189869984984398, "learning_rate": 1.4192495921696575e-06, "loss": -0.1423, "ppo_loss": -0.9505202770233154, "sft_loss": 0.0688878744840622, "step": 367 }, { "epoch": 0.89, "grad_norm": 3.187210369041152, "importance_ratio": 0.9921875, "kl_div": -0.008837157860398293, "kl_div_pos": -0.006236175075173378, "kl_div_sft": -0.011438139714300632, "learning_rate": 1.4176182707993473e-06, "loss": -0.1118, "ppo_loss": -0.993783175945282, "sft_loss": 0.06663285195827484, "step": 368 }, { "epoch": 0.89, "grad_norm": 2.231233001558751, "importance_ratio": 0.89453125, "kl_div": -0.0557381734251976, "kl_div_neg": -0.10951899737119675, "kl_div_sft": -0.0019573522731661797, "learning_rate": 1.4159869494290375e-06, "loss": -0.1313, "ppo_loss": 0.8962652087211609, "sft_loss": 0.07832780480384827, "step": 369 }, { "epoch": 0.9, "grad_norm": 1.2428612471342046, "importance_ratio": 0.953125, "kl_div": -0.019023537635803223, "kl_div_pos": -0.04916612431406975, "kl_div_sft": 0.011119049973785877, "learning_rate": 1.4143556280587275e-06, "loss": 0.0106, "ppo_loss": -0.952022910118103, "sft_loss": 0.05877070873975754, "step": 370 }, { "epoch": 0.9, "grad_norm": 12.252440540330564, "kl_div": 0.0018844157457351685, "kl_div_sft": 0.0018844157457351685, "learning_rate": 1.4127243066884177e-06, "loss": 0.0404, "sft_loss": 0.03680435195565224, "step": 371 }, { "epoch": 0.9, "grad_norm": 1.581418896125047, "importance_ratio": 0.9921875, "kl_div": -0.0039640204049646854, "kl_div_pos": -0.008107632398605347, "kl_div_sft": 0.0001795917487470433, "learning_rate": 1.4110929853181075e-06, "loss": -0.0892, "ppo_loss": -0.9919251799583435, "sft_loss": 0.045454829931259155, "step": 372 }, { "epoch": 0.9, "grad_norm": 1.1601368809377202, "importance_ratio": 0.77734375, "kl_div": -0.12817691266536713, "kl_div_neg": -0.25409457087516785, "kl_div_sft": -0.002259261906147003, "learning_rate": 1.4094616639477976e-06, "loss": 0.0758, "ppo_loss": 0.800000011920929, "sft_loss": 0.09486612677574158, "step": 373 }, { "epoch": 0.91, "grad_norm": 1.3684898930581635, "importance_ratio": 0.81640625, "kl_div": -0.20381605625152588, "kl_div_neg": -0.2413814812898636, "kl_div_pos": -0.16625064611434937, "learning_rate": 1.4078303425774878e-06, "loss": 0.0803, "ppo_loss": -0.023416996002197266, "step": 374 }, { "epoch": 0.91, "grad_norm": 1.258158807883002, "importance_ratio": 0.9921875, "kl_div": -0.010114632546901703, "kl_div_pos": -0.010114632546901703, "learning_rate": 1.4061990212071776e-06, "loss": -0.1264, "ppo_loss": -0.9899537563323975, "step": 375 }, { "epoch": 0.91, "grad_norm": 0.9657481661272326, "kl_div": 0.006769304163753986, "kl_div_sft": 0.006769304163753986, "learning_rate": 1.4045676998368678e-06, "loss": -0.1026, "sft_loss": 0.03854276239871979, "step": 376 }, { "epoch": 0.91, "grad_norm": 2.465672566737089, "importance_ratio": 1.0, "kl_div": 0.005747776944190264, "kl_div_pos": -0.0016325340839102864, "kl_div_sft": 0.013128087855875492, "learning_rate": 1.4029363784665578e-06, "loss": -0.0012, "ppo_loss": -0.9983687996864319, "sft_loss": 0.05518368259072304, "step": 377 }, { "epoch": 0.92, "grad_norm": 1.1353959189569596, "importance_ratio": 0.8046875, "kl_div": -0.2200791984796524, "kl_div_neg": -0.2200791984796524, "learning_rate": 1.401305057096248e-06, "loss": -0.0909, "ppo_loss": 0.8086894750595093, "step": 378 }, { "epoch": 0.92, "grad_norm": 9.269111067567923, "importance_ratio": 0.828125, "kl_div": -0.20169486105442047, "kl_div_neg": -0.3609882593154907, "kl_div_pos": -0.04240145534276962, "learning_rate": 1.399673735725938e-06, "loss": 0.0275, "ppo_loss": -0.07924246788024902, "step": 379 }, { "epoch": 0.92, "grad_norm": 1.125246074043346, "kl_div": -0.07262912392616272, "kl_div_sft": -0.07262912392616272, "learning_rate": 1.398042414355628e-06, "loss": -0.0746, "sft_loss": 0.11740092933177948, "step": 380 }, { "epoch": 0.92, "grad_norm": 1.1147266307707406, "kl_div": -0.002457402413710952, "kl_div_sft": -0.002457402413710952, "learning_rate": 1.3964110929853182e-06, "loss": -0.0434, "sft_loss": 0.04604826122522354, "step": 381 }, { "epoch": 0.93, "grad_norm": 3.2083150359453554, "importance_ratio": 0.9609375, "kl_div": -0.015367057174444199, "kl_div_pos": -0.04033532366156578, "kl_div_sft": 0.009601208381354809, "learning_rate": 1.3947797716150081e-06, "loss": -0.001, "ppo_loss": -0.9604672789573669, "sft_loss": 0.020802391692996025, "step": 382 }, { "epoch": 0.93, "grad_norm": 1.150521806837677, "importance_ratio": 0.75390625, "kl_div": -0.28163760900497437, "kl_div_neg": -0.28163760900497437, "learning_rate": 1.3931484502446981e-06, "loss": 0.0511, "ppo_loss": 0.800000011920929, "step": 383 }, { "epoch": 0.93, "grad_norm": 0.8816599791789368, "kl_div": -4.7477660700678825e-06, "kl_div_sft": -4.7477660700678825e-06, "learning_rate": 1.3915171288743881e-06, "loss": 0.0797, "sft_loss": 0.07828231900930405, "step": 384 }, { "epoch": 0.93, "grad_norm": 1.8813316251466288, "kl_div": -0.03393517807126045, "kl_div_sft": -0.03393517807126045, "learning_rate": 1.3898858075040783e-06, "loss": -0.0095, "sft_loss": 0.12192416191101074, "step": 385 }, { "epoch": 0.94, "grad_norm": 1.2005491549887957, "kl_div": -0.0066083818674087524, "kl_div_sft": -0.0066083818674087524, "learning_rate": 1.3882544861337683e-06, "loss": -0.0687, "sft_loss": 0.050567544996738434, "step": 386 }, { "epoch": 0.94, "grad_norm": 5.512368772852106, "importance_ratio": 0.9453125, "kl_div": -0.03097151219844818, "kl_div_neg": -0.05789247900247574, "kl_div_sft": -0.004050543997436762, "learning_rate": 1.3866231647634583e-06, "loss": -0.1663, "ppo_loss": 0.9437513947486877, "sft_loss": 0.08977137506008148, "step": 387 }, { "epoch": 0.94, "grad_norm": 4.165481525992378, "importance_ratio": 0.98828125, "kl_div": -0.009134024381637573, "kl_div_neg": -0.013471991755068302, "kl_div_sft": -0.004796057473868132, "learning_rate": 1.3849918433931485e-06, "loss": 0.0507, "ppo_loss": 0.9866183400154114, "sft_loss": 0.047184206545352936, "step": 388 }, { "epoch": 0.94, "grad_norm": 2.342263220643977, "kl_div": -0.0007208693423308432, "kl_div_sft": -0.0007208693423308432, "learning_rate": 1.3833605220228385e-06, "loss": -0.0463, "sft_loss": 0.02857479453086853, "step": 389 }, { "epoch": 0.95, "grad_norm": 1.3386212435370513, "kl_div": 0.00022949720732867718, "kl_div_sft": 0.00022949720732867718, "learning_rate": 1.3817292006525284e-06, "loss": -0.1283, "sft_loss": 0.06676965206861496, "step": 390 }, { "epoch": 0.95, "grad_norm": 1.335906803603844, "kl_div": 0.006292227655649185, "kl_div_sft": 0.006292227655649185, "learning_rate": 1.3800978792822184e-06, "loss": -0.0428, "sft_loss": 0.07561105489730835, "step": 391 }, { "epoch": 0.95, "grad_norm": 1.0108704416477208, "importance_ratio": 0.7890625, "kl_div": -0.23826153576374054, "kl_div_neg": -0.23826153576374054, "learning_rate": 1.3784665579119086e-06, "loss": -0.0507, "ppo_loss": 0.811415433883667, "step": 392 }, { "epoch": 0.95, "grad_norm": 2.175136614488159, "importance_ratio": 0.98828125, "kl_div": -0.001840903889387846, "kl_div_pos": -0.01280925888568163, "kl_div_sft": 0.009127451106905937, "learning_rate": 1.3768352365415988e-06, "loss": -0.0873, "ppo_loss": -0.9872723817825317, "sft_loss": 0.13365277647972107, "step": 393 }, { "epoch": 0.96, "grad_norm": 2.628608312629397, "importance_ratio": 0.859375, "kl_div": -0.0726788267493248, "kl_div_neg": -0.15091568231582642, "kl_div_sft": 0.005558023229241371, "learning_rate": 1.3752039151712886e-06, "loss": -0.004, "ppo_loss": 0.8599202632904053, "sft_loss": 0.078884057700634, "step": 394 }, { "epoch": 0.96, "grad_norm": 0.8525592325278135, "importance_ratio": 0.8671875, "kl_div": -0.152493417263031, "kl_div_neg": -0.152493417263031, "learning_rate": 1.3735725938009788e-06, "loss": 0.0276, "ppo_loss": 0.8968149423599243, "step": 395 }, { "epoch": 0.96, "grad_norm": 1.293563659811949, "importance_ratio": 0.875, "kl_div": -0.14297887682914734, "kl_div_neg": -0.27100446820259094, "kl_div_pos": -0.014953281730413437, "learning_rate": 1.3719412724306688e-06, "loss": -0.0321, "ppo_loss": -0.09257897734642029, "step": 396 }, { "epoch": 0.96, "grad_norm": 3.674110154354462, "importance_ratio": 1.0, "kl_div": -0.004077698569744825, "kl_div_pos": 0.0026960691902786493, "kl_div_sft": -0.010851466096937656, "learning_rate": 1.3703099510603587e-06, "loss": 0.0206, "ppo_loss": -1.0026997327804565, "sft_loss": 0.07455823570489883, "step": 397 }, { "epoch": 0.96, "grad_norm": 2.217661630960872, "importance_ratio": 0.73828125, "kl_div": -0.14972834289073944, "kl_div_neg": -0.3059646189212799, "kl_div_sft": 0.006507941521704197, "learning_rate": 1.3686786296900487e-06, "loss": 0.0537, "ppo_loss": 0.800000011920929, "sft_loss": 0.017921945080161095, "step": 398 }, { "epoch": 0.97, "grad_norm": 4.9095395474020656, "importance_ratio": 1.0078125, "kl_div": -0.04424430802464485, "kl_div_pos": 0.007046323735266924, "kl_div_sft": -0.09553494304418564, "learning_rate": 1.367047308319739e-06, "loss": -0.1444, "ppo_loss": -1.0070712566375732, "sft_loss": 0.14671209454536438, "step": 399 }, { "epoch": 0.97, "grad_norm": 0.9939955809780883, "importance_ratio": 0.83984375, "kl_div": -0.18479210138320923, "kl_div_neg": -0.18479210138320923, "learning_rate": 1.3654159869494291e-06, "loss": -0.0179, "ppo_loss": 0.8847052454948425, "step": 400 }, { "epoch": 0.97, "grad_norm": 3.032307391233481, "kl_div": -0.008666617795825005, "kl_div_sft": -0.008666617795825005, "learning_rate": 1.363784665579119e-06, "loss": 0.1489, "sft_loss": 0.06816105544567108, "step": 401 }, { "epoch": 0.97, "grad_norm": 0.9823549221271792, "importance_ratio": 0.96484375, "kl_div": -0.02337481454014778, "kl_div_pos": -0.03435087203979492, "kl_div_sft": -0.012398757971823215, "learning_rate": 1.362153344208809e-06, "loss": -0.0555, "ppo_loss": -0.966232419013977, "sft_loss": 0.07946991920471191, "step": 402 }, { "epoch": 0.98, "grad_norm": 1.0897954020833909, "importance_ratio": 0.96875, "kl_div": -0.011266342364251614, "kl_div_pos": -0.03210162743926048, "kl_div_sft": 0.009568942710757256, "learning_rate": 1.360522022838499e-06, "loss": 0.0457, "ppo_loss": -0.9684081673622131, "sft_loss": 0.04152145981788635, "step": 403 }, { "epoch": 0.98, "grad_norm": 0.8808380936380005, "importance_ratio": 0.8203125, "kl_div": -0.10881634056568146, "kl_div_neg": -0.1977686733007431, "kl_div_sft": -0.019864002242684364, "learning_rate": 1.3588907014681893e-06, "loss": -0.1877, "ppo_loss": 0.8205596804618835, "sft_loss": 0.08723907917737961, "step": 404 }, { "epoch": 0.98, "grad_norm": 1.8881065051054673, "importance_ratio": 0.9453125, "kl_div": -0.0581141859292984, "kl_div_pos": -0.0581141859292984, "learning_rate": 1.3572593800978793e-06, "loss": 0.0755, "ppo_loss": -0.9441255927085876, "step": 405 }, { "epoch": 0.98, "grad_norm": 1.1999381327575513, "importance_ratio": 0.69140625, "kl_div": -0.1756732314825058, "kl_div_neg": -0.36646994948387146, "kl_div_sft": 0.015123496763408184, "learning_rate": 1.3556280587275692e-06, "loss": -0.0247, "ppo_loss": 0.800000011920929, "sft_loss": 0.07821010053157806, "step": 406 }, { "epoch": 0.99, "grad_norm": 3.3795522436129866, "importance_ratio": 0.9609375, "kl_div": -0.020731983706355095, "kl_div_pos": -0.03792019933462143, "kl_div_sft": -0.003543769009411335, "learning_rate": 1.3539967373572594e-06, "loss": -0.084, "ppo_loss": -0.96278977394104, "sft_loss": 0.045178286731243134, "step": 407 }, { "epoch": 0.99, "grad_norm": 1.1472833244353395, "kl_div": 0.004671051632612944, "kl_div_sft": 0.004671051632612944, "learning_rate": 1.3523654159869492e-06, "loss": -0.1033, "sft_loss": 0.044013626873493195, "step": 408 }, { "epoch": 0.99, "grad_norm": 0.998406093622832, "kl_div": 0.000133444438688457, "kl_div_sft": 0.000133444438688457, "learning_rate": 1.3507340946166394e-06, "loss": 0.0161, "sft_loss": 0.04665825143456459, "step": 409 }, { "epoch": 0.99, "grad_norm": 0.8757673713339846, "importance_ratio": 0.98046875, "kl_div": -0.010525347664952278, "kl_div_pos": -0.02144579030573368, "kl_div_sft": 0.000395095266867429, "learning_rate": 1.3491027732463294e-06, "loss": -0.1966, "ppo_loss": -0.978782594203949, "sft_loss": 0.04445347562432289, "step": 410 }, { "epoch": 1.0, "grad_norm": 0.8917975487123769, "importance_ratio": 0.79296875, "kl_div": -0.10984192788600922, "kl_div_neg": -0.23029771447181702, "kl_div_sft": 0.010613863356411457, "learning_rate": 1.3474714518760196e-06, "loss": 0.0826, "ppo_loss": 0.800000011920929, "sft_loss": 0.03369507938623428, "step": 411 }, { "epoch": 1.0, "grad_norm": 0.985083975625179, "importance_ratio": 1.015625, "kl_div": 0.0019731326028704643, "kl_div_pos": 0.015019123442471027, "kl_div_sft": -0.011072858236730099, "learning_rate": 1.3458401305057096e-06, "loss": 0.0667, "ppo_loss": -1.0151324272155762, "sft_loss": 0.11207890510559082, "step": 412 }, { "epoch": 1.0, "grad_norm": 2.4819049191887, "importance_ratio": 0.80859375, "kl_div": -0.12128035724163055, "kl_div_neg": -0.21045958995819092, "kl_div_sft": -0.03210112079977989, "learning_rate": 1.3442088091353996e-06, "loss": -0.0648, "ppo_loss": 0.8102118372917175, "sft_loss": 0.06167708337306976, "step": 413 }, { "epoch": 1.0, "grad_norm": 2.7244534355576664, "importance_ratio": 0.7578125, "kl_div": -0.28326472640037537, "kl_div_neg": -0.4056023061275482, "kl_div_pos": -0.1609271615743637, "learning_rate": 1.3425774877650897e-06, "loss": -0.0573, "ppo_loss": -0.02567705512046814, "step": 414 }, { "epoch": 1.01, "grad_norm": 1.059758914036212, "importance_ratio": 0.8125, "kl_div": -0.12452328205108643, "kl_div_neg": -0.2088662087917328, "kl_div_sft": -0.04018034785985947, "learning_rate": 1.3409461663947797e-06, "loss": -0.1405, "ppo_loss": 0.8115037679672241, "sft_loss": 0.11398042738437653, "step": 415 }, { "epoch": 1.01, "grad_norm": 1.0173488365940238, "kl_div": 0.010434374213218689, "kl_div_sft": 0.010434374213218689, "learning_rate": 1.3393148450244697e-06, "loss": 0.0197, "sft_loss": 0.02324233204126358, "step": 416 }, { "epoch": 1.01, "grad_norm": 1.5338649609050454, "importance_ratio": 1.0625, "kl_div": 0.03886265680193901, "kl_div_pos": 0.0616186261177063, "kl_div_sft": 0.016106685623526573, "learning_rate": 1.3376835236541597e-06, "loss": -0.0194, "ppo_loss": -1.0635566711425781, "sft_loss": 0.0324275866150856, "step": 417 }, { "epoch": 1.01, "grad_norm": 0.8521199632775311, "importance_ratio": 0.90625, "kl_div": -0.044288281351327896, "kl_div_neg": -0.10058998316526413, "kl_div_sft": 0.012013423256576061, "learning_rate": 1.33605220228385e-06, "loss": 0.047, "ppo_loss": 0.9043037295341492, "sft_loss": 0.10559868067502975, "step": 418 }, { "epoch": 1.02, "grad_norm": 6.040839127614859, "importance_ratio": 0.90625, "kl_div": -0.10411019623279572, "kl_div_neg": -0.22670073807239532, "kl_div_pos": 0.018480347469449043, "learning_rate": 1.33442088091354e-06, "loss": 0.0881, "ppo_loss": -0.10932603478431702, "step": 419 }, { "epoch": 1.02, "grad_norm": 0.941359079532538, "importance_ratio": 0.86328125, "kl_div": -0.06987014412879944, "kl_div_neg": -0.14838054776191711, "kl_div_sft": 0.00864026416093111, "learning_rate": 1.3327895595432299e-06, "loss": 0.0411, "ppo_loss": 0.8621029853820801, "sft_loss": 0.03195761516690254, "step": 420 }, { "epoch": 1.02, "grad_norm": 1.4519205331415717, "importance_ratio": 0.90625, "kl_div": -0.1107395812869072, "kl_div_neg": -0.2647498846054077, "kl_div_pos": 0.04327072575688362, "learning_rate": 1.33115823817292e-06, "loss": -0.0077, "ppo_loss": -0.1221102774143219, "step": 421 }, { "epoch": 1.02, "grad_norm": 0.6827252784033078, "importance_ratio": 0.76171875, "kl_div": -0.27574872970581055, "kl_div_neg": -0.35648322105407715, "kl_div_pos": -0.19501426815986633, "learning_rate": 1.32952691680261e-06, "loss": -0.1515, "ppo_loss": -0.011411458253860474, "step": 422 }, { "epoch": 1.03, "grad_norm": 3.1655908731502045, "kl_div": -0.0026373867876827717, "kl_div_sft": -0.0026373867876827717, "learning_rate": 1.3278955954323e-06, "loss": -0.0047, "sft_loss": 0.05154386907815933, "step": 423 }, { "epoch": 1.03, "grad_norm": 1.1272914120902908, "kl_div": 0.006313965655863285, "kl_div_sft": 0.006313965655863285, "learning_rate": 1.3262642740619902e-06, "loss": -0.0953, "sft_loss": 0.06390878558158875, "step": 424 }, { "epoch": 1.03, "grad_norm": 0.9623445273828378, "importance_ratio": 0.99609375, "kl_div": 0.0017089867033064365, "kl_div_pos": -0.0037163347005844116, "kl_div_sft": 0.007134308107197285, "learning_rate": 1.3246329526916802e-06, "loss": -0.1132, "ppo_loss": -0.9962905645370483, "sft_loss": 0.05456602945923805, "step": 425 }, { "epoch": 1.03, "grad_norm": 0.8058896020720482, "importance_ratio": 0.75, "kl_div": -0.2867456078529358, "kl_div_neg": -0.2867456078529358, "learning_rate": 1.3230016313213704e-06, "loss": -0.0598, "ppo_loss": 0.800000011920929, "step": 426 }, { "epoch": 1.04, "grad_norm": 1.1770250705489431, "importance_ratio": 0.89453125, "kl_div": -0.1184467077255249, "kl_div_neg": -0.2329927533864975, "kl_div_pos": -0.0039006578736007214, "learning_rate": 1.3213703099510602e-06, "loss": -0.0331, "ppo_loss": -0.0980534553527832, "step": 427 }, { "epoch": 1.04, "grad_norm": 1.0871435918710617, "importance_ratio": 0.578125, "kl_div": -0.27000996470451355, "kl_div_neg": -0.5461243987083435, "kl_div_sft": 0.006104460451751947, "learning_rate": 1.3197389885807504e-06, "loss": 0.0703, "ppo_loss": 0.800000011920929, "sft_loss": 0.0648282840847969, "step": 428 }, { "epoch": 1.04, "grad_norm": 1.8659609672618034, "importance_ratio": 1.015625, "kl_div": 0.01871345564723015, "kl_div_pos": 0.012792662717401981, "kl_div_sft": 0.02463424950838089, "learning_rate": 1.3181076672104404e-06, "loss": 0.0333, "ppo_loss": -1.0128748416900635, "sft_loss": 0.08509034663438797, "step": 429 }, { "epoch": 1.04, "grad_norm": 1.0429954953997032, "importance_ratio": 1.0078125, "kl_div": -0.006376433186233044, "kl_div_pos": 0.006001722067594528, "kl_div_sft": -0.018754588440060616, "learning_rate": 1.3164763458401306e-06, "loss": -0.0295, "ppo_loss": -1.0060198307037354, "sft_loss": 0.10955341160297394, "step": 430 }, { "epoch": 1.04, "grad_norm": 3.2222155830344135, "importance_ratio": 0.97265625, "kl_div": -0.01591493934392929, "kl_div_pos": -0.029178854078054428, "kl_div_sft": -0.002651023678481579, "learning_rate": 1.3148450244698205e-06, "loss": -0.1148, "ppo_loss": -0.9712427258491516, "sft_loss": 0.047940175980329514, "step": 431 }, { "epoch": 1.05, "grad_norm": 1.3930729953785197, "importance_ratio": 0.984375, "kl_div": -0.09542819857597351, "kl_div_pos": -0.014116327278316021, "kl_div_sft": -0.17674006521701813, "learning_rate": 1.3132137030995105e-06, "loss": -0.103, "ppo_loss": -0.9859828352928162, "sft_loss": 0.2210320681333542, "step": 432 }, { "epoch": 1.05, "grad_norm": 1.1302485422227517, "importance_ratio": 0.73046875, "kl_div": -0.15205544233322144, "kl_div_neg": -0.31576773524284363, "kl_div_sft": 0.011656835675239563, "learning_rate": 1.3115823817292007e-06, "loss": -0.0245, "ppo_loss": 0.800000011920929, "sft_loss": 0.07695038616657257, "step": 433 }, { "epoch": 1.05, "grad_norm": 1.670193762702793, "importance_ratio": 0.69140625, "kl_div": -0.17609615623950958, "kl_div_neg": -0.36890721321105957, "kl_div_sft": 0.01671488955616951, "learning_rate": 1.3099510603588905e-06, "loss": -0.1094, "ppo_loss": 0.800000011920929, "sft_loss": 0.05042430758476257, "step": 434 }, { "epoch": 1.05, "grad_norm": 1.2478484233268006, "importance_ratio": 0.67578125, "kl_div": -0.18994270265102386, "kl_div_neg": -0.3907208740711212, "kl_div_sft": 0.010835465043783188, "learning_rate": 1.3083197389885807e-06, "loss": 0.0549, "ppo_loss": 0.800000011920929, "sft_loss": 0.021010208874940872, "step": 435 }, { "epoch": 1.06, "grad_norm": 1.1062430645569032, "importance_ratio": 1.0, "kl_div": -0.0013482579961419106, "kl_div_neg": -0.014547939412295818, "kl_div_pos": 0.011851423420011997, "learning_rate": 1.3066884176182707e-06, "loss": -0.0807, "ppo_loss": -0.013182252645492554, "step": 436 }, { "epoch": 1.06, "grad_norm": 2.2080934382239104, "importance_ratio": 1.03125, "kl_div": 0.017685813829302788, "kl_div_pos": 0.030767135322093964, "kl_div_sft": 0.004604491405189037, "learning_rate": 1.3050570962479609e-06, "loss": -0.0194, "ppo_loss": -1.0312453508377075, "sft_loss": 0.17565016448497772, "step": 437 }, { "epoch": 1.06, "grad_norm": 1.1770000540831367, "importance_ratio": 1.0546875, "kl_div": 0.01865413226187229, "kl_div_pos": 0.05109777674078941, "kl_div_sft": -0.01378951221704483, "learning_rate": 1.3034257748776508e-06, "loss": -0.0517, "ppo_loss": -1.052425742149353, "sft_loss": 0.08110073208808899, "step": 438 }, { "epoch": 1.06, "grad_norm": 1.2024259146314618, "importance_ratio": 1.03125, "kl_div": 0.03114350140094757, "kl_div_pos": 0.03114350140094757, "learning_rate": 1.3017944535073408e-06, "loss": 0.0292, "ppo_loss": -1.0317292213439941, "step": 439 }, { "epoch": 1.07, "grad_norm": 1.219094056691804, "kl_div": -0.028548669070005417, "kl_div_sft": -0.028548669070005417, "learning_rate": 1.300163132137031e-06, "loss": 0.0065, "sft_loss": 0.11218331754207611, "step": 440 }, { "epoch": 1.07, "grad_norm": 3.7683389467998696, "importance_ratio": 0.8828125, "kl_div": -0.143281489610672, "kl_div_pos": -0.143281489610672, "learning_rate": 1.298531810766721e-06, "loss": 0.0254, "ppo_loss": -0.8847619295120239, "step": 441 }, { "epoch": 1.07, "grad_norm": 1.0317494310100623, "importance_ratio": 0.7421875, "kl_div": -0.15573081374168396, "kl_div_neg": -0.2998103201389313, "kl_div_sft": -0.011651305481791496, "learning_rate": 1.296900489396411e-06, "loss": -0.0031, "ppo_loss": 0.800000011920929, "sft_loss": 0.09555532038211823, "step": 442 }, { "epoch": 1.07, "grad_norm": 1.1342805492078307, "importance_ratio": 1.03125, "kl_div": -0.00379384309053421, "kl_div_pos": 0.03288000449538231, "kl_div_sft": -0.04046769067645073, "learning_rate": 1.295269168026101e-06, "loss": -0.0779, "ppo_loss": -1.0334265232086182, "sft_loss": 0.14797383546829224, "step": 443 }, { "epoch": 1.08, "grad_norm": 1.3489916391078236, "importance_ratio": 1.0, "kl_div": 0.004089490510523319, "kl_div_pos": 0.004089490510523319, "learning_rate": 1.2936378466557912e-06, "loss": -0.1421, "ppo_loss": -1.0044682025909424, "step": 444 }, { "epoch": 1.08, "grad_norm": 1.7208118902291312, "kl_div": 0.01862707920372486, "kl_div_sft": 0.01862707920372486, "learning_rate": 1.2920065252854812e-06, "loss": -0.0981, "sft_loss": 0.06423333287239075, "step": 445 }, { "epoch": 1.08, "grad_norm": 2.2753196868971592, "importance_ratio": 0.9765625, "kl_div": -0.02230643853545189, "kl_div_neg": -0.06035029515624046, "kl_div_pos": 0.015737419947981834, "learning_rate": 1.2903752039151711e-06, "loss": -0.0008, "ppo_loss": -0.03721359372138977, "step": 446 }, { "epoch": 1.08, "grad_norm": 3.5354757765214044, "importance_ratio": 0.9765625, "kl_div": -0.010805973783135414, "kl_div_pos": -0.023163456469774246, "kl_div_sft": 0.001551508903503418, "learning_rate": 1.2887438825448613e-06, "loss": -0.0635, "ppo_loss": -0.9771027565002441, "sft_loss": 0.05742768198251724, "step": 447 }, { "epoch": 1.09, "grad_norm": 1.1104740555028498, "importance_ratio": 1.015625, "kl_div": 0.010768914595246315, "kl_div_pos": 0.015338733792304993, "kl_div_sft": 0.006199096329510212, "learning_rate": 1.2871125611745513e-06, "loss": 0.0476, "ppo_loss": -1.0154569149017334, "sft_loss": 0.034216687083244324, "step": 448 }, { "epoch": 1.09, "grad_norm": 0.9657073075359999, "importance_ratio": 1.03125, "kl_div": 0.019894614815711975, "kl_div_pos": 0.03344016894698143, "kl_div_sft": 0.00634906068444252, "learning_rate": 1.2854812398042413e-06, "loss": 0.0014, "ppo_loss": -1.0340055227279663, "sft_loss": 0.10556098818778992, "step": 449 }, { "epoch": 1.09, "grad_norm": 2.6250586730212717, "importance_ratio": 1.015625, "kl_div": 0.000533820129930973, "kl_div_pos": 0.0163713488727808, "kl_div_sft": -0.015303708612918854, "learning_rate": 1.2838499184339315e-06, "loss": 0.0689, "ppo_loss": -1.0165060758590698, "sft_loss": 0.08886807411909103, "step": 450 }, { "epoch": 1.09, "grad_norm": 5.760296765735746, "importance_ratio": 0.63671875, "kl_div": -0.251537561416626, "kl_div_neg": -0.45127934217453003, "kl_div_sft": -0.05179579555988312, "learning_rate": 1.2822185970636215e-06, "loss": 0.0572, "ppo_loss": 0.800000011920929, "sft_loss": 0.13668057322502136, "step": 451 }, { "epoch": 1.1, "grad_norm": 3.3639225896576432, "importance_ratio": 0.9375, "kl_div": -0.03847883641719818, "kl_div_neg": -0.065961554646492, "kl_div_sft": -0.010996121913194656, "learning_rate": 1.2805872756933117e-06, "loss": 0.0486, "ppo_loss": 0.9361668825149536, "sft_loss": 0.0495573990046978, "step": 452 }, { "epoch": 1.1, "grad_norm": 4.073796924321981, "kl_div": -0.020316394045948982, "kl_div_sft": -0.020316394045948982, "learning_rate": 1.2789559543230015e-06, "loss": -0.0297, "sft_loss": 0.1286567747592926, "step": 453 }, { "epoch": 1.1, "grad_norm": 1.0464566092173484, "importance_ratio": 0.75390625, "kl_div": -0.2827034592628479, "kl_div_neg": -0.2827034592628479, "learning_rate": 1.2773246329526917e-06, "loss": -0.0794, "ppo_loss": 0.800000011920929, "step": 454 }, { "epoch": 1.1, "grad_norm": 1.2423138824085187, "importance_ratio": 0.73046875, "kl_div": -0.15301235020160675, "kl_div_neg": -0.31398218870162964, "kl_div_sft": 0.00795749295502901, "learning_rate": 1.2756933115823816e-06, "loss": 0.0354, "ppo_loss": 0.800000011920929, "sft_loss": 0.04708397015929222, "step": 455 }, { "epoch": 1.11, "grad_norm": 3.044844200019531, "importance_ratio": 0.703125, "kl_div": -0.356436163187027, "kl_div_neg": -0.356436163187027, "learning_rate": 1.2740619902120716e-06, "loss": 0.122, "ppo_loss": 0.800000011920929, "step": 456 }, { "epoch": 1.11, "grad_norm": 2.7220759601182714, "importance_ratio": 0.5078125, "kl_div": -0.3348856568336487, "kl_div_neg": -0.6812491416931152, "kl_div_sft": 0.01147780753672123, "learning_rate": 1.2724306688417618e-06, "loss": 0.0421, "ppo_loss": 0.800000011920929, "sft_loss": 0.03663746267557144, "step": 457 }, { "epoch": 1.11, "grad_norm": 2.3232017536020098, "kl_div": 0.007984797470271587, "kl_div_sft": 0.007984797470271587, "learning_rate": 1.2707993474714518e-06, "loss": -0.1334, "sft_loss": 0.034479185938835144, "step": 458 }, { "epoch": 1.11, "grad_norm": 2.1881886079592605, "importance_ratio": 1.0078125, "kl_div": 0.0066819763742387295, "kl_div_pos": 0.005560300312936306, "kl_div_sft": 0.007803652435541153, "learning_rate": 1.269168026101142e-06, "loss": 0.0103, "ppo_loss": -1.0055757761001587, "sft_loss": 0.10367721319198608, "step": 459 }, { "epoch": 1.12, "grad_norm": 1.2141547012175877, "importance_ratio": 0.59765625, "kl_div": -0.5515552163124084, "kl_div_neg": -0.5515552163124084, "learning_rate": 1.2675367047308318e-06, "loss": 0.017, "ppo_loss": 0.800000011920929, "step": 460 }, { "epoch": 1.12, "grad_norm": 1.2829905526935033, "importance_ratio": 0.859375, "kl_div": -0.1534956693649292, "kl_div_neg": -0.1534956693649292, "learning_rate": 1.265905383360522e-06, "loss": -0.0386, "ppo_loss": 0.8584249019622803, "step": 461 }, { "epoch": 1.12, "grad_norm": 2.182536351970647, "importance_ratio": 0.85546875, "kl_div": -0.06841041147708893, "kl_div_neg": -0.1546468734741211, "kl_div_sft": 0.017826057970523834, "learning_rate": 1.264274061990212e-06, "loss": 0.0789, "ppo_loss": 0.8567176461219788, "sft_loss": 0.012218399904668331, "step": 462 }, { "epoch": 1.12, "grad_norm": 2.370907117860891, "importance_ratio": 0.890625, "kl_div": -0.05210436135530472, "kl_div_neg": -0.11743360757827759, "kl_div_sft": 0.013224886730313301, "learning_rate": 1.2626427406199021e-06, "loss": -0.1983, "ppo_loss": 0.8891995549201965, "sft_loss": 0.030821723863482475, "step": 463 }, { "epoch": 1.12, "grad_norm": 3.6608074508667863, "importance_ratio": 0.74609375, "kl_div": -0.15941019356250763, "kl_div_neg": -0.2910042107105255, "kl_div_sft": -0.0278161708265543, "learning_rate": 1.2610114192495921e-06, "loss": -0.0598, "ppo_loss": 0.800000011920929, "sft_loss": 0.09093907475471497, "step": 464 }, { "epoch": 1.13, "grad_norm": 3.3330544514023743, "importance_ratio": 0.90625, "kl_div": -0.10372880846261978, "kl_div_neg": -0.23328396677970886, "kl_div_pos": 0.02582634426653385, "learning_rate": 1.2593800978792821e-06, "loss": -0.0951, "ppo_loss": -0.11308136582374573, "step": 465 }, { "epoch": 1.13, "grad_norm": 0.925412551278747, "importance_ratio": 0.99609375, "kl_div": -0.00442282110452652, "kl_div_pos": -0.00442282110452652, "learning_rate": 1.2577487765089723e-06, "loss": -0.1149, "ppo_loss": -0.9955874681472778, "step": 466 }, { "epoch": 1.13, "grad_norm": 4.15379203451391, "importance_ratio": 0.671875, "kl_div": -0.19799412786960602, "kl_div_neg": -0.3990720212459564, "kl_div_sft": 0.003083764109760523, "learning_rate": 1.256117455138662e-06, "loss": -0.1798, "ppo_loss": 0.800000011920929, "sft_loss": 0.049974698573350906, "step": 467 }, { "epoch": 1.13, "grad_norm": 2.139147617605486, "importance_ratio": 0.65234375, "kl_div": -0.206068754196167, "kl_div_neg": -0.42438942193984985, "kl_div_sft": 0.012251907959580421, "learning_rate": 1.2544861337683523e-06, "loss": -0.0662, "ppo_loss": 0.800000011920929, "sft_loss": 0.09358756244182587, "step": 468 }, { "epoch": 1.14, "grad_norm": 6.902050277610847, "kl_div": -0.0074700559489429, "kl_div_sft": -0.0074700559489429, "learning_rate": 1.2528548123980423e-06, "loss": -0.1717, "sft_loss": 0.08451381325721741, "step": 469 }, { "epoch": 1.14, "grad_norm": 1.4255865930905123, "importance_ratio": 0.56640625, "kl_div": -0.2817477285861969, "kl_div_neg": -0.5675917863845825, "kl_div_sft": 0.0040963380597531796, "learning_rate": 1.2512234910277325e-06, "loss": 0.0424, "ppo_loss": 0.800000011920929, "sft_loss": 0.04384386166930199, "step": 470 }, { "epoch": 1.14, "grad_norm": 0.9755837991288189, "importance_ratio": 0.83203125, "kl_div": -0.21140116453170776, "kl_div_neg": -0.4443986415863037, "kl_div_pos": 0.021596306934952736, "learning_rate": 1.2495921696574224e-06, "loss": -0.0578, "ppo_loss": -0.11091557145118713, "step": 471 }, { "epoch": 1.14, "grad_norm": 6.980723952211021, "importance_ratio": 1.03125, "kl_div": 0.0269265566021204, "kl_div_pos": 0.028264760971069336, "kl_div_sft": 0.025588352233171463, "learning_rate": 1.2479608482871124e-06, "loss": -0.084, "ppo_loss": -1.02866792678833, "sft_loss": 0.09589555114507675, "step": 472 }, { "epoch": 1.15, "grad_norm": 3.092916665405893, "importance_ratio": 0.8515625, "kl_div": -0.09231029450893402, "kl_div_neg": -0.15904003381729126, "kl_div_sft": -0.025580555200576782, "learning_rate": 1.2463295269168026e-06, "loss": -0.0882, "ppo_loss": 0.8529621958732605, "sft_loss": 0.0967254638671875, "step": 473 }, { "epoch": 1.15, "grad_norm": 1.3537538437159136, "kl_div": 0.011375309899449348, "kl_div_sft": 0.011375309899449348, "learning_rate": 1.2446982055464926e-06, "loss": 0.1465, "sft_loss": 0.028936251997947693, "step": 474 }, { "epoch": 1.15, "grad_norm": 4.675210329924026, "importance_ratio": 0.7578125, "kl_div": -0.1408734768629074, "kl_div_neg": -0.2784445285797119, "kl_div_sft": -0.0033024323638528585, "learning_rate": 1.2430668841761826e-06, "loss": -0.1304, "ppo_loss": 0.800000011920929, "sft_loss": 0.09153730422258377, "step": 475 }, { "epoch": 1.15, "grad_norm": 1.3261338960268474, "importance_ratio": 1.03125, "kl_div": 0.015856783837080002, "kl_div_pos": 0.029548203572630882, "kl_div_sft": 0.0021653659641742706, "learning_rate": 1.2414355628058728e-06, "loss": -0.12, "ppo_loss": -1.0299891233444214, "sft_loss": 0.04419136047363281, "step": 476 }, { "epoch": 1.16, "grad_norm": 1.8804943962551242, "importance_ratio": 0.9140625, "kl_div": -0.04008149728178978, "kl_div_neg": -0.09007521718740463, "kl_div_sft": 0.00991221982985735, "learning_rate": 1.2398042414355628e-06, "loss": 0.0098, "ppo_loss": 0.9138624668121338, "sft_loss": 0.07724327594041824, "step": 477 }, { "epoch": 1.16, "grad_norm": 0.8030858813306728, "importance_ratio": 0.90625, "kl_div": -0.11344952136278152, "kl_div_neg": -0.272797554731369, "kl_div_pos": 0.04589850828051567, "learning_rate": 1.238172920065253e-06, "loss": -0.0467, "ppo_loss": -0.12348410487174988, "step": 478 }, { "epoch": 1.16, "grad_norm": 1.2816110427916472, "importance_ratio": 1.1015625, "kl_div": 0.0360209122300148, "kl_div_pos": 0.09649316221475601, "kl_div_sft": -0.02445133589208126, "learning_rate": 1.2365415986949427e-06, "loss": -0.0362, "ppo_loss": -1.1013020277023315, "sft_loss": 0.06209849193692207, "step": 479 }, { "epoch": 1.16, "grad_norm": 1.037616727765905, "importance_ratio": 1.0703125, "kl_div": 0.02870725654065609, "kl_div_pos": 0.06513284146785736, "kl_div_sft": -0.007718327920883894, "learning_rate": 1.234910277324633e-06, "loss": 0.0174, "ppo_loss": -1.067300796508789, "sft_loss": 0.07386572659015656, "step": 480 }, { "epoch": 1.17, "grad_norm": 4.106961894343179, "importance_ratio": 0.984375, "kl_div": -0.05532138794660568, "kl_div_pos": -0.017548711970448494, "kl_div_sft": -0.09309406578540802, "learning_rate": 1.233278955954323e-06, "loss": -0.0662, "ppo_loss": -0.9826043844223022, "sft_loss": 0.14551551640033722, "step": 481 }, { "epoch": 1.17, "grad_norm": 1.3741362199404208, "importance_ratio": 0.7734375, "kl_div": -0.1376347541809082, "kl_div_neg": -0.2552362382411957, "kl_div_sft": -0.020033257082104683, "learning_rate": 1.231647634584013e-06, "loss": -0.0029, "ppo_loss": 0.800000011920929, "sft_loss": 0.13010145723819733, "step": 482 }, { "epoch": 1.17, "grad_norm": 2.532009246363337, "importance_ratio": 0.75390625, "kl_div": -0.28162503242492676, "kl_div_neg": -0.28162503242492676, "learning_rate": 1.230016313213703e-06, "loss": 0.0856, "ppo_loss": 0.800000011920929, "step": 483 }, { "epoch": 1.17, "grad_norm": 1.1639983396321676, "importance_ratio": 1.046875, "kl_div": 0.02006608620285988, "kl_div_pos": 0.04490042105317116, "kl_div_sft": -0.004768249578773975, "learning_rate": 1.228384991843393e-06, "loss": -0.0734, "ppo_loss": -1.0459237098693848, "sft_loss": 0.043344151228666306, "step": 484 }, { "epoch": 1.18, "grad_norm": 1.3736239396796828, "kl_div": -0.0036171970423310995, "kl_div_sft": -0.0036171970423310995, "learning_rate": 1.2267536704730833e-06, "loss": -0.0378, "sft_loss": 0.0710780918598175, "step": 485 }, { "epoch": 1.18, "grad_norm": 2.104948924480214, "importance_ratio": 1.015625, "kl_div": 0.012552674859762192, "kl_div_pos": 0.012552674859762192, "learning_rate": 1.225122349102773e-06, "loss": -0.0419, "ppo_loss": -1.0132044553756714, "step": 486 }, { "epoch": 1.18, "grad_norm": 3.6187121348137707, "importance_ratio": 0.8984375, "kl_div": -0.12204791605472565, "kl_div_neg": -0.2728288471698761, "kl_div_pos": 0.028733013197779655, "learning_rate": 1.2234910277324632e-06, "loss": 0.0313, "ppo_loss": -0.11457487940788269, "step": 487 }, { "epoch": 1.18, "grad_norm": 1.8025541039758028, "importance_ratio": 0.73828125, "kl_div": -0.14328588545322418, "kl_div_neg": -0.30288568139076233, "kl_div_sft": 0.016313914209604263, "learning_rate": 1.2218597063621532e-06, "loss": 0.1435, "ppo_loss": 0.800000011920929, "sft_loss": 0.040791235864162445, "step": 488 }, { "epoch": 1.19, "grad_norm": 1.0924486320168931, "importance_ratio": 0.9375, "kl_div": -0.026130639016628265, "kl_div_neg": -0.06307917833328247, "kl_div_sft": 0.01081790216267109, "learning_rate": 1.2202283849918434e-06, "loss": 0.024, "ppo_loss": 0.9388691782951355, "sft_loss": 0.02481883391737938, "step": 489 }, { "epoch": 1.19, "grad_norm": 0.8550332062390997, "kl_div": 0.006395288743078709, "kl_div_sft": 0.006395288743078709, "learning_rate": 1.2185970636215334e-06, "loss": -0.031, "sft_loss": 0.0363774374127388, "step": 490 }, { "epoch": 1.19, "grad_norm": 2.9976819937770967, "kl_div": -0.024867737665772438, "kl_div_sft": -0.024867737665772438, "learning_rate": 1.2169657422512234e-06, "loss": 0.0257, "sft_loss": 0.11894671618938446, "step": 491 }, { "epoch": 1.19, "grad_norm": 1.127945434988827, "kl_div": 0.004560403060168028, "kl_div_sft": 0.004560403060168028, "learning_rate": 1.2153344208809136e-06, "loss": 0.1788, "sft_loss": 0.06735538691282272, "step": 492 }, { "epoch": 1.2, "grad_norm": 1.2091607133143483, "importance_ratio": 0.72265625, "kl_div": -0.1636078655719757, "kl_div_neg": -0.32448697090148926, "kl_div_sft": -0.0027287707198411226, "learning_rate": 1.2137030995106034e-06, "loss": -0.2138, "ppo_loss": 0.800000011920929, "sft_loss": 0.09669335186481476, "step": 493 }, { "epoch": 1.2, "grad_norm": 2.0201115318705276, "importance_ratio": 0.99609375, "kl_div": -0.008533771149814129, "kl_div_pos": -0.0022959003690630198, "kl_div_sft": -0.014771642163395882, "learning_rate": 1.2120717781402936e-06, "loss": -0.1766, "ppo_loss": -0.9977067112922668, "sft_loss": 0.13752681016921997, "step": 494 }, { "epoch": 1.2, "grad_norm": 3.9285253546849286, "importance_ratio": 1.0, "kl_div": -3.4654553019208834e-05, "kl_div_pos": -2.737253271334339e-05, "kl_div_sft": -4.1936571506084874e-05, "learning_rate": 1.2104404567699838e-06, "loss": -0.1369, "ppo_loss": -0.9999725818634033, "sft_loss": 0.08990359306335449, "step": 495 }, { "epoch": 1.2, "grad_norm": 2.960428883694901, "importance_ratio": 0.51171875, "kl_div": -0.324387788772583, "kl_div_neg": -0.6732898354530334, "kl_div_sft": 0.024514272809028625, "learning_rate": 1.2088091353996737e-06, "loss": 0.0115, "ppo_loss": 0.800000011920929, "sft_loss": 0.026017149910330772, "step": 496 }, { "epoch": 1.2, "grad_norm": 1.1399086048064144, "importance_ratio": 0.76953125, "kl_div": -0.1295836865901947, "kl_div_pos": -0.2599563002586365, "kl_div_sft": 0.0007889288244768977, "learning_rate": 1.2071778140293637e-06, "loss": -0.0312, "ppo_loss": -0.7710853219032288, "sft_loss": 0.12191534042358398, "step": 497 }, { "epoch": 1.21, "grad_norm": 1.391111567380043, "kl_div": 0.016579890623688698, "kl_div_sft": 0.016579890623688698, "learning_rate": 1.2055464926590537e-06, "loss": -0.1784, "sft_loss": 0.06572773307561874, "step": 498 }, { "epoch": 1.21, "grad_norm": 2.207309725586876, "importance_ratio": 0.96484375, "kl_div": -0.035914648324251175, "kl_div_neg": -0.0898895412683487, "kl_div_pos": 0.018060242757201195, "learning_rate": 1.203915171288744e-06, "loss": 0.0222, "ppo_loss": -0.052096039056777954, "step": 499 }, { "epoch": 1.21, "grad_norm": 0.9474518583040042, "importance_ratio": 0.9609375, "kl_div": -0.040214769542217255, "kl_div_neg": -0.10180936008691788, "kl_div_pos": 0.021379824727773666, "learning_rate": 1.2022838499184339e-06, "loss": 0.0347, "ppo_loss": -0.059204161167144775, "step": 500 }, { "epoch": 1.21, "grad_norm": 1.169849731055024, "importance_ratio": 0.97265625, "kl_div": -0.003386242315173149, "kl_div_pos": -0.025821957737207413, "kl_div_sft": 0.019049473106861115, "learning_rate": 1.2006525285481239e-06, "loss": -0.0243, "ppo_loss": -0.9745085835456848, "sft_loss": 0.07731864601373672, "step": 501 }, { "epoch": 1.22, "grad_norm": 1.0761134413901443, "kl_div": 0.007640148513019085, "kl_div_sft": 0.007640148513019085, "learning_rate": 1.199021207177814e-06, "loss": -0.113, "sft_loss": 0.04595714062452316, "step": 502 }, { "epoch": 1.22, "grad_norm": 5.176839330233246, "kl_div": 0.005883101373910904, "kl_div_sft": 0.005883101373910904, "learning_rate": 1.197389885807504e-06, "loss": 0.0495, "sft_loss": 0.1581135392189026, "step": 503 }, { "epoch": 1.22, "grad_norm": 2.3081021116202844, "importance_ratio": 1.0234375, "kl_div": 0.018472742289304733, "kl_div_pos": 0.02372855134308338, "kl_div_sft": 0.013216935098171234, "learning_rate": 1.1957585644371942e-06, "loss": -0.0408, "ppo_loss": -1.0240123271942139, "sft_loss": 0.04031985253095627, "step": 504 }, { "epoch": 1.22, "grad_norm": 1.3674385930139834, "importance_ratio": 0.84765625, "kl_div": -0.08100101351737976, "kl_div_neg": -0.16434870660305023, "kl_div_sft": 0.002346683293581009, "learning_rate": 1.194127243066884e-06, "loss": -0.1544, "ppo_loss": 0.8484461903572083, "sft_loss": 0.06900664418935776, "step": 505 }, { "epoch": 1.23, "grad_norm": 1.0221263844603252, "kl_div": 0.0132527407258749, "kl_div_sft": 0.0132527407258749, "learning_rate": 1.1924959216965742e-06, "loss": 0.0204, "sft_loss": 0.07407552003860474, "step": 506 }, { "epoch": 1.23, "grad_norm": 1.1289567771084783, "kl_div": 0.00977835152298212, "kl_div_sft": 0.00977835152298212, "learning_rate": 1.1908646003262642e-06, "loss": 0.0007, "sft_loss": 0.07620455324649811, "step": 507 }, { "epoch": 1.23, "grad_norm": 3.1804808580565855, "kl_div": 0.0007696398533880711, "kl_div_sft": 0.0007696398533880711, "learning_rate": 1.1892332789559542e-06, "loss": -0.1673, "sft_loss": 0.07429905235767365, "step": 508 }, { "epoch": 1.23, "grad_norm": 2.4337740076700127, "importance_ratio": 0.37109375, "kl_div": -0.4959445297718048, "kl_div_neg": -0.993360161781311, "kl_div_sft": 0.0014711018884554505, "learning_rate": 1.1876019575856444e-06, "loss": -0.0313, "ppo_loss": 0.800000011920929, "sft_loss": 0.10419715940952301, "step": 509 }, { "epoch": 1.24, "grad_norm": 3.1832248216545675, "importance_ratio": 1.0625, "kl_div": 0.0376177541911602, "kl_div_pos": 0.057756274938583374, "kl_div_sft": 0.01747923344373703, "learning_rate": 1.1859706362153344e-06, "loss": -0.1664, "ppo_loss": -1.0594568252563477, "sft_loss": 0.027088165283203125, "step": 510 }, { "epoch": 1.24, "grad_norm": 1.554492727376147, "kl_div": 0.005406046286225319, "kl_div_sft": 0.005406046286225319, "learning_rate": 1.1843393148450246e-06, "loss": 0.0855, "sft_loss": 0.07011683285236359, "step": 511 }, { "epoch": 1.24, "grad_norm": 1.903888177296763, "importance_ratio": 0.6875, "kl_div": -0.20197127759456635, "kl_div_neg": -0.37399518489837646, "kl_div_sft": -0.02994735911488533, "learning_rate": 1.1827079934747143e-06, "loss": 0.0269, "ppo_loss": 0.800000011920929, "sft_loss": 0.11903167515993118, "step": 512 }, { "epoch": 1.24, "grad_norm": 0.8941020352296798, "importance_ratio": 0.7734375, "kl_div": -0.2627606689929962, "kl_div_neg": -0.3461109399795532, "kl_div_pos": -0.17941038310527802, "learning_rate": 1.1810766721044045e-06, "loss": -0.0986, "ppo_loss": -0.017881423234939575, "step": 513 }, { "epoch": 1.25, "grad_norm": 1.725176362683201, "kl_div": -0.05197884142398834, "kl_div_sft": -0.05197884142398834, "learning_rate": 1.1794453507340945e-06, "loss": -0.0622, "sft_loss": 0.10858382284641266, "step": 514 }, { "epoch": 1.25, "grad_norm": 1.5883963179667402, "importance_ratio": 1.015625, "kl_div": 0.01706155203282833, "kl_div_pos": 0.01706155203282833, "learning_rate": 1.1778140293637847e-06, "loss": -0.0126, "ppo_loss": -1.017217755317688, "step": 515 }, { "epoch": 1.25, "grad_norm": 1.0429578916291624, "kl_div": 0.004653987940400839, "kl_div_sft": 0.004653987940400839, "learning_rate": 1.1761827079934747e-06, "loss": -0.1474, "sft_loss": 0.04697205498814583, "step": 516 }, { "epoch": 1.25, "grad_norm": 1.8473786030530364, "importance_ratio": 1.0390625, "kl_div": 0.026094667613506317, "kl_div_pos": 0.04203187674283981, "kl_div_sft": 0.010157458484172821, "learning_rate": 1.1745513866231647e-06, "loss": -0.1446, "ppo_loss": -1.0429277420043945, "sft_loss": 0.10328664630651474, "step": 517 }, { "epoch": 1.26, "grad_norm": 2.3976654897780887, "kl_div": 0.020840991288423538, "kl_div_sft": 0.020840991288423538, "learning_rate": 1.1729200652528549e-06, "loss": -0.0236, "sft_loss": 0.04611089080572128, "step": 518 }, { "epoch": 1.26, "grad_norm": 1.6587128950854497, "importance_ratio": 0.8984375, "kl_div": -0.11170358210802078, "kl_div_neg": -0.21417810022830963, "kl_div_pos": -0.009229060262441635, "learning_rate": 1.1712887438825446e-06, "loss": -0.0484, "ppo_loss": -0.09180441498756409, "step": 519 }, { "epoch": 1.26, "grad_norm": 1.3053593533221195, "kl_div": 0.011289243586361408, "kl_div_sft": 0.011289243586361408, "learning_rate": 1.1696574225122348e-06, "loss": -0.0328, "sft_loss": 0.031042158603668213, "step": 520 }, { "epoch": 1.26, "grad_norm": 0.942260860794481, "importance_ratio": 0.99609375, "kl_div": -0.002506896387785673, "kl_div_pos": -0.004093681927770376, "kl_div_sft": -0.0009201108478009701, "learning_rate": 1.168026101141925e-06, "loss": -0.1133, "ppo_loss": -0.9959146976470947, "sft_loss": 0.0964369997382164, "step": 521 }, { "epoch": 1.27, "grad_norm": 2.0362589186443225, "kl_div": -0.06798586249351501, "kl_div_sft": -0.06798586249351501, "learning_rate": 1.166394779771615e-06, "loss": -0.0122, "sft_loss": 0.13926270604133606, "step": 522 }, { "epoch": 1.27, "grad_norm": 0.9343024720076063, "importance_ratio": 0.7578125, "kl_div": -0.2980799973011017, "kl_div_neg": -0.2980799973011017, "learning_rate": 1.164763458401305e-06, "loss": 0.0436, "ppo_loss": 0.8530257940292358, "step": 523 }, { "epoch": 1.27, "grad_norm": 1.3128374210962497, "importance_ratio": 0.99609375, "kl_div": -0.013152940198779106, "kl_div_pos": -0.0031146006658673286, "kl_div_sft": -0.02319127880036831, "learning_rate": 1.163132137030995e-06, "loss": -0.0097, "ppo_loss": -0.9968902468681335, "sft_loss": 0.13142527639865875, "step": 524 }, { "epoch": 1.27, "grad_norm": 2.119210378316291, "importance_ratio": 0.796875, "kl_div": -0.1095552071928978, "kl_div_neg": -0.2284746617078781, "kl_div_sft": 0.009364242665469646, "learning_rate": 1.1615008156606852e-06, "loss": -0.0474, "ppo_loss": 0.800000011920929, "sft_loss": 0.05101296305656433, "step": 525 }, { "epoch": 1.28, "grad_norm": 3.4480022561935515, "importance_ratio": 0.890625, "kl_div": -0.050689272582530975, "kl_div_neg": -0.11669891327619553, "kl_div_sft": 0.015320368111133575, "learning_rate": 1.1598694942903752e-06, "loss": -0.1617, "ppo_loss": 0.8898531198501587, "sft_loss": 0.033386651426553726, "step": 526 }, { "epoch": 1.28, "grad_norm": 1.7723430331609662, "importance_ratio": 0.734375, "kl_div": -0.1602250188589096, "kl_div_neg": -0.3096018433570862, "kl_div_sft": -0.010848197154700756, "learning_rate": 1.1582381729200651e-06, "loss": -0.1267, "ppo_loss": 0.800000011920929, "sft_loss": 0.0797138437628746, "step": 527 }, { "epoch": 1.28, "grad_norm": 2.107571657530528, "importance_ratio": 0.90234375, "kl_div": -0.11691511422395706, "kl_div_neg": -0.28963759541511536, "kl_div_pos": 0.05580736696720123, "learning_rate": 1.1566068515497553e-06, "loss": -0.0529, "ppo_loss": -0.128697007894516, "step": 528 }, { "epoch": 1.28, "grad_norm": 2.385418576790879, "kl_div": 0.013789523392915726, "kl_div_sft": 0.013789523392915726, "learning_rate": 1.1549755301794453e-06, "loss": -0.0812, "sft_loss": 0.05344651639461517, "step": 529 }, { "epoch": 1.28, "grad_norm": 1.4828436732658241, "importance_ratio": 0.85546875, "kl_div": -0.16320474445819855, "kl_div_neg": -0.16320474445819855, "learning_rate": 1.1533442088091353e-06, "loss": -0.1487, "ppo_loss": 0.8766193389892578, "step": 530 }, { "epoch": 1.29, "grad_norm": 1.5986096957800984, "importance_ratio": 0.95703125, "kl_div": -0.015350108034908772, "kl_div_neg": -0.044570233672857285, "kl_div_sft": 0.013870017603039742, "learning_rate": 1.1517128874388253e-06, "loss": -0.0947, "ppo_loss": 0.9564084410667419, "sft_loss": 0.0904233381152153, "step": 531 }, { "epoch": 1.29, "grad_norm": 3.1736066629465336, "importance_ratio": 0.8125, "kl_div": -0.21371789276599884, "kl_div_neg": -0.21371789276599884, "learning_rate": 1.1500815660685155e-06, "loss": -0.0531, "ppo_loss": 0.848617672920227, "step": 532 }, { "epoch": 1.29, "grad_norm": 1.4188297904679963, "kl_div": 0.009632952511310577, "kl_div_sft": 0.009632952511310577, "learning_rate": 1.1484502446982055e-06, "loss": 0.0202, "sft_loss": 0.04948854818940163, "step": 533 }, { "epoch": 1.29, "grad_norm": 1.2580826273802443, "kl_div": -0.010189337655901909, "kl_div_sft": -0.010189337655901909, "learning_rate": 1.1468189233278955e-06, "loss": -0.1035, "sft_loss": 0.08644433319568634, "step": 534 }, { "epoch": 1.3, "grad_norm": 1.0285720734367165, "kl_div": 0.012180252932012081, "kl_div_sft": 0.012180252932012081, "learning_rate": 1.1451876019575857e-06, "loss": -0.1224, "sft_loss": 0.055709097534418106, "step": 535 }, { "epoch": 1.3, "grad_norm": 2.3399933272046534, "kl_div": 0.006703513208776712, "kl_div_sft": 0.006703513208776712, "learning_rate": 1.1435562805872756e-06, "loss": 0.1456, "sft_loss": 0.05426019802689552, "step": 536 }, { "epoch": 1.3, "grad_norm": 1.0555467849222064, "kl_div": -0.0029783041682094336, "kl_div_sft": -0.0029783041682094336, "learning_rate": 1.1419249592169658e-06, "loss": -0.1068, "sft_loss": 0.043515875935554504, "step": 537 }, { "epoch": 1.3, "grad_norm": 2.871971027483153, "importance_ratio": 0.828125, "kl_div": -0.19169503450393677, "kl_div_neg": -0.19169503450393677, "learning_rate": 1.1402936378466556e-06, "loss": 0.139, "ppo_loss": 0.8521348237991333, "step": 538 }, { "epoch": 1.31, "grad_norm": 1.4585484845534662, "importance_ratio": 0.703125, "kl_div": -0.1812596321105957, "kl_div_neg": -0.35394641757011414, "kl_div_sft": -0.00857284665107727, "learning_rate": 1.1386623164763458e-06, "loss": 0.0067, "ppo_loss": 0.800000011920929, "sft_loss": 0.06204671785235405, "step": 539 }, { "epoch": 1.31, "grad_norm": 1.584166190628764, "importance_ratio": 0.76953125, "kl_div": -0.13018177449703217, "kl_div_neg": -0.2637389004230499, "kl_div_sft": 0.003375363303348422, "learning_rate": 1.1370309951060358e-06, "loss": -0.0823, "ppo_loss": 0.800000011920929, "sft_loss": 0.034781016409397125, "step": 540 }, { "epoch": 1.31, "grad_norm": 1.62456103044522, "importance_ratio": 1.0078125, "kl_div": 0.0058018844574689865, "kl_div_pos": 0.0058018844574689865, "learning_rate": 1.1353996737357258e-06, "loss": -0.0601, "ppo_loss": -1.0061110258102417, "step": 541 }, { "epoch": 1.31, "grad_norm": 1.0309449524969996, "kl_div": 0.016848277300596237, "kl_div_sft": 0.016848277300596237, "learning_rate": 1.133768352365416e-06, "loss": 0.0897, "sft_loss": 0.033322080969810486, "step": 542 }, { "epoch": 1.32, "grad_norm": 1.6610577458145217, "importance_ratio": 0.90234375, "kl_div": -0.07893979549407959, "kl_div_neg": -0.1038811206817627, "kl_div_sft": -0.053998466581106186, "learning_rate": 1.132137030995106e-06, "loss": 0.0988, "ppo_loss": 0.9013324975967407, "sft_loss": 0.10321355611085892, "step": 543 }, { "epoch": 1.32, "grad_norm": 1.3239489255141208, "importance_ratio": 0.6328125, "kl_div": -0.47106295824050903, "kl_div_neg": -0.47106295824050903, "learning_rate": 1.1305057096247961e-06, "loss": -0.1342, "ppo_loss": 0.800000011920929, "step": 544 }, { "epoch": 1.32, "grad_norm": 6.1853282469454225, "kl_div": 0.0043830047361552715, "kl_div_sft": 0.0043830047361552715, "learning_rate": 1.128874388254486e-06, "loss": -0.1029, "sft_loss": 0.04348590970039368, "step": 545 }, { "epoch": 1.32, "grad_norm": 6.376584024540215, "importance_ratio": 0.8984375, "kl_div": -0.10580405592918396, "kl_div_pos": -0.10580405592918396, "learning_rate": 1.1272430668841761e-06, "loss": -0.0738, "ppo_loss": -0.8998837471008301, "step": 546 }, { "epoch": 1.33, "grad_norm": 1.1272377963645992, "importance_ratio": 0.7890625, "kl_div": -0.12314502149820328, "kl_div_neg": -0.23518849909305573, "kl_div_sft": -0.011101537384092808, "learning_rate": 1.1256117455138663e-06, "loss": -0.0169, "ppo_loss": 0.800000011920929, "sft_loss": 0.10723263770341873, "step": 547 }, { "epoch": 1.33, "grad_norm": 2.1873014087449754, "kl_div": -0.00913550890982151, "kl_div_sft": -0.00913550890982151, "learning_rate": 1.1239804241435563e-06, "loss": 0.0708, "sft_loss": 0.07666724920272827, "step": 548 }, { "epoch": 1.33, "grad_norm": 7.292791428782978, "importance_ratio": 1.015625, "kl_div": 0.013908376917243004, "kl_div_pos": 0.018195772543549538, "kl_div_sft": 0.009620980359613895, "learning_rate": 1.1223491027732463e-06, "loss": -0.0659, "ppo_loss": -1.018362283706665, "sft_loss": 0.1339206099510193, "step": 549 }, { "epoch": 1.33, "grad_norm": 1.4911592146041783, "importance_ratio": 0.8984375, "kl_div": -0.11247290670871735, "kl_div_neg": -0.22601757943630219, "kl_div_pos": 0.0010717726545408368, "learning_rate": 1.1207177814029363e-06, "loss": -0.0758, "ppo_loss": -0.10053613781929016, "step": 550 }, { "epoch": 1.34, "grad_norm": 1.1687821837701196, "importance_ratio": 0.78515625, "kl_div": -0.11754447966814041, "kl_div_neg": -0.2395263910293579, "kl_div_sft": 0.004437429364770651, "learning_rate": 1.1190864600326265e-06, "loss": -0.1035, "ppo_loss": 0.800000011920929, "sft_loss": 0.08405223488807678, "step": 551 }, { "epoch": 1.34, "grad_norm": 1.7625336041865531, "importance_ratio": 0.83984375, "kl_div": -0.19569040834903717, "kl_div_neg": -0.3977075517177582, "kl_div_pos": 0.006326722446829081, "learning_rate": 1.1174551386623162e-06, "loss": 0.0337, "ppo_loss": -0.1031734049320221, "step": 552 }, { "epoch": 1.34, "grad_norm": 1.5047727788445464, "importance_ratio": 0.8203125, "kl_div": -0.20205961167812347, "kl_div_neg": -0.20205961167812347, "learning_rate": 1.1158238172920064e-06, "loss": -0.0112, "ppo_loss": 0.8295047283172607, "step": 553 }, { "epoch": 1.34, "grad_norm": 1.394685880919484, "importance_ratio": 0.78125, "kl_div": -0.2900198996067047, "kl_div_neg": -0.5972557663917542, "kl_div_pos": 0.01721596159040928, "learning_rate": 1.1141924959216966e-06, "loss": 0.051, "ppo_loss": -0.10868248343467712, "step": 554 }, { "epoch": 1.35, "grad_norm": 1.3764932933301735, "kl_div": 0.014602404087781906, "kl_div_sft": 0.014602404087781906, "learning_rate": 1.1125611745513866e-06, "loss": -0.1426, "sft_loss": 0.038136836141347885, "step": 555 }, { "epoch": 1.35, "grad_norm": 3.7956450554179524, "importance_ratio": 0.9453125, "kl_div": -0.026797577738761902, "kl_div_neg": -0.05635818466544151, "kl_div_sft": 0.002763028722256422, "learning_rate": 1.1109298531810766e-06, "loss": -0.1243, "ppo_loss": 0.9452005624771118, "sft_loss": 0.06745104491710663, "step": 556 }, { "epoch": 1.35, "grad_norm": 3.8833894051581703, "importance_ratio": 0.890625, "kl_div": -0.13215163350105286, "kl_div_neg": -0.3047674000263214, "kl_div_pos": 0.0404641255736351, "learning_rate": 1.1092985318107666e-06, "loss": -0.1974, "ppo_loss": -0.12064698338508606, "step": 557 }, { "epoch": 1.35, "grad_norm": 1.1478217933251562, "importance_ratio": 0.9609375, "kl_div": -0.06483356654644012, "kl_div_pos": -0.04164363443851471, "kl_div_sft": -0.08802350610494614, "learning_rate": 1.1076672104404568e-06, "loss": -0.1258, "ppo_loss": -0.959211528301239, "sft_loss": 0.1283067911863327, "step": 558 }, { "epoch": 1.36, "grad_norm": 1.27512121933286, "importance_ratio": 0.91796875, "kl_div": -0.08908012509346008, "kl_div_neg": -0.16968628764152527, "kl_div_pos": -0.008473969995975494, "learning_rate": 1.1060358890701468e-06, "loss": -0.1369, "ppo_loss": -0.07381615042686462, "step": 559 }, { "epoch": 1.36, "grad_norm": 1.2365470318794451, "kl_div": 0.008347313851118088, "kl_div_sft": 0.008347313851118088, "learning_rate": 1.1044045676998367e-06, "loss": -0.0697, "sft_loss": 0.047842901200056076, "step": 560 }, { "epoch": 1.36, "grad_norm": 2.7825982758337306, "kl_div": 0.018152426928281784, "kl_div_sft": 0.018152426928281784, "learning_rate": 1.102773246329527e-06, "loss": -0.0772, "sft_loss": 0.06452981382608414, "step": 561 }, { "epoch": 1.36, "grad_norm": 1.2984059960024126, "importance_ratio": 0.91015625, "kl_div": -0.035715144127607346, "kl_div_neg": -0.09511800855398178, "kl_div_sft": 0.02368772216141224, "learning_rate": 1.101141924959217e-06, "loss": -0.1642, "ppo_loss": 0.9092656970024109, "sft_loss": 0.038571763783693314, "step": 562 }, { "epoch": 1.36, "grad_norm": 1.388739604344503, "importance_ratio": 0.796875, "kl_div": -0.11054076999425888, "kl_div_neg": -0.2250179499387741, "kl_div_sft": 0.00393641646951437, "learning_rate": 1.0995106035889071e-06, "loss": -0.0283, "ppo_loss": 0.800000011920929, "sft_loss": 0.051326606422662735, "step": 563 }, { "epoch": 1.37, "grad_norm": 0.9820198465078197, "importance_ratio": 0.4453125, "kl_div": -0.4062121510505676, "kl_div_neg": -0.8068867325782776, "kl_div_sft": -0.005537545774132013, "learning_rate": 1.0978792822185969e-06, "loss": 0.0962, "ppo_loss": 0.800000011920929, "sft_loss": 0.040258102118968964, "step": 564 }, { "epoch": 1.37, "grad_norm": 1.6842497147374014, "importance_ratio": 0.99609375, "kl_div": 0.003686623414978385, "kl_div_pos": -0.00484278192743659, "kl_div_sft": 0.01221602875739336, "learning_rate": 1.096247960848287e-06, "loss": -0.0823, "ppo_loss": -0.9951689839363098, "sft_loss": 0.09219343215227127, "step": 565 }, { "epoch": 1.37, "grad_norm": 1.1904134647556424, "importance_ratio": 0.65625, "kl_div": -0.2421429604291916, "kl_div_neg": -0.4229358732700348, "kl_div_sft": -0.061350058764219284, "learning_rate": 1.0946166394779773e-06, "loss": -0.0565, "ppo_loss": 0.800000011920929, "sft_loss": 0.12906409800052643, "step": 566 }, { "epoch": 1.37, "grad_norm": 1.46299052693135, "kl_div": 0.009440741501748562, "kl_div_sft": 0.009440741501748562, "learning_rate": 1.092985318107667e-06, "loss": -0.1105, "sft_loss": 0.0516950823366642, "step": 567 }, { "epoch": 1.38, "grad_norm": 1.7776258307663522, "kl_div": 0.009271626360714436, "kl_div_sft": 0.009271626360714436, "learning_rate": 1.0913539967373572e-06, "loss": -0.0054, "sft_loss": 0.12384568154811859, "step": 568 }, { "epoch": 1.38, "grad_norm": 1.449626279221997, "importance_ratio": 0.7109375, "kl_div": -0.34696972370147705, "kl_div_neg": -0.34696972370147705, "learning_rate": 1.0897226753670472e-06, "loss": -0.1492, "ppo_loss": 0.800000011920929, "step": 569 }, { "epoch": 1.38, "grad_norm": 1.699506783463177, "kl_div": -0.0007659494876861572, "kl_div_sft": -0.0007659494876861572, "learning_rate": 1.0880913539967374e-06, "loss": -0.0088, "sft_loss": 0.060911569744348526, "step": 570 }, { "epoch": 1.38, "grad_norm": 1.6960867946206275, "importance_ratio": 0.87109375, "kl_div": -0.08905941992998123, "kl_div_neg": -0.13928547501564026, "kl_div_sft": -0.0388333685696125, "learning_rate": 1.0864600326264272e-06, "loss": -0.0991, "ppo_loss": 0.8699796795845032, "sft_loss": 0.15874932706356049, "step": 571 }, { "epoch": 1.39, "grad_norm": 1.1456683762616864, "kl_div": 0.004077468998730183, "kl_div_sft": 0.004077468998730183, "learning_rate": 1.0848287112561174e-06, "loss": -0.0106, "sft_loss": 0.07098381221294403, "step": 572 }, { "epoch": 1.39, "grad_norm": 1.715795300756801, "importance_ratio": 0.99609375, "kl_div": 0.0015950084198266268, "kl_div_pos": -0.004990215878933668, "kl_div_sft": 0.008180232718586922, "learning_rate": 1.0831973898858076e-06, "loss": -0.0068, "ppo_loss": -0.995022177696228, "sft_loss": 0.09894995391368866, "step": 573 }, { "epoch": 1.39, "grad_norm": 2.7129768374361563, "importance_ratio": 0.9375, "kl_div": -0.0715622529387474, "kl_div_neg": -0.18958507478237152, "kl_div_pos": 0.04646056890487671, "learning_rate": 1.0815660685154976e-06, "loss": -0.1, "ppo_loss": -0.11012718081474304, "step": 574 }, { "epoch": 1.39, "grad_norm": 1.9102451171173565, "importance_ratio": 0.7578125, "kl_div": -0.2801617681980133, "kl_div_neg": -0.2801617681980133, "learning_rate": 1.0799347471451876e-06, "loss": 0.1725, "ppo_loss": 0.8094266057014465, "step": 575 }, { "epoch": 1.4, "grad_norm": 1.21976238164542, "importance_ratio": 0.70703125, "kl_div": -0.16747649013996124, "kl_div_neg": -0.3467353284358978, "kl_div_sft": 0.011782352812588215, "learning_rate": 1.0783034257748775e-06, "loss": 0.0558, "ppo_loss": 0.800000011920929, "sft_loss": 0.0150612723082304, "step": 576 }, { "epoch": 1.4, "grad_norm": 1.8178579128626697, "importance_ratio": 0.78515625, "kl_div": -0.12120449542999268, "kl_div_neg": -0.24348323047161102, "kl_div_sft": 0.001074238563887775, "learning_rate": 1.0766721044045677e-06, "loss": -0.0141, "ppo_loss": 0.800000011920929, "sft_loss": 0.03843202069401741, "step": 577 }, { "epoch": 1.4, "grad_norm": 1.441867000059928, "importance_ratio": 0.96484375, "kl_div": -0.02505611814558506, "kl_div_neg": -0.03736988827586174, "kl_div_sft": -0.012742347083985806, "learning_rate": 1.0750407830342575e-06, "loss": 0.0226, "ppo_loss": 0.9633197784423828, "sft_loss": 0.06988085806369781, "step": 578 }, { "epoch": 1.4, "grad_norm": 2.2161701946660313, "importance_ratio": 1.0625, "kl_div": 0.04236027970910072, "kl_div_pos": 0.061362944543361664, "kl_div_sft": 0.023357614874839783, "learning_rate": 1.0734094616639477e-06, "loss": 0.0401, "ppo_loss": -1.0632847547531128, "sft_loss": 0.022046925500035286, "step": 579 }, { "epoch": 1.41, "grad_norm": 3.3108519286996834, "importance_ratio": 0.54296875, "kl_div": -0.6279046535491943, "kl_div_neg": -0.6279046535491943, "learning_rate": 1.071778140293638e-06, "loss": -0.015, "ppo_loss": 0.800000011920929, "step": 580 }, { "epoch": 1.41, "grad_norm": 1.4772493894107401, "importance_ratio": 0.90234375, "kl_div": -0.054423633962869644, "kl_div_neg": -0.10371052473783493, "kl_div_sft": -0.0051367441192269325, "learning_rate": 1.0701468189233279e-06, "loss": -0.0479, "ppo_loss": 0.9014862179756165, "sft_loss": 0.06811974197626114, "step": 581 }, { "epoch": 1.41, "grad_norm": 1.4673635553878324, "kl_div": -0.022171318531036377, "kl_div_sft": -0.022171318531036377, "learning_rate": 1.0685154975530179e-06, "loss": -0.0467, "sft_loss": 0.18078254163265228, "step": 582 }, { "epoch": 1.41, "grad_norm": 1.4955656947402662, "importance_ratio": 0.9453125, "kl_div": -0.013638187199831009, "kl_div_pos": -0.0558946430683136, "kl_div_sft": 0.02861826866865158, "learning_rate": 1.0668841761827079e-06, "loss": -0.134, "ppo_loss": -0.9456387162208557, "sft_loss": 0.013508289121091366, "step": 583 }, { "epoch": 1.42, "grad_norm": 1.9301770782273342, "importance_ratio": 0.9453125, "kl_div": -0.04264499992132187, "kl_div_pos": -0.056435685604810715, "kl_div_sft": -0.028854310512542725, "learning_rate": 1.065252854812398e-06, "loss": -0.1468, "ppo_loss": -0.9451273083686829, "sft_loss": 0.1179473027586937, "step": 584 }, { "epoch": 1.42, "grad_norm": 1.2775761104239642, "kl_div": 0.006246216129511595, "kl_div_sft": 0.006246216129511595, "learning_rate": 1.063621533442088e-06, "loss": -0.0571, "sft_loss": 0.05256784334778786, "step": 585 }, { "epoch": 1.42, "grad_norm": 2.3724816673543283, "importance_ratio": 0.99609375, "kl_div": 0.002640511840581894, "kl_div_pos": -0.004412751644849777, "kl_div_sft": 0.009693775326013565, "learning_rate": 1.061990212071778e-06, "loss": -0.0384, "ppo_loss": -0.9955970048904419, "sft_loss": 0.051869332790374756, "step": 586 }, { "epoch": 1.42, "grad_norm": 1.7260462476738236, "importance_ratio": 1.03125, "kl_div": 0.0022712545469403267, "kl_div_pos": 0.028654221445322037, "kl_div_sft": -0.024111712351441383, "learning_rate": 1.0603588907014682e-06, "loss": -0.1109, "ppo_loss": -1.0290687084197998, "sft_loss": 0.08297090977430344, "step": 587 }, { "epoch": 1.43, "grad_norm": 1.1074164388748575, "importance_ratio": 0.9453125, "kl_div": -0.015047087334096432, "kl_div_neg": -0.055990349501371384, "kl_div_sft": 0.02589617483317852, "learning_rate": 1.0587275693311582e-06, "loss": -0.0687, "ppo_loss": 0.9455482363700867, "sft_loss": 0.11951703578233719, "step": 588 }, { "epoch": 1.43, "grad_norm": 1.088531141215031, "importance_ratio": 1.0234375, "kl_div": 0.026111368089914322, "kl_div_pos": 0.026542862877249718, "kl_div_sft": 0.025679873302578926, "learning_rate": 1.0570962479608482e-06, "loss": -0.0353, "ppo_loss": -1.0268982648849487, "sft_loss": 0.09185680747032166, "step": 589 }, { "epoch": 1.43, "grad_norm": 2.5670330611783356, "importance_ratio": 1.0546875, "kl_div": 0.03792417794466019, "kl_div_pos": 0.05555087327957153, "kl_div_sft": 0.02029748447239399, "learning_rate": 1.0554649265905382e-06, "loss": -0.0404, "ppo_loss": -1.0571227073669434, "sft_loss": 0.07268325984477997, "step": 590 }, { "epoch": 1.43, "grad_norm": 1.467813396100296, "kl_div": 0.001873633824288845, "kl_div_sft": 0.001873633824288845, "learning_rate": 1.0538336052202284e-06, "loss": -0.103, "sft_loss": 0.08411164581775665, "step": 591 }, { "epoch": 1.44, "grad_norm": 2.3531129104688713, "kl_div": -0.0008771107532083988, "kl_div_sft": -0.0008771107532083988, "learning_rate": 1.0522022838499186e-06, "loss": -0.0855, "sft_loss": 0.053271256387233734, "step": 592 }, { "epoch": 1.44, "grad_norm": 1.677308566999723, "importance_ratio": 0.82421875, "kl_div": -0.10028732568025589, "kl_div_pos": -0.19314102828502655, "kl_div_sft": -0.007433618418872356, "learning_rate": 1.0505709624796083e-06, "loss": -0.071, "ppo_loss": -0.8243657350540161, "sft_loss": 0.08927972614765167, "step": 593 }, { "epoch": 1.44, "grad_norm": 1.5657319784258674, "kl_div": 0.009805003181099892, "kl_div_sft": 0.009805003181099892, "learning_rate": 1.0489396411092985e-06, "loss": -0.0231, "sft_loss": 0.052252329885959625, "step": 594 }, { "epoch": 1.44, "grad_norm": 1.1571238540589344, "importance_ratio": 1.03125, "kl_div": 0.02416916750371456, "kl_div_pos": 0.03090663067996502, "kl_div_sft": 0.017431704327464104, "learning_rate": 1.0473083197389885e-06, "loss": -0.1397, "ppo_loss": -1.0313892364501953, "sft_loss": 0.0322992280125618, "step": 595 }, { "epoch": 1.44, "grad_norm": 1.193779953985234, "kl_div": 0.010021679103374481, "kl_div_sft": 0.010021679103374481, "learning_rate": 1.0456769983686787e-06, "loss": -0.0705, "sft_loss": 0.05322499945759773, "step": 596 }, { "epoch": 1.45, "grad_norm": 1.3361645778833022, "kl_div": -0.0021144638303667307, "kl_div_sft": -0.0021144638303667307, "learning_rate": 1.0440456769983685e-06, "loss": -0.1023, "sft_loss": 0.13762226700782776, "step": 597 }, { "epoch": 1.45, "grad_norm": 1.4972359303775002, "kl_div": 0.005427949130535126, "kl_div_sft": 0.005427949130535126, "learning_rate": 1.0424143556280587e-06, "loss": -0.2345, "sft_loss": 0.05453133210539818, "step": 598 }, { "epoch": 1.45, "grad_norm": 1.742612231018511, "kl_div": -0.0006676320917904377, "kl_div_sft": -0.0006676320917904377, "learning_rate": 1.0407830342577489e-06, "loss": -0.1976, "sft_loss": 0.06037126109004021, "step": 599 }, { "epoch": 1.45, "grad_norm": 1.1586556025204708, "importance_ratio": 0.73828125, "kl_div": -0.14715160429477692, "kl_div_neg": -0.30401289463043213, "kl_div_sft": 0.009709681384265423, "learning_rate": 1.0391517128874386e-06, "loss": -0.0554, "ppo_loss": 0.800000011920929, "sft_loss": 0.029035640880465508, "step": 600 }, { "epoch": 1.46, "grad_norm": 1.6506692684987094, "importance_ratio": 1.0078125, "kl_div": 0.010380488820374012, "kl_div_pos": 0.006752700544893742, "kl_div_sft": 0.014008277095854282, "learning_rate": 1.0375203915171288e-06, "loss": -0.0073, "ppo_loss": -1.0067756175994873, "sft_loss": 0.05170813202857971, "step": 601 }, { "epoch": 1.46, "grad_norm": 1.1282360370971418, "kl_div": 0.01832808554172516, "kl_div_sft": 0.01832808554172516, "learning_rate": 1.0358890701468188e-06, "loss": -0.0271, "sft_loss": 0.03242221102118492, "step": 602 }, { "epoch": 1.46, "grad_norm": 1.5571367150556785, "kl_div": -0.0014522508718073368, "kl_div_sft": -0.0014522508718073368, "learning_rate": 1.034257748776509e-06, "loss": -0.0319, "sft_loss": 0.09149647504091263, "step": 603 }, { "epoch": 1.46, "grad_norm": 1.5800280286017057, "importance_ratio": 0.875, "kl_div": -0.15403921902179718, "kl_div_neg": -0.3375498056411743, "kl_div_pos": 0.029471376910805702, "learning_rate": 1.0326264274061988e-06, "loss": 0.039, "ppo_loss": -0.11495497822761536, "step": 604 }, { "epoch": 1.47, "grad_norm": 1.7090297148003473, "importance_ratio": 0.90625, "kl_div": -0.038361359387636185, "kl_div_neg": -0.0970546156167984, "kl_div_sft": 0.020331894978880882, "learning_rate": 1.030995106035889e-06, "loss": -0.01, "ppo_loss": 0.9075064659118652, "sft_loss": 0.0135499881580472, "step": 605 }, { "epoch": 1.47, "grad_norm": 1.528653181102798, "importance_ratio": 0.6171875, "kl_div": -0.2556186616420746, "kl_div_neg": -0.482948362827301, "kl_div_sft": -0.028288988396525383, "learning_rate": 1.0293637846655792e-06, "loss": -0.0538, "ppo_loss": 0.800000011920929, "sft_loss": 0.08954707533121109, "step": 606 }, { "epoch": 1.47, "grad_norm": 0.9950230846890066, "kl_div": 0.007246728520840406, "kl_div_sft": 0.007246728520840406, "learning_rate": 1.0277324632952692e-06, "loss": -0.0167, "sft_loss": 0.06424114853143692, "step": 607 }, { "epoch": 1.47, "grad_norm": 1.3094895670075744, "importance_ratio": 0.75, "kl_div": -0.13014312088489532, "kl_div_neg": -0.2881861627101898, "kl_div_sft": 0.027899926528334618, "learning_rate": 1.0261011419249592e-06, "loss": 0.1371, "ppo_loss": 0.800000011920929, "sft_loss": 0.03597124293446541, "step": 608 }, { "epoch": 1.48, "grad_norm": 1.157121072459821, "importance_ratio": 0.8203125, "kl_div": -0.10809502750635147, "kl_div_pos": -0.19788144528865814, "kl_div_sft": -0.018308604136109352, "learning_rate": 1.0244698205546491e-06, "loss": -0.0255, "ppo_loss": -0.8204671144485474, "sft_loss": 0.09640325605869293, "step": 609 }, { "epoch": 1.48, "grad_norm": 1.634697587979911, "importance_ratio": 0.78125, "kl_div": -0.11589974164962769, "kl_div_neg": -0.24510979652404785, "kl_div_sft": 0.013310307636857033, "learning_rate": 1.0228384991843393e-06, "loss": -0.1429, "ppo_loss": 0.800000011920929, "sft_loss": 0.02353717014193535, "step": 610 }, { "epoch": 1.48, "grad_norm": 1.1102423702287916, "importance_ratio": 1.0390625, "kl_div": 0.017195984721183777, "kl_div_pos": 0.0396147146821022, "kl_div_sft": -0.005222745705395937, "learning_rate": 1.021207177814029e-06, "loss": -0.0158, "ppo_loss": -1.0404099225997925, "sft_loss": 0.09494847804307938, "step": 611 }, { "epoch": 1.48, "grad_norm": 0.9871361774508806, "importance_ratio": 0.9609375, "kl_div": -0.042730703949928284, "kl_div_neg": -0.11213622242212296, "kl_div_pos": 0.026674814522266388, "learning_rate": 1.0195758564437193e-06, "loss": 0.0579, "ppo_loss": -0.06655561923980713, "step": 612 }, { "epoch": 1.49, "grad_norm": 1.6805677369712249, "kl_div": 0.012642841786146164, "kl_div_sft": 0.012642841786146164, "learning_rate": 1.0179445350734095e-06, "loss": 0.0846, "sft_loss": 0.025455590337514877, "step": 613 }, { "epoch": 1.49, "grad_norm": 1.4515161940104593, "importance_ratio": 0.65625, "kl_div": -0.4327019453048706, "kl_div_neg": -0.4327019453048706, "learning_rate": 1.0163132137030995e-06, "loss": -0.0264, "ppo_loss": 0.800000011920929, "step": 614 }, { "epoch": 1.49, "grad_norm": 1.2200070769464322, "importance_ratio": 0.76953125, "kl_div": -0.13624180853366852, "kl_div_neg": -0.2606610357761383, "kl_div_sft": -0.011822582222521305, "learning_rate": 1.0146818923327895e-06, "loss": -0.0443, "ppo_loss": 0.800000011920929, "sft_loss": 0.08891305327415466, "step": 615 }, { "epoch": 1.49, "grad_norm": 1.3707699298206422, "importance_ratio": 1.015625, "kl_div": -0.013705750927329063, "kl_div_pos": 0.013345234096050262, "kl_div_sft": -0.04075673595070839, "learning_rate": 1.0130505709624794e-06, "loss": -0.0962, "ppo_loss": -1.013434648513794, "sft_loss": 0.1080576702952385, "step": 616 }, { "epoch": 1.5, "grad_norm": 2.161681921245051, "kl_div": 0.007364877033978701, "kl_div_sft": 0.007364877033978701, "learning_rate": 1.0114192495921696e-06, "loss": -0.236, "sft_loss": 0.04366494342684746, "step": 617 }, { "epoch": 1.5, "grad_norm": 1.6888717444843546, "importance_ratio": 0.63671875, "kl_div": -0.23458042740821838, "kl_div_neg": -0.4501579999923706, "kl_div_sft": -0.019002839922904968, "learning_rate": 1.0097879282218598e-06, "loss": -0.0109, "ppo_loss": 0.800000011920929, "sft_loss": 0.11033762991428375, "step": 618 }, { "epoch": 1.5, "grad_norm": 1.202515782274504, "importance_ratio": 1.0078125, "kl_div": 0.015978161245584488, "kl_div_pos": 0.010856712237000465, "kl_div_sft": 0.02109961025416851, "learning_rate": 1.0081566068515496e-06, "loss": -0.0209, "ppo_loss": -1.0109158754348755, "sft_loss": 0.051598433405160904, "step": 619 }, { "epoch": 1.5, "grad_norm": 2.29203733278664, "importance_ratio": 0.81640625, "kl_div": -0.14957204461097717, "kl_div_neg": -0.20417694747447968, "kl_div_sft": -0.09496712684631348, "learning_rate": 1.0065252854812398e-06, "loss": 0.0295, "ppo_loss": 0.8153181076049805, "sft_loss": 0.14580783247947693, "step": 620 }, { "epoch": 1.51, "grad_norm": 1.6904680500051987, "kl_div": -0.009111708030104637, "kl_div_sft": -0.009111708030104637, "learning_rate": 1.0048939641109298e-06, "loss": -0.0565, "sft_loss": 0.07915902137756348, "step": 621 }, { "epoch": 1.51, "grad_norm": 3.122635824457841, "importance_ratio": 0.8515625, "kl_div": -0.1739480197429657, "kl_div_neg": -0.35731154680252075, "kl_div_pos": 0.00941550638526678, "learning_rate": 1.00326264274062e-06, "loss": -0.0549, "ppo_loss": -0.10472998023033142, "step": 622 }, { "epoch": 1.51, "grad_norm": 1.436956717617787, "kl_div": 0.005096603184938431, "kl_div_sft": 0.005096603184938431, "learning_rate": 1.0016313213703098e-06, "loss": -0.0869, "sft_loss": 0.09290395677089691, "step": 623 }, { "epoch": 1.51, "grad_norm": 0.8033987669657294, "importance_ratio": 1.046875, "kl_div": 0.024920709431171417, "kl_div_pos": 0.04217958450317383, "kl_div_sft": 0.007661834824830294, "learning_rate": 1e-06, "loss": 0.0153, "ppo_loss": -1.0430817604064941, "sft_loss": 0.040020450949668884, "step": 624 }, { "epoch": 1.52, "grad_norm": 1.0643403999766239, "kl_div": 0.00011259526945650578, "kl_div_sft": 0.00011259526945650578, "learning_rate": 9.9836867862969e-07, "loss": 0.0686, "sft_loss": 0.0579143762588501, "step": 625 }, { "epoch": 1.52, "grad_norm": 3.7411073149069636, "importance_ratio": 0.96484375, "kl_div": -0.013933196663856506, "kl_div_pos": -0.03640758991241455, "kl_div_sft": 0.008541197516024113, "learning_rate": 9.9673735725938e-07, "loss": -0.0891, "ppo_loss": -0.9642472267150879, "sft_loss": 0.02761666662991047, "step": 626 }, { "epoch": 1.52, "grad_norm": 6.6967108211071205, "importance_ratio": 0.9921875, "kl_div": -0.006364609580487013, "kl_div_pos": -0.006364609580487013, "learning_rate": 9.951060358890701e-07, "loss": 0.0462, "ppo_loss": -0.9936584234237671, "step": 627 }, { "epoch": 1.52, "grad_norm": 1.2668617234289494, "importance_ratio": 0.8828125, "kl_div": -0.1370735913515091, "kl_div_neg": -0.2952595353126526, "kl_div_pos": 0.021112343296408653, "learning_rate": 9.9347471451876e-07, "loss": 0.0278, "ppo_loss": -0.11066839098930359, "step": 628 }, { "epoch": 1.52, "grad_norm": 1.7302028890941692, "importance_ratio": 1.0, "kl_div": -0.012814854271709919, "kl_div_pos": 0.00020324558136053383, "kl_div_sft": -0.025832954794168472, "learning_rate": 9.918433931484503e-07, "loss": -0.0514, "ppo_loss": -1.000203251838684, "sft_loss": 0.10578396916389465, "step": 629 }, { "epoch": 1.53, "grad_norm": 0.9105489972194977, "importance_ratio": 0.71484375, "kl_div": -0.16649454832077026, "kl_div_neg": -0.3332861065864563, "kl_div_sft": 0.00029701562016271055, "learning_rate": 9.902120717781403e-07, "loss": -0.1376, "ppo_loss": 0.800000011920929, "sft_loss": 0.08367674052715302, "step": 630 }, { "epoch": 1.53, "grad_norm": 1.3256530196175094, "importance_ratio": 1.03125, "kl_div": 0.027810772880911827, "kl_div_pos": 0.027810772880911827, "learning_rate": 9.885807504078303e-07, "loss": -0.0227, "ppo_loss": -1.0282015800476074, "step": 631 }, { "epoch": 1.53, "grad_norm": 2.5581641382522244, "importance_ratio": 0.8125, "kl_div": -0.2111785113811493, "kl_div_neg": -0.2111785113811493, "learning_rate": 9.869494290375203e-07, "loss": 0.1847, "ppo_loss": 0.8429823517799377, "step": 632 }, { "epoch": 1.53, "grad_norm": 1.4855555608085709, "importance_ratio": 1.015625, "kl_div": 0.014578155241906643, "kl_div_pos": 0.012926433235406876, "kl_div_sft": 0.01622987724840641, "learning_rate": 9.853181076672104e-07, "loss": -0.1038, "ppo_loss": -1.0130102634429932, "sft_loss": 0.03075617365539074, "step": 633 }, { "epoch": 1.54, "grad_norm": 0.9026682216648737, "importance_ratio": 0.76953125, "kl_div": -0.1260625720024109, "kl_div_neg": -0.26258131861686707, "kl_div_sft": 0.010456175543367863, "learning_rate": 9.836867862969004e-07, "loss": 0.0844, "ppo_loss": 0.800000011920929, "sft_loss": 0.04138074815273285, "step": 634 }, { "epoch": 1.54, "grad_norm": 2.996379575064102, "importance_ratio": 1.0546875, "kl_div": 0.025566959753632545, "kl_div_pos": 0.056610144674777985, "kl_div_sft": -0.005476226564496756, "learning_rate": 9.820554649265906e-07, "loss": 0.1201, "ppo_loss": -1.0582431554794312, "sft_loss": 0.09242013841867447, "step": 635 }, { "epoch": 1.54, "grad_norm": 1.4259640929154063, "importance_ratio": 0.80859375, "kl_div": -0.09104778617620468, "kl_div_neg": -0.21008449792861938, "kl_div_sft": 0.027988923713564873, "learning_rate": 9.804241435562806e-07, "loss": -0.0378, "ppo_loss": 0.810515820980072, "sft_loss": 0.023999352008104324, "step": 636 }, { "epoch": 1.54, "grad_norm": 1.497605717641765, "importance_ratio": 0.80859375, "kl_div": -0.11556511372327805, "kl_div_neg": -0.21007847785949707, "kl_div_sft": -0.021051747724413872, "learning_rate": 9.787928221859706e-07, "loss": -0.1009, "ppo_loss": 0.8105207085609436, "sft_loss": 0.0773676410317421, "step": 637 }, { "epoch": 1.55, "grad_norm": 1.6909906524273515, "importance_ratio": 1.0, "kl_div": 0.0005287877283990383, "kl_div_neg": 0.0076847923919558525, "kl_div_pos": -0.006627216935157776, "learning_rate": 9.771615008156606e-07, "loss": -0.1343, "ppo_loss": 0.00715985894203186, "step": 638 }, { "epoch": 1.55, "grad_norm": 1.33005769016516, "importance_ratio": 1.015625, "kl_div": 0.019131600856781006, "kl_div_pos": 0.019131600856781006, "learning_rate": 9.755301794453506e-07, "loss": -0.0472, "ppo_loss": -1.0193370580673218, "step": 639 }, { "epoch": 1.55, "grad_norm": 2.4557192745933176, "importance_ratio": 0.71484375, "kl_div": -0.16165408492088318, "kl_div_neg": -0.3333815932273865, "kl_div_sft": 0.010073418729007244, "learning_rate": 9.738988580750408e-07, "loss": -0.0718, "ppo_loss": 0.800000011920929, "sft_loss": 0.04032623767852783, "step": 640 }, { "epoch": 1.55, "grad_norm": 5.33474776267878, "importance_ratio": 0.7421875, "kl_div": -0.1424480825662613, "kl_div_neg": -0.300345242023468, "kl_div_sft": 0.015449062921106815, "learning_rate": 9.722675367047307e-07, "loss": 0.0681, "ppo_loss": 0.800000011920929, "sft_loss": 0.053471680730581284, "step": 641 }, { "epoch": 1.56, "grad_norm": 5.099826488161199, "importance_ratio": 0.81640625, "kl_div": -0.22648264467716217, "kl_div_neg": -0.45588648319244385, "kl_div_pos": 0.0029212054796516895, "learning_rate": 9.70636215334421e-07, "loss": -0.0518, "ppo_loss": -0.10146275162696838, "step": 642 }, { "epoch": 1.56, "grad_norm": 3.907806330587893, "importance_ratio": 0.625, "kl_div": -0.2288985550403595, "kl_div_neg": -0.46892765164375305, "kl_div_sft": 0.01113053783774376, "learning_rate": 9.69004893964111e-07, "loss": 0.0213, "ppo_loss": 0.800000011920929, "sft_loss": 0.023388583213090897, "step": 643 }, { "epoch": 1.56, "grad_norm": 1.3800678493571776, "kl_div": 0.013283468782901764, "kl_div_sft": 0.013283468782901764, "learning_rate": 9.67373572593801e-07, "loss": -0.1028, "sft_loss": 0.058054424822330475, "step": 644 }, { "epoch": 1.56, "grad_norm": 1.2509233878847967, "importance_ratio": 0.59765625, "kl_div": -0.24729475378990173, "kl_div_neg": -0.5174387693405151, "kl_div_sft": 0.02284926176071167, "learning_rate": 9.657422512234909e-07, "loss": -0.0466, "ppo_loss": 0.800000011920929, "sft_loss": 0.026714660227298737, "step": 645 }, { "epoch": 1.57, "grad_norm": 5.377815928380768, "kl_div": -0.08838597685098648, "kl_div_sft": -0.08838597685098648, "learning_rate": 9.64110929853181e-07, "loss": -0.0711, "sft_loss": 0.14923830330371857, "step": 646 }, { "epoch": 1.57, "grad_norm": 1.5657150760405996, "importance_ratio": 0.87890625, "kl_div": -0.05644303932785988, "kl_div_neg": -0.12692682445049286, "kl_div_sft": 0.014040743932127953, "learning_rate": 9.62479608482871e-07, "loss": -0.1483, "ppo_loss": 0.8807981014251709, "sft_loss": 0.020476870238780975, "step": 647 }, { "epoch": 1.57, "grad_norm": 1.4235593440043515, "importance_ratio": 0.53515625, "kl_div": -0.3175111711025238, "kl_div_neg": -0.6287727952003479, "kl_div_sft": -0.006249555852264166, "learning_rate": 9.608482871125613e-07, "loss": -0.1587, "ppo_loss": 0.800000011920929, "sft_loss": 0.054232582449913025, "step": 648 }, { "epoch": 1.57, "grad_norm": 1.3064368848946577, "kl_div": -0.030326243489980698, "kl_div_sft": -0.030326243489980698, "learning_rate": 9.592169657422513e-07, "loss": -0.0821, "sft_loss": 0.07004794478416443, "step": 649 }, { "epoch": 1.58, "grad_norm": 5.415139711784409, "importance_ratio": 1.0078125, "kl_div": 0.01519959606230259, "kl_div_pos": 0.006254679523408413, "kl_div_sft": 0.02414451353251934, "learning_rate": 9.575856443719412e-07, "loss": 0.0018, "ppo_loss": -1.0062743425369263, "sft_loss": 0.031017892062664032, "step": 650 }, { "epoch": 1.58, "grad_norm": 1.4936301562384438, "importance_ratio": 0.83203125, "kl_div": -0.08170606940984726, "kl_div_neg": -0.18414528667926788, "kl_div_sft": 0.020733144134283066, "learning_rate": 9.559543230016312e-07, "loss": -0.1336, "ppo_loss": 0.8318149447441101, "sft_loss": 0.20881670713424683, "step": 651 }, { "epoch": 1.58, "grad_norm": 1.340476239894152, "importance_ratio": 1.0078125, "kl_div": 0.007561908569186926, "kl_div_neg": 0.001748603186570108, "kl_div_pos": 0.01337521430104971, "learning_rate": 9.543230016313212e-07, "loss": 0.0364, "ppo_loss": -0.0058574676513671875, "step": 652 }, { "epoch": 1.58, "grad_norm": 7.975741800948087, "importance_ratio": 0.4140625, "kl_div": -0.44976139068603516, "kl_div_neg": -0.8810076117515564, "kl_div_sft": -0.018515175208449364, "learning_rate": 9.526916802610113e-07, "loss": -0.0943, "ppo_loss": 0.800000011920929, "sft_loss": 0.08849439769983292, "step": 653 }, { "epoch": 1.59, "grad_norm": 2.3740506533495003, "kl_div": 0.011756137013435364, "kl_div_sft": 0.011756137013435364, "learning_rate": 9.510603588907015e-07, "loss": -0.0646, "sft_loss": 0.04911510646343231, "step": 654 }, { "epoch": 1.59, "grad_norm": 2.0039720431526855, "kl_div": 0.009900109842419624, "kl_div_sft": 0.009900109842419624, "learning_rate": 9.494290375203915e-07, "loss": 0.0081, "sft_loss": 0.07109043002128601, "step": 655 }, { "epoch": 1.59, "grad_norm": 1.9808406443040367, "kl_div": -0.0021666795946657658, "kl_div_sft": -0.0021666795946657658, "learning_rate": 9.477977161500816e-07, "loss": -0.0193, "sft_loss": 0.052859771996736526, "step": 656 }, { "epoch": 1.59, "grad_norm": 3.5823483259167084, "importance_ratio": 0.8359375, "kl_div": -0.18824321031570435, "kl_div_neg": -0.31431934237480164, "kl_div_pos": -0.062167081981897354, "learning_rate": 9.461663947797715e-07, "loss": 0.0357, "ppo_loss": -0.06986293196678162, "step": 657 }, { "epoch": 1.6, "grad_norm": 2.1106985955119675, "importance_ratio": 0.98828125, "kl_div": -0.02020912803709507, "kl_div_pos": -0.012419541366398335, "kl_div_sft": -0.02799871563911438, "learning_rate": 9.445350734094616e-07, "loss": -0.0225, "ppo_loss": -0.9876572489738464, "sft_loss": 0.17996813356876373, "step": 658 }, { "epoch": 1.6, "grad_norm": 1.2809024432434561, "importance_ratio": 1.015625, "kl_div": 0.017215736210346222, "kl_div_pos": 0.017215736210346222, "learning_rate": 9.429037520391516e-07, "loss": -0.0194, "ppo_loss": -1.0175464153289795, "step": 659 }, { "epoch": 1.6, "grad_norm": 2.0085101270090355, "kl_div": 0.0152102280408144, "kl_div_sft": 0.0152102280408144, "learning_rate": 9.412724306688418e-07, "loss": 0.0802, "sft_loss": 0.07296903431415558, "step": 660 }, { "epoch": 1.6, "grad_norm": 1.3454928960594914, "importance_ratio": 1.0390625, "kl_div": 0.030548464506864548, "kl_div_pos": 0.040144648402929306, "kl_div_sft": 0.02095227874815464, "learning_rate": 9.396411092985318e-07, "loss": -0.1293, "ppo_loss": -1.0409613847732544, "sft_loss": 0.0516112744808197, "step": 661 }, { "epoch": 1.6, "grad_norm": 1.159335992778984, "importance_ratio": 1.03125, "kl_div": 0.008757232688367367, "kl_div_pos": 0.03196336701512337, "kl_div_sft": -0.014448901638388634, "learning_rate": 9.380097879282218e-07, "loss": -0.0229, "ppo_loss": -1.0324797630310059, "sft_loss": 0.06966232508420944, "step": 662 }, { "epoch": 1.61, "grad_norm": 1.018373848212684, "importance_ratio": 1.0078125, "kl_div": 0.014435206539928913, "kl_div_pos": 0.010820697993040085, "kl_div_sft": 0.01804971508681774, "learning_rate": 9.363784665579119e-07, "loss": -0.0568, "ppo_loss": -1.0108795166015625, "sft_loss": 0.035217855125665665, "step": 663 }, { "epoch": 1.61, "grad_norm": 1.312975661052913, "importance_ratio": 0.9453125, "kl_div": -0.06321582943201065, "kl_div_neg": -0.17330032587051392, "kl_div_pos": 0.046868663281202316, "learning_rate": 9.347471451876019e-07, "loss": -0.0804, "ppo_loss": -0.10354965925216675, "step": 664 }, { "epoch": 1.61, "grad_norm": 1.2861361492695895, "importance_ratio": 0.9609375, "kl_div": -0.014686057344079018, "kl_div_neg": -0.04050711914896965, "kl_div_sft": 0.011135004460811615, "learning_rate": 9.33115823817292e-07, "loss": -0.058, "ppo_loss": 0.9603022933006287, "sft_loss": 0.04753648489713669, "step": 665 }, { "epoch": 1.61, "grad_norm": 1.4443818266247967, "importance_ratio": 0.859375, "kl_div": -0.07007308304309845, "kl_div_pos": -0.15025857090950012, "kl_div_sft": 0.01011241041123867, "learning_rate": 9.31484502446982e-07, "loss": -0.1818, "ppo_loss": -0.8604854345321655, "sft_loss": 0.07846605777740479, "step": 666 }, { "epoch": 1.62, "grad_norm": 1.0413113115080765, "kl_div": 0.013836363330483437, "kl_div_sft": 0.013836363330483437, "learning_rate": 9.298531810766721e-07, "loss": -0.0525, "sft_loss": 0.05567178130149841, "step": 667 }, { "epoch": 1.62, "grad_norm": 1.3742326415868675, "importance_ratio": 0.9453125, "kl_div": -0.022275906056165695, "kl_div_pos": -0.05634089559316635, "kl_div_sft": 0.011789082549512386, "learning_rate": 9.282218597063621e-07, "loss": -0.0949, "ppo_loss": -0.9452168941497803, "sft_loss": 0.11352340877056122, "step": 668 }, { "epoch": 1.62, "grad_norm": 1.3628873501949048, "importance_ratio": 0.90625, "kl_div": -0.10156269371509552, "kl_div_neg": -0.2054397463798523, "kl_div_pos": 0.0023143519647419453, "learning_rate": 9.265905383360522e-07, "loss": 0.0061, "ppo_loss": -0.09401395916938782, "step": 669 }, { "epoch": 1.62, "grad_norm": 3.272854190831507, "importance_ratio": 1.0078125, "kl_div": 0.003968629986047745, "kl_div_neg": -0.021340366452932358, "kl_div_pos": 0.029277626425027847, "learning_rate": 9.249592169657422e-07, "loss": -0.1134, "ppo_loss": -0.02541235089302063, "step": 670 }, { "epoch": 1.63, "grad_norm": 3.120346571918228, "kl_div": 0.009733067825436592, "kl_div_sft": 0.009733067825436592, "learning_rate": 9.233278955954323e-07, "loss": -0.0521, "sft_loss": 0.04392252117395401, "step": 671 }, { "epoch": 1.63, "grad_norm": 1.1634629516932495, "kl_div": 0.007590790279209614, "kl_div_sft": 0.007590790279209614, "learning_rate": 9.216965742251223e-07, "loss": -0.027, "sft_loss": 0.05298725515604019, "step": 672 }, { "epoch": 1.63, "grad_norm": 1.0945023265341634, "importance_ratio": 0.76171875, "kl_div": -0.136201411485672, "kl_div_neg": -0.2729892134666443, "kl_div_sft": 0.0005863768747076392, "learning_rate": 9.200652528548124e-07, "loss": -0.0103, "ppo_loss": 0.800000011920929, "sft_loss": 0.11422469466924667, "step": 673 }, { "epoch": 1.63, "grad_norm": 1.3032575526162251, "kl_div": 0.021982582286000252, "kl_div_sft": 0.021982582286000252, "learning_rate": 9.184339314845024e-07, "loss": 0.0219, "sft_loss": 0.04211244732141495, "step": 674 }, { "epoch": 1.64, "grad_norm": 2.6326199696040273, "importance_ratio": 1.03125, "kl_div": 0.01889719069004059, "kl_div_pos": 0.027834882959723473, "kl_div_sft": 0.00995949748903513, "learning_rate": 9.168026101141924e-07, "loss": 0.0839, "ppo_loss": -1.0282258987426758, "sft_loss": 0.026849152520298958, "step": 675 }, { "epoch": 1.64, "grad_norm": 1.2263257715455744, "importance_ratio": 0.8671875, "kl_div": -0.156332328915596, "kl_div_neg": -0.31329450011253357, "kl_div_pos": 0.0006298309890553355, "learning_rate": 9.151712887438825e-07, "loss": -0.1005, "ppo_loss": -0.10031500458717346, "step": 676 }, { "epoch": 1.64, "grad_norm": 1.1389040571200293, "importance_ratio": 0.96875, "kl_div": -0.030221477150917053, "kl_div_pos": -0.030221477150917053, "learning_rate": 9.135399673735725e-07, "loss": -0.1023, "ppo_loss": -0.9703148603439331, "step": 677 }, { "epoch": 1.64, "grad_norm": 2.1273286065926076, "kl_div": 0.0077500175684690475, "kl_div_sft": 0.0077500175684690475, "learning_rate": 9.119086460032626e-07, "loss": 0.0415, "sft_loss": 0.05046243965625763, "step": 678 }, { "epoch": 1.65, "grad_norm": 1.3444536717591244, "kl_div": 0.0037313615903258324, "kl_div_sft": 0.0037313615903258324, "learning_rate": 9.102773246329527e-07, "loss": -0.0019, "sft_loss": 0.03707285225391388, "step": 679 }, { "epoch": 1.65, "grad_norm": 2.2594354729585935, "importance_ratio": 1.015625, "kl_div": 0.007647077552974224, "kl_div_pos": 0.016760453581809998, "kl_div_sft": -0.0014662984758615494, "learning_rate": 9.086460032626428e-07, "loss": 0.0399, "ppo_loss": -1.0169017314910889, "sft_loss": 0.07068877667188644, "step": 680 }, { "epoch": 1.65, "grad_norm": 2.7407975111008636, "importance_ratio": 0.91796875, "kl_div": -0.051950324326753616, "kl_div_neg": -0.0857582613825798, "kl_div_sft": -0.01814238540828228, "learning_rate": 9.070146818923328e-07, "loss": 0.0222, "ppo_loss": 0.9178161025047302, "sft_loss": 0.05306997522711754, "step": 681 }, { "epoch": 1.65, "grad_norm": 1.200354172177631, "importance_ratio": 0.69921875, "kl_div": -0.17709381878376007, "kl_div_neg": -0.3601168096065521, "kl_div_sft": 0.005929179489612579, "learning_rate": 9.053833605220228e-07, "loss": 0.0515, "ppo_loss": 0.800000011920929, "sft_loss": 0.034444693475961685, "step": 682 }, { "epoch": 1.66, "grad_norm": 2.1378046097572883, "importance_ratio": 0.765625, "kl_div": -0.1365443766117096, "kl_div_pos": -0.26465514302253723, "kl_div_sft": -0.00843361672013998, "learning_rate": 9.037520391517128e-07, "loss": 0.0251, "ppo_loss": -0.7674705386161804, "sft_loss": 0.13605627417564392, "step": 683 }, { "epoch": 1.66, "grad_norm": 2.50043807959821, "importance_ratio": 1.0, "kl_div": 0.009185480885207653, "kl_div_pos": 0.0008363473461940885, "kl_div_sft": 0.017534613609313965, "learning_rate": 9.021207177814028e-07, "loss": -0.2217, "ppo_loss": -1.000836730003357, "sft_loss": 0.031047571450471878, "step": 684 }, { "epoch": 1.66, "grad_norm": 1.0978447694539437, "importance_ratio": 0.72265625, "kl_div": -0.15691974759101868, "kl_div_neg": -0.325582891702652, "kl_div_sft": 0.011743384413421154, "learning_rate": 9.004893964110929e-07, "loss": 0.139, "ppo_loss": 0.800000011920929, "sft_loss": 0.05710052326321602, "step": 685 }, { "epoch": 1.66, "grad_norm": 1.5917475967736583, "importance_ratio": 0.9921875, "kl_div": -0.007733147591352463, "kl_div_pos": -0.008901979774236679, "kl_div_sft": -0.006564315874129534, "learning_rate": 8.98858075040783e-07, "loss": -0.0465, "ppo_loss": -0.9911375045776367, "sft_loss": 0.058706916868686676, "step": 686 }, { "epoch": 1.67, "grad_norm": 2.629120000393848, "importance_ratio": 0.96484375, "kl_div": -0.03718624636530876, "kl_div_neg": -0.09633693844079971, "kl_div_pos": 0.02196444384753704, "learning_rate": 8.972267536704731e-07, "loss": -0.1139, "ppo_loss": -0.057024747133255005, "step": 687 }, { "epoch": 1.67, "grad_norm": 1.1114538352944412, "importance_ratio": 0.7265625, "kl_div": -0.1579190343618393, "kl_div_neg": -0.31698939204216003, "kl_div_sft": 0.0011513223871588707, "learning_rate": 8.955954323001631e-07, "loss": 0.011, "ppo_loss": 0.800000011920929, "sft_loss": 0.040994782000780106, "step": 688 }, { "epoch": 1.67, "grad_norm": 0.9097634642638004, "importance_ratio": 1.03125, "kl_div": 0.013047357089817524, "kl_div_pos": 0.02883242629468441, "kl_div_sft": -0.0027377125807106495, "learning_rate": 8.939641109298532e-07, "loss": 0.1438, "ppo_loss": -1.029252052307129, "sft_loss": 0.09796261787414551, "step": 689 }, { "epoch": 1.67, "grad_norm": 1.6195585404050543, "importance_ratio": 0.96484375, "kl_div": -0.0072251176461577415, "kl_div_neg": -0.037326835095882416, "kl_div_sft": 0.022876599803566933, "learning_rate": 8.923327895595431e-07, "loss": -0.1642, "ppo_loss": 0.9633612632751465, "sft_loss": 0.061763886362314224, "step": 690 }, { "epoch": 1.68, "grad_norm": 1.4068703025085305, "importance_ratio": 1.0390625, "kl_div": 0.020885484293103218, "kl_div_pos": 0.0366450771689415, "kl_div_sft": 0.005125890951603651, "learning_rate": 8.907014681892332e-07, "loss": -0.1869, "ppo_loss": -1.0373247861862183, "sft_loss": 0.029581492766737938, "step": 691 }, { "epoch": 1.68, "grad_norm": 1.3389147326418513, "kl_div": 0.006500979885458946, "kl_div_sft": 0.006500979885458946, "learning_rate": 8.890701468189233e-07, "loss": 0.0491, "sft_loss": 0.04796748608350754, "step": 692 }, { "epoch": 1.68, "grad_norm": 1.8068521571686462, "kl_div": -0.008079552091658115, "kl_div_sft": -0.008079552091658115, "learning_rate": 8.874388254486134e-07, "loss": 0.0581, "sft_loss": 0.05922364443540573, "step": 693 }, { "epoch": 1.68, "grad_norm": 2.0603291039395932, "importance_ratio": 0.890625, "kl_div": -0.1223197802901268, "kl_div_neg": -0.24422934651374817, "kl_div_pos": -0.0004102161037735641, "learning_rate": 8.858075040783034e-07, "loss": -0.0071, "ppo_loss": -0.09979492425918579, "step": 694 }, { "epoch": 1.68, "grad_norm": 2.567755171920564, "importance_ratio": 0.96875, "kl_div": -0.03187120705842972, "kl_div_pos": -0.030253032222390175, "kl_div_sft": -0.03348938003182411, "learning_rate": 8.841761827079935e-07, "loss": 0.0119, "ppo_loss": -0.9701999425888062, "sft_loss": 0.1174488514661789, "step": 695 }, { "epoch": 1.69, "grad_norm": 1.9102247729034183, "importance_ratio": 0.7890625, "kl_div": -0.10715979337692261, "kl_div_neg": -0.2363295704126358, "kl_div_sft": 0.022009991109371185, "learning_rate": 8.825448613376835e-07, "loss": 0.1096, "ppo_loss": 0.800000011920929, "sft_loss": 0.029377589002251625, "step": 696 }, { "epoch": 1.69, "grad_norm": 1.4168552198404951, "kl_div": 0.016493186354637146, "kl_div_sft": 0.016493186354637146, "learning_rate": 8.809135399673735e-07, "loss": -0.0076, "sft_loss": 0.038798198103904724, "step": 697 }, { "epoch": 1.69, "grad_norm": 1.2051259035997948, "importance_ratio": 0.99609375, "kl_div": 0.00634672399610281, "kl_div_pos": -0.005619870498776436, "kl_div_sft": 0.018313318490982056, "learning_rate": 8.792822185970635e-07, "loss": 0.0683, "ppo_loss": -0.994395911693573, "sft_loss": 0.021961260586977005, "step": 698 }, { "epoch": 1.69, "grad_norm": 1.9214684281188614, "importance_ratio": 0.80078125, "kl_div": -0.1102023497223854, "kl_div_neg": -0.2217598259449005, "kl_div_sft": 0.0013551327865570784, "learning_rate": 8.776508972267536e-07, "loss": -0.0939, "ppo_loss": 0.8011077642440796, "sft_loss": 0.0796297937631607, "step": 699 }, { "epoch": 1.7, "grad_norm": 1.5797531491607557, "importance_ratio": 0.84375, "kl_div": -0.16946262121200562, "kl_div_neg": -0.16946262121200562, "learning_rate": 8.760195758564437e-07, "loss": -0.2183, "ppo_loss": 0.8442213535308838, "step": 700 }, { "epoch": 1.7, "grad_norm": 0.9745875721176449, "importance_ratio": 1.015625, "kl_div": 0.01686589978635311, "kl_div_pos": 0.01686589978635311, "learning_rate": 8.743882544861337e-07, "loss": -0.1424, "ppo_loss": -1.0171172618865967, "step": 701 }, { "epoch": 1.7, "grad_norm": 1.0281778872722713, "importance_ratio": 0.734375, "kl_div": -0.1530963033437729, "kl_div_neg": -0.31053248047828674, "kl_div_sft": 0.004339868202805519, "learning_rate": 8.727569331158238e-07, "loss": 0.0715, "ppo_loss": 0.800000011920929, "sft_loss": 0.13882605731487274, "step": 702 }, { "epoch": 1.7, "grad_norm": 1.2211496741826067, "importance_ratio": 0.8046875, "kl_div": -0.24385526776313782, "kl_div_neg": -0.4737330973148346, "kl_div_pos": -0.013977449387311935, "learning_rate": 8.711256117455138e-07, "loss": -0.0602, "ppo_loss": -0.09305989742279053, "step": 703 }, { "epoch": 1.71, "grad_norm": 1.7288146369197956, "importance_ratio": 0.859375, "kl_div": -0.1643335372209549, "kl_div_neg": -0.31171128153800964, "kl_div_pos": -0.016955794766545296, "learning_rate": 8.694942903752039e-07, "loss": 0.0194, "ppo_loss": -0.09159353375434875, "step": 704 }, { "epoch": 1.71, "grad_norm": 1.391555217841228, "importance_ratio": 0.859375, "kl_div": -0.1633143424987793, "kl_div_neg": -0.2973756492137909, "kl_div_pos": -0.029253020882606506, "learning_rate": 8.67862969004894e-07, "loss": -0.0563, "ppo_loss": -0.08558535575866699, "step": 705 }, { "epoch": 1.71, "grad_norm": 0.925779576038505, "importance_ratio": 0.8984375, "kl_div": -0.12373246997594833, "kl_div_neg": -0.29810261726379395, "kl_div_pos": 0.050637681037187576, "learning_rate": 8.66231647634584e-07, "loss": -0.0361, "ppo_loss": -0.12597081065177917, "step": 706 }, { "epoch": 1.71, "grad_norm": 1.6802199828218412, "importance_ratio": 0.9921875, "kl_div": -0.006216323934495449, "kl_div_pos": -0.007722379639744759, "kl_div_sft": -0.0047102682292461395, "learning_rate": 8.64600326264274e-07, "loss": -0.0714, "ppo_loss": -0.9923073649406433, "sft_loss": 0.14663028717041016, "step": 707 }, { "epoch": 1.72, "grad_norm": 4.493721290464836, "kl_div": 0.01159009337425232, "kl_div_sft": 0.01159009337425232, "learning_rate": 8.629690048939641e-07, "loss": 0.0042, "sft_loss": 0.040383607149124146, "step": 708 }, { "epoch": 1.72, "grad_norm": 1.1044778325423805, "importance_ratio": 0.76953125, "kl_div": -0.12625326216220856, "kl_div_neg": -0.26072871685028076, "kl_div_sft": 0.008222181349992752, "learning_rate": 8.613376835236541e-07, "loss": -0.1057, "ppo_loss": 0.800000011920929, "sft_loss": 0.049650732427835464, "step": 709 }, { "epoch": 1.72, "grad_norm": 1.8488862216122248, "kl_div": 0.011091427877545357, "kl_div_sft": 0.011091427877545357, "learning_rate": 8.597063621533441e-07, "loss": -0.023, "sft_loss": 0.06612934917211533, "step": 710 }, { "epoch": 1.72, "grad_norm": 1.3987795448762412, "importance_ratio": 0.98046875, "kl_div": -0.004307438153773546, "kl_div_pos": -0.021483710035681725, "kl_div_sft": 0.012868833728134632, "learning_rate": 8.580750407830342e-07, "loss": -0.0557, "ppo_loss": -0.9787454605102539, "sft_loss": 0.007901977747678757, "step": 711 }, { "epoch": 1.73, "grad_norm": 1.7901759784653886, "kl_div": -0.007937498390674591, "kl_div_sft": -0.007937498390674591, "learning_rate": 8.564437194127243e-07, "loss": -0.0439, "sft_loss": 0.1107691079378128, "step": 712 }, { "epoch": 1.73, "grad_norm": 3.1402156430200727, "kl_div": 0.008624709211289883, "kl_div_sft": 0.008624709211289883, "learning_rate": 8.548123980424144e-07, "loss": 0.0598, "sft_loss": 0.05024949088692665, "step": 713 }, { "epoch": 1.73, "grad_norm": 1.5056068218522578, "kl_div": -0.0017463699914515018, "kl_div_sft": -0.0017463699914515018, "learning_rate": 8.531810766721043e-07, "loss": -0.1127, "sft_loss": 0.0635661706328392, "step": 714 }, { "epoch": 1.73, "grad_norm": 1.7111928392202491, "importance_ratio": 0.953125, "kl_div": -0.051379989832639694, "kl_div_pos": -0.051379989832639694, "learning_rate": 8.515497553017944e-07, "loss": -0.0515, "ppo_loss": -0.953635573387146, "step": 715 }, { "epoch": 1.74, "grad_norm": 2.907991277332683, "kl_div": -0.0014508292078971863, "kl_div_sft": -0.0014508292078971863, "learning_rate": 8.499184339314844e-07, "loss": -0.0116, "sft_loss": 0.08951911330223083, "step": 716 }, { "epoch": 1.74, "grad_norm": 1.0871951278765424, "kl_div": 0.011151503771543503, "kl_div_sft": 0.011151503771543503, "learning_rate": 8.482871125611745e-07, "loss": -0.1067, "sft_loss": 0.035712748765945435, "step": 717 }, { "epoch": 1.74, "grad_norm": 2.998019200299793, "importance_ratio": 1.046875, "kl_div": 0.042760252952575684, "kl_div_pos": 0.042760252952575684, "learning_rate": 8.466557911908646e-07, "loss": -0.0357, "ppo_loss": -1.0436962842941284, "step": 718 }, { "epoch": 1.74, "grad_norm": 1.0650319616688604, "importance_ratio": 0.97265625, "kl_div": -0.029345382004976273, "kl_div_pos": -0.028837639838457108, "kl_div_sft": -0.029853124171495438, "learning_rate": 8.450244698205547e-07, "loss": -0.1348, "ppo_loss": -0.9715741872787476, "sft_loss": 0.09074969589710236, "step": 719 }, { "epoch": 1.75, "grad_norm": 2.3564983287653996, "kl_div": 0.011116044595837593, "kl_div_sft": 0.011116044595837593, "learning_rate": 8.433931484502447e-07, "loss": -0.1878, "sft_loss": 0.038140468299388885, "step": 720 }, { "epoch": 1.75, "grad_norm": 1.3253419776665938, "importance_ratio": 0.953125, "kl_div": -0.047147031873464584, "kl_div_neg": -0.09410122781991959, "kl_div_pos": -0.0001928337151184678, "learning_rate": 8.417618270799347e-07, "loss": -0.0887, "ppo_loss": -0.044808268547058105, "step": 721 }, { "epoch": 1.75, "grad_norm": 1.0211129032781585, "kl_div": 0.012244272977113724, "kl_div_sft": 0.012244272977113724, "learning_rate": 8.401305057096247e-07, "loss": -0.0017, "sft_loss": 0.09197733551263809, "step": 722 }, { "epoch": 1.75, "grad_norm": 0.9798829657874045, "kl_div": -0.0023552451748400927, "kl_div_sft": -0.0023552451748400927, "learning_rate": 8.384991843393147e-07, "loss": -0.0282, "sft_loss": 0.06323568522930145, "step": 723 }, { "epoch": 1.76, "grad_norm": 1.1345348024910273, "importance_ratio": 0.6796875, "kl_div": -0.39538222551345825, "kl_div_neg": -0.550060510635376, "kl_div_pos": -0.24070391058921814, "learning_rate": 8.368678629690048e-07, "loss": -0.0131, "ppo_loss": 0.006962805986404419, "step": 724 }, { "epoch": 1.76, "grad_norm": 4.913288291990402, "importance_ratio": 0.75, "kl_div": -0.14327985048294067, "kl_div_neg": -0.28823381662368774, "kl_div_sft": 0.0016741086728870869, "learning_rate": 8.352365415986949e-07, "loss": 0.0042, "ppo_loss": 0.800000011920929, "sft_loss": 0.0790887251496315, "step": 725 }, { "epoch": 1.76, "grad_norm": 1.1523942322099083, "importance_ratio": 1.015625, "kl_div": 0.0058078221045434475, "kl_div_pos": 0.012630930170416832, "kl_div_sft": -0.0010152860777452588, "learning_rate": 8.33605220228385e-07, "loss": -0.1444, "ppo_loss": -1.0127110481262207, "sft_loss": 0.059882428497076035, "step": 726 }, { "epoch": 1.76, "grad_norm": 1.1735775596041245, "kl_div": -0.030080385506153107, "kl_div_sft": -0.030080385506153107, "learning_rate": 8.31973898858075e-07, "loss": -0.0156, "sft_loss": 0.09316494315862656, "step": 727 }, { "epoch": 1.76, "grad_norm": 1.015282441102564, "importance_ratio": 0.8515625, "kl_div": -0.17228472232818604, "kl_div_neg": -0.3388327360153198, "kl_div_pos": -0.005736696999520063, "learning_rate": 8.303425774877651e-07, "loss": 0.0329, "ppo_loss": -0.0971398651599884, "step": 728 }, { "epoch": 1.77, "grad_norm": 1.0764164862974561, "importance_ratio": 1.0234375, "kl_div": 0.020020831376314163, "kl_div_pos": 0.023958861827850342, "kl_div_sft": 0.016082800924777985, "learning_rate": 8.287112561174551e-07, "loss": -0.1059, "ppo_loss": -1.0242482423782349, "sft_loss": 0.02947784587740898, "step": 729 }, { "epoch": 1.77, "grad_norm": 1.3000123096763578, "importance_ratio": 0.828125, "kl_div": -0.09342669695615768, "kl_div_neg": -0.18810221552848816, "kl_div_sft": 0.0012488183565437794, "learning_rate": 8.270799347471451e-07, "loss": -0.0962, "ppo_loss": 0.8285300731658936, "sft_loss": 0.041453879326581955, "step": 730 }, { "epoch": 1.77, "grad_norm": 1.1521890827413293, "kl_div": 0.01180996373295784, "kl_div_sft": 0.01180996373295784, "learning_rate": 8.254486133768352e-07, "loss": -0.0657, "sft_loss": 0.05986681208014488, "step": 731 }, { "epoch": 1.77, "grad_norm": 1.7080742554257822, "importance_ratio": 0.7421875, "kl_div": -0.30273953080177307, "kl_div_neg": -0.30273953080177307, "learning_rate": 8.238172920065253e-07, "loss": 0.1383, "ppo_loss": 0.806269645690918, "step": 732 }, { "epoch": 1.78, "grad_norm": 1.0960210926991685, "kl_div": 0.010733826085925102, "kl_div_sft": 0.010733826085925102, "learning_rate": 8.221859706362153e-07, "loss": 0.0392, "sft_loss": 0.052053019404411316, "step": 733 }, { "epoch": 1.78, "grad_norm": 1.0775148074732415, "importance_ratio": 0.9609375, "kl_div": -0.03888477012515068, "kl_div_neg": -0.08518827706575394, "kl_div_pos": 0.007418735418468714, "learning_rate": 8.205546492659053e-07, "loss": 0.0344, "ppo_loss": -0.044553518295288086, "step": 734 }, { "epoch": 1.78, "grad_norm": 1.0134316455579038, "importance_ratio": 1.0234375, "kl_div": 0.01093167345970869, "kl_div_pos": 0.020649367943406105, "kl_div_sft": 0.0012139781611040235, "learning_rate": 8.189233278955954e-07, "loss": 0.0421, "ppo_loss": -1.0208640098571777, "sft_loss": 0.07622949033975601, "step": 735 }, { "epoch": 1.78, "grad_norm": 1.2524015721461401, "importance_ratio": 1.015625, "kl_div": 0.0013436046428978443, "kl_div_pos": 0.013851309195160866, "kl_div_sft": -0.011164099909365177, "learning_rate": 8.172920065252854e-07, "loss": -0.0786, "ppo_loss": -1.013947606086731, "sft_loss": 0.050719670951366425, "step": 736 }, { "epoch": 1.79, "grad_norm": 1.1603974884658244, "importance_ratio": 0.84765625, "kl_div": -0.17790842056274414, "kl_div_neg": -0.3409663736820221, "kl_div_pos": -0.014850452542304993, "learning_rate": 8.156606851549756e-07, "loss": 0.0241, "ppo_loss": -0.09262964129447937, "step": 737 }, { "epoch": 1.79, "grad_norm": 5.249287148172106, "importance_ratio": 1.0, "kl_div": 0.006068367511034012, "kl_div_pos": -5.9042658904218115e-06, "kl_div_sft": 0.012142639607191086, "learning_rate": 8.140293637846656e-07, "loss": -0.094, "ppo_loss": -0.999994158744812, "sft_loss": 0.024276066571474075, "step": 738 }, { "epoch": 1.79, "grad_norm": 0.9898027611776252, "kl_div": 0.0026179328560829163, "kl_div_sft": 0.0026179328560829163, "learning_rate": 8.123980424143556e-07, "loss": 0.0992, "sft_loss": 0.060176316648721695, "step": 739 }, { "epoch": 1.79, "grad_norm": 1.0231015185327998, "importance_ratio": 0.59375, "kl_div": -0.26476937532424927, "kl_div_neg": -0.5224133729934692, "kl_div_sft": -0.0071253604255616665, "learning_rate": 8.107667210440456e-07, "loss": -0.1075, "ppo_loss": 0.800000011920929, "sft_loss": 0.20790791511535645, "step": 740 }, { "epoch": 1.8, "grad_norm": 1.1156809325315264, "importance_ratio": 0.92578125, "kl_div": -0.023399610072374344, "kl_div_pos": -0.0772751122713089, "kl_div_sft": 0.03047589212656021, "learning_rate": 8.091353996737357e-07, "loss": -0.0045, "ppo_loss": -0.9256352186203003, "sft_loss": 0.03140160068869591, "step": 741 }, { "epoch": 1.8, "grad_norm": 0.9824747181325911, "kl_div": 0.010281100869178772, "kl_div_sft": 0.010281100869178772, "learning_rate": 8.075040783034257e-07, "loss": -0.0006, "sft_loss": 0.036707181483507156, "step": 742 }, { "epoch": 1.8, "grad_norm": 1.057966320850275, "kl_div": 0.02083955705165863, "kl_div_sft": 0.02083955705165863, "learning_rate": 8.058727569331158e-07, "loss": 0.0689, "sft_loss": 0.07779664546251297, "step": 743 }, { "epoch": 1.8, "grad_norm": 2.9374123418187548, "importance_ratio": 1.015625, "kl_div": 0.0048860725946724415, "kl_div_pos": 0.012217046692967415, "kl_div_sft": -0.002444901503622532, "learning_rate": 8.042414355628059e-07, "loss": -0.0725, "ppo_loss": -1.0122920274734497, "sft_loss": 0.06515536457300186, "step": 744 }, { "epoch": 1.81, "grad_norm": 1.1769438410562778, "importance_ratio": 0.7734375, "kl_div": -0.13132451474666595, "kl_div_neg": -0.25582724809646606, "kl_div_sft": -0.006821789778769016, "learning_rate": 8.026101141924959e-07, "loss": -0.0015, "ppo_loss": 0.800000011920929, "sft_loss": 0.09594070911407471, "step": 745 }, { "epoch": 1.81, "grad_norm": 1.1242838805566813, "importance_ratio": 0.91796875, "kl_div": -0.09715739637613297, "kl_div_neg": -0.23688404262065887, "kl_div_pos": 0.042569246143102646, "learning_rate": 8.00978792822186e-07, "loss": -0.0239, "ppo_loss": -0.12174412608146667, "step": 746 }, { "epoch": 1.81, "grad_norm": 0.7507215843348847, "kl_div": 0.01962902955710888, "kl_div_sft": 0.01962902955710888, "learning_rate": 7.993474714518759e-07, "loss": 0.017, "sft_loss": 0.013374298810958862, "step": 747 }, { "epoch": 1.81, "grad_norm": 2.0844715696552276, "kl_div": 0.016665317118167877, "kl_div_sft": 0.016665317118167877, "learning_rate": 7.97716150081566e-07, "loss": -0.0687, "sft_loss": 0.04137241095304489, "step": 748 }, { "epoch": 1.82, "grad_norm": 1.7014040870593201, "kl_div": 0.022186122834682465, "kl_div_sft": 0.022186122834682465, "learning_rate": 7.96084828711256e-07, "loss": 0.0706, "sft_loss": 0.042967766523361206, "step": 749 }, { "epoch": 1.82, "grad_norm": 0.851161547419168, "importance_ratio": 0.6484375, "kl_div": -0.4384397864341736, "kl_div_neg": -0.4384397864341736, "learning_rate": 7.944535073409462e-07, "loss": -0.0331, "ppo_loss": 0.800000011920929, "step": 750 }, { "epoch": 1.82, "grad_norm": 1.1825319065684825, "importance_ratio": 1.0390625, "kl_div": 0.02095329761505127, "kl_div_pos": 0.03723027557134628, "kl_div_sft": 0.004676317796111107, "learning_rate": 7.928221859706362e-07, "loss": 0.1072, "ppo_loss": -1.03793203830719, "sft_loss": 0.04656980186700821, "step": 751 }, { "epoch": 1.82, "grad_norm": 1.930849906686716, "importance_ratio": 0.8125, "kl_div": -0.217138409614563, "kl_div_neg": -0.217138409614563, "learning_rate": 7.911908646003263e-07, "loss": -0.0073, "ppo_loss": 0.8517619371414185, "step": 752 }, { "epoch": 1.83, "grad_norm": 2.1123037845293955, "importance_ratio": 0.765625, "kl_div": -0.2653629183769226, "kl_div_neg": -0.31069087982177734, "kl_div_pos": -0.22003495693206787, "learning_rate": 7.895595432300163e-07, "loss": -0.1358, "ppo_loss": -0.0012453794479370117, "step": 753 }, { "epoch": 1.83, "grad_norm": 0.9361953876798991, "importance_ratio": 0.859375, "kl_div": -0.06817443668842316, "kl_div_neg": -0.15062358975410461, "kl_div_sft": 0.014274713583290577, "learning_rate": 7.879282218597064e-07, "loss": 0.0415, "ppo_loss": 0.8601714372634888, "sft_loss": 0.015392109751701355, "step": 754 }, { "epoch": 1.83, "grad_norm": 1.2295459496194712, "importance_ratio": 0.78125, "kl_div": -0.24351197481155396, "kl_div_neg": -0.24351197481155396, "learning_rate": 7.862969004893963e-07, "loss": 0.1713, "ppo_loss": 0.800000011920929, "step": 755 }, { "epoch": 1.83, "grad_norm": 1.9602276858604508, "importance_ratio": 0.77734375, "kl_div": -0.12892000377178192, "kl_div_neg": -0.2504558265209198, "kl_div_sft": -0.007384192198514938, "learning_rate": 7.846655791190863e-07, "loss": -0.115, "ppo_loss": 0.800000011920929, "sft_loss": 0.09705698490142822, "step": 756 }, { "epoch": 1.84, "grad_norm": 1.3345687725516597, "kl_div": 0.003265599487349391, "kl_div_sft": 0.003265599487349391, "learning_rate": 7.830342577487765e-07, "loss": -0.082, "sft_loss": 0.08901108801364899, "step": 757 }, { "epoch": 1.84, "grad_norm": 1.2540828782457056, "importance_ratio": 0.984375, "kl_div": -0.012916380539536476, "kl_div_neg": -0.05277905613183975, "kl_div_pos": 0.0269462950527668, "learning_rate": 7.814029363784665e-07, "loss": 0.0208, "ppo_loss": -0.039361536502838135, "step": 758 }, { "epoch": 1.84, "grad_norm": 1.1282596518342276, "importance_ratio": 0.703125, "kl_div": -0.1804586499929428, "kl_div_neg": -0.3504132628440857, "kl_div_sft": -0.010504036210477352, "learning_rate": 7.797716150081566e-07, "loss": -0.0069, "ppo_loss": 0.800000011920929, "sft_loss": 0.1207689493894577, "step": 759 }, { "epoch": 1.84, "grad_norm": 1.3429166294389452, "importance_ratio": 0.84375, "kl_div": -0.1911240667104721, "kl_div_neg": -0.40841782093048096, "kl_div_pos": 0.026169700548052788, "learning_rate": 7.781402936378466e-07, "loss": -0.0265, "ppo_loss": -0.11325755715370178, "step": 760 }, { "epoch": 1.84, "grad_norm": 0.7505495124612518, "importance_ratio": 0.875, "kl_div": -0.13588908314704895, "kl_div_pos": -0.13588908314704895, "learning_rate": 7.765089722675367e-07, "loss": -0.0604, "ppo_loss": -0.8754533529281616, "step": 761 }, { "epoch": 1.85, "grad_norm": 1.8840970924126217, "kl_div": 0.0035053137689828873, "kl_div_sft": 0.0035053137689828873, "learning_rate": 7.748776508972267e-07, "loss": -0.0674, "sft_loss": 0.07195758819580078, "step": 762 }, { "epoch": 1.85, "grad_norm": 1.4304368186899392, "importance_ratio": 1.015625, "kl_div": 0.0013200687244534492, "kl_div_pos": 0.012445573695003986, "kl_div_sft": -0.009805436246097088, "learning_rate": 7.732463295269168e-07, "loss": -0.0422, "ppo_loss": -1.0125232934951782, "sft_loss": 0.09700474143028259, "step": 763 }, { "epoch": 1.85, "grad_norm": 1.1241353208801281, "importance_ratio": 0.6640625, "kl_div": -0.19818703830242157, "kl_div_neg": -0.4117022752761841, "kl_div_sft": 0.015328200533986092, "learning_rate": 7.716150081566068e-07, "loss": -0.0166, "ppo_loss": 0.800000011920929, "sft_loss": 0.0617440789937973, "step": 764 }, { "epoch": 1.85, "grad_norm": 1.331659617558531, "importance_ratio": 0.7421875, "kl_div": -0.14339332282543182, "kl_div_neg": -0.29687318205833435, "kl_div_sft": 0.010086532682180405, "learning_rate": 7.699836867862969e-07, "loss": 0.032, "ppo_loss": 0.800000011920929, "sft_loss": 0.05432585999369621, "step": 765 }, { "epoch": 1.86, "grad_norm": 0.9274721116245701, "importance_ratio": 1.0546875, "kl_div": 0.0332191027700901, "kl_div_pos": 0.05039292946457863, "kl_div_sft": 0.016045276075601578, "learning_rate": 7.683523654159869e-07, "loss": -0.0024, "ppo_loss": -1.0516842603683472, "sft_loss": 0.03535175696015358, "step": 766 }, { "epoch": 1.86, "grad_norm": 2.7703251958342054, "importance_ratio": 0.96875, "kl_div": -0.030892496928572655, "kl_div_neg": -0.10311172157526016, "kl_div_pos": 0.04132672771811485, "learning_rate": 7.66721044045677e-07, "loss": -0.0743, "ppo_loss": -0.07008317112922668, "step": 767 }, { "epoch": 1.86, "grad_norm": 3.411228007144001, "importance_ratio": 0.7265625, "kl_div": -0.15510594844818115, "kl_div_neg": -0.32163888216018677, "kl_div_sft": 0.011426973156630993, "learning_rate": 7.65089722675367e-07, "loss": -0.1231, "ppo_loss": 0.800000011920929, "sft_loss": 0.042363401502370834, "step": 768 }, { "epoch": 1.86, "grad_norm": 1.0573455659003186, "importance_ratio": 1.0546875, "kl_div": 0.02536001428961754, "kl_div_pos": 0.052674584090709686, "kl_div_sft": -0.0019545569084584713, "learning_rate": 7.63458401305057e-07, "loss": -0.0199, "ppo_loss": -1.0540865659713745, "sft_loss": 0.06650757044553757, "step": 769 }, { "epoch": 1.87, "grad_norm": 1.2626625522927828, "importance_ratio": 1.0390625, "kl_div": 0.03157272934913635, "kl_div_pos": 0.03687372803688049, "kl_div_sft": 0.026271730661392212, "learning_rate": 7.618270799347472e-07, "loss": 0.0742, "ppo_loss": -1.0375620126724243, "sft_loss": 0.0405106358230114, "step": 770 }, { "epoch": 1.87, "grad_norm": 1.1863100715597479, "importance_ratio": 1.03125, "kl_div": 0.021939104422926903, "kl_div_pos": 0.032872870564460754, "kl_div_sft": 0.011005338281393051, "learning_rate": 7.601957585644371e-07, "loss": 0.0095, "ppo_loss": -1.033419132232666, "sft_loss": 0.019082684069871902, "step": 771 }, { "epoch": 1.87, "grad_norm": 1.3042253600899398, "kl_div": -0.00686268787831068, "kl_div_sft": -0.00686268787831068, "learning_rate": 7.585644371941272e-07, "loss": -0.0553, "sft_loss": 0.0654834434390068, "step": 772 }, { "epoch": 1.87, "grad_norm": 1.2348815264999986, "kl_div": 0.004872541408985853, "kl_div_sft": 0.004872541408985853, "learning_rate": 7.569331158238172e-07, "loss": 0.0428, "sft_loss": 0.05468171834945679, "step": 773 }, { "epoch": 1.88, "grad_norm": 3.4244027055488826, "importance_ratio": 1.0390625, "kl_div": 0.01898658275604248, "kl_div_pos": 0.03458033502101898, "kl_div_sft": 0.00339283118955791, "learning_rate": 7.553017944535073e-07, "loss": -0.041, "ppo_loss": -1.0351852178573608, "sft_loss": 0.050571754574775696, "step": 774 }, { "epoch": 1.88, "grad_norm": 1.7280742534339981, "importance_ratio": 1.03125, "kl_div": 0.027021419256925583, "kl_div_pos": 0.027021419256925583, "learning_rate": 7.536704730831973e-07, "loss": -0.1909, "ppo_loss": -1.0274732112884521, "step": 775 }, { "epoch": 1.88, "grad_norm": 2.228668394878667, "importance_ratio": 0.8203125, "kl_div": -0.09695656597614288, "kl_div_pos": -0.1973302811384201, "kl_div_sft": 0.0034171519801020622, "learning_rate": 7.520391517128875e-07, "loss": -0.0268, "ppo_loss": -0.8209194540977478, "sft_loss": 0.10032328963279724, "step": 776 }, { "epoch": 1.88, "grad_norm": 1.9671372362522144, "importance_ratio": 0.90625, "kl_div": -0.03965136408805847, "kl_div_neg": -0.09838567674160004, "kl_div_sft": 0.01908295229077339, "learning_rate": 7.504078303425775e-07, "loss": -0.0087, "ppo_loss": 0.9062992930412292, "sft_loss": 0.06949137151241302, "step": 777 }, { "epoch": 1.89, "grad_norm": 1.5321125792878874, "importance_ratio": 1.046875, "kl_div": 0.024745360016822815, "kl_div_pos": 0.042934924364089966, "kl_div_sft": 0.006555794272571802, "learning_rate": 7.487765089722676e-07, "loss": -0.1395, "ppo_loss": -1.043869972229004, "sft_loss": 0.085297130048275, "step": 778 }, { "epoch": 1.89, "grad_norm": 1.0138319179490123, "importance_ratio": 1.0390625, "kl_div": 0.024765880778431892, "kl_div_pos": 0.03781313821673393, "kl_div_sft": 0.011718622408807278, "learning_rate": 7.471451876019575e-07, "loss": 0.0432, "ppo_loss": -1.0385371446609497, "sft_loss": 0.03418990597128868, "step": 779 }, { "epoch": 1.89, "grad_norm": 1.8636929678937624, "importance_ratio": 0.8671875, "kl_div": -0.062706358730793, "kl_div_neg": -0.1413581222295761, "kl_div_sft": 0.01594540849328041, "learning_rate": 7.455138662316476e-07, "loss": 0.0171, "ppo_loss": 0.8681783676147461, "sft_loss": 0.06166466325521469, "step": 780 }, { "epoch": 1.89, "grad_norm": 1.9722408046055935, "importance_ratio": 0.7734375, "kl_div": -0.12510272860527039, "kl_div_neg": -0.2547655999660492, "kl_div_sft": 0.0045601557940244675, "learning_rate": 7.438825448613376e-07, "loss": -0.0757, "ppo_loss": 0.800000011920929, "sft_loss": 0.09346868097782135, "step": 781 }, { "epoch": 1.9, "grad_norm": 2.440355828714277, "importance_ratio": 1.0546875, "kl_div": 0.019144684076309204, "kl_div_pos": 0.05361033231019974, "kl_div_sft": -0.015320963226258755, "learning_rate": 7.422512234910276e-07, "loss": -0.0695, "ppo_loss": -1.0550734996795654, "sft_loss": 0.08448739349842072, "step": 782 }, { "epoch": 1.9, "grad_norm": 1.55208335040133, "importance_ratio": 1.03125, "kl_div": 0.012653040699660778, "kl_div_pos": 0.027025144547224045, "kl_div_sft": -0.0017190633807331324, "learning_rate": 7.406199021207178e-07, "loss": -0.0467, "ppo_loss": -1.0273935794830322, "sft_loss": 0.06036945432424545, "step": 783 }, { "epoch": 1.9, "grad_norm": 1.109984968960003, "importance_ratio": 0.890625, "kl_div": -0.061940498650074005, "kl_div_neg": -0.11718282103538513, "kl_div_sft": -0.00669817766174674, "learning_rate": 7.389885807504078e-07, "loss": -0.0052, "ppo_loss": 0.8894225358963013, "sft_loss": 0.13916446268558502, "step": 784 }, { "epoch": 1.9, "grad_norm": 1.6532969509756765, "importance_ratio": 0.75, "kl_div": -0.14871735870838165, "kl_div_neg": -0.28607773780822754, "kl_div_sft": -0.011356977745890617, "learning_rate": 7.373572593800979e-07, "loss": -0.0826, "ppo_loss": 0.800000011920929, "sft_loss": 0.07842813432216644, "step": 785 }, { "epoch": 1.91, "grad_norm": 1.9565854964026683, "importance_ratio": 1.0234375, "kl_div": 0.010597184300422668, "kl_div_pos": 0.02512553334236145, "kl_div_sft": -0.003931163810193539, "learning_rate": 7.357259380097879e-07, "loss": -0.0408, "ppo_loss": -1.0254437923431396, "sft_loss": 0.05996675416827202, "step": 786 }, { "epoch": 1.91, "grad_norm": 3.3010744946477453, "importance_ratio": 0.99609375, "kl_div": 0.0037558861076831818, "kl_div_pos": -0.0037879080045968294, "kl_div_sft": 0.01129967998713255, "learning_rate": 7.340946166394779e-07, "loss": -0.023, "ppo_loss": -0.996219277381897, "sft_loss": 0.0625493973493576, "step": 787 }, { "epoch": 1.91, "grad_norm": 3.899235890189737, "kl_div": 0.01014263927936554, "kl_div_sft": 0.01014263927936554, "learning_rate": 7.324632952691679e-07, "loss": -0.0615, "sft_loss": 0.0858975425362587, "step": 788 }, { "epoch": 1.91, "grad_norm": 1.0165068685822132, "importance_ratio": 0.93359375, "kl_div": -0.01569426991045475, "kl_div_neg": -0.06786607205867767, "kl_div_sft": 0.03647753223776817, "learning_rate": 7.308319738988581e-07, "loss": 0.0064, "ppo_loss": 0.9343855977058411, "sft_loss": 0.06460925936698914, "step": 789 }, { "epoch": 1.92, "grad_norm": 4.0812106681202, "importance_ratio": 1.046875, "kl_div": 0.021152887493371964, "kl_div_pos": 0.047909412533044815, "kl_div_sft": -0.005603638477623463, "learning_rate": 7.292006525285481e-07, "loss": -0.0341, "ppo_loss": -1.0490756034851074, "sft_loss": 0.04746154323220253, "step": 790 }, { "epoch": 1.92, "grad_norm": 1.6711705229869975, "importance_ratio": 0.9296875, "kl_div": -0.07854534685611725, "kl_div_neg": -0.17257368564605713, "kl_div_pos": 0.015482988208532333, "learning_rate": 7.275693311582382e-07, "loss": -0.2077, "ppo_loss": -0.0870535671710968, "step": 791 }, { "epoch": 1.92, "grad_norm": 1.9255106583040016, "importance_ratio": 1.0390625, "kl_div": 0.02168772555887699, "kl_div_pos": 0.040172941982746124, "kl_div_sft": 0.003202509367838502, "learning_rate": 7.259380097879282e-07, "loss": -0.2886, "ppo_loss": -1.0409908294677734, "sft_loss": 0.055382560938596725, "step": 792 }, { "epoch": 1.92, "grad_norm": 3.2895379889376657, "importance_ratio": 0.609375, "kl_div": -0.24140967428684235, "kl_div_neg": -0.49316883087158203, "kl_div_sft": 0.010349491611123085, "learning_rate": 7.243066884176182e-07, "loss": 0.0002, "ppo_loss": 0.800000011920929, "sft_loss": 0.058419495820999146, "step": 793 }, { "epoch": 1.92, "grad_norm": 1.3688321493919273, "importance_ratio": 0.6953125, "kl_div": -0.17243990302085876, "kl_div_neg": -0.3653371036052704, "kl_div_sft": 0.020457301288843155, "learning_rate": 7.226753670473083e-07, "loss": -0.0483, "ppo_loss": 0.800000011920929, "sft_loss": 0.07426360249519348, "step": 794 }, { "epoch": 1.93, "grad_norm": 1.878562784822585, "importance_ratio": 1.0390625, "kl_div": 0.020934144034981728, "kl_div_pos": 0.037678271532058716, "kl_div_sft": 0.004190015606582165, "learning_rate": 7.210440456769983e-07, "loss": 0.0899, "ppo_loss": -1.0383970737457275, "sft_loss": 0.06750870496034622, "step": 795 }, { "epoch": 1.93, "grad_norm": 1.4040822593305553, "importance_ratio": 0.8046875, "kl_div": -0.21546536684036255, "kl_div_neg": -0.21546536684036255, "learning_rate": 7.194127243066884e-07, "loss": 0.0605, "ppo_loss": 0.832324206829071, "step": 796 }, { "epoch": 1.93, "grad_norm": 1.374617610125144, "importance_ratio": 1.03125, "kl_div": 0.01844792440533638, "kl_div_pos": 0.03335430100560188, "kl_div_sft": 0.003541549202054739, "learning_rate": 7.177814029363784e-07, "loss": -0.118, "ppo_loss": -1.033916711807251, "sft_loss": 0.046227868646383286, "step": 797 }, { "epoch": 1.93, "grad_norm": 1.3155559204522764, "kl_div": 0.003813364077359438, "kl_div_sft": 0.003813364077359438, "learning_rate": 7.161500815660685e-07, "loss": -0.1129, "sft_loss": 0.09659446775913239, "step": 798 }, { "epoch": 1.94, "grad_norm": 1.0441580859778266, "kl_div": -0.0024976865388453007, "kl_div_sft": -0.0024976865388453007, "learning_rate": 7.145187601957585e-07, "loss": -0.0499, "sft_loss": 0.06166272237896919, "step": 799 }, { "epoch": 1.94, "grad_norm": 1.4281665654049773, "kl_div": 0.012549598701298237, "kl_div_sft": 0.012549598701298237, "learning_rate": 7.128874388254486e-07, "loss": 0.1386, "sft_loss": 0.07515973597764969, "step": 800 }, { "epoch": 1.94, "grad_norm": 3.9301131455924017, "kl_div": 0.012901881709694862, "kl_div_sft": 0.012901881709694862, "learning_rate": 7.112561174551386e-07, "loss": -0.1478, "sft_loss": 0.04293525964021683, "step": 801 }, { "epoch": 1.94, "grad_norm": 1.1873570406340965, "importance_ratio": 0.79296875, "kl_div": -0.24081431329250336, "kl_div_neg": -0.24081431329250336, "learning_rate": 7.096247960848288e-07, "loss": -0.069, "ppo_loss": 0.8468914031982422, "step": 802 }, { "epoch": 1.95, "grad_norm": 1.5981748900311181, "kl_div": 0.013554854318499565, "kl_div_sft": 0.013554854318499565, "learning_rate": 7.079934747145187e-07, "loss": -0.0581, "sft_loss": 0.08860177546739578, "step": 803 }, { "epoch": 1.95, "grad_norm": 1.1497190422507106, "importance_ratio": 1.0078125, "kl_div": -0.0008298805914819241, "kl_div_pos": 0.009881866164505482, "kl_div_sft": -0.01154162734746933, "learning_rate": 7.063621533442088e-07, "loss": 0.099, "ppo_loss": -1.0099308490753174, "sft_loss": 0.12553054094314575, "step": 804 }, { "epoch": 1.95, "grad_norm": 2.7418610902932983, "importance_ratio": 0.7734375, "kl_div": -0.1349291205406189, "kl_div_neg": -0.2556372880935669, "kl_div_sft": -0.014220948331058025, "learning_rate": 7.047308319738988e-07, "loss": 0.1, "ppo_loss": 0.800000011920929, "sft_loss": 0.08331334590911865, "step": 805 }, { "epoch": 1.95, "grad_norm": 1.2000722426285284, "kl_div": 0.011318661272525787, "kl_div_sft": 0.011318661272525787, "learning_rate": 7.030995106035888e-07, "loss": 0.1115, "sft_loss": 0.08323749154806137, "step": 806 }, { "epoch": 1.96, "grad_norm": 1.2451685995819552, "kl_div": -0.004958365112543106, "kl_div_sft": -0.004958365112543106, "learning_rate": 7.014681892332789e-07, "loss": 0.1006, "sft_loss": 0.12090334296226501, "step": 807 }, { "epoch": 1.96, "grad_norm": 3.2934260113091613, "importance_ratio": 1.03125, "kl_div": 0.024032320827245712, "kl_div_pos": 0.03315795585513115, "kl_div_sft": 0.014906683936715126, "learning_rate": 6.99836867862969e-07, "loss": -0.0971, "ppo_loss": -1.0337138175964355, "sft_loss": 0.01600171998143196, "step": 808 }, { "epoch": 1.96, "grad_norm": 1.2325395385868936, "kl_div": 0.013921285048127174, "kl_div_sft": 0.013921285048127174, "learning_rate": 6.982055464926591e-07, "loss": 0.0345, "sft_loss": 0.09161141514778137, "step": 809 }, { "epoch": 1.96, "grad_norm": 2.126775224460074, "kl_div": 0.0037957075983285904, "kl_div_sft": 0.0037957075983285904, "learning_rate": 6.965742251223491e-07, "loss": 0.083, "sft_loss": 0.030382830649614334, "step": 810 }, { "epoch": 1.97, "grad_norm": 1.2775065933624745, "importance_ratio": 0.68359375, "kl_div": -0.1811116635799408, "kl_div_neg": -0.37784725427627563, "kl_div_sft": 0.015623941086232662, "learning_rate": 6.949429037520392e-07, "loss": 0.0044, "ppo_loss": 0.800000011920929, "sft_loss": 0.032511260360479355, "step": 811 }, { "epoch": 1.97, "grad_norm": 1.1472604129972903, "importance_ratio": 0.9140625, "kl_div": -0.09489559382200241, "kl_div_pos": -0.09489559382200241, "learning_rate": 6.933115823817291e-07, "loss": -0.0312, "ppo_loss": -0.914150595664978, "step": 812 }, { "epoch": 1.97, "grad_norm": 1.2691938215937906, "importance_ratio": 0.59375, "kl_div": -0.2605888545513153, "kl_div_neg": -0.5227549076080322, "kl_div_sft": 0.0015771690523251891, "learning_rate": 6.916802610114192e-07, "loss": -0.075, "ppo_loss": 0.800000011920929, "sft_loss": 0.04791930317878723, "step": 813 }, { "epoch": 1.97, "grad_norm": 1.3854534125175522, "kl_div": 0.010630996897816658, "kl_div_sft": 0.010630996897816658, "learning_rate": 6.900489396411092e-07, "loss": -0.0741, "sft_loss": 0.044867224991321564, "step": 814 }, { "epoch": 1.98, "grad_norm": 1.0842829112765733, "importance_ratio": 0.953125, "kl_div": -0.055763162672519684, "kl_div_neg": -0.18750636279582977, "kl_div_pos": 0.0759800374507904, "learning_rate": 6.884176182707994e-07, "loss": -0.1645, "ppo_loss": -0.12495854496955872, "step": 815 }, { "epoch": 1.98, "grad_norm": 1.3370248039529775, "importance_ratio": 0.88671875, "kl_div": -0.05268274247646332, "kl_div_neg": -0.1205388605594635, "kl_div_sft": 0.015173375606536865, "learning_rate": 6.867862969004894e-07, "loss": 0.0088, "ppo_loss": 0.8864426612854004, "sft_loss": 0.10430143773555756, "step": 816 }, { "epoch": 1.98, "grad_norm": 2.836920785813892, "importance_ratio": 0.70703125, "kl_div": -0.16925114393234253, "kl_div_pos": -0.34709298610687256, "kl_div_sft": 0.008590701036155224, "learning_rate": 6.851549755301794e-07, "loss": -0.0516, "ppo_loss": -0.7067395448684692, "sft_loss": 0.0393403097987175, "step": 817 }, { "epoch": 1.98, "grad_norm": 1.054002044599747, "importance_ratio": 1.0546875, "kl_div": 0.03092723712325096, "kl_div_pos": 0.05130693316459656, "kl_div_sft": 0.010547542944550514, "learning_rate": 6.835236541598695e-07, "loss": -0.1042, "ppo_loss": -1.0526460409164429, "sft_loss": 0.05216517299413681, "step": 818 }, { "epoch": 1.99, "grad_norm": 1.1683901533729915, "importance_ratio": 1.0546875, "kl_div": 0.023352043703198433, "kl_div_pos": 0.05334731191396713, "kl_div_sft": -0.006643224041908979, "learning_rate": 6.818923327895594e-07, "loss": -0.036, "ppo_loss": -1.0547959804534912, "sft_loss": 0.057002827525138855, "step": 819 }, { "epoch": 1.99, "grad_norm": 1.551231491876032, "importance_ratio": 1.015625, "kl_div": 0.014218551106750965, "kl_div_pos": 0.01491499226540327, "kl_div_sft": 0.01352210994809866, "learning_rate": 6.802610114192495e-07, "loss": -0.0743, "ppo_loss": -1.0150268077850342, "sft_loss": 0.027273595333099365, "step": 820 }, { "epoch": 1.99, "grad_norm": 1.755738115207708, "importance_ratio": 1.0234375, "kl_div": 0.020510466769337654, "kl_div_pos": 0.020663322880864143, "kl_div_sft": 0.020357610657811165, "learning_rate": 6.786296900489396e-07, "loss": -0.0518, "ppo_loss": -1.0208783149719238, "sft_loss": 0.09010978788137436, "step": 821 }, { "epoch": 1.99, "grad_norm": 2.787474963156988, "importance_ratio": 0.70703125, "kl_div": -0.16490840911865234, "kl_div_neg": -0.3457406163215637, "kl_div_sft": 0.015923811122775078, "learning_rate": 6.769983686786297e-07, "loss": 0.0045, "ppo_loss": 0.800000011920929, "sft_loss": 0.040341440588235855, "step": 822 }, { "epoch": 2.0, "grad_norm": 2.7430841876331677, "importance_ratio": 0.80078125, "kl_div": -0.10109415650367737, "kl_div_neg": -0.22379928827285767, "kl_div_sft": 0.021610967814922333, "learning_rate": 6.753670473083197e-07, "loss": -0.0856, "ppo_loss": 0.800000011920929, "sft_loss": 0.10019398480653763, "step": 823 }, { "epoch": 2.0, "grad_norm": 1.2497369012514388, "importance_ratio": 0.9921875, "kl_div": -0.00417862506583333, "kl_div_pos": -0.007548276800662279, "kl_div_sft": -0.0008089732145890594, "learning_rate": 6.737357259380098e-07, "loss": 0.047, "ppo_loss": -0.9924801588058472, "sft_loss": 0.08204149454832077, "step": 824 }, { "epoch": 2.0, "grad_norm": 3.340571381127005, "importance_ratio": 1.0390625, "kl_div": 0.02399800717830658, "kl_div_pos": 0.037386514246463776, "kl_div_sft": 0.010609501041471958, "learning_rate": 6.721044045676998e-07, "loss": -0.1545, "ppo_loss": -1.038094162940979, "sft_loss": 0.014390765689313412, "step": 825 }, { "epoch": 2.0, "grad_norm": 0.7295409649700262, "kl_div": 0.012158479541540146, "kl_div_sft": 0.012158479541540146, "learning_rate": 6.704730831973899e-07, "loss": -0.0283, "sft_loss": 0.0906473770737648, "step": 826 }, { "epoch": 2.0, "grad_norm": 1.7190898212535601, "importance_ratio": 0.796875, "kl_div": -0.1135992556810379, "kl_div_neg": -0.22905249893665314, "kl_div_sft": 0.001853986643254757, "learning_rate": 6.688417618270798e-07, "loss": -0.0515, "ppo_loss": 0.800000011920929, "sft_loss": 0.050841983407735825, "step": 827 }, { "epoch": 2.01, "grad_norm": 0.6960819573051047, "importance_ratio": 0.859375, "kl_div": -0.1718362420797348, "kl_div_neg": -0.3597458004951477, "kl_div_pos": 0.01607331819832325, "learning_rate": 6.6721044045677e-07, "loss": -0.1438, "ppo_loss": -0.10810157656669617, "step": 828 }, { "epoch": 2.01, "grad_norm": 0.4588395559847598, "importance_ratio": 0.91796875, "kl_div": -0.0542854405939579, "kl_div_pos": -0.08652933686971664, "kl_div_sft": -0.02204154245555401, "learning_rate": 6.6557911908646e-07, "loss": -0.0033, "ppo_loss": -0.9171086549758911, "sft_loss": 0.13468027114868164, "step": 829 }, { "epoch": 2.01, "grad_norm": 0.8429747304624693, "kl_div": 0.04222976416349411, "kl_div_sft": 0.04222976416349411, "learning_rate": 6.6394779771615e-07, "loss": -0.132, "sft_loss": 0.08493160456418991, "step": 830 }, { "epoch": 2.01, "grad_norm": 0.7845448253120335, "importance_ratio": 0.7109375, "kl_div": -0.3473590612411499, "kl_div_neg": -0.3473590612411499, "learning_rate": 6.623164763458401e-07, "loss": 0.0421, "ppo_loss": 0.800000011920929, "step": 831 }, { "epoch": 2.02, "grad_norm": 0.6056690500023906, "importance_ratio": 1.0859375, "kl_div": 0.05032026022672653, "kl_div_pos": 0.08291984349489212, "kl_div_sft": 0.017720678821206093, "learning_rate": 6.606851549755301e-07, "loss": -0.258, "ppo_loss": -1.0864547491073608, "sft_loss": 0.01477021537721157, "step": 832 }, { "epoch": 2.02, "grad_norm": 0.9148086045939298, "importance_ratio": 0.671875, "kl_div": -0.19181066751480103, "kl_div_neg": -0.3996953070163727, "kl_div_sft": 0.016073964536190033, "learning_rate": 6.590538336052202e-07, "loss": 0.0231, "ppo_loss": 0.800000011920929, "sft_loss": 0.0960092768073082, "step": 833 }, { "epoch": 2.02, "grad_norm": 0.6604697481919004, "importance_ratio": 1.046875, "kl_div": 0.017061958089470863, "kl_div_pos": 0.04899755120277405, "kl_div_sft": -0.014873635023832321, "learning_rate": 6.574225122349103e-07, "loss": -0.1018, "ppo_loss": -1.0502177476882935, "sft_loss": 0.09160658717155457, "step": 834 }, { "epoch": 2.02, "grad_norm": 0.6492688009384401, "importance_ratio": 0.80078125, "kl_div": -0.09790018200874329, "kl_div_neg": -0.22408372163772583, "kl_div_sft": 0.02828335016965866, "learning_rate": 6.557911908646004e-07, "loss": 0.0167, "ppo_loss": 0.800000011920929, "sft_loss": 0.07054585963487625, "step": 835 }, { "epoch": 2.03, "grad_norm": 0.5942027223177359, "importance_ratio": 1.046875, "kl_div": 0.029894528910517693, "kl_div_pos": 0.04286080598831177, "kl_div_sft": 0.016928251832723618, "learning_rate": 6.541598694942903e-07, "loss": -0.0582, "ppo_loss": -1.0437926054000854, "sft_loss": 0.03479551151394844, "step": 836 }, { "epoch": 2.03, "grad_norm": 0.9763045313572033, "importance_ratio": 0.7578125, "kl_div": -0.27878081798553467, "kl_div_neg": -0.27878081798553467, "learning_rate": 6.525285481239804e-07, "loss": -0.0547, "ppo_loss": 0.800000011920929, "step": 837 }, { "epoch": 2.03, "grad_norm": 1.6743166198463337, "importance_ratio": 0.6640625, "kl_div": -0.19439943134784698, "kl_div_pos": -0.41146811842918396, "kl_div_sft": 0.022669266909360886, "learning_rate": 6.508972267536704e-07, "loss": -0.0292, "ppo_loss": -0.6626766324043274, "sft_loss": 0.07602635025978088, "step": 838 }, { "epoch": 2.03, "grad_norm": 0.658178945940582, "importance_ratio": 0.93359375, "kl_div": -0.07545746117830276, "kl_div_neg": -0.1793633997440338, "kl_div_pos": 0.028448481112718582, "learning_rate": 6.492659053833605e-07, "loss": -0.0082, "ppo_loss": -0.09652745723724365, "step": 839 }, { "epoch": 2.04, "grad_norm": 0.7360289595804811, "importance_ratio": 0.6953125, "kl_div": -0.36604252457618713, "kl_div_neg": -0.36604252457618713, "learning_rate": 6.476345840130505e-07, "loss": -0.0225, "ppo_loss": 0.800000011920929, "step": 840 }, { "epoch": 2.04, "grad_norm": 0.8158893317648063, "kl_div": 0.019132403656840324, "kl_div_sft": 0.019132403656840324, "learning_rate": 6.460032626427406e-07, "loss": -0.1055, "sft_loss": 0.06531329452991486, "step": 841 }, { "epoch": 2.04, "grad_norm": 0.5638392717343974, "importance_ratio": 1.0390625, "kl_div": -0.001689983531832695, "kl_div_pos": 0.03776420280337334, "kl_div_sft": -0.04114416986703873, "learning_rate": 6.443719412724307e-07, "loss": -0.0369, "ppo_loss": -1.038486361503601, "sft_loss": 0.09056337922811508, "step": 842 }, { "epoch": 2.04, "grad_norm": 0.8054468220372359, "importance_ratio": 0.98828125, "kl_div": -0.001824448350816965, "kl_div_pos": -0.012244836427271366, "kl_div_sft": 0.008595939725637436, "learning_rate": 6.427406199021207e-07, "loss": -0.0229, "ppo_loss": -0.9878298044204712, "sft_loss": 0.0815005898475647, "step": 843 }, { "epoch": 2.05, "grad_norm": 2.3269584856776846, "importance_ratio": 0.875, "kl_div": -0.1502387821674347, "kl_div_neg": -0.3389727473258972, "kl_div_pos": 0.03849519044160843, "learning_rate": 6.411092985318107e-07, "loss": -0.1731, "ppo_loss": -0.1196228563785553, "step": 844 }, { "epoch": 2.05, "grad_norm": 0.8850740817147107, "kl_div": -0.005501694977283478, "kl_div_sft": -0.005501694977283478, "learning_rate": 6.394779771615007e-07, "loss": -0.1446, "sft_loss": 0.06264421343803406, "step": 845 }, { "epoch": 2.05, "grad_norm": 1.9384037494196507, "importance_ratio": 0.76171875, "kl_div": -0.12875968217849731, "kl_div_neg": -0.2738000452518463, "kl_div_sft": 0.016280686482787132, "learning_rate": 6.378466557911908e-07, "loss": 0.0103, "ppo_loss": 0.800000011920929, "sft_loss": 0.12097492069005966, "step": 846 }, { "epoch": 2.05, "grad_norm": 0.83877406641491, "importance_ratio": 1.0078125, "kl_div": 0.003864092053845525, "kl_div_pos": 0.009729886427521706, "kl_div_sft": -0.002001702319830656, "learning_rate": 6.362153344208809e-07, "loss": 0.0018, "ppo_loss": -1.0097774267196655, "sft_loss": 0.09012917429208755, "step": 847 }, { "epoch": 2.06, "grad_norm": 0.6029278754233961, "importance_ratio": 0.87890625, "kl_div": -0.1441148966550827, "kl_div_neg": -0.30787554383277893, "kl_div_pos": 0.01964573748409748, "learning_rate": 6.34584013050571e-07, "loss": 0.0073, "ppo_loss": -0.10991999506950378, "step": 848 }, { "epoch": 2.06, "grad_norm": 1.97101033352137, "kl_div": 0.005903508514165878, "kl_div_sft": 0.005903508514165878, "learning_rate": 6.32952691680261e-07, "loss": -0.0246, "sft_loss": 0.029841091483831406, "step": 849 }, { "epoch": 2.06, "grad_norm": 2.246006176112066, "kl_div": 0.017790446057915688, "kl_div_sft": 0.017790446057915688, "learning_rate": 6.313213703099511e-07, "loss": -0.1384, "sft_loss": 0.023750465363264084, "step": 850 }, { "epoch": 2.06, "grad_norm": 3.720825898154855, "importance_ratio": 0.890625, "kl_div": -0.13176052272319794, "kl_div_neg": -0.2837873101234436, "kl_div_pos": 0.020266273990273476, "learning_rate": 6.296900489396411e-07, "loss": -0.0339, "ppo_loss": -0.11023649573326111, "step": 851 }, { "epoch": 2.07, "grad_norm": 0.805438607777615, "kl_div": 0.014838488772511482, "kl_div_sft": 0.014838488772511482, "learning_rate": 6.28058727569331e-07, "loss": -0.1113, "sft_loss": 0.054096221923828125, "step": 852 }, { "epoch": 2.07, "grad_norm": 1.6791321302069588, "importance_ratio": 0.8515625, "kl_div": -0.17815786600112915, "kl_div_neg": -0.3900858759880066, "kl_div_pos": 0.03377014398574829, "learning_rate": 6.264274061990211e-07, "loss": -0.0931, "ppo_loss": -0.11717340350151062, "step": 853 }, { "epoch": 2.07, "grad_norm": 1.4237608926438445, "importance_ratio": 0.7265625, "kl_div": -0.3291565179824829, "kl_div_neg": -0.3291565179824829, "learning_rate": 6.247960848287112e-07, "loss": -0.0117, "ppo_loss": 0.8069021701812744, "step": 854 }, { "epoch": 2.07, "grad_norm": 1.0107552200435455, "importance_ratio": 1.0390625, "kl_div": 0.028783971443772316, "kl_div_pos": 0.03633904084563255, "kl_div_sft": 0.02122890204191208, "learning_rate": 6.231647634584013e-07, "loss": -0.1386, "ppo_loss": -1.0370073318481445, "sft_loss": 0.021868525072932243, "step": 855 }, { "epoch": 2.08, "grad_norm": 0.5036693873875817, "importance_ratio": 0.6015625, "kl_div": -0.5148600935935974, "kl_div_neg": -0.5148600935935974, "learning_rate": 6.215334420880913e-07, "loss": -0.0459, "ppo_loss": 0.800000011920929, "step": 856 }, { "epoch": 2.08, "grad_norm": 1.3641401061259197, "importance_ratio": 1.046875, "kl_div": 0.0433039590716362, "kl_div_pos": 0.0433039590716362, "learning_rate": 6.199021207177814e-07, "loss": -0.0362, "ppo_loss": -1.0442602634429932, "step": 857 }, { "epoch": 2.08, "grad_norm": 0.6578019728010779, "importance_ratio": 1.0234375, "kl_div": 0.025171402841806412, "kl_div_pos": 0.022367173805832863, "kl_div_sft": 0.02797563001513481, "learning_rate": 6.182707993474714e-07, "loss": -0.0778, "ppo_loss": -1.0226192474365234, "sft_loss": 0.03281998261809349, "step": 858 }, { "epoch": 2.08, "grad_norm": 0.9822804404163674, "kl_div": 0.012662078253924847, "kl_div_sft": 0.012662078253924847, "learning_rate": 6.166394779771615e-07, "loss": -0.1125, "sft_loss": 0.03483930230140686, "step": 859 }, { "epoch": 2.08, "grad_norm": 1.866113649204854, "importance_ratio": 0.53125, "kl_div": -0.30700746178627014, "kl_div_neg": -0.6296558380126953, "kl_div_sft": 0.015640929341316223, "learning_rate": 6.150081566068515e-07, "loss": -0.1434, "ppo_loss": 0.800000011920929, "sft_loss": 0.03329959884285927, "step": 860 }, { "epoch": 2.09, "grad_norm": 1.8112362369201367, "importance_ratio": 0.8125, "kl_div": -0.10016106814146042, "kl_div_neg": -0.20947012305259705, "kl_div_sft": 0.009147980250418186, "learning_rate": 6.133768352365416e-07, "loss": 0.0362, "ppo_loss": 0.81101393699646, "sft_loss": 0.030194612219929695, "step": 861 }, { "epoch": 2.09, "grad_norm": 0.9537721531846218, "importance_ratio": 0.96875, "kl_div": -0.029792316257953644, "kl_div_pos": -0.029792316257953644, "learning_rate": 6.117455138662316e-07, "loss": -0.1717, "ppo_loss": -0.9717547297477722, "step": 862 }, { "epoch": 2.09, "grad_norm": 0.9043738907053905, "kl_div": -0.06411665678024292, "kl_div_sft": -0.06411665678024292, "learning_rate": 6.101141924959217e-07, "loss": -0.1035, "sft_loss": 0.2276521474123001, "step": 863 }, { "epoch": 2.09, "grad_norm": 2.0284484323251943, "importance_ratio": 0.71875, "kl_div": -0.3353831470012665, "kl_div_neg": -0.3353831470012665, "learning_rate": 6.084828711256117e-07, "loss": 0.0273, "ppo_loss": 0.800000011920929, "step": 864 }, { "epoch": 2.1, "grad_norm": 0.8652953391200562, "importance_ratio": 1.0625, "kl_div": 0.03596498817205429, "kl_div_pos": 0.057203471660614014, "kl_div_sft": 0.014726504683494568, "learning_rate": 6.068515497553017e-07, "loss": -0.0778, "ppo_loss": -1.0588712692260742, "sft_loss": 0.03289495408535004, "step": 865 }, { "epoch": 2.1, "grad_norm": 0.8463662171117285, "importance_ratio": 0.7578125, "kl_div": -0.27933812141418457, "kl_div_neg": -0.27933812141418457, "learning_rate": 6.052202283849919e-07, "loss": -0.0995, "ppo_loss": 0.800000011920929, "step": 866 }, { "epoch": 2.1, "grad_norm": 0.48484302797303697, "importance_ratio": 0.6953125, "kl_div": -0.3646080493927002, "kl_div_neg": -0.3646080493927002, "learning_rate": 6.035889070146819e-07, "loss": 0.0461, "ppo_loss": 0.800000011920929, "step": 867 }, { "epoch": 2.1, "grad_norm": 0.7968388717539745, "importance_ratio": 0.82421875, "kl_div": -0.21529999375343323, "kl_div_neg": -0.4309925436973572, "kl_div_pos": 0.0003925491473637521, "learning_rate": 6.01957585644372e-07, "loss": 0.0053, "ppo_loss": -0.10019633173942566, "step": 868 }, { "epoch": 2.11, "grad_norm": 0.8793106844007031, "kl_div": 0.017604706808924675, "kl_div_sft": 0.017604706808924675, "learning_rate": 6.003262642740619e-07, "loss": -0.0568, "sft_loss": 0.05003702640533447, "step": 869 }, { "epoch": 2.11, "grad_norm": 0.5997317807386668, "kl_div": 0.013879947364330292, "kl_div_sft": 0.013879947364330292, "learning_rate": 5.98694942903752e-07, "loss": 0.1225, "sft_loss": 0.043585263192653656, "step": 870 }, { "epoch": 2.11, "grad_norm": 0.7998335277681405, "importance_ratio": 1.03125, "kl_div": 0.02605769783258438, "kl_div_pos": 0.027240796014666557, "kl_div_sft": 0.024874601513147354, "learning_rate": 5.97063621533442e-07, "loss": -0.0505, "ppo_loss": -1.0276151895523071, "sft_loss": 0.031028786674141884, "step": 871 }, { "epoch": 2.11, "grad_norm": 1.1196540333939047, "kl_div": 0.00864154938608408, "kl_div_sft": 0.00864154938608408, "learning_rate": 5.954323001631321e-07, "loss": -0.1298, "sft_loss": 0.04518849402666092, "step": 872 }, { "epoch": 2.12, "grad_norm": 1.0285343479954419, "importance_ratio": 0.890625, "kl_div": -0.13256366550922394, "kl_div_neg": -0.31670084595680237, "kl_div_pos": 0.05157352238893509, "learning_rate": 5.938009787928222e-07, "loss": -0.053, "ppo_loss": -0.12646332383155823, "step": 873 }, { "epoch": 2.12, "grad_norm": 1.8676648666960494, "importance_ratio": 1.015625, "kl_div": 0.00921483151614666, "kl_div_pos": 0.00921483151614666, "learning_rate": 5.921696574225123e-07, "loss": -0.084, "ppo_loss": -1.0095914602279663, "step": 874 }, { "epoch": 2.12, "grad_norm": 1.6134732111411296, "importance_ratio": 1.0390625, "kl_div": 0.01806982234120369, "kl_div_pos": 0.04134603589773178, "kl_div_sft": -0.005206390284001827, "learning_rate": 5.905383360522023e-07, "loss": -0.0232, "ppo_loss": -1.042212724685669, "sft_loss": 0.03829171508550644, "step": 875 }, { "epoch": 2.12, "grad_norm": 0.9257044378035186, "importance_ratio": 0.86328125, "kl_div": -0.08369235694408417, "kl_div_neg": -0.14850135147571564, "kl_div_sft": -0.018883369863033295, "learning_rate": 5.889070146818924e-07, "loss": 0.0204, "ppo_loss": 0.8619988560676575, "sft_loss": 0.10338804125785828, "step": 876 }, { "epoch": 2.13, "grad_norm": 3.690614064302966, "kl_div": 0.0038330499082803726, "kl_div_sft": 0.0038330499082803726, "learning_rate": 5.872756933115823e-07, "loss": -0.2202, "sft_loss": 0.03799348697066307, "step": 877 }, { "epoch": 2.13, "grad_norm": 0.9830280049488119, "kl_div": 0.004340812098234892, "kl_div_sft": 0.004340812098234892, "learning_rate": 5.856443719412723e-07, "loss": -0.0818, "sft_loss": 0.08931317925453186, "step": 878 }, { "epoch": 2.13, "grad_norm": 0.8184240493164358, "importance_ratio": 0.490234375, "kl_div": -0.34959229826927185, "kl_div_neg": -0.7119632363319397, "kl_div_sft": 0.012778629548847675, "learning_rate": 5.840130505709625e-07, "loss": -0.0543, "ppo_loss": 0.800000011920929, "sft_loss": 0.029221320524811745, "step": 879 }, { "epoch": 2.13, "grad_norm": 1.0711299252957196, "importance_ratio": 0.625, "kl_div": -0.25115224719047546, "kl_div_neg": -0.4688381254673004, "kl_div_sft": -0.0334663949906826, "learning_rate": 5.823817292006525e-07, "loss": -0.0823, "ppo_loss": 0.800000011920929, "sft_loss": 0.08490065485239029, "step": 880 }, { "epoch": 2.14, "grad_norm": 0.6690764806748656, "importance_ratio": 1.078125, "kl_div": 0.04035523161292076, "kl_div_pos": 0.073147714138031, "kl_div_sft": 0.007562749553471804, "learning_rate": 5.807504078303426e-07, "loss": -0.0222, "ppo_loss": -1.0758894681930542, "sft_loss": 0.041387416422367096, "step": 881 }, { "epoch": 2.14, "grad_norm": 0.5771412340673167, "importance_ratio": 1.0390625, "kl_div": 0.016257621347904205, "kl_div_pos": 0.03585897013545036, "kl_div_sft": -0.003343727672472596, "learning_rate": 5.791190864600326e-07, "loss": -0.0872, "ppo_loss": -1.03650963306427, "sft_loss": 0.07778169214725494, "step": 882 }, { "epoch": 2.14, "grad_norm": 0.639651279588748, "importance_ratio": 0.7109375, "kl_div": -0.3393230438232422, "kl_div_neg": -0.3393230438232422, "learning_rate": 5.774877650897227e-07, "loss": -0.0315, "ppo_loss": 0.800000011920929, "step": 883 }, { "epoch": 2.14, "grad_norm": 0.7186821615298014, "importance_ratio": 1.0625, "kl_div": 0.056774161756038666, "kl_div_pos": 0.056774161756038666, "learning_rate": 5.758564437194126e-07, "loss": -0.0984, "ppo_loss": -1.0584173202514648, "step": 884 }, { "epoch": 2.15, "grad_norm": 0.7896946178578934, "importance_ratio": 0.91796875, "kl_div": -0.029687166213989258, "kl_div_neg": -0.08769184350967407, "kl_div_sft": 0.028317509219050407, "learning_rate": 5.742251223491027e-07, "loss": 0.0452, "ppo_loss": 0.9160431027412415, "sft_loss": 0.0370275042951107, "step": 885 }, { "epoch": 2.15, "grad_norm": 0.5964225552143704, "kl_div": -0.007856599055230618, "kl_div_sft": -0.007856599055230618, "learning_rate": 5.725938009787928e-07, "loss": 0.0325, "sft_loss": 0.09961707890033722, "step": 886 }, { "epoch": 2.15, "grad_norm": 2.6939401864801154, "importance_ratio": 0.63671875, "kl_div": -0.22561708092689514, "kl_div_neg": -0.4499557614326477, "kl_div_sft": -0.0012784095015376806, "learning_rate": 5.709624796084829e-07, "loss": -0.0235, "ppo_loss": 0.800000011920929, "sft_loss": 0.061579037457704544, "step": 887 }, { "epoch": 2.15, "grad_norm": 0.7136453056713985, "importance_ratio": 0.67578125, "kl_div": -0.3997284173965454, "kl_div_neg": -0.3997284173965454, "learning_rate": 5.693311582381729e-07, "loss": 0.029, "ppo_loss": 0.800000011920929, "step": 888 }, { "epoch": 2.16, "grad_norm": 0.5778132706549846, "kl_div": 0.007670292165130377, "kl_div_sft": 0.007670292165130377, "learning_rate": 5.676998368678629e-07, "loss": -0.0585, "sft_loss": 0.04167616367340088, "step": 889 }, { "epoch": 2.16, "grad_norm": 1.6287109111507299, "kl_div": 0.012486516498029232, "kl_div_sft": 0.012486516498029232, "learning_rate": 5.66068515497553e-07, "loss": -0.0287, "sft_loss": 0.10539884120225906, "step": 890 }, { "epoch": 2.16, "grad_norm": 0.5478193032189899, "importance_ratio": 1.0859375, "kl_div": 0.02673727460205555, "kl_div_pos": 0.08578943461179733, "kl_div_sft": -0.032314885407686234, "learning_rate": 5.64437194127243e-07, "loss": -0.0571, "ppo_loss": -1.0895768404006958, "sft_loss": 0.1281270980834961, "step": 891 }, { "epoch": 2.16, "grad_norm": 0.8703841667870453, "importance_ratio": 0.62890625, "kl_div": -0.22016043961048126, "kl_div_neg": -0.46085503697395325, "kl_div_sft": 0.020534159615635872, "learning_rate": 5.628058727569332e-07, "loss": -0.1792, "ppo_loss": 0.800000011920929, "sft_loss": 0.0323028638958931, "step": 892 }, { "epoch": 2.16, "grad_norm": 3.2286924116468385, "importance_ratio": 1.125, "kl_div": 0.06784869730472565, "kl_div_pos": 0.11585932970046997, "kl_div_sft": 0.01983807049691677, "learning_rate": 5.611745513866231e-07, "loss": -0.1086, "ppo_loss": -1.1228379011154175, "sft_loss": 0.03183257207274437, "step": 893 }, { "epoch": 2.17, "grad_norm": 0.5008777424275753, "importance_ratio": 1.0546875, "kl_div": 0.03559987619519234, "kl_div_pos": 0.05102997273206711, "kl_div_sft": 0.020169777795672417, "learning_rate": 5.595432300163132e-07, "loss": -0.0837, "ppo_loss": -1.0523544549942017, "sft_loss": 0.049543678760528564, "step": 894 }, { "epoch": 2.17, "grad_norm": 1.9827208345235616, "importance_ratio": 0.734375, "kl_div": -0.31260108947753906, "kl_div_neg": -0.31260108947753906, "learning_rate": 5.579119086460032e-07, "loss": -0.0038, "ppo_loss": 0.800000011920929, "step": 895 }, { "epoch": 2.17, "grad_norm": 0.6935491812510783, "importance_ratio": 0.765625, "kl_div": -0.1380336731672287, "kl_div_neg": -0.2684652507305145, "kl_div_sft": -0.007602086756378412, "learning_rate": 5.562805872756933e-07, "loss": -0.0853, "ppo_loss": 0.800000011920929, "sft_loss": 0.06231473386287689, "step": 896 }, { "epoch": 2.17, "grad_norm": 0.9387845457910657, "kl_div": 0.025464002043008804, "kl_div_sft": 0.025464002043008804, "learning_rate": 5.546492659053833e-07, "loss": -0.0815, "sft_loss": 0.06399422883987427, "step": 897 }, { "epoch": 2.18, "grad_norm": 0.6214152529656375, "importance_ratio": 1.0234375, "kl_div": 0.022999495267868042, "kl_div_pos": 0.022714493796229362, "kl_div_sft": 0.02328449860215187, "learning_rate": 5.530179445350734e-07, "loss": 0.1013, "ppo_loss": -1.0229744911193848, "sft_loss": 0.04827561601996422, "step": 898 }, { "epoch": 2.18, "grad_norm": 0.8544614252275191, "importance_ratio": 0.88671875, "kl_div": -0.1337304562330246, "kl_div_neg": -0.30951204895973206, "kl_div_pos": 0.04205113649368286, "learning_rate": 5.513866231647635e-07, "loss": 0.0048, "ppo_loss": -0.12147393822669983, "step": 899 }, { "epoch": 2.18, "grad_norm": 2.1178801694981506, "importance_ratio": 0.7734375, "kl_div": -0.12312136590480804, "kl_div_neg": -0.2545829713344574, "kl_div_sft": 0.008340239524841309, "learning_rate": 5.497553017944536e-07, "loss": -0.0033, "ppo_loss": 0.800000011920929, "sft_loss": 0.06270673125982285, "step": 900 }, { "epoch": 2.18, "grad_norm": 1.019347075565765, "importance_ratio": 1.1328125, "kl_div": 0.07899191230535507, "kl_div_pos": 0.126931294798851, "kl_div_sft": 0.03105252794921398, "learning_rate": 5.481239804241435e-07, "loss": -0.162, "ppo_loss": -1.1353390216827393, "sft_loss": 0.05711635947227478, "step": 901 }, { "epoch": 2.19, "grad_norm": 0.5945166106057107, "importance_ratio": 0.75390625, "kl_div": -0.13973702490329742, "kl_div_neg": -0.2828894853591919, "kl_div_sft": 0.0034154406748712063, "learning_rate": 5.464926590538335e-07, "loss": -0.1883, "ppo_loss": 0.800000011920929, "sft_loss": 0.07432715594768524, "step": 902 }, { "epoch": 2.19, "grad_norm": 1.8871254176644539, "importance_ratio": 1.0234375, "kl_div": 0.028098221868276596, "kl_div_pos": 0.02675572969019413, "kl_div_sft": 0.029440714046359062, "learning_rate": 5.448613376835236e-07, "loss": -0.0609, "ppo_loss": -1.0271168947219849, "sft_loss": 0.1034909188747406, "step": 903 }, { "epoch": 2.19, "grad_norm": 0.9504609947007345, "importance_ratio": 0.63671875, "kl_div": -0.45120978355407715, "kl_div_neg": -0.45120978355407715, "learning_rate": 5.432300163132136e-07, "loss": -0.1774, "ppo_loss": 0.800000011920929, "step": 904 }, { "epoch": 2.19, "grad_norm": 0.74247535343558, "importance_ratio": 1.0703125, "kl_div": 0.04448920488357544, "kl_div_pos": 0.07105836272239685, "kl_div_sft": 0.01792004518210888, "learning_rate": 5.415986949429038e-07, "loss": -0.2217, "ppo_loss": -1.0736439228057861, "sft_loss": 0.028560511767864227, "step": 905 }, { "epoch": 2.2, "grad_norm": 1.131370415785853, "importance_ratio": 0.6796875, "kl_div": -0.38727688789367676, "kl_div_neg": -0.38727688789367676, "learning_rate": 5.399673735725938e-07, "loss": 0.0092, "ppo_loss": 0.800000011920929, "step": 906 }, { "epoch": 2.2, "grad_norm": 0.9877402894496916, "importance_ratio": 0.90625, "kl_div": -0.11487214267253876, "kl_div_neg": -0.27314162254333496, "kl_div_pos": 0.043397340923547745, "learning_rate": 5.383360522022839e-07, "loss": -0.001, "ppo_loss": -0.12217637896537781, "step": 907 }, { "epoch": 2.2, "grad_norm": 1.3842466858928706, "importance_ratio": 0.73828125, "kl_div": -0.14413465559482574, "kl_div_neg": -0.30117878317832947, "kl_div_sft": 0.012909467332065105, "learning_rate": 5.367047308319739e-07, "loss": -0.0474, "ppo_loss": 0.800000011920929, "sft_loss": 0.06256598234176636, "step": 908 }, { "epoch": 2.2, "grad_norm": 0.758970850079431, "importance_ratio": 1.0703125, "kl_div": 0.0683857724070549, "kl_div_pos": 0.0683857724070549, "learning_rate": 5.350734094616639e-07, "loss": -0.2193, "ppo_loss": -1.0707976818084717, "step": 909 }, { "epoch": 2.21, "grad_norm": 1.1457802268928465, "importance_ratio": 0.75390625, "kl_div": -0.15916511416435242, "kl_div_neg": -0.2836153507232666, "kl_div_sft": -0.03471486642956734, "learning_rate": 5.334420880913539e-07, "loss": -0.0862, "ppo_loss": 0.800000011920929, "sft_loss": 0.08879987895488739, "step": 910 }, { "epoch": 2.21, "grad_norm": 0.9460655554705513, "kl_div": 0.021832624450325966, "kl_div_sft": 0.021832624450325966, "learning_rate": 5.31810766721044e-07, "loss": -0.234, "sft_loss": 0.039057567715644836, "step": 911 }, { "epoch": 2.21, "grad_norm": 0.7130222146852393, "importance_ratio": 0.671875, "kl_div": -0.20167236030101776, "kl_div_neg": -0.3981626033782959, "kl_div_sft": -0.005182111170142889, "learning_rate": 5.301794453507341e-07, "loss": 0.0799, "ppo_loss": 0.800000011920929, "sft_loss": 0.11809124052524567, "step": 912 }, { "epoch": 2.21, "grad_norm": 1.105397953421926, "importance_ratio": 1.03125, "kl_div": 0.025202281773090363, "kl_div_pos": 0.03070555254817009, "kl_div_sft": 0.019699012860655785, "learning_rate": 5.285481239804241e-07, "loss": -0.0665, "ppo_loss": -1.031181812286377, "sft_loss": 0.005819185636937618, "step": 913 }, { "epoch": 2.22, "grad_norm": 1.443877874218789, "importance_ratio": 0.7109375, "kl_div": -0.16923661530017853, "kl_div_neg": -0.34361013770103455, "kl_div_sft": 0.005136905238032341, "learning_rate": 5.269168026101142e-07, "loss": -0.0395, "ppo_loss": 0.800000011920929, "sft_loss": 0.03511352464556694, "step": 914 }, { "epoch": 2.22, "grad_norm": 0.9870923999795622, "importance_ratio": 0.65625, "kl_div": -0.4261021018028259, "kl_div_neg": -0.4261021018028259, "learning_rate": 5.252854812398042e-07, "loss": -0.2032, "ppo_loss": 0.800000011920929, "step": 915 }, { "epoch": 2.22, "grad_norm": 0.7466714429986411, "kl_div": 0.016338294371962547, "kl_div_sft": 0.016338294371962547, "learning_rate": 5.236541598694943e-07, "loss": 0.0584, "sft_loss": 0.06320355087518692, "step": 916 }, { "epoch": 2.22, "grad_norm": 0.8658166996738293, "importance_ratio": 1.0703125, "kl_div": 0.033583857119083405, "kl_div_pos": 0.06586972624063492, "kl_div_sft": 0.0012979895109310746, "learning_rate": 5.220228384991842e-07, "loss": 0.026, "ppo_loss": -1.0680874586105347, "sft_loss": 0.05195733532309532, "step": 917 }, { "epoch": 2.23, "grad_norm": 1.9082970181248398, "importance_ratio": 0.953125, "kl_div": -0.05385783314704895, "kl_div_neg": -0.15265466272830963, "kl_div_pos": 0.04493899643421173, "learning_rate": 5.203915171288744e-07, "loss": -0.0781, "ppo_loss": -0.09376892447471619, "step": 918 }, { "epoch": 2.23, "grad_norm": 4.148648761319739, "importance_ratio": 0.69921875, "kl_div": -0.16899904608726501, "kl_div_neg": -0.3573715388774872, "kl_div_sft": 0.019373446702957153, "learning_rate": 5.187601957585644e-07, "loss": -0.0843, "ppo_loss": 0.800000011920929, "sft_loss": 0.07252917438745499, "step": 919 }, { "epoch": 2.23, "grad_norm": 2.7364635229817815, "importance_ratio": 0.8046875, "kl_div": -0.240549236536026, "kl_div_neg": -0.47470220923423767, "kl_div_pos": -0.006396274082362652, "learning_rate": 5.171288743882545e-07, "loss": -0.1051, "ppo_loss": -0.09681206941604614, "step": 920 }, { "epoch": 2.23, "grad_norm": 0.8553304691120556, "importance_ratio": 0.8828125, "kl_div": -0.14235982298851013, "kl_div_neg": -0.34726372361183167, "kl_div_pos": 0.062544085085392, "learning_rate": 5.154975530179445e-07, "loss": 0.0691, "ppo_loss": -0.1322707235813141, "step": 921 }, { "epoch": 2.24, "grad_norm": 1.9997982877101181, "importance_ratio": 1.046875, "kl_div": 0.00859884824603796, "kl_div_pos": 0.0443231463432312, "kl_div_sft": -0.02712544985115528, "learning_rate": 5.138662316476346e-07, "loss": -0.0244, "ppo_loss": -1.0453200340270996, "sft_loss": 0.1878640204668045, "step": 922 }, { "epoch": 2.24, "grad_norm": 0.9587369704573453, "kl_div": 0.010656565427780151, "kl_div_sft": 0.010656565427780151, "learning_rate": 5.122349102773246e-07, "loss": -0.0024, "sft_loss": 0.04456965997815132, "step": 923 }, { "epoch": 2.24, "grad_norm": 1.2664087011936005, "kl_div": 0.010272424668073654, "kl_div_sft": 0.010272424668073654, "learning_rate": 5.106035889070146e-07, "loss": 0.0762, "sft_loss": 0.09669818729162216, "step": 924 }, { "epoch": 2.24, "grad_norm": 4.297439815505914, "importance_ratio": 0.58984375, "kl_div": -0.25953978300094604, "kl_div_neg": -0.5252647995948792, "kl_div_sft": 0.006185252219438553, "learning_rate": 5.089722675367047e-07, "loss": -0.1525, "ppo_loss": 0.800000011920929, "sft_loss": 0.0307835154235363, "step": 925 }, { "epoch": 2.24, "grad_norm": 1.709687285690838, "importance_ratio": 0.76171875, "kl_div": -0.1354270726442337, "kl_div_neg": -0.2699092924594879, "kl_div_sft": -0.0009448446216993034, "learning_rate": 5.073409461663947e-07, "loss": -0.1006, "ppo_loss": 0.800000011920929, "sft_loss": 0.03979117423295975, "step": 926 }, { "epoch": 2.25, "grad_norm": 1.733235990407424, "importance_ratio": 1.0625, "kl_div": 0.06101357191801071, "kl_div_pos": 0.06101357191801071, "learning_rate": 5.057096247960848e-07, "loss": -0.1685, "ppo_loss": -1.0629152059555054, "step": 927 }, { "epoch": 2.25, "grad_norm": 1.0768405616111052, "importance_ratio": 0.8046875, "kl_div": -0.1038859635591507, "kl_div_neg": -0.21953067183494568, "kl_div_sft": 0.011758743785321712, "learning_rate": 5.040783034257748e-07, "loss": -0.1842, "ppo_loss": 0.8028955459594727, "sft_loss": 0.02808184176683426, "step": 928 }, { "epoch": 2.25, "grad_norm": 1.0042719193588434, "importance_ratio": 0.734375, "kl_div": -0.3099173307418823, "kl_div_neg": -0.3099173307418823, "learning_rate": 5.024469820554649e-07, "loss": -0.1502, "ppo_loss": 0.800000011920929, "step": 929 }, { "epoch": 2.25, "grad_norm": 0.6970447222415244, "importance_ratio": 0.58203125, "kl_div": -0.25831127166748047, "kl_div_neg": -0.5409226417541504, "kl_div_sft": 0.024300090968608856, "learning_rate": 5.008156606851549e-07, "loss": -0.0752, "ppo_loss": 0.800000011920929, "sft_loss": 0.057892248034477234, "step": 930 }, { "epoch": 2.26, "grad_norm": 0.6302256516107879, "importance_ratio": 0.65625, "kl_div": -0.21493804454803467, "kl_div_neg": -0.4216420352458954, "kl_div_sft": -0.008234056644141674, "learning_rate": 4.99184339314845e-07, "loss": 0.0039, "ppo_loss": 0.800000011920929, "sft_loss": 0.1816994994878769, "step": 931 }, { "epoch": 2.26, "grad_norm": 0.9363894560663912, "importance_ratio": 0.87890625, "kl_div": -0.15596584975719452, "kl_div_neg": -0.38737794756889343, "kl_div_pos": 0.0754462480545044, "learning_rate": 4.975530179445351e-07, "loss": -0.1136, "ppo_loss": -0.13918259739875793, "step": 932 }, { "epoch": 2.26, "grad_norm": 0.9081506529385803, "importance_ratio": 0.64453125, "kl_div": -0.2149648517370224, "kl_div_neg": -0.4411203861236572, "kl_div_sft": 0.011190692894160748, "learning_rate": 4.959216965742251e-07, "loss": -0.065, "ppo_loss": 0.800000011920929, "sft_loss": 0.04278969392180443, "step": 933 }, { "epoch": 2.26, "grad_norm": 1.1448285234307398, "importance_ratio": 0.56640625, "kl_div": -0.5740979909896851, "kl_div_neg": -0.5740979909896851, "learning_rate": 4.942903752039151e-07, "loss": 0.0292, "ppo_loss": 0.800000011920929, "step": 934 }, { "epoch": 2.27, "grad_norm": 0.9074364492269864, "importance_ratio": 0.6484375, "kl_div": -0.22268356382846832, "kl_div_neg": -0.43390703201293945, "kl_div_sft": -0.011460098437964916, "learning_rate": 4.926590538336052e-07, "loss": -0.0228, "ppo_loss": 0.800000011920929, "sft_loss": 0.06586343795061111, "step": 935 }, { "epoch": 2.27, "grad_norm": 0.9816613106724512, "importance_ratio": 0.8828125, "kl_div": -0.13887540996074677, "kl_div_neg": -0.3009295165538788, "kl_div_pos": 0.02317870408296585, "learning_rate": 4.910277324632953e-07, "loss": -0.0443, "ppo_loss": -0.11172470450401306, "step": 936 }, { "epoch": 2.27, "grad_norm": 1.1341153249581237, "kl_div": 0.009604312479496002, "kl_div_sft": 0.009604312479496002, "learning_rate": 4.893964110929853e-07, "loss": 0.0384, "sft_loss": 0.021207697689533234, "step": 937 }, { "epoch": 2.27, "grad_norm": 3.626492817746304, "importance_ratio": 0.81640625, "kl_div": -0.0927228033542633, "kl_div_neg": -0.20264843106269836, "kl_div_sft": 0.017202824354171753, "learning_rate": 4.877650897226753e-07, "loss": -0.1958, "ppo_loss": 0.816565215587616, "sft_loss": 0.06965680420398712, "step": 938 }, { "epoch": 2.28, "grad_norm": 0.8807685280788413, "importance_ratio": 1.0625, "kl_div": 0.0319003164768219, "kl_div_pos": 0.06287212669849396, "kl_div_sft": 0.0009285045089200139, "learning_rate": 4.861337683523654e-07, "loss": 0.0382, "ppo_loss": -1.064890742301941, "sft_loss": 0.09779992699623108, "step": 939 }, { "epoch": 2.28, "grad_norm": 0.6507075685918899, "importance_ratio": 0.61328125, "kl_div": -0.22991566359996796, "kl_div_neg": -0.48832497000694275, "kl_div_sft": 0.028493640944361687, "learning_rate": 4.845024469820555e-07, "loss": -0.0257, "ppo_loss": 0.800000011920929, "sft_loss": 0.04952215030789375, "step": 940 }, { "epoch": 2.28, "grad_norm": 0.7804789743908849, "importance_ratio": 0.671875, "kl_div": -0.40713441371917725, "kl_div_neg": -0.40713441371917725, "learning_rate": 4.828711256117454e-07, "loss": -0.0677, "ppo_loss": 0.800000011920929, "step": 941 }, { "epoch": 2.28, "grad_norm": 0.5817969958081254, "importance_ratio": 0.875, "kl_div": -0.1548028588294983, "kl_div_neg": -0.3525158166885376, "kl_div_pos": 0.04291009157896042, "learning_rate": 4.812398042414355e-07, "loss": -0.0212, "ppo_loss": -0.12192204594612122, "step": 942 }, { "epoch": 2.29, "grad_norm": 1.223707825476559, "kl_div": 0.02316691353917122, "kl_div_sft": 0.02316691353917122, "learning_rate": 4.796084828711256e-07, "loss": 0.0506, "sft_loss": 0.10178062319755554, "step": 943 }, { "epoch": 2.29, "grad_norm": 1.2030046576221893, "importance_ratio": 0.8203125, "kl_div": -0.08802450448274612, "kl_div_pos": -0.19667044281959534, "kl_div_sft": 0.020621435716748238, "learning_rate": 4.779771615008156e-07, "loss": -0.1252, "ppo_loss": -0.8214613199234009, "sft_loss": 0.08459824323654175, "step": 944 }, { "epoch": 2.29, "grad_norm": 1.4469212831546836, "importance_ratio": 1.0234375, "kl_div": 0.013117531314492226, "kl_div_pos": 0.024006225168704987, "kl_div_sft": 0.0022288374602794647, "learning_rate": 4.7634584013050565e-07, "loss": -0.1127, "ppo_loss": -1.024296760559082, "sft_loss": 0.0502149797976017, "step": 945 }, { "epoch": 2.29, "grad_norm": 1.606855475753341, "importance_ratio": 0.7890625, "kl_div": -0.23702040314674377, "kl_div_neg": -0.23702040314674377, "learning_rate": 4.7471451876019574e-07, "loss": -0.0082, "ppo_loss": 0.8120083808898926, "step": 946 }, { "epoch": 2.3, "grad_norm": 1.3974691266793597, "importance_ratio": 0.9296875, "kl_div": -0.08570179343223572, "kl_div_neg": -0.24382445216178894, "kl_div_pos": 0.0724208727478981, "learning_rate": 4.7308319738988577e-07, "loss": -0.1494, "ppo_loss": -0.13755390048027039, "step": 947 }, { "epoch": 2.3, "grad_norm": 1.7760453061004071, "importance_ratio": 0.734375, "kl_div": -0.14622443914413452, "kl_div_neg": -0.3102128803730011, "kl_div_sft": 0.01776399277150631, "learning_rate": 4.714518760195758e-07, "loss": 0.075, "ppo_loss": 0.800000011920929, "sft_loss": 0.020431511104106903, "step": 948 }, { "epoch": 2.3, "grad_norm": 0.9016649740018722, "importance_ratio": 1.0703125, "kl_div": 0.03968197479844093, "kl_div_pos": 0.0646844133734703, "kl_div_sft": 0.01467953436076641, "learning_rate": 4.698205546492659e-07, "loss": -0.0747, "ppo_loss": -1.0668222904205322, "sft_loss": 0.028933806344866753, "step": 949 }, { "epoch": 2.3, "grad_norm": 1.6019154229572639, "importance_ratio": 1.1171875, "kl_div": 0.06266265362501144, "kl_div_pos": 0.10981543362140656, "kl_div_sft": 0.01550986710935831, "learning_rate": 4.6818923327895594e-07, "loss": -0.0774, "ppo_loss": -1.1160720586776733, "sft_loss": 0.1458381563425064, "step": 950 }, { "epoch": 2.31, "grad_norm": 1.6397811899523762, "importance_ratio": 0.75, "kl_div": -0.13818220794200897, "kl_div_neg": -0.2875550389289856, "kl_div_sft": 0.01119061280041933, "learning_rate": 4.66557911908646e-07, "loss": -0.1208, "ppo_loss": 0.800000011920929, "sft_loss": 0.0633588433265686, "step": 951 }, { "epoch": 2.31, "grad_norm": 0.4302715147359019, "importance_ratio": 0.76171875, "kl_div": -0.11986764520406723, "kl_div_neg": -0.27146387100219727, "kl_div_sft": 0.031728580594062805, "learning_rate": 4.6492659053833606e-07, "loss": 0.0822, "ppo_loss": 0.800000011920929, "sft_loss": 0.030951837077736855, "step": 952 }, { "epoch": 2.31, "grad_norm": 0.8463842455091792, "kl_div": 0.02023821882903576, "kl_div_sft": 0.02023821882903576, "learning_rate": 4.632952691680261e-07, "loss": -0.1321, "sft_loss": 0.022637102752923965, "step": 953 }, { "epoch": 2.31, "grad_norm": 2.163751132200089, "importance_ratio": 1.0625, "kl_div": 0.03594045341014862, "kl_div_pos": 0.06286430358886719, "kl_div_sft": 0.009016606025397778, "learning_rate": 4.6166394779771614e-07, "loss": -0.2679, "ppo_loss": -1.0648823976516724, "sft_loss": 0.04815494269132614, "step": 954 }, { "epoch": 2.32, "grad_norm": 1.224781857744496, "importance_ratio": 0.4921875, "kl_div": -0.35339614748954773, "kl_div_neg": -0.7081003189086914, "kl_div_sft": 0.0013080260250717402, "learning_rate": 4.600326264274062e-07, "loss": 0.108, "ppo_loss": 0.800000011920929, "sft_loss": 0.07929542660713196, "step": 955 }, { "epoch": 2.32, "grad_norm": 1.1772556628894841, "importance_ratio": 0.64453125, "kl_div": -0.44552016258239746, "kl_div_neg": -0.44552016258239746, "learning_rate": 4.584013050570962e-07, "loss": 0.001, "ppo_loss": 0.800000011920929, "step": 956 }, { "epoch": 2.32, "grad_norm": 0.5728572612365842, "kl_div": 0.020160946995019913, "kl_div_sft": 0.020160946995019913, "learning_rate": 4.5676998368678625e-07, "loss": -0.0046, "sft_loss": 0.0823456272482872, "step": 957 }, { "epoch": 2.32, "grad_norm": 0.7880825340452763, "importance_ratio": 1.0546875, "kl_div": 0.022984405979514122, "kl_div_pos": 0.055373843759298325, "kl_div_sft": -0.009405032731592655, "learning_rate": 4.5513866231647634e-07, "loss": -0.0303, "ppo_loss": -1.0569356679916382, "sft_loss": 0.09787919372320175, "step": 958 }, { "epoch": 2.32, "grad_norm": 0.59643939435095, "importance_ratio": 1.078125, "kl_div": 0.04588526487350464, "kl_div_pos": 0.07213117927312851, "kl_div_sft": 0.01963934861123562, "learning_rate": 4.535073409461664e-07, "loss": -0.1171, "ppo_loss": -1.0747963190078735, "sft_loss": 0.03999362513422966, "step": 959 }, { "epoch": 2.33, "grad_norm": 0.7536200340791993, "importance_ratio": 0.7421875, "kl_div": -0.13872717320919037, "kl_div_neg": -0.2981773614883423, "kl_div_sft": 0.020723015069961548, "learning_rate": 4.518760195758564e-07, "loss": 0.0493, "ppo_loss": 0.800000011920929, "sft_loss": 0.054769787937402725, "step": 960 }, { "epoch": 2.33, "grad_norm": 0.9574148226866949, "importance_ratio": 1.0625, "kl_div": 0.03498686105012894, "kl_div_pos": 0.058754608035087585, "kl_div_sft": 0.011219117790460587, "learning_rate": 4.5024469820554645e-07, "loss": 0.0674, "ppo_loss": -1.06051504611969, "sft_loss": 0.07081925123929977, "step": 961 }, { "epoch": 2.33, "grad_norm": 1.7923191863604815, "importance_ratio": 0.69921875, "kl_div": -0.18562382459640503, "kl_div_neg": -0.3601547181606293, "kl_div_sft": -0.011092942208051682, "learning_rate": 4.4861337683523654e-07, "loss": -0.1316, "ppo_loss": 0.800000011920929, "sft_loss": 0.052693869918584824, "step": 962 }, { "epoch": 2.33, "grad_norm": 1.6299272808867897, "importance_ratio": 0.71484375, "kl_div": -0.1514943540096283, "kl_div_neg": -0.3330497741699219, "kl_div_sft": 0.03006105124950409, "learning_rate": 4.469820554649266e-07, "loss": -0.1489, "ppo_loss": 0.800000011920929, "sft_loss": 0.017525294795632362, "step": 963 }, { "epoch": 2.34, "grad_norm": 0.7107717299814393, "importance_ratio": 0.7734375, "kl_div": -0.13310004770755768, "kl_div_neg": -0.25532862544059753, "kl_div_sft": -0.010871472768485546, "learning_rate": 4.453507340946166e-07, "loss": -0.0232, "ppo_loss": 0.800000011920929, "sft_loss": 0.12232664972543716, "step": 964 }, { "epoch": 2.34, "grad_norm": 0.665382845354885, "importance_ratio": 0.78125, "kl_div": -0.3159344792366028, "kl_div_neg": -0.6828457117080688, "kl_div_pos": 0.05097677558660507, "learning_rate": 4.437194127243067e-07, "loss": -0.0329, "ppo_loss": -0.12614920735359192, "step": 965 }, { "epoch": 2.34, "grad_norm": 3.6051859801594266, "kl_div": 0.020080842077732086, "kl_div_sft": 0.020080842077732086, "learning_rate": 4.4208809135399674e-07, "loss": -0.1174, "sft_loss": 0.07262429594993591, "step": 966 }, { "epoch": 2.34, "grad_norm": 0.9211671342162377, "importance_ratio": 0.8671875, "kl_div": -0.15479469299316406, "kl_div_neg": -0.31137746572494507, "kl_div_pos": 0.0017880933592095971, "learning_rate": 4.404567699836867e-07, "loss": 0.1204, "ppo_loss": -0.10089483857154846, "step": 967 }, { "epoch": 2.35, "grad_norm": 0.8084663991109114, "importance_ratio": 0.89453125, "kl_div": -0.053698454052209854, "kl_div_pos": -0.1113470196723938, "kl_div_sft": 0.003950112033635378, "learning_rate": 4.388254486133768e-07, "loss": -0.0354, "ppo_loss": -0.8946282863616943, "sft_loss": 0.05712110549211502, "step": 968 }, { "epoch": 2.35, "grad_norm": 0.9693344568152478, "importance_ratio": 0.95703125, "kl_div": -0.04780184105038643, "kl_div_pos": -0.04780184105038643, "learning_rate": 4.3719412724306685e-07, "loss": -0.0423, "ppo_loss": -0.9561375975608826, "step": 969 }, { "epoch": 2.35, "grad_norm": 1.8175507252159182, "importance_ratio": 0.6640625, "kl_div": -0.2046709656715393, "kl_div_neg": -0.40851375460624695, "kl_div_sft": -0.0008281635236926377, "learning_rate": 4.355628058727569e-07, "loss": 0.0932, "ppo_loss": 0.800000011920929, "sft_loss": 0.04318346455693245, "step": 970 }, { "epoch": 2.35, "grad_norm": 0.6528844362965035, "kl_div": 0.014403751119971275, "kl_div_sft": 0.014403751119971275, "learning_rate": 4.33931484502447e-07, "loss": -0.0675, "sft_loss": 0.05766864866018295, "step": 971 }, { "epoch": 2.36, "grad_norm": 1.578720584695023, "importance_ratio": 1.0234375, "kl_div": 0.020572490990161896, "kl_div_pos": 0.025049732998013496, "kl_div_sft": 0.016095248982310295, "learning_rate": 4.32300163132137e-07, "loss": -0.2578, "ppo_loss": -1.025366187095642, "sft_loss": 0.09186916053295135, "step": 972 }, { "epoch": 2.36, "grad_norm": 1.0925616757160808, "importance_ratio": 0.85546875, "kl_div": -0.1612037867307663, "kl_div_neg": -0.2649836242198944, "kl_div_pos": -0.05742394179105759, "learning_rate": 4.3066884176182705e-07, "loss": -0.032, "ppo_loss": -0.0720968246459961, "step": 973 }, { "epoch": 2.36, "grad_norm": 1.4997664905626473, "importance_ratio": 0.82421875, "kl_div": -0.2255844920873642, "kl_div_neg": -0.4845183491706848, "kl_div_pos": 0.03334937244653702, "learning_rate": 4.290375203915171e-07, "loss": 0.0245, "ppo_loss": -0.11695578694343567, "step": 974 }, { "epoch": 2.36, "grad_norm": 1.6696932809091711, "importance_ratio": 0.69921875, "kl_div": -0.17535261809825897, "kl_div_neg": -0.3550969362258911, "kl_div_sft": 0.004391703754663467, "learning_rate": 4.274061990212072e-07, "loss": -0.0693, "ppo_loss": 0.800000011920929, "sft_loss": 0.04276962950825691, "step": 975 }, { "epoch": 2.37, "grad_norm": 0.9145076678409796, "importance_ratio": 1.0625, "kl_div": 0.05917125195264816, "kl_div_pos": 0.05917125195264816, "learning_rate": 4.257748776508972e-07, "loss": -0.1431, "ppo_loss": -1.0609924793243408, "step": 976 }, { "epoch": 2.37, "grad_norm": 0.7626321646793529, "importance_ratio": 1.046875, "kl_div": 0.025566166266798973, "kl_div_pos": 0.043650005012750626, "kl_div_sft": 0.007482328452169895, "learning_rate": 4.2414355628058725e-07, "loss": -0.0223, "ppo_loss": -1.04461669921875, "sft_loss": 0.06351668387651443, "step": 977 }, { "epoch": 2.37, "grad_norm": 0.8729367453390997, "importance_ratio": 0.80078125, "kl_div": -0.22225841879844666, "kl_div_neg": -0.22804026305675507, "kl_div_pos": -0.21647655963897705, "learning_rate": 4.2251223491027734e-07, "loss": -0.0014, "ppo_loss": -0.0026757121086120605, "step": 978 }, { "epoch": 2.37, "grad_norm": 1.016882133060283, "importance_ratio": 0.875, "kl_div": -0.15614619851112366, "kl_div_neg": -0.3702617883682251, "kl_div_pos": 0.05796937644481659, "learning_rate": 4.2088091353996733e-07, "loss": -0.0115, "ppo_loss": -0.12984129786491394, "step": 979 }, { "epoch": 2.38, "grad_norm": 0.7823746025595179, "importance_ratio": 0.54296875, "kl_div": -0.30497926473617554, "kl_div_neg": -0.6085367798805237, "kl_div_sft": -0.0014217606512829661, "learning_rate": 4.1924959216965737e-07, "loss": 0.0263, "ppo_loss": 0.800000011920929, "sft_loss": 0.021424463018774986, "step": 980 }, { "epoch": 2.38, "grad_norm": 0.8102134294481221, "kl_div": 0.012248167768120766, "kl_div_sft": 0.012248167768120766, "learning_rate": 4.1761827079934746e-07, "loss": -0.0891, "sft_loss": 0.04203151911497116, "step": 981 }, { "epoch": 2.38, "grad_norm": 0.678118110656901, "importance_ratio": 0.72265625, "kl_div": -0.15509912371635437, "kl_div_neg": -0.32470399141311646, "kl_div_sft": 0.014505734667181969, "learning_rate": 4.159869494290375e-07, "loss": -0.1236, "ppo_loss": 0.800000011920929, "sft_loss": 0.0410248227417469, "step": 982 }, { "epoch": 2.38, "grad_norm": 0.9372623142299381, "importance_ratio": 1.0859375, "kl_div": 0.04024956002831459, "kl_div_pos": 0.08440776914358139, "kl_div_sft": -0.00390864722430706, "learning_rate": 4.1435562805872753e-07, "loss": -0.0291, "ppo_loss": -1.088072419166565, "sft_loss": 0.04991704225540161, "step": 983 }, { "epoch": 2.39, "grad_norm": 0.817519309839498, "importance_ratio": 0.765625, "kl_div": -0.1263846457004547, "kl_div_neg": -0.2686377465724945, "kl_div_sft": 0.01586846634745598, "learning_rate": 4.127243066884176e-07, "loss": -0.0655, "ppo_loss": 0.800000011920929, "sft_loss": 0.03322933614253998, "step": 984 }, { "epoch": 2.39, "grad_norm": 0.9612455727087088, "importance_ratio": 1.0546875, "kl_div": 0.03542941063642502, "kl_div_pos": 0.05257461965084076, "kl_div_sft": 0.018284201622009277, "learning_rate": 4.1109298531810766e-07, "loss": -0.1122, "ppo_loss": -1.0539811849594116, "sft_loss": 0.04410533607006073, "step": 985 }, { "epoch": 2.39, "grad_norm": 0.8610901103654892, "importance_ratio": 1.0390625, "kl_div": 0.01973757892847061, "kl_div_pos": 0.03967132791876793, "kl_div_sft": -0.00019617100770119578, "learning_rate": 4.094616639477977e-07, "loss": 0.0131, "ppo_loss": -1.0404688119888306, "sft_loss": 0.04647931084036827, "step": 986 }, { "epoch": 2.39, "grad_norm": 0.9737974771817556, "importance_ratio": 1.03125, "kl_div": 0.025196190923452377, "kl_div_pos": 0.027397962287068367, "kl_div_sft": 0.022994421422481537, "learning_rate": 4.078303425774878e-07, "loss": -0.1519, "ppo_loss": -1.0277767181396484, "sft_loss": 0.03373304381966591, "step": 987 }, { "epoch": 2.4, "grad_norm": 1.075184613277687, "importance_ratio": 0.984375, "kl_div": -0.001032222993671894, "kl_div_pos": -0.01664351485669613, "kl_div_sft": 0.01457906886935234, "learning_rate": 4.061990212071778e-07, "loss": -0.1073, "ppo_loss": -0.983494222164154, "sft_loss": 0.012189331464469433, "step": 988 }, { "epoch": 2.4, "grad_norm": 0.8159042713378422, "importance_ratio": 0.671875, "kl_div": -0.18853308260440826, "kl_div_neg": -0.3979014456272125, "kl_div_sft": 0.02083529159426689, "learning_rate": 4.0456769983686786e-07, "loss": 0.0038, "ppo_loss": 0.800000011920929, "sft_loss": 0.046729568392038345, "step": 989 }, { "epoch": 2.4, "grad_norm": 0.7626515081625942, "kl_div": 0.019913557916879654, "kl_div_sft": 0.019913557916879654, "learning_rate": 4.029363784665579e-07, "loss": 0.04, "sft_loss": 0.09075594693422318, "step": 990 }, { "epoch": 2.4, "grad_norm": 0.945492120497141, "kl_div": 0.015917502343654633, "kl_div_sft": 0.015917502343654633, "learning_rate": 4.0130505709624793e-07, "loss": -0.1035, "sft_loss": 0.0952652245759964, "step": 991 }, { "epoch": 2.4, "grad_norm": 0.9477742510837185, "importance_ratio": 0.6484375, "kl_div": -0.21832098066806793, "kl_div_neg": -0.43150076270103455, "kl_div_sft": -0.005141206085681915, "learning_rate": 3.9967373572593797e-07, "loss": 0.0258, "ppo_loss": 0.800000011920929, "sft_loss": 0.13462142646312714, "step": 992 }, { "epoch": 2.41, "grad_norm": 0.4865183936814369, "importance_ratio": 0.69140625, "kl_div": -0.1786474585533142, "kl_div_neg": -0.369859904050827, "kl_div_sft": 0.012564979493618011, "learning_rate": 3.98042414355628e-07, "loss": -0.0915, "ppo_loss": 0.800000011920929, "sft_loss": 0.041070904582738876, "step": 993 }, { "epoch": 2.41, "grad_norm": 1.2837001819100236, "importance_ratio": 0.796875, "kl_div": -0.1052025631070137, "kl_div_neg": -0.2293727695941925, "kl_div_sft": 0.0189676433801651, "learning_rate": 3.964110929853181e-07, "loss": 0.0865, "ppo_loss": 0.800000011920929, "sft_loss": 0.031797245144844055, "step": 994 }, { "epoch": 2.41, "grad_norm": 0.6600711022715781, "importance_ratio": 0.80078125, "kl_div": -0.10373440384864807, "kl_div_neg": -0.22047407925128937, "kl_div_sft": 0.013005265966057777, "learning_rate": 3.9477977161500813e-07, "loss": -0.168, "ppo_loss": 0.8021385073661804, "sft_loss": 0.042339082807302475, "step": 995 }, { "epoch": 2.41, "grad_norm": 1.0313396992831134, "importance_ratio": 1.03125, "kl_div": 0.01869693025946617, "kl_div_pos": 0.03447434678673744, "kl_div_sft": 0.002919515362009406, "learning_rate": 3.9314845024469817e-07, "loss": -0.0382, "ppo_loss": -1.0350754261016846, "sft_loss": 0.06383674591779709, "step": 996 }, { "epoch": 2.42, "grad_norm": 1.0082966432648142, "importance_ratio": 0.60546875, "kl_div": -0.23840777575969696, "kl_div_neg": -0.5040940642356873, "kl_div_sft": 0.027278510853648186, "learning_rate": 3.9151712887438826e-07, "loss": -0.1549, "ppo_loss": 0.800000011920929, "sft_loss": 0.028871726244688034, "step": 997 }, { "epoch": 2.42, "grad_norm": 0.8104968687511205, "kl_div": 0.005753981880843639, "kl_div_sft": 0.005753981880843639, "learning_rate": 3.898858075040783e-07, "loss": -0.011, "sft_loss": 0.058913350105285645, "step": 998 }, { "epoch": 2.42, "grad_norm": 1.005984222663792, "importance_ratio": 0.78125, "kl_div": -0.2435922473669052, "kl_div_neg": -0.2435922473669052, "learning_rate": 3.8825448613376833e-07, "loss": 0.0712, "ppo_loss": 0.803253173828125, "step": 999 }, { "epoch": 2.42, "grad_norm": 0.8981189660683943, "kl_div": 0.013707583770155907, "kl_div_sft": 0.013707583770155907, "learning_rate": 3.866231647634584e-07, "loss": 0.0675, "sft_loss": 0.06317710131406784, "step": 1000 }, { "epoch": 2.43, "grad_norm": 3.1582724257528825, "importance_ratio": 1.0625, "kl_div": 0.03967162221670151, "kl_div_pos": 0.057782385498285294, "kl_div_sft": 0.02156086266040802, "learning_rate": 3.8499184339314846e-07, "loss": 0.1217, "ppo_loss": -1.0594844818115234, "sft_loss": 0.05303584784269333, "step": 1001 }, { "epoch": 2.43, "grad_norm": 0.8169442188499416, "importance_ratio": 0.6171875, "kl_div": -0.2412409782409668, "kl_div_neg": -0.48035579919815063, "kl_div_sft": -0.0021261433139443398, "learning_rate": 3.833605220228385e-07, "loss": -0.1994, "ppo_loss": 0.800000011920929, "sft_loss": 0.051242586225271225, "step": 1002 }, { "epoch": 2.43, "grad_norm": 0.8600466790949012, "kl_div": 0.00573944766074419, "kl_div_sft": 0.00573944766074419, "learning_rate": 3.817292006525285e-07, "loss": 0.0015, "sft_loss": 0.12035956233739853, "step": 1003 }, { "epoch": 2.43, "grad_norm": 0.7545500067336446, "kl_div": 0.003994662780314684, "kl_div_sft": 0.003994662780314684, "learning_rate": 3.8009787928221857e-07, "loss": 0.0088, "sft_loss": 0.09922191500663757, "step": 1004 }, { "epoch": 2.44, "grad_norm": 0.9927936414561115, "importance_ratio": 1.0546875, "kl_div": 0.028940679505467415, "kl_div_pos": 0.05019991099834442, "kl_div_sft": 0.007681448478251696, "learning_rate": 3.784665579119086e-07, "loss": -0.0966, "ppo_loss": -1.0514812469482422, "sft_loss": 0.03251166641712189, "step": 1005 }, { "epoch": 2.44, "grad_norm": 0.5373069261166618, "kl_div": 0.0025871756952255964, "kl_div_sft": 0.0025871756952255964, "learning_rate": 3.7683523654159865e-07, "loss": -0.0637, "sft_loss": 0.07286077737808228, "step": 1006 }, { "epoch": 2.44, "grad_norm": 1.1399458862310774, "kl_div": -0.01982680708169937, "kl_div_sft": -0.01982680708169937, "learning_rate": 3.7520391517128874e-07, "loss": -0.0391, "sft_loss": 0.10723453760147095, "step": 1007 }, { "epoch": 2.44, "grad_norm": 0.7950012017486894, "importance_ratio": 1.0546875, "kl_div": 0.023592008277773857, "kl_div_pos": 0.049902159720659256, "kl_div_sft": -0.00271814176812768, "learning_rate": 3.7357259380097877e-07, "loss": -0.0217, "ppo_loss": -1.0511683225631714, "sft_loss": 0.08355196565389633, "step": 1008 }, { "epoch": 2.45, "grad_norm": 0.9301867707171533, "importance_ratio": 0.7890625, "kl_div": -0.11879640072584152, "kl_div_neg": -0.2389979511499405, "kl_div_sft": 0.0014051523758098483, "learning_rate": 3.719412724306688e-07, "loss": -0.1237, "ppo_loss": 0.800000011920929, "sft_loss": 0.060970794409513474, "step": 1009 }, { "epoch": 2.45, "grad_norm": 0.8832422071513172, "importance_ratio": 0.70703125, "kl_div": -0.1883264183998108, "kl_div_neg": -0.34558987617492676, "kl_div_sft": -0.031062960624694824, "learning_rate": 3.703099510603589e-07, "loss": -0.1216, "ppo_loss": 0.800000011920929, "sft_loss": 0.09838508069515228, "step": 1010 }, { "epoch": 2.45, "grad_norm": 1.5865526932298137, "kl_div": 0.014200937002897263, "kl_div_sft": 0.014200937002897263, "learning_rate": 3.6867862969004894e-07, "loss": -0.1554, "sft_loss": 0.0974261462688446, "step": 1011 }, { "epoch": 2.45, "grad_norm": 0.6722549650421021, "importance_ratio": 0.7109375, "kl_div": -0.16757439076900482, "kl_div_neg": -0.3384281098842621, "kl_div_sft": 0.003279315773397684, "learning_rate": 3.6704730831973897e-07, "loss": 0.0471, "ppo_loss": 0.800000011920929, "sft_loss": 0.0659431666135788, "step": 1012 }, { "epoch": 2.46, "grad_norm": 0.9591359253759136, "importance_ratio": 0.8359375, "kl_div": -0.21374891698360443, "kl_div_neg": -0.48649612069129944, "kl_div_pos": 0.05899827927350998, "learning_rate": 3.6541598694942906e-07, "loss": -0.075, "ppo_loss": -0.13038673996925354, "step": 1013 }, { "epoch": 2.46, "grad_norm": 0.8733932183629812, "importance_ratio": 0.5625, "kl_div": -0.6281610131263733, "kl_div_neg": -0.6281610131263733, "learning_rate": 3.637846655791191e-07, "loss": -0.0703, "ppo_loss": 0.800000011920929, "step": 1014 }, { "epoch": 2.46, "grad_norm": 1.792351510509404, "importance_ratio": 0.828125, "kl_div": -0.2242492288351059, "kl_div_neg": -0.4955662488937378, "kl_div_pos": 0.047067780047655106, "learning_rate": 3.621533442088091e-07, "loss": 0.0967, "ppo_loss": -0.12409648299217224, "step": 1015 }, { "epoch": 2.46, "grad_norm": 1.0900571333934244, "kl_div": -0.003447586204856634, "kl_div_sft": -0.003447586204856634, "learning_rate": 3.605220228384992e-07, "loss": -0.0276, "sft_loss": 0.0526358038187027, "step": 1016 }, { "epoch": 2.47, "grad_norm": 0.7597449920127731, "importance_ratio": 0.9140625, "kl_div": -0.10268957912921906, "kl_div_neg": -0.257598876953125, "kl_div_pos": 0.05221971124410629, "learning_rate": 3.588907014681892e-07, "loss": 0.0907, "ppo_loss": -0.12680360674858093, "step": 1017 }, { "epoch": 2.47, "grad_norm": 1.5183983038447628, "importance_ratio": 0.8515625, "kl_div": -0.18091082572937012, "kl_div_neg": -0.38382551074028015, "kl_div_pos": 0.022003866732120514, "learning_rate": 3.5725938009787925e-07, "loss": -0.0696, "ppo_loss": -0.11112388968467712, "step": 1018 }, { "epoch": 2.47, "grad_norm": 0.6231943033796007, "importance_ratio": 0.609375, "kl_div": -0.49936342239379883, "kl_div_neg": -0.49936342239379883, "learning_rate": 3.556280587275693e-07, "loss": -0.0034, "ppo_loss": 0.800000011920929, "step": 1019 }, { "epoch": 2.47, "grad_norm": 1.3095852412227011, "kl_div": 0.010510137304663658, "kl_div_sft": 0.010510137304663658, "learning_rate": 3.539967373572594e-07, "loss": -0.2417, "sft_loss": 0.10299015045166016, "step": 1020 }, { "epoch": 2.48, "grad_norm": 1.312973572807166, "importance_ratio": 1.0078125, "kl_div": 0.01436000969260931, "kl_div_pos": 0.00994068942964077, "kl_div_sft": 0.01877932995557785, "learning_rate": 3.523654159869494e-07, "loss": 0.0862, "ppo_loss": -1.0099903345108032, "sft_loss": 0.010615055449306965, "step": 1021 }, { "epoch": 2.48, "grad_norm": 0.8203770929972757, "kl_div": 0.018396597355604172, "kl_div_sft": 0.018396597355604172, "learning_rate": 3.5073409461663945e-07, "loss": -0.0526, "sft_loss": 0.06409763544797897, "step": 1022 }, { "epoch": 2.48, "grad_norm": 1.5235168436369382, "importance_ratio": 0.78515625, "kl_div": -0.11560414731502533, "kl_div_neg": -0.24418854713439941, "kl_div_sft": 0.012980245985090733, "learning_rate": 3.4910277324632954e-07, "loss": 0.0121, "ppo_loss": 0.800000011920929, "sft_loss": 0.16275613009929657, "step": 1023 }, { "epoch": 2.48, "grad_norm": 0.8458514006490623, "importance_ratio": 0.75390625, "kl_div": -0.13325250148773193, "kl_div_neg": -0.2842373847961426, "kl_div_sft": 0.017732389271259308, "learning_rate": 3.474714518760196e-07, "loss": 0.0991, "ppo_loss": 0.800000011920929, "sft_loss": 0.029525436460971832, "step": 1024 }, { "epoch": 2.48, "grad_norm": 0.8619704500585501, "kl_div": 0.031707629561424255, "kl_div_sft": 0.031707629561424255, "learning_rate": 3.458401305057096e-07, "loss": -0.1025, "sft_loss": 0.028296634554862976, "step": 1025 }, { "epoch": 2.49, "grad_norm": 0.7322028481099582, "importance_ratio": 1.0625, "kl_div": 0.05670029670000076, "kl_div_pos": 0.05670029670000076, "learning_rate": 3.442088091353997e-07, "loss": -0.0422, "ppo_loss": -1.058358907699585, "step": 1026 }, { "epoch": 2.49, "grad_norm": 1.0721780154043024, "importance_ratio": 1.0546875, "kl_div": 0.031154815107584, "kl_div_pos": 0.053050797432661057, "kl_div_sft": 0.009258833713829517, "learning_rate": 3.425774877650897e-07, "loss": -0.0309, "ppo_loss": -1.05448317527771, "sft_loss": 0.05470652133226395, "step": 1027 }, { "epoch": 2.49, "grad_norm": 1.0051672470840838, "kl_div": 0.010653335601091385, "kl_div_sft": 0.010653335601091385, "learning_rate": 3.409461663947797e-07, "loss": -0.0996, "sft_loss": 0.0735035166144371, "step": 1028 }, { "epoch": 2.49, "grad_norm": 0.8605183018716919, "importance_ratio": 0.7890625, "kl_div": -0.28868263959884644, "kl_div_neg": -0.6043957471847534, "kl_div_pos": 0.0270304586738348, "learning_rate": 3.393148450244698e-07, "loss": -0.014, "ppo_loss": -0.11369958519935608, "step": 1029 }, { "epoch": 2.5, "grad_norm": 0.7443510944361741, "kl_div": 0.005600485950708389, "kl_div_sft": 0.005600485950708389, "learning_rate": 3.3768352365415985e-07, "loss": -0.1211, "sft_loss": 0.0647912248969078, "step": 1030 }, { "epoch": 2.5, "grad_norm": 0.5907332427125761, "importance_ratio": 0.703125, "kl_div": -0.16875949501991272, "kl_div_neg": -0.3517286479473114, "kl_div_sft": 0.014209664426743984, "learning_rate": 3.360522022838499e-07, "loss": -0.0071, "ppo_loss": 0.800000011920929, "sft_loss": 0.055921465158462524, "step": 1031 }, { "epoch": 2.5, "grad_norm": 0.9959272058419946, "importance_ratio": 1.0703125, "kl_div": 0.04712774604558945, "kl_div_pos": 0.06734217703342438, "kl_div_sft": 0.026913316920399666, "learning_rate": 3.344208809135399e-07, "loss": -0.1393, "ppo_loss": -1.0696613788604736, "sft_loss": 0.022098220884799957, "step": 1032 }, { "epoch": 2.5, "grad_norm": 0.8081967028606536, "kl_div": 0.024792511016130447, "kl_div_sft": 0.024792511016130447, "learning_rate": 3.3278955954323e-07, "loss": -0.0203, "sft_loss": 0.060120031237602234, "step": 1033 }, { "epoch": 2.51, "grad_norm": 0.7387214438249187, "importance_ratio": 1.0, "kl_div": 0.012671315111219883, "kl_div_pos": 0.0017745542572811246, "kl_div_sft": 0.02356807515025139, "learning_rate": 3.3115823817292005e-07, "loss": -0.1071, "ppo_loss": -1.0017762184143066, "sft_loss": 0.015665153041481972, "step": 1034 }, { "epoch": 2.51, "grad_norm": 2.323499757488017, "importance_ratio": 1.046875, "kl_div": 0.03274444863200188, "kl_div_pos": 0.04232431575655937, "kl_div_sft": 0.023164579644799232, "learning_rate": 3.295269168026101e-07, "loss": -0.0592, "ppo_loss": -1.043232798576355, "sft_loss": 0.058782611042261124, "step": 1035 }, { "epoch": 2.51, "grad_norm": 2.0824484916993877, "importance_ratio": 0.765625, "kl_div": -0.13460354506969452, "kl_div_neg": -0.2660696506500244, "kl_div_sft": -0.0031374366953969, "learning_rate": 3.278955954323002e-07, "loss": -0.0027, "ppo_loss": 0.800000011920929, "sft_loss": 0.1042267307639122, "step": 1036 }, { "epoch": 2.51, "grad_norm": 1.555489951218678, "importance_ratio": 0.73828125, "kl_div": -0.16115102171897888, "kl_div_neg": -0.30402418971061707, "kl_div_sft": -0.018277853727340698, "learning_rate": 3.262642740619902e-07, "loss": 0.0736, "ppo_loss": 0.800000011920929, "sft_loss": 0.1204533502459526, "step": 1037 }, { "epoch": 2.52, "grad_norm": 1.556094812216053, "importance_ratio": 1.0546875, "kl_div": 0.04028111696243286, "kl_div_pos": 0.05016597732901573, "kl_div_sft": 0.03039625659584999, "learning_rate": 3.2463295269168025e-07, "loss": -0.0944, "ppo_loss": -1.0514456033706665, "sft_loss": 0.013909861445426941, "step": 1038 }, { "epoch": 2.52, "grad_norm": 0.755530707295099, "importance_ratio": 0.8515625, "kl_div": -0.0725010484457016, "kl_div_neg": -0.16224679350852966, "kl_div_sft": 0.017244696617126465, "learning_rate": 3.230016313213703e-07, "loss": 0.0109, "ppo_loss": 0.8502313494682312, "sft_loss": 0.06347573548555374, "step": 1039 }, { "epoch": 2.52, "grad_norm": 0.5475334563114244, "kl_div": 0.009469101205468178, "kl_div_sft": 0.009469101205468178, "learning_rate": 3.2137030995106033e-07, "loss": -0.0214, "sft_loss": 0.12747272849082947, "step": 1040 }, { "epoch": 2.52, "grad_norm": 1.2012607448545887, "importance_ratio": 1.0390625, "kl_div": 0.01814403384923935, "kl_div_pos": 0.039489831775426865, "kl_div_sft": -0.0032017657067626715, "learning_rate": 3.1973898858075036e-07, "loss": 0.0054, "ppo_loss": -1.0402798652648926, "sft_loss": 0.07425745576620102, "step": 1041 }, { "epoch": 2.53, "grad_norm": 1.2494062921098452, "importance_ratio": 0.5546875, "kl_div": -0.2863028049468994, "kl_div_neg": -0.5887343883514404, "kl_div_sft": 0.01612876169383526, "learning_rate": 3.1810766721044045e-07, "loss": -0.0079, "ppo_loss": 0.800000011920929, "sft_loss": 0.013732396066188812, "step": 1042 }, { "epoch": 2.53, "grad_norm": 0.9357681809049531, "importance_ratio": 0.91796875, "kl_div": -0.0974334329366684, "kl_div_neg": -0.24848219752311707, "kl_div_pos": 0.05361533910036087, "learning_rate": 3.164763458401305e-07, "loss": -0.0336, "ppo_loss": -0.12753930687904358, "step": 1043 }, { "epoch": 2.53, "grad_norm": 1.8262376742824988, "importance_ratio": 1.0625, "kl_div": 0.05723012983798981, "kl_div_pos": 0.05723012983798981, "learning_rate": 3.1484502446982053e-07, "loss": -0.1627, "ppo_loss": -1.058899998664856, "step": 1044 }, { "epoch": 2.53, "grad_norm": 1.1505770313680412, "importance_ratio": 1.0546875, "kl_div": 0.010374104604125023, "kl_div_pos": 0.05278032273054123, "kl_div_sft": -0.032032113522291183, "learning_rate": 3.1321370309951056e-07, "loss": -0.0909, "ppo_loss": -1.0541980266571045, "sft_loss": 0.12327804416418076, "step": 1045 }, { "epoch": 2.54, "grad_norm": 1.1151452772154664, "kl_div": 0.012791233137249947, "kl_div_sft": 0.012791233137249947, "learning_rate": 3.1158238172920065e-07, "loss": -0.0217, "sft_loss": 0.03297749534249306, "step": 1046 }, { "epoch": 2.54, "grad_norm": 1.2764277075361212, "importance_ratio": 0.7265625, "kl_div": -0.15004746615886688, "kl_div_neg": -0.32086265087127686, "kl_div_sft": 0.020767726004123688, "learning_rate": 3.099510603588907e-07, "loss": -0.0545, "ppo_loss": 0.800000011920929, "sft_loss": 0.007463144138455391, "step": 1047 }, { "epoch": 2.54, "grad_norm": 0.6707315251197906, "importance_ratio": 1.015625, "kl_div": 0.013251978904008865, "kl_div_pos": 0.017632799223065376, "kl_div_sft": 0.008871159516274929, "learning_rate": 3.0831973898858073e-07, "loss": -0.0922, "ppo_loss": -1.0177891254425049, "sft_loss": 0.09840093553066254, "step": 1048 }, { "epoch": 2.54, "grad_norm": 0.8942643812802545, "importance_ratio": 0.75, "kl_div": -0.13627244532108307, "kl_div_neg": -0.28906428813934326, "kl_div_sft": 0.016519390046596527, "learning_rate": 3.066884176182708e-07, "loss": -0.1519, "ppo_loss": 0.800000011920929, "sft_loss": 0.028626246377825737, "step": 1049 }, { "epoch": 2.55, "grad_norm": 0.9543023888845963, "importance_ratio": 0.6875, "kl_div": -0.3724897503852844, "kl_div_neg": -0.3724897503852844, "learning_rate": 3.0505709624796086e-07, "loss": -0.0181, "ppo_loss": 0.800000011920929, "step": 1050 }, { "epoch": 2.55, "grad_norm": 0.8109235141261069, "importance_ratio": 1.0546875, "kl_div": 0.04136640205979347, "kl_div_pos": 0.05685226619243622, "kl_div_sft": 0.025880537927150726, "learning_rate": 3.0342577487765084e-07, "loss": -0.1933, "ppo_loss": -1.0584994554519653, "sft_loss": 0.010072030127048492, "step": 1051 }, { "epoch": 2.55, "grad_norm": 1.123338319859818, "importance_ratio": 0.69921875, "kl_div": -0.17182566225528717, "kl_div_neg": -0.3587484657764435, "kl_div_sft": 0.01509714126586914, "learning_rate": 3.0179445350734093e-07, "loss": -0.2156, "ppo_loss": 0.800000011920929, "sft_loss": 0.04835548251867294, "step": 1052 }, { "epoch": 2.55, "grad_norm": 0.7801343963453246, "importance_ratio": 1.0390625, "kl_div": 0.015188152901828289, "kl_div_pos": 0.041032060980796814, "kl_div_sft": -0.010655755177140236, "learning_rate": 3.0016313213703097e-07, "loss": -0.1176, "ppo_loss": -1.041885495185852, "sft_loss": 0.08199844509363174, "step": 1053 }, { "epoch": 2.56, "grad_norm": 0.7988252150055876, "importance_ratio": 0.7265625, "kl_div": -0.3288571238517761, "kl_div_neg": -0.3288571238517761, "learning_rate": 2.98531810766721e-07, "loss": -0.0802, "ppo_loss": 0.8001587390899658, "step": 1054 }, { "epoch": 2.56, "grad_norm": 1.2844207421392433, "importance_ratio": 1.0625, "kl_div": 0.03747943416237831, "kl_div_pos": 0.06310385465621948, "kl_div_sft": 0.011855010874569416, "learning_rate": 2.969004893964111e-07, "loss": -0.1941, "ppo_loss": -1.065137505531311, "sft_loss": 0.03686026483774185, "step": 1055 }, { "epoch": 2.56, "grad_norm": 1.375427916437548, "importance_ratio": 0.68359375, "kl_div": -0.18899478018283844, "kl_div_neg": -0.3808421492576599, "kl_div_sft": 0.002852577017620206, "learning_rate": 2.9526916802610113e-07, "loss": -0.0285, "ppo_loss": 0.800000011920929, "sft_loss": 0.045148205012083054, "step": 1056 }, { "epoch": 2.56, "grad_norm": 0.9253707812763302, "importance_ratio": 0.60546875, "kl_div": -0.24174775183200836, "kl_div_neg": -0.5002955794334412, "kl_div_sft": 0.01680006831884384, "learning_rate": 2.9363784665579117e-07, "loss": -0.0444, "ppo_loss": 0.800000011920929, "sft_loss": 0.08801782131195068, "step": 1057 }, { "epoch": 2.56, "grad_norm": 0.8836541341559392, "importance_ratio": 1.0625, "kl_div": 0.025106048211455345, "kl_div_pos": 0.057671915739774704, "kl_div_sft": -0.007459820713847876, "learning_rate": 2.9200652528548126e-07, "loss": 0.1365, "ppo_loss": -1.0593674182891846, "sft_loss": 0.13191211223602295, "step": 1058 }, { "epoch": 2.57, "grad_norm": 1.2667644693027453, "importance_ratio": 0.6796875, "kl_div": -0.3990887999534607, "kl_div_neg": -0.3990887999534607, "learning_rate": 2.903752039151713e-07, "loss": -0.0013, "ppo_loss": 0.800000011920929, "step": 1059 }, { "epoch": 2.57, "grad_norm": 0.9833057590971988, "importance_ratio": 1.15625, "kl_div": 0.06241554394364357, "kl_div_pos": 0.14272110164165497, "kl_div_sft": -0.017890015617012978, "learning_rate": 2.8874388254486133e-07, "loss": -0.0021, "ppo_loss": -1.1534080505371094, "sft_loss": 0.14776094257831573, "step": 1060 }, { "epoch": 2.57, "grad_norm": 1.0756397627992749, "importance_ratio": 0.66796875, "kl_div": -0.207151859998703, "kl_div_neg": -0.40292197465896606, "kl_div_sft": -0.01138173695653677, "learning_rate": 2.8711256117455137e-07, "loss": -0.0806, "ppo_loss": 0.800000011920929, "sft_loss": 0.05718496814370155, "step": 1061 }, { "epoch": 2.57, "grad_norm": 1.2466782302286152, "importance_ratio": 1.0078125, "kl_div": 0.00763517152518034, "kl_div_pos": 0.00763517152518034, "learning_rate": 2.8548123980424146e-07, "loss": -0.0701, "ppo_loss": -1.007724404335022, "step": 1062 }, { "epoch": 2.58, "grad_norm": 0.9938324336283981, "importance_ratio": 0.66796875, "kl_div": -0.19250355660915375, "kl_div_neg": -0.4044356346130371, "kl_div_sft": 0.019428521394729614, "learning_rate": 2.8384991843393144e-07, "loss": 0.0077, "ppo_loss": 0.800000011920929, "sft_loss": 0.03204822912812233, "step": 1063 }, { "epoch": 2.58, "grad_norm": 0.8122684442240617, "kl_div": -0.011821310967206955, "kl_div_sft": -0.011821310967206955, "learning_rate": 2.822185970636215e-07, "loss": -0.1022, "sft_loss": 0.11833268404006958, "step": 1064 }, { "epoch": 2.58, "grad_norm": 1.0927651467234645, "importance_ratio": 0.796875, "kl_div": -0.09760934859514236, "kl_div_neg": -0.22511491179466248, "kl_div_sft": 0.0298962090164423, "learning_rate": 2.8058727569331157e-07, "loss": -0.0521, "ppo_loss": 0.800000011920929, "sft_loss": 0.047688040882349014, "step": 1065 }, { "epoch": 2.58, "grad_norm": 0.9446047190323583, "kl_div": 0.001321147195994854, "kl_div_sft": 0.001321147195994854, "learning_rate": 2.789559543230016e-07, "loss": -0.0115, "sft_loss": 0.051537930965423584, "step": 1066 }, { "epoch": 2.59, "grad_norm": 0.7045568087018107, "importance_ratio": 1.03125, "kl_div": 0.03264131397008896, "kl_div_pos": 0.03413437679409981, "kl_div_sft": 0.03114825300872326, "learning_rate": 2.7732463295269164e-07, "loss": -0.0013, "ppo_loss": -1.0347236394882202, "sft_loss": 0.025318821892142296, "step": 1067 }, { "epoch": 2.59, "grad_norm": 0.7338484641819885, "importance_ratio": 1.0390625, "kl_div": 0.030414361506700516, "kl_div_pos": 0.04188847914338112, "kl_div_sft": 0.018940245732665062, "learning_rate": 2.7569331158238173e-07, "loss": -0.115, "ppo_loss": -1.0427781343460083, "sft_loss": 0.05671351030468941, "step": 1068 }, { "epoch": 2.59, "grad_norm": 0.7268600674880552, "kl_div": 0.007879311218857765, "kl_div_sft": 0.007879311218857765, "learning_rate": 2.7406199021207177e-07, "loss": -0.1696, "sft_loss": 0.06098397448658943, "step": 1069 }, { "epoch": 2.59, "grad_norm": 1.1483685738776235, "importance_ratio": 1.015625, "kl_div": 0.007197917439043522, "kl_div_pos": 0.017390882596373558, "kl_div_sft": -0.0029950477182865143, "learning_rate": 2.724306688417618e-07, "loss": -0.1216, "ppo_loss": -1.0175429582595825, "sft_loss": 0.1088942214846611, "step": 1070 }, { "epoch": 2.6, "grad_norm": 0.8431710977020026, "importance_ratio": 1.0546875, "kl_div": 0.03426618501543999, "kl_div_pos": 0.04973319172859192, "kl_div_sft": 0.018799176439642906, "learning_rate": 2.707993474714519e-07, "loss": -0.0724, "ppo_loss": -1.0509907007217407, "sft_loss": 0.04126371070742607, "step": 1071 }, { "epoch": 2.6, "grad_norm": 1.4190722498656456, "importance_ratio": 0.625, "kl_div": -0.23403041064739227, "kl_div_neg": -0.47112327814102173, "kl_div_sft": 0.0030624489299952984, "learning_rate": 2.6916802610114193e-07, "loss": -0.0026, "ppo_loss": 0.800000011920929, "sft_loss": 0.13048440217971802, "step": 1072 }, { "epoch": 2.6, "grad_norm": 0.7858382342466547, "importance_ratio": 0.93359375, "kl_div": -0.07867449522018433, "kl_div_neg": -0.21523505449295044, "kl_div_pos": 0.05788605660200119, "learning_rate": 2.6753670473083197e-07, "loss": -0.0918, "ppo_loss": -0.12662121653556824, "step": 1073 }, { "epoch": 2.6, "grad_norm": 1.6482051305925476, "kl_div": 0.002539373002946377, "kl_div_sft": 0.002539373002946377, "learning_rate": 2.65905383360522e-07, "loss": -0.1219, "sft_loss": 0.08029313385486603, "step": 1074 }, { "epoch": 2.61, "grad_norm": 1.0317026936449962, "importance_ratio": 1.0390625, "kl_div": 0.025399165228009224, "kl_div_pos": 0.04176368936896324, "kl_div_sft": 0.009034640155732632, "learning_rate": 2.6427406199021205e-07, "loss": -0.0674, "ppo_loss": -1.0426480770111084, "sft_loss": 0.03283477947115898, "step": 1075 }, { "epoch": 2.61, "grad_norm": 1.0097162762209868, "importance_ratio": 1.046875, "kl_div": 0.038147445768117905, "kl_div_pos": 0.04772398620843887, "kl_div_sft": 0.028570905327796936, "learning_rate": 2.626427406199021e-07, "loss": -0.113, "ppo_loss": -1.0488810539245605, "sft_loss": 0.06863260269165039, "step": 1076 }, { "epoch": 2.61, "grad_norm": 0.719903352264217, "kl_div": -0.0003017587587237358, "kl_div_sft": -0.0003017587587237358, "learning_rate": 2.610114192495921e-07, "loss": 0.0407, "sft_loss": 0.06353778392076492, "step": 1077 }, { "epoch": 2.61, "grad_norm": 0.8777904973000497, "importance_ratio": 0.66796875, "kl_div": -0.1905461549758911, "kl_div_neg": -0.4016769528388977, "kl_div_sft": 0.020584631711244583, "learning_rate": 2.593800978792822e-07, "loss": -0.0008, "ppo_loss": 0.800000011920929, "sft_loss": 0.06284645944833755, "step": 1078 }, { "epoch": 2.62, "grad_norm": 1.0830187646502971, "importance_ratio": 0.82421875, "kl_div": -0.08450426906347275, "kl_div_neg": -0.191858172416687, "kl_div_sft": 0.022849630564451218, "learning_rate": 2.5774877650897225e-07, "loss": 0.0971, "ppo_loss": 0.8254238963127136, "sft_loss": 0.10808595269918442, "step": 1079 }, { "epoch": 2.62, "grad_norm": 0.7372952694322946, "importance_ratio": 0.73046875, "kl_div": -0.15658007562160492, "kl_div_neg": -0.31368288397789, "kl_div_sft": 0.000522738613653928, "learning_rate": 2.561174551386623e-07, "loss": -0.1206, "ppo_loss": 0.800000011920929, "sft_loss": 0.053254082798957825, "step": 1080 }, { "epoch": 2.62, "grad_norm": 1.3891624732241223, "importance_ratio": 0.78515625, "kl_div": -0.24391339719295502, "kl_div_neg": -0.24391339719295502, "learning_rate": 2.544861337683524e-07, "loss": 0.0536, "ppo_loss": 0.8239413499832153, "step": 1081 }, { "epoch": 2.62, "grad_norm": 1.0054647616030914, "kl_div": 0.019152436405420303, "kl_div_sft": 0.019152436405420303, "learning_rate": 2.528548123980424e-07, "loss": -0.0373, "sft_loss": 0.0417667031288147, "step": 1082 }, { "epoch": 2.63, "grad_norm": 0.7566992851682453, "importance_ratio": 1.0546875, "kl_div": 0.04358925670385361, "kl_div_pos": 0.05337366834282875, "kl_div_sft": 0.03380484879016876, "learning_rate": 2.5122349102773245e-07, "loss": -0.0952, "ppo_loss": -1.0548237562179565, "sft_loss": 0.025316337123513222, "step": 1083 }, { "epoch": 2.63, "grad_norm": 1.101774046585049, "importance_ratio": 0.7890625, "kl_div": -0.11028125882148743, "kl_div_neg": -0.23718704283237457, "kl_div_sft": 0.01662452146410942, "learning_rate": 2.495921696574225e-07, "loss": 0.0535, "ppo_loss": 0.800000011920929, "sft_loss": 0.023163825273513794, "step": 1084 }, { "epoch": 2.63, "grad_norm": 3.775540541796825, "importance_ratio": 1.03125, "kl_div": 0.03306008130311966, "kl_div_pos": 0.03152577951550484, "kl_div_sft": 0.03459438309073448, "learning_rate": 2.479608482871126e-07, "loss": -0.0828, "ppo_loss": -1.0320279598236084, "sft_loss": 0.06501305103302002, "step": 1085 }, { "epoch": 2.63, "grad_norm": 1.3828568101238494, "importance_ratio": 0.53515625, "kl_div": -0.3016151189804077, "kl_div_neg": -0.6263535022735596, "kl_div_sft": 0.023123271763324738, "learning_rate": 2.463295269168026e-07, "loss": 0.0381, "ppo_loss": 0.800000011920929, "sft_loss": 0.024348098784685135, "step": 1086 }, { "epoch": 2.64, "grad_norm": 0.886695424583937, "importance_ratio": 0.63671875, "kl_div": -0.22029554843902588, "kl_div_neg": -0.45359790325164795, "kl_div_sft": 0.013006805442273617, "learning_rate": 2.4469820554649265e-07, "loss": -0.0932, "ppo_loss": 0.800000011920929, "sft_loss": 0.028095100075006485, "step": 1087 }, { "epoch": 2.64, "grad_norm": 1.9189425169946241, "importance_ratio": 1.0546875, "kl_div": 0.030528422445058823, "kl_div_pos": 0.05292826145887375, "kl_div_sft": 0.008128583431243896, "learning_rate": 2.430668841761827e-07, "loss": -0.012, "ppo_loss": -1.0543540716171265, "sft_loss": 0.0633447915315628, "step": 1088 }, { "epoch": 2.64, "grad_norm": 0.5676929460125901, "kl_div": 0.0012860526330769062, "kl_div_sft": 0.0012860526330769062, "learning_rate": 2.414355628058727e-07, "loss": -0.0232, "sft_loss": 0.0832691341638565, "step": 1089 }, { "epoch": 2.64, "grad_norm": 0.6963825136367197, "importance_ratio": 0.71484375, "kl_div": -0.15527908504009247, "kl_div_neg": -0.3331499993801117, "kl_div_sft": 0.02259182743728161, "learning_rate": 2.398042414355628e-07, "loss": -0.1482, "ppo_loss": 0.800000011920929, "sft_loss": 0.03868754953145981, "step": 1090 }, { "epoch": 2.64, "grad_norm": 0.9400337629646386, "importance_ratio": 0.765625, "kl_div": -0.1231977567076683, "kl_div_neg": -0.2665286362171173, "kl_div_sft": 0.020133126527071, "learning_rate": 2.3817292006525282e-07, "loss": 0.006, "ppo_loss": 0.800000011920929, "sft_loss": 0.0708237737417221, "step": 1091 }, { "epoch": 2.65, "grad_norm": 0.925268812040375, "kl_div": 0.008872696198523045, "kl_div_sft": 0.008872696198523045, "learning_rate": 2.3654159869494289e-07, "loss": -0.1638, "sft_loss": 0.05794968456029892, "step": 1092 }, { "epoch": 2.65, "grad_norm": 1.1531785430469386, "importance_ratio": 0.7890625, "kl_div": -0.3047209680080414, "kl_div_neg": -0.6697598099708557, "kl_div_pos": 0.06031789258122444, "learning_rate": 2.3491027732463295e-07, "loss": -0.1041, "ppo_loss": -0.13108709454536438, "step": 1093 }, { "epoch": 2.65, "grad_norm": 0.7584735700520077, "importance_ratio": 0.78515625, "kl_div": -0.12317772209644318, "kl_div_neg": -0.24293836951255798, "kl_div_sft": -0.0034170723520219326, "learning_rate": 2.33278955954323e-07, "loss": -0.0657, "ppo_loss": 0.800000011920929, "sft_loss": 0.06922072917222977, "step": 1094 }, { "epoch": 2.65, "grad_norm": 0.6423110406852908, "importance_ratio": 0.6953125, "kl_div": -0.3657418191432953, "kl_div_neg": -0.3657418191432953, "learning_rate": 2.3164763458401305e-07, "loss": 0.098, "ppo_loss": 0.800000011920929, "step": 1095 }, { "epoch": 2.66, "grad_norm": 0.870868807774345, "kl_div": 0.016433026641607285, "kl_div_sft": 0.016433026641607285, "learning_rate": 2.300163132137031e-07, "loss": -0.0642, "sft_loss": 0.029268991202116013, "step": 1096 }, { "epoch": 2.66, "grad_norm": 0.8481605541448505, "importance_ratio": 1.046875, "kl_div": 0.03731922805309296, "kl_div_pos": 0.043029073625802994, "kl_div_sft": 0.03160938620567322, "learning_rate": 2.2838499184339312e-07, "loss": -0.0741, "ppo_loss": -1.0439682006835938, "sft_loss": 0.012241056188941002, "step": 1097 }, { "epoch": 2.66, "grad_norm": 1.1204147675255658, "importance_ratio": 0.98046875, "kl_div": 0.0020015956833958626, "kl_div_pos": -0.018063034862279892, "kl_div_sft": 0.022066226229071617, "learning_rate": 2.267536704730832e-07, "loss": 0.0903, "ppo_loss": -0.9820991158485413, "sft_loss": 0.12652084231376648, "step": 1098 }, { "epoch": 2.66, "grad_norm": 1.0467702898637867, "importance_ratio": 0.875, "kl_div": -0.15396293997764587, "kl_div_neg": -0.36044037342071533, "kl_div_pos": 0.05251449719071388, "learning_rate": 2.2512234910277323e-07, "loss": -0.0469, "ppo_loss": -0.1269589364528656, "step": 1099 }, { "epoch": 2.67, "grad_norm": 1.2571962161784893, "importance_ratio": 0.74609375, "kl_div": -0.160195454955101, "kl_div_neg": -0.2913189232349396, "kl_div_sft": -0.029071999713778496, "learning_rate": 2.234910277324633e-07, "loss": -0.0143, "ppo_loss": 0.800000011920929, "sft_loss": 0.09783211350440979, "step": 1100 }, { "epoch": 2.67, "grad_norm": 1.1750347010581415, "kl_div": 0.014914116822183132, "kl_div_sft": 0.014914116822183132, "learning_rate": 2.2185970636215335e-07, "loss": -0.0397, "sft_loss": 0.04734306409955025, "step": 1101 }, { "epoch": 2.67, "grad_norm": 0.8239253457495405, "importance_ratio": 1.078125, "kl_div": 0.05389084294438362, "kl_div_pos": 0.07796481251716614, "kl_div_sft": 0.029816875234246254, "learning_rate": 2.2022838499184336e-07, "loss": -0.0211, "ppo_loss": -1.0810847282409668, "sft_loss": 0.018815357238054276, "step": 1102 }, { "epoch": 2.67, "grad_norm": 0.8150976178735944, "importance_ratio": 0.7578125, "kl_div": -0.2764035165309906, "kl_div_neg": -0.2764035165309906, "learning_rate": 2.1859706362153343e-07, "loss": 0.0045, "ppo_loss": 0.8088388442993164, "step": 1103 }, { "epoch": 2.68, "grad_norm": 0.6921501252910377, "kl_div": 0.01970837637782097, "kl_div_sft": 0.01970837637782097, "learning_rate": 2.169657422512235e-07, "loss": -0.0667, "sft_loss": 0.04228641837835312, "step": 1104 }, { "epoch": 2.68, "grad_norm": 0.665915428522638, "importance_ratio": 1.0390625, "kl_div": 0.00600157305598259, "kl_div_pos": 0.036812517791986465, "kl_div_sft": -0.024809371680021286, "learning_rate": 2.1533442088091353e-07, "loss": -0.0151, "ppo_loss": -1.0374984741210938, "sft_loss": 0.1946043074131012, "step": 1105 }, { "epoch": 2.68, "grad_norm": 1.555084484540579, "importance_ratio": 0.66015625, "kl_div": -0.21379652619361877, "kl_div_neg": -0.4146406650543213, "kl_div_sft": -0.012952383607625961, "learning_rate": 2.137030995106036e-07, "loss": -0.205, "ppo_loss": 0.800000011920929, "sft_loss": 0.11993511021137238, "step": 1106 }, { "epoch": 2.68, "grad_norm": 0.9577427301930038, "importance_ratio": 0.8984375, "kl_div": -0.1193065196275711, "kl_div_neg": -0.26879850029945374, "kl_div_pos": 0.03018546849489212, "learning_rate": 2.1207177814029363e-07, "loss": -0.1508, "ppo_loss": -0.11532279849052429, "step": 1107 }, { "epoch": 2.69, "grad_norm": 0.9782622841235084, "kl_div": 0.007449622265994549, "kl_div_sft": 0.007449622265994549, "learning_rate": 2.1044045676998366e-07, "loss": -0.0685, "sft_loss": 0.06773808598518372, "step": 1108 }, { "epoch": 2.69, "grad_norm": 0.9088608195909383, "importance_ratio": 0.75390625, "kl_div": -0.28366923332214355, "kl_div_neg": -0.28366923332214355, "learning_rate": 2.0880913539967373e-07, "loss": -0.0054, "ppo_loss": 0.800000011920929, "step": 1109 }, { "epoch": 2.69, "grad_norm": 1.9121598732877862, "importance_ratio": 0.8984375, "kl_div": -0.12701398134231567, "kl_div_neg": -0.345787912607193, "kl_div_pos": 0.09175995737314224, "learning_rate": 2.0717781402936376e-07, "loss": -0.1056, "ppo_loss": -0.14805081486701965, "step": 1110 }, { "epoch": 2.69, "grad_norm": 0.9042432204739338, "importance_ratio": 1.0625, "kl_div": 0.05639778822660446, "kl_div_pos": 0.05639778822660446, "learning_rate": 2.0554649265905383e-07, "loss": -0.0823, "ppo_loss": -1.0584442615509033, "step": 1111 }, { "epoch": 2.7, "grad_norm": 0.8756164014504375, "importance_ratio": 0.765625, "kl_div": -0.12315031886100769, "kl_div_neg": -0.26553070545196533, "kl_div_sft": 0.0192300695925951, "learning_rate": 2.039151712887439e-07, "loss": -0.1084, "ppo_loss": 0.800000011920929, "sft_loss": 0.049898453056812286, "step": 1112 }, { "epoch": 2.7, "grad_norm": 1.6528490524880572, "importance_ratio": 1.0234375, "kl_div": 0.02359095588326454, "kl_div_pos": 0.02359095588326454, "learning_rate": 2.0228384991843393e-07, "loss": -0.0684, "ppo_loss": -1.0239702463150024, "step": 1113 }, { "epoch": 2.7, "grad_norm": 0.8415497596362785, "kl_div": -0.02142334170639515, "kl_div_sft": -0.02142334170639515, "learning_rate": 2.0065252854812397e-07, "loss": -0.1327, "sft_loss": 0.08445389568805695, "step": 1114 }, { "epoch": 2.7, "grad_norm": 1.3523021724662885, "importance_ratio": 1.0625, "kl_div": 0.03919368237257004, "kl_div_pos": 0.062300436198711395, "kl_div_sft": 0.016086924821138382, "learning_rate": 1.99021207177814e-07, "loss": -0.0498, "ppo_loss": -1.0642820596694946, "sft_loss": 0.03224463015794754, "step": 1115 }, { "epoch": 2.71, "grad_norm": 1.5138710333950391, "importance_ratio": 1.09375, "kl_div": 0.04405112564563751, "kl_div_pos": 0.08768067508935928, "kl_div_sft": 0.00042157687130384147, "learning_rate": 1.9738988580750407e-07, "loss": 0.0862, "ppo_loss": -1.091639518737793, "sft_loss": 0.12167085707187653, "step": 1116 }, { "epoch": 2.71, "grad_norm": 0.9421302575583053, "importance_ratio": 0.90625, "kl_div": -0.11986620724201202, "kl_div_neg": -0.3113791048526764, "kl_div_pos": 0.07164669781923294, "learning_rate": 1.9575856443719413e-07, "loss": 0.0002, "ppo_loss": -0.13713786005973816, "step": 1117 }, { "epoch": 2.71, "grad_norm": 1.4199391695881942, "kl_div": 0.003954947926104069, "kl_div_sft": 0.003954947926104069, "learning_rate": 1.9412724306688417e-07, "loss": -0.0118, "sft_loss": 0.044476695358753204, "step": 1118 }, { "epoch": 2.71, "grad_norm": 0.6698292481917263, "kl_div": 0.01963566057384014, "kl_div_sft": 0.01963566057384014, "learning_rate": 1.9249592169657423e-07, "loss": -0.0074, "sft_loss": 0.04536540061235428, "step": 1119 }, { "epoch": 2.72, "grad_norm": 0.6886097880573838, "kl_div": -0.004337035119533539, "kl_div_sft": -0.004337035119533539, "learning_rate": 1.9086460032626424e-07, "loss": 0.0435, "sft_loss": 0.08166567981243134, "step": 1120 }, { "epoch": 2.72, "grad_norm": 1.5491112622221588, "importance_ratio": 0.796875, "kl_div": -0.227139413356781, "kl_div_neg": -0.227139413356781, "learning_rate": 1.892332789559543e-07, "loss": -0.1024, "ppo_loss": 0.8080419898033142, "step": 1121 }, { "epoch": 2.72, "grad_norm": 0.9438899662739387, "importance_ratio": 1.0625, "kl_div": 0.05969712883234024, "kl_div_pos": 0.05969712883234024, "learning_rate": 1.8760195758564437e-07, "loss": -0.0487, "ppo_loss": -1.0615639686584473, "step": 1122 }, { "epoch": 2.72, "grad_norm": 1.2169227596211318, "importance_ratio": 0.5390625, "kl_div": -0.30277031660079956, "kl_div_neg": -0.6170855760574341, "kl_div_sft": 0.011544971726834774, "learning_rate": 1.859706362153344e-07, "loss": 0.0772, "ppo_loss": 0.800000011920929, "sft_loss": 0.040595829486846924, "step": 1123 }, { "epoch": 2.72, "grad_norm": 1.4201241917097076, "importance_ratio": 1.046875, "kl_div": 0.03995908051729202, "kl_div_pos": 0.03995908051729202, "learning_rate": 1.8433931484502447e-07, "loss": -0.0486, "ppo_loss": -1.0409839153289795, "step": 1124 }, { "epoch": 2.73, "grad_norm": 0.9788433447473673, "kl_div": -0.01730550080537796, "kl_div_sft": -0.01730550080537796, "learning_rate": 1.8270799347471453e-07, "loss": -0.0301, "sft_loss": 0.08677445352077484, "step": 1125 }, { "epoch": 2.73, "grad_norm": 0.5361937330199169, "kl_div": -0.005295942537486553, "kl_div_sft": -0.005295942537486553, "learning_rate": 1.8107667210440454e-07, "loss": 0.0529, "sft_loss": 0.06039096787571907, "step": 1126 }, { "epoch": 2.73, "grad_norm": 0.724901659642847, "importance_ratio": 0.703125, "kl_div": -0.35365772247314453, "kl_div_neg": -0.35365772247314453, "learning_rate": 1.794453507340946e-07, "loss": -0.0044, "ppo_loss": 0.800000011920929, "step": 1127 }, { "epoch": 2.73, "grad_norm": 1.3223456805067588, "importance_ratio": 1.0234375, "kl_div": 0.022419404238462448, "kl_div_pos": 0.026006117463111877, "kl_div_sft": 0.01883268915116787, "learning_rate": 1.7781402936378464e-07, "loss": -0.0079, "ppo_loss": -1.026347279548645, "sft_loss": 0.11715763062238693, "step": 1128 }, { "epoch": 2.74, "grad_norm": 1.3684236006674422, "importance_ratio": 1.046875, "kl_div": 0.04095806926488876, "kl_div_pos": 0.04559039697051048, "kl_div_sft": 0.03632574528455734, "learning_rate": 1.761827079934747e-07, "loss": -0.2562, "ppo_loss": -1.0466456413269043, "sft_loss": 0.033348001539707184, "step": 1129 }, { "epoch": 2.74, "grad_norm": 2.0941610359857985, "importance_ratio": 0.796875, "kl_div": -0.10481736809015274, "kl_div_neg": -0.22943493723869324, "kl_div_sft": 0.019800204783678055, "learning_rate": 1.7455138662316477e-07, "loss": -0.0176, "ppo_loss": 0.800000011920929, "sft_loss": 0.07383913546800613, "step": 1130 }, { "epoch": 2.74, "grad_norm": 1.869183610603224, "importance_ratio": 0.82421875, "kl_div": -0.08802182972431183, "kl_div_neg": -0.19563773274421692, "kl_div_sft": 0.019594065845012665, "learning_rate": 1.729200652528548e-07, "loss": -0.1363, "ppo_loss": 0.8223100900650024, "sft_loss": 0.026904089376330376, "step": 1131 }, { "epoch": 2.74, "grad_norm": 1.0702346996237544, "kl_div": 0.005095012951642275, "kl_div_sft": 0.005095012951642275, "learning_rate": 1.7128874388254484e-07, "loss": 0.0549, "sft_loss": 0.055672984570264816, "step": 1132 }, { "epoch": 2.75, "grad_norm": 0.6563738978457855, "importance_ratio": 1.0234375, "kl_div": 0.01523815467953682, "kl_div_pos": 0.02387329190969467, "kl_div_sft": 0.006603018380701542, "learning_rate": 1.696574225122349e-07, "loss": 0.0573, "ppo_loss": -1.0241605043411255, "sft_loss": 0.07252994179725647, "step": 1133 }, { "epoch": 2.75, "grad_norm": 0.9554710905930678, "importance_ratio": 1.0625, "kl_div": 0.04103770852088928, "kl_div_pos": 0.05817992612719536, "kl_div_sft": 0.023895489051938057, "learning_rate": 1.6802610114192494e-07, "loss": -0.1059, "ppo_loss": -1.059905767440796, "sft_loss": 0.02852977253496647, "step": 1134 }, { "epoch": 2.75, "grad_norm": 1.332576114491777, "importance_ratio": 0.76171875, "kl_div": -0.27210551500320435, "kl_div_neg": -0.27210551500320435, "learning_rate": 1.66394779771615e-07, "loss": 0.0088, "ppo_loss": 0.800000011920929, "step": 1135 }, { "epoch": 2.75, "grad_norm": 1.3744154467839322, "importance_ratio": 0.890625, "kl_div": -0.12652842700481415, "kl_div_neg": -0.26943251490592957, "kl_div_pos": 0.01637565903365612, "learning_rate": 1.6476345840130504e-07, "loss": 0.0322, "ppo_loss": -0.10825523734092712, "step": 1136 }, { "epoch": 2.76, "grad_norm": 0.6300766284449495, "kl_div": 0.012872161343693733, "kl_div_sft": 0.012872161343693733, "learning_rate": 1.631321370309951e-07, "loss": 0.009, "sft_loss": 0.09325136244297028, "step": 1137 }, { "epoch": 2.76, "grad_norm": 0.7215131224154683, "importance_ratio": 1.0390625, "kl_div": 0.03919503092765808, "kl_div_pos": 0.03919503092765808, "learning_rate": 1.6150081566068514e-07, "loss": -0.1359, "ppo_loss": -1.0400731563568115, "step": 1138 }, { "epoch": 2.76, "grad_norm": 0.5619121234583224, "importance_ratio": 0.6328125, "kl_div": -0.46213510632514954, "kl_div_neg": -0.46213510632514954, "learning_rate": 1.5986949429037518e-07, "loss": -0.0543, "ppo_loss": 0.800000011920929, "step": 1139 }, { "epoch": 2.76, "grad_norm": 0.734594961461299, "importance_ratio": 0.44140625, "kl_div": -0.40974512696266174, "kl_div_neg": -0.8175477385520935, "kl_div_sft": -0.001942517701536417, "learning_rate": 1.5823817292006525e-07, "loss": -0.035, "ppo_loss": 0.800000011920929, "sft_loss": 0.06181861087679863, "step": 1140 }, { "epoch": 2.77, "grad_norm": 0.7942525309137227, "importance_ratio": 0.76953125, "kl_div": -0.12769439816474915, "kl_div_neg": -0.2644917666912079, "kl_div_sft": 0.009102970361709595, "learning_rate": 1.5660685154975528e-07, "loss": -0.0329, "ppo_loss": 0.800000011920929, "sft_loss": 0.03120669350028038, "step": 1141 }, { "epoch": 2.77, "grad_norm": 0.9624103950306931, "importance_ratio": 0.73828125, "kl_div": -0.14747409522533417, "kl_div_neg": -0.3031984865665436, "kl_div_sft": 0.008250309154391289, "learning_rate": 1.5497553017944535e-07, "loss": 0.0108, "ppo_loss": 0.800000011920929, "sft_loss": 0.05386658012866974, "step": 1142 }, { "epoch": 2.77, "grad_norm": 1.5596566077849165, "kl_div": 0.01757156103849411, "kl_div_sft": 0.01757156103849411, "learning_rate": 1.533442088091354e-07, "loss": 0.0472, "sft_loss": 0.0313275121152401, "step": 1143 }, { "epoch": 2.77, "grad_norm": 1.224527943411857, "kl_div": -0.005035087466239929, "kl_div_sft": -0.005035087466239929, "learning_rate": 1.5171288743882542e-07, "loss": 0.0123, "sft_loss": 0.08055819571018219, "step": 1144 }, { "epoch": 2.78, "grad_norm": 0.7722177617045625, "kl_div": 0.0151774100959301, "kl_div_sft": 0.0151774100959301, "learning_rate": 1.5008156606851548e-07, "loss": -0.0686, "sft_loss": 0.038552433252334595, "step": 1145 }, { "epoch": 2.78, "grad_norm": 0.8718422381080544, "importance_ratio": 0.73046875, "kl_div": -0.15404218435287476, "kl_div_neg": -0.31161925196647644, "kl_div_sft": 0.003534871619194746, "learning_rate": 1.4845024469820555e-07, "loss": 0.0104, "ppo_loss": 0.800000011920929, "sft_loss": 0.07242981344461441, "step": 1146 }, { "epoch": 2.78, "grad_norm": 2.0345420569579686, "importance_ratio": 1.03125, "kl_div": 0.031925879418849945, "kl_div_pos": 0.031925879418849945, "learning_rate": 1.4681892332789558e-07, "loss": 0.0601, "ppo_loss": -1.0334405899047852, "step": 1147 }, { "epoch": 2.78, "grad_norm": 1.0711314833969363, "importance_ratio": 1.046875, "kl_div": 0.04982160031795502, "kl_div_pos": 0.04982160031795502, "learning_rate": 1.4518760195758565e-07, "loss": -0.119, "ppo_loss": -1.051084041595459, "step": 1148 }, { "epoch": 2.79, "grad_norm": 1.0689223234219718, "importance_ratio": 0.703125, "kl_div": -0.16888612508773804, "kl_div_neg": -0.35433754324913025, "kl_div_sft": 0.01656527817249298, "learning_rate": 1.4355628058727568e-07, "loss": -0.0845, "ppo_loss": 0.800000011920929, "sft_loss": 0.03223303705453873, "step": 1149 }, { "epoch": 2.79, "grad_norm": 1.0808209973888399, "kl_div": -0.01934581622481346, "kl_div_sft": -0.01934581622481346, "learning_rate": 1.4192495921696572e-07, "loss": -0.2052, "sft_loss": 0.07561028003692627, "step": 1150 }, { "epoch": 2.79, "grad_norm": 1.2129563269273218, "kl_div": 0.01769608072936535, "kl_div_sft": 0.01769608072936535, "learning_rate": 1.4029363784665578e-07, "loss": -0.0735, "sft_loss": 0.04179126024246216, "step": 1151 }, { "epoch": 2.79, "grad_norm": 0.6347059955465805, "kl_div": 0.027414832264184952, "kl_div_sft": 0.027414832264184952, "learning_rate": 1.3866231647634582e-07, "loss": 0.0408, "sft_loss": 0.03668251633644104, "step": 1152 }, { "epoch": 2.8, "grad_norm": 0.7479490609622703, "importance_ratio": 0.82421875, "kl_div": -0.11562100797891617, "kl_div_neg": -0.1932021677494049, "kl_div_sft": -0.03803984820842743, "learning_rate": 1.3703099510603589e-07, "loss": -0.0478, "ppo_loss": 0.8243153095245361, "sft_loss": 0.0878666415810585, "step": 1153 }, { "epoch": 2.8, "grad_norm": 0.8725099213809392, "importance_ratio": 0.72265625, "kl_div": -0.15855799615383148, "kl_div_neg": -0.32498425245285034, "kl_div_sft": 0.00786825455725193, "learning_rate": 1.3539967373572595e-07, "loss": -0.0438, "ppo_loss": 0.800000011920929, "sft_loss": 0.044809021055698395, "step": 1154 }, { "epoch": 2.8, "grad_norm": 0.8801756783170777, "importance_ratio": 1.0390625, "kl_div": 0.009981222450733185, "kl_div_pos": 0.040560025721788406, "kl_div_sft": -0.020597580820322037, "learning_rate": 1.3376835236541599e-07, "loss": -0.0567, "ppo_loss": -1.0413938760757446, "sft_loss": 0.06672003120183945, "step": 1155 }, { "epoch": 2.8, "grad_norm": 1.125348355083032, "importance_ratio": 0.703125, "kl_div": -0.16469106078147888, "kl_div_neg": -0.35337555408477783, "kl_div_sft": 0.02399342507123947, "learning_rate": 1.3213703099510602e-07, "loss": 0.0195, "ppo_loss": 0.800000011920929, "sft_loss": 0.022216780111193657, "step": 1156 }, { "epoch": 2.8, "grad_norm": 0.9578812541834655, "importance_ratio": 0.578125, "kl_div": -0.27082908153533936, "kl_div_neg": -0.5504145622253418, "kl_div_sft": 0.008756392635405064, "learning_rate": 1.3050570962479606e-07, "loss": -0.1345, "ppo_loss": 0.800000011920929, "sft_loss": 0.022599216550588608, "step": 1157 }, { "epoch": 2.81, "grad_norm": 2.131705419482927, "importance_ratio": 1.0859375, "kl_div": 0.04747091233730316, "kl_div_pos": 0.08328525722026825, "kl_div_sft": 0.011656570248305798, "learning_rate": 1.2887438825448612e-07, "loss": -0.0991, "ppo_loss": -1.0868518352508545, "sft_loss": 0.04350946098566055, "step": 1158 }, { "epoch": 2.81, "grad_norm": 0.6816832083705758, "kl_div": 0.005396461579948664, "kl_div_sft": 0.005396461579948664, "learning_rate": 1.272430668841762e-07, "loss": -0.101, "sft_loss": 0.049411237239837646, "step": 1159 }, { "epoch": 2.81, "grad_norm": 0.830705401201232, "importance_ratio": 0.91015625, "kl_div": -0.060057319700717926, "kl_div_pos": -0.09451054036617279, "kl_div_sft": -0.02560410276055336, "learning_rate": 1.2561174551386622e-07, "loss": -0.0946, "ppo_loss": -0.909818172454834, "sft_loss": 0.13856370747089386, "step": 1160 }, { "epoch": 2.81, "grad_norm": 0.9586862384229029, "importance_ratio": 0.8671875, "kl_div": -0.14406529068946838, "kl_div_neg": -0.2500915229320526, "kl_div_pos": -0.03803904354572296, "learning_rate": 1.239804241435563e-07, "loss": 0.0083, "ppo_loss": -0.08133766055107117, "step": 1161 }, { "epoch": 2.82, "grad_norm": 1.770973947963446, "importance_ratio": 1.015625, "kl_div": 0.02230958268046379, "kl_div_pos": 0.012497110292315483, "kl_div_sft": 0.03212205320596695, "learning_rate": 1.2234910277324632e-07, "loss": -0.0917, "ppo_loss": -1.0125755071640015, "sft_loss": 0.05473034456372261, "step": 1162 }, { "epoch": 2.82, "grad_norm": 1.893070562366472, "kl_div": 0.00027730176225304604, "kl_div_sft": 0.00027730176225304604, "learning_rate": 1.2071778140293636e-07, "loss": 0.0764, "sft_loss": 0.06012003496289253, "step": 1163 }, { "epoch": 2.82, "grad_norm": 1.9890327757047541, "importance_ratio": 1.0703125, "kl_div": 0.039140187203884125, "kl_div_pos": 0.06697124987840652, "kl_div_sft": 0.011309120804071426, "learning_rate": 1.1908646003262641e-07, "loss": 0.0611, "ppo_loss": -1.0692646503448486, "sft_loss": 0.051590412855148315, "step": 1164 }, { "epoch": 2.82, "grad_norm": 1.3363702084433386, "importance_ratio": 0.66796875, "kl_div": -0.23694896697998047, "kl_div_neg": -0.40421345829963684, "kl_div_sft": -0.0696844831109047, "learning_rate": 1.1745513866231648e-07, "loss": 0.0501, "ppo_loss": 0.800000011920929, "sft_loss": 0.17454583942890167, "step": 1165 }, { "epoch": 2.83, "grad_norm": 1.1762943605658298, "importance_ratio": 0.78515625, "kl_div": -0.1164015606045723, "kl_div_neg": -0.23999521136283875, "kl_div_sft": 0.007192092947661877, "learning_rate": 1.1582381729200653e-07, "loss": -0.1663, "ppo_loss": 0.800000011920929, "sft_loss": 0.07211599498987198, "step": 1166 }, { "epoch": 2.83, "grad_norm": 1.150266682759822, "kl_div": -0.01800680160522461, "kl_div_sft": -0.01800680160522461, "learning_rate": 1.1419249592169656e-07, "loss": -0.1317, "sft_loss": 0.0823834165930748, "step": 1167 }, { "epoch": 2.83, "grad_norm": 0.8969832883317888, "importance_ratio": 0.875, "kl_div": -0.1475740224123001, "kl_div_neg": -0.3291796147823334, "kl_div_pos": 0.034031569957733154, "learning_rate": 1.1256117455138661e-07, "loss": 0.0277, "ppo_loss": -0.1173085868358612, "step": 1168 }, { "epoch": 2.83, "grad_norm": 1.3658282689545083, "importance_ratio": 1.03125, "kl_div": 0.017019763588905334, "kl_div_pos": 0.028323249891400337, "kl_div_sft": 0.005716277752071619, "learning_rate": 1.1092985318107668e-07, "loss": -0.0856, "ppo_loss": -1.0287281274795532, "sft_loss": 0.05883674696087837, "step": 1169 }, { "epoch": 2.84, "grad_norm": 1.0474936521970561, "importance_ratio": 0.875, "kl_div": -0.15186944603919983, "kl_div_neg": -0.3259763717651367, "kl_div_pos": 0.022237488999962807, "learning_rate": 1.0929853181076671e-07, "loss": -0.0053, "ppo_loss": -0.11124327778816223, "step": 1170 }, { "epoch": 2.84, "grad_norm": 1.2800057114533314, "importance_ratio": 0.87109375, "kl_div": -0.15982398390769958, "kl_div_neg": -0.36433860659599304, "kl_div_pos": 0.04469062760472298, "learning_rate": 1.0766721044045676e-07, "loss": -0.0283, "ppo_loss": -0.12285217642784119, "step": 1171 }, { "epoch": 2.84, "grad_norm": 0.7458908444258102, "importance_ratio": 1.0390625, "kl_div": 0.028081998229026794, "kl_div_pos": 0.04125192388892174, "kl_div_sft": 0.014912070706486702, "learning_rate": 1.0603588907014681e-07, "loss": -0.1004, "ppo_loss": -1.0421146154403687, "sft_loss": 0.02104533091187477, "step": 1172 }, { "epoch": 2.84, "grad_norm": 1.7487067484875245, "importance_ratio": 0.96484375, "kl_div": -0.03644672781229019, "kl_div_pos": -0.03644672781229019, "learning_rate": 1.0440456769983686e-07, "loss": 0.0135, "ppo_loss": -0.9646996259689331, "step": 1173 }, { "epoch": 2.85, "grad_norm": 1.228644092920269, "importance_ratio": 0.6640625, "kl_div": -0.19775894284248352, "kl_div_neg": -0.4072263538837433, "kl_div_sft": 0.011708474718034267, "learning_rate": 1.0277324632952691e-07, "loss": -0.0193, "ppo_loss": 0.800000011920929, "sft_loss": 0.029304752126336098, "step": 1174 }, { "epoch": 2.85, "grad_norm": 0.9653485788288345, "kl_div": 0.010237861424684525, "kl_div_sft": 0.010237861424684525, "learning_rate": 1.0114192495921696e-07, "loss": -0.0696, "sft_loss": 0.05024181306362152, "step": 1175 }, { "epoch": 2.85, "grad_norm": 0.709317341952596, "kl_div": 0.011810576543211937, "kl_div_sft": 0.011810576543211937, "learning_rate": 9.9510603588907e-08, "loss": 0.0471, "sft_loss": 0.04906405881047249, "step": 1176 }, { "epoch": 2.85, "grad_norm": 1.3330104506470002, "importance_ratio": 0.8671875, "kl_div": -0.16925135254859924, "kl_div_neg": -0.38317427039146423, "kl_div_pos": 0.04467155039310455, "learning_rate": 9.787928221859706e-08, "loss": -0.1224, "ppo_loss": -0.12284216284751892, "step": 1177 }, { "epoch": 2.86, "grad_norm": 0.8009218418169928, "importance_ratio": 0.67578125, "kl_div": -0.1921180933713913, "kl_div_neg": -0.392837256193161, "kl_div_sft": 0.008601064793765545, "learning_rate": 9.624796084828712e-08, "loss": -0.1025, "ppo_loss": 0.800000011920929, "sft_loss": 0.009927182458341122, "step": 1178 }, { "epoch": 2.86, "grad_norm": 1.1503104164127465, "importance_ratio": 1.0234375, "kl_div": 0.020346002653241158, "kl_div_pos": 0.024633346125483513, "kl_div_sft": 0.016058659180998802, "learning_rate": 9.461663947797715e-08, "loss": -0.1081, "ppo_loss": -1.0249391794204712, "sft_loss": 0.03225059062242508, "step": 1179 }, { "epoch": 2.86, "grad_norm": 0.639340648418175, "kl_div": 0.011185074225068092, "kl_div_sft": 0.011185074225068092, "learning_rate": 9.29853181076672e-08, "loss": 0.0115, "sft_loss": 0.06356431543827057, "step": 1180 }, { "epoch": 2.86, "grad_norm": 1.3975395004613398, "importance_ratio": 0.8203125, "kl_div": -0.08465030789375305, "kl_div_neg": -0.2001633197069168, "kl_div_sft": 0.030862705782055855, "learning_rate": 9.135399673735727e-08, "loss": -0.1217, "ppo_loss": 0.8185970187187195, "sft_loss": 0.060302335768938065, "step": 1181 }, { "epoch": 2.87, "grad_norm": 0.7561947573827391, "importance_ratio": 0.9921875, "kl_div": -0.007614566013216972, "kl_div_pos": -0.007614566013216972, "learning_rate": 8.97226753670473e-08, "loss": -0.0295, "ppo_loss": -0.9930227994918823, "step": 1182 }, { "epoch": 2.87, "grad_norm": 1.567095217360077, "importance_ratio": 0.7421875, "kl_div": -0.13958537578582764, "kl_div_neg": -0.30015498399734497, "kl_div_sft": 0.020984219387173653, "learning_rate": 8.809135399673735e-08, "loss": -0.0063, "ppo_loss": 0.800000011920929, "sft_loss": 0.04172850027680397, "step": 1183 }, { "epoch": 2.87, "grad_norm": 0.8954740661452641, "importance_ratio": 0.73828125, "kl_div": -0.14187641441822052, "kl_div_neg": -0.3050064742565155, "kl_div_sft": 0.021253643557429314, "learning_rate": 8.64600326264274e-08, "loss": -0.1237, "ppo_loss": 0.800000011920929, "sft_loss": 0.05894254520535469, "step": 1184 }, { "epoch": 2.87, "grad_norm": 1.0250840082865402, "importance_ratio": 1.03125, "kl_div": 0.01920795626938343, "kl_div_pos": 0.030632779002189636, "kl_div_sft": 0.007783134467899799, "learning_rate": 8.482871125611745e-08, "loss": -0.0527, "ppo_loss": -1.0311068296432495, "sft_loss": 0.08948075771331787, "step": 1185 }, { "epoch": 2.88, "grad_norm": 0.9286371917807487, "importance_ratio": 0.8359375, "kl_div": -0.2051847279071808, "kl_div_neg": -0.43950650095939636, "kl_div_pos": 0.029137054458260536, "learning_rate": 8.31973898858075e-08, "loss": -0.0859, "ppo_loss": -0.11478284001350403, "step": 1186 }, { "epoch": 2.88, "grad_norm": 0.9022022483156672, "importance_ratio": 0.76953125, "kl_div": -0.13229966163635254, "kl_div_neg": -0.2619016170501709, "kl_div_sft": -0.0026977057568728924, "learning_rate": 8.156606851549755e-08, "loss": 0.0617, "ppo_loss": 0.800000011920929, "sft_loss": 0.0610504224896431, "step": 1187 }, { "epoch": 2.88, "grad_norm": 1.2345106798762973, "importance_ratio": 0.7421875, "kl_div": -0.2970220446586609, "kl_div_neg": -0.2970220446586609, "learning_rate": 7.993474714518759e-08, "loss": -0.019, "ppo_loss": 0.800000011920929, "step": 1188 }, { "epoch": 2.88, "grad_norm": 0.6406334667111523, "importance_ratio": 1.0703125, "kl_div": 0.04250887781381607, "kl_div_pos": 0.06475761532783508, "kl_div_sft": 0.020260144025087357, "learning_rate": 7.830342577487764e-08, "loss": -0.1853, "ppo_loss": -1.066900372505188, "sft_loss": 0.05050653591752052, "step": 1189 }, { "epoch": 2.88, "grad_norm": 0.6454048940357756, "importance_ratio": 0.78125, "kl_div": -0.1214267909526825, "kl_div_neg": -0.24888797104358673, "kl_div_sft": 0.0060343933291733265, "learning_rate": 7.66721044045677e-08, "loss": 0.0587, "ppo_loss": 0.800000011920929, "sft_loss": 0.07454501837491989, "step": 1190 }, { "epoch": 2.89, "grad_norm": 0.6479468040162593, "importance_ratio": 1.03125, "kl_div": 0.018266797065734863, "kl_div_pos": 0.028083480894565582, "kl_div_sft": 0.008450115099549294, "learning_rate": 7.504078303425774e-08, "loss": 0.0932, "ppo_loss": -1.0284816026687622, "sft_loss": 0.033145107328891754, "step": 1191 }, { "epoch": 2.89, "grad_norm": 0.9432509460687126, "importance_ratio": 1.046875, "kl_div": 0.03311854973435402, "kl_div_pos": 0.04216361045837402, "kl_div_sft": 0.024073489010334015, "learning_rate": 7.340946166394779e-08, "loss": -0.0441, "ppo_loss": -1.0430651903152466, "sft_loss": 0.02739499695599079, "step": 1192 }, { "epoch": 2.89, "grad_norm": 1.4257255647837281, "importance_ratio": 0.8984375, "kl_div": -0.12912996113300323, "kl_div_neg": -0.341534823179245, "kl_div_pos": 0.08327490836381912, "learning_rate": 7.177814029363784e-08, "loss": 0.0506, "ppo_loss": -0.1434202492237091, "step": 1193 }, { "epoch": 2.89, "grad_norm": 0.9792687721313257, "importance_ratio": 0.859375, "kl_div": -0.16656380891799927, "kl_div_neg": -0.3459399938583374, "kl_div_pos": 0.012812378816306591, "learning_rate": 7.014681892332789e-08, "loss": -0.09, "ppo_loss": -0.10644736886024475, "step": 1194 }, { "epoch": 2.9, "grad_norm": 0.9705135229368407, "importance_ratio": 1.0390625, "kl_div": 0.0385335236787796, "kl_div_pos": 0.0385335236787796, "learning_rate": 6.851549755301794e-08, "loss": -0.0764, "ppo_loss": -1.0395358800888062, "step": 1195 }, { "epoch": 2.9, "grad_norm": 0.9032225849796234, "kl_div": 0.015084541402757168, "kl_div_sft": 0.015084541402757168, "learning_rate": 6.688417618270799e-08, "loss": -0.0185, "sft_loss": 0.08808805048465729, "step": 1196 }, { "epoch": 2.9, "grad_norm": 1.0286034812739275, "importance_ratio": 1.046875, "kl_div": 0.043941110372543335, "kl_div_pos": 0.047929711639881134, "kl_div_sft": 0.03995250537991524, "learning_rate": 6.525285481239803e-08, "loss": -0.1544, "ppo_loss": -1.049096941947937, "sft_loss": 0.08652502298355103, "step": 1197 }, { "epoch": 2.9, "grad_norm": 2.1520358936833315, "kl_div": -0.003481715451925993, "kl_div_sft": -0.003481715451925993, "learning_rate": 6.36215334420881e-08, "loss": -0.1079, "sft_loss": 0.06440573185682297, "step": 1198 }, { "epoch": 2.91, "grad_norm": 0.9205021738858639, "kl_div": 0.01033596321940422, "kl_div_sft": 0.01033596321940422, "learning_rate": 6.199021207177814e-08, "loss": -0.1552, "sft_loss": 0.06722992658615112, "step": 1199 }, { "epoch": 2.91, "grad_norm": 1.7470641714449784, "kl_div": 0.005851946771144867, "kl_div_sft": 0.005851946771144867, "learning_rate": 6.035889070146818e-08, "loss": -0.0059, "sft_loss": 0.033399973064661026, "step": 1200 }, { "epoch": 2.91, "grad_norm": 0.6236003462276024, "kl_div": 0.014642293564975262, "kl_div_sft": 0.014642293564975262, "learning_rate": 5.872756933115824e-08, "loss": 0.0587, "sft_loss": 0.07679164409637451, "step": 1201 }, { "epoch": 2.91, "grad_norm": 1.2955535222683352, "importance_ratio": 0.9375, "kl_div": -0.07583994418382645, "kl_div_neg": -0.19484470784664154, "kl_div_pos": 0.043164823204278946, "learning_rate": 5.709624796084828e-08, "loss": -0.0482, "ppo_loss": -0.11057373881340027, "step": 1202 }, { "epoch": 2.92, "grad_norm": 1.074403115837226, "kl_div": -0.00038597348611801863, "kl_div_sft": -0.00038597348611801863, "learning_rate": 5.546492659053834e-08, "loss": 0.0345, "sft_loss": 0.0903572291135788, "step": 1203 }, { "epoch": 2.92, "grad_norm": 0.7845828491455248, "kl_div": 0.019885778427124023, "kl_div_sft": 0.019885778427124023, "learning_rate": 5.383360522022838e-08, "loss": -0.0814, "sft_loss": 0.0418141670525074, "step": 1204 }, { "epoch": 2.92, "grad_norm": 2.3863401221896203, "importance_ratio": 1.0546875, "kl_div": 0.032779138535261154, "kl_div_pos": 0.051422979682683945, "kl_div_sft": 0.014135295525193214, "learning_rate": 5.220228384991843e-08, "loss": 0.0782, "ppo_loss": -1.0527681112289429, "sft_loss": 0.04892640933394432, "step": 1205 }, { "epoch": 2.92, "grad_norm": 0.771678504180194, "kl_div": 0.01562921702861786, "kl_div_sft": 0.01562921702861786, "learning_rate": 5.057096247960848e-08, "loss": -0.1562, "sft_loss": 0.02873784676194191, "step": 1206 }, { "epoch": 2.93, "grad_norm": 0.941453734660512, "kl_div": -0.0010949738789349794, "kl_div_sft": -0.0010949738789349794, "learning_rate": 4.893964110929853e-08, "loss": -0.0747, "sft_loss": 0.08132661879062653, "step": 1207 }, { "epoch": 2.93, "grad_norm": 1.0532282023135735, "importance_ratio": 0.8984375, "kl_div": -0.05000169575214386, "kl_div_pos": -0.10559673607349396, "kl_div_sft": 0.005593346897512674, "learning_rate": 4.7308319738988576e-08, "loss": -0.2808, "ppo_loss": -0.8997873663902283, "sft_loss": 0.11342725902795792, "step": 1208 }, { "epoch": 2.93, "grad_norm": 1.1130491315891495, "kl_div": 0.005920859519392252, "kl_div_sft": 0.005920859519392252, "learning_rate": 4.567699836867863e-08, "loss": -0.0926, "sft_loss": 0.04379911348223686, "step": 1209 }, { "epoch": 2.93, "grad_norm": 1.3022770343390342, "importance_ratio": 1.1015625, "kl_div": 0.04200161620974541, "kl_div_pos": 0.09724302589893341, "kl_div_sft": -0.013239794410765171, "learning_rate": 4.4045676998368676e-08, "loss": -0.088, "ppo_loss": -1.102128267288208, "sft_loss": 0.09195703268051147, "step": 1210 }, { "epoch": 2.94, "grad_norm": 1.2349028605509618, "importance_ratio": 0.76953125, "kl_div": -0.12613043189048767, "kl_div_neg": -0.2621628940105438, "kl_div_sft": 0.009902019053697586, "learning_rate": 4.241435562805873e-08, "loss": 0.0338, "ppo_loss": 0.800000011920929, "sft_loss": 0.13322798907756805, "step": 1211 }, { "epoch": 2.94, "grad_norm": 0.8190554478591701, "kl_div": 0.016479335725307465, "kl_div_sft": 0.016479335725307465, "learning_rate": 4.078303425774878e-08, "loss": -0.0813, "sft_loss": 0.06707711517810822, "step": 1212 }, { "epoch": 2.94, "grad_norm": 0.6853244344539269, "kl_div": 0.002904551802203059, "kl_div_sft": 0.002904551802203059, "learning_rate": 3.915171288743882e-08, "loss": 0.0193, "sft_loss": 0.07321298122406006, "step": 1213 }, { "epoch": 2.94, "grad_norm": 0.9767238025964826, "kl_div": 0.015007298439741135, "kl_div_sft": 0.015007298439741135, "learning_rate": 3.752039151712887e-08, "loss": -0.1253, "sft_loss": 0.04647599905729294, "step": 1214 }, { "epoch": 2.95, "grad_norm": 1.302545033296922, "importance_ratio": 1.0234375, "kl_div": 0.013774197548627853, "kl_div_pos": 0.02181210182607174, "kl_div_sft": 0.005736292339861393, "learning_rate": 3.588907014681892e-08, "loss": -0.11, "ppo_loss": -1.0220516920089722, "sft_loss": 0.03027566708624363, "step": 1215 }, { "epoch": 2.95, "grad_norm": 0.6969580733387959, "importance_ratio": 0.96875, "kl_div": -0.022010929882526398, "kl_div_pos": -0.031074170023202896, "kl_div_sft": -0.01294768787920475, "learning_rate": 3.425774877650897e-08, "loss": 0.0919, "ppo_loss": -0.9694036841392517, "sft_loss": 0.085136279463768, "step": 1216 }, { "epoch": 2.95, "grad_norm": 1.0893446350242888, "importance_ratio": 0.796875, "kl_div": -0.22950898110866547, "kl_div_neg": -0.22950898110866547, "learning_rate": 3.2626427406199015e-08, "loss": -0.124, "ppo_loss": 0.8365697264671326, "step": 1217 }, { "epoch": 2.95, "grad_norm": 1.0154642418031865, "kl_div": 0.017310921102762222, "kl_div_sft": 0.017310921102762222, "learning_rate": 3.099510603588907e-08, "loss": -0.0204, "sft_loss": 0.05979524925351143, "step": 1218 }, { "epoch": 2.96, "grad_norm": 1.3089672865018278, "importance_ratio": 1.0703125, "kl_div": 0.023883214220404625, "kl_div_pos": 0.06660286337137222, "kl_div_sft": -0.018836434930562973, "learning_rate": 2.936378466557912e-08, "loss": 0.0179, "ppo_loss": -1.0688709020614624, "sft_loss": 0.06836844235658646, "step": 1219 }, { "epoch": 2.96, "grad_norm": 0.7735948498894707, "importance_ratio": 1.0625, "kl_div": 0.04178933799266815, "kl_div_pos": 0.061134010553359985, "kl_div_sft": 0.02244466543197632, "learning_rate": 2.773246329526917e-08, "loss": 0.0265, "ppo_loss": -1.0630414485931396, "sft_loss": 0.029229391366243362, "step": 1220 }, { "epoch": 2.96, "grad_norm": 0.8287853990592418, "importance_ratio": 0.78125, "kl_div": -0.12434131652116776, "kl_div_neg": -0.24875983595848083, "kl_div_sft": 7.720127905486152e-05, "learning_rate": 2.6101141924959216e-08, "loss": -0.0963, "ppo_loss": 0.800000011920929, "sft_loss": 0.04770297557115555, "step": 1221 }, { "epoch": 2.96, "grad_norm": 0.6735727904476042, "importance_ratio": 1.109375, "kl_div": 0.06096731498837471, "kl_div_pos": 0.105230912566185, "kl_div_sft": 0.016703717410564423, "learning_rate": 2.4469820554649266e-08, "loss": -0.1107, "ppo_loss": -1.1109670400619507, "sft_loss": 0.02639000490307808, "step": 1222 }, { "epoch": 2.96, "grad_norm": 4.27384140644358, "importance_ratio": 1.078125, "kl_div": 0.047324467450380325, "kl_div_pos": 0.07398180663585663, "kl_div_sft": 0.02066713012754917, "learning_rate": 2.2838499184339316e-08, "loss": -0.1533, "ppo_loss": -1.0767872333526611, "sft_loss": 0.015478396788239479, "step": 1223 }, { "epoch": 2.97, "grad_norm": 0.7995789597116523, "importance_ratio": 1.0078125, "kl_div": 0.01401783712208271, "kl_div_pos": 0.010602614842355251, "kl_div_sft": 0.017433058470487595, "learning_rate": 2.1207177814029363e-08, "loss": -0.1075, "ppo_loss": -1.010659098625183, "sft_loss": 0.024448391050100327, "step": 1224 }, { "epoch": 2.97, "grad_norm": 0.6576333678713244, "kl_div": 0.009036296978592873, "kl_div_sft": 0.009036296978592873, "learning_rate": 1.957585644371941e-08, "loss": -0.0222, "sft_loss": 0.024351729080080986, "step": 1225 }, { "epoch": 2.97, "grad_norm": 1.0430261831864822, "importance_ratio": 0.6328125, "kl_div": -0.4564090967178345, "kl_div_neg": -0.4564090967178345, "learning_rate": 1.794453507340946e-08, "loss": -0.0877, "ppo_loss": 0.800000011920929, "step": 1226 }, { "epoch": 2.97, "grad_norm": 1.187129816279101, "importance_ratio": 0.7734375, "kl_div": -0.18304017186164856, "kl_div_neg": -0.25575414299964905, "kl_div_sft": -0.11032620817422867, "learning_rate": 1.6313213703099507e-08, "loss": -0.145, "ppo_loss": 0.800000011920929, "sft_loss": 0.17107929289340973, "step": 1227 }, { "epoch": 2.98, "grad_norm": 0.6751320868637353, "importance_ratio": 1.0625, "kl_div": 0.03507830947637558, "kl_div_pos": 0.05915962904691696, "kl_div_sft": 0.0109969861805439, "learning_rate": 1.468189233278956e-08, "loss": -0.1551, "ppo_loss": -1.0609445571899414, "sft_loss": 0.011611356399953365, "step": 1228 }, { "epoch": 2.98, "grad_norm": 0.8545330626384282, "importance_ratio": 1.0859375, "kl_div": 0.04665626212954521, "kl_div_pos": 0.08197905123233795, "kl_div_sft": 0.011333473958075047, "learning_rate": 1.3050570962479608e-08, "loss": -0.301, "ppo_loss": -1.0854331254959106, "sft_loss": 0.021651491522789, "step": 1229 }, { "epoch": 2.98, "grad_norm": 2.1311360572046136, "importance_ratio": 0.8125, "kl_div": -0.25613871216773987, "kl_div_neg": -0.563224196434021, "kl_div_pos": 0.05094676464796066, "learning_rate": 1.1419249592169658e-08, "loss": -0.1842, "ppo_loss": -0.12613347172737122, "step": 1230 }, { "epoch": 2.98, "grad_norm": 0.6622900252208386, "importance_ratio": 0.3671875, "kl_div": -0.49855610728263855, "kl_div_neg": -1.0007708072662354, "kl_div_sft": 0.0036585640627890825, "learning_rate": 9.787928221859705e-09, "loss": -0.0537, "ppo_loss": 0.800000011920929, "sft_loss": 0.06895247101783752, "step": 1231 }, { "epoch": 2.99, "grad_norm": 1.181454326981826, "importance_ratio": 1.109375, "kl_div": 0.06376396119594574, "kl_div_pos": 0.10577323287725449, "kl_div_sft": 0.021754683926701546, "learning_rate": 8.156606851549754e-09, "loss": -0.0258, "ppo_loss": -1.1115697622299194, "sft_loss": 0.01212720200419426, "step": 1232 }, { "epoch": 2.99, "grad_norm": 1.5721435499483776, "importance_ratio": 0.640625, "kl_div": -0.4436643123626709, "kl_div_neg": -0.4436643123626709, "learning_rate": 6.525285481239804e-09, "loss": -0.0093, "ppo_loss": 0.800000011920929, "step": 1233 }, { "epoch": 2.99, "grad_norm": 0.7975132947668616, "importance_ratio": 0.671875, "kl_div": -0.19387556612491608, "kl_div_neg": -0.4003700315952301, "kl_div_sft": 0.012618891894817352, "learning_rate": 4.8939641109298526e-09, "loss": -0.0591, "ppo_loss": 0.800000011920929, "sft_loss": 0.04209718108177185, "step": 1234 }, { "epoch": 2.99, "grad_norm": 0.5783795492283123, "importance_ratio": 0.74609375, "kl_div": -0.13893654942512512, "kl_div_neg": -0.2953203022480011, "kl_div_sft": 0.01744719222187996, "learning_rate": 3.262642740619902e-09, "loss": 0.0643, "ppo_loss": 0.800000011920929, "sft_loss": 0.006301300600171089, "step": 1235 }, { "epoch": 3.0, "grad_norm": 0.843983582553917, "importance_ratio": 0.80078125, "kl_div": -0.10064101964235306, "kl_div_neg": -0.22288663685321808, "kl_div_sft": 0.021604593843221664, "learning_rate": 1.631321370309951e-09, "loss": -0.1822, "ppo_loss": 0.8002055287361145, "sft_loss": 0.057481780648231506, "step": 1236 } ], "logging_steps": 1.0, "max_steps": 1236, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100.0, "total_flos": 120210122113024.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }