{
"best_global_step": 500,
"best_metric": 1.0007914304733276,
"best_model_checkpoint": "./medgemma-finetuned-checkpoints/checkpoint-500",
"epoch": 1.0810810810810811,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005405405405405405,
"grad_norm": 26.92987632751465,
"learning_rate": 0.0,
"loss": 3.0682,
"step": 1
},
{
"epoch": 0.005405405405405406,
"grad_norm": 5.15996789932251,
"learning_rate": 1.8e-05,
"loss": 2.5851,
"step": 10
},
{
"epoch": 0.010810810810810811,
"grad_norm": 1.1928019523620605,
"learning_rate": 3.8e-05,
"loss": 1.3708,
"step": 20
},
{
"epoch": 0.016216216216216217,
"grad_norm": 0.7614617347717285,
"learning_rate": 5.8e-05,
"loss": 1.0732,
"step": 30
},
{
"epoch": 0.021621621621621623,
"grad_norm": 0.7028294801712036,
"learning_rate": 7.800000000000001e-05,
"loss": 1.0224,
"step": 40
},
{
"epoch": 0.02702702702702703,
"grad_norm": 0.6575226783752441,
"learning_rate": 9.8e-05,
"loss": 0.963,
"step": 50
},
{
"epoch": 0.032432432432432434,
"grad_norm": 0.6642696261405945,
"learning_rate": 0.000118,
"loss": 0.9502,
"step": 60
},
{
"epoch": 0.03783783783783784,
"grad_norm": 0.6854572296142578,
"learning_rate": 0.000138,
"loss": 0.9465,
"step": 70
},
{
"epoch": 0.043243243243243246,
"grad_norm": 0.755558967590332,
"learning_rate": 0.00015800000000000002,
"loss": 0.9392,
"step": 80
},
{
"epoch": 0.04864864864864865,
"grad_norm": 0.6308918595314026,
"learning_rate": 0.00017800000000000002,
"loss": 0.9278,
"step": 90
},
{
"epoch": 0.05405405405405406,
"grad_norm": 0.6420126557350159,
"learning_rate": 0.00019800000000000002,
"loss": 0.9147,
"step": 100
},
{
"epoch": 0.05945945945945946,
"grad_norm": 0.7269052863121033,
"learning_rate": 0.00019999691576447898,
"loss": 0.9179,
"step": 110
},
{
"epoch": 0.06486486486486487,
"grad_norm": 0.6968148350715637,
"learning_rate": 0.00019998625445384374,
"loss": 0.8951,
"step": 120
},
{
"epoch": 0.07027027027027027,
"grad_norm": 0.6975257396697998,
"learning_rate": 0.00019996797880281932,
"loss": 0.9053,
"step": 130
},
{
"epoch": 0.07567567567567568,
"grad_norm": 0.8497925400733948,
"learning_rate": 0.0001999420902031673,
"loss": 0.8878,
"step": 140
},
{
"epoch": 0.08108108108108109,
"grad_norm": 0.7157301902770996,
"learning_rate": 0.00019990859062640477,
"loss": 0.8974,
"step": 150
},
{
"epoch": 0.08648648648648649,
"grad_norm": 0.657098114490509,
"learning_rate": 0.0001998674826236542,
"loss": 0.8784,
"step": 160
},
{
"epoch": 0.0918918918918919,
"grad_norm": 0.6943176984786987,
"learning_rate": 0.00019981876932544917,
"loss": 0.8935,
"step": 170
},
{
"epoch": 0.0972972972972973,
"grad_norm": 0.7490524053573608,
"learning_rate": 0.0001997624544414959,
"loss": 0.8523,
"step": 180
},
{
"epoch": 0.10270270270270271,
"grad_norm": 0.7332305312156677,
"learning_rate": 0.00019969854226039088,
"loss": 0.8589,
"step": 190
},
{
"epoch": 0.10810810810810811,
"grad_norm": 0.6616835594177246,
"learning_rate": 0.00019962703764929413,
"loss": 0.8727,
"step": 200
},
{
"epoch": 0.11351351351351352,
"grad_norm": 0.6945931315422058,
"learning_rate": 0.00019954794605355863,
"loss": 0.8255,
"step": 210
},
{
"epoch": 0.11891891891891893,
"grad_norm": 0.7543134093284607,
"learning_rate": 0.00019946127349631564,
"loss": 0.8157,
"step": 220
},
{
"epoch": 0.12432432432432433,
"grad_norm": 0.7396084070205688,
"learning_rate": 0.00019936702657801587,
"loss": 0.8329,
"step": 230
},
{
"epoch": 0.12972972972972974,
"grad_norm": 0.7503929138183594,
"learning_rate": 0.0001992652124759271,
"loss": 0.8281,
"step": 240
},
{
"epoch": 0.13513513513513514,
"grad_norm": 0.7012407779693604,
"learning_rate": 0.00019915583894358744,
"loss": 0.891,
"step": 250
},
{
"epoch": 0.14054054054054055,
"grad_norm": 0.6991789937019348,
"learning_rate": 0.00019903891431021477,
"loss": 0.8237,
"step": 260
},
{
"epoch": 0.14594594594594595,
"grad_norm": 0.6946726441383362,
"learning_rate": 0.0001989144474800726,
"loss": 0.8132,
"step": 270
},
{
"epoch": 0.15135135135135136,
"grad_norm": 0.7045170068740845,
"learning_rate": 0.00019878244793179197,
"loss": 0.8231,
"step": 280
},
{
"epoch": 0.15675675675675677,
"grad_norm": 0.7510865926742554,
"learning_rate": 0.00019864292571764955,
"loss": 0.8367,
"step": 290
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.7087928652763367,
"learning_rate": 0.00019849589146280213,
"loss": 0.8024,
"step": 300
},
{
"epoch": 0.16756756756756758,
"grad_norm": 0.7199848890304565,
"learning_rate": 0.00019834135636447747,
"loss": 0.8263,
"step": 310
},
{
"epoch": 0.17297297297297298,
"grad_norm": 0.7465829253196716,
"learning_rate": 0.00019817933219112158,
"loss": 0.8244,
"step": 320
},
{
"epoch": 0.1783783783783784,
"grad_norm": 0.791543185710907,
"learning_rate": 0.0001980098312815026,
"loss": 0.7822,
"step": 330
},
{
"epoch": 0.1837837837837838,
"grad_norm": 0.8022134900093079,
"learning_rate": 0.00019783286654377106,
"loss": 0.7901,
"step": 340
},
{
"epoch": 0.1891891891891892,
"grad_norm": 0.7753133773803711,
"learning_rate": 0.00019764845145447689,
"loss": 0.818,
"step": 350
},
{
"epoch": 0.1945945945945946,
"grad_norm": 0.729603111743927,
"learning_rate": 0.00019745660005754308,
"loss": 0.8011,
"step": 360
},
{
"epoch": 0.2,
"grad_norm": 0.7126052975654602,
"learning_rate": 0.00019725732696319632,
"loss": 0.7756,
"step": 370
},
{
"epoch": 0.20540540540540542,
"grad_norm": 0.7011649012565613,
"learning_rate": 0.00019705064734685425,
"loss": 0.7745,
"step": 380
},
{
"epoch": 0.21081081081081082,
"grad_norm": 0.7712651491165161,
"learning_rate": 0.00019683657694796985,
"loss": 0.8,
"step": 390
},
{
"epoch": 0.21621621621621623,
"grad_norm": 1.7429534196853638,
"learning_rate": 0.00019661513206883287,
"loss": 0.8019,
"step": 400
},
{
"epoch": 0.22162162162162163,
"grad_norm": 0.7776182293891907,
"learning_rate": 0.0001963863295733281,
"loss": 0.7547,
"step": 410
},
{
"epoch": 0.22702702702702704,
"grad_norm": 0.748671293258667,
"learning_rate": 0.0001961501868856515,
"loss": 0.7754,
"step": 420
},
{
"epoch": 0.23243243243243245,
"grad_norm": 0.7607825994491577,
"learning_rate": 0.00019590672198898295,
"loss": 0.7264,
"step": 430
},
{
"epoch": 0.23783783783783785,
"grad_norm": 0.7991960644721985,
"learning_rate": 0.000195655953424117,
"loss": 0.7033,
"step": 440
},
{
"epoch": 0.24324324324324326,
"grad_norm": 0.777020275592804,
"learning_rate": 0.0001953979002880507,
"loss": 0.7713,
"step": 450
},
{
"epoch": 0.24864864864864866,
"grad_norm": 0.7802757024765015,
"learning_rate": 0.00019513258223252948,
"loss": 0.7435,
"step": 460
},
{
"epoch": 0.25405405405405407,
"grad_norm": 0.753276526927948,
"learning_rate": 0.00019486001946255046,
"loss": 0.7578,
"step": 470
},
{
"epoch": 0.2594594594594595,
"grad_norm": 0.8009371757507324,
"learning_rate": 0.0001945802327348239,
"loss": 0.7306,
"step": 480
},
{
"epoch": 0.2648648648648649,
"grad_norm": 0.8639614582061768,
"learning_rate": 0.00019429324335619233,
"loss": 0.7406,
"step": 490
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.7347239851951599,
"learning_rate": 0.00019399907318200802,
"loss": 0.7526,
"step": 500
},
{
"epoch": 0.2702702702702703,
"eval_loss": 1.0007914304733276,
"eval_runtime": 1132.9831,
"eval_samples_per_second": 9.225,
"eval_steps_per_second": 2.306,
"step": 500
},
{
"epoch": 0.2756756756756757,
"grad_norm": 0.7618102431297302,
"learning_rate": 0.0001936977446144687,
"loss": 0.732,
"step": 510
},
{
"epoch": 0.2810810810810811,
"grad_norm": 0.7626764178276062,
"learning_rate": 0.00019338928060091143,
"loss": 0.7637,
"step": 520
},
{
"epoch": 0.2864864864864865,
"grad_norm": 0.7320432662963867,
"learning_rate": 0.0001930737046320651,
"loss": 0.7349,
"step": 530
},
{
"epoch": 0.2918918918918919,
"grad_norm": 0.8312086462974548,
"learning_rate": 0.00019275104074026152,
"loss": 0.7686,
"step": 540
},
{
"epoch": 0.2972972972972973,
"grad_norm": 0.7992410659790039,
"learning_rate": 0.00019242131349760534,
"loss": 0.727,
"step": 550
},
{
"epoch": 0.3027027027027027,
"grad_norm": 0.7825806140899658,
"learning_rate": 0.00019208454801410266,
"loss": 0.7416,
"step": 560
},
{
"epoch": 0.3081081081081081,
"grad_norm": 0.7664586305618286,
"learning_rate": 0.00019174076993574884,
"loss": 0.7572,
"step": 570
},
{
"epoch": 0.31351351351351353,
"grad_norm": 0.7890913486480713,
"learning_rate": 0.00019139000544257558,
"loss": 0.7235,
"step": 580
},
{
"epoch": 0.31891891891891894,
"grad_norm": 0.7417885065078735,
"learning_rate": 0.00019103228124665712,
"loss": 0.7293,
"step": 590
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.7056080102920532,
"learning_rate": 0.0001906676245900759,
"loss": 0.7299,
"step": 600
},
{
"epoch": 0.32972972972972975,
"grad_norm": 0.7992605566978455,
"learning_rate": 0.00019029606324284814,
"loss": 0.7445,
"step": 610
},
{
"epoch": 0.33513513513513515,
"grad_norm": 0.7960366606712341,
"learning_rate": 0.00018991762550080906,
"loss": 0.7448,
"step": 620
},
{
"epoch": 0.34054054054054056,
"grad_norm": 0.7352651357650757,
"learning_rate": 0.0001895323401834578,
"loss": 0.7246,
"step": 630
},
{
"epoch": 0.34594594594594597,
"grad_norm": 0.7147911190986633,
"learning_rate": 0.00018914023663176306,
"loss": 0.7206,
"step": 640
},
{
"epoch": 0.35135135135135137,
"grad_norm": 0.7172034382820129,
"learning_rate": 0.00018874134470592835,
"loss": 0.6924,
"step": 650
},
{
"epoch": 0.3567567567567568,
"grad_norm": 0.715886652469635,
"learning_rate": 0.00018833569478311817,
"loss": 0.7051,
"step": 660
},
{
"epoch": 0.3621621621621622,
"grad_norm": 0.6763896346092224,
"learning_rate": 0.0001879233177551447,
"loss": 0.7163,
"step": 670
},
{
"epoch": 0.3675675675675676,
"grad_norm": 0.7396337389945984,
"learning_rate": 0.00018750424502611527,
"loss": 0.753,
"step": 680
},
{
"epoch": 0.372972972972973,
"grad_norm": 0.7334195971488953,
"learning_rate": 0.00018707850851004058,
"loss": 0.6956,
"step": 690
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.7924448251724243,
"learning_rate": 0.00018664614062840473,
"loss": 0.7333,
"step": 700
},
{
"epoch": 0.3837837837837838,
"grad_norm": 0.7402865290641785,
"learning_rate": 0.00018620717430769586,
"loss": 0.707,
"step": 710
},
{
"epoch": 0.3891891891891892,
"grad_norm": 0.8460499048233032,
"learning_rate": 0.00018576164297689877,
"loss": 0.688,
"step": 720
},
{
"epoch": 0.3945945945945946,
"grad_norm": 0.7490622401237488,
"learning_rate": 0.00018530958056494932,
"loss": 0.6789,
"step": 730
},
{
"epoch": 0.4,
"grad_norm": 0.8118138909339905,
"learning_rate": 0.00018485102149815038,
"loss": 0.7102,
"step": 740
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.7157021164894104,
"learning_rate": 0.00018438600069755026,
"loss": 0.6946,
"step": 750
},
{
"epoch": 0.41081081081081083,
"grad_norm": 0.8860333561897278,
"learning_rate": 0.00018391455357628334,
"loss": 0.6833,
"step": 760
},
{
"epoch": 0.41621621621621624,
"grad_norm": 0.8259391784667969,
"learning_rate": 0.00018343671603687317,
"loss": 0.7003,
"step": 770
},
{
"epoch": 0.42162162162162165,
"grad_norm": 0.7634344100952148,
"learning_rate": 0.00018295252446849842,
"loss": 0.7218,
"step": 780
},
{
"epoch": 0.42702702702702705,
"grad_norm": 0.7924422025680542,
"learning_rate": 0.00018246201574422164,
"loss": 0.6759,
"step": 790
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.7786385416984558,
"learning_rate": 0.00018196522721818128,
"loss": 0.6768,
"step": 800
},
{
"epoch": 0.43783783783783786,
"grad_norm": 0.8325049877166748,
"learning_rate": 0.00018146219672274694,
"loss": 0.6845,
"step": 810
},
{
"epoch": 0.44324324324324327,
"grad_norm": 0.7333595156669617,
"learning_rate": 0.00018095296256563845,
"loss": 0.6891,
"step": 820
},
{
"epoch": 0.4486486486486487,
"grad_norm": 0.6770475506782532,
"learning_rate": 0.00018043756352700846,
"loss": 0.6923,
"step": 830
},
{
"epoch": 0.4540540540540541,
"grad_norm": 0.746793270111084,
"learning_rate": 0.0001799160388564892,
"loss": 0.7027,
"step": 840
},
{
"epoch": 0.4594594594594595,
"grad_norm": 0.8229703307151794,
"learning_rate": 0.00017938842827020348,
"loss": 0.73,
"step": 850
},
{
"epoch": 0.4648648648648649,
"grad_norm": 0.8228402733802795,
"learning_rate": 0.0001788547719477402,
"loss": 0.6763,
"step": 860
},
{
"epoch": 0.4702702702702703,
"grad_norm": 0.8647485971450806,
"learning_rate": 0.0001783151105290944,
"loss": 0.6937,
"step": 870
},
{
"epoch": 0.4756756756756757,
"grad_norm": 0.7954670786857605,
"learning_rate": 0.0001777694851115726,
"loss": 0.7183,
"step": 880
},
{
"epoch": 0.4810810810810811,
"grad_norm": 0.7662751078605652,
"learning_rate": 0.00017721793724666268,
"loss": 0.7343,
"step": 890
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.6992731094360352,
"learning_rate": 0.00017666050893687008,
"loss": 0.674,
"step": 900
},
{
"epoch": 0.4918918918918919,
"grad_norm": 0.7148111462593079,
"learning_rate": 0.0001760972426325187,
"loss": 0.7152,
"step": 910
},
{
"epoch": 0.4972972972972973,
"grad_norm": 1.4267653226852417,
"learning_rate": 0.00017552818122851838,
"loss": 0.6574,
"step": 920
},
{
"epoch": 0.5027027027027027,
"grad_norm": 0.7189494371414185,
"learning_rate": 0.00017495336806109827,
"loss": 0.6553,
"step": 930
},
{
"epoch": 0.5081081081081081,
"grad_norm": 0.7621554136276245,
"learning_rate": 0.00017437284690450654,
"loss": 0.7113,
"step": 940
},
{
"epoch": 0.5135135135135135,
"grad_norm": 0.8519230484962463,
"learning_rate": 0.00017378666196767685,
"loss": 0.6948,
"step": 950
},
{
"epoch": 0.518918918918919,
"grad_norm": 0.7450538873672485,
"learning_rate": 0.00017319485789086162,
"loss": 0.7074,
"step": 960
},
{
"epoch": 0.5243243243243243,
"grad_norm": 0.7189666032791138,
"learning_rate": 0.00017259747974223265,
"loss": 0.662,
"step": 970
},
{
"epoch": 0.5297297297297298,
"grad_norm": 0.7510509490966797,
"learning_rate": 0.00017199457301444868,
"loss": 0.6841,
"step": 980
},
{
"epoch": 0.5351351351351351,
"grad_norm": 0.8049071431159973,
"learning_rate": 0.00017138618362119137,
"loss": 0.6573,
"step": 990
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.7858609557151794,
"learning_rate": 0.00017077235789366842,
"loss": 0.6905,
"step": 1000
},
{
"epoch": 0.5405405405405406,
"eval_loss": 1.0087087154388428,
"eval_runtime": 1125.5024,
"eval_samples_per_second": 9.287,
"eval_steps_per_second": 2.322,
"step": 1000
},
{
"epoch": 0.5459459459459459,
"grad_norm": 0.7082162499427795,
"learning_rate": 0.0001701531425770856,
"loss": 0.6264,
"step": 1010
},
{
"epoch": 0.5513513513513514,
"grad_norm": 0.6960082054138184,
"learning_rate": 0.00016952858482708656,
"loss": 0.6739,
"step": 1020
},
{
"epoch": 0.5567567567567567,
"grad_norm": 0.6909856200218201,
"learning_rate": 0.00016889873220616206,
"loss": 0.7019,
"step": 1030
},
{
"epoch": 0.5621621621621622,
"grad_norm": 0.7821714282035828,
"learning_rate": 0.00016826363268002782,
"loss": 0.6896,
"step": 1040
},
{
"epoch": 0.5675675675675675,
"grad_norm": 0.7264735102653503,
"learning_rate": 0.00016762333461397156,
"loss": 0.6186,
"step": 1050
},
{
"epoch": 0.572972972972973,
"grad_norm": 0.7788714170455933,
"learning_rate": 0.00016697788676917007,
"loss": 0.6771,
"step": 1060
},
{
"epoch": 0.5783783783783784,
"grad_norm": 0.7341744303703308,
"learning_rate": 0.00016632733829897566,
"loss": 0.6633,
"step": 1070
},
{
"epoch": 0.5837837837837838,
"grad_norm": 0.7561785578727722,
"learning_rate": 0.00016567173874517307,
"loss": 0.6771,
"step": 1080
},
{
"epoch": 0.5891891891891892,
"grad_norm": 0.777637243270874,
"learning_rate": 0.00016501113803420658,
"loss": 0.6717,
"step": 1090
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.775454044342041,
"learning_rate": 0.0001643455864733779,
"loss": 0.6573,
"step": 1100
},
{
"epoch": 0.6,
"grad_norm": 0.7935672402381897,
"learning_rate": 0.0001636751347470152,
"loss": 0.65,
"step": 1110
},
{
"epoch": 0.6054054054054054,
"grad_norm": 0.6799283623695374,
"learning_rate": 0.00016299983391261324,
"loss": 0.6433,
"step": 1120
},
{
"epoch": 0.6108108108108108,
"grad_norm": 0.7489930987358093,
"learning_rate": 0.00016231973539694504,
"loss": 0.656,
"step": 1130
},
{
"epoch": 0.6162162162162163,
"grad_norm": 0.8089328408241272,
"learning_rate": 0.0001616348909921457,
"loss": 0.6679,
"step": 1140
},
{
"epoch": 0.6216216216216216,
"grad_norm": 0.769966185092926,
"learning_rate": 0.00016094535285176813,
"loss": 0.654,
"step": 1150
},
{
"epoch": 0.6270270270270271,
"grad_norm": 0.9080989360809326,
"learning_rate": 0.00016025117348681132,
"loss": 0.6612,
"step": 1160
},
{
"epoch": 0.6324324324324324,
"grad_norm": 0.7278012633323669,
"learning_rate": 0.00015955240576172165,
"loss": 0.6392,
"step": 1170
},
{
"epoch": 0.6378378378378379,
"grad_norm": 0.7645131945610046,
"learning_rate": 0.0001588491028903667,
"loss": 0.6899,
"step": 1180
},
{
"epoch": 0.6432432432432432,
"grad_norm": 0.7604426741600037,
"learning_rate": 0.00015814131843198308,
"loss": 0.6567,
"step": 1190
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.738106369972229,
"learning_rate": 0.00015742910628709756,
"loss": 0.6641,
"step": 1200
},
{
"epoch": 0.654054054054054,
"grad_norm": 0.7150177359580994,
"learning_rate": 0.00015671252069342247,
"loss": 0.6813,
"step": 1210
},
{
"epoch": 0.6594594594594595,
"grad_norm": 0.7188438773155212,
"learning_rate": 0.00015599161622172517,
"loss": 0.6387,
"step": 1220
},
{
"epoch": 0.6648648648648648,
"grad_norm": 0.7173952460289001,
"learning_rate": 0.00015526644777167219,
"loss": 0.6722,
"step": 1230
},
{
"epoch": 0.6702702702702703,
"grad_norm": 0.8392152786254883,
"learning_rate": 0.00015453707056764862,
"loss": 0.6926,
"step": 1240
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.8548230528831482,
"learning_rate": 0.0001538035401545525,
"loss": 0.6296,
"step": 1250
},
{
"epoch": 0.6810810810810811,
"grad_norm": 0.7427430748939514,
"learning_rate": 0.00015306591239356475,
"loss": 0.6508,
"step": 1260
},
{
"epoch": 0.6864864864864865,
"grad_norm": 0.7263091206550598,
"learning_rate": 0.0001523242434578952,
"loss": 0.6528,
"step": 1270
},
{
"epoch": 0.6918918918918919,
"grad_norm": 0.7919740080833435,
"learning_rate": 0.00015157858982850475,
"loss": 0.638,
"step": 1280
},
{
"epoch": 0.6972972972972973,
"grad_norm": 0.720586359500885,
"learning_rate": 0.00015082900828980423,
"loss": 0.667,
"step": 1290
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.9061957001686096,
"learning_rate": 0.00015007555592532997,
"loss": 0.6308,
"step": 1300
},
{
"epoch": 0.7081081081081081,
"grad_norm": 0.6996462345123291,
"learning_rate": 0.00014931829011339659,
"loss": 0.6463,
"step": 1310
},
{
"epoch": 0.7135135135135136,
"grad_norm": 0.7758413553237915,
"learning_rate": 0.00014855726852272753,
"loss": 0.6184,
"step": 1320
},
{
"epoch": 0.7189189189189189,
"grad_norm": 0.6599385142326355,
"learning_rate": 0.00014779254910806335,
"loss": 0.6329,
"step": 1330
},
{
"epoch": 0.7243243243243244,
"grad_norm": 0.7816442251205444,
"learning_rate": 0.00014702419010574825,
"loss": 0.6696,
"step": 1340
},
{
"epoch": 0.7297297297297297,
"grad_norm": 0.8203967213630676,
"learning_rate": 0.00014625225002929502,
"loss": 0.6835,
"step": 1350
},
{
"epoch": 0.7351351351351352,
"grad_norm": 0.8271581530570984,
"learning_rate": 0.00014547678766492917,
"loss": 0.6574,
"step": 1360
},
{
"epoch": 0.7405405405405405,
"grad_norm": 0.708739697933197,
"learning_rate": 0.00014469786206711214,
"loss": 0.6094,
"step": 1370
},
{
"epoch": 0.745945945945946,
"grad_norm": 0.6651840209960938,
"learning_rate": 0.00014391553255404385,
"loss": 0.6615,
"step": 1380
},
{
"epoch": 0.7513513513513513,
"grad_norm": 0.6395004987716675,
"learning_rate": 0.00014312985870314568,
"loss": 0.6278,
"step": 1390
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.7492053508758545,
"learning_rate": 0.00014234090034652324,
"loss": 0.6139,
"step": 1400
},
{
"epoch": 0.7621621621621621,
"grad_norm": 0.7917840480804443,
"learning_rate": 0.00014154871756640996,
"loss": 0.6471,
"step": 1410
},
{
"epoch": 0.7675675675675676,
"grad_norm": 0.7364196181297302,
"learning_rate": 0.00014075337069059158,
"loss": 0.6409,
"step": 1420
},
{
"epoch": 0.772972972972973,
"grad_norm": 0.7311397194862366,
"learning_rate": 0.00013995492028781202,
"loss": 0.6093,
"step": 1430
},
{
"epoch": 0.7783783783783784,
"grad_norm": 0.8387266397476196,
"learning_rate": 0.00013915342716316076,
"loss": 0.6334,
"step": 1440
},
{
"epoch": 0.7837837837837838,
"grad_norm": 0.7672157883644104,
"learning_rate": 0.00013834895235344242,
"loss": 0.6243,
"step": 1450
},
{
"epoch": 0.7891891891891892,
"grad_norm": 0.7562130093574524,
"learning_rate": 0.00013754155712252832,
"loss": 0.6561,
"step": 1460
},
{
"epoch": 0.7945945945945946,
"grad_norm": 0.7368100881576538,
"learning_rate": 0.0001367313029566913,
"loss": 0.6359,
"step": 1470
},
{
"epoch": 0.8,
"grad_norm": 0.7335553765296936,
"learning_rate": 0.0001359182515599231,
"loss": 0.618,
"step": 1480
},
{
"epoch": 0.8054054054054054,
"grad_norm": 0.8272745013237,
"learning_rate": 0.00013510246484923547,
"loss": 0.6431,
"step": 1490
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.9075089693069458,
"learning_rate": 0.00013428400494994484,
"loss": 0.642,
"step": 1500
},
{
"epoch": 0.8108108108108109,
"eval_loss": 1.1368237733840942,
"eval_runtime": 1125.1843,
"eval_samples_per_second": 9.289,
"eval_steps_per_second": 2.322,
"step": 1500
},
{
"epoch": 0.8162162162162162,
"grad_norm": 0.7253302931785583,
"learning_rate": 0.00013346293419094134,
"loss": 0.6315,
"step": 1510
},
{
"epoch": 0.8216216216216217,
"grad_norm": 0.8258981704711914,
"learning_rate": 0.0001326393150999422,
"loss": 0.5844,
"step": 1520
},
{
"epoch": 0.827027027027027,
"grad_norm": 0.7383174896240234,
"learning_rate": 0.00013181321039872993,
"loss": 0.6457,
"step": 1530
},
{
"epoch": 0.8324324324324325,
"grad_norm": 0.8157215714454651,
"learning_rate": 0.000130984682998376,
"loss": 0.6183,
"step": 1540
},
{
"epoch": 0.8378378378378378,
"grad_norm": 0.7327470183372498,
"learning_rate": 0.00013015379599444957,
"loss": 0.623,
"step": 1550
},
{
"epoch": 0.8432432432432433,
"grad_norm": 0.6934810280799866,
"learning_rate": 0.00012932061266221305,
"loss": 0.6548,
"step": 1560
},
{
"epoch": 0.8486486486486486,
"grad_norm": 0.8231585025787354,
"learning_rate": 0.00012848519645180295,
"loss": 0.5803,
"step": 1570
},
{
"epoch": 0.8540540540540541,
"grad_norm": 0.6958197355270386,
"learning_rate": 0.0001276476109833981,
"loss": 0.6392,
"step": 1580
},
{
"epoch": 0.8594594594594595,
"grad_norm": 0.7134739756584167,
"learning_rate": 0.00012680792004237477,
"loss": 0.6153,
"step": 1590
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.736421525478363,
"learning_rate": 0.00012596618757444917,
"loss": 0.5727,
"step": 1600
},
{
"epoch": 0.8702702702702703,
"grad_norm": 0.8574367761611938,
"learning_rate": 0.00012512247768080756,
"loss": 0.6177,
"step": 1610
},
{
"epoch": 0.8756756756756757,
"grad_norm": 0.7083766460418701,
"learning_rate": 0.00012427685461322496,
"loss": 0.6445,
"step": 1620
},
{
"epoch": 0.8810810810810811,
"grad_norm": 0.7544513940811157,
"learning_rate": 0.00012342938276917187,
"loss": 0.6136,
"step": 1630
},
{
"epoch": 0.8864864864864865,
"grad_norm": 0.6936790347099304,
"learning_rate": 0.0001225801266869104,
"loss": 0.5966,
"step": 1640
},
{
"epoch": 0.8918918918918919,
"grad_norm": 0.8453409075737,
"learning_rate": 0.00012172915104057919,
"loss": 0.5977,
"step": 1650
},
{
"epoch": 0.8972972972972973,
"grad_norm": 0.6956205368041992,
"learning_rate": 0.00012087652063526838,
"loss": 0.6106,
"step": 1660
},
{
"epoch": 0.9027027027027027,
"grad_norm": 0.7600128054618835,
"learning_rate": 0.00012002230040208447,
"loss": 0.6296,
"step": 1670
},
{
"epoch": 0.9081081081081082,
"grad_norm": 0.6999400854110718,
"learning_rate": 0.00011916655539320547,
"loss": 0.5919,
"step": 1680
},
{
"epoch": 0.9135135135135135,
"grad_norm": 0.7615451812744141,
"learning_rate": 0.00011830935077692695,
"loss": 0.6066,
"step": 1690
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.7937692999839783,
"learning_rate": 0.0001174507518326992,
"loss": 0.6108,
"step": 1700
},
{
"epoch": 0.9243243243243243,
"grad_norm": 0.6862788796424866,
"learning_rate": 0.00011659082394615607,
"loss": 0.609,
"step": 1710
},
{
"epoch": 0.9297297297297298,
"grad_norm": 0.7376629710197449,
"learning_rate": 0.00011572963260413547,
"loss": 0.6384,
"step": 1720
},
{
"epoch": 0.9351351351351351,
"grad_norm": 0.6539621949195862,
"learning_rate": 0.00011486724338969232,
"loss": 0.5801,
"step": 1730
},
{
"epoch": 0.9405405405405406,
"grad_norm": 0.7667025923728943,
"learning_rate": 0.00011400372197710414,
"loss": 0.585,
"step": 1740
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.7780829071998596,
"learning_rate": 0.00011313913412686981,
"loss": 0.6193,
"step": 1750
},
{
"epoch": 0.9513513513513514,
"grad_norm": 0.9730708003044128,
"learning_rate": 0.0001122735456807015,
"loss": 0.5813,
"step": 1760
},
{
"epoch": 0.9567567567567568,
"grad_norm": 0.8667449355125427,
"learning_rate": 0.00011140702255651063,
"loss": 0.6694,
"step": 1770
},
{
"epoch": 0.9621621621621622,
"grad_norm": 0.7646722197532654,
"learning_rate": 0.00011053963074338797,
"loss": 0.6237,
"step": 1780
},
{
"epoch": 0.9675675675675676,
"grad_norm": 0.7633317112922668,
"learning_rate": 0.00010967143629657842,
"loss": 0.5807,
"step": 1790
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.7953817844390869,
"learning_rate": 0.00010880250533245038,
"loss": 0.5905,
"step": 1800
},
{
"epoch": 0.9783783783783784,
"grad_norm": 0.6951556205749512,
"learning_rate": 0.00010793290402346094,
"loss": 0.5662,
"step": 1810
},
{
"epoch": 0.9837837837837838,
"grad_norm": 0.7796500325202942,
"learning_rate": 0.00010706269859311669,
"loss": 0.6131,
"step": 1820
},
{
"epoch": 0.9891891891891892,
"grad_norm": 0.8852221369743347,
"learning_rate": 0.00010619195531093017,
"loss": 0.5827,
"step": 1830
},
{
"epoch": 0.9945945945945946,
"grad_norm": 0.7465667128562927,
"learning_rate": 0.00010532074048737364,
"loss": 0.6013,
"step": 1840
},
{
"epoch": 1.0,
"grad_norm": 0.85235995054245,
"learning_rate": 0.00010444912046882888,
"loss": 0.5973,
"step": 1850
},
{
"epoch": 1.0054054054054054,
"grad_norm": 0.780853807926178,
"learning_rate": 0.00010357716163253497,
"loss": 0.4934,
"step": 1860
},
{
"epoch": 1.0108108108108107,
"grad_norm": 0.7970598340034485,
"learning_rate": 0.00010270493038153319,
"loss": 0.5365,
"step": 1870
},
{
"epoch": 1.0162162162162163,
"grad_norm": 0.7823016047477722,
"learning_rate": 0.0001018324931396103,
"loss": 0.4971,
"step": 1880
},
{
"epoch": 1.0216216216216216,
"grad_norm": 0.7091385722160339,
"learning_rate": 0.00010095991634624,
"loss": 0.5203,
"step": 1890
},
{
"epoch": 1.027027027027027,
"grad_norm": 0.8729794025421143,
"learning_rate": 0.00010008726645152353,
"loss": 0.4914,
"step": 1900
},
{
"epoch": 1.0324324324324325,
"grad_norm": 0.7777389883995056,
"learning_rate": 9.921460991112891e-05,
"loss": 0.53,
"step": 1910
},
{
"epoch": 1.037837837837838,
"grad_norm": 0.674041211605072,
"learning_rate": 9.834201318123025e-05,
"loss": 0.4955,
"step": 1920
},
{
"epoch": 1.0432432432432432,
"grad_norm": 0.7665605545043945,
"learning_rate": 9.746954271344703e-05,
"loss": 0.5419,
"step": 1930
},
{
"epoch": 1.0486486486486486,
"grad_norm": 0.7776033282279968,
"learning_rate": 9.659726494978325e-05,
"loss": 0.5131,
"step": 1940
},
{
"epoch": 1.054054054054054,
"grad_norm": 0.7598256468772888,
"learning_rate": 9.572524631756778e-05,
"loss": 0.523,
"step": 1950
},
{
"epoch": 1.0594594594594595,
"grad_norm": 0.7463747262954712,
"learning_rate": 9.48535532243956e-05,
"loss": 0.5088,
"step": 1960
},
{
"epoch": 1.0648648648648649,
"grad_norm": 0.7281492948532104,
"learning_rate": 9.398225205307066e-05,
"loss": 0.5112,
"step": 1970
},
{
"epoch": 1.0702702702702702,
"grad_norm": 0.7457937002182007,
"learning_rate": 9.311140915655054e-05,
"loss": 0.5642,
"step": 1980
},
{
"epoch": 1.0756756756756758,
"grad_norm": 0.775947093963623,
"learning_rate": 9.224109085289343e-05,
"loss": 0.5331,
"step": 1990
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.7909854650497437,
"learning_rate": 9.137136342020768e-05,
"loss": 0.5022,
"step": 2000
},
{
"epoch": 1.0810810810810811,
"eval_loss": 1.1171799898147583,
"eval_runtime": 1127.8981,
"eval_samples_per_second": 9.267,
"eval_steps_per_second": 2.317,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 3700,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.802421158143263e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}